Chore: Refactor Transcribe server (#14381)

2026-05-07 20:02:45 +00:00 · 2026-02-19 12:51:42 +00:00
parent 8aca7445c7
commit 5beccb9a86
12 changed files with 128 additions and 70 deletions
@@ -31,6 +31,7 @@
 # QUEUE_DATABASE_PASSWORD=transcribe
 # QUEUE_DATABASE_PORT=5431
 # HTR_CLI_IMAGES_FOLDER=/home/user/images_storage
+# HTR_CLI_MODELS_FOLDER=/home/user/transcribe_models

 # =============================================================================
 # DEV CONFIG EXAMPLE
@@ -2,25 +2,24 @@ FROM node:24-bullseye

 RUN apt-get update \
    && apt-get install -y \
-    ca-certificates curl \
-    python3 tini
-
-## install docker
-RUN install -m 0755 -d /etc/apt/keyrings
-RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
-RUN chmod a+r /etc/apt/keyrings/docker.asc
-RUN echo \
-    "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/debian \
-    $(. /etc/os-release && echo bullseye) stable" | \
-    tee /etc/apt/sources.list.d/docker.list > /dev/null
-RUN apt-get update \
-    && apt-get install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin \
+    ca-certificates curl wget unzip \
+    python3 tini \
    && rm -rf /var/lib/apt/lists/*

 ENV NODE_ENV=production

 RUN corepack enable

+# Download llama.cpp binary
+WORKDIR /opt/llama
+RUN wget -q https://github.com/ggml-org/llama.cpp/releases/download/b5449/llama-b5449-bin-ubuntu-x64.zip \
+    && unzip llama-b5449-bin-ubuntu-x64.zip \
+    && rm llama-b5449-bin-ubuntu-x64.zip \
+    && chmod +x /opt/llama/build/bin/llama-mtmd-cli
+
+# Create non-root user for security
+RUN groupadd -r transcribe && useradd -r -g transcribe transcribe
+
 WORKDIR /app

 COPY .yarn/releases ./.yarn/releases
@@ -44,7 +43,17 @@ RUN BUILD_SEQUENCIAL=1 yarn install --inline-builds \
    && yarn cache clean \
    && rm -rf .yarn/berry

+# Create images directory and set permissions
+RUN mkdir -p /app/packages/transcribe/images \
+    && chown -R transcribe:transcribe /app/packages/transcribe/images
+
 WORKDIR /app/packages/transcribe

+# Switch to non-root user
+USER transcribe
+
+# Set environment variable for embedded llama.cpp binary
+ENV HTR_CLI_BINARY_PATH=/opt/llama/build/bin/llama-mtmd-cli
+
 # Start the Node.js application
 CMD ["yarn", "start"]
@@ -84,8 +84,8 @@ services:
        profiles:
            - full
        volumes:
-            - /var/run/docker.sock:/var/run/docker.sock
            - ${HTR_CLI_IMAGES_FOLDER}:/app/packages/transcribe/images
+            - ${HTR_CLI_MODELS_FOLDER}:/opt/models:ro
        depends_on:
            - transcribe-db
        ports:
@@ -94,6 +94,16 @@ services:
            - transcribe-network
            - shared-network
        restart: unless-stopped
+        # Security: limit resources to prevent runaway processes
+        deploy:
+            resources:
+                limits:
+                    memory: 16G
+                    cpus: '4'
+        # Security: read-only root filesystem; only the mounted images folder and the /tmp tmpfs are writable
+        read_only: true
+        tmpfs:
+            - /tmp
        environment:
            - APP_PORT=4567
            - DB_CLIENT=pg
@@ -103,5 +113,6 @@ services:
            - QUEUE_DATABASE_PORT=${QUEUE_DATABASE_PORT}
            - QUEUE_DATABASE_HOST=transcribe-db
            - API_KEY=${TRANSCRIBE_API_KEY}
-            - HTR_CLI_IMAGES_FOLDER=${HTR_CLI_IMAGES_FOLDER}
+            - HTR_CLI_IMAGES_FOLDER=/app/packages/transcribe/images
+            - HTR_CLI_MODELS_FOLDER=/opt/models

@@ -251,5 +251,7 @@ codegen
 analyzed
 Perfetto
 appmodules
+mtmd
+gguf
 armor
 clearsign
@@ -1,26 +0,0 @@
-FROM bitnami/minideb:bookworm
-
-RUN apt-get update && apt-get install -y \
-    build-essential \
-    cmake \
-    git \
-    wget \
-    unzip \
-    && rm -rf /var/lib/apt/lists/*
-
-WORKDIR /app
-RUN wget -q https://github.com/ggml-org/llama.cpp/releases/download/b5449/llama-b5449-bin-ubuntu-x64.zip
-
-RUN mkdir /models/
-RUN wget -q -O /models/Model-7.6B-Q4_K_M.gguf https://huggingface.co/openbmb/MiniCPM-o-2_6-gguf/resolve/main/Model-7.6B-Q4_K_M.gguf 
-RUN wget -q -O /models/mmproj-model-f16.gguf https://huggingface.co/openbmb/MiniCPM-o-2_6-gguf/resolve/main/mmproj-model-f16.gguf
-
-WORKDIR /app
-RUN unzip llama-b5449-bin-ubuntu-x64.zip
-WORKDIR /app/build/bin
-
-# Create an entrypoint script
-COPY entrypoint.sh /entrypoint.sh 
-RUN chmod +x /entrypoint.sh
-
-ENTRYPOINT ["/entrypoint.sh"]
@@ -2,16 +2,33 @@

 ## Configure Docker for Transcribe

+The transcribe server embeds the llama.cpp binary directly in the Docker image. The AI models must be downloaded separately and mounted as a volume.
+
+### 1. Download the models
+
+Create a directory for the models and download them:
+
+```shell
+mkdir -p ./data/transcribe-models
+wget -O ./data/transcribe-models/Model-7.6B-Q4_K_M.gguf https://huggingface.co/openbmb/MiniCPM-o-2_6-gguf/resolve/main/Model-7.6B-Q4_K_M.gguf
+wget -O ./data/transcribe-models/mmproj-model-f16.gguf https://huggingface.co/openbmb/MiniCPM-o-2_6-gguf/resolve/main/mmproj-model-f16.gguf
+```
+
+### 2. Configure environment
+
 1. Copy `.env-transcribe-sample` to your Docker configuration directory.
 2. Rename it to `.env-transcribe`.
-3. Set `HTR_CLI_IMAGES_FOLDER` to the full path of the folder where images will be stored. This folder must be outside the Docker container.
-4. Test the server with the default configuration:
+
+### 3. Run the server
+
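+The models directory on your host is mounted into the container at `/opt/models`. With the `docker run` command below, `HTR_CLI_MODELS_FOLDER` is the path inside the container (`/opt/models`), not the host path. With Docker Compose, the `HTR_CLI_MODELS_FOLDER` value in your environment file is the host directory that gets mounted at `/opt/models`, and the compose file sets the in-container value for you. Likewise, inside the container `HTR_CLI_IMAGES_FOLDER` must point to `/app/packages/transcribe/images`.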

 ```shell
 docker build -f ./Dockerfile.transcribe -t transcribe .
 docker run --env-file .env-transcribe -p 4567:4567 \
-	-v /var/run/docker.sock:/var/run/docker.sock \
 	-v ./packages/transcribe/images:/app/packages/transcribe/images \
+	-v ./data/transcribe-models:/opt/models:ro \
+	-e HTR_CLI_MODELS_FOLDER=/opt/models \
 	transcribe
 ```

@@ -29,6 +46,15 @@ The minimal configuration is provided in `.env-sample` and `docker-compose.serve

 For advanced configuration, refer to `.env-sample-transcribe`.

+## Security
+
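+The transcribe container runs with these security measures (the read-only filesystem and the resource limits are configured in the Docker Compose service definition, so they do not apply to a plain `docker run` unless you pass the equivalent flags):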
+
+- **Non-root user**: The application runs as the `transcribe` user, not root
+- **Read-only filesystem**: The container filesystem is read-only (only `/app/packages/transcribe/images` and `/tmp` are writable)
+- **Resource limits**: Memory and CPU limits prevent runaway processes
+- **No Docker socket**: Unlike previous versions, no Docker socket mount is required
+
 ---

 # Development Setup
@@ -56,9 +82,15 @@ The queue driver can be **SQLite** or **PostgreSQL**:
 From `packages/transcribe`, run:

 ```shell
-npm run start
+yarn start
 ```

+### Environment variables
+
+- `HTR_CLI_BINARY_PATH`: Path to the llama-mtmd-cli binary
+- `HTR_CLI_MODELS_FOLDER`: Path to the directory containing the model files `Model-7.6B-Q4_K_M.gguf` and `mmproj-model-f16.gguf`
+- `HTR_CLI_IMAGES_FOLDER`: Path where uploaded images are stored
+
 ---

 # API Endpoints
@@ -1,7 +0,0 @@
-#!/bin/bash
-if [ ! -f "/images/$1" ]; then
-    echo "Error: Image file /images/$1 does not exist. Check if HTR_CLI_IMAGES_FOLDER environment variable is set correctly."
-    exit 1
-fi
-
-./llama-mtmd-cli -m /models/Model-7.6B-Q4_K_M.gguf --mmproj /models/mmproj-model-f16.gguf -c 4096 --temp 0.05 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image /images/"$1" -p "SYSTEM: you are an agent of a OCR system. Your job is to be concise and correct. You should NEVER deviate from the content of the image. You should NEVER add any context or new information. Your only job should be to transcribe the text presented in the image as text without anything new information. The output for it should be inside triple backticks like: \`\`\`{{example}}\`\`\`. If you find no text, output \`\`\`\`\`\`.. Your turn:"
@@ -34,7 +34,11 @@ const init = async (logger: LoggerWrapper) => {
 	app.context.queue = queue;
 	app.context.storage = fileStorage;

-	const htrCli = new HtrCli(envVariables.HTR_CLI_DOCKER_IMAGE, envVariables.HTR_CLI_IMAGES_FOLDER);
+	const htrCli = new HtrCli({
+		htrCliImagesFolder: envVariables.HTR_CLI_IMAGES_FOLDER,
+		binaryPath: envVariables.HTR_CLI_BINARY_PATH,
+		modelsFolder: envVariables.HTR_CLI_MODELS_FOLDER,
+	});

 	const jobProcessor = new JobProcessor(queue, htrCli, fileStorage);

@@ -46,6 +50,8 @@ const init = async (logger: LoggerWrapper) => {
 const checkServerConfigurations = (envVariables: EnvVariables) => {
 	if (!envVariables.API_KEY) throw Error('API_KEY environment variable not set.');
 	if (!envVariables.HTR_CLI_IMAGES_FOLDER) throw Error('HTR_CLI_IMAGES_FOLDER environment variable not set. This should point to a folder where images will be stored.');
+	if (!envVariables.HTR_CLI_BINARY_PATH) throw Error('HTR_CLI_BINARY_PATH environment variable not set. This should point to the llama-mtmd-cli binary.');
+	if (!envVariables.HTR_CLI_MODELS_FOLDER) throw Error('HTR_CLI_MODELS_FOLDER environment variable not set. This should point to the folder containing the AI models.');
 };

 const main = async () => {
@@ -2,7 +2,7 @@ import { readFile } from 'fs-extra';
 import HtrCli from './HtrCli';

 describe('HtrCli', () => {
-	const dt = new HtrCli('', '');
+	const dt = new HtrCli({ htrCliImagesFolder: '', binaryPath: '', modelsFolder: '' });
 	it('should parse multiline result', async () => {
 		const testCase = await readFile('./test-cases/1.txt');
 		const result = dt.cleanUpResult(testCase.toString());
@@ -1,29 +1,41 @@
 import Logger from '@joplin/utils/Logger';
 import { commandToString, execCommand } from '@joplin/utils';
 import { WorkHandler } from '../types';
+import { basename } from 'path';

 const logger = Logger.create('HtrCli');

+const systemPrompt = 'SYSTEM: you are an agent of a OCR system. Your job is to be concise and correct. You should NEVER deviate from the content of the image. You should NEVER add any context or new information. Your only job should be to transcribe the text presented in the image as text without anything new information. The output for it should be inside triple backticks like: ```{{example}}```. If you find no text, output an empty code block: ``````. Your turn:';
+
+export interface HtrCliOptions {
+	htrCliImagesFolder: string;
+	binaryPath: string;
+	modelsFolder: string;
+}
+
 export default class HtrCli implements WorkHandler {

-	private htrCliDockerImage: string;
-	private htrCliImagesFolder: string;
+	private options: HtrCliOptions;

-	public constructor(htrCliDockerImage: string, htrCliImagesFolder: string) {
-		this.htrCliDockerImage = htrCliDockerImage;
-		this.htrCliImagesFolder = htrCliImagesFolder;
+	public constructor(options: HtrCliOptions) {
+		this.options = options;
 	}

 	public async init() {
-		logger.info('Loading');
-		const result = await execCommand(['docker', 'pull', this.htrCliDockerImage], { quiet: true });
-		logger.info('Finished loading: ', result);
+		logger.info('Using embedded llama.cpp binary');
 	}

 	public async run(imageName: string) {
-		const command = ['docker', 'run', '--rm', '-t', '-v', `${this.htrCliImagesFolder}:/images`, this.htrCliDockerImage, imageName];
-
 		logger.info('Running transcription...');
+
+		// Reject any imageName that is not a bare file name, to prevent path traversal attacks
+		const sanitizedImageName = basename(imageName);
+		if (sanitizedImageName !== imageName || imageName.includes('..')) {
+			throw new Error(`Invalid image name: ${imageName}`);
+		}
+
+		const command = this.buildCommand(imageName);
+
 		logger.info(`Command: ${commandToString(command[0], command.slice(1))}`);
 		const result = await execCommand(command, { quiet: true });

@@ -31,6 +43,22 @@ export default class HtrCli implements WorkHandler {
 		return this.cleanUpResult(result);
 	}

+	private buildCommand(imageName: string): string[] {
+		const { binaryPath, modelsFolder, htrCliImagesFolder } = this.options;
+		return [
+			binaryPath,
+			'-m', `${modelsFolder}/Model-7.6B-Q4_K_M.gguf`,
+			'--mmproj', `${modelsFolder}/mmproj-model-f16.gguf`,
+			'-c', '4096',
+			'--temp', '0.05',
+			'--top-p', '0.8',
+			'--top-k', '100',
+			'--repeat-penalty', '1.05',
+			'--image', `${htrCliImagesFolder}/${imageName}`,
+			'-p', systemPrompt,
+		];
+	}
+
 	public cleanUpResult(transcriptionAndLogs: string) {
 		const s1 = transcriptionAndLogs.split(/image decoded.*/);
 		// Before the last `image decoded` line it is all logs generated by the transcription tool
@@ -6,8 +6,9 @@ export const defaultEnvValues: EnvVariables = {
 	QUEUE_TTL: 15 * Minute,
 	QUEUE_RETRY_COUNT: 2,
 	QUEUE_MAINTENANCE_INTERVAL: 60 * Second,
-	HTR_CLI_DOCKER_IMAGE: 'joplin/htr-cli:latest',
 	HTR_CLI_IMAGES_FOLDER: '',
+	HTR_CLI_BINARY_PATH: '',
+	HTR_CLI_MODELS_FOLDER: '',
 	QUEUE_DRIVER: 'pg', // 'sqlite'
 	QUEUE_DATABASE_PASSWORD: '',
 	QUEUE_DATABASE_NAME: '',
@@ -25,8 +26,9 @@ export interface EnvVariables {
 	QUEUE_TTL: number;
 	QUEUE_RETRY_COUNT: number;
 	QUEUE_MAINTENANCE_INTERVAL: number;
-	HTR_CLI_DOCKER_IMAGE: string;
 	HTR_CLI_IMAGES_FOLDER: string;
+	HTR_CLI_BINARY_PATH: string;
+	HTR_CLI_MODELS_FOLDER: string;
 	QUEUE_DRIVER: string;
 	QUEUE_DATABASE_PASSWORD: string;
 	QUEUE_DATABASE_NAME: string;
@@ -36,7 +36,7 @@ describe('JobProcessor', () => {

 	skipByDefault('should execute work on job in the queue', async () => {
 		jest.useRealTimers();
-		const tw = new JobProcessor(queue, new HtrCli('joplin/htr-cli:latest', join(process.cwd(), 'images')), new FileStorage(), 1000);
+		const tw = new JobProcessor(queue, new HtrCli({ htrCliImagesFolder: join(process.cwd(), 'images'), binaryPath: '/opt/llama/build/bin/llama-mtmd-cli', modelsFolder: '/opt/models' }), new FileStorage(), 1000);
 		await tw.init();

 		await copy(join('images', 'htr_sample.png'), join('images', 'htr_sample_copy.png'));
@@ -59,7 +59,7 @@ describe('JobProcessor', () => {

 	skipByDefault('should execute work on job in the queue even if one fails', async () => {
 		jest.useRealTimers();
-		const tw = new JobProcessor(queue, new HtrCli('joplin/htr-cli:latest', join(process.cwd(), 'images')), new FileStorage(), 1000);
+		const tw = new JobProcessor(queue, new HtrCli({ htrCliImagesFolder: join(process.cwd(), 'images'), binaryPath: '/opt/llama/build/bin/llama-mtmd-cli', modelsFolder: '/opt/models' }), new FileStorage(), 1000);
 		await tw.init();
 		await copy(join('images', 'htr_sample.png'), join('images', 'htr_sample_copy_2.png'));

@@ -84,7 +84,7 @@ describe('JobProcessor', () => {

 	skipByDefault('should remove file sent to queue if job is completed', async () => {
 		jest.useRealTimers();
-		const tw = new JobProcessor(queue, new HtrCli('joplin/htr-cli:latest', join(process.cwd(), 'images')), new FileStorage(), 1000);
+		const tw = new JobProcessor(queue, new HtrCli({ htrCliImagesFolder: join(process.cwd(), 'images'), binaryPath: '/opt/llama/build/bin/llama-mtmd-cli', modelsFolder: '/opt/models' }), new FileStorage(), 1000);
 		await tw.init();
 		const imagePath = join('images', 'htr_sample_copy_3.png');
 		await copy(join('images', 'htr_sample.png'), imagePath);
@@ -112,7 +112,7 @@ describe('JobProcessor', () => {
 		const fileStorage = new FileStorage();
 		const mockedFileStorageRemove = jest.fn();
 		fileStorage.remove = mockedFileStorageRemove;
-		const tw = new JobProcessor(queue, new HtrCli('joplin/htr-cli:latest', join(process.cwd(), 'images')), fileStorage, 1000);
+		const tw = new JobProcessor(queue, new HtrCli({ htrCliImagesFolder: join(process.cwd(), 'images'), binaryPath: '/opt/llama/build/bin/llama-mtmd-cli', modelsFolder: '/opt/models' }), fileStorage, 1000);
 		await tw.init();

 		// file doesn't exist to force a fail, but the call to remove the file should still exist