diff --git a/.env-sample b/.env-sample index 22ebd4d026..6d71f86897 100644 --- a/.env-sample +++ b/.env-sample @@ -31,6 +31,7 @@ # QUEUE_DATABASE_PASSWORD=transcribe # QUEUE_DATABASE_PORT=5431 # HTR_CLI_IMAGES_FOLDER=/home/user/images_storage +# HTR_CLI_MODELS_FOLDER=/home/user/transcribe_models # ============================================================================= # DEV CONFIG EXAMPLE diff --git a/Dockerfile.transcribe b/Dockerfile.transcribe index fda3d45aa7..6fc7e73476 100644 --- a/Dockerfile.transcribe +++ b/Dockerfile.transcribe @@ -2,25 +2,24 @@ FROM node:24-bullseye RUN apt-get update \ && apt-get install -y \ - ca-certificates curl \ - python3 tini - -## install docker -RUN install -m 0755 -d /etc/apt/keyrings -RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc -RUN chmod a+r /etc/apt/keyrings/docker.asc -RUN echo \ - "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/debian \ - $(. 
/etc/os-release && echo bullseye) stable" | \ - tee /etc/apt/sources.list.d/docker.list > /dev/null -RUN apt-get update \ - && apt-get install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin \ + ca-certificates curl wget unzip \ + python3 tini \ && rm -rf /var/lib/apt/lists/* ENV NODE_ENV=production RUN corepack enable +# Download llama.cpp binary +WORKDIR /opt/llama +RUN wget -q https://github.com/ggml-org/llama.cpp/releases/download/b5449/llama-b5449-bin-ubuntu-x64.zip \ + && unzip llama-b5449-bin-ubuntu-x64.zip \ + && rm llama-b5449-bin-ubuntu-x64.zip \ + && chmod +x /opt/llama/build/bin/llama-mtmd-cli + +# Create non-root user for security +RUN groupadd -r transcribe && useradd -r -g transcribe transcribe + WORKDIR /app COPY .yarn/releases ./.yarn/releases @@ -44,7 +43,17 @@ RUN BUILD_SEQUENCIAL=1 yarn install --inline-builds \ && yarn cache clean \ && rm -rf .yarn/berry +# Create images directory and set permissions +RUN mkdir -p /app/packages/transcribe/images \ + && chown -R transcribe:transcribe /app/packages/transcribe/images + WORKDIR /app/packages/transcribe +# Switch to non-root user +USER transcribe + +# Set environment variable for embedded llama.cpp binary +ENV HTR_CLI_BINARY_PATH=/opt/llama/build/bin/llama-mtmd-cli + # Start the Node.js application CMD ["yarn", "start"] diff --git a/docker-compose.server.yml b/docker-compose.server.yml index 521cd95549..2ef75ca25f 100644 --- a/docker-compose.server.yml +++ b/docker-compose.server.yml @@ -84,8 +84,8 @@ services: profiles: - full volumes: - - /var/run/docker.sock:/var/run/docker.sock - ${HTR_CLI_IMAGES_FOLDER}:/app/packages/transcribe/images + - ${HTR_CLI_MODELS_FOLDER}:/opt/models:ro depends_on: - transcribe-db ports: @@ -94,6 +94,16 @@ services: - transcribe-network - shared-network restart: unless-stopped + # Security: limit resources to prevent runaway processes + deploy: + resources: + limits: + memory: 16G + cpus: '4' + # Security: read-only root filesystem 
with only images folder writable + read_only: true + tmpfs: + - /tmp environment: - APP_PORT=4567 - DB_CLIENT=pg @@ -103,5 +113,6 @@ services: - QUEUE_DATABASE_PORT=${QUEUE_DATABASE_PORT} - QUEUE_DATABASE_HOST=transcribe-db - API_KEY=${TRANSCRIBE_API_KEY} - - HTR_CLI_IMAGES_FOLDER=${HTR_CLI_IMAGES_FOLDER} + - HTR_CLI_IMAGES_FOLDER=/app/packages/transcribe/images + - HTR_CLI_MODELS_FOLDER=/opt/models diff --git a/packages/tools/cspell/dictionary4.txt b/packages/tools/cspell/dictionary4.txt index 3fac75003f..46f504271d 100644 --- a/packages/tools/cspell/dictionary4.txt +++ b/packages/tools/cspell/dictionary4.txt @@ -251,5 +251,7 @@ codegen analyzed Perfetto appmodules +mtmd +gguf armor clearsign diff --git a/packages/transcribe/Dockerfile.htr-cli b/packages/transcribe/Dockerfile.htr-cli deleted file mode 100644 index d37a4694a0..0000000000 --- a/packages/transcribe/Dockerfile.htr-cli +++ /dev/null @@ -1,26 +0,0 @@ -FROM bitnami/minideb:bookworm - -RUN apt-get update && apt-get install -y \ - build-essential \ - cmake \ - git \ - wget \ - unzip \ - && rm -rf /var/lib/apt/lists/* - -WORKDIR /app -RUN wget -q https://github.com/ggml-org/llama.cpp/releases/download/b5449/llama-b5449-bin-ubuntu-x64.zip - -RUN mkdir /models/ -RUN wget -q -O /models/Model-7.6B-Q4_K_M.gguf https://huggingface.co/openbmb/MiniCPM-o-2_6-gguf/resolve/main/Model-7.6B-Q4_K_M.gguf -RUN wget -q -O /models/mmproj-model-f16.gguf https://huggingface.co/openbmb/MiniCPM-o-2_6-gguf/resolve/main/mmproj-model-f16.gguf - -WORKDIR /app -RUN unzip llama-b5449-bin-ubuntu-x64.zip -WORKDIR /app/build/bin - -# Create an entrypoint script -COPY entrypoint.sh /entrypoint.sh -RUN chmod +x /entrypoint.sh - -ENTRYPOINT ["/entrypoint.sh"] \ No newline at end of file diff --git a/packages/transcribe/README.md b/packages/transcribe/README.md index 2fc1deddee..04ad243758 100644 --- a/packages/transcribe/README.md +++ b/packages/transcribe/README.md @@ -2,16 +2,33 @@ ## Configure Docker for Transcribe +The transcribe server 
embeds the llama.cpp binary directly in the Docker image. The AI models must be downloaded separately and mounted as a volume. + +### 1. Download the models + +Create a directory for the models and download them: + +```shell +mkdir -p ./data/transcribe-models +wget -O ./data/transcribe-models/Model-7.6B-Q4_K_M.gguf https://huggingface.co/openbmb/MiniCPM-o-2_6-gguf/resolve/main/Model-7.6B-Q4_K_M.gguf +wget -O ./data/transcribe-models/mmproj-model-f16.gguf https://huggingface.co/openbmb/MiniCPM-o-2_6-gguf/resolve/main/mmproj-model-f16.gguf +``` + +### 2. Configure environment + 1. Copy `.env-transcribe-sample` to your Docker configuration directory. 2. Rename it to `.env-transcribe`. -3. Set `HTR_CLI_IMAGES_FOLDER` to the full path of the folder where images will be stored. This folder must be outside the Docker container. -4. Test the server with the default configuration: + +### 3. Run the server + +The models directory on your host is mounted into the container at `/opt/models`. The server reads `HTR_CLI_MODELS_FOLDER` and `HTR_CLI_IMAGES_FOLDER` from inside the container, so both must be container paths (`/opt/models` and `/app/packages/transcribe/images` in the example below), not host paths. With `docker-compose.server.yml`, the values of these variables in `.env` are host paths that are used only as the source of the volume mounts. ```shell docker build -f ./Dockerfile.transcribe -t transcribe . docker run --env-file .env-transcribe -p 4567:4567 \ - -v /var/run/docker.sock:/var/run/docker.sock \ -v ./packages/transcribe/images:/app/packages/transcribe/images \ + -v ./data/transcribe-models:/opt/models:ro \ + -e HTR_CLI_MODELS_FOLDER=/opt/models \ transcribe ``` @@ -29,6 +46,15 @@ The minimal configuration is provided in `.env-sample` and `docker-compose.serve For advanced configuration, refer to `.env-sample-transcribe`.
+## Security + +The transcribe container runs with these security measures: + +- **Non-root user**: The application runs as the `transcribe` user, not root +- **Read-only filesystem**: When started with `docker-compose.server.yml`, the container's root filesystem is read-only (only `/app/packages/transcribe/images` and `/tmp` are writable) +- **Resource limits**: `docker-compose.server.yml` sets memory and CPU limits to prevent runaway processes +- **No Docker socket**: Unlike previous versions, no Docker socket mount is required + --- # Development Setup @@ -56,9 +82,15 @@ The queue driver can be **SQLite** or **PostgreSQL**: From `packages/transcribe`, run: ```shell -npm run start +yarn start ``` +### Environment variables + +- `HTR_CLI_BINARY_PATH`: Path to the `llama-mtmd-cli` binary +- `HTR_CLI_MODELS_FOLDER`: Path to the directory containing `Model-7.6B-Q4_K_M.gguf` and `mmproj-model-f16.gguf` +- `HTR_CLI_IMAGES_FOLDER`: Path where uploaded images are stored + --- # API Endpoints diff --git a/packages/transcribe/entrypoint.sh b/packages/transcribe/entrypoint.sh deleted file mode 100755 index 0af09ab48c..0000000000 --- a/packages/transcribe/entrypoint.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/bash -if [ ! -f "/images/$1" ]; then - echo "Error: Image file /images/$1 does not exist. Check if HTR_CLI_IMAGES_FOLDER environment variable is set correctly." - exit 1 -fi - -./llama-mtmd-cli -m /models/Model-7.6B-Q4_K_M.gguf --mmproj /models/mmproj-model-f16.gguf -c 4096 --temp 0.05 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image /images/"$1" -p "SYSTEM: you are an agent of a OCR system. Your job is to be concise and correct. You should NEVER deviate from the content of the image. You should NEVER add any context or new information. Your only job should be to transcribe the text presented in the image as text without anything new information. The output for it should be inside triple backticks like: \`\`\`{{example}}\`\`\`. If you find no text, output \`\`\`\`\`\`..
Your turn:" \ No newline at end of file diff --git a/packages/transcribe/src/api/app.ts b/packages/transcribe/src/api/app.ts index 302002d40f..3ba3c2a4b6 100644 --- a/packages/transcribe/src/api/app.ts +++ b/packages/transcribe/src/api/app.ts @@ -34,7 +34,11 @@ const init = async (logger: LoggerWrapper) => { app.context.queue = queue; app.context.storage = fileStorage; - const htrCli = new HtrCli(envVariables.HTR_CLI_DOCKER_IMAGE, envVariables.HTR_CLI_IMAGES_FOLDER); + const htrCli = new HtrCli({ + htrCliImagesFolder: envVariables.HTR_CLI_IMAGES_FOLDER, + binaryPath: envVariables.HTR_CLI_BINARY_PATH, + modelsFolder: envVariables.HTR_CLI_MODELS_FOLDER, + }); const jobProcessor = new JobProcessor(queue, htrCli, fileStorage); @@ -46,6 +50,8 @@ const init = async (logger: LoggerWrapper) => { const checkServerConfigurations = (envVariables: EnvVariables) => { if (!envVariables.API_KEY) throw Error('API_KEY environment variable not set.'); if (!envVariables.HTR_CLI_IMAGES_FOLDER) throw Error('HTR_CLI_IMAGES_FOLDER environment variable not set. This should point to a folder where images will be stored.'); + if (!envVariables.HTR_CLI_BINARY_PATH) throw Error('HTR_CLI_BINARY_PATH environment variable not set. This should point to the llama-mtmd-cli binary.'); + if (!envVariables.HTR_CLI_MODELS_FOLDER) throw Error('HTR_CLI_MODELS_FOLDER environment variable not set. 
This should point to the folder containing the AI models.'); }; const main = async () => { diff --git a/packages/transcribe/src/core/HtrCli.test.ts b/packages/transcribe/src/core/HtrCli.test.ts index c87de53b49..52d895be1c 100644 --- a/packages/transcribe/src/core/HtrCli.test.ts +++ b/packages/transcribe/src/core/HtrCli.test.ts @@ -2,7 +2,7 @@ import { readFile } from 'fs-extra'; import HtrCli from './HtrCli'; describe('HtrCli', () => { - const dt = new HtrCli('', ''); + const dt = new HtrCli({ htrCliImagesFolder: '', binaryPath: '', modelsFolder: '' }); it('should parse multiline result', async () => { const testCase = await readFile('./test-cases/1.txt'); const result = dt.cleanUpResult(testCase.toString()); diff --git a/packages/transcribe/src/core/HtrCli.ts b/packages/transcribe/src/core/HtrCli.ts index 7784e07f6a..c11207cf5b 100644 --- a/packages/transcribe/src/core/HtrCli.ts +++ b/packages/transcribe/src/core/HtrCli.ts @@ -1,29 +1,41 @@ import Logger from '@joplin/utils/Logger'; import { commandToString, execCommand } from '@joplin/utils'; import { WorkHandler } from '../types'; +import { basename } from 'path'; const logger = Logger.create('HtrCli'); +const systemPrompt = 'SYSTEM: you are an agent of a OCR system. Your job is to be concise and correct. You should NEVER deviate from the content of the image. You should NEVER add any context or new information. Your only job should be to transcribe the text presented in the image as text without anything new information. The output for it should be inside triple backticks like: ```{{example}}```. If you find no text, output an empty code block: ``````. 
Your turn:'; + +export interface HtrCliOptions { + htrCliImagesFolder: string; + binaryPath: string; + modelsFolder: string; +} + export default class HtrCli implements WorkHandler { - private htrCliDockerImage: string; - private htrCliImagesFolder: string; + private options: HtrCliOptions; - public constructor(htrCliDockerImage: string, htrCliImagesFolder: string) { - this.htrCliDockerImage = htrCliDockerImage; - this.htrCliImagesFolder = htrCliImagesFolder; + public constructor(options: HtrCliOptions) { + this.options = options; } public async init() { - logger.info('Loading'); - const result = await execCommand(['docker', 'pull', this.htrCliDockerImage], { quiet: true }); - logger.info('Finished loading: ', result); + logger.info('Using embedded llama.cpp binary'); } public async run(imageName: string) { - const command = ['docker', 'run', '--rm', '-t', '-v', `${this.htrCliImagesFolder}:/images`, this.htrCliDockerImage, imageName]; - logger.info('Running transcription...'); + + // Only accept a bare file name: reject empty names, `.`/`..` and anything containing a path separator, so the --image argument cannot escape the images folder + const sanitizedImageName = basename(imageName); + if (!imageName || imageName === '.' || sanitizedImageName !== imageName || imageName.includes('..')) { + throw new Error(`Invalid image name: ${imageName}`); + } + + const command = this.buildCommand(sanitizedImageName); + logger.info(`Command: ${commandToString(command[0], command.slice(1))}`); const result = await execCommand(command, { quiet: true }); @@ -31,6 +43,22 @@ export default class HtrCli implements WorkHandler { return this.cleanUpResult(result); } + private buildCommand(imageName: string): string[] { + const { binaryPath, modelsFolder, htrCliImagesFolder } = this.options; + return [ + binaryPath, + '-m', `${modelsFolder}/Model-7.6B-Q4_K_M.gguf`, + '--mmproj', `${modelsFolder}/mmproj-model-f16.gguf`, + '-c', '4096', + '--temp', '0.05', + '--top-p', '0.8', + '--top-k', '100', + '--repeat-penalty', '1.05', + '--image', `${htrCliImagesFolder}/${imageName}`, + '-p', systemPrompt, + ]; + } + public
cleanUpResult(transcriptionAndLogs: string) { const s1 = transcriptionAndLogs.split(/image decoded.*/); // Before the last `image decoded` line it is all logs generated by the transcription tool diff --git a/packages/transcribe/src/env.ts b/packages/transcribe/src/env.ts index 893e9ff5d6..c17c1b5926 100644 --- a/packages/transcribe/src/env.ts +++ b/packages/transcribe/src/env.ts @@ -6,8 +6,9 @@ export const defaultEnvValues: EnvVariables = { QUEUE_TTL: 15 * Minute, QUEUE_RETRY_COUNT: 2, QUEUE_MAINTENANCE_INTERVAL: 60 * Second, - HTR_CLI_DOCKER_IMAGE: 'joplin/htr-cli:latest', HTR_CLI_IMAGES_FOLDER: '', + HTR_CLI_BINARY_PATH: '', + HTR_CLI_MODELS_FOLDER: '', QUEUE_DRIVER: 'pg', // 'sqlite' QUEUE_DATABASE_PASSWORD: '', QUEUE_DATABASE_NAME: '', @@ -25,8 +26,9 @@ export interface EnvVariables { QUEUE_TTL: number; QUEUE_RETRY_COUNT: number; QUEUE_MAINTENANCE_INTERVAL: number; - HTR_CLI_DOCKER_IMAGE: string; HTR_CLI_IMAGES_FOLDER: string; + HTR_CLI_BINARY_PATH: string; + HTR_CLI_MODELS_FOLDER: string; QUEUE_DRIVER: string; QUEUE_DATABASE_PASSWORD: string; QUEUE_DATABASE_NAME: string; diff --git a/packages/transcribe/src/workers/JobProcessor.test.ts b/packages/transcribe/src/workers/JobProcessor.test.ts index ea3fba8fc3..209b101665 100644 --- a/packages/transcribe/src/workers/JobProcessor.test.ts +++ b/packages/transcribe/src/workers/JobProcessor.test.ts @@ -36,7 +36,7 @@ describe('JobProcessor', () => { skipByDefault('should execute work on job in the queue', async () => { jest.useRealTimers(); - const tw = new JobProcessor(queue, new HtrCli('joplin/htr-cli:latest', join(process.cwd(), 'images')), new FileStorage(), 1000); + const tw = new JobProcessor(queue, new HtrCli({ htrCliImagesFolder: join(process.cwd(), 'images'), binaryPath: '/opt/llama/build/bin/llama-mtmd-cli', modelsFolder: '/opt/models' }), new FileStorage(), 1000); await tw.init(); await copy(join('images', 'htr_sample.png'), join('images', 'htr_sample_copy.png')); @@ -59,7 +59,7 @@ describe('JobProcessor', 
() => { skipByDefault('should execute work on job in the queue even if one fails', async () => { jest.useRealTimers(); - const tw = new JobProcessor(queue, new HtrCli('joplin/htr-cli:latest', join(process.cwd(), 'images')), new FileStorage(), 1000); + const tw = new JobProcessor(queue, new HtrCli({ htrCliImagesFolder: join(process.cwd(), 'images'), binaryPath: '/opt/llama/build/bin/llama-mtmd-cli', modelsFolder: '/opt/models' }), new FileStorage(), 1000); await tw.init(); await copy(join('images', 'htr_sample.png'), join('images', 'htr_sample_copy_2.png')); @@ -84,7 +84,7 @@ describe('JobProcessor', () => { skipByDefault('should remove file sent to queue if job is completed', async () => { jest.useRealTimers(); - const tw = new JobProcessor(queue, new HtrCli('joplin/htr-cli:latest', join(process.cwd(), 'images')), new FileStorage(), 1000); + const tw = new JobProcessor(queue, new HtrCli({ htrCliImagesFolder: join(process.cwd(), 'images'), binaryPath: '/opt/llama/build/bin/llama-mtmd-cli', modelsFolder: '/opt/models' }), new FileStorage(), 1000); await tw.init(); const imagePath = join('images', 'htr_sample_copy_3.png'); await copy(join('images', 'htr_sample.png'), imagePath); @@ -112,7 +112,7 @@ describe('JobProcessor', () => { const fileStorage = new FileStorage(); const mockedFileStorageRemove = jest.fn(); fileStorage.remove = mockedFileStorageRemove; - const tw = new JobProcessor(queue, new HtrCli('joplin/htr-cli:latest', join(process.cwd(), 'images')), fileStorage, 1000); + const tw = new JobProcessor(queue, new HtrCli({ htrCliImagesFolder: join(process.cwd(), 'images'), binaryPath: '/opt/llama/build/bin/llama-mtmd-cli', modelsFolder: '/opt/models' }), fileStorage, 1000); await tw.init(); // file doesn't exist to force a fail, but the call to remove the file should still exist