Chore: Refactor Transcribe server (#14381)

This commit is contained in:
Laurent Cozic
2026-02-19 12:51:42 +00:00
committed by GitHub
parent 8aca7445c7
commit 5beccb9a86
12 changed files with 128 additions and 70 deletions
+1
View File
@@ -31,6 +31,7 @@
# QUEUE_DATABASE_PASSWORD=transcribe
# QUEUE_DATABASE_PORT=5431
# HTR_CLI_IMAGES_FOLDER=/home/user/images_storage
# HTR_CLI_MODELS_FOLDER=/home/user/transcribe_models
# =============================================================================
# DEV CONFIG EXAMPLE
+22 -13
View File
@@ -2,25 +2,24 @@ FROM node:24-bullseye
RUN apt-get update \
&& apt-get install -y \
ca-certificates curl \
python3 tini
## install docker
RUN install -m 0755 -d /etc/apt/keyrings
RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
RUN chmod a+r /etc/apt/keyrings/docker.asc
RUN echo \
"deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/debian \
$(. /etc/os-release && echo bullseye) stable" | \
tee /etc/apt/sources.list.d/docker.list > /dev/null
RUN apt-get update \
&& apt-get install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin \
ca-certificates curl wget unzip \
python3 tini \
&& rm -rf /var/lib/apt/lists/*
ENV NODE_ENV=production
RUN corepack enable
# Download llama.cpp binary
WORKDIR /opt/llama
RUN wget -q https://github.com/ggml-org/llama.cpp/releases/download/b5449/llama-b5449-bin-ubuntu-x64.zip \
&& unzip llama-b5449-bin-ubuntu-x64.zip \
&& rm llama-b5449-bin-ubuntu-x64.zip \
&& chmod +x /opt/llama/build/bin/llama-mtmd-cli
# Create non-root user for security
RUN groupadd -r transcribe && useradd -r -g transcribe transcribe
WORKDIR /app
COPY .yarn/releases ./.yarn/releases
@@ -44,7 +43,17 @@ RUN BUILD_SEQUENCIAL=1 yarn install --inline-builds \
&& yarn cache clean \
&& rm -rf .yarn/berry
# Create images directory and set permissions
RUN mkdir -p /app/packages/transcribe/images \
&& chown -R transcribe:transcribe /app/packages/transcribe/images
WORKDIR /app/packages/transcribe
# Switch to non-root user
USER transcribe
# Set environment variable for embedded llama.cpp binary
ENV HTR_CLI_BINARY_PATH=/opt/llama/build/bin/llama-mtmd-cli
# Start the Node.js application
CMD ["yarn", "start"]
+13 -2
View File
@@ -84,8 +84,8 @@ services:
profiles:
- full
volumes:
- /var/run/docker.sock:/var/run/docker.sock
- ${HTR_CLI_IMAGES_FOLDER}:/app/packages/transcribe/images
- ${HTR_CLI_MODELS_FOLDER}:/opt/models:ro
depends_on:
- transcribe-db
ports:
@@ -94,6 +94,16 @@ services:
- transcribe-network
- shared-network
restart: unless-stopped
# Security: limit resources to prevent runaway processes
deploy:
resources:
limits:
memory: 16G
cpus: '4'
# Security: read-only root filesystem with only images folder writable
read_only: true
tmpfs:
- /tmp
environment:
- APP_PORT=4567
- DB_CLIENT=pg
@@ -103,5 +113,6 @@ services:
- QUEUE_DATABASE_PORT=${QUEUE_DATABASE_PORT}
- QUEUE_DATABASE_HOST=transcribe-db
- API_KEY=${TRANSCRIBE_API_KEY}
- HTR_CLI_IMAGES_FOLDER=${HTR_CLI_IMAGES_FOLDER}
- HTR_CLI_IMAGES_FOLDER=/app/packages/transcribe/images
- HTR_CLI_MODELS_FOLDER=/opt/models
+2
View File
@@ -251,5 +251,7 @@ codegen
analyzed
Perfetto
appmodules
mtmd
gguf
armor
clearsign
-26
View File
@@ -1,26 +0,0 @@
# Image bundling a prebuilt llama.cpp release (b5449) with the GGUF model
# files from the openbmb/MiniCPM-o-2_6-gguf repository. The container is
# started through /entrypoint.sh from the directory holding the binaries.
FROM bitnami/minideb:bookworm

RUN apt-get update && apt-get install -y \
    build-essential \
    cmake \
    git \
    wget \
    unzip \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Download, extract and delete the release archive in a single layer so that
# the zip file does not remain in the final image.
RUN wget -q https://github.com/ggml-org/llama.cpp/releases/download/b5449/llama-b5449-bin-ubuntu-x64.zip \
    && unzip llama-b5449-bin-ubuntu-x64.zip \
    && rm llama-b5449-bin-ubuntu-x64.zip

# Model files are fetched in separate layers so that each large download is
# cached independently.
RUN mkdir /models/
RUN wget -q -O /models/Model-7.6B-Q4_K_M.gguf https://huggingface.co/openbmb/MiniCPM-o-2_6-gguf/resolve/main/Model-7.6B-Q4_K_M.gguf
RUN wget -q -O /models/mmproj-model-f16.gguf https://huggingface.co/openbmb/MiniCPM-o-2_6-gguf/resolve/main/mmproj-model-f16.gguf

# The entrypoint invokes ./llama-mtmd-cli relative to this directory
WORKDIR /app/build/bin

# Create an entrypoint script
COPY entrypoint.sh /entrypoint.sh
RUN chmod +x /entrypoint.sh

ENTRYPOINT ["/entrypoint.sh"]
+36 -4
View File
@@ -2,16 +2,33 @@
## Configure Docker for Transcribe
The transcribe server embeds the llama.cpp binary directly in the Docker image. The AI models must be downloaded separately and mounted as a volume.
### 1. Download the models
Create a directory for the models and download them:
```shell
mkdir -p ./data/transcribe-models
wget -O ./data/transcribe-models/Model-7.6B-Q4_K_M.gguf https://huggingface.co/openbmb/MiniCPM-o-2_6-gguf/resolve/main/Model-7.6B-Q4_K_M.gguf
wget -O ./data/transcribe-models/mmproj-model-f16.gguf https://huggingface.co/openbmb/MiniCPM-o-2_6-gguf/resolve/main/mmproj-model-f16.gguf
```
### 2. Configure environment
1. Copy `.env-transcribe-sample` to your Docker configuration directory.
2. Rename it to `.env-transcribe`.
3. Set `HTR_CLI_IMAGES_FOLDER` to the full path of the folder where images will be stored. This folder must be outside the Docker container.
4. Test the server with the default configuration:
### 3. Run the server
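The models directory on your host is mounted into the container at `/opt/models`. Inside the container, `HTR_CLI_MODELS_FOLDER` must be the in-container path (`/opt/models`), not the host path. In the Docker Compose setup, the `HTR_CLI_MODELS_FOLDER` value from your `.env` file is used only as the host side of the volume mount; the container itself always receives `/opt/models`.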
```shell
docker build -f ./Dockerfile.transcribe -t transcribe .
docker run --env-file .env-transcribe -p 4567:4567 \
-v /var/run/docker.sock:/var/run/docker.sock \
-v ./packages/transcribe/images:/app/packages/transcribe/images \
-v ./data/transcribe-models:/opt/models:ro \
-e HTR_CLI_MODELS_FOLDER=/opt/models \
transcribe
```
@@ -29,6 +46,15 @@ The minimal configuration is provided in `.env-sample` and `docker-compose.serve
For advanced configuration, refer to `.env-sample-transcribe`.
## Security
The transcribe container runs with these security measures:
- **Non-root user**: The application runs as the `transcribe` user, not root
- **Read-only filesystem**: The container filesystem is read-only (only `/app/packages/transcribe/images` and `/tmp` are writable)
- **Resource limits**: Memory and CPU limits prevent runaway processes
- **No Docker socket**: Unlike previous versions, no Docker socket mount is required
---
# Development Setup
@@ -56,9 +82,15 @@ The queue driver can be **SQLite** or **PostgreSQL**:
From `packages/transcribe`, run:
```shell
npm run start
yarn start
```
### Environment variables
- `HTR_CLI_BINARY_PATH`: Path to the llama-mtmd-cli binary
- `HTR_CLI_MODELS_FOLDER`: Path to the models directory
- `HTR_CLI_IMAGES_FOLDER`: Path where uploaded images are stored
---
# API Endpoints
-7
View File
@@ -1,7 +0,0 @@
#!/bin/bash

# Container entry point: runs llama-mtmd-cli on one image.
#
# Arguments:
#   $1 - name of the image file, looked up under /images
#
# Exits with status 1 when /images/$1 is not a regular file; otherwise the
# exit status is that of llama-mtmd-cli.

if [ ! -f "/images/$1" ]; then
    echo "Error: Image file /images/$1 does not exist. Check if HTR_CLI_IMAGES_FOLDER environment variable is set correctly."
    exit 1
fi

# `exec` replaces this shell with llama-mtmd-cli, so the tool itself receives
# signals sent to the container's main process (e.g. on `docker stop`).
exec ./llama-mtmd-cli \
    -m /models/Model-7.6B-Q4_K_M.gguf \
    --mmproj /models/mmproj-model-f16.gguf \
    -c 4096 \
    --temp 0.05 \
    --top-p 0.8 \
    --top-k 100 \
    --repeat-penalty 1.05 \
    --image /images/"$1" \
    -p "SYSTEM: you are an agent of a OCR system. Your job is to be concise and correct. You should NEVER deviate from the content of the image. You should NEVER add any context or new information. Your only job should be to transcribe the text presented in the image as text without anything new information. The output for it should be inside triple backticks like: \`\`\`{{example}}\`\`\`. If you find no text, output \`\`\`\`\`\`.. Your turn:"
+7 -1
View File
@@ -34,7 +34,11 @@ const init = async (logger: LoggerWrapper) => {
app.context.queue = queue;
app.context.storage = fileStorage;
const htrCli = new HtrCli(envVariables.HTR_CLI_DOCKER_IMAGE, envVariables.HTR_CLI_IMAGES_FOLDER);
const htrCli = new HtrCli({
htrCliImagesFolder: envVariables.HTR_CLI_IMAGES_FOLDER,
binaryPath: envVariables.HTR_CLI_BINARY_PATH,
modelsFolder: envVariables.HTR_CLI_MODELS_FOLDER,
});
const jobProcessor = new JobProcessor(queue, htrCli, fileStorage);
@@ -46,6 +50,8 @@ const init = async (logger: LoggerWrapper) => {
// Validates that every mandatory setting has a truthy value. The settings are
// checked in the order listed below and an Error is thrown for the first one
// that is missing.
const checkServerConfigurations = (envVariables: EnvVariables) => {
	const requiredSettings: [unknown, string][] = [
		[envVariables.API_KEY, 'API_KEY environment variable not set.'],
		[envVariables.HTR_CLI_IMAGES_FOLDER, 'HTR_CLI_IMAGES_FOLDER environment variable not set. This should point to a folder where images will be stored.'],
		[envVariables.HTR_CLI_BINARY_PATH, 'HTR_CLI_BINARY_PATH environment variable not set. This should point to the llama-mtmd-cli binary.'],
		[envVariables.HTR_CLI_MODELS_FOLDER, 'HTR_CLI_MODELS_FOLDER environment variable not set. This should point to the folder containing the AI models.'],
	];

	for (const [value, message] of requiredSettings) {
		if (!value) {
			throw new Error(message);
		}
	}
};
const main = async () => {
+1 -1
View File
@@ -2,7 +2,7 @@ import { readFile } from 'fs-extra';
import HtrCli from './HtrCli';
describe('HtrCli', () => {
const dt = new HtrCli('', '');
const dt = new HtrCli({ htrCliImagesFolder: '', binaryPath: '', modelsFolder: '' });
it('should parse multiline result', async () => {
const testCase = await readFile('./test-cases/1.txt');
const result = dt.cleanUpResult(testCase.toString());
+38 -10
View File
@@ -1,29 +1,41 @@
import Logger from '@joplin/utils/Logger';
import { commandToString, execCommand } from '@joplin/utils';
import { WorkHandler } from '../types';
import { basename } from 'path';
const logger = Logger.create('HtrCli');
const systemPrompt = 'SYSTEM: you are an agent of a OCR system. Your job is to be concise and correct. You should NEVER deviate from the content of the image. You should NEVER add any context or new information. Your only job should be to transcribe the text presented in the image as text without anything new information. The output for it should be inside triple backticks like: ```{{example}}```. If you find no text, output an empty code block: ``````. Your turn:';
// Settings passed to the HtrCli constructor; buildCommand() reads them to
// assemble the transcription command line.
export interface HtrCliOptions {
// Folder prefixed to the image name to form the value of the `--image` argument
htrCliImagesFolder: string;
// First element of the command array, i.e. the program that gets executed
binaryPath: string;
// Folder expected to contain `Model-7.6B-Q4_K_M.gguf` and `mmproj-model-f16.gguf`
modelsFolder: string;
}
export default class HtrCli implements WorkHandler {
private htrCliDockerImage: string;
private htrCliImagesFolder: string;
private options: HtrCliOptions;
public constructor(htrCliDockerImage: string, htrCliImagesFolder: string) {
this.htrCliDockerImage = htrCliDockerImage;
this.htrCliImagesFolder = htrCliImagesFolder;
public constructor(options: HtrCliOptions) {
this.options = options;
}
public async init() {
logger.info('Loading');
const result = await execCommand(['docker', 'pull', this.htrCliDockerImage], { quiet: true });
logger.info('Finished loading: ', result);
logger.info('Using embedded llama.cpp binary');
}
public async run(imageName: string) {
const command = ['docker', 'run', '--rm', '-t', '-v', `${this.htrCliImagesFolder}:/images`, this.htrCliDockerImage, imageName];
logger.info('Running transcription...');
// Sanitize imageName to prevent path traversal attacks
const sanitizedImageName = basename(imageName);
if (sanitizedImageName !== imageName || imageName.includes('..')) {
throw new Error(`Invalid image name: ${imageName}`);
}
const command = this.buildCommand(imageName);
logger.info(`Command: ${commandToString(command[0], command.slice(1))}`);
const result = await execCommand(command, { quiet: true });
@@ -31,6 +43,22 @@ export default class HtrCli implements WorkHandler {
return this.cleanUpResult(result);
}
// Assembles the argument vector used to transcribe one image: the configured
// binary, followed by the model and projector files from the models folder,
// a fixed set of numeric options, the image path and the system prompt.
private buildCommand(imageName: string): string[] {
	const modelPath = `${this.options.modelsFolder}/Model-7.6B-Q4_K_M.gguf`;
	const projectorPath = `${this.options.modelsFolder}/mmproj-model-f16.gguf`;
	const imagePath = `${this.options.htrCliImagesFolder}/${imageName}`;

	// Options whose values do not depend on the configuration or the image
	const fixedArgs = [
		'-c', '4096',
		'--temp', '0.05',
		'--top-p', '0.8',
		'--top-k', '100',
		'--repeat-penalty', '1.05',
	];

	return [
		this.options.binaryPath,
		'-m', modelPath,
		'--mmproj', projectorPath,
		...fixedArgs,
		'--image', imagePath,
		'-p', systemPrompt,
	];
}
public cleanUpResult(transcriptionAndLogs: string) {
const s1 = transcriptionAndLogs.split(/image decoded.*/);
// Before the last `image decoded` line it is all logs generated by the transcription tool
+4 -2
View File
@@ -6,8 +6,9 @@ export const defaultEnvValues: EnvVariables = {
QUEUE_TTL: 15 * Minute,
QUEUE_RETRY_COUNT: 2,
QUEUE_MAINTENANCE_INTERVAL: 60 * Second,
HTR_CLI_DOCKER_IMAGE: 'joplin/htr-cli:latest',
HTR_CLI_IMAGES_FOLDER: '',
HTR_CLI_BINARY_PATH: '',
HTR_CLI_MODELS_FOLDER: '',
QUEUE_DRIVER: 'pg', // 'sqlite'
QUEUE_DATABASE_PASSWORD: '',
QUEUE_DATABASE_NAME: '',
@@ -25,8 +26,9 @@ export interface EnvVariables {
QUEUE_TTL: number;
QUEUE_RETRY_COUNT: number;
QUEUE_MAINTENANCE_INTERVAL: number;
HTR_CLI_DOCKER_IMAGE: string;
HTR_CLI_IMAGES_FOLDER: string;
HTR_CLI_BINARY_PATH: string;
HTR_CLI_MODELS_FOLDER: string;
QUEUE_DRIVER: string;
QUEUE_DATABASE_PASSWORD: string;
QUEUE_DATABASE_NAME: string;
@@ -36,7 +36,7 @@ describe('JobProcessor', () => {
skipByDefault('should execute work on job in the queue', async () => {
jest.useRealTimers();
const tw = new JobProcessor(queue, new HtrCli('joplin/htr-cli:latest', join(process.cwd(), 'images')), new FileStorage(), 1000);
const tw = new JobProcessor(queue, new HtrCli({ htrCliImagesFolder: join(process.cwd(), 'images'), binaryPath: '/opt/llama/build/bin/llama-mtmd-cli', modelsFolder: '/opt/models' }), new FileStorage(), 1000);
await tw.init();
await copy(join('images', 'htr_sample.png'), join('images', 'htr_sample_copy.png'));
@@ -59,7 +59,7 @@ describe('JobProcessor', () => {
skipByDefault('should execute work on job in the queue even if one fails', async () => {
jest.useRealTimers();
const tw = new JobProcessor(queue, new HtrCli('joplin/htr-cli:latest', join(process.cwd(), 'images')), new FileStorage(), 1000);
const tw = new JobProcessor(queue, new HtrCli({ htrCliImagesFolder: join(process.cwd(), 'images'), binaryPath: '/opt/llama/build/bin/llama-mtmd-cli', modelsFolder: '/opt/models' }), new FileStorage(), 1000);
await tw.init();
await copy(join('images', 'htr_sample.png'), join('images', 'htr_sample_copy_2.png'));
@@ -84,7 +84,7 @@ describe('JobProcessor', () => {
skipByDefault('should remove file sent to queue if job is completed', async () => {
jest.useRealTimers();
const tw = new JobProcessor(queue, new HtrCli('joplin/htr-cli:latest', join(process.cwd(), 'images')), new FileStorage(), 1000);
const tw = new JobProcessor(queue, new HtrCli({ htrCliImagesFolder: join(process.cwd(), 'images'), binaryPath: '/opt/llama/build/bin/llama-mtmd-cli', modelsFolder: '/opt/models' }), new FileStorage(), 1000);
await tw.init();
const imagePath = join('images', 'htr_sample_copy_3.png');
await copy(join('images', 'htr_sample.png'), imagePath);
@@ -112,7 +112,7 @@ describe('JobProcessor', () => {
const fileStorage = new FileStorage();
const mockedFileStorageRemove = jest.fn();
fileStorage.remove = mockedFileStorageRemove;
const tw = new JobProcessor(queue, new HtrCli('joplin/htr-cli:latest', join(process.cwd(), 'images')), fileStorage, 1000);
const tw = new JobProcessor(queue, new HtrCli({ htrCliImagesFolder: join(process.cwd(), 'images'), binaryPath: '/opt/llama/build/bin/llama-mtmd-cli', modelsFolder: '/opt/models' }), fileStorage, 1000);
await tw.init();
// file doesn't exist to force a fail, but the call to remove the file should still exist