Files
Michele Dolfi 453db676ee feat: new ray orchestrator (#557)
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
2026-04-01 12:54:16 +02:00

243 lines
10 KiB
Bash

# Active settings: these are the only assignments in this file that are not commented out.
# Path to the Tesseract "tessdata" directory (presumably consumed by the Tesseract OCR backend — confirm).
TESSDATA_PREFIX=/usr/share/tesseract/tessdata/
# Number of Uvicorn worker processes.
# NOTE(review): Uvicorn documents --workers as not valid together with --reload, so with
# UVICORN_RELOAD=True below this value is presumably ignored — confirm.
UVICORN_WORKERS=2
# Uvicorn auto-reload switch.
# NOTE(review): auto-reload is normally a development-only option — confirm before reusing this file elsewhere.
UVICORN_RELOAD=True
# Logging configuration (case-insensitive)
# Options: WARNING, INFO, DEBUG (or warning, info, debug)
# DOCLING_SERVE_LOG_LEVEL=WARNING
# RQ Engine Redis Connection Pool Configuration
# Adjust these values based on your deployment scale:
# - Small (1-4 workers): Use defaults (50 connections, no timeouts)
# - Medium (5-10 workers): Set max_connections to 100
# - Large (10+ workers): Set max_connections to 150-200
# - Timeout settings: Only set if experiencing connection issues
# DOCLING_SERVE_ENG_RQ_REDIS_MAX_CONNECTIONS=50
# DOCLING_SERVE_ENG_RQ_REDIS_SOCKET_TIMEOUT=5.0
# DOCLING_SERVE_ENG_RQ_REDIS_SOCKET_CONNECT_TIMEOUT=5.0
# ============================================================================
# Configuration File Support
# ============================================================================
# Load configuration from a YAML or JSON file
# Environment variables take precedence over config file values
# DOCLING_SERVE_CONFIG_FILE=examples/config.yaml
# ============================================================================
# VLM Pipeline Control
# ============================================================================
# Default VLM preset to use when user specifies "default"
# DOCLING_SERVE_DEFAULT_VLM_PRESET=granite_docling
# List of allowed VLM preset IDs (JSON array or comma-separated)
# JSON format:
# DOCLING_SERVE_ALLOWED_VLM_PRESETS='["granite_docling", "custom_preset"]'
# Comma-separated format:
# DOCLING_SERVE_ALLOWED_VLM_PRESETS=granite_docling,custom_preset
# Custom VLM presets (JSON object)
# DOCLING_SERVE_CUSTOM_VLM_PRESETS='{"my_preset": {"engine": "openai", "model": "gpt-4-vision"}}'
# List of allowed VLM engine types (JSON array or comma-separated)
# DOCLING_SERVE_ALLOWED_VLM_ENGINES='["openai", "anthropic"]'
# DOCLING_SERVE_ALLOWED_VLM_ENGINES=openai,anthropic
# Whether users can specify custom VLM engine configurations
# DOCLING_SERVE_ALLOW_CUSTOM_VLM_CONFIG=false
# ============================================================================
# Picture Description Control
# ============================================================================
# DOCLING_SERVE_DEFAULT_PICTURE_DESCRIPTION_PRESET=smolvlm
# DOCLING_SERVE_ALLOWED_PICTURE_DESCRIPTION_PRESETS='["smolvlm", "custom"]'
# DOCLING_SERVE_CUSTOM_PICTURE_DESCRIPTION_PRESETS='{}'
# DOCLING_SERVE_ALLOWED_PICTURE_DESCRIPTION_ENGINES='["smolvlm"]'
# DOCLING_SERVE_ALLOW_CUSTOM_PICTURE_DESCRIPTION_CONFIG=false
# ============================================================================
# Code/Formula Control
# ============================================================================
# DOCLING_SERVE_DEFAULT_CODE_FORMULA_PRESET=default
# DOCLING_SERVE_ALLOWED_CODE_FORMULA_PRESETS='["default"]'
# DOCLING_SERVE_CUSTOM_CODE_FORMULA_PRESETS='{}'
# DOCLING_SERVE_ALLOWED_CODE_FORMULA_ENGINES='["default"]'
# DOCLING_SERVE_ALLOW_CUSTOM_CODE_FORMULA_CONFIG=false
# ============================================================================
# Table Structure Control
# ============================================================================
# DOCLING_SERVE_DEFAULT_TABLE_STRUCTURE_KIND=docling_tableformer
# DOCLING_SERVE_ALLOWED_TABLE_STRUCTURE_KINDS='["docling_tableformer", "approved_plugin"]'
# DOCLING_SERVE_ALLOWED_TABLE_STRUCTURE_KINDS=docling_tableformer,approved_plugin
# ============================================================================
# Layout Control
# ============================================================================
# DOCLING_SERVE_DEFAULT_LAYOUT_KIND=docling_layout_default
# DOCLING_SERVE_ALLOWED_LAYOUT_KINDS='["docling_layout_default", "layout_object_detection"]'
# DOCLING_SERVE_ALLOWED_LAYOUT_KINDS=docling_layout_default,layout_object_detection
# ============================================================================
# Ray Engine Configuration
# ============================================================================
# The Ray engine uses Ray Serve for autoscaling document processing with
# Redis for state management and fair task scheduling across tenants.
#
# To use: Set DOCLING_SERVE_ENG_KIND=ray
#
# Key Features:
# - Fair round-robin scheduling at task level
# - Per-tenant task queues and resource limits
# - Ray Serve autoscaling with persistent converters
# - Redis for HA state management (supports Sentinel/Cluster)
# - Fault tolerance with automatic retries
# ============================================================================
# --- Required Settings ---
# Redis URL for state management (REQUIRED)
# DOCLING_SERVE_ENG_RAY_REDIS_URL=redis://localhost:6379/
# Ray cluster address (REQUIRED)
# Use "auto" or "local" to start local Ray cluster
# Or provide Ray cluster address like "ray://host:10001"
# DOCLING_SERVE_ENG_RAY_ADDRESS=auto
# --- Redis Configuration ---
# Connection pool and timeout settings
# DOCLING_SERVE_ENG_RAY_REDIS_MAX_CONNECTIONS=50
# DOCLING_SERVE_ENG_RAY_REDIS_SOCKET_TIMEOUT=5.0
# DOCLING_SERVE_ENG_RAY_REDIS_SOCKET_CONNECT_TIMEOUT=5.0
# --- Result Storage ---
# How long to keep task results in Redis, in seconds (14400 = 4 hours)
# DOCLING_SERVE_ENG_RAY_RESULTS_TTL=14400
# DOCLING_SERVE_ENG_RAY_RESULTS_PREFIX=docling:ray:results
# --- Pub/Sub ---
# Redis channel for task status updates
# DOCLING_SERVE_ENG_RAY_SUB_CHANNEL=docling:ray:updates
# --- Fair Dispatcher ---
# How often to check for new tasks (seconds)
# DOCLING_SERVE_ENG_RAY_DISPATCHER_INTERVAL=2.0
# --- Per-Tenant Limits ---
# Maximum tasks being processed simultaneously per tenant
# DOCLING_SERVE_ENG_RAY_DEFAULT_MAX_CONCURRENT_TASKS=5
# Maximum tasks in queue per tenant (None = unlimited)
# DOCLING_SERVE_ENG_RAY_DEFAULT_MAX_QUEUED_TASKS=
# Return HTTP 429 when the queue limit is exceeded (requires DOCLING_SERVE_ENG_RAY_DEFAULT_MAX_QUEUED_TASKS to be set)
# DOCLING_SERVE_ENG_RAY_ENABLE_QUEUE_LIMIT_REJECTION=false
# Maximum documents being processed per tenant (None = unlimited)
# DOCLING_SERVE_ENG_RAY_DEFAULT_MAX_DOCUMENTS=
# Enable per-tenant document limits
# DOCLING_SERVE_ENG_RAY_ENABLE_DOCUMENT_LIMITS=false
# --- Ray Configuration ---
# Ray namespace for isolation
# DOCLING_SERVE_ENG_RAY_NAMESPACE=docling
# Ray runtime environment (JSON dict)
# DOCLING_SERVE_ENG_RAY_RUNTIME_ENV=
# --- Ray Serve Autoscaling ---
# Minimum number of Ray Serve replicas
# DOCLING_SERVE_ENG_RAY_MIN_WORKERS=1
# Maximum number of Ray Serve replicas
# DOCLING_SERVE_ENG_RAY_MAX_WORKERS=10
# Target concurrent requests per replica for autoscaling
# DOCLING_SERVE_ENG_RAY_TARGET_REQUESTS_PER_REPLICA=1
# Seconds to wait before scaling up (prevents flapping)
# DOCLING_SERVE_ENG_RAY_UPSCALE_DELAY_S=30.0
# Seconds to wait before scaling down
# DOCLING_SERVE_ENG_RAY_DOWNSCALE_DELAY_S=600.0
# CPUs to allocate per Ray Serve replica
# DOCLING_SERVE_ENG_RAY_NUM_CPUS_PER_ACTOR=1.0
# --- Fault Tolerance & Retry ---
# Maximum retries for failed tasks
# DOCLING_SERVE_ENG_RAY_MAX_TASK_RETRIES=3
# Seconds to wait between task retries
# DOCLING_SERVE_ENG_RAY_RETRY_DELAY=5.0
# Maximum retries per document within a task
# DOCLING_SERVE_ENG_RAY_MAX_DOCUMENT_RETRIES=2
# --- Ray Actor Configuration ---
# Max dispatcher actor restarts (-1 = unlimited for HA)
# DOCLING_SERVE_ENG_RAY_DISPATCHER_MAX_RESTARTS=-1
# Ray-level task retries for dispatcher operations
# DOCLING_SERVE_ENG_RAY_DISPATCHER_MAX_TASK_RETRIES=3
# --- Timeouts ---
# Maximum seconds per task (None = no limit)
# DOCLING_SERVE_ENG_RAY_TASK_TIMEOUT=3600.0
# Maximum seconds per document (None = no limit)
# DOCLING_SERVE_ENG_RAY_DOCUMENT_TIMEOUT=300.0
# Timeout for Redis operations in seconds
# DOCLING_SERVE_ENG_RAY_REDIS_OPERATION_TIMEOUT=30.0
# --- Health Checks ---
# Enable dispatcher heartbeat monitoring
# DOCLING_SERVE_ENG_RAY_ENABLE_HEARTBEAT=true
# --- Resource Management & Memory Monitoring ---
# Memory limit per Ray actor (e.g., "4GB")
# DOCLING_SERVE_ENG_RAY_MEMORY_LIMIT_PER_ACTOR=
# Ray object store memory (e.g., "10GB")
# DOCLING_SERVE_ENG_RAY_OBJECT_STORE_MEMORY=
# Enable out-of-memory detection and recovery
# DOCLING_SERVE_ENG_RAY_ENABLE_OOM_PROTECTION=true
# Memory usage threshold for warnings (0.0-1.0)
# DOCLING_SERVE_ENG_RAY_MEMORY_WARNING_THRESHOLD=0.9
# --- Scratch Directory ---
# Directory for temporary files during processing
# DOCLING_SERVE_ENG_RAY_SCRATCH_DIR=
# --- Logging ---
# Logging level for Ray orchestrator
# DOCLING_SERVE_ENG_RAY_LOG_LEVEL=INFO
# --- Tenant Identification ---
# Header name for tenant identification (for fair scheduling)
# DOCLING_SERVE_ENG_RAY_TENANT_ID_HEADER=X-Tenant-Id
# ============================================================================
# Example Configurations
# ============================================================================
# Small deployment (1-2 workers) with local Ray:
# DOCLING_SERVE_ENG_KIND=ray
# DOCLING_SERVE_ENG_RAY_REDIS_URL=redis://localhost:6379/
# DOCLING_SERVE_ENG_RAY_ADDRESS=auto
# DOCLING_SERVE_ENG_RAY_MIN_WORKERS=1
# DOCLING_SERVE_ENG_RAY_MAX_WORKERS=2
# Medium deployment (5-10 workers) with existing Ray cluster:
# DOCLING_SERVE_ENG_KIND=ray
# DOCLING_SERVE_ENG_RAY_REDIS_URL=redis://redis-cluster:6379/
# DOCLING_SERVE_ENG_RAY_ADDRESS=ray://ray-head:10001
# DOCLING_SERVE_ENG_RAY_MIN_WORKERS=2
# DOCLING_SERVE_ENG_RAY_MAX_WORKERS=10
# DOCLING_SERVE_ENG_RAY_DEFAULT_MAX_QUEUED_TASKS=50
# DOCLING_SERVE_ENG_RAY_ENABLE_QUEUE_LIMIT_REJECTION=true
# Large deployment (10+ workers) with Redis Sentinel and Ray cluster:
# DOCLING_SERVE_ENG_KIND=ray
# DOCLING_SERVE_ENG_RAY_REDIS_URL=redis+sentinel://sentinel-host:26379/mymaster/0
# DOCLING_SERVE_ENG_RAY_ADDRESS=ray://ray-head:10001
# DOCLING_SERVE_ENG_RAY_MIN_WORKERS=5
# DOCLING_SERVE_ENG_RAY_MAX_WORKERS=20
# DOCLING_SERVE_ENG_RAY_REDIS_MAX_CONNECTIONS=150
# ============================================================================
# Usage with Tenant Identification
# ============================================================================
# When using the ray engine, you can identify tenants via HTTP header:
#
# curl -X POST "http://localhost:5001/v1/convert/source" \
# -H "X-Tenant-Id: alice@example.com" \
# -H "Content-Type: application/json" \
# -d '{"sources": [{"url": "https://example.com/doc.pdf"}]}'
#
# Without the header, all requests are treated as "default" tenant.
# Fair scheduling ensures tasks from different tenants are processed fairly.