From be085c0e39dd5c51572b883d0f795c5a7abefd5d Mon Sep 17 00:00:00 2001
From: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>
Date: Fri, 19 Dec 2025 13:16:59 +0100
Subject: [PATCH] docs(RTX): Guidelines for best performance on RTX GPUs
 (#2765)

* add RTX docs

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* add artwork and fix title

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* fix series definition

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* add nvidia logo and update todo

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

---------

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
---
 docs/assets/nvidia_logo_green.svg |  39 +++++
 docs/getting_started/rtx.md       | 261 ++++++++++++++++++++++++++++++
 mkdocs.yml                        |   1 +
 3 files changed, 301 insertions(+)
 create mode 100644 docs/assets/nvidia_logo_green.svg
 create mode 100644 docs/getting_started/rtx.md
diff --git a/docs/assets/nvidia_logo_green.svg b/docs/assets/nvidia_logo_green.svg
new file mode 100644
index 00000000..ffd2e63c
--- /dev/null
+++ b/docs/assets/nvidia_logo_green.svg
@@ -0,0 +1,39 @@
+<?xml version="1.0" encoding="utf-8"?>
+<!-- Generator: Adobe Illustrator 16.0.0, SVG Export Plug-In . SVG Version: 6.00 Build 0)  -->
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<svg version="1.1" id="svg2" xmlns:svg="http://www.w3.org/2000/svg"
+	 xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" x="0px" y="0px" width="386.648px"
+	 height="290.297px" viewBox="0 0 414.836 321.809" enable-background="new 35.188 31.512 351.46 258.785"
+	 xml:space="preserve">
+<title  id="title4">generated by pstoedit version:3.44 from NVBadge_2D.eps</title>
+  <rect
+    x="0"
+    y="0"
+    width="414.836"
+    height="321.809"
+    fill="#77B900" />
+
+<path id="path17" fill="#fff" d="M384.195,282.109c0,3.771-2.769,6.302-6.047,6.302v-0.023c-3.371,0.023-6.089-2.508-6.089-6.278
+	c0-3.769,2.718-6.293,6.089-6.293C381.427,275.816,384.195,278.34,384.195,282.109z M386.648,282.109c0-5.175-4.02-8.179-8.5-8.179
+	c-4.511,0-8.531,3.004-8.531,8.179c0,5.172,4.021,8.188,8.531,8.188C382.629,290.297,386.648,287.281,386.648,282.109
+	 M376.738,282.801h0.91l2.109,3.703h2.316l-2.336-3.859c1.207-0.086,2.2-0.661,2.2-2.286c0-2.019-1.392-2.668-3.75-2.668h-3.411
+	v8.813h1.961V282.801 M376.738,281.309v-2.122h1.364c0.742,0,1.753,0.06,1.753,0.965c0,0.985-0.523,1.157-1.398,1.157H376.738"/>
+<path id="path19" fill="#fff" d="M329.406,237.027l10.598,28.993H318.48L329.406,237.027z M318.056,225.738l-24.423,61.88h17.246l3.863-10.934
+	h28.903l3.656,10.934h18.722l-24.605-61.888L318.056,225.738z M269.023,287.641h17.497v-61.922l-17.5-0.004L269.023,287.641z
+	 M147.556,225.715l-14.598,49.078l-13.984-49.074l-18.879-0.004l19.972,61.926h25.207l20.133-61.926H147.556z M218.281,239.199h7.52
+	c10.91,0,17.966,4.898,17.966,17.609c0,12.714-7.056,17.613-17.966,17.613h-7.52V239.199z M200.931,225.715v61.926h28.366
+	c15.113,0,20.048-2.512,25.384-8.148c3.769-3.957,6.207-12.641,6.207-22.134c0-8.707-2.063-16.468-5.66-21.304
+	c-6.481-8.649-15.817-10.34-29.75-10.34H200.931z M35.188,225.629v62.012h17.645v-47.086l13.672,0.004
+	c4.527,0,7.754,1.128,9.934,3.457c2.765,2.945,3.894,7.699,3.894,16.395v27.23h17.098v-34.262c0-24.453-15.586-27.75-30.836-27.75
+	H35.188z M172.771,225.715l0.007,61.926h17.489v-61.926H172.771z"/>
+<path id="path21" fill="#fff" d="M82.211,102.414c0,0,22.504-33.203,67.437-36.638V53.73
+	c-49.769,3.997-92.867,46.149-92.867,46.149s24.41,70.565,92.867,77.026v-12.804C99.411,157.781,82.211,102.414,82.211,102.414z
+	 M149.648,138.637v11.726c-37.968-6.769-48.507-46.237-48.507-46.237s18.23-20.195,48.507-23.47v12.867
+	c-0.023,0-0.039-0.007-0.058-0.007c-15.891-1.907-28.305,12.938-28.305,12.938S128.243,131.445,149.648,138.637 M149.648,31.512
+	V53.73c1.461-0.112,2.922-0.207,4.391-0.257c56.582-1.907,93.449,46.406,93.449,46.406s-42.343,51.488-86.457,51.488
+	c-4.043,0-7.828-0.375-11.383-1.005v13.739c3.04,0.386,6.192,0.613,9.481,0.613c41.051,0,70.738-20.965,99.484-45.778
+	c4.766,3.817,24.278,13.103,28.289,17.168c-27.332,22.883-91.031,41.329-127.144,41.329c-3.481,0-6.824-0.211-10.11-0.528v19.306
+	h156.032V31.512H149.648z M149.648,80.656V65.777c1.446-0.101,2.903-0.179,4.391-0.226c40.688-1.278,67.382,34.965,67.382,34.965
+	s-28.832,40.043-59.746,40.043c-4.449,0-8.438-0.715-12.028-1.922V93.523c15.84,1.914,19.028,8.911,28.551,24.786l21.18-17.859
+	c0,0-15.461-20.277-41.524-20.277C155.021,80.172,152.31,80.371,149.648,80.656"/>
+</svg>
diff --git a/docs/getting_started/rtx.md b/docs/getting_started/rtx.md
new file mode 100644
index 00000000..7f9fde1e
--- /dev/null
+++ b/docs/getting_started/rtx.md
@@ -0,0 +1,261 @@
+# ⚡ RTX GPU Acceleration
+
+<div style="text-align: center">
+    <img loading="lazy" alt="Docling on RTX" src="../../assets/nvidia_logo_green.svg" width="200px" />
+</div>
+
+
+Whether you're an AI enthusiast, researcher, or developer working with document processing, this guide will help you unlock the full potential of your NVIDIA RTX GPU with Docling.
+
+By leveraging GPU acceleration, you can achieve up to **6x speedup** compared to CPU-only processing. This dramatic performance improvement makes GPU acceleration especially valuable for processing large batches of documents, handling high-throughput document conversion workflows, or experimenting with advanced document understanding models.
+
+<!-- TBA. Performance improvement figure. -->
+
+## Prerequisites
+
+Before setting up GPU acceleration, ensure you have:
+
+- An NVIDIA RTX GPU (RTX 40/50 series)
+- Windows 10/11 or Linux operating system
+
+## Installation Steps
+
+### 1. Install NVIDIA GPU Drivers
+
+First, ensure you have the latest NVIDIA GPU drivers installed:
+
+- **Windows**: Download from [NVIDIA Driver Downloads](https://www.nvidia.com/Download/index.aspx)
+- **Linux**: Use your distribution's package manager or download from NVIDIA
+
+Verify the installation:
+
+```bash
+nvidia-smi
+```
+
+This command should display your GPU information and driver version.
+
+### 2. Install CUDA Toolkit
+
+CUDA is NVIDIA's parallel computing platform required for GPU acceleration.
+
+Follow the official installation guide for your operating system at [NVIDIA CUDA Downloads](https://developer.nvidia.com/cuda-downloads). The installer will guide you through the process and automatically set up the required environment variables.
+
+### 3. Install cuDNN
+
+cuDNN provides optimized implementations for deep learning operations.
+
+Follow the official installation guide at [NVIDIA cuDNN Downloads](https://developer.nvidia.com/cudnn). The guide provides detailed instructions for all supported platforms.
+
+### 4. Install PyTorch with CUDA Support
+
+To use GPU acceleration with Docling, you need to install PyTorch with CUDA support from the CUDA-specific wheel index, selected with `--index-url`:
+
+```bash
+# For CUDA 12.8 (current default for PyTorch)
+pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
+
+# For CUDA 13.0
+pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu130
+```
+
+!!! note
+    The `--index-url` parameter is crucial as it ensures you get the CUDA-enabled version of PyTorch instead of the CPU-only version.
+
+For other CUDA versions and installation options, refer to the [PyTorch Installation Matrix](https://pytorch.org/get-started/locally/).
+
+Verify PyTorch CUDA installation:
+
+```python
+import torch
+print(f"PyTorch version: {torch.__version__}")
+print(f"CUDA available: {torch.cuda.is_available()}")
+print(f"CUDA version: {torch.version.cuda}")
+print(f"GPU device: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'None'}")
+```
+
+### 5. Install and Run Docling
+
+Install Docling with all dependencies:
+
+```bash
+pip install docling
+```
+
+**That's it!** Docling will automatically detect and use your RTX GPU when available. No additional configuration is required for basic usage.
+
+```python
+from docling.document_converter import DocumentConverter
+
+# Docling automatically uses GPU when available
+converter = DocumentConverter()
+result = converter.convert("document.pdf")
+```
+
+<details>
+<summary><b>Advanced: Tuning GPU Performance</b></summary>
+
+For optimal GPU performance with large document batches, you can adjust batch sizes and explicitly configure the accelerator:
+
+```python
+from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
+from docling.datamodel.base_models import InputFormat
+from docling.datamodel.pipeline_options import ThreadedPdfPipelineOptions
+from docling.document_converter import DocumentConverter, PdfFormatOption
+
+# Configure pipeline for optimal GPU performance
+pipeline_options = ThreadedPdfPipelineOptions(
+    accelerator_options=AcceleratorOptions(
+        device=AcceleratorDevice.CUDA,  # Use CUDA for NVIDIA GPUs
+    ),
+    ocr_batch_size=64,      # Increase batch size for GPU
+    layout_batch_size=64,   # Increase batch size for GPU
+    table_batch_size=4,
+)
+
+# Create converter with custom settings (pipeline options are set per input format)
+converter = DocumentConverter(
+    format_options={
+        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
+    }
+)
+
+# Convert documents
+result = converter.convert("document.pdf")
+```
+
+Adjust batch sizes based on your GPU memory (see [Performance Optimization Tips](#performance-optimization-tips) below).
+
+</details>
+
+## GPU-Accelerated VLM Pipeline
+
+For maximum performance with Vision Language Models (VLM), you can run a local inference server on your RTX GPU. This approach provides significantly better throughput than inline VLM processing.
+
+### Linux: Using vLLM (Recommended)
+
+vLLM provides the best performance for GPU-accelerated VLM inference. Start the vLLM server with optimized parameters:
+
+```bash
+vllm serve ibm-granite/granite-docling-258M \
+  --host 127.0.0.1 --port 8000 \
+  --max-num-seqs 512 \
+  --max-num-batched-tokens 8192 \
+  --enable-chunked-prefill \
+  --gpu-memory-utilization 0.9
+```
+
+### Windows: Using llama-server
+
+On Windows, you can use `llama-server` from llama.cpp for GPU-accelerated VLM inference:
+
+#### Installation
+
+1. Download the latest llama.cpp release from the [GitHub releases page](https://github.com/ggml-org/llama.cpp/releases)
+2. Extract the archive and locate `llama-server.exe`
+
+#### Launch Command
+
+```powershell
+llama-server.exe `
+  --hf-repo ibm-granite/granite-docling-258M-GGUF `
+  -cb `
+  -ngl -1 `
+  --port 8000 `
+  --context-shift `
+  -np 16 -c 131072
+```
+
+!!! note "Performance Comparison"
+    vLLM delivers approximately **4x better performance** compared to llama-server. For Windows users seeking maximum performance, consider running vLLM via WSL2 (Windows Subsystem for Linux). See [vLLM on RTX 5090 via Docker](https://github.com/BoltzmannEntropy/vLLM-5090) for detailed WSL2 setup instructions.
+
+### Configure Docling for VLM Server
+
+Once your inference server is running, configure Docling to use it:
+
+```python
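+from docling.datamodel import vlm_model_specs
+from docling.datamodel.base_models import InputFormat
+from docling.datamodel.pipeline_options import VlmPipelineOptions
+from docling.datamodel.settings import settings
+from docling.document_converter import DocumentConverter, PdfFormatOption
+from docling.pipeline.vlm_pipeline import VlmPipeline
+
+BATCH_SIZE = 64
+settings.perf.page_batch_size = BATCH_SIZE  # page batch size should match or exceed concurrency
+
+vlm_options = vlm_model_specs.GRANITEDOCLING_VLLM_API
+vlm_options.concurrency = BATCH_SIZE
+# When running with llama.cpp (llama-server), use a different model name:
+# vlm_options.params["model"] = "ibm-granite_granite-docling-258M-GGUF_granite-docling-258M-BF16.gguf"
+# enable_remote_services is required because inference runs on a separate server
+pipeline_options = VlmPipelineOptions(vlm_options=vlm_options, enable_remote_services=True)
+
+pdf_option = PdfFormatOption(pipeline_cls=VlmPipeline, pipeline_options=pipeline_options)
+converter = DocumentConverter(format_options={InputFormat.PDF: pdf_option})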
+```
+
+For more details on VLM pipeline configuration, see the [GPU Support Guide](../usage/gpu.md).
+
+## Performance Optimization Tips
+
+### Batch Size Tuning
+
+Adjust batch sizes based on your GPU memory:
+
+- **RTX 5090 (32GB)**: Use batch sizes of 64-128
+- **RTX 4090 (24GB)**: Use batch sizes of 32-64
+- **RTX 5070 (12GB)**: Use batch sizes of 16-32
+
+### Memory Management
+
+Monitor GPU memory usage:
+
+```python
+import torch
+
+# Check GPU memory
+if torch.cuda.is_available():
+    print(f"GPU Memory allocated: {torch.cuda.memory_allocated(0) / 1024**3:.2f} GB")
+    print(f"GPU Memory reserved: {torch.cuda.memory_reserved(0) / 1024**3:.2f} GB")
+```
+
+## Troubleshooting
+
+### CUDA Out of Memory
+
+If you encounter out-of-memory errors:
+
+1. Reduce batch sizes in `pipeline_options`
+2. Process fewer documents concurrently
+3. Clear GPU cache between batches:
+
+```python
+import torch
+torch.cuda.empty_cache()
+```
+
+### CUDA Not Available
+
+If `torch.cuda.is_available()` returns `False`:
+
+1. Verify NVIDIA drivers are installed: `nvidia-smi`
+2. Check CUDA installation: `nvcc --version`
+3. Reinstall PyTorch with correct CUDA version
+4. Ensure your GPU is CUDA-compatible
+
+### Performance Not Improving
+
+If GPU acceleration doesn't improve performance:
+
+1. Increase batch sizes (if memory allows)
+2. Ensure you're processing enough documents to benefit from GPU parallelization
+3. Check GPU utilization: `nvidia-smi -l 1`
+4. Verify PyTorch is using GPU: `torch.cuda.is_available()`
+
+## Additional Resources
+
+- [NVIDIA CUDA Documentation](https://docs.nvidia.com/cuda/)
+- [PyTorch CUDA Installation Guide](https://pytorch.org/get-started/locally/)
+- [Docling GPU Support Guide](../usage/gpu.md)
+- [GPU Performance Examples](../examples/gpu_standard_pipeline.py)
diff --git a/mkdocs.yml b/mkdocs.yml
index 2d483f9f..8d51fe60 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -57,6 +57,7 @@ nav:
     - Getting started : 
       - Installation: getting_started/installation.md
       - Quickstart: getting_started/quickstart.md
+      - ⚡ RTX GPU: getting_started/rtx.md
     - Usage:
       - Advanced options: usage/advanced_options.md
       - Supported formats: usage/supported_formats.md