From c90c0686007ec3f52ffa21538eaf39f4ecff9c77 Mon Sep 17 00:00:00 2001
From: ElHachem02
Date: Mon, 2 Mar 2026 15:16:46 +0100
Subject: [PATCH] feat: add documentation on why we set the layout model to run on CPU

Signed-off-by: ElHachem02
---
 .../experimental/pipeline/threaded_layout_vlm_pipeline.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/docling/experimental/pipeline/threaded_layout_vlm_pipeline.py b/docling/experimental/pipeline/threaded_layout_vlm_pipeline.py
index 78a0e44e..662c63c9 100644
--- a/docling/experimental/pipeline/threaded_layout_vlm_pipeline.py
+++ b/docling/experimental/pipeline/threaded_layout_vlm_pipeline.py
@@ -72,7 +72,12 @@ class ThreadedLayoutVlmPipeline(BasePipeline):
         """Initialize layout and VLM models."""
         art_path = self._resolve_artifacts_path()
 
-        # Layout model
+        # The layout model is forced to run on CPU.
+        # In this threaded pipeline, the VLM exclusively owns the GPU.
+        # Allowing multiple models to use the GPU concurrently can cause
+        # device contention, memory spikes, and unstable inference behavior.
+        # Since the layout model is lightweight, running it on CPU avoids
+        # cross-thread GPU contention without significantly impacting latency.
         self.layout_model = LayoutModel(
             artifacts_path=art_path,
             accelerator_options=AcceleratorOptions(device="cpu"),