From c90c0686007ec3f52ffa21538eaf39f4ecff9c77 Mon Sep 17 00:00:00 2001
From: ElHachem02 <peterelhachem02@gmail.com>
Date: Mon, 2 Mar 2026 15:16:46 +0100
Subject: [PATCH] feat: add documentation on why we set the layout model to run
 on CPU

Signed-off-by: ElHachem02 <peterelhachem02@gmail.com>
---
 .../experimental/pipeline/threaded_layout_vlm_pipeline.py  | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/docling/experimental/pipeline/threaded_layout_vlm_pipeline.py b/docling/experimental/pipeline/threaded_layout_vlm_pipeline.py
index 78a0e44e..662c63c9 100644
--- a/docling/experimental/pipeline/threaded_layout_vlm_pipeline.py
+++ b/docling/experimental/pipeline/threaded_layout_vlm_pipeline.py
@@ -72,7 +72,12 @@ class ThreadedLayoutVlmPipeline(BasePipeline):
         """Initialize layout and VLM models."""
         art_path = self._resolve_artifacts_path()
 
-        # Layout model
+        # Layout model: deliberately pinned to CPU.
+        # In this threaded pipeline the VLM is meant to be the only GPU user.
+        # Letting several models share the GPU concurrently can cause device
+        # contention, memory spikes, and unstable inference behavior.
+        # Because the layout model is lightweight, running it on CPU avoids
+        # cross-thread GPU contention without significantly impacting latency.
         self.layout_model = LayoutModel(
             artifacts_path=art_path,
             accelerator_options=AcceleratorOptions(device="cpu"),