fix: Make CVAT pipeline resilient to single-document crashes and report failures at the end

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
2026-05-17 13:10:47 +00:00 · 2025-12-15 14:11:10 +01:00
parent bcc5200f74
commit 8a2ba74be8
2 changed files with 75 additions and 31 deletions
@@ -1,16 +1,20 @@
 from __future__ import annotations

+import logging
 from io import BytesIO
 from pathlib import Path
 from typing import Iterator, List

 from docling_core.types import DoclingDocument
 from docling_core.types.io import DocumentStream
+from pydantic import ValidationError

 from docling_eval.datamodels.dataset_record import DatasetRecord
 from docling_eval.datamodels.types import BenchMarkColumns
 from docling_eval.utils.utils import extract_images, get_binhash

+_LOGGER = logging.getLogger(__name__)
+

 def _select_range(files: List[Path], begin_index: int, end_index: int) -> List[Path]:
    if begin_index < 0:
@@ -35,25 +39,49 @@ def iter_docling_json_records(
    selected_files = _select_range(json_files, begin_index, end_index)

    for json_path in selected_files:
-        document: DoclingDocument = DoclingDocument.load_from_json(json_path)
-        document, pictures, page_images = extract_images(
-            document=document,
-            pictures_column=BenchMarkColumns.GROUNDTRUTH_PICTURES.value,
-            page_images_column=BenchMarkColumns.GROUNDTRUTH_PAGE_IMAGES.value,
-        )
+        try:
+            document: DoclingDocument = DoclingDocument.load_from_json(json_path)
+        except ValidationError as exc:
+            _LOGGER.error(
+                "Validation error loading document %s: %s. Skipping this document.",
+                json_path,
+                exc,
+            )
+            continue
+        except Exception as exc:  # noqa: BLE001
+            _LOGGER.error(
+                "Unexpected error loading document %s: %s. Skipping this document.",
+                json_path,
+                exc,
+            )
+            continue

-        doc_bytes = json_path.read_bytes()
+        try:
+            document, pictures, page_images = extract_images(
+                document=document,
+                pictures_column=BenchMarkColumns.GROUNDTRUTH_PICTURES.value,
+                page_images_column=BenchMarkColumns.GROUNDTRUTH_PAGE_IMAGES.value,
+            )

-        yield DatasetRecord(
-            doc_id=json_path.stem,
-            doc_path=json_path,
-            doc_hash=get_binhash(doc_bytes),
-            ground_truth_doc=document,
-            ground_truth_pictures=pictures,
-            ground_truth_page_images=page_images,
-            original=DocumentStream(
-                name=json_path.name,
-                stream=BytesIO(doc_bytes),
-            ),
-            mime_type="application/json",
-        )
+            doc_bytes = json_path.read_bytes()
+
+            yield DatasetRecord(
+                doc_id=json_path.stem,
+                doc_path=json_path,
+                doc_hash=get_binhash(doc_bytes),
+                ground_truth_doc=document,
+                ground_truth_pictures=pictures,
+                ground_truth_page_images=page_images,
+                original=DocumentStream(
+                    name=json_path.name,
+                    stream=BytesIO(doc_bytes),
+                ),
+                mime_type="application/json",
+            )
+        except Exception as exc:  # noqa: BLE001
+            _LOGGER.error(
+                "Error processing document %s: %s. Skipping this document.",
+                json_path,
+                exc,
+            )
+            continue
@@ -143,18 +143,34 @@ def join_docling_json_datasets(
                    continue
                raise ValueError(message)

-            prediction_doc, pictures, page_images = _load_prediction_json(
-                prediction_record
-            )
+            try:
+                prediction_doc, pictures, page_images = _load_prediction_json(
+                    prediction_record
+                )
+            except Exception as exc:  # noqa: BLE001
+                _LOGGER.error(
+                    "Error loading prediction JSON for document %s: %s. Skipping this document.",
+                    gt_record.doc_id,
+                    exc,
+                )
+                continue

-            joined = _build_prediction_record(
-                gt_record,
-                prediction_doc,
-                pictures,
-                page_images,
-                prediction_format=prediction_format,
-                predictor_info=predictor_info,
-            )
+            try:
+                joined = _build_prediction_record(
+                    gt_record,
+                    prediction_doc,
+                    pictures,
+                    page_images,
+                    prediction_format=prediction_format,
+                    predictor_info=predictor_info,
+                )
+            except Exception as exc:  # noqa: BLE001
+                _LOGGER.error(
+                    "Error building prediction record for document %s: %s. Skipping this document.",
+                    gt_record.doc_id,
+                    exc,
+                )
+                continue

            if do_visualization and visualizations_dir is not None:
                try: