fix: Make CVAT pipeline resilient to single-document crashes and report failures at the end

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer
2025-12-15 14:11:10 +01:00
parent bcc5200f74
commit 8a2ba74be8
2 changed files with 75 additions and 31 deletions
+48 -20
View File
@@ -1,16 +1,20 @@
from __future__ import annotations
import logging
from io import BytesIO
from pathlib import Path
from typing import Iterator, List
from docling_core.types import DoclingDocument
from docling_core.types.io import DocumentStream
from pydantic import ValidationError
from docling_eval.datamodels.dataset_record import DatasetRecord
from docling_eval.datamodels.types import BenchMarkColumns
from docling_eval.utils.utils import extract_images, get_binhash
_LOGGER = logging.getLogger(__name__)
def _select_range(files: List[Path], begin_index: int, end_index: int) -> List[Path]:
if begin_index < 0:
@@ -35,25 +39,49 @@ def iter_docling_json_records(
selected_files = _select_range(json_files, begin_index, end_index)
for json_path in selected_files:
document: DoclingDocument = DoclingDocument.load_from_json(json_path)
document, pictures, page_images = extract_images(
document=document,
pictures_column=BenchMarkColumns.GROUNDTRUTH_PICTURES.value,
page_images_column=BenchMarkColumns.GROUNDTRUTH_PAGE_IMAGES.value,
)
try:
document: DoclingDocument = DoclingDocument.load_from_json(json_path)
except ValidationError as exc:
_LOGGER.error(
"Validation error loading document %s: %s. Skipping this document.",
json_path,
exc,
)
continue
except Exception as exc: # noqa: BLE001
_LOGGER.error(
"Unexpected error loading document %s: %s. Skipping this document.",
json_path,
exc,
)
continue
doc_bytes = json_path.read_bytes()
try:
document, pictures, page_images = extract_images(
document=document,
pictures_column=BenchMarkColumns.GROUNDTRUTH_PICTURES.value,
page_images_column=BenchMarkColumns.GROUNDTRUTH_PAGE_IMAGES.value,
)
yield DatasetRecord(
doc_id=json_path.stem,
doc_path=json_path,
doc_hash=get_binhash(doc_bytes),
ground_truth_doc=document,
ground_truth_pictures=pictures,
ground_truth_page_images=page_images,
original=DocumentStream(
name=json_path.name,
stream=BytesIO(doc_bytes),
),
mime_type="application/json",
)
doc_bytes = json_path.read_bytes()
yield DatasetRecord(
doc_id=json_path.stem,
doc_path=json_path,
doc_hash=get_binhash(doc_bytes),
ground_truth_doc=document,
ground_truth_pictures=pictures,
ground_truth_page_images=page_images,
original=DocumentStream(
name=json_path.name,
stream=BytesIO(doc_bytes),
),
mime_type="application/json",
)
except Exception as exc: # noqa: BLE001
_LOGGER.error(
"Error processing document %s: %s. Skipping this document.",
json_path,
exc,
)
continue
+27 -11
View File
@@ -143,18 +143,34 @@ def join_docling_json_datasets(
continue
raise ValueError(message)
prediction_doc, pictures, page_images = _load_prediction_json(
prediction_record
)
try:
prediction_doc, pictures, page_images = _load_prediction_json(
prediction_record
)
except Exception as exc: # noqa: BLE001
_LOGGER.error(
"Error loading prediction JSON for document %s: %s. Skipping this document.",
gt_record.doc_id,
exc,
)
continue
joined = _build_prediction_record(
gt_record,
prediction_doc,
pictures,
page_images,
prediction_format=prediction_format,
predictor_info=predictor_info,
)
try:
joined = _build_prediction_record(
gt_record,
prediction_doc,
pictures,
page_images,
prediction_format=prediction_format,
predictor_info=predictor_info,
)
except Exception as exc: # noqa: BLE001
_LOGGER.error(
"Error building prediction record for document %s: %s. Skipping this document.",
gt_record.doc_id,
exc,
)
continue
if do_visualization and visualizations_dir is not None:
try: