mirror of
https://github.com/docling-project/docling-eval.git
synced 2026-05-17 13:10:47 +00:00
fix: Make CVAT pipeline resilient to single document crashes, report failures at the end
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
@@ -1,16 +1,20 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import Iterator, List
|
||||
|
||||
from docling_core.types import DoclingDocument
|
||||
from docling_core.types.io import DocumentStream
|
||||
from pydantic import ValidationError
|
||||
|
||||
from docling_eval.datamodels.dataset_record import DatasetRecord
|
||||
from docling_eval.datamodels.types import BenchMarkColumns
|
||||
from docling_eval.utils.utils import extract_images, get_binhash
|
||||
|
||||
_LOGGER = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _select_range(files: List[Path], begin_index: int, end_index: int) -> List[Path]:
|
||||
if begin_index < 0:
|
||||
@@ -35,25 +39,49 @@ def iter_docling_json_records(
|
||||
selected_files = _select_range(json_files, begin_index, end_index)
|
||||
|
||||
for json_path in selected_files:
|
||||
document: DoclingDocument = DoclingDocument.load_from_json(json_path)
|
||||
document, pictures, page_images = extract_images(
|
||||
document=document,
|
||||
pictures_column=BenchMarkColumns.GROUNDTRUTH_PICTURES.value,
|
||||
page_images_column=BenchMarkColumns.GROUNDTRUTH_PAGE_IMAGES.value,
|
||||
)
|
||||
try:
|
||||
document: DoclingDocument = DoclingDocument.load_from_json(json_path)
|
||||
except ValidationError as exc:
|
||||
_LOGGER.error(
|
||||
"Validation error loading document %s: %s. Skipping this document.",
|
||||
json_path,
|
||||
exc,
|
||||
)
|
||||
continue
|
||||
except Exception as exc: # noqa: BLE001
|
||||
_LOGGER.error(
|
||||
"Unexpected error loading document %s: %s. Skipping this document.",
|
||||
json_path,
|
||||
exc,
|
||||
)
|
||||
continue
|
||||
|
||||
doc_bytes = json_path.read_bytes()
|
||||
try:
|
||||
document, pictures, page_images = extract_images(
|
||||
document=document,
|
||||
pictures_column=BenchMarkColumns.GROUNDTRUTH_PICTURES.value,
|
||||
page_images_column=BenchMarkColumns.GROUNDTRUTH_PAGE_IMAGES.value,
|
||||
)
|
||||
|
||||
yield DatasetRecord(
|
||||
doc_id=json_path.stem,
|
||||
doc_path=json_path,
|
||||
doc_hash=get_binhash(doc_bytes),
|
||||
ground_truth_doc=document,
|
||||
ground_truth_pictures=pictures,
|
||||
ground_truth_page_images=page_images,
|
||||
original=DocumentStream(
|
||||
name=json_path.name,
|
||||
stream=BytesIO(doc_bytes),
|
||||
),
|
||||
mime_type="application/json",
|
||||
)
|
||||
doc_bytes = json_path.read_bytes()
|
||||
|
||||
yield DatasetRecord(
|
||||
doc_id=json_path.stem,
|
||||
doc_path=json_path,
|
||||
doc_hash=get_binhash(doc_bytes),
|
||||
ground_truth_doc=document,
|
||||
ground_truth_pictures=pictures,
|
||||
ground_truth_page_images=page_images,
|
||||
original=DocumentStream(
|
||||
name=json_path.name,
|
||||
stream=BytesIO(doc_bytes),
|
||||
),
|
||||
mime_type="application/json",
|
||||
)
|
||||
except Exception as exc: # noqa: BLE001
|
||||
_LOGGER.error(
|
||||
"Error processing document %s: %s. Skipping this document.",
|
||||
json_path,
|
||||
exc,
|
||||
)
|
||||
continue
|
||||
|
||||
@@ -143,18 +143,34 @@ def join_docling_json_datasets(
|
||||
continue
|
||||
raise ValueError(message)
|
||||
|
||||
prediction_doc, pictures, page_images = _load_prediction_json(
|
||||
prediction_record
|
||||
)
|
||||
try:
|
||||
prediction_doc, pictures, page_images = _load_prediction_json(
|
||||
prediction_record
|
||||
)
|
||||
except Exception as exc: # noqa: BLE001
|
||||
_LOGGER.error(
|
||||
"Error loading prediction JSON for document %s: %s. Skipping this document.",
|
||||
gt_record.doc_id,
|
||||
exc,
|
||||
)
|
||||
continue
|
||||
|
||||
joined = _build_prediction_record(
|
||||
gt_record,
|
||||
prediction_doc,
|
||||
pictures,
|
||||
page_images,
|
||||
prediction_format=prediction_format,
|
||||
predictor_info=predictor_info,
|
||||
)
|
||||
try:
|
||||
joined = _build_prediction_record(
|
||||
gt_record,
|
||||
prediction_doc,
|
||||
pictures,
|
||||
page_images,
|
||||
prediction_format=prediction_format,
|
||||
predictor_info=predictor_info,
|
||||
)
|
||||
except Exception as exc: # noqa: BLE001
|
||||
_LOGGER.error(
|
||||
"Error building prediction record for document %s: %s. Skipping this document.",
|
||||
gt_record.doc_id,
|
||||
exc,
|
||||
)
|
||||
continue
|
||||
|
||||
if do_visualization and visualizations_dir is not None:
|
||||
try:
|
||||
|
||||
Reference in New Issue
Block a user