From a850784b4f9b26e94659085d6ea1f95473313f90 Mon Sep 17 00:00:00 2001 From: Nikos Livathinos <100353117+nikos-livathinos@users.noreply.github.com> Date: Tue, 16 Dec 2025 11:25:55 +0100 Subject: [PATCH] feat: Improvements in user experience: Performance, error handling, logging (#189) * feat: Extend evaluate_dpbench_on_external_predictions.sh to include visualisations of the evaluations Signed-off-by: Nikos Livathinos * fix: Improve error checking in main.py:visualize() Signed-off-by: Nikos Livathinos * fix: Improve logging Signed-off-by: Nikos Livathinos * feat: Parallelize the computation of PixelLayoutEvaluator at the level of page Signed-off-by: Nikos Livathinos * fix: Make DatasetPixelLayoutEvaluation a subclass of DatasetEvaluation Signed-off-by: Nikos Livathinos * feat: Parallelize the MarkdownTextEvaluator Signed-off-by: Nikos Livathinos * chore: Improve logging Signed-off-by: Nikos Livathinos --------- Signed-off-by: Nikos Livathinos --- docling_eval/cli/main.py | 35 ++- docling_eval/evaluators/base_evaluator.py | 2 + docling_eval/evaluators/layout_evaluator.py | 7 +- .../evaluators/markdown_text_evaluator.py | 205 +++++++------ docling_eval/evaluators/pixel/pixel_types.py | 8 +- .../evaluators/pixel_layout_evaluator.py | 287 ++++++++++-------- docling_eval/evaluators/stats.py | 8 +- ...valuate_dpbench_on_external_predictions.sh | 26 +- 8 files changed, 343 insertions(+), 235 deletions(-) diff --git a/docling_eval/cli/main.py b/docling_eval/cli/main.py index 3972f3a..1a6826c 100644 --- a/docling_eval/cli/main.py +++ b/docling_eval/cli/main.py @@ -641,6 +641,7 @@ def evaluate( split: str = "test", cvat_overview_path: Optional[Path] = None, external_predictions_path: Optional[Path] = None, + concurrency: int = 4, ) -> Optional[DatasetEvaluationType]: """Evaluate predictions against ground truth.""" if not os.path.exists(idir): @@ -673,17 +674,16 @@ def evaluate( # label_filtering_strategy=LabelFilteringStrategy.INTERSECTION, page_mapping_path=cvat_overview_path, ) - evaluation = layout_evaluator( # type: ignore + layout_evaluation = layout_evaluator( # type: ignore idir, split=split, external_predictions_path=external_predictions_path, ) - with open(save_fn, "w") as fd: - json.dump(evaluation.model_dump(), fd, indent=2, sort_keys=True) + json.dump(layout_evaluation.model_dump(), fd, indent=2, sort_keys=True) # Evaluate with the pixel-wise layout evaluation - pixel_layout_evaluator = PixelLayoutEvaluator() + pixel_layout_evaluator = PixelLayoutEvaluator(concurrency=concurrency) pixel_ds_evaluation: DatasetPixelLayoutEvaluation = pixel_layout_evaluator( idir, split=split, @@ -696,6 +696,9 @@ def evaluate( pixel_save_root, ) + # TODO: Redesign evaluate() to return multiple evaluation objects + evaluation = pixel_ds_evaluation # type: ignore + elif modality == EvaluationModality.TABLE_STRUCTURE: table_evaluator = TableEvaluator() evaluation = table_evaluator( # type: ignore @@ -764,7 +767,7 @@ def evaluate( ) elif modality == EvaluationModality.MARKDOWN_TEXT: - md_evaluator = MarkdownTextEvaluator() + md_evaluator = MarkdownTextEvaluator(concurrency=concurrency) evaluation = md_evaluator( # type: ignore idir, split=split, @@ -823,8 +826,8 @@ def evaluate( def visualize( modality: EvaluationModality, benchmark: BenchMarkNames, - idir: Path, odir: Path, + idir: Path | None = None, split: str = "test", ): """ @@ -839,10 +842,6 @@ def visualize( begin_index: Begin index end_index: End index """ - if not os.path.exists(idir): - _log.error(f"Input directory not found: {idir}") - return - os.makedirs(odir, exist_ok=True) metrics_filename = odir / f"evaluation_{benchmark.value}_{modality.value}.json" @@ -989,6 +988,11 @@ def visualize( elif modality == EvaluationModality.READING_ORDER: try: + # idir is required here + if idir is None or not idir.is_dir(): + _log.error(f"Input directory not found: {idir}") + return + with open(metrics_filename, "r") as fd: ro_evaluation = DatasetReadingOrderEvaluation.model_validate_json( fd.read() @@ -1080,6 +1084,11 @@ def visualize( elif modality == EvaluationModality.OCR: try: + # idir is required here + if idir is None or not idir.is_dir(): + _log.error(f"Input directory not found: {idir}") + return + with open(metrics_filename, "r") as fd: ocr_evaluation = OcrDatasetEvaluationResult.model_validate_json( fd.read() @@ -1511,6 +1520,9 @@ def evaluate_cmd( help="Path to load existing DoclingDocument predictions. The filename must follow the pattern [doc_id].[json|dt|yaml|yml]", ), ] = None, + concurrency: Annotated[ + int, typer.Option(help="Concurrency for the computation of each metric") + ] = 4, ): """Evaluate predictions against ground truth.""" input_dir, output_dir = derive_input_output_dirs( @@ -1531,6 +1543,7 @@ def evaluate_cmd( odir=eval_output_dir, split=split, external_predictions_path=external_predictions_path, + concurrency=concurrency, ) @@ -1573,8 +1586,8 @@ def visualize_cmd( visualize( modality=modality, benchmark=benchmark, - idir=input_dir, odir=eval_output_dir, + idir=input_dir, split=split, ) diff --git a/docling_eval/evaluators/base_evaluator.py b/docling_eval/evaluators/base_evaluator.py index 4198f08..4a009e0 100644 --- a/docling_eval/evaluators/base_evaluator.py +++ b/docling_eval/evaluators/base_evaluator.py @@ -75,12 +75,14 @@ class BaseEvaluator(Generic[UnitEvaluationType, DatasetEvaluationType]): supported_prediction_formats: List[PredictionFormats] = [ PredictionFormats.DOCLING_DOCUMENT ], + concurrency: int = 4, ): r""" Parameters ---------- intermediate_evaluations_path: When True the evalution per example will be saved in a file """ + self._concurrency = concurrency self._intermediate_evaluations_path = intermediate_evaluations_path # Validate the prediction_sources diff --git a/docling_eval/evaluators/layout_evaluator.py b/docling_eval/evaluators/layout_evaluator.py index 0d11394..c1a86e7 100644 --- a/docling_eval/evaluators/layout_evaluator.py +++ b/docling_eval/evaluators/layout_evaluator.py @@ -489,7 +489,7 @@ class LayoutEvaluator(BaseEvaluator): weighted_map_90_values.append(average_iou_90) weighted_map_95_values.append(average_iou_95) - _log.info( + _log.debug( "doc: %s\tprecision: %.2f, recall: %.2f, f1: %.2f, map_50: %.2f, " "precision_no_pics: %.2f, recall_no_pics: %.2f, f1_no_pics: %.2f", doc_id_page, @@ -528,7 +528,6 @@ class LayoutEvaluator(BaseEvaluator): segmentation_precision_no_pictures=precision_no_pics, segmentation_recall_no_pictures=recall_no_pics, segmentation_f1_no_pictures=f1_no_pics, - # New per-sample element count metrics true_element_count=true_element_count, pred_element_count=pred_element_count, true_table_count=true_table_count, @@ -836,9 +835,7 @@ class LayoutEvaluator(BaseEvaluator): true_labels: Dict[str, int] = {} pred_labels: Dict[str, int] = {} - for i, data in tqdm( - enumerate(ds), desc="Layout evaluations", ncols=120, total=len(ds) - ): + for i, data in enumerate(ds): data_record = DatasetRecordWithPrediction.model_validate(data) true_doc = data_record.ground_truth_doc pred_doc = self._get_pred_doc(data_record, ext_docdoc_loader) diff --git a/docling_eval/evaluators/markdown_text_evaluator.py b/docling_eval/evaluators/markdown_text_evaluator.py index afbc68e..1c17758 100644 --- a/docling_eval/evaluators/markdown_text_evaluator.py +++ b/docling_eval/evaluators/markdown_text_evaluator.py @@ -1,4 +1,5 @@ import logging +from concurrent.futures import Executor, Future, ProcessPoolExecutor, as_completed from pathlib import Path from typing import Any, Dict, List, Optional, Set @@ -33,6 +34,51 @@ from docling_eval.utils.external_docling_document_loader import ( _log = logging.getLogger(__name__) +def compute_bleu_score(bleu_eval, true_txt: str, pred_txt: str) -> float: + r""" + Compute BLEU score with the HF evaluate and the default Tokenizer_13 + """ + result = bleu_eval.compute(predictions=[pred_txt], references=[[true_txt]]) + bleu = result["bleu"] + return bleu + + +def compute_nltk_scores(true_txt: str, pred_txt: str) -> dict[str, float]: + r""" + Returns: + -------- + dict with keys: ["f_measure", "precision", "recall", "edit_dist"] + """ + true_tokens = word_tokenize(true_txt) + true_tokens_set = set(true_tokens) + pred_tokens = word_tokenize(pred_txt) + pred_tokens_set = set(pred_tokens) + + f1_score = f_measure(true_tokens_set, pred_tokens_set) + precision_score = precision(true_tokens_set, pred_tokens_set) + recall_score = recall(true_tokens_set, pred_tokens_set) + edit_dist = edit_distance(pred_tokens, true_tokens) / max( + len(pred_tokens), len(true_tokens) + ) + meteor = meteor_score.meteor_score([true_tokens], pred_tokens) + + metrics: dict[str, float] = { + "f1_score": f1_score, + "precision": precision_score, + "recall": recall_score, + "edit_distance": edit_dist, + "meteor": meteor, + } + return metrics + + +def evaluate_page(bleu_eval, true_md: str, pred_md: str) -> dict[str, float]: + r"""Compute the bleu and the nltk scores""" + scores = compute_nltk_scores(true_md, pred_md) + scores["bleu"] = compute_bleu_score(bleu_eval, true_md, pred_md) + return scores + + class PageMarkdownEvaluation(UnitEvaluation): doc_id: str @@ -62,6 +108,7 @@ class MarkdownTextEvaluator(BaseEvaluator): self, intermediate_evaluations_path: Optional[Path] = None, prediction_sources: List[PredictionFormats] = [], + concurrency: int = 4, ): r""" """ supported_prediction_formats: List[PredictionFormats] = [ @@ -74,6 +121,7 @@ class MarkdownTextEvaluator(BaseEvaluator): intermediate_evaluations_path=intermediate_evaluations_path, prediction_sources=prediction_sources, supported_prediction_formats=supported_prediction_formats, + concurrency=concurrency, ) self._bleu_eval = evaluate.load("bleu") @@ -146,67 +194,80 @@ class MarkdownTextEvaluator(BaseEvaluator): "meteor": [], } - for i, data in tqdm( - enumerate(ds_selection), - desc="Markdown text evaluations", - ncols=120, - total=len(ds_selection), - ): - data_record = DatasetRecordWithPrediction.model_validate(data) - doc_id = data_record.doc_id - true_doc = data_record.ground_truth_doc - true_md = self._docling_document_to_md(true_doc) + with ProcessPoolExecutor(max_workers=self._concurrency) as executor: + futures: list[Future] = [] - # Get the predicted markdown from the external predictions path - if external_predictions_path is not None: - pred_doc = external_docling_doc_loader(data_record) - if pred_doc is None: - _log.error("No external prediction found for doc_id=%s", doc_id) + # Submit the evaluation tasks + _log.info("Submitting the documents for evaluation...") + for data in ds_selection: + data_record = DatasetRecordWithPrediction.model_validate(data) + doc_id = data_record.doc_id + true_doc = data_record.ground_truth_doc + true_md = self._docling_document_to_md(true_doc) + + # Get the predicted markdown from the external predictions path + if external_predictions_path is not None: + pred_doc = external_docling_doc_loader(data_record) + if pred_doc is None: + _log.error("No external prediction found for doc_id=%s", doc_id) + rejected_samples[ + EvaluationRejectionType.MISSING_PREDICTION + ] += 1 + continue + pred_md = self._docling_document_to_md(pred_doc) + else: + if data_record.status not in self._accepted_status: + _log.error( + "Skipping record without successfull conversion status: %s", + doc_id, + ) + rejected_samples[ + EvaluationRejectionType.INVALID_CONVERSION_STATUS + ] += 1 + continue + pred_md = self._get_pred_md(data_record) # type: ignore + + if pred_md is None: + _log.error("There is no markdown prediction for doc_id=%s", doc_id) rejected_samples[EvaluationRejectionType.MISSING_PREDICTION] += 1 continue - pred_md = self._docling_document_to_md(pred_doc) - else: - if data_record.status not in self._accepted_status: - _log.error( - "Skipping record without successfull conversion status: %s", - doc_id, + + if true_md != "" and pred_md != "": + futures.append( + executor.submit( + evaluate_page, self._bleu_eval, true_md, pred_md + ) ) - rejected_samples[ - EvaluationRejectionType.INVALID_CONVERSION_STATUS - ] += 1 - continue - pred_md = self._get_pred_md(data_record) # type: ignore - if not pred_md: - _log.error("There is no markdown prediction for doc_id=%s", doc_id) - rejected_samples[EvaluationRejectionType.MISSING_PREDICTION] += 1 - continue + # Collect the futures + _log.info("Collecting the documents for evaluations...") + for i, future in tqdm( + enumerate(as_completed(futures)), + desc="Markdown text evaluations", + ncols=120, + total=len(futures), + ): + doc_metrics = future.result() - bleu = 0.0 - if true_md != "" and pred_md != "": - bleu = self._compute_bleu_score(true_md, pred_md) - ntlk_scores = self._compute_nltk_scores(true_md, pred_md) + # Collect metrics across pages + for score_name, score in doc_metrics.items(): + ds_metrics[score_name].append(score) - # Collect metrics across pages - ds_metrics["bleu"].append(bleu) - for score_name, score in ntlk_scores.items(): - ds_metrics[score_name].append(score) + md_evaluation = PageMarkdownEvaluation( + doc_id=doc_id, + true_md=true_md, + pred_md=pred_md, + bleu=doc_metrics["bleu"], + f1_score=doc_metrics["f1_score"], + precision=doc_metrics["precision"], + recall=doc_metrics["recall"], + edit_distance=doc_metrics["edit_distance"], + meteor=doc_metrics["meteor"], + ) + evaluations.append(md_evaluation) - md_evaluation = PageMarkdownEvaluation( - doc_id=doc_id, - true_md=true_md, - pred_md=pred_md, - bleu=bleu, - f1_score=ntlk_scores["f1_score"], - precision=ntlk_scores["precision"], - recall=ntlk_scores["recall"], - edit_distance=ntlk_scores["edit_distance"], - meteor=ntlk_scores["meteor"], - ) - evaluations.append(md_evaluation) - - if self._intermediate_evaluations_path: - self.save_intermediate_evaluations("MD", i, doc_id, evaluations) + if self._intermediate_evaluations_path: + self.save_intermediate_evaluations("MD", i, doc_id, evaluations) ds_md_evalutions = DatasetMarkdownEvaluation( evaluated_samples=len(evaluations), @@ -221,44 +282,6 @@ class MarkdownTextEvaluator(BaseEvaluator): ) return ds_md_evalutions - def _compute_bleu_score(self, true_txt: str, pred_txt: str) -> float: - r""" - Compute BLEU score with the HF evaluate and the default Tokenizer_13 - """ - result = self._bleu_eval.compute( - predictions=[pred_txt], references=[[true_txt]] - ) - bleu = result["bleu"] - return bleu - - def _compute_nltk_scores(self, true_txt: str, pred_txt: str) -> dict[str, float]: - r""" - Returns: - -------- - dict with keys: ["f_measure", "precision", "recall", "edit_dist"] - """ - true_tokens = word_tokenize(true_txt) - true_tokens_set = set(true_tokens) - pred_tokens = word_tokenize(pred_txt) - pred_tokens_set = set(pred_tokens) - - f1_score = f_measure(true_tokens_set, pred_tokens_set) - precision_score = precision(true_tokens_set, pred_tokens_set) - recall_score = recall(true_tokens_set, pred_tokens_set) - edit_dist = edit_distance(pred_tokens, true_tokens) / max( - len(pred_tokens), len(true_tokens) - ) - meteor = meteor_score.meteor_score([true_tokens], pred_tokens) - - metrics: dict[str, float] = { - "f1_score": f1_score, - "precision": precision_score, - "recall": recall_score, - "edit_distance": edit_dist, - "meteor": meteor, - } - return metrics - def _docling_document_to_md(self, doc: DoclingDocument) -> str: r""" Export DoclingDocument to markdown diff --git a/docling_eval/evaluators/pixel/pixel_types.py b/docling_eval/evaluators/pixel/pixel_types.py index dec19ec..b106531 100644 --- a/docling_eval/evaluators/pixel/pixel_types.py +++ b/docling_eval/evaluators/pixel/pixel_types.py @@ -3,7 +3,10 @@ from typing import Any, Dict, Optional import numpy as np from pydantic import BaseModel, model_serializer, model_validator -from docling_eval.evaluators.base_evaluator import EvaluationRejectionType +from docling_eval.evaluators.base_evaluator import ( + DatasetEvaluation, + EvaluationRejectionType, +) from docling_eval.evaluators.stats import DatasetStatistics @@ -73,11 +76,10 @@ class PagePixelLayoutEvaluation(BaseModel): matrix_evaluation: MultiLabelMatrixEvaluation -class DatasetPixelLayoutEvaluation(BaseModel): +class DatasetPixelLayoutEvaluation(DatasetEvaluation): layout_model_name: Optional[str] num_pages: int num_pixels: int - rejected_samples: Dict[EvaluationRejectionType, int] matrix_evaluation: MultiLabelMatrixEvaluation page_evaluations: Dict[str, PagePixelLayoutEvaluation] diff --git a/docling_eval/evaluators/pixel_layout_evaluator.py b/docling_eval/evaluators/pixel_layout_evaluator.py index 1913f1b..0de8a56 100644 --- a/docling_eval/evaluators/pixel_layout_evaluator.py +++ b/docling_eval/evaluators/pixel_layout_evaluator.py @@ -3,6 +3,7 @@ import json import logging import math from collections import defaultdict +from concurrent.futures import Executor, Future, ProcessPoolExecutor, as_completed from enum import Enum from pathlib import Path from typing import Dict, List, Optional, Tuple, Union @@ -53,6 +54,51 @@ def category_name_to_docitemlabel(category_name: str) -> DocItemLabel: return label +def evaluate_page( + mlcm: MultiLabelConfusionMatrix, + doc_id: str, + page_no: int, + pg_width: int, + pg_height: int, + matrix_id_to_name: dict[int, str], + gt_resolutions: list[LayoutResolution], + pred_resolutions: Optional[list[LayoutResolution]] = None, +) -> tuple[str, int, int, MultiLabelMatrixEvaluation]: + r""" + Compute the confusion matrix and the metrics for one page + If pred_resolutions is None, assume an all-background predictions + + Return + ------ + doc_id + page_no + page_pixels + page_metrics + """ + # Make binary representations + gt_binary = mlcm.make_binary_representation(pg_width, pg_height, gt_resolutions) + if pred_resolutions is not None: + preds_binary = mlcm.make_binary_representation( + pg_width, pg_height, pred_resolutions + ) + else: + preds_binary = np.ones((pg_height, pg_width), dtype=np.uint64) + + # Compute confusion matrix + matrix_categories_ids: List[int] = list(matrix_id_to_name.keys()) + confusion_matrix = mlcm.generate_confusion_matrix( + gt_binary, preds_binary, matrix_categories_ids + ) + + # Compute metrics + page_metrics: MultiLabelMatrixEvaluation = mlcm.compute_metrics( + confusion_matrix, matrix_id_to_name + ) + page_pixels = pg_width * pg_height + + return doc_id, page_no, page_pixels, page_metrics + + class PixelLayoutEvaluator(BaseEvaluator): r""" Evaluate the document layout by computing a pixel-level confusion matrix and derivative matrices @@ -65,6 +111,7 @@ class PixelLayoutEvaluator(BaseEvaluator): intermediate_evaluations_path: Optional[Path] = None, prediction_sources: List[PredictionFormats] = [], missing_prediction_strategy: MissingPredictionStrategy = MissingPredictionStrategy.PENALIZE, + concurrency: int = 4, ): r""" @@ -82,6 +129,7 @@ class PixelLayoutEvaluator(BaseEvaluator): if not prediction_sources: prediction_sources = supported_prediction_formats super().__init__( + concurrency=concurrency, intermediate_evaluations_path=intermediate_evaluations_path, prediction_sources=prediction_sources, supported_prediction_formats=supported_prediction_formats, @@ -103,6 +151,23 @@ class PixelLayoutEvaluator(BaseEvaluator): self._build_matrix_categories(label_mapping) ) + @staticmethod + def evaluation_filenames( + benchmark: BenchMarkNames, save_root: Path + ) -> dict[str, Path]: + r""" + Generate the expected filenames for the produced evaluation files + """ + modality: str = EvaluationModality.LAYOUT.value + json_fn = save_root / f"evaluation_{benchmark.value}_pixel_{modality}.json" + excel_fn = save_root / f"evaluation_{benchmark.value}_pixel_{modality}.xlsx" + + eval_filenames: dict[str, Path] = { + "json": json_fn, + "excel": excel_fn, + } + return eval_filenames + def _build_matrix_categories( self, label_mapping: Optional[Dict[DocItemLabel, Optional[DocItemLabel]]] = None, @@ -193,6 +258,7 @@ class PixelLayoutEvaluator(BaseEvaluator): ds_selection: Dataset = ds[split] # Results containers + evaluated_samples = 0 rejected_samples: Dict[EvaluationRejectionType, int] = { EvaluationRejectionType.INVALID_CONVERSION_STATUS: 0, EvaluationRejectionType.MISSING_PREDICTION: 0, @@ -213,95 +279,101 @@ class PixelLayoutEvaluator(BaseEvaluator): [] ) # Gather f1 score/image when evaluated on collapsed classes - for i, data in tqdm( - enumerate(ds_selection), - desc="Multi-label Matrix Layout evaluations", - ncols=120, - total=len(ds_selection), - ): - data_record = DatasetRecordWithPrediction.model_validate(data) + with ProcessPoolExecutor(max_workers=self._concurrency) as executor: + futures: list[Future] = [] - # Try to extract the layout model name - if not self._layout_model_name: - self._layout_model_name = dict_get( - data_record.predictor_info, - [ - "options", - "pdf", - "pipeline_options", - "layout_options", - "model_spec", - "name", - ], - ) + # Submit pages for execution + _log.info("Submitting the documents for evaluation...") + for data in ds_selection: + data_record = DatasetRecordWithPrediction.model_validate(data) - doc_id: str = data_record.doc_id - if ( - ext_docdoc_loader is None - and data_record.status not in self._accepted_status - ): - _log.error( - "Skipping record without successfull conversion status: %s", doc_id - ) - rejected_samples[EvaluationRejectionType.INVALID_CONVERSION_STATUS] += 1 - continue + # Try to extract the layout model name + if not self._layout_model_name: + self._layout_model_name = dict_get( + data_record.predictor_info, + [ + "options", + "pdf", + "pipeline_options", + "layout_options", + "model_spec", + "name", + ], + ) - true_doc = data_record.ground_truth_doc - pred_doc = self._get_pred_doc(data_record, ext_docdoc_loader) - if not pred_doc: - _log.error("There is no prediction for doc_id=%s", doc_id) - rejected_samples[EvaluationRejectionType.MISSING_PREDICTION] += 1 - continue + doc_id: str = data_record.doc_id + if ( + ext_docdoc_loader is None + and data_record.status not in self._accepted_status + ): + _log.error( + "Skipping record without successfull conversion status: %s", + doc_id, + ) + rejected_samples[ + EvaluationRejectionType.INVALID_CONVERSION_STATUS + ] += 1 + continue - # Compute confusion matrices - pages_confusion_matrices: Dict[int, np.ndarray] - pages_pixels: Dict[int, int] - pages_confusion_matrices, doc_num_pixels, pages_pixels = ( - self._compute_document_confusion_matrix(true_doc, pred_doc) - ) + true_doc = data_record.ground_truth_doc + pred_doc = self._get_pred_doc(data_record, ext_docdoc_loader) + if not pred_doc: + _log.error("There is no prediction for doc_id=%s", doc_id) + rejected_samples[EvaluationRejectionType.MISSING_PREDICTION] += 1 + continue - # Compute metrics per page - for page_no, page_confusion_matrix in pages_confusion_matrices.items(): - # Contribute to the dataset's confusion matrix - ds_confusion_matrix += page_confusion_matrix - - # Compute page metrics - page_matrix_evaluation: MultiLabelMatrixEvaluation = ( - self._mlcm.compute_metrics( - page_confusion_matrix, - self._matrix_id_to_name, + evaluated_samples += 1 + futures.extend( + self._submit_document_evaluation( + executor, doc_id, true_doc, pred_doc ) ) + + # Collect the futures + _log.info("Collecting the documents for evaluations...") + for future in tqdm( + as_completed(futures), + desc="Multi-label Matrix Layout evaluations", + ncols=120, + total=len(futures), + ): + page_metrics: MultiLabelMatrixEvaluation + doc_id, page_no, page_pixels, page_metrics = future.result() + + page_confusion_matrix: np.ndarray = ( + page_metrics.detailed.confusion_matrix + ) + ds_num_pixels += page_pixels + ds_confusion_matrix += page_confusion_matrix + doc_page_id = f"{doc_id}-{page_no}" page_evaluation = PagePixelLayoutEvaluation( doc_id=doc_id, page_no=page_no, - num_pixels=pages_pixels[page_no], - matrix_evaluation=page_matrix_evaluation, + num_pixels=page_pixels, + matrix_evaluation=page_metrics, ) - doc_page_id = f"{doc_id}-{page_no}" all_pages_evaluations[doc_page_id] = page_evaluation # Update f1 lists pages_detailed_f1.append( - page_matrix_evaluation.detailed.agg_metrics.classes_f1_mean + page_metrics.detailed.agg_metrics.classes_f1_mean ) pages_collapsed_f1.append( - page_matrix_evaluation.collapsed.agg_metrics.classes_f1_mean + page_metrics.collapsed.agg_metrics.classes_f1_mean ) - ds_num_pixels += doc_num_pixels - - # Compute metrics for the dataset and each document + # Compute metrics for the dataset ds_matrix_evaluation: MultiLabelMatrixEvaluation = self._mlcm.compute_metrics( ds_confusion_matrix, self._matrix_id_to_name, ) ds_evaluation = DatasetPixelLayoutEvaluation( + evaluated_samples=evaluated_samples, + rejected_samples=rejected_samples, layout_model_name=self._layout_model_name, num_pages=len(all_pages_evaluations), num_pixels=ds_num_pixels, - rejected_samples=rejected_samples, matrix_evaluation=ds_matrix_evaluation, page_evaluations=all_pages_evaluations, f1_all_classes_stats=compute_stats(pages_detailed_f1), @@ -310,23 +382,6 @@ class PixelLayoutEvaluator(BaseEvaluator): return ds_evaluation - @staticmethod - def evaluation_filenames( - benchmark: BenchMarkNames, save_root: Path - ) -> dict[str, Path]: - r""" - Generate the expected filenames for the produced evaluation files - """ - modality: str = EvaluationModality.LAYOUT.value - json_fn = save_root / f"evaluation_{benchmark.value}_pixel_{modality}.json" - excel_fn = save_root / f"evaluation_{benchmark.value}_pixel_{modality}.xlsx" - - eval_filenames: dict[str, Path] = { - "json": json_fn, - "excel": excel_fn, - } - return eval_filenames - def save_evaluations( self, benchmark: BenchMarkNames, @@ -393,19 +448,17 @@ class PixelLayoutEvaluator(BaseEvaluator): excel_fn, ) - def _compute_document_confusion_matrix( + def _submit_document_evaluation( self, + executor: Executor, + doc_id: str, true_doc: DoclingDocument, pred_doc: DoclingDocument, - ) -> Tuple[ - Dict[int, np.ndarray], # page_no -> page confusion matrix - int, # document num_pixels - Dict[int, int], # page_no -> page num_pixels - ]: + ) -> list[Future]: r""" - Compute the confusion matrix for the given documents. - This is the sum of the confusion matrices of the document pages. + Submit the document for evaluation and return a future for each page """ + futures: list[Future] = [] # Collect all DocItems by page for both GT and predictions true_pages_to_objects = self._collect_items_by_page(true_doc) @@ -416,13 +469,6 @@ class PixelLayoutEvaluator(BaseEvaluator): pred_pages = set(pred_pages_to_objects.keys()) _log.debug(f"GT pages: {sorted(gt_pages)}, Pred pages: {sorted(pred_pages)}") - matrix_categories_ids: List[int] = list(self._matrix_id_to_name.keys()) - page_confusion_matrices: Dict[int, np.ndarray] = ( - {} - ) # page_no -> page confusion_matrix - all_pages_pixels: Dict[int, int] = {} # page_no -> page num_pixels - doc_pixels = 0 - for page_no in sorted(gt_pages): page_size = true_doc.pages[page_no].size pg_width = math.ceil(page_size.width) @@ -444,41 +490,39 @@ class PixelLayoutEvaluator(BaseEvaluator): doc=pred_doc, ) - # TODO: Parallelize the confusion matrix over the pages - # Compute the confusion matrix - gt_binary = self._mlcm.make_binary_representation( - pg_width, pg_height, gt_layouts + # Submit the page for computation + futures.append( + executor.submit( + evaluate_page, + self._mlcm, + doc_id, + page_no, + pg_width, + pg_height, + self._matrix_id_to_name, + gt_layouts, + pred_layouts, + ) ) - preds_binary = self._mlcm.make_binary_representation( - pg_width, pg_height, pred_layouts - ) - page_confusion_matrix = self._mlcm.generate_confusion_matrix( - gt_binary, preds_binary, matrix_categories_ids - ) - page_pixels = pg_width * pg_height - doc_pixels += page_pixels - all_pages_pixels[page_no] = page_pixels - page_confusion_matrices[page_no] = page_confusion_matrix else: # No prediction data for this page if ( self._missing_prediction_strategy == MissingPredictionStrategy.PENALIZE ): - gt_binary = self._mlcm.make_binary_representation( - pg_width, pg_height, gt_layouts + # Submit the page for computation + futures.append( + executor.submit( + evaluate_page, + self._mlcm, + doc_id, + page_no, + pg_width, + pg_height, + self._matrix_id_to_name, + gt_layouts, + ) ) - - # Make an all-one binary representation for the prediction and evaluate as usual - preds_binary = np.ones((pg_height, pg_width), dtype=np.uint64) - page_confusion_matrix = self._mlcm.generate_confusion_matrix( - gt_binary, preds_binary, matrix_categories_ids - ) - - page_pixels = pg_width * pg_height - doc_pixels += page_pixels - all_pages_pixels[page_no] = page_pixels - page_confusion_matrices[page_no] = page_confusion_matrix elif ( self._missing_prediction_strategy == MissingPredictionStrategy.IGNORE @@ -489,7 +533,8 @@ class PixelLayoutEvaluator(BaseEvaluator): raise ValueError( f"Unknown missing prediction strategy: {self._missing_prediction_strategy}" ) - return page_confusion_matrices, doc_pixels, all_pages_pixels + + return futures def _get_page_layout_resolution( self, diff --git a/docling_eval/evaluators/stats.py b/docling_eval/evaluators/stats.py index 218bbfb..a4e3813 100644 --- a/docling_eval/evaluators/stats.py +++ b/docling_eval/evaluators/stats.py @@ -8,6 +8,8 @@ import matplotlib.pyplot as plt import numpy as np from pydantic import BaseModel, model_validator +_log = logging.getLogger(__name__) + class DatasetStatistics(BaseModel): total: int @@ -82,7 +84,9 @@ def compute_stats( mean: float = statistics.mean(values) if len(values) > 0 else -1 median: float = statistics.median(values) if len(values) > 0 else -1 std: float = statistics.stdev(values) if len(values) > 1 else 0.0 - logging.info(f"total: {total}, mean: {mean}, median: {median}, std: {std}") + _log.debug( + f"Compute statistics: total: {total}, mean: {mean}, median: {median}, std: {std}" + ) max_value = 1.0 if not max_value_is_one and len(values) > 0: @@ -90,7 +94,7 @@ def compute_stats( # Compute the histogram hist, bins = np.histogram(values, bins=nr_bins, range=(0, max_value)) - logging.info(f"#-hist: {len(hist)}, #-bins: {len(bins)}") + _log.debug(f"Compute statistics: hist: {len(hist)}, #-bins: {len(bins)}") return DatasetStatistics( total=total, mean=mean, median=median, std=std, hist=hist, bins=bins diff --git a/docs/examples/evaluate_dpbench_on_external_predictions.sh b/docs/examples/evaluate_dpbench_on_external_predictions.sh index 6189e90..878c947 100755 --- a/docs/examples/evaluate_dpbench_on_external_predictions.sh +++ b/docs/examples/evaluate_dpbench_on_external_predictions.sh @@ -49,7 +49,7 @@ evaluate() { } -visualize() { +visualize_predictions() { local pred_dir save_dir modality pred_dir="$1" save_dir="$2" @@ -71,10 +71,27 @@ visualize() { --output-dir "${save_dir}" } + +visualize_evaluations() { + local pred_dir eval_root modality + pred_dir="$1" + eval_root="$2" + + for modality in "${MODALITIES[@]}"; do + echo "Evaluate: modality: ${modality} for evaluations: ${eval_root}" + uv run docling-eval visualize \ + --benchmark DPBench \ + --modality "${modality}" \ + --input-dir "${pred_dir}" \ + --output-dir "${eval_root}" + done +} + ########################################################################################### # Main # +######################################### # Predictions # json predictions @@ -95,8 +112,13 @@ evaluate \ scratch/DPBench/external_predictions_yaml +######################################### # Visualisations -visualize \ +visualize_predictions \ scratch/DPBench/predicted_documents/json \ scratch/DPBench/external_predictions_visualisations +visualize_evaluations \ + scratch/DPBench/predicted_documents/doctag \ + scratch/DPBench/external_predictions_doctags +