feat: Improvements in user experience: Performance, error handling, logging (#189)

* feat: Extend evaluate_dpbench_on_external_predictions.sh to include visualisations of the evaluations
  Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>
* fix: Improve error checking in main.py:visualize()
  Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>
* fix: Improve logging
  Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>
* feat: Parallelize the computation of PixelLayoutEvaluator at the level of page
  Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>
* fix: Make DatasetPixelLayoutEvaluation a subclass of DatasetEvaluation
  Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>
* feat: Parallelize the MarkdownTextEvaluator
  Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>
* chore: Improve logging
  Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>
---------
Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>
2026-05-17 13:10:47 +00:00 · 2025-12-16 11:25:55 +01:00
parent bcc5200f74
commit a850784b4f
8 changed files with 343 additions and 235 deletions
@@ -641,6 +641,7 @@ def evaluate(
    split: str = "test",
    cvat_overview_path: Optional[Path] = None,
    external_predictions_path: Optional[Path] = None,
+    concurrency: int = 4,
 ) -> Optional[DatasetEvaluationType]:
    """Evaluate predictions against ground truth."""
    if not os.path.exists(idir):
@@ -673,17 +674,16 @@ def evaluate(
            # label_filtering_strategy=LabelFilteringStrategy.INTERSECTION,
            page_mapping_path=cvat_overview_path,
        )
-        evaluation = layout_evaluator(  # type: ignore
+        layout_evaluation = layout_evaluator(  # type: ignore
            idir,
            split=split,
            external_predictions_path=external_predictions_path,
        )
-
        with open(save_fn, "w") as fd:
-            json.dump(evaluation.model_dump(), fd, indent=2, sort_keys=True)
+            json.dump(layout_evaluation.model_dump(), fd, indent=2, sort_keys=True)

        # Evaluate with the pixel-wise layout evaluation
-        pixel_layout_evaluator = PixelLayoutEvaluator()
+        pixel_layout_evaluator = PixelLayoutEvaluator(concurrency=concurrency)
        pixel_ds_evaluation: DatasetPixelLayoutEvaluation = pixel_layout_evaluator(
            idir,
            split=split,
@@ -696,6 +696,9 @@ def evaluate(
            pixel_save_root,
        )

+        # TODO: Redesign evaluate() to return multiple evaluation objects
+        evaluation = pixel_ds_evaluation  # type: ignore
+
    elif modality == EvaluationModality.TABLE_STRUCTURE:
        table_evaluator = TableEvaluator()
        evaluation = table_evaluator(  # type: ignore
@@ -764,7 +767,7 @@ def evaluate(
            )

    elif modality == EvaluationModality.MARKDOWN_TEXT:
-        md_evaluator = MarkdownTextEvaluator()
+        md_evaluator = MarkdownTextEvaluator(concurrency=concurrency)
        evaluation = md_evaluator(  # type: ignore
            idir,
            split=split,
@@ -823,8 +826,8 @@ def evaluate(
 def visualize(
    modality: EvaluationModality,
    benchmark: BenchMarkNames,
-    idir: Path,
    odir: Path,
+    idir: Path | None = None,
    split: str = "test",
 ):
    """
@@ -839,10 +842,6 @@ def visualize(
        begin_index: Begin index
        end_index: End index
    """
-    if not os.path.exists(idir):
-        _log.error(f"Input directory not found: {idir}")
-        return
-
    os.makedirs(odir, exist_ok=True)
    metrics_filename = odir / f"evaluation_{benchmark.value}_{modality.value}.json"

@@ -989,6 +988,11 @@ def visualize(

    elif modality == EvaluationModality.READING_ORDER:
        try:
+            # idir is required here
+            if idir is None or not idir.is_dir():
+                _log.error(f"Input directory not found: {idir}")
+                return
+
            with open(metrics_filename, "r") as fd:
                ro_evaluation = DatasetReadingOrderEvaluation.model_validate_json(
                    fd.read()
@@ -1080,6 +1084,11 @@ def visualize(

    elif modality == EvaluationModality.OCR:
        try:
+            # idir is required here
+            if idir is None or not idir.is_dir():
+                _log.error(f"Input directory not found: {idir}")
+                return
+
            with open(metrics_filename, "r") as fd:
                ocr_evaluation = OcrDatasetEvaluationResult.model_validate_json(
                    fd.read()
@@ -1511,6 +1520,9 @@ def evaluate_cmd(
            help="Path to load existing DoclingDocument predictions. The filename must follow the pattern [doc_id].[json|dt|yaml|yml]",
        ),
    ] = None,
+    concurrency: Annotated[
+        int, typer.Option(help="Concurrency for the computation of each metric")
+    ] = 4,
 ):
    """Evaluate predictions against ground truth."""
    input_dir, output_dir = derive_input_output_dirs(
@@ -1531,6 +1543,7 @@ def evaluate_cmd(
        odir=eval_output_dir,
        split=split,
        external_predictions_path=external_predictions_path,
+        concurrency=concurrency,
    )


@@ -1573,8 +1586,8 @@ def visualize_cmd(
    visualize(
        modality=modality,
        benchmark=benchmark,
-        idir=input_dir,
        odir=eval_output_dir,
+        idir=input_dir,
        split=split,
    )

@@ -75,12 +75,14 @@ class BaseEvaluator(Generic[UnitEvaluationType, DatasetEvaluationType]):
        supported_prediction_formats: List[PredictionFormats] = [
            PredictionFormats.DOCLING_DOCUMENT
        ],
+        concurrency: int = 4,
    ):
        r"""
        Parameters
        ----------
        intermediate_evaluations_path: When True the evalution per example will be saved in a file
        """
+        self._concurrency = concurrency
        self._intermediate_evaluations_path = intermediate_evaluations_path

        # Validate the prediction_sources
@@ -489,7 +489,7 @@ class LayoutEvaluator(BaseEvaluator):
            weighted_map_90_values.append(average_iou_90)
            weighted_map_95_values.append(average_iou_95)

-            _log.info(
+            _log.debug(
                "doc: %s\tprecision: %.2f, recall: %.2f, f1: %.2f, map_50: %.2f, "
                "precision_no_pics: %.2f, recall_no_pics: %.2f, f1_no_pics: %.2f",
                doc_id_page,
@@ -528,7 +528,6 @@ class LayoutEvaluator(BaseEvaluator):
                segmentation_precision_no_pictures=precision_no_pics,
                segmentation_recall_no_pictures=recall_no_pics,
                segmentation_f1_no_pictures=f1_no_pics,
-                # New per-sample element count metrics
                true_element_count=true_element_count,
                pred_element_count=pred_element_count,
                true_table_count=true_table_count,
@@ -836,9 +835,7 @@ class LayoutEvaluator(BaseEvaluator):
        true_labels: Dict[str, int] = {}
        pred_labels: Dict[str, int] = {}

-        for i, data in tqdm(
-            enumerate(ds), desc="Layout evaluations", ncols=120, total=len(ds)
-        ):
+        for i, data in enumerate(ds):
            data_record = DatasetRecordWithPrediction.model_validate(data)
            true_doc = data_record.ground_truth_doc
            pred_doc = self._get_pred_doc(data_record, ext_docdoc_loader)
@@ -1,4 +1,5 @@
 import logging
+from concurrent.futures import Executor, Future, ProcessPoolExecutor, as_completed
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Set

@@ -33,6 +34,51 @@ from docling_eval.utils.external_docling_document_loader import (
 _log = logging.getLogger(__name__)


+def compute_bleu_score(bleu_eval, true_txt: str, pred_txt: str) -> float:
+    r"""
+    Compute BLEU score with the HF evaluate and the default Tokenizer_13
+
+    Parameters
+    ----------
+    bleu_eval: Metric object exposing `compute(predictions=..., references=...)`
+               whose result is indexable by the key "bleu"
+    true_txt: Reference (ground-truth) text
+    pred_txt: Predicted text
+
+    Returns
+    -------
+    The value stored under the "bleu" key of the metric result
+    """
+    # One prediction scored against one reference: `predictions` is a list of
+    # texts and `references` is a list holding one list of references per prediction
+    result = bleu_eval.compute(predictions=[pred_txt], references=[[true_txt]])
+    bleu = result["bleu"]
+    return bleu
+
+
+def compute_nltk_scores(true_txt: str, pred_txt: str) -> dict[str, float]:
+    r"""
+    Tokenize both texts and compute token-level similarity scores
+
+    Parameters
+    ----------
+    true_txt: Reference (ground-truth) text
+    pred_txt: Predicted text
+
+    Returns:
+    --------
+    dict with keys: ["f1_score", "precision", "recall", "edit_distance", "meteor"]
+    """
+    # NOTE(review): word_tokenize, f_measure, precision, recall, edit_distance and
+    # meteor_score are presumably the NLTK helpers - their imports are not visible here
+    true_tokens = word_tokenize(true_txt)
+    true_tokens_set = set(true_tokens)
+    pred_tokens = word_tokenize(pred_txt)
+    pred_tokens_set = set(pred_tokens)
+
+    # These three scores receive token *sets*: token order and repetitions are ignored
+    f1_score = f_measure(true_tokens_set, pred_tokens_set)
+    precision_score = precision(true_tokens_set, pred_tokens_set)
+    recall_score = recall(true_tokens_set, pred_tokens_set)
+    # Edit distance over the token lists, divided by the length of the longer list.
+    # NOTE(review): the divisor is 0 when both texts tokenize to empty lists, which
+    # raises ZeroDivisionError - confirm callers never pass such inputs
+    edit_dist = edit_distance(pred_tokens, true_tokens) / max(
+        len(pred_tokens), len(true_tokens)
+    )
+    # METEOR is given a single reference (the ground-truth token list)
+    meteor = meteor_score.meteor_score([true_tokens], pred_tokens)
+
+    metrics: dict[str, float] = {
+        "f1_score": f1_score,
+        "precision": precision_score,
+        "recall": recall_score,
+        "edit_distance": edit_dist,
+        "meteor": meteor,
+    }
+    return metrics
+
+
+def evaluate_page(bleu_eval, true_md: str, pred_md: str) -> dict[str, float]:
+    r"""
+    Compute the bleu and the nltk scores
+
+    Parameters
+    ----------
+    bleu_eval: Metric object forwarded to compute_bleu_score()
+    true_md: Ground-truth markdown
+    pred_md: Predicted markdown
+
+    Returns
+    -------
+    The dict returned by compute_nltk_scores() extended with the key "bleu"
+    """
+    # Start from the nltk scores and add the BLEU score under its own key
+    scores = compute_nltk_scores(true_md, pred_md)
+    scores["bleu"] = compute_bleu_score(bleu_eval, true_md, pred_md)
+    return scores
+
+
 class PageMarkdownEvaluation(UnitEvaluation):
    doc_id: str

@@ -62,6 +108,7 @@ class MarkdownTextEvaluator(BaseEvaluator):
        self,
        intermediate_evaluations_path: Optional[Path] = None,
        prediction_sources: List[PredictionFormats] = [],
+        concurrency: int = 4,
    ):
        r""" """
        supported_prediction_formats: List[PredictionFormats] = [
@@ -74,6 +121,7 @@ class MarkdownTextEvaluator(BaseEvaluator):
            intermediate_evaluations_path=intermediate_evaluations_path,
            prediction_sources=prediction_sources,
            supported_prediction_formats=supported_prediction_formats,
+            concurrency=concurrency,
        )

        self._bleu_eval = evaluate.load("bleu")
@@ -146,67 +194,80 @@ class MarkdownTextEvaluator(BaseEvaluator):
            "meteor": [],
        }

-        for i, data in tqdm(
-            enumerate(ds_selection),
-            desc="Markdown text evaluations",
-            ncols=120,
-            total=len(ds_selection),
-        ):
-            data_record = DatasetRecordWithPrediction.model_validate(data)
-            doc_id = data_record.doc_id
-            true_doc = data_record.ground_truth_doc
-            true_md = self._docling_document_to_md(true_doc)
+        with ProcessPoolExecutor(max_workers=self._concurrency) as executor:
+            futures: list[Future] = []

-            # Get the predicted markdown from the external predictions path
-            if external_predictions_path is not None:
-                pred_doc = external_docling_doc_loader(data_record)
-                if pred_doc is None:
-                    _log.error("No external prediction found for doc_id=%s", doc_id)
+            # Submit the evaluation tasks
+            _log.info("Submitting the documents for evaluation...")
+            for data in ds_selection:
+                data_record = DatasetRecordWithPrediction.model_validate(data)
+                doc_id = data_record.doc_id
+                true_doc = data_record.ground_truth_doc
+                true_md = self._docling_document_to_md(true_doc)
+
+                # Get the predicted markdown from the external predictions path
+                if external_predictions_path is not None:
+                    pred_doc = external_docling_doc_loader(data_record)
+                    if pred_doc is None:
+                        _log.error("No external prediction found for doc_id=%s", doc_id)
+                        rejected_samples[
+                            EvaluationRejectionType.MISSING_PREDICTION
+                        ] += 1
+                        continue
+                    pred_md = self._docling_document_to_md(pred_doc)
+                else:
+                    if data_record.status not in self._accepted_status:
+                        _log.error(
+                            "Skipping record without successfull conversion status: %s",
+                            doc_id,
+                        )
+                        rejected_samples[
+                            EvaluationRejectionType.INVALID_CONVERSION_STATUS
+                        ] += 1
+                        continue
+                    pred_md = self._get_pred_md(data_record)  # type: ignore
+
+                if pred_md is None:
+                    _log.error("There is no markdown prediction for doc_id=%s", doc_id)
                    rejected_samples[EvaluationRejectionType.MISSING_PREDICTION] += 1
                    continue
-                pred_md = self._docling_document_to_md(pred_doc)
-            else:
-                if data_record.status not in self._accepted_status:
-                    _log.error(
-                        "Skipping record without successfull conversion status: %s",
-                        doc_id,
+
+                if true_md != "" and pred_md != "":
+                    futures.append(
+                        executor.submit(
+                            evaluate_page, self._bleu_eval, true_md, pred_md
+                        )
                    )
-                    rejected_samples[
-                        EvaluationRejectionType.INVALID_CONVERSION_STATUS
-                    ] += 1
-                    continue
-                pred_md = self._get_pred_md(data_record)  # type: ignore

-            if not pred_md:
-                _log.error("There is no markdown prediction for doc_id=%s", doc_id)
-                rejected_samples[EvaluationRejectionType.MISSING_PREDICTION] += 1
-                continue
+            # Collect the futures
+            _log.info("Collecting the documents for evaluations...")
+            for i, future in tqdm(
+                enumerate(as_completed(futures)),
+                desc="Markdown text evaluations",
+                ncols=120,
+                total=len(futures),
+            ):
+                doc_metrics = future.result()

-            bleu = 0.0
-            if true_md != "" and pred_md != "":
-                bleu = self._compute_bleu_score(true_md, pred_md)
-                ntlk_scores = self._compute_nltk_scores(true_md, pred_md)
+                # Collect metrics across pages
+                for score_name, score in doc_metrics.items():
+                    ds_metrics[score_name].append(score)

-            # Collect metrics across pages
-            ds_metrics["bleu"].append(bleu)
-            for score_name, score in ntlk_scores.items():
-                ds_metrics[score_name].append(score)
+                md_evaluation = PageMarkdownEvaluation(
+                    doc_id=doc_id,
+                    true_md=true_md,
+                    pred_md=pred_md,
+                    bleu=doc_metrics["bleu"],
+                    f1_score=doc_metrics["f1_score"],
+                    precision=doc_metrics["precision"],
+                    recall=doc_metrics["recall"],
+                    edit_distance=doc_metrics["edit_distance"],
+                    meteor=doc_metrics["meteor"],
+                )
+                evaluations.append(md_evaluation)

-            md_evaluation = PageMarkdownEvaluation(
-                doc_id=doc_id,
-                true_md=true_md,
-                pred_md=pred_md,
-                bleu=bleu,
-                f1_score=ntlk_scores["f1_score"],
-                precision=ntlk_scores["precision"],
-                recall=ntlk_scores["recall"],
-                edit_distance=ntlk_scores["edit_distance"],
-                meteor=ntlk_scores["meteor"],
-            )
-            evaluations.append(md_evaluation)
-
-            if self._intermediate_evaluations_path:
-                self.save_intermediate_evaluations("MD", i, doc_id, evaluations)
+                if self._intermediate_evaluations_path:
+                    self.save_intermediate_evaluations("MD", i, doc_id, evaluations)

        ds_md_evalutions = DatasetMarkdownEvaluation(
            evaluated_samples=len(evaluations),
@@ -221,44 +282,6 @@ class MarkdownTextEvaluator(BaseEvaluator):
        )
        return ds_md_evalutions

-    def _compute_bleu_score(self, true_txt: str, pred_txt: str) -> float:
-        r"""
-        Compute BLEU score with the HF evaluate and the default Tokenizer_13
-        """
-        result = self._bleu_eval.compute(
-            predictions=[pred_txt], references=[[true_txt]]
-        )
-        bleu = result["bleu"]
-        return bleu
-
-    def _compute_nltk_scores(self, true_txt: str, pred_txt: str) -> dict[str, float]:
-        r"""
-        Returns:
-        --------
-        dict with keys: ["f_measure", "precision", "recall", "edit_dist"]
-        """
-        true_tokens = word_tokenize(true_txt)
-        true_tokens_set = set(true_tokens)
-        pred_tokens = word_tokenize(pred_txt)
-        pred_tokens_set = set(pred_tokens)
-
-        f1_score = f_measure(true_tokens_set, pred_tokens_set)
-        precision_score = precision(true_tokens_set, pred_tokens_set)
-        recall_score = recall(true_tokens_set, pred_tokens_set)
-        edit_dist = edit_distance(pred_tokens, true_tokens) / max(
-            len(pred_tokens), len(true_tokens)
-        )
-        meteor = meteor_score.meteor_score([true_tokens], pred_tokens)
-
-        metrics: dict[str, float] = {
-            "f1_score": f1_score,
-            "precision": precision_score,
-            "recall": recall_score,
-            "edit_distance": edit_dist,
-            "meteor": meteor,
-        }
-        return metrics
-
    def _docling_document_to_md(self, doc: DoclingDocument) -> str:
        r"""
        Export DoclingDocument to markdown
@@ -3,7 +3,10 @@ from typing import Any, Dict, Optional
 import numpy as np
 from pydantic import BaseModel, model_serializer, model_validator

-from docling_eval.evaluators.base_evaluator import EvaluationRejectionType
+from docling_eval.evaluators.base_evaluator import (
+    DatasetEvaluation,
+    EvaluationRejectionType,
+)
 from docling_eval.evaluators.stats import DatasetStatistics


@@ -73,11 +76,10 @@ class PagePixelLayoutEvaluation(BaseModel):
    matrix_evaluation: MultiLabelMatrixEvaluation


-class DatasetPixelLayoutEvaluation(BaseModel):
+class DatasetPixelLayoutEvaluation(DatasetEvaluation):
    layout_model_name: Optional[str]
    num_pages: int
    num_pixels: int
-    rejected_samples: Dict[EvaluationRejectionType, int]
    matrix_evaluation: MultiLabelMatrixEvaluation
    page_evaluations: Dict[str, PagePixelLayoutEvaluation]

@@ -3,6 +3,7 @@ import json
 import logging
 import math
 from collections import defaultdict
+from concurrent.futures import Executor, Future, ProcessPoolExecutor, as_completed
 from enum import Enum
 from pathlib import Path
 from typing import Dict, List, Optional, Tuple, Union
@@ -53,6 +54,51 @@ def category_name_to_docitemlabel(category_name: str) -> DocItemLabel:
    return label


+def evaluate_page(
+    mlcm: MultiLabelConfusionMatrix,
+    doc_id: str,
+    page_no: int,
+    pg_width: int,
+    pg_height: int,
+    matrix_id_to_name: dict[int, str],
+    gt_resolutions: list[LayoutResolution],
+    pred_resolutions: Optional[list[LayoutResolution]] = None,
+) -> tuple[str, int, int, MultiLabelMatrixEvaluation]:
+    r"""
+    Compute the confusion matrix and the metrics for one page
+    If pred_resolutions is None, assume an all-background prediction
+
+    Parameters
+    ----------
+    mlcm: Helper used to build the binary representations, the confusion matrix
+          and the metrics
+    doc_id: Document id, returned unchanged so the caller can identify the result
+    page_no: Page number, returned unchanged so the caller can identify the result
+    pg_width: Page width in pixels
+    pg_height: Page height in pixels
+    matrix_id_to_name: Mapping from matrix category id to category name
+    gt_resolutions: Ground-truth layout resolutions of the page
+    pred_resolutions: Predicted layout resolutions of the page or None
+
+    Return
+    ------
+    doc_id
+    page_no
+    page_pixels
+    page_metrics
+    """
+    # Make binary representations
+    gt_binary = mlcm.make_binary_representation(pg_width, pg_height, gt_resolutions)
+    if pred_resolutions is not None:
+        preds_binary = mlcm.make_binary_representation(
+            pg_width, pg_height, pred_resolutions
+        )
+    else:
+        # NOTE(review): presumably the value 1 encodes "background only" in the
+        # binary representation - confirm against MultiLabelConfusionMatrix
+        preds_binary = np.ones((pg_height, pg_width), dtype=np.uint64)
+
+    # Compute confusion matrix
+    matrix_categories_ids: List[int] = list(matrix_id_to_name.keys())
+    confusion_matrix = mlcm.generate_confusion_matrix(
+        gt_binary, preds_binary, matrix_categories_ids
+    )
+
+    # Compute metrics
+    page_metrics: MultiLabelMatrixEvaluation = mlcm.compute_metrics(
+        confusion_matrix, matrix_id_to_name
+    )
+    # The page size in pixels is the page area
+    page_pixels = pg_width * pg_height
+
+    # The confusion matrix is not returned on its own, only the computed metrics
+    return doc_id, page_no, page_pixels, page_metrics
+
+
 class PixelLayoutEvaluator(BaseEvaluator):
    r"""
    Evaluate the document layout by computing a pixel-level confusion matrix and derivative matrices
@@ -65,6 +111,7 @@ class PixelLayoutEvaluator(BaseEvaluator):
        intermediate_evaluations_path: Optional[Path] = None,
        prediction_sources: List[PredictionFormats] = [],
        missing_prediction_strategy: MissingPredictionStrategy = MissingPredictionStrategy.PENALIZE,
+        concurrency: int = 4,
    ):
        r"""

@@ -82,6 +129,7 @@ class PixelLayoutEvaluator(BaseEvaluator):
        if not prediction_sources:
            prediction_sources = supported_prediction_formats
        super().__init__(
+            concurrency=concurrency,
            intermediate_evaluations_path=intermediate_evaluations_path,
            prediction_sources=prediction_sources,
            supported_prediction_formats=supported_prediction_formats,
@@ -103,6 +151,23 @@ class PixelLayoutEvaluator(BaseEvaluator):
            self._build_matrix_categories(label_mapping)
        )

+    @staticmethod
+    def evaluation_filenames(
+        benchmark: BenchMarkNames, save_root: Path
+    ) -> dict[str, Path]:
+        r"""
+        Generate the expected filenames for the produced evaluation files
+
+        Parameters
+        ----------
+        benchmark: Benchmark whose value becomes part of the filenames
+        save_root: Directory under which the filenames are generated
+
+        Returns
+        -------
+        dict with keys: ["json", "excel"] mapping to the respective file paths
+        """
+        # The filenames carry the layout modality value prefixed with "pixel_"
+        modality: str = EvaluationModality.LAYOUT.value
+        json_fn = save_root / f"evaluation_{benchmark.value}_pixel_{modality}.json"
+        excel_fn = save_root / f"evaluation_{benchmark.value}_pixel_{modality}.xlsx"
+
+        eval_filenames: dict[str, Path] = {
+            "json": json_fn,
+            "excel": excel_fn,
+        }
+        return eval_filenames
+
    def _build_matrix_categories(
        self,
        label_mapping: Optional[Dict[DocItemLabel, Optional[DocItemLabel]]] = None,
@@ -193,6 +258,7 @@ class PixelLayoutEvaluator(BaseEvaluator):
        ds_selection: Dataset = ds[split]

        # Results containers
+        evaluated_samples = 0
        rejected_samples: Dict[EvaluationRejectionType, int] = {
            EvaluationRejectionType.INVALID_CONVERSION_STATUS: 0,
            EvaluationRejectionType.MISSING_PREDICTION: 0,
@@ -213,95 +279,101 @@ class PixelLayoutEvaluator(BaseEvaluator):
            []
        )  # Gather f1 score/image when evaluated on collapsed classes

-        for i, data in tqdm(
-            enumerate(ds_selection),
-            desc="Multi-label Matrix Layout evaluations",
-            ncols=120,
-            total=len(ds_selection),
-        ):
-            data_record = DatasetRecordWithPrediction.model_validate(data)
+        with ProcessPoolExecutor(max_workers=self._concurrency) as executor:
+            futures: list[Future] = []

-            # Try to extract the layout model name
-            if not self._layout_model_name:
-                self._layout_model_name = dict_get(
-                    data_record.predictor_info,
-                    [
-                        "options",
-                        "pdf",
-                        "pipeline_options",
-                        "layout_options",
-                        "model_spec",
-                        "name",
-                    ],
-                )
+            # Submit pages for execution
+            _log.info("Submitting the documents for evaluation...")
+            for data in ds_selection:
+                data_record = DatasetRecordWithPrediction.model_validate(data)

-            doc_id: str = data_record.doc_id
-            if (
-                ext_docdoc_loader is None
-                and data_record.status not in self._accepted_status
-            ):
-                _log.error(
-                    "Skipping record without successfull conversion status: %s", doc_id
-                )
-                rejected_samples[EvaluationRejectionType.INVALID_CONVERSION_STATUS] += 1
-                continue
+                # Try to extract the layout model name
+                if not self._layout_model_name:
+                    self._layout_model_name = dict_get(
+                        data_record.predictor_info,
+                        [
+                            "options",
+                            "pdf",
+                            "pipeline_options",
+                            "layout_options",
+                            "model_spec",
+                            "name",
+                        ],
+                    )

-            true_doc = data_record.ground_truth_doc
-            pred_doc = self._get_pred_doc(data_record, ext_docdoc_loader)
-            if not pred_doc:
-                _log.error("There is no prediction for doc_id=%s", doc_id)
-                rejected_samples[EvaluationRejectionType.MISSING_PREDICTION] += 1
-                continue
+                doc_id: str = data_record.doc_id
+                if (
+                    ext_docdoc_loader is None
+                    and data_record.status not in self._accepted_status
+                ):
+                    _log.error(
+                        "Skipping record without successfull conversion status: %s",
+                        doc_id,
+                    )
+                    rejected_samples[
+                        EvaluationRejectionType.INVALID_CONVERSION_STATUS
+                    ] += 1
+                    continue

-            # Compute confusion matrices
-            pages_confusion_matrices: Dict[int, np.ndarray]
-            pages_pixels: Dict[int, int]
-            pages_confusion_matrices, doc_num_pixels, pages_pixels = (
-                self._compute_document_confusion_matrix(true_doc, pred_doc)
-            )
+                true_doc = data_record.ground_truth_doc
+                pred_doc = self._get_pred_doc(data_record, ext_docdoc_loader)
+                if not pred_doc:
+                    _log.error("There is no prediction for doc_id=%s", doc_id)
+                    rejected_samples[EvaluationRejectionType.MISSING_PREDICTION] += 1
+                    continue

-            # Compute metrics per page
-            for page_no, page_confusion_matrix in pages_confusion_matrices.items():
-                # Contribute to the dataset's confusion matrix
-                ds_confusion_matrix += page_confusion_matrix
-
-                # Compute page metrics
-                page_matrix_evaluation: MultiLabelMatrixEvaluation = (
-                    self._mlcm.compute_metrics(
-                        page_confusion_matrix,
-                        self._matrix_id_to_name,
+                evaluated_samples += 1
+                futures.extend(
+                    self._submit_document_evaluation(
+                        executor, doc_id, true_doc, pred_doc
                    )
                )
+
+            # Collect the futures
+            _log.info("Collecting the documents for evaluations...")
+            for future in tqdm(
+                as_completed(futures),
+                desc="Multi-label Matrix Layout evaluations",
+                ncols=120,
+                total=len(futures),
+            ):
+                page_metrics: MultiLabelMatrixEvaluation
+                doc_id, page_no, page_pixels, page_metrics = future.result()
+
+                page_confusion_matrix: np.ndarray = (
+                    page_metrics.detailed.confusion_matrix
+                )
+                ds_num_pixels += page_pixels
+                ds_confusion_matrix += page_confusion_matrix
+                doc_page_id = f"{doc_id}-{page_no}"
                page_evaluation = PagePixelLayoutEvaluation(
                    doc_id=doc_id,
                    page_no=page_no,
-                    num_pixels=pages_pixels[page_no],
-                    matrix_evaluation=page_matrix_evaluation,
+                    num_pixels=page_pixels,
+                    matrix_evaluation=page_metrics,
                )
-                doc_page_id = f"{doc_id}-{page_no}"
                all_pages_evaluations[doc_page_id] = page_evaluation

                # Update f1 lists
                pages_detailed_f1.append(
-                    page_matrix_evaluation.detailed.agg_metrics.classes_f1_mean
+                    page_metrics.detailed.agg_metrics.classes_f1_mean
                )
                pages_collapsed_f1.append(
-                    page_matrix_evaluation.collapsed.agg_metrics.classes_f1_mean
+                    page_metrics.collapsed.agg_metrics.classes_f1_mean
                )

-            ds_num_pixels += doc_num_pixels
-
-        # Compute metrics for the dataset and each document
+        # Compute metrics for the dataset
        ds_matrix_evaluation: MultiLabelMatrixEvaluation = self._mlcm.compute_metrics(
            ds_confusion_matrix,
            self._matrix_id_to_name,
        )

        ds_evaluation = DatasetPixelLayoutEvaluation(
+            evaluated_samples=evaluated_samples,
+            rejected_samples=rejected_samples,
            layout_model_name=self._layout_model_name,
            num_pages=len(all_pages_evaluations),
            num_pixels=ds_num_pixels,
-            rejected_samples=rejected_samples,
            matrix_evaluation=ds_matrix_evaluation,
            page_evaluations=all_pages_evaluations,
            f1_all_classes_stats=compute_stats(pages_detailed_f1),
@@ -310,23 +382,6 @@ class PixelLayoutEvaluator(BaseEvaluator):

        return ds_evaluation

-    @staticmethod
-    def evaluation_filenames(
-        benchmark: BenchMarkNames, save_root: Path
-    ) -> dict[str, Path]:
-        r"""
-        Generate the expected filenames for the produced evaluation files
-        """
-        modality: str = EvaluationModality.LAYOUT.value
-        json_fn = save_root / f"evaluation_{benchmark.value}_pixel_{modality}.json"
-        excel_fn = save_root / f"evaluation_{benchmark.value}_pixel_{modality}.xlsx"
-
-        eval_filenames: dict[str, Path] = {
-            "json": json_fn,
-            "excel": excel_fn,
-        }
-        return eval_filenames
-
    def save_evaluations(
        self,
        benchmark: BenchMarkNames,
@@ -393,19 +448,17 @@ class PixelLayoutEvaluator(BaseEvaluator):
            excel_fn,
        )

-    def _compute_document_confusion_matrix(
+    def _submit_document_evaluation(
        self,
+        executor: Executor,
+        doc_id: str,
        true_doc: DoclingDocument,
        pred_doc: DoclingDocument,
-    ) -> Tuple[
-        Dict[int, np.ndarray],  # page_no -> page confusion matrix
-        int,  # document num_pixels
-        Dict[int, int],  # page_no -> page num_pixels
-    ]:
+    ) -> list[Future]:
        r"""
-        Compute the confusion matrix for the given documents.
-        This is the sum of the confusion matrices of the document pages.
+        Submit the document for evaluation and return a future for each page
        """
+        futures: list[Future] = []

        # Collect all DocItems by page for both GT and predictions
        true_pages_to_objects = self._collect_items_by_page(true_doc)
@@ -416,13 +469,6 @@ class PixelLayoutEvaluator(BaseEvaluator):
        pred_pages = set(pred_pages_to_objects.keys())
        _log.debug(f"GT pages: {sorted(gt_pages)}, Pred pages: {sorted(pred_pages)}")

-        matrix_categories_ids: List[int] = list(self._matrix_id_to_name.keys())
-        page_confusion_matrices: Dict[int, np.ndarray] = (
-            {}
-        )  # page_no -> page confusion_matrix
-        all_pages_pixels: Dict[int, int] = {}  # page_no -> page num_pixels
-        doc_pixels = 0
-
        for page_no in sorted(gt_pages):
            page_size = true_doc.pages[page_no].size
            pg_width = math.ceil(page_size.width)
@@ -444,41 +490,39 @@ class PixelLayoutEvaluator(BaseEvaluator):
                    doc=pred_doc,
                )

-                # TODO: Parallelize the confusion matrix over the pages
-                # Compute the confusion matrix
-                gt_binary = self._mlcm.make_binary_representation(
-                    pg_width, pg_height, gt_layouts
+                # Submit the page for computation
+                futures.append(
+                    executor.submit(
+                        evaluate_page,
+                        self._mlcm,
+                        doc_id,
+                        page_no,
+                        pg_width,
+                        pg_height,
+                        self._matrix_id_to_name,
+                        gt_layouts,
+                        pred_layouts,
+                    )
                )
-                preds_binary = self._mlcm.make_binary_representation(
-                    pg_width, pg_height, pred_layouts
-                )
-                page_confusion_matrix = self._mlcm.generate_confusion_matrix(
-                    gt_binary, preds_binary, matrix_categories_ids
-                )
-                page_pixels = pg_width * pg_height
-                doc_pixels += page_pixels
-                all_pages_pixels[page_no] = page_pixels
-                page_confusion_matrices[page_no] = page_confusion_matrix
            else:
                # No prediction data for this page
                if (
                    self._missing_prediction_strategy
                    == MissingPredictionStrategy.PENALIZE
                ):
-                    gt_binary = self._mlcm.make_binary_representation(
-                        pg_width, pg_height, gt_layouts
+                    # Submit the page for computation
+                    futures.append(
+                        executor.submit(
+                            evaluate_page,
+                            self._mlcm,
+                            doc_id,
+                            page_no,
+                            pg_width,
+                            pg_height,
+                            self._matrix_id_to_name,
+                            gt_layouts,
+                        )
                    )
-
-                    # Make an all-one binary representation for the prediction and evaluate as usual
-                    preds_binary = np.ones((pg_height, pg_width), dtype=np.uint64)
-                    page_confusion_matrix = self._mlcm.generate_confusion_matrix(
-                        gt_binary, preds_binary, matrix_categories_ids
-                    )
-
-                    page_pixels = pg_width * pg_height
-                    doc_pixels += page_pixels
-                    all_pages_pixels[page_no] = page_pixels
-                    page_confusion_matrices[page_no] = page_confusion_matrix
                elif (
                    self._missing_prediction_strategy
                    == MissingPredictionStrategy.IGNORE
@@ -489,7 +533,8 @@ class PixelLayoutEvaluator(BaseEvaluator):
                    raise ValueError(
                        f"Unknown missing prediction strategy: {self._missing_prediction_strategy}"
                    )
-        return page_confusion_matrices, doc_pixels, all_pages_pixels
+
+        return futures

    def _get_page_layout_resolution(
        self,
@@ -8,6 +8,8 @@ import matplotlib.pyplot as plt
 import numpy as np
 from pydantic import BaseModel, model_validator

+_log = logging.getLogger(__name__)
+

 class DatasetStatistics(BaseModel):
    total: int
@@ -82,7 +84,9 @@ def compute_stats(
    mean: float = statistics.mean(values) if len(values) > 0 else -1
    median: float = statistics.median(values) if len(values) > 0 else -1
    std: float = statistics.stdev(values) if len(values) > 1 else 0.0
-    logging.info(f"total: {total}, mean: {mean}, median: {median}, std: {std}")
+    _log.debug(
+        f"Compute statistics: total: {total}, mean: {mean}, median: {median}, std: {std}"
+    )

    max_value = 1.0
    if not max_value_is_one and len(values) > 0:
@@ -90,7 +94,7 @@ def compute_stats(

    # Compute the histogram
    hist, bins = np.histogram(values, bins=nr_bins, range=(0, max_value))
-    logging.info(f"#-hist: {len(hist)}, #-bins: {len(bins)}")
+    _log.debug(f"Compute statistics: hist: {len(hist)}, #-bins: {len(bins)}")

    return DatasetStatistics(
        total=total, mean=mean, median=median, std=std, hist=hist, bins=bins
@@ -49,7 +49,7 @@ evaluate() {
 }


-visualize() {
+visualize_predictions() {
    local pred_dir save_dir modality
    pred_dir="$1"
    save_dir="$2"
@@ -71,10 +71,27 @@ visualize() {
        --output-dir "${save_dir}"
 }

+
+# Run `docling-eval visualize` for the DPBench benchmark once per entry of the
+# MODALITIES array.
+#   $1: directory passed to the CLI as --input-dir
+#   $2: directory passed to the CLI as --output-dir
+visualize_evaluations() {
+    local pred_dir eval_root modality
+    pred_dir="$1"
+    eval_root="$2"
+
+    for modality in "${MODALITIES[@]}"; do
+        # This step visualises (it does not evaluate), so label the progress line accordingly.
+        echo "Visualize: modality: ${modality} for evaluations: ${eval_root}"
+        uv run docling-eval visualize \
+            --benchmark DPBench \
+            --modality "${modality}" \
+            --input-dir "${pred_dir}" \
+            --output-dir "${eval_root}"
+    done
+}
+
 ###########################################################################################
 # Main
 #

+#########################################
 # Predictions

 # json predictions
@@ -95,8 +112,13 @@ evaluate \
    scratch/DPBench/external_predictions_yaml


+#########################################
 # Visualisations
-visualize \
+visualize_predictions \
    scratch/DPBench/predicted_documents/json \
    scratch/DPBench/external_predictions_visualisations

+visualize_evaluations \
+    scratch/DPBench/predicted_documents/doctag \
+    scratch/DPBench/external_predictions_doctags
+