From a850784b4f9b26e94659085d6ea1f95473313f90 Mon Sep 17 00:00:00 2001
From: Nikos Livathinos <100353117+nikos-livathinos@users.noreply.github.com>
Date: Tue, 16 Dec 2025 11:25:55 +0100
Subject: [PATCH] feat: Improvements in user experience: Performance, error
 handling, logging (#189)

* feat: Extend evaluate_dpbench_on_external_predictions.sh to include visualisations of the evaluations

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>

* fix: Improve error checking in main.py:visualize()

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>

* fix: Improve logging

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>

* feat: Parallelize the computation of PixelLayoutEvaluator at the level of page

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>

* fix: Make DatasetPixelLayoutEvaluation a subclass of DatasetEvaluation

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>

* feat: Parallelize the MarkdownTextEvaluator

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>

* chore: Improve logging

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>

---------

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>
---
 docling_eval/cli/main.py                      |  35 ++-
 docling_eval/evaluators/base_evaluator.py     |   2 +
 docling_eval/evaluators/layout_evaluator.py   |   7 +-
 .../evaluators/markdown_text_evaluator.py     | 205 +++++++------
 docling_eval/evaluators/pixel/pixel_types.py  |   8 +-
 .../evaluators/pixel_layout_evaluator.py      | 287 ++++++++++--------
 docling_eval/evaluators/stats.py              |   8 +-
 ...valuate_dpbench_on_external_predictions.sh |  26 +-
 8 files changed, 343 insertions(+), 235 deletions(-)

diff --git a/docling_eval/cli/main.py b/docling_eval/cli/main.py
index 3972f3a..1a6826c 100644
--- a/docling_eval/cli/main.py
+++ b/docling_eval/cli/main.py
@@ -641,6 +641,7 @@ def evaluate(
     split: str = "test",
     cvat_overview_path: Optional[Path] = None,
     external_predictions_path: Optional[Path] = None,
+    concurrency: int = 4,
 ) -> Optional[DatasetEvaluationType]:
     """Evaluate predictions against ground truth."""
     if not os.path.exists(idir):
@@ -673,17 +674,16 @@ def evaluate(
             # label_filtering_strategy=LabelFilteringStrategy.INTERSECTION,
             page_mapping_path=cvat_overview_path,
         )
-        evaluation = layout_evaluator(  # type: ignore
+        layout_evaluation = layout_evaluator(  # type: ignore
             idir,
             split=split,
             external_predictions_path=external_predictions_path,
         )
-
         with open(save_fn, "w") as fd:
-            json.dump(evaluation.model_dump(), fd, indent=2, sort_keys=True)
+            json.dump(layout_evaluation.model_dump(), fd, indent=2, sort_keys=True)
 
         # Evaluate with the pixel-wise layout evaluation
-        pixel_layout_evaluator = PixelLayoutEvaluator()
+        pixel_layout_evaluator = PixelLayoutEvaluator(concurrency=concurrency)
         pixel_ds_evaluation: DatasetPixelLayoutEvaluation = pixel_layout_evaluator(
             idir,
             split=split,
@@ -696,6 +696,9 @@ def evaluate(
             pixel_save_root,
         )
 
+        # TODO: Redesign evaluate() to return multiple evaluation objects
+        evaluation = pixel_ds_evaluation  # type: ignore
+
     elif modality == EvaluationModality.TABLE_STRUCTURE:
         table_evaluator = TableEvaluator()
         evaluation = table_evaluator(  # type: ignore
@@ -764,7 +767,7 @@ def evaluate(
             )
 
     elif modality == EvaluationModality.MARKDOWN_TEXT:
-        md_evaluator = MarkdownTextEvaluator()
+        md_evaluator = MarkdownTextEvaluator(concurrency=concurrency)
         evaluation = md_evaluator(  # type: ignore
             idir,
             split=split,
@@ -823,8 +826,8 @@ def evaluate(
 def visualize(
     modality: EvaluationModality,
     benchmark: BenchMarkNames,
-    idir: Path,
     odir: Path,
+    idir: Path | None = None,
     split: str = "test",
 ):
     """
@@ -839,10 +842,6 @@ def visualize(
         begin_index: Begin index
         end_index: End index
     """
-    if not os.path.exists(idir):
-        _log.error(f"Input directory not found: {idir}")
-        return
-
     os.makedirs(odir, exist_ok=True)
     metrics_filename = odir / f"evaluation_{benchmark.value}_{modality.value}.json"
 
@@ -989,6 +988,11 @@ def visualize(
 
     elif modality == EvaluationModality.READING_ORDER:
         try:
+            # idir is required here
+            if idir is None or not idir.is_dir():
+                _log.error(f"Input directory not found: {idir}")
+                return
+
             with open(metrics_filename, "r") as fd:
                 ro_evaluation = DatasetReadingOrderEvaluation.model_validate_json(
                     fd.read()
@@ -1080,6 +1084,11 @@ def visualize(
 
     elif modality == EvaluationModality.OCR:
         try:
+            # idir is required here
+            if idir is None or not idir.is_dir():
+                _log.error(f"Input directory not found: {idir}")
+                return
+
             with open(metrics_filename, "r") as fd:
                 ocr_evaluation = OcrDatasetEvaluationResult.model_validate_json(
                     fd.read()
@@ -1511,6 +1520,9 @@ def evaluate_cmd(
             help="Path to load existing DoclingDocument predictions. The filename must follow the pattern [doc_id].[json|dt|yaml|yml]",
         ),
     ] = None,
+    concurrency: Annotated[
+        int, typer.Option(help="Concurrency for the computation of each metric")
+    ] = 4,
 ):
     """Evaluate predictions against ground truth."""
     input_dir, output_dir = derive_input_output_dirs(
@@ -1531,6 +1543,7 @@ def evaluate_cmd(
         odir=eval_output_dir,
         split=split,
         external_predictions_path=external_predictions_path,
+        concurrency=concurrency,
     )
 
 
@@ -1573,8 +1586,8 @@ def visualize_cmd(
     visualize(
         modality=modality,
         benchmark=benchmark,
-        idir=input_dir,
         odir=eval_output_dir,
+        idir=input_dir,
         split=split,
     )
 
diff --git a/docling_eval/evaluators/base_evaluator.py b/docling_eval/evaluators/base_evaluator.py
index 4198f08..4a009e0 100644
--- a/docling_eval/evaluators/base_evaluator.py
+++ b/docling_eval/evaluators/base_evaluator.py
@@ -75,12 +75,14 @@ class BaseEvaluator(Generic[UnitEvaluationType, DatasetEvaluationType]):
         supported_prediction_formats: List[PredictionFormats] = [
             PredictionFormats.DOCLING_DOCUMENT
         ],
+        concurrency: int = 4,
     ):
         r"""
         Parameters
         ----------
         intermediate_evaluations_path: When True the evalution per example will be saved in a file
         """
+        self._concurrency = concurrency
         self._intermediate_evaluations_path = intermediate_evaluations_path
 
         # Validate the prediction_sources
diff --git a/docling_eval/evaluators/layout_evaluator.py b/docling_eval/evaluators/layout_evaluator.py
index 0d11394..c1a86e7 100644
--- a/docling_eval/evaluators/layout_evaluator.py
+++ b/docling_eval/evaluators/layout_evaluator.py
@@ -489,7 +489,7 @@ class LayoutEvaluator(BaseEvaluator):
             weighted_map_90_values.append(average_iou_90)
             weighted_map_95_values.append(average_iou_95)
 
-            _log.info(
+            _log.debug(
                 "doc: %s\tprecision: %.2f, recall: %.2f, f1: %.2f, map_50: %.2f, "
                 "precision_no_pics: %.2f, recall_no_pics: %.2f, f1_no_pics: %.2f",
                 doc_id_page,
@@ -528,7 +528,6 @@ class LayoutEvaluator(BaseEvaluator):
                 segmentation_precision_no_pictures=precision_no_pics,
                 segmentation_recall_no_pictures=recall_no_pics,
                 segmentation_f1_no_pictures=f1_no_pics,
-                # New per-sample element count metrics
                 true_element_count=true_element_count,
                 pred_element_count=pred_element_count,
                 true_table_count=true_table_count,
@@ -836,9 +835,7 @@ class LayoutEvaluator(BaseEvaluator):
         true_labels: Dict[str, int] = {}
         pred_labels: Dict[str, int] = {}
 
-        for i, data in tqdm(
-            enumerate(ds), desc="Layout evaluations", ncols=120, total=len(ds)
-        ):
+        for i, data in enumerate(ds):
             data_record = DatasetRecordWithPrediction.model_validate(data)
             true_doc = data_record.ground_truth_doc
             pred_doc = self._get_pred_doc(data_record, ext_docdoc_loader)
diff --git a/docling_eval/evaluators/markdown_text_evaluator.py b/docling_eval/evaluators/markdown_text_evaluator.py
index afbc68e..1c17758 100644
--- a/docling_eval/evaluators/markdown_text_evaluator.py
+++ b/docling_eval/evaluators/markdown_text_evaluator.py
@@ -1,4 +1,5 @@
 import logging
+from concurrent.futures import Executor, Future, ProcessPoolExecutor, as_completed
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Set
 
@@ -33,6 +34,51 @@ from docling_eval.utils.external_docling_document_loader import (
 _log = logging.getLogger(__name__)
 
 
+def compute_bleu_score(bleu_eval, true_txt: str, pred_txt: str) -> float:
+    r"""
+    Compute the BLEU score of pred_txt against the single reference true_txt
+    with the given HF evaluate metric and its default tokenizer
+    """
+    scores = bleu_eval.compute(predictions=[pred_txt], references=[[true_txt]])
+    return scores["bleu"]
+
+
+def compute_nltk_scores(true_txt: str, pred_txt: str) -> dict[str, float]:
+    r"""
+    Returns:
+    --------
+    dict with keys: ["f1_score", "precision", "recall", "edit_distance", "meteor"]
+    """
+    true_tokens = word_tokenize(true_txt)
+    true_tokens_set = set(true_tokens)
+    pred_tokens = word_tokenize(pred_txt)
+    pred_tokens_set = set(pred_tokens)
+    # Empty token lists: map a None score to 0.0 and never divide by a zero length
+    f1_score = f_measure(true_tokens_set, pred_tokens_set) or 0.0
+    precision_score = precision(true_tokens_set, pred_tokens_set) or 0.0
+    recall_score = recall(true_tokens_set, pred_tokens_set) or 0.0
+    edit_dist = edit_distance(pred_tokens, true_tokens) / max(
+        len(pred_tokens), len(true_tokens), 1
+    )
+    meteor = meteor_score.meteor_score([true_tokens], pred_tokens)
+
+    metrics: dict[str, float] = {
+        "f1_score": f1_score,
+        "precision": precision_score,
+        "recall": recall_score,
+        "edit_distance": edit_dist,
+        "meteor": meteor,
+    }
+    return metrics
+
+
+def evaluate_page(bleu_eval, true_md: str, pred_md: str) -> dict[str, float]:
+    r"""Return the nltk scores of the markdown pair together with its "bleu" score"""
+    page_scores = compute_nltk_scores(true_md, pred_md)
+    bleu = compute_bleu_score(bleu_eval, true_md, pred_md)
+    return {**page_scores, "bleu": bleu}
+
+
 class PageMarkdownEvaluation(UnitEvaluation):
     doc_id: str
 
@@ -62,6 +108,7 @@ class MarkdownTextEvaluator(BaseEvaluator):
         self,
         intermediate_evaluations_path: Optional[Path] = None,
         prediction_sources: List[PredictionFormats] = [],
+        concurrency: int = 4,
     ):
         r""" """
         supported_prediction_formats: List[PredictionFormats] = [
@@ -74,6 +121,7 @@ class MarkdownTextEvaluator(BaseEvaluator):
             intermediate_evaluations_path=intermediate_evaluations_path,
             prediction_sources=prediction_sources,
             supported_prediction_formats=supported_prediction_formats,
+            concurrency=concurrency,
         )
 
         self._bleu_eval = evaluate.load("bleu")
@@ -146,67 +194,80 @@ class MarkdownTextEvaluator(BaseEvaluator):
             "meteor": [],
         }
 
-        for i, data in tqdm(
-            enumerate(ds_selection),
-            desc="Markdown text evaluations",
-            ncols=120,
-            total=len(ds_selection),
-        ):
-            data_record = DatasetRecordWithPrediction.model_validate(data)
-            doc_id = data_record.doc_id
-            true_doc = data_record.ground_truth_doc
-            true_md = self._docling_document_to_md(true_doc)
+        with ProcessPoolExecutor(max_workers=self._concurrency) as executor:
+            futures: dict[Future, tuple[str, str, str]] = {}
 
-            # Get the predicted markdown from the external predictions path
-            if external_predictions_path is not None:
-                pred_doc = external_docling_doc_loader(data_record)
-                if pred_doc is None:
-                    _log.error("No external prediction found for doc_id=%s", doc_id)
+            # Submit the evaluation tasks, keeping the inputs of each future
+            _log.info("Submitting the documents for evaluation...")
+            for data in ds_selection:
+                data_record = DatasetRecordWithPrediction.model_validate(data)
+                doc_id = data_record.doc_id
+                true_doc = data_record.ground_truth_doc
+                true_md = self._docling_document_to_md(true_doc)
+
+                # Get the predicted markdown from the external predictions path
+                if external_predictions_path is not None:
+                    pred_doc = external_docling_doc_loader(data_record)
+                    if pred_doc is None:
+                        _log.error("No external prediction found for doc_id=%s", doc_id)
+                        rejected_samples[
+                            EvaluationRejectionType.MISSING_PREDICTION
+                        ] += 1
+                        continue
+                    pred_md = self._docling_document_to_md(pred_doc)
+                else:
+                    if data_record.status not in self._accepted_status:
+                        _log.error(
+                            "Skipping record without successfull conversion status: %s",
+                            doc_id,
+                        )
+                        rejected_samples[
+                            EvaluationRejectionType.INVALID_CONVERSION_STATUS
+                        ] += 1
+                        continue
+                    pred_md = self._get_pred_md(data_record)  # type: ignore
+
+                if pred_md is None:
+                    _log.error("There is no markdown prediction for doc_id=%s", doc_id)
                     rejected_samples[EvaluationRejectionType.MISSING_PREDICTION] += 1
                     continue
-                pred_md = self._docling_document_to_md(pred_doc)
-            else:
-                if data_record.status not in self._accepted_status:
-                    _log.error(
-                        "Skipping record without successfull conversion status: %s",
-                        doc_id,
+
+                if true_md != "" and pred_md != "":
+                    future = executor.submit(
+                        evaluate_page, self._bleu_eval, true_md, pred_md
                     )
-                    rejected_samples[
-                        EvaluationRejectionType.INVALID_CONVERSION_STATUS
-                    ] += 1
-                    continue
-                pred_md = self._get_pred_md(data_record)  # type: ignore
+                    futures[future] = (doc_id, true_md, pred_md)
 
-            if not pred_md:
-                _log.error("There is no markdown prediction for doc_id=%s", doc_id)
-                rejected_samples[EvaluationRejectionType.MISSING_PREDICTION] += 1
-                continue
+            # Collect the futures (completion order differs from submission order)
+            _log.info("Collecting the documents for evaluations...")
+            for i, future in tqdm(
+                enumerate(as_completed(futures)),
+                desc="Markdown text evaluations",
+                ncols=120,
+                total=len(futures),
+            ):
+                doc_id, true_md, pred_md = futures[future]
+                doc_metrics = future.result()
 
-            bleu = 0.0
-            if true_md != "" and pred_md != "":
-                bleu = self._compute_bleu_score(true_md, pred_md)
-                ntlk_scores = self._compute_nltk_scores(true_md, pred_md)
+                # Collect metrics across pages
+                for score_name, score in doc_metrics.items():
+                    ds_metrics[score_name].append(score)
 
-            # Collect metrics across pages
-            ds_metrics["bleu"].append(bleu)
-            for score_name, score in ntlk_scores.items():
-                ds_metrics[score_name].append(score)
+                md_evaluation = PageMarkdownEvaluation(
+                    doc_id=doc_id,
+                    true_md=true_md,
+                    pred_md=pred_md,
+                    bleu=doc_metrics["bleu"],
+                    f1_score=doc_metrics["f1_score"],
+                    precision=doc_metrics["precision"],
+                    recall=doc_metrics["recall"],
+                    edit_distance=doc_metrics["edit_distance"],
+                    meteor=doc_metrics["meteor"],
+                )
+                evaluations.append(md_evaluation)
 
-            md_evaluation = PageMarkdownEvaluation(
-                doc_id=doc_id,
-                true_md=true_md,
-                pred_md=pred_md,
-                bleu=bleu,
-                f1_score=ntlk_scores["f1_score"],
-                precision=ntlk_scores["precision"],
-                recall=ntlk_scores["recall"],
-                edit_distance=ntlk_scores["edit_distance"],
-                meteor=ntlk_scores["meteor"],
-            )
-            evaluations.append(md_evaluation)
-
-            if self._intermediate_evaluations_path:
-                self.save_intermediate_evaluations("MD", i, doc_id, evaluations)
+                if self._intermediate_evaluations_path:
+                    self.save_intermediate_evaluations("MD", i, doc_id, evaluations)
 
         ds_md_evalutions = DatasetMarkdownEvaluation(
             evaluated_samples=len(evaluations),
@@ -221,44 +282,6 @@ class MarkdownTextEvaluator(BaseEvaluator):
         )
         return ds_md_evalutions
 
-    def _compute_bleu_score(self, true_txt: str, pred_txt: str) -> float:
-        r"""
-        Compute BLEU score with the HF evaluate and the default Tokenizer_13
-        """
-        result = self._bleu_eval.compute(
-            predictions=[pred_txt], references=[[true_txt]]
-        )
-        bleu = result["bleu"]
-        return bleu
-
-    def _compute_nltk_scores(self, true_txt: str, pred_txt: str) -> dict[str, float]:
-        r"""
-        Returns:
-        --------
-        dict with keys: ["f_measure", "precision", "recall", "edit_dist"]
-        """
-        true_tokens = word_tokenize(true_txt)
-        true_tokens_set = set(true_tokens)
-        pred_tokens = word_tokenize(pred_txt)
-        pred_tokens_set = set(pred_tokens)
-
-        f1_score = f_measure(true_tokens_set, pred_tokens_set)
-        precision_score = precision(true_tokens_set, pred_tokens_set)
-        recall_score = recall(true_tokens_set, pred_tokens_set)
-        edit_dist = edit_distance(pred_tokens, true_tokens) / max(
-            len(pred_tokens), len(true_tokens)
-        )
-        meteor = meteor_score.meteor_score([true_tokens], pred_tokens)
-
-        metrics: dict[str, float] = {
-            "f1_score": f1_score,
-            "precision": precision_score,
-            "recall": recall_score,
-            "edit_distance": edit_dist,
-            "meteor": meteor,
-        }
-        return metrics
-
     def _docling_document_to_md(self, doc: DoclingDocument) -> str:
         r"""
         Export DoclingDocument to markdown
diff --git a/docling_eval/evaluators/pixel/pixel_types.py b/docling_eval/evaluators/pixel/pixel_types.py
index dec19ec..b106531 100644
--- a/docling_eval/evaluators/pixel/pixel_types.py
+++ b/docling_eval/evaluators/pixel/pixel_types.py
@@ -3,7 +3,10 @@ from typing import Any, Dict, Optional
 import numpy as np
 from pydantic import BaseModel, model_serializer, model_validator
 
-from docling_eval.evaluators.base_evaluator import EvaluationRejectionType
+from docling_eval.evaluators.base_evaluator import (
+    DatasetEvaluation,
+    EvaluationRejectionType,
+)
 from docling_eval.evaluators.stats import DatasetStatistics
 
 
@@ -73,11 +76,10 @@ class PagePixelLayoutEvaluation(BaseModel):
     matrix_evaluation: MultiLabelMatrixEvaluation
 
 
-class DatasetPixelLayoutEvaluation(BaseModel):
+class DatasetPixelLayoutEvaluation(DatasetEvaluation):
     layout_model_name: Optional[str]
     num_pages: int
     num_pixels: int
-    rejected_samples: Dict[EvaluationRejectionType, int]
     matrix_evaluation: MultiLabelMatrixEvaluation
     page_evaluations: Dict[str, PagePixelLayoutEvaluation]
 
diff --git a/docling_eval/evaluators/pixel_layout_evaluator.py b/docling_eval/evaluators/pixel_layout_evaluator.py
index 1913f1b..0de8a56 100644
--- a/docling_eval/evaluators/pixel_layout_evaluator.py
+++ b/docling_eval/evaluators/pixel_layout_evaluator.py
@@ -3,6 +3,7 @@ import json
 import logging
 import math
 from collections import defaultdict
+from concurrent.futures import Executor, Future, ProcessPoolExecutor, as_completed
 from enum import Enum
 from pathlib import Path
 from typing import Dict, List, Optional, Tuple, Union
@@ -53,6 +54,51 @@ def category_name_to_docitemlabel(category_name: str) -> DocItemLabel:
     return label
 
 
+def evaluate_page(
+    mlcm: MultiLabelConfusionMatrix,
+    doc_id: str,
+    page_no: int,
+    pg_width: int,
+    pg_height: int,
+    matrix_id_to_name: dict[int, str],
+    gt_resolutions: list[LayoutResolution],
+    pred_resolutions: Optional[list[LayoutResolution]] = None,
+) -> tuple[str, int, int, MultiLabelMatrixEvaluation]:
+    r"""
+    Compute the confusion matrix and the metrics for one page.
+    If pred_resolutions is None, assume an all-background prediction (all-ones array)
+
+    Return
+    ------
+    doc_id: The doc_id argument, passed through to identify the result
+    page_no: The page_no argument, passed through to identify the result
+    page_pixels: pg_width * pg_height
+    page_metrics: Metrics computed by mlcm from the page confusion matrix
+    """
+    # Make binary representations
+    gt_binary = mlcm.make_binary_representation(pg_width, pg_height, gt_resolutions)
+    if pred_resolutions is not None:
+        preds_binary = mlcm.make_binary_representation(
+            pg_width, pg_height, pred_resolutions
+        )
+    else:
+        preds_binary = np.ones((pg_height, pg_width), dtype=np.uint64)
+
+    # Compute confusion matrix
+    matrix_categories_ids: List[int] = list(matrix_id_to_name.keys())
+    confusion_matrix = mlcm.generate_confusion_matrix(
+        gt_binary, preds_binary, matrix_categories_ids
+    )
+
+    # Compute metrics
+    page_metrics: MultiLabelMatrixEvaluation = mlcm.compute_metrics(
+        confusion_matrix, matrix_id_to_name
+    )
+    page_pixels = pg_width * pg_height
+
+    return doc_id, page_no, page_pixels, page_metrics
+
+
 class PixelLayoutEvaluator(BaseEvaluator):
     r"""
     Evaluate the document layout by computing a pixel-level confusion matrix and derivative matrices
@@ -65,6 +111,7 @@ class PixelLayoutEvaluator(BaseEvaluator):
         intermediate_evaluations_path: Optional[Path] = None,
         prediction_sources: List[PredictionFormats] = [],
         missing_prediction_strategy: MissingPredictionStrategy = MissingPredictionStrategy.PENALIZE,
+        concurrency: int = 4,
     ):
         r"""
 
@@ -82,6 +129,7 @@ class PixelLayoutEvaluator(BaseEvaluator):
         if not prediction_sources:
             prediction_sources = supported_prediction_formats
         super().__init__(
+            concurrency=concurrency,
             intermediate_evaluations_path=intermediate_evaluations_path,
             prediction_sources=prediction_sources,
             supported_prediction_formats=supported_prediction_formats,
@@ -103,6 +151,23 @@ class PixelLayoutEvaluator(BaseEvaluator):
             self._build_matrix_categories(label_mapping)
         )
 
+    @staticmethod
+    def evaluation_filenames(
+        benchmark: BenchMarkNames, save_root: Path
+    ) -> dict[str, Path]:
+        r"""
+        Return the "json" and "excel" pixel layout evaluation paths under save_root
+        """
+        modality: str = EvaluationModality.LAYOUT.value
+        json_fn = save_root / f"evaluation_{benchmark.value}_pixel_{modality}.json"
+        excel_fn = save_root / f"evaluation_{benchmark.value}_pixel_{modality}.xlsx"
+
+        eval_filenames: dict[str, Path] = {
+            "json": json_fn,
+            "excel": excel_fn,
+        }
+        return eval_filenames
+
     def _build_matrix_categories(
         self,
         label_mapping: Optional[Dict[DocItemLabel, Optional[DocItemLabel]]] = None,
@@ -193,6 +258,7 @@ class PixelLayoutEvaluator(BaseEvaluator):
         ds_selection: Dataset = ds[split]
 
         # Results containers
+        evaluated_samples = 0
         rejected_samples: Dict[EvaluationRejectionType, int] = {
             EvaluationRejectionType.INVALID_CONVERSION_STATUS: 0,
             EvaluationRejectionType.MISSING_PREDICTION: 0,
@@ -213,95 +279,101 @@ class PixelLayoutEvaluator(BaseEvaluator):
             []
         )  # Gather f1 score/image when evaluated on collapsed classes
 
-        for i, data in tqdm(
-            enumerate(ds_selection),
-            desc="Multi-label Matrix Layout evaluations",
-            ncols=120,
-            total=len(ds_selection),
-        ):
-            data_record = DatasetRecordWithPrediction.model_validate(data)
+        with ProcessPoolExecutor(max_workers=self._concurrency) as executor:
+            futures: list[Future] = []
 
-            # Try to extract the layout model name
-            if not self._layout_model_name:
-                self._layout_model_name = dict_get(
-                    data_record.predictor_info,
-                    [
-                        "options",
-                        "pdf",
-                        "pipeline_options",
-                        "layout_options",
-                        "model_spec",
-                        "name",
-                    ],
-                )
+            # Submit pages for execution
+            _log.info("Submitting the documents for evaluation...")
+            for data in ds_selection:
+                data_record = DatasetRecordWithPrediction.model_validate(data)
 
-            doc_id: str = data_record.doc_id
-            if (
-                ext_docdoc_loader is None
-                and data_record.status not in self._accepted_status
-            ):
-                _log.error(
-                    "Skipping record without successfull conversion status: %s", doc_id
-                )
-                rejected_samples[EvaluationRejectionType.INVALID_CONVERSION_STATUS] += 1
-                continue
+                # Try to extract the layout model name
+                if not self._layout_model_name:
+                    self._layout_model_name = dict_get(
+                        data_record.predictor_info,
+                        [
+                            "options",
+                            "pdf",
+                            "pipeline_options",
+                            "layout_options",
+                            "model_spec",
+                            "name",
+                        ],
+                    )
 
-            true_doc = data_record.ground_truth_doc
-            pred_doc = self._get_pred_doc(data_record, ext_docdoc_loader)
-            if not pred_doc:
-                _log.error("There is no prediction for doc_id=%s", doc_id)
-                rejected_samples[EvaluationRejectionType.MISSING_PREDICTION] += 1
-                continue
+                doc_id: str = data_record.doc_id
+                if (
+                    ext_docdoc_loader is None
+                    and data_record.status not in self._accepted_status
+                ):
+                    _log.error(
+                        "Skipping record without successfull conversion status: %s",
+                        doc_id,
+                    )
+                    rejected_samples[
+                        EvaluationRejectionType.INVALID_CONVERSION_STATUS
+                    ] += 1
+                    continue
 
-            # Compute confusion matrices
-            pages_confusion_matrices: Dict[int, np.ndarray]
-            pages_pixels: Dict[int, int]
-            pages_confusion_matrices, doc_num_pixels, pages_pixels = (
-                self._compute_document_confusion_matrix(true_doc, pred_doc)
-            )
+                true_doc = data_record.ground_truth_doc
+                pred_doc = self._get_pred_doc(data_record, ext_docdoc_loader)
+                if not pred_doc:
+                    _log.error("There is no prediction for doc_id=%s", doc_id)
+                    rejected_samples[EvaluationRejectionType.MISSING_PREDICTION] += 1
+                    continue
 
-            # Compute metrics per page
-            for page_no, page_confusion_matrix in pages_confusion_matrices.items():
-                # Contribute to the dataset's confusion matrix
-                ds_confusion_matrix += page_confusion_matrix
-
-                # Compute page metrics
-                page_matrix_evaluation: MultiLabelMatrixEvaluation = (
-                    self._mlcm.compute_metrics(
-                        page_confusion_matrix,
-                        self._matrix_id_to_name,
+                evaluated_samples += 1
+                futures.extend(
+                    self._submit_document_evaluation(
+                        executor, doc_id, true_doc, pred_doc
                     )
                 )
+
+            # Collect the futures
+            _log.info("Collecting the documents for evaluations...")
+            for future in tqdm(
+                as_completed(futures),
+                desc="Multi-label Matrix Layout evaluations",
+                ncols=120,
+                total=len(futures),
+            ):
+                page_metrics: MultiLabelMatrixEvaluation
+                doc_id, page_no, page_pixels, page_metrics = future.result()
+
+                page_confusion_matrix: np.ndarray = (
+                    page_metrics.detailed.confusion_matrix
+                )
+                ds_num_pixels += page_pixels
+                ds_confusion_matrix += page_confusion_matrix
+                doc_page_id = f"{doc_id}-{page_no}"
                 page_evaluation = PagePixelLayoutEvaluation(
                     doc_id=doc_id,
                     page_no=page_no,
-                    num_pixels=pages_pixels[page_no],
-                    matrix_evaluation=page_matrix_evaluation,
+                    num_pixels=page_pixels,
+                    matrix_evaluation=page_metrics,
                 )
-                doc_page_id = f"{doc_id}-{page_no}"
                 all_pages_evaluations[doc_page_id] = page_evaluation
 
                 # Update f1 lists
                 pages_detailed_f1.append(
-                    page_matrix_evaluation.detailed.agg_metrics.classes_f1_mean
+                    page_metrics.detailed.agg_metrics.classes_f1_mean
                 )
                 pages_collapsed_f1.append(
-                    page_matrix_evaluation.collapsed.agg_metrics.classes_f1_mean
+                    page_metrics.collapsed.agg_metrics.classes_f1_mean
                 )
 
-            ds_num_pixels += doc_num_pixels
-
-        # Compute metrics for the dataset and each document
+        # Compute metrics for the dataset
         ds_matrix_evaluation: MultiLabelMatrixEvaluation = self._mlcm.compute_metrics(
             ds_confusion_matrix,
             self._matrix_id_to_name,
         )
 
         ds_evaluation = DatasetPixelLayoutEvaluation(
+            evaluated_samples=evaluated_samples,
+            rejected_samples=rejected_samples,
             layout_model_name=self._layout_model_name,
             num_pages=len(all_pages_evaluations),
             num_pixels=ds_num_pixels,
-            rejected_samples=rejected_samples,
             matrix_evaluation=ds_matrix_evaluation,
             page_evaluations=all_pages_evaluations,
             f1_all_classes_stats=compute_stats(pages_detailed_f1),
@@ -310,23 +382,6 @@ class PixelLayoutEvaluator(BaseEvaluator):
 
         return ds_evaluation
 
-    @staticmethod
-    def evaluation_filenames(
-        benchmark: BenchMarkNames, save_root: Path
-    ) -> dict[str, Path]:
-        r"""
-        Generate the expected filenames for the produced evaluation files
-        """
-        modality: str = EvaluationModality.LAYOUT.value
-        json_fn = save_root / f"evaluation_{benchmark.value}_pixel_{modality}.json"
-        excel_fn = save_root / f"evaluation_{benchmark.value}_pixel_{modality}.xlsx"
-
-        eval_filenames: dict[str, Path] = {
-            "json": json_fn,
-            "excel": excel_fn,
-        }
-        return eval_filenames
-
     def save_evaluations(
         self,
         benchmark: BenchMarkNames,
@@ -393,19 +448,17 @@ class PixelLayoutEvaluator(BaseEvaluator):
             excel_fn,
         )
 
-    def _compute_document_confusion_matrix(
+    def _submit_document_evaluation(
         self,
+        executor: Executor,
+        doc_id: str,
         true_doc: DoclingDocument,
         pred_doc: DoclingDocument,
-    ) -> Tuple[
-        Dict[int, np.ndarray],  # page_no -> page confusion matrix
-        int,  # document num_pixels
-        Dict[int, int],  # page_no -> page num_pixels
-    ]:
+    ) -> list[Future]:
         r"""
-        Compute the confusion matrix for the given documents.
-        This is the sum of the confusion matrices of the document pages.
+        Submit the document for evaluation and return a future for each page
         """
+        futures: list[Future] = []
 
         # Collect all DocItems by page for both GT and predictions
         true_pages_to_objects = self._collect_items_by_page(true_doc)
@@ -416,13 +469,6 @@ class PixelLayoutEvaluator(BaseEvaluator):
         pred_pages = set(pred_pages_to_objects.keys())
         _log.debug(f"GT pages: {sorted(gt_pages)}, Pred pages: {sorted(pred_pages)}")
 
-        matrix_categories_ids: List[int] = list(self._matrix_id_to_name.keys())
-        page_confusion_matrices: Dict[int, np.ndarray] = (
-            {}
-        )  # page_no -> page confusion_matrix
-        all_pages_pixels: Dict[int, int] = {}  # page_no -> page num_pixels
-        doc_pixels = 0
-
         for page_no in sorted(gt_pages):
             page_size = true_doc.pages[page_no].size
             pg_width = math.ceil(page_size.width)
@@ -444,41 +490,39 @@ class PixelLayoutEvaluator(BaseEvaluator):
                     doc=pred_doc,
                 )
 
-                # TODO: Parallelize the confusion matrix over the pages
-                # Compute the confusion matrix
-                gt_binary = self._mlcm.make_binary_representation(
-                    pg_width, pg_height, gt_layouts
+                # Submit the page for computation
+                futures.append(
+                    executor.submit(
+                        evaluate_page,
+                        self._mlcm,
+                        doc_id,
+                        page_no,
+                        pg_width,
+                        pg_height,
+                        self._matrix_id_to_name,
+                        gt_layouts,
+                        pred_layouts,
+                    )
                 )
-                preds_binary = self._mlcm.make_binary_representation(
-                    pg_width, pg_height, pred_layouts
-                )
-                page_confusion_matrix = self._mlcm.generate_confusion_matrix(
-                    gt_binary, preds_binary, matrix_categories_ids
-                )
-                page_pixels = pg_width * pg_height
-                doc_pixels += page_pixels
-                all_pages_pixels[page_no] = page_pixels
-                page_confusion_matrices[page_no] = page_confusion_matrix
             else:
                 # No prediction data for this page
                 if (
                     self._missing_prediction_strategy
                     == MissingPredictionStrategy.PENALIZE
                 ):
-                    gt_binary = self._mlcm.make_binary_representation(
-                        pg_width, pg_height, gt_layouts
+                    # Submit the page for computation
+                    futures.append(
+                        executor.submit(
+                            evaluate_page,
+                            self._mlcm,
+                            doc_id,
+                            page_no,
+                            pg_width,
+                            pg_height,
+                            self._matrix_id_to_name,
+                            gt_layouts,
+                        )
                     )
-
-                    # Make an all-one binary representation for the prediction and evaluate as usual
-                    preds_binary = np.ones((pg_height, pg_width), dtype=np.uint64)
-                    page_confusion_matrix = self._mlcm.generate_confusion_matrix(
-                        gt_binary, preds_binary, matrix_categories_ids
-                    )
-
-                    page_pixels = pg_width * pg_height
-                    doc_pixels += page_pixels
-                    all_pages_pixels[page_no] = page_pixels
-                    page_confusion_matrices[page_no] = page_confusion_matrix
                 elif (
                     self._missing_prediction_strategy
                     == MissingPredictionStrategy.IGNORE
@@ -489,7 +533,8 @@ class PixelLayoutEvaluator(BaseEvaluator):
                     raise ValueError(
                         f"Unknown missing prediction strategy: {self._missing_prediction_strategy}"
                     )
-        return page_confusion_matrices, doc_pixels, all_pages_pixels
+
+        return futures
 
     def _get_page_layout_resolution(
         self,
diff --git a/docling_eval/evaluators/stats.py b/docling_eval/evaluators/stats.py
index 218bbfb..a4e3813 100644
--- a/docling_eval/evaluators/stats.py
+++ b/docling_eval/evaluators/stats.py
@@ -8,6 +8,8 @@ import matplotlib.pyplot as plt
 import numpy as np
 from pydantic import BaseModel, model_validator
 
+_log = logging.getLogger(__name__)
+
 
 class DatasetStatistics(BaseModel):
     total: int
@@ -82,7 +84,9 @@ def compute_stats(
     mean: float = statistics.mean(values) if len(values) > 0 else -1
     median: float = statistics.median(values) if len(values) > 0 else -1
     std: float = statistics.stdev(values) if len(values) > 1 else 0.0
-    logging.info(f"total: {total}, mean: {mean}, median: {median}, std: {std}")
+    _log.debug(
+        f"Compute statistics: total: {total}, mean: {mean}, median: {median}, std: {std}"
+    )
 
     max_value = 1.0
     if not max_value_is_one and len(values) > 0:
@@ -90,7 +94,7 @@ def compute_stats(
 
     # Compute the histogram
     hist, bins = np.histogram(values, bins=nr_bins, range=(0, max_value))
-    logging.info(f"#-hist: {len(hist)}, #-bins: {len(bins)}")
+    _log.debug(f"Compute statistics: hist: {len(hist)}, #-bins: {len(bins)}")
 
     return DatasetStatistics(
         total=total, mean=mean, median=median, std=std, hist=hist, bins=bins
diff --git a/docs/examples/evaluate_dpbench_on_external_predictions.sh b/docs/examples/evaluate_dpbench_on_external_predictions.sh
index 6189e90..878c947 100755
--- a/docs/examples/evaluate_dpbench_on_external_predictions.sh
+++ b/docs/examples/evaluate_dpbench_on_external_predictions.sh
@@ -49,7 +49,7 @@ evaluate() {
 }
 
 
-visualize() {
+visualize_predictions() {
     local pred_dir save_dir modality
     pred_dir="$1"
     save_dir="$2"
@@ -71,10 +71,27 @@ visualize() {
         --output-dir "${save_dir}"
 }
 
+
+visualize_evaluations() {  # usage: visualize_evaluations <pred_dir> <eval_root>
+    local pred_dir eval_root modality
+    pred_dir="$1"   # forwarded to `docling-eval visualize --input-dir`
+    eval_root="$2"  # forwarded to `docling-eval visualize --output-dir`
+    # One visualize run per modality; MODALITIES is not set here and must be defined by the enclosing script.
+    for modality in "${MODALITIES[@]}"; do
+        echo "Visualize: modality: ${modality} for evaluations: ${eval_root}"
+        uv run docling-eval visualize \
+            --benchmark DPBench \
+            --modality "${modality}" \
+            --input-dir "${pred_dir}" \
+            --output-dir "${eval_root}"
+    done
+}
+
 ###########################################################################################
 # Main
 #
 
+#########################################
 # Predictions
 
 # json predictions
@@ -95,8 +112,13 @@ evaluate \
     scratch/DPBench/external_predictions_yaml
 
 
+#########################################
 # Visualisations
-visualize \
+visualize_predictions \
     scratch/DPBench/predicted_documents/json \
     scratch/DPBench/external_predictions_visualisations
 
+visualize_evaluations \
+    scratch/DPBench/predicted_documents/doctag \
+    scratch/DPBench/external_predictions_doctags
+