feat: Improvements in user experience: Performance, error handling, logging (#189)

* feat: Extend evaluate_dpbench_on_external_predictions.sh to include visualisations of the evaluations

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>

* fix: Improve error checking in main.py:visualize()

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>

* fix: Improve logging

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>

* feat: Parallelize the computation of PixelLayoutEvaluator at the level of page

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>

* fix: Make DatasetPixelLayoutEvaluation a subclass of DatasetEvaluation

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>

* feat: Parallelize the MarkdownTextEvaluator

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>

* chore: Improve logging

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>

---------

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>
This commit is contained in:
Nikos Livathinos
2025-12-16 11:25:55 +01:00
committed by GitHub
parent bcc5200f74
commit a850784b4f
8 changed files with 343 additions and 235 deletions
+24 -11
View File
@@ -641,6 +641,7 @@ def evaluate(
split: str = "test",
cvat_overview_path: Optional[Path] = None,
external_predictions_path: Optional[Path] = None,
concurrency: int = 4,
) -> Optional[DatasetEvaluationType]:
"""Evaluate predictions against ground truth."""
if not os.path.exists(idir):
@@ -673,17 +674,16 @@ def evaluate(
# label_filtering_strategy=LabelFilteringStrategy.INTERSECTION,
page_mapping_path=cvat_overview_path,
)
evaluation = layout_evaluator( # type: ignore
layout_evaluation = layout_evaluator( # type: ignore
idir,
split=split,
external_predictions_path=external_predictions_path,
)
with open(save_fn, "w") as fd:
json.dump(evaluation.model_dump(), fd, indent=2, sort_keys=True)
json.dump(layout_evaluation.model_dump(), fd, indent=2, sort_keys=True)
# Evaluate with the pixel-wise layout evaluation
pixel_layout_evaluator = PixelLayoutEvaluator()
pixel_layout_evaluator = PixelLayoutEvaluator(concurrency=concurrency)
pixel_ds_evaluation: DatasetPixelLayoutEvaluation = pixel_layout_evaluator(
idir,
split=split,
@@ -696,6 +696,9 @@ def evaluate(
pixel_save_root,
)
# TODO: Redesign evaluate() to return multiple evaluation objects
evaluation = pixel_ds_evaluation # type: ignore
elif modality == EvaluationModality.TABLE_STRUCTURE:
table_evaluator = TableEvaluator()
evaluation = table_evaluator( # type: ignore
@@ -764,7 +767,7 @@ def evaluate(
)
elif modality == EvaluationModality.MARKDOWN_TEXT:
md_evaluator = MarkdownTextEvaluator()
md_evaluator = MarkdownTextEvaluator(concurrency=concurrency)
evaluation = md_evaluator( # type: ignore
idir,
split=split,
@@ -823,8 +826,8 @@ def evaluate(
def visualize(
modality: EvaluationModality,
benchmark: BenchMarkNames,
idir: Path,
odir: Path,
idir: Path | None = None,
split: str = "test",
):
"""
@@ -839,10 +842,6 @@ def visualize(
begin_index: Begin index
end_index: End index
"""
if not os.path.exists(idir):
_log.error(f"Input directory not found: {idir}")
return
os.makedirs(odir, exist_ok=True)
metrics_filename = odir / f"evaluation_{benchmark.value}_{modality.value}.json"
@@ -989,6 +988,11 @@ def visualize(
elif modality == EvaluationModality.READING_ORDER:
try:
# idir is required here
if idir is None or not idir.is_dir():
_log.error(f"Input directory not found: {idir}")
return
with open(metrics_filename, "r") as fd:
ro_evaluation = DatasetReadingOrderEvaluation.model_validate_json(
fd.read()
@@ -1080,6 +1084,11 @@ def visualize(
elif modality == EvaluationModality.OCR:
try:
# idir is required here
if idir is None or not idir.is_dir():
_log.error(f"Input directory not found: {idir}")
return
with open(metrics_filename, "r") as fd:
ocr_evaluation = OcrDatasetEvaluationResult.model_validate_json(
fd.read()
@@ -1511,6 +1520,9 @@ def evaluate_cmd(
help="Path to load existing DoclingDocument predictions. The filename must follow the pattern [doc_id].[json|dt|yaml|yml]",
),
] = None,
concurrency: Annotated[
int, typer.Option(help="Concurrency for the computation of each metric")
] = 4,
):
"""Evaluate predictions against ground truth."""
input_dir, output_dir = derive_input_output_dirs(
@@ -1531,6 +1543,7 @@ def evaluate_cmd(
odir=eval_output_dir,
split=split,
external_predictions_path=external_predictions_path,
concurrency=concurrency,
)
@@ -1573,8 +1586,8 @@ def visualize_cmd(
visualize(
modality=modality,
benchmark=benchmark,
idir=input_dir,
odir=eval_output_dir,
idir=input_dir,
split=split,
)
@@ -75,12 +75,14 @@ class BaseEvaluator(Generic[UnitEvaluationType, DatasetEvaluationType]):
supported_prediction_formats: List[PredictionFormats] = [
PredictionFormats.DOCLING_DOCUMENT
],
concurrency: int = 4,
):
r"""
Parameters
----------
intermediate_evaluations_path: When provided, the evaluation per example will be saved in a file
"""
self._concurrency = concurrency
self._intermediate_evaluations_path = intermediate_evaluations_path
# Validate the prediction_sources
+2 -5
View File
@@ -489,7 +489,7 @@ class LayoutEvaluator(BaseEvaluator):
weighted_map_90_values.append(average_iou_90)
weighted_map_95_values.append(average_iou_95)
_log.info(
_log.debug(
"doc: %s\tprecision: %.2f, recall: %.2f, f1: %.2f, map_50: %.2f, "
"precision_no_pics: %.2f, recall_no_pics: %.2f, f1_no_pics: %.2f",
doc_id_page,
@@ -528,7 +528,6 @@ class LayoutEvaluator(BaseEvaluator):
segmentation_precision_no_pictures=precision_no_pics,
segmentation_recall_no_pictures=recall_no_pics,
segmentation_f1_no_pictures=f1_no_pics,
# New per-sample element count metrics
true_element_count=true_element_count,
pred_element_count=pred_element_count,
true_table_count=true_table_count,
@@ -836,9 +835,7 @@ class LayoutEvaluator(BaseEvaluator):
true_labels: Dict[str, int] = {}
pred_labels: Dict[str, int] = {}
for i, data in tqdm(
enumerate(ds), desc="Layout evaluations", ncols=120, total=len(ds)
):
for i, data in enumerate(ds):
data_record = DatasetRecordWithPrediction.model_validate(data)
true_doc = data_record.ground_truth_doc
pred_doc = self._get_pred_doc(data_record, ext_docdoc_loader)
@@ -1,4 +1,5 @@
import logging
from concurrent.futures import Executor, Future, ProcessPoolExecutor, as_completed
from pathlib import Path
from typing import Any, Dict, List, Optional, Set
@@ -33,6 +34,51 @@ from docling_eval.utils.external_docling_document_loader import (
_log = logging.getLogger(__name__)
def compute_bleu_score(bleu_eval, true_txt: str, pred_txt: str) -> float:
    r"""
    Score a single predicted text against a single reference text with BLEU.

    Parameters
    ----------
    bleu_eval: Evaluator object exposing `compute(predictions=..., references=...)`
               whose result contains a "bleu" entry
    true_txt: The reference text
    pred_txt: The predicted text

    Returns
    -------
    The value stored under the "bleu" key of the evaluator result
    """
    # One prediction, paired with a one-element list of references
    return bleu_eval.compute(
        predictions=[pred_txt],
        references=[[true_txt]],
    )["bleu"]
def compute_nltk_scores(true_txt: str, pred_txt: str) -> dict[str, float]:
    r"""
    Compute token-level NLTK metrics between a reference and a predicted text.

    Both texts are tokenized with `word_tokenize`. The f-measure, precision and recall
    are computed on the sets of unique tokens; the edit distance and METEOR are computed
    on the token sequences.

    Parameters
    ----------
    true_txt: The reference text
    pred_txt: The predicted text

    Returns:
    --------
    dict with keys: ["f1_score", "precision", "recall", "edit_distance", "meteor"]
    """
    true_tokens = word_tokenize(true_txt)
    true_tokens_set = set(true_tokens)
    pred_tokens = word_tokenize(pred_txt)
    pred_tokens_set = set(pred_tokens)

    # NOTE(review): the set-based scores may be non-numeric when one of the token sets
    # is empty - confirm against nltk.metrics.scores
    f1_score = f_measure(true_tokens_set, pred_tokens_set)
    precision_score = precision(true_tokens_set, pred_tokens_set)
    recall_score = recall(true_tokens_set, pred_tokens_set)

    # Normalize the edit distance by the longer token sequence. When both sequences are
    # empty (e.g. whitespace-only texts) the distance is 0 and the division is skipped
    # to avoid a ZeroDivisionError.
    longest = max(len(pred_tokens), len(true_tokens))
    edit_dist = edit_distance(pred_tokens, true_tokens) / longest if longest else 0.0

    meteor = meteor_score.meteor_score([true_tokens], pred_tokens)

    metrics: dict[str, float] = {
        "f1_score": f1_score,
        "precision": precision_score,
        "recall": recall_score,
        "edit_distance": edit_dist,
        "meteor": meteor,
    }
    return metrics
def evaluate_page(bleu_eval, true_md: str, pred_md: str) -> dict[str, float]:
    r"""
    Compute all text metrics for one pair of markdown texts.

    The result holds the entries returned by `compute_nltk_scores` plus a "bleu" entry
    obtained from `compute_bleu_score` with the given evaluator.
    """
    # The NLTK scores are evaluated first, then BLEU is added under its own key
    return {
        **compute_nltk_scores(true_md, pred_md),
        "bleu": compute_bleu_score(bleu_eval, true_md, pred_md),
    }
class PageMarkdownEvaluation(UnitEvaluation):
doc_id: str
@@ -62,6 +108,7 @@ class MarkdownTextEvaluator(BaseEvaluator):
self,
intermediate_evaluations_path: Optional[Path] = None,
prediction_sources: List[PredictionFormats] = [],
concurrency: int = 4,
):
r""" """
supported_prediction_formats: List[PredictionFormats] = [
@@ -74,6 +121,7 @@ class MarkdownTextEvaluator(BaseEvaluator):
intermediate_evaluations_path=intermediate_evaluations_path,
prediction_sources=prediction_sources,
supported_prediction_formats=supported_prediction_formats,
concurrency=concurrency,
)
self._bleu_eval = evaluate.load("bleu")
@@ -146,67 +194,80 @@ class MarkdownTextEvaluator(BaseEvaluator):
"meteor": [],
}
for i, data in tqdm(
enumerate(ds_selection),
desc="Markdown text evaluations",
ncols=120,
total=len(ds_selection),
):
data_record = DatasetRecordWithPrediction.model_validate(data)
doc_id = data_record.doc_id
true_doc = data_record.ground_truth_doc
true_md = self._docling_document_to_md(true_doc)
with ProcessPoolExecutor(max_workers=self._concurrency) as executor:
futures: list[Future] = []
# Get the predicted markdown from the external predictions path
if external_predictions_path is not None:
pred_doc = external_docling_doc_loader(data_record)
if pred_doc is None:
_log.error("No external prediction found for doc_id=%s", doc_id)
# Submit the evaluation tasks
_log.info("Submitting the documents for evaluation...")
for data in ds_selection:
data_record = DatasetRecordWithPrediction.model_validate(data)
doc_id = data_record.doc_id
true_doc = data_record.ground_truth_doc
true_md = self._docling_document_to_md(true_doc)
# Get the predicted markdown from the external predictions path
if external_predictions_path is not None:
pred_doc = external_docling_doc_loader(data_record)
if pred_doc is None:
_log.error("No external prediction found for doc_id=%s", doc_id)
rejected_samples[
EvaluationRejectionType.MISSING_PREDICTION
] += 1
continue
pred_md = self._docling_document_to_md(pred_doc)
else:
if data_record.status not in self._accepted_status:
_log.error(
"Skipping record without successfull conversion status: %s",
doc_id,
)
rejected_samples[
EvaluationRejectionType.INVALID_CONVERSION_STATUS
] += 1
continue
pred_md = self._get_pred_md(data_record) # type: ignore
if pred_md is None:
_log.error("There is no markdown prediction for doc_id=%s", doc_id)
rejected_samples[EvaluationRejectionType.MISSING_PREDICTION] += 1
continue
pred_md = self._docling_document_to_md(pred_doc)
else:
if data_record.status not in self._accepted_status:
_log.error(
"Skipping record without successfull conversion status: %s",
doc_id,
if true_md != "" and pred_md != "":
futures.append(
executor.submit(
evaluate_page, self._bleu_eval, true_md, pred_md
)
)
rejected_samples[
EvaluationRejectionType.INVALID_CONVERSION_STATUS
] += 1
continue
pred_md = self._get_pred_md(data_record) # type: ignore
if not pred_md:
_log.error("There is no markdown prediction for doc_id=%s", doc_id)
rejected_samples[EvaluationRejectionType.MISSING_PREDICTION] += 1
continue
# Collect the futures
_log.info("Collecting the documents for evaluations...")
for i, future in tqdm(
enumerate(as_completed(futures)),
desc="Markdown text evaluations",
ncols=120,
total=len(futures),
):
doc_metrics = future.result()
bleu = 0.0
if true_md != "" and pred_md != "":
bleu = self._compute_bleu_score(true_md, pred_md)
ntlk_scores = self._compute_nltk_scores(true_md, pred_md)
# Collect metrics across pages
for score_name, score in doc_metrics.items():
ds_metrics[score_name].append(score)
# Collect metrics across pages
ds_metrics["bleu"].append(bleu)
for score_name, score in ntlk_scores.items():
ds_metrics[score_name].append(score)
md_evaluation = PageMarkdownEvaluation(
doc_id=doc_id,
true_md=true_md,
pred_md=pred_md,
bleu=doc_metrics["bleu"],
f1_score=doc_metrics["f1_score"],
precision=doc_metrics["precision"],
recall=doc_metrics["recall"],
edit_distance=doc_metrics["edit_distance"],
meteor=doc_metrics["meteor"],
)
evaluations.append(md_evaluation)
md_evaluation = PageMarkdownEvaluation(
doc_id=doc_id,
true_md=true_md,
pred_md=pred_md,
bleu=bleu,
f1_score=ntlk_scores["f1_score"],
precision=ntlk_scores["precision"],
recall=ntlk_scores["recall"],
edit_distance=ntlk_scores["edit_distance"],
meteor=ntlk_scores["meteor"],
)
evaluations.append(md_evaluation)
if self._intermediate_evaluations_path:
self.save_intermediate_evaluations("MD", i, doc_id, evaluations)
if self._intermediate_evaluations_path:
self.save_intermediate_evaluations("MD", i, doc_id, evaluations)
ds_md_evalutions = DatasetMarkdownEvaluation(
evaluated_samples=len(evaluations),
@@ -221,44 +282,6 @@ class MarkdownTextEvaluator(BaseEvaluator):
)
return ds_md_evalutions
def _compute_bleu_score(self, true_txt: str, pred_txt: str) -> float:
r"""
Compute BLEU score with the HF evaluate and the default Tokenizer_13
"""
result = self._bleu_eval.compute(
predictions=[pred_txt], references=[[true_txt]]
)
bleu = result["bleu"]
return bleu
def _compute_nltk_scores(self, true_txt: str, pred_txt: str) -> dict[str, float]:
r"""
Returns:
--------
dict with keys: ["f_measure", "precision", "recall", "edit_dist"]
"""
true_tokens = word_tokenize(true_txt)
true_tokens_set = set(true_tokens)
pred_tokens = word_tokenize(pred_txt)
pred_tokens_set = set(pred_tokens)
f1_score = f_measure(true_tokens_set, pred_tokens_set)
precision_score = precision(true_tokens_set, pred_tokens_set)
recall_score = recall(true_tokens_set, pred_tokens_set)
edit_dist = edit_distance(pred_tokens, true_tokens) / max(
len(pred_tokens), len(true_tokens)
)
meteor = meteor_score.meteor_score([true_tokens], pred_tokens)
metrics: dict[str, float] = {
"f1_score": f1_score,
"precision": precision_score,
"recall": recall_score,
"edit_distance": edit_dist,
"meteor": meteor,
}
return metrics
def _docling_document_to_md(self, doc: DoclingDocument) -> str:
r"""
Export DoclingDocument to markdown
+5 -3
View File
@@ -3,7 +3,10 @@ from typing import Any, Dict, Optional
import numpy as np
from pydantic import BaseModel, model_serializer, model_validator
from docling_eval.evaluators.base_evaluator import EvaluationRejectionType
from docling_eval.evaluators.base_evaluator import (
DatasetEvaluation,
EvaluationRejectionType,
)
from docling_eval.evaluators.stats import DatasetStatistics
@@ -73,11 +76,10 @@ class PagePixelLayoutEvaluation(BaseModel):
matrix_evaluation: MultiLabelMatrixEvaluation
class DatasetPixelLayoutEvaluation(BaseModel):
class DatasetPixelLayoutEvaluation(DatasetEvaluation):
layout_model_name: Optional[str]
num_pages: int
num_pixels: int
rejected_samples: Dict[EvaluationRejectionType, int]
matrix_evaluation: MultiLabelMatrixEvaluation
page_evaluations: Dict[str, PagePixelLayoutEvaluation]
+166 -121
View File
@@ -3,6 +3,7 @@ import json
import logging
import math
from collections import defaultdict
from concurrent.futures import Executor, Future, ProcessPoolExecutor, as_completed
from enum import Enum
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Union
@@ -53,6 +54,51 @@ def category_name_to_docitemlabel(category_name: str) -> DocItemLabel:
return label
def evaluate_page(
    mlcm: MultiLabelConfusionMatrix,
    doc_id: str,
    page_no: int,
    pg_width: int,
    pg_height: int,
    matrix_id_to_name: dict[int, str],
    gt_resolutions: list[LayoutResolution],
    pred_resolutions: Optional[list[LayoutResolution]] = None,
) -> tuple[str, int, int, MultiLabelMatrixEvaluation]:
    r"""
    Evaluate a single page: build the confusion matrix and derive its metrics.

    When pred_resolutions is None, the prediction is replaced by an all-ones matrix of
    shape (pg_height, pg_width), i.e. the page is treated as an all-background prediction.

    Return
    ------
    doc_id: The document id, passed through unchanged
    page_no: The page number, passed through unchanged
    page_pixels: pg_width * pg_height
    page_metrics: The metrics computed by mlcm on the page confusion matrix
    """
    # Binary representation of the ground truth
    ground_truth_mask = mlcm.make_binary_representation(
        pg_width, pg_height, gt_resolutions
    )

    # Binary representation of the prediction, or the all-ones fallback
    if pred_resolutions is None:
        predicted_mask = np.ones((pg_height, pg_width), dtype=np.uint64)
    else:
        predicted_mask = mlcm.make_binary_representation(
            pg_width, pg_height, pred_resolutions
        )

    # Confusion matrix over the ids of the matrix categories
    category_ids: List[int] = list(matrix_id_to_name)
    page_confusion_matrix = mlcm.generate_confusion_matrix(
        ground_truth_mask, predicted_mask, category_ids
    )

    # Metrics derived from the page confusion matrix
    metrics: MultiLabelMatrixEvaluation = mlcm.compute_metrics(
        page_confusion_matrix, matrix_id_to_name
    )

    return doc_id, page_no, pg_width * pg_height, metrics
class PixelLayoutEvaluator(BaseEvaluator):
r"""
Evaluate the document layout by computing a pixel-level confusion matrix and derivative matrices
@@ -65,6 +111,7 @@ class PixelLayoutEvaluator(BaseEvaluator):
intermediate_evaluations_path: Optional[Path] = None,
prediction_sources: List[PredictionFormats] = [],
missing_prediction_strategy: MissingPredictionStrategy = MissingPredictionStrategy.PENALIZE,
concurrency: int = 4,
):
r"""
@@ -82,6 +129,7 @@ class PixelLayoutEvaluator(BaseEvaluator):
if not prediction_sources:
prediction_sources = supported_prediction_formats
super().__init__(
concurrency=concurrency,
intermediate_evaluations_path=intermediate_evaluations_path,
prediction_sources=prediction_sources,
supported_prediction_formats=supported_prediction_formats,
@@ -103,6 +151,23 @@ class PixelLayoutEvaluator(BaseEvaluator):
self._build_matrix_categories(label_mapping)
)
@staticmethod
def evaluation_filenames(
    benchmark: BenchMarkNames, save_root: Path
) -> dict[str, Path]:
    r"""
    Build the paths of the evaluation files for the given benchmark.

    Both files are placed directly under save_root and share the stem
    "evaluation_<benchmark>_pixel_<layout modality>".

    Returns
    -------
    dict with keys: ["json", "excel"] pointing to the .json and .xlsx files
    """
    stem = f"evaluation_{benchmark.value}_pixel_{EvaluationModality.LAYOUT.value}"
    return {
        "json": save_root / f"{stem}.json",
        "excel": save_root / f"{stem}.xlsx",
    }
def _build_matrix_categories(
self,
label_mapping: Optional[Dict[DocItemLabel, Optional[DocItemLabel]]] = None,
@@ -193,6 +258,7 @@ class PixelLayoutEvaluator(BaseEvaluator):
ds_selection: Dataset = ds[split]
# Results containers
evaluated_samples = 0
rejected_samples: Dict[EvaluationRejectionType, int] = {
EvaluationRejectionType.INVALID_CONVERSION_STATUS: 0,
EvaluationRejectionType.MISSING_PREDICTION: 0,
@@ -213,95 +279,101 @@ class PixelLayoutEvaluator(BaseEvaluator):
[]
) # Gather f1 score/image when evaluated on collapsed classes
for i, data in tqdm(
enumerate(ds_selection),
desc="Multi-label Matrix Layout evaluations",
ncols=120,
total=len(ds_selection),
):
data_record = DatasetRecordWithPrediction.model_validate(data)
with ProcessPoolExecutor(max_workers=self._concurrency) as executor:
futures: list[Future] = []
# Try to extract the layout model name
if not self._layout_model_name:
self._layout_model_name = dict_get(
data_record.predictor_info,
[
"options",
"pdf",
"pipeline_options",
"layout_options",
"model_spec",
"name",
],
)
# Submit pages for execution
_log.info("Submitting the documents for evaluation...")
for data in ds_selection:
data_record = DatasetRecordWithPrediction.model_validate(data)
doc_id: str = data_record.doc_id
if (
ext_docdoc_loader is None
and data_record.status not in self._accepted_status
):
_log.error(
"Skipping record without successfull conversion status: %s", doc_id
)
rejected_samples[EvaluationRejectionType.INVALID_CONVERSION_STATUS] += 1
continue
# Try to extract the layout model name
if not self._layout_model_name:
self._layout_model_name = dict_get(
data_record.predictor_info,
[
"options",
"pdf",
"pipeline_options",
"layout_options",
"model_spec",
"name",
],
)
true_doc = data_record.ground_truth_doc
pred_doc = self._get_pred_doc(data_record, ext_docdoc_loader)
if not pred_doc:
_log.error("There is no prediction for doc_id=%s", doc_id)
rejected_samples[EvaluationRejectionType.MISSING_PREDICTION] += 1
continue
doc_id: str = data_record.doc_id
if (
ext_docdoc_loader is None
and data_record.status not in self._accepted_status
):
_log.error(
"Skipping record without successfull conversion status: %s",
doc_id,
)
rejected_samples[
EvaluationRejectionType.INVALID_CONVERSION_STATUS
] += 1
continue
# Compute confusion matrices
pages_confusion_matrices: Dict[int, np.ndarray]
pages_pixels: Dict[int, int]
pages_confusion_matrices, doc_num_pixels, pages_pixels = (
self._compute_document_confusion_matrix(true_doc, pred_doc)
)
true_doc = data_record.ground_truth_doc
pred_doc = self._get_pred_doc(data_record, ext_docdoc_loader)
if not pred_doc:
_log.error("There is no prediction for doc_id=%s", doc_id)
rejected_samples[EvaluationRejectionType.MISSING_PREDICTION] += 1
continue
# Compute metrics per page
for page_no, page_confusion_matrix in pages_confusion_matrices.items():
# Contribute to the dataset's confusion matrix
ds_confusion_matrix += page_confusion_matrix
# Compute page metrics
page_matrix_evaluation: MultiLabelMatrixEvaluation = (
self._mlcm.compute_metrics(
page_confusion_matrix,
self._matrix_id_to_name,
evaluated_samples += 1
futures.extend(
self._submit_document_evaluation(
executor, doc_id, true_doc, pred_doc
)
)
# Collect the futures
_log.info("Collecting the documents for evaluations...")
for future in tqdm(
as_completed(futures),
desc="Multi-label Matrix Layout evaluations",
ncols=120,
total=len(futures),
):
page_metrics: MultiLabelMatrixEvaluation
doc_id, page_no, page_pixels, page_metrics = future.result()
page_confusion_matrix: np.ndarray = (
page_metrics.detailed.confusion_matrix
)
ds_num_pixels += page_pixels
ds_confusion_matrix += page_confusion_matrix
doc_page_id = f"{doc_id}-{page_no}"
page_evaluation = PagePixelLayoutEvaluation(
doc_id=doc_id,
page_no=page_no,
num_pixels=pages_pixels[page_no],
matrix_evaluation=page_matrix_evaluation,
num_pixels=page_pixels,
matrix_evaluation=page_metrics,
)
doc_page_id = f"{doc_id}-{page_no}"
all_pages_evaluations[doc_page_id] = page_evaluation
# Update f1 lists
pages_detailed_f1.append(
page_matrix_evaluation.detailed.agg_metrics.classes_f1_mean
page_metrics.detailed.agg_metrics.classes_f1_mean
)
pages_collapsed_f1.append(
page_matrix_evaluation.collapsed.agg_metrics.classes_f1_mean
page_metrics.collapsed.agg_metrics.classes_f1_mean
)
ds_num_pixels += doc_num_pixels
# Compute metrics for the dataset and each document
# Compute metrics for the dataset
ds_matrix_evaluation: MultiLabelMatrixEvaluation = self._mlcm.compute_metrics(
ds_confusion_matrix,
self._matrix_id_to_name,
)
ds_evaluation = DatasetPixelLayoutEvaluation(
evaluated_samples=evaluated_samples,
rejected_samples=rejected_samples,
layout_model_name=self._layout_model_name,
num_pages=len(all_pages_evaluations),
num_pixels=ds_num_pixels,
rejected_samples=rejected_samples,
matrix_evaluation=ds_matrix_evaluation,
page_evaluations=all_pages_evaluations,
f1_all_classes_stats=compute_stats(pages_detailed_f1),
@@ -310,23 +382,6 @@ class PixelLayoutEvaluator(BaseEvaluator):
return ds_evaluation
@staticmethod
def evaluation_filenames(
benchmark: BenchMarkNames, save_root: Path
) -> dict[str, Path]:
r"""
Generate the expected filenames for the produced evaluation files
"""
modality: str = EvaluationModality.LAYOUT.value
json_fn = save_root / f"evaluation_{benchmark.value}_pixel_{modality}.json"
excel_fn = save_root / f"evaluation_{benchmark.value}_pixel_{modality}.xlsx"
eval_filenames: dict[str, Path] = {
"json": json_fn,
"excel": excel_fn,
}
return eval_filenames
def save_evaluations(
self,
benchmark: BenchMarkNames,
@@ -393,19 +448,17 @@ class PixelLayoutEvaluator(BaseEvaluator):
excel_fn,
)
def _compute_document_confusion_matrix(
def _submit_document_evaluation(
self,
executor: Executor,
doc_id: str,
true_doc: DoclingDocument,
pred_doc: DoclingDocument,
) -> Tuple[
Dict[int, np.ndarray], # page_no -> page confusion matrix
int, # document num_pixels
Dict[int, int], # page_no -> page num_pixels
]:
) -> list[Future]:
r"""
Compute the confusion matrix for the given documents.
This is the sum of the confusion matrices of the document pages.
Submit the document for evaluation and return a future for each page
"""
futures: list[Future] = []
# Collect all DocItems by page for both GT and predictions
true_pages_to_objects = self._collect_items_by_page(true_doc)
@@ -416,13 +469,6 @@ class PixelLayoutEvaluator(BaseEvaluator):
pred_pages = set(pred_pages_to_objects.keys())
_log.debug(f"GT pages: {sorted(gt_pages)}, Pred pages: {sorted(pred_pages)}")
matrix_categories_ids: List[int] = list(self._matrix_id_to_name.keys())
page_confusion_matrices: Dict[int, np.ndarray] = (
{}
) # page_no -> page confusion_matrix
all_pages_pixels: Dict[int, int] = {} # page_no -> page num_pixels
doc_pixels = 0
for page_no in sorted(gt_pages):
page_size = true_doc.pages[page_no].size
pg_width = math.ceil(page_size.width)
@@ -444,41 +490,39 @@ class PixelLayoutEvaluator(BaseEvaluator):
doc=pred_doc,
)
# TODO: Parallelize the confusion matrix over the pages
# Compute the confusion matrix
gt_binary = self._mlcm.make_binary_representation(
pg_width, pg_height, gt_layouts
# Submit the page for computation
futures.append(
executor.submit(
evaluate_page,
self._mlcm,
doc_id,
page_no,
pg_width,
pg_height,
self._matrix_id_to_name,
gt_layouts,
pred_layouts,
)
)
preds_binary = self._mlcm.make_binary_representation(
pg_width, pg_height, pred_layouts
)
page_confusion_matrix = self._mlcm.generate_confusion_matrix(
gt_binary, preds_binary, matrix_categories_ids
)
page_pixels = pg_width * pg_height
doc_pixels += page_pixels
all_pages_pixels[page_no] = page_pixels
page_confusion_matrices[page_no] = page_confusion_matrix
else:
# No prediction data for this page
if (
self._missing_prediction_strategy
== MissingPredictionStrategy.PENALIZE
):
gt_binary = self._mlcm.make_binary_representation(
pg_width, pg_height, gt_layouts
# Submit the page for computation
futures.append(
executor.submit(
evaluate_page,
self._mlcm,
doc_id,
page_no,
pg_width,
pg_height,
self._matrix_id_to_name,
gt_layouts,
)
)
# Make an all-one binary representation for the prediction and evaluate as usual
preds_binary = np.ones((pg_height, pg_width), dtype=np.uint64)
page_confusion_matrix = self._mlcm.generate_confusion_matrix(
gt_binary, preds_binary, matrix_categories_ids
)
page_pixels = pg_width * pg_height
doc_pixels += page_pixels
all_pages_pixels[page_no] = page_pixels
page_confusion_matrices[page_no] = page_confusion_matrix
elif (
self._missing_prediction_strategy
== MissingPredictionStrategy.IGNORE
@@ -489,7 +533,8 @@ class PixelLayoutEvaluator(BaseEvaluator):
raise ValueError(
f"Unknown missing prediction strategy: {self._missing_prediction_strategy}"
)
return page_confusion_matrices, doc_pixels, all_pages_pixels
return futures
def _get_page_layout_resolution(
self,
+6 -2
View File
@@ -8,6 +8,8 @@ import matplotlib.pyplot as plt
import numpy as np
from pydantic import BaseModel, model_validator
_log = logging.getLogger(__name__)
class DatasetStatistics(BaseModel):
total: int
@@ -82,7 +84,9 @@ def compute_stats(
mean: float = statistics.mean(values) if len(values) > 0 else -1
median: float = statistics.median(values) if len(values) > 0 else -1
std: float = statistics.stdev(values) if len(values) > 1 else 0.0
logging.info(f"total: {total}, mean: {mean}, median: {median}, std: {std}")
_log.debug(
f"Compute statistics: total: {total}, mean: {mean}, median: {median}, std: {std}"
)
max_value = 1.0
if not max_value_is_one and len(values) > 0:
@@ -90,7 +94,7 @@ def compute_stats(
# Compute the histogram
hist, bins = np.histogram(values, bins=nr_bins, range=(0, max_value))
logging.info(f"#-hist: {len(hist)}, #-bins: {len(bins)}")
_log.debug(f"Compute statistics: hist: {len(hist)}, #-bins: {len(bins)}")
return DatasetStatistics(
total=total, mean=mean, median=median, std=std, hist=hist, bins=bins
@@ -49,7 +49,7 @@ evaluate() {
}
visualize() {
visualize_predictions() {
local pred_dir save_dir modality
pred_dir="$1"
save_dir="$2"
@@ -71,10 +71,27 @@ visualize() {
--output-dir "${save_dir}"
}
# Run "docling-eval visualize" on DPBench once per entry of MODALITIES.
# Arguments:
#   $1 - directory passed as --input-dir
#   $2 - directory passed as --output-dir
visualize_evaluations() {
    local pred_dir eval_root modality
    pred_dir="$1"
    eval_root="$2"

    for modality in "${MODALITIES[@]}"; do
        # The message names the step that actually runs (visualize, not evaluate)
        echo "Visualize: modality: ${modality} for evaluations: ${eval_root}"
        uv run docling-eval visualize \
            --benchmark DPBench \
            --modality "${modality}" \
            --input-dir "${pred_dir}" \
            --output-dir "${eval_root}"
    done
}
###########################################################################################
# Main
#
#########################################
# Predictions
# json predictions
@@ -95,8 +112,13 @@ evaluate \
scratch/DPBench/external_predictions_yaml
#########################################
# Visualisations
visualize \
visualize_predictions \
scratch/DPBench/predicted_documents/json \
scratch/DPBench/external_predictions_visualisations
visualize_evaluations \
scratch/DPBench/predicted_documents/doctag \
scratch/DPBench/external_predictions_doctags