mirror of
https://github.com/docling-project/docling-eval.git
synced 2026-05-17 13:10:47 +00:00
feat: Improvements in user experience: Performance, error handling, logging (#189)
* feat: Extend evaluate_dpbench_on_external_predictions.sh to include visualisations of the evaluations Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com> * fix: Improve error checking in main.py:visualize() Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com> * fix: Improve logging Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com> * feat: Parallelize the computation of PixelLayoutEvaluator at the level of page Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com> * fix: Make DatasetPixelLayoutEvaluation a subclass of DatasetEvaluation Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com> * feat: Parallelize the MarkdownTextEvaluator Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com> * chore: Improve logging Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com> --------- Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>
This commit is contained in:
+24
-11
@@ -641,6 +641,7 @@ def evaluate(
|
||||
split: str = "test",
|
||||
cvat_overview_path: Optional[Path] = None,
|
||||
external_predictions_path: Optional[Path] = None,
|
||||
concurrency: int = 4,
|
||||
) -> Optional[DatasetEvaluationType]:
|
||||
"""Evaluate predictions against ground truth."""
|
||||
if not os.path.exists(idir):
|
||||
@@ -673,17 +674,16 @@ def evaluate(
|
||||
# label_filtering_strategy=LabelFilteringStrategy.INTERSECTION,
|
||||
page_mapping_path=cvat_overview_path,
|
||||
)
|
||||
evaluation = layout_evaluator( # type: ignore
|
||||
layout_evaluation = layout_evaluator( # type: ignore
|
||||
idir,
|
||||
split=split,
|
||||
external_predictions_path=external_predictions_path,
|
||||
)
|
||||
|
||||
with open(save_fn, "w") as fd:
|
||||
json.dump(evaluation.model_dump(), fd, indent=2, sort_keys=True)
|
||||
json.dump(layout_evaluation.model_dump(), fd, indent=2, sort_keys=True)
|
||||
|
||||
# Evaluate with the pixel-wise layout evaluation
|
||||
pixel_layout_evaluator = PixelLayoutEvaluator()
|
||||
pixel_layout_evaluator = PixelLayoutEvaluator(concurrency=concurrency)
|
||||
pixel_ds_evaluation: DatasetPixelLayoutEvaluation = pixel_layout_evaluator(
|
||||
idir,
|
||||
split=split,
|
||||
@@ -696,6 +696,9 @@ def evaluate(
|
||||
pixel_save_root,
|
||||
)
|
||||
|
||||
# TODO: Redesign evaluate() to return multiple evaluation objects
|
||||
evaluation = pixel_ds_evaluation # type: ignore
|
||||
|
||||
elif modality == EvaluationModality.TABLE_STRUCTURE:
|
||||
table_evaluator = TableEvaluator()
|
||||
evaluation = table_evaluator( # type: ignore
|
||||
@@ -764,7 +767,7 @@ def evaluate(
|
||||
)
|
||||
|
||||
elif modality == EvaluationModality.MARKDOWN_TEXT:
|
||||
md_evaluator = MarkdownTextEvaluator()
|
||||
md_evaluator = MarkdownTextEvaluator(concurrency=concurrency)
|
||||
evaluation = md_evaluator( # type: ignore
|
||||
idir,
|
||||
split=split,
|
||||
@@ -823,8 +826,8 @@ def evaluate(
|
||||
def visualize(
|
||||
modality: EvaluationModality,
|
||||
benchmark: BenchMarkNames,
|
||||
idir: Path,
|
||||
odir: Path,
|
||||
idir: Path | None = None,
|
||||
split: str = "test",
|
||||
):
|
||||
"""
|
||||
@@ -839,10 +842,6 @@ def visualize(
|
||||
begin_index: Begin index
|
||||
end_index: End index
|
||||
"""
|
||||
if not os.path.exists(idir):
|
||||
_log.error(f"Input directory not found: {idir}")
|
||||
return
|
||||
|
||||
os.makedirs(odir, exist_ok=True)
|
||||
metrics_filename = odir / f"evaluation_{benchmark.value}_{modality.value}.json"
|
||||
|
||||
@@ -989,6 +988,11 @@ def visualize(
|
||||
|
||||
elif modality == EvaluationModality.READING_ORDER:
|
||||
try:
|
||||
# idir is required here
|
||||
if idir is None or not idir.is_dir():
|
||||
_log.error(f"Input directory not found: {idir}")
|
||||
return
|
||||
|
||||
with open(metrics_filename, "r") as fd:
|
||||
ro_evaluation = DatasetReadingOrderEvaluation.model_validate_json(
|
||||
fd.read()
|
||||
@@ -1080,6 +1084,11 @@ def visualize(
|
||||
|
||||
elif modality == EvaluationModality.OCR:
|
||||
try:
|
||||
# idir is required here
|
||||
if idir is None or not idir.is_dir():
|
||||
_log.error(f"Input directory not found: {idir}")
|
||||
return
|
||||
|
||||
with open(metrics_filename, "r") as fd:
|
||||
ocr_evaluation = OcrDatasetEvaluationResult.model_validate_json(
|
||||
fd.read()
|
||||
@@ -1511,6 +1520,9 @@ def evaluate_cmd(
|
||||
help="Path to load existing DoclingDocument predictions. The filename must follow the pattern [doc_id].[json|dt|yaml|yml]",
|
||||
),
|
||||
] = None,
|
||||
concurrency: Annotated[
|
||||
int, typer.Option(help="Concurrency for the computation of each metric")
|
||||
] = 4,
|
||||
):
|
||||
"""Evaluate predictions against ground truth."""
|
||||
input_dir, output_dir = derive_input_output_dirs(
|
||||
@@ -1531,6 +1543,7 @@ def evaluate_cmd(
|
||||
odir=eval_output_dir,
|
||||
split=split,
|
||||
external_predictions_path=external_predictions_path,
|
||||
concurrency=concurrency,
|
||||
)
|
||||
|
||||
|
||||
@@ -1573,8 +1586,8 @@ def visualize_cmd(
|
||||
visualize(
|
||||
modality=modality,
|
||||
benchmark=benchmark,
|
||||
idir=input_dir,
|
||||
odir=eval_output_dir,
|
||||
idir=input_dir,
|
||||
split=split,
|
||||
)
|
||||
|
||||
|
||||
@@ -75,12 +75,14 @@ class BaseEvaluator(Generic[UnitEvaluationType, DatasetEvaluationType]):
|
||||
supported_prediction_formats: List[PredictionFormats] = [
|
||||
PredictionFormats.DOCLING_DOCUMENT
|
||||
],
|
||||
concurrency: int = 4,
|
||||
):
|
||||
r"""
|
||||
Parameters
|
||||
----------
|
||||
intermediate_evaluations_path: When set, the evaluation per example will be saved in a file
|
||||
"""
|
||||
self._concurrency = concurrency
|
||||
self._intermediate_evaluations_path = intermediate_evaluations_path
|
||||
|
||||
# Validate the prediction_sources
|
||||
|
||||
@@ -489,7 +489,7 @@ class LayoutEvaluator(BaseEvaluator):
|
||||
weighted_map_90_values.append(average_iou_90)
|
||||
weighted_map_95_values.append(average_iou_95)
|
||||
|
||||
_log.info(
|
||||
_log.debug(
|
||||
"doc: %s\tprecision: %.2f, recall: %.2f, f1: %.2f, map_50: %.2f, "
|
||||
"precision_no_pics: %.2f, recall_no_pics: %.2f, f1_no_pics: %.2f",
|
||||
doc_id_page,
|
||||
@@ -528,7 +528,6 @@ class LayoutEvaluator(BaseEvaluator):
|
||||
segmentation_precision_no_pictures=precision_no_pics,
|
||||
segmentation_recall_no_pictures=recall_no_pics,
|
||||
segmentation_f1_no_pictures=f1_no_pics,
|
||||
# New per-sample element count metrics
|
||||
true_element_count=true_element_count,
|
||||
pred_element_count=pred_element_count,
|
||||
true_table_count=true_table_count,
|
||||
@@ -836,9 +835,7 @@ class LayoutEvaluator(BaseEvaluator):
|
||||
true_labels: Dict[str, int] = {}
|
||||
pred_labels: Dict[str, int] = {}
|
||||
|
||||
for i, data in tqdm(
|
||||
enumerate(ds), desc="Layout evaluations", ncols=120, total=len(ds)
|
||||
):
|
||||
for i, data in enumerate(ds):
|
||||
data_record = DatasetRecordWithPrediction.model_validate(data)
|
||||
true_doc = data_record.ground_truth_doc
|
||||
pred_doc = self._get_pred_doc(data_record, ext_docdoc_loader)
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
import logging
|
||||
from concurrent.futures import Executor, Future, ProcessPoolExecutor, as_completed
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, Set
|
||||
|
||||
@@ -33,6 +34,51 @@ from docling_eval.utils.external_docling_document_loader import (
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def compute_bleu_score(bleu_eval, true_txt: str, pred_txt: str) -> float:
    r"""
    BLEU score of one predicted text against one reference text.

    The score is computed by `bleu_eval` (the HF evaluate metric with its
    default Tokenizer_13) and read from the "bleu" entry of its result.
    """
    # A single prediction, paired with a single reference
    return bleu_eval.compute(predictions=[pred_txt], references=[[true_txt]])["bleu"]
|
||||
|
||||
|
||||
def compute_nltk_scores(true_txt: str, pred_txt: str) -> dict[str, float]:
    r"""
    Compute the token-based nltk metrics between a reference and a predicted text.

    Parameters
    ----------
    true_txt: The reference (ground truth) text
    pred_txt: The predicted text

    Returns:
    --------
    dict with keys: ["f1_score", "precision", "recall", "edit_distance", "meteor"]
    """
    true_tokens = word_tokenize(true_txt)
    true_tokens_set = set(true_tokens)
    pred_tokens = word_tokenize(pred_txt)
    pred_tokens_set = set(pred_tokens)

    # Set-based scores, with the ground truth tokens as the reference set
    f1_score = f_measure(true_tokens_set, pred_tokens_set)
    precision_score = precision(true_tokens_set, pred_tokens_set)
    recall_score = recall(true_tokens_set, pred_tokens_set)

    # The edit distance is normalized by the length of the longer token sequence.
    # If neither text produces any token (e.g. whitespace-only input) there is
    # nothing to edit: report 0.0 instead of dividing by zero.
    max_tokens = max(len(pred_tokens), len(true_tokens))
    if max_tokens > 0:
        edit_dist = edit_distance(pred_tokens, true_tokens) / max_tokens
    else:
        edit_dist = 0.0

    meteor = meteor_score.meteor_score([true_tokens], pred_tokens)

    metrics: dict[str, float] = {
        "f1_score": f1_score,
        "precision": precision_score,
        "recall": recall_score,
        "edit_distance": edit_dist,
        "meteor": meteor,
    }
    return metrics
|
||||
|
||||
|
||||
def evaluate_page(bleu_eval, true_md: str, pred_md: str) -> dict[str, float]:
    r"""Score a pair of markdown texts: the nltk metrics plus a "bleu" entry"""
    page_scores = compute_nltk_scores(true_md, pred_md)
    bleu = compute_bleu_score(bleu_eval, true_md, pred_md)
    page_scores["bleu"] = bleu
    return page_scores
|
||||
|
||||
|
||||
class PageMarkdownEvaluation(UnitEvaluation):
|
||||
doc_id: str
|
||||
|
||||
@@ -62,6 +108,7 @@ class MarkdownTextEvaluator(BaseEvaluator):
|
||||
self,
|
||||
intermediate_evaluations_path: Optional[Path] = None,
|
||||
prediction_sources: List[PredictionFormats] = [],
|
||||
concurrency: int = 4,
|
||||
):
|
||||
r""" """
|
||||
supported_prediction_formats: List[PredictionFormats] = [
|
||||
@@ -74,6 +121,7 @@ class MarkdownTextEvaluator(BaseEvaluator):
|
||||
intermediate_evaluations_path=intermediate_evaluations_path,
|
||||
prediction_sources=prediction_sources,
|
||||
supported_prediction_formats=supported_prediction_formats,
|
||||
concurrency=concurrency,
|
||||
)
|
||||
|
||||
self._bleu_eval = evaluate.load("bleu")
|
||||
@@ -146,67 +194,80 @@ class MarkdownTextEvaluator(BaseEvaluator):
|
||||
"meteor": [],
|
||||
}
|
||||
|
||||
for i, data in tqdm(
|
||||
enumerate(ds_selection),
|
||||
desc="Markdown text evaluations",
|
||||
ncols=120,
|
||||
total=len(ds_selection),
|
||||
):
|
||||
data_record = DatasetRecordWithPrediction.model_validate(data)
|
||||
doc_id = data_record.doc_id
|
||||
true_doc = data_record.ground_truth_doc
|
||||
true_md = self._docling_document_to_md(true_doc)
|
||||
with ProcessPoolExecutor(max_workers=self._concurrency) as executor:
|
||||
futures: list[Future] = []
|
||||
|
||||
# Get the predicted markdown from the external predictions path
|
||||
if external_predictions_path is not None:
|
||||
pred_doc = external_docling_doc_loader(data_record)
|
||||
if pred_doc is None:
|
||||
_log.error("No external prediction found for doc_id=%s", doc_id)
|
||||
# Submit the evaluation tasks
|
||||
_log.info("Submitting the documents for evaluation...")
|
||||
for data in ds_selection:
|
||||
data_record = DatasetRecordWithPrediction.model_validate(data)
|
||||
doc_id = data_record.doc_id
|
||||
true_doc = data_record.ground_truth_doc
|
||||
true_md = self._docling_document_to_md(true_doc)
|
||||
|
||||
# Get the predicted markdown from the external predictions path
|
||||
if external_predictions_path is not None:
|
||||
pred_doc = external_docling_doc_loader(data_record)
|
||||
if pred_doc is None:
|
||||
_log.error("No external prediction found for doc_id=%s", doc_id)
|
||||
rejected_samples[
|
||||
EvaluationRejectionType.MISSING_PREDICTION
|
||||
] += 1
|
||||
continue
|
||||
pred_md = self._docling_document_to_md(pred_doc)
|
||||
else:
|
||||
if data_record.status not in self._accepted_status:
|
||||
_log.error(
|
||||
"Skipping record without successfull conversion status: %s",
|
||||
doc_id,
|
||||
)
|
||||
rejected_samples[
|
||||
EvaluationRejectionType.INVALID_CONVERSION_STATUS
|
||||
] += 1
|
||||
continue
|
||||
pred_md = self._get_pred_md(data_record) # type: ignore
|
||||
|
||||
if pred_md is None:
|
||||
_log.error("There is no markdown prediction for doc_id=%s", doc_id)
|
||||
rejected_samples[EvaluationRejectionType.MISSING_PREDICTION] += 1
|
||||
continue
|
||||
pred_md = self._docling_document_to_md(pred_doc)
|
||||
else:
|
||||
if data_record.status not in self._accepted_status:
|
||||
_log.error(
|
||||
"Skipping record without successfull conversion status: %s",
|
||||
doc_id,
|
||||
|
||||
if true_md != "" and pred_md != "":
|
||||
futures.append(
|
||||
executor.submit(
|
||||
evaluate_page, self._bleu_eval, true_md, pred_md
|
||||
)
|
||||
)
|
||||
rejected_samples[
|
||||
EvaluationRejectionType.INVALID_CONVERSION_STATUS
|
||||
] += 1
|
||||
continue
|
||||
pred_md = self._get_pred_md(data_record) # type: ignore
|
||||
|
||||
if not pred_md:
|
||||
_log.error("There is no markdown prediction for doc_id=%s", doc_id)
|
||||
rejected_samples[EvaluationRejectionType.MISSING_PREDICTION] += 1
|
||||
continue
|
||||
# Collect the futures
|
||||
_log.info("Collecting the documents for evaluations...")
|
||||
for i, future in tqdm(
|
||||
enumerate(as_completed(futures)),
|
||||
desc="Markdown text evaluations",
|
||||
ncols=120,
|
||||
total=len(futures),
|
||||
):
|
||||
doc_metrics = future.result()
|
||||
|
||||
bleu = 0.0
|
||||
if true_md != "" and pred_md != "":
|
||||
bleu = self._compute_bleu_score(true_md, pred_md)
|
||||
ntlk_scores = self._compute_nltk_scores(true_md, pred_md)
|
||||
# Collect metrics across pages
|
||||
for score_name, score in doc_metrics.items():
|
||||
ds_metrics[score_name].append(score)
|
||||
|
||||
# Collect metrics across pages
|
||||
ds_metrics["bleu"].append(bleu)
|
||||
for score_name, score in ntlk_scores.items():
|
||||
ds_metrics[score_name].append(score)
|
||||
md_evaluation = PageMarkdownEvaluation(
|
||||
doc_id=doc_id,
|
||||
true_md=true_md,
|
||||
pred_md=pred_md,
|
||||
bleu=doc_metrics["bleu"],
|
||||
f1_score=doc_metrics["f1_score"],
|
||||
precision=doc_metrics["precision"],
|
||||
recall=doc_metrics["recall"],
|
||||
edit_distance=doc_metrics["edit_distance"],
|
||||
meteor=doc_metrics["meteor"],
|
||||
)
|
||||
evaluations.append(md_evaluation)
|
||||
|
||||
md_evaluation = PageMarkdownEvaluation(
|
||||
doc_id=doc_id,
|
||||
true_md=true_md,
|
||||
pred_md=pred_md,
|
||||
bleu=bleu,
|
||||
f1_score=ntlk_scores["f1_score"],
|
||||
precision=ntlk_scores["precision"],
|
||||
recall=ntlk_scores["recall"],
|
||||
edit_distance=ntlk_scores["edit_distance"],
|
||||
meteor=ntlk_scores["meteor"],
|
||||
)
|
||||
evaluations.append(md_evaluation)
|
||||
|
||||
if self._intermediate_evaluations_path:
|
||||
self.save_intermediate_evaluations("MD", i, doc_id, evaluations)
|
||||
if self._intermediate_evaluations_path:
|
||||
self.save_intermediate_evaluations("MD", i, doc_id, evaluations)
|
||||
|
||||
ds_md_evalutions = DatasetMarkdownEvaluation(
|
||||
evaluated_samples=len(evaluations),
|
||||
@@ -221,44 +282,6 @@ class MarkdownTextEvaluator(BaseEvaluator):
|
||||
)
|
||||
return ds_md_evalutions
|
||||
|
||||
def _compute_bleu_score(self, true_txt: str, pred_txt: str) -> float:
|
||||
r"""
|
||||
Compute BLEU score with the HF evaluate and the default Tokenizer_13
|
||||
"""
|
||||
result = self._bleu_eval.compute(
|
||||
predictions=[pred_txt], references=[[true_txt]]
|
||||
)
|
||||
bleu = result["bleu"]
|
||||
return bleu
|
||||
|
||||
def _compute_nltk_scores(self, true_txt: str, pred_txt: str) -> dict[str, float]:
|
||||
r"""
|
||||
Returns:
|
||||
--------
|
||||
dict with keys: ["f_measure", "precision", "recall", "edit_dist"]
|
||||
"""
|
||||
true_tokens = word_tokenize(true_txt)
|
||||
true_tokens_set = set(true_tokens)
|
||||
pred_tokens = word_tokenize(pred_txt)
|
||||
pred_tokens_set = set(pred_tokens)
|
||||
|
||||
f1_score = f_measure(true_tokens_set, pred_tokens_set)
|
||||
precision_score = precision(true_tokens_set, pred_tokens_set)
|
||||
recall_score = recall(true_tokens_set, pred_tokens_set)
|
||||
edit_dist = edit_distance(pred_tokens, true_tokens) / max(
|
||||
len(pred_tokens), len(true_tokens)
|
||||
)
|
||||
meteor = meteor_score.meteor_score([true_tokens], pred_tokens)
|
||||
|
||||
metrics: dict[str, float] = {
|
||||
"f1_score": f1_score,
|
||||
"precision": precision_score,
|
||||
"recall": recall_score,
|
||||
"edit_distance": edit_dist,
|
||||
"meteor": meteor,
|
||||
}
|
||||
return metrics
|
||||
|
||||
def _docling_document_to_md(self, doc: DoclingDocument) -> str:
|
||||
r"""
|
||||
Export DoclingDocument to markdown
|
||||
|
||||
@@ -3,7 +3,10 @@ from typing import Any, Dict, Optional
|
||||
import numpy as np
|
||||
from pydantic import BaseModel, model_serializer, model_validator
|
||||
|
||||
from docling_eval.evaluators.base_evaluator import EvaluationRejectionType
|
||||
from docling_eval.evaluators.base_evaluator import (
|
||||
DatasetEvaluation,
|
||||
EvaluationRejectionType,
|
||||
)
|
||||
from docling_eval.evaluators.stats import DatasetStatistics
|
||||
|
||||
|
||||
@@ -73,11 +76,10 @@ class PagePixelLayoutEvaluation(BaseModel):
|
||||
matrix_evaluation: MultiLabelMatrixEvaluation
|
||||
|
||||
|
||||
class DatasetPixelLayoutEvaluation(BaseModel):
|
||||
class DatasetPixelLayoutEvaluation(DatasetEvaluation):
|
||||
layout_model_name: Optional[str]
|
||||
num_pages: int
|
||||
num_pixels: int
|
||||
rejected_samples: Dict[EvaluationRejectionType, int]
|
||||
matrix_evaluation: MultiLabelMatrixEvaluation
|
||||
page_evaluations: Dict[str, PagePixelLayoutEvaluation]
|
||||
|
||||
|
||||
@@ -3,6 +3,7 @@ import json
|
||||
import logging
|
||||
import math
|
||||
from collections import defaultdict
|
||||
from concurrent.futures import Executor, Future, ProcessPoolExecutor, as_completed
|
||||
from enum import Enum
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional, Tuple, Union
|
||||
@@ -53,6 +54,51 @@ def category_name_to_docitemlabel(category_name: str) -> DocItemLabel:
|
||||
return label
|
||||
|
||||
|
||||
def evaluate_page(
    mlcm: MultiLabelConfusionMatrix,
    doc_id: str,
    page_no: int,
    pg_width: int,
    pg_height: int,
    matrix_id_to_name: dict[int, str],
    gt_resolutions: list[LayoutResolution],
    pred_resolutions: Optional[list[LayoutResolution]] = None,
) -> tuple[str, int, int, MultiLabelMatrixEvaluation]:
    r"""
    Evaluate one page: build its confusion matrix and derive the metrics from it.

    When pred_resolutions is None an all-ones array of the page size is used
    in place of the predictions, i.e. an all-background prediction is assumed.

    Return
    ------
    doc_id: Passed through unchanged
    page_no: Passed through unchanged
    page_pixels: pg_width * pg_height
    page_metrics: The metrics computed by mlcm for the page confusion matrix
    """
    # Binary representation of the ground truth
    gt_repr = mlcm.make_binary_representation(pg_width, pg_height, gt_resolutions)

    # Binary representation of the predictions (or its all-ones substitute)
    if pred_resolutions is None:
        pred_repr = np.ones((pg_height, pg_width), dtype=np.uint64)
    else:
        pred_repr = mlcm.make_binary_representation(
            pg_width, pg_height, pred_resolutions
        )

    # Confusion matrix over the ids of the matrix categories
    category_ids: List[int] = list(matrix_id_to_name)
    page_cm = mlcm.generate_confusion_matrix(gt_repr, pred_repr, category_ids)

    # Metrics derived from the confusion matrix
    metrics: MultiLabelMatrixEvaluation = mlcm.compute_metrics(
        page_cm, matrix_id_to_name
    )

    return doc_id, page_no, pg_width * pg_height, metrics
|
||||
|
||||
|
||||
class PixelLayoutEvaluator(BaseEvaluator):
|
||||
r"""
|
||||
Evaluate the document layout by computing a pixel-level confusion matrix and derivative matrices
|
||||
@@ -65,6 +111,7 @@ class PixelLayoutEvaluator(BaseEvaluator):
|
||||
intermediate_evaluations_path: Optional[Path] = None,
|
||||
prediction_sources: List[PredictionFormats] = [],
|
||||
missing_prediction_strategy: MissingPredictionStrategy = MissingPredictionStrategy.PENALIZE,
|
||||
concurrency: int = 4,
|
||||
):
|
||||
r"""
|
||||
|
||||
@@ -82,6 +129,7 @@ class PixelLayoutEvaluator(BaseEvaluator):
|
||||
if not prediction_sources:
|
||||
prediction_sources = supported_prediction_formats
|
||||
super().__init__(
|
||||
concurrency=concurrency,
|
||||
intermediate_evaluations_path=intermediate_evaluations_path,
|
||||
prediction_sources=prediction_sources,
|
||||
supported_prediction_formats=supported_prediction_formats,
|
||||
@@ -103,6 +151,23 @@ class PixelLayoutEvaluator(BaseEvaluator):
|
||||
self._build_matrix_categories(label_mapping)
|
||||
)
|
||||
|
||||
@staticmethod
def evaluation_filenames(
    benchmark: BenchMarkNames, save_root: Path
) -> dict[str, Path]:
    r"""
    Build the expected paths of the evaluation files under save_root.

    The returned dict has the keys "json" and "excel".
    """
    modality: str = EvaluationModality.LAYOUT.value
    return {
        "json": save_root / f"evaluation_{benchmark.value}_pixel_{modality}.json",
        "excel": save_root / f"evaluation_{benchmark.value}_pixel_{modality}.xlsx",
    }
|
||||
|
||||
def _build_matrix_categories(
|
||||
self,
|
||||
label_mapping: Optional[Dict[DocItemLabel, Optional[DocItemLabel]]] = None,
|
||||
@@ -193,6 +258,7 @@ class PixelLayoutEvaluator(BaseEvaluator):
|
||||
ds_selection: Dataset = ds[split]
|
||||
|
||||
# Results containers
|
||||
evaluated_samples = 0
|
||||
rejected_samples: Dict[EvaluationRejectionType, int] = {
|
||||
EvaluationRejectionType.INVALID_CONVERSION_STATUS: 0,
|
||||
EvaluationRejectionType.MISSING_PREDICTION: 0,
|
||||
@@ -213,95 +279,101 @@ class PixelLayoutEvaluator(BaseEvaluator):
|
||||
[]
|
||||
) # Gather f1 score/image when evaluated on collapsed classes
|
||||
|
||||
for i, data in tqdm(
|
||||
enumerate(ds_selection),
|
||||
desc="Multi-label Matrix Layout evaluations",
|
||||
ncols=120,
|
||||
total=len(ds_selection),
|
||||
):
|
||||
data_record = DatasetRecordWithPrediction.model_validate(data)
|
||||
with ProcessPoolExecutor(max_workers=self._concurrency) as executor:
|
||||
futures: list[Future] = []
|
||||
|
||||
# Try to extract the layout model name
|
||||
if not self._layout_model_name:
|
||||
self._layout_model_name = dict_get(
|
||||
data_record.predictor_info,
|
||||
[
|
||||
"options",
|
||||
"pdf",
|
||||
"pipeline_options",
|
||||
"layout_options",
|
||||
"model_spec",
|
||||
"name",
|
||||
],
|
||||
)
|
||||
# Submit pages for execution
|
||||
_log.info("Submitting the documents for evaluation...")
|
||||
for data in ds_selection:
|
||||
data_record = DatasetRecordWithPrediction.model_validate(data)
|
||||
|
||||
doc_id: str = data_record.doc_id
|
||||
if (
|
||||
ext_docdoc_loader is None
|
||||
and data_record.status not in self._accepted_status
|
||||
):
|
||||
_log.error(
|
||||
"Skipping record without successfull conversion status: %s", doc_id
|
||||
)
|
||||
rejected_samples[EvaluationRejectionType.INVALID_CONVERSION_STATUS] += 1
|
||||
continue
|
||||
# Try to extract the layout model name
|
||||
if not self._layout_model_name:
|
||||
self._layout_model_name = dict_get(
|
||||
data_record.predictor_info,
|
||||
[
|
||||
"options",
|
||||
"pdf",
|
||||
"pipeline_options",
|
||||
"layout_options",
|
||||
"model_spec",
|
||||
"name",
|
||||
],
|
||||
)
|
||||
|
||||
true_doc = data_record.ground_truth_doc
|
||||
pred_doc = self._get_pred_doc(data_record, ext_docdoc_loader)
|
||||
if not pred_doc:
|
||||
_log.error("There is no prediction for doc_id=%s", doc_id)
|
||||
rejected_samples[EvaluationRejectionType.MISSING_PREDICTION] += 1
|
||||
continue
|
||||
doc_id: str = data_record.doc_id
|
||||
if (
|
||||
ext_docdoc_loader is None
|
||||
and data_record.status not in self._accepted_status
|
||||
):
|
||||
_log.error(
|
||||
"Skipping record without successfull conversion status: %s",
|
||||
doc_id,
|
||||
)
|
||||
rejected_samples[
|
||||
EvaluationRejectionType.INVALID_CONVERSION_STATUS
|
||||
] += 1
|
||||
continue
|
||||
|
||||
# Compute confusion matrices
|
||||
pages_confusion_matrices: Dict[int, np.ndarray]
|
||||
pages_pixels: Dict[int, int]
|
||||
pages_confusion_matrices, doc_num_pixels, pages_pixels = (
|
||||
self._compute_document_confusion_matrix(true_doc, pred_doc)
|
||||
)
|
||||
true_doc = data_record.ground_truth_doc
|
||||
pred_doc = self._get_pred_doc(data_record, ext_docdoc_loader)
|
||||
if not pred_doc:
|
||||
_log.error("There is no prediction for doc_id=%s", doc_id)
|
||||
rejected_samples[EvaluationRejectionType.MISSING_PREDICTION] += 1
|
||||
continue
|
||||
|
||||
# Compute metrics per page
|
||||
for page_no, page_confusion_matrix in pages_confusion_matrices.items():
|
||||
# Contribute to the dataset's confusion matrix
|
||||
ds_confusion_matrix += page_confusion_matrix
|
||||
|
||||
# Compute page metrics
|
||||
page_matrix_evaluation: MultiLabelMatrixEvaluation = (
|
||||
self._mlcm.compute_metrics(
|
||||
page_confusion_matrix,
|
||||
self._matrix_id_to_name,
|
||||
evaluated_samples += 1
|
||||
futures.extend(
|
||||
self._submit_document_evaluation(
|
||||
executor, doc_id, true_doc, pred_doc
|
||||
)
|
||||
)
|
||||
|
||||
# Collect the futures
|
||||
_log.info("Collecting the documents for evaluations...")
|
||||
for future in tqdm(
|
||||
as_completed(futures),
|
||||
desc="Multi-label Matrix Layout evaluations",
|
||||
ncols=120,
|
||||
total=len(futures),
|
||||
):
|
||||
page_metrics: MultiLabelMatrixEvaluation
|
||||
doc_id, page_no, page_pixels, page_metrics = future.result()
|
||||
|
||||
page_confusion_matrix: np.ndarray = (
|
||||
page_metrics.detailed.confusion_matrix
|
||||
)
|
||||
ds_num_pixels += page_pixels
|
||||
ds_confusion_matrix += page_confusion_matrix
|
||||
doc_page_id = f"{doc_id}-{page_no}"
|
||||
page_evaluation = PagePixelLayoutEvaluation(
|
||||
doc_id=doc_id,
|
||||
page_no=page_no,
|
||||
num_pixels=pages_pixels[page_no],
|
||||
matrix_evaluation=page_matrix_evaluation,
|
||||
num_pixels=page_pixels,
|
||||
matrix_evaluation=page_metrics,
|
||||
)
|
||||
doc_page_id = f"{doc_id}-{page_no}"
|
||||
all_pages_evaluations[doc_page_id] = page_evaluation
|
||||
|
||||
# Update f1 lists
|
||||
pages_detailed_f1.append(
|
||||
page_matrix_evaluation.detailed.agg_metrics.classes_f1_mean
|
||||
page_metrics.detailed.agg_metrics.classes_f1_mean
|
||||
)
|
||||
pages_collapsed_f1.append(
|
||||
page_matrix_evaluation.collapsed.agg_metrics.classes_f1_mean
|
||||
page_metrics.collapsed.agg_metrics.classes_f1_mean
|
||||
)
|
||||
|
||||
ds_num_pixels += doc_num_pixels
|
||||
|
||||
# Compute metrics for the dataset and each document
|
||||
# Compute metrics for the dataset
|
||||
ds_matrix_evaluation: MultiLabelMatrixEvaluation = self._mlcm.compute_metrics(
|
||||
ds_confusion_matrix,
|
||||
self._matrix_id_to_name,
|
||||
)
|
||||
|
||||
ds_evaluation = DatasetPixelLayoutEvaluation(
|
||||
evaluated_samples=evaluated_samples,
|
||||
rejected_samples=rejected_samples,
|
||||
layout_model_name=self._layout_model_name,
|
||||
num_pages=len(all_pages_evaluations),
|
||||
num_pixels=ds_num_pixels,
|
||||
rejected_samples=rejected_samples,
|
||||
matrix_evaluation=ds_matrix_evaluation,
|
||||
page_evaluations=all_pages_evaluations,
|
||||
f1_all_classes_stats=compute_stats(pages_detailed_f1),
|
||||
@@ -310,23 +382,6 @@ class PixelLayoutEvaluator(BaseEvaluator):
|
||||
|
||||
return ds_evaluation
|
||||
|
||||
@staticmethod
|
||||
def evaluation_filenames(
|
||||
benchmark: BenchMarkNames, save_root: Path
|
||||
) -> dict[str, Path]:
|
||||
r"""
|
||||
Generate the expected filenames for the produced evaluation files
|
||||
"""
|
||||
modality: str = EvaluationModality.LAYOUT.value
|
||||
json_fn = save_root / f"evaluation_{benchmark.value}_pixel_{modality}.json"
|
||||
excel_fn = save_root / f"evaluation_{benchmark.value}_pixel_{modality}.xlsx"
|
||||
|
||||
eval_filenames: dict[str, Path] = {
|
||||
"json": json_fn,
|
||||
"excel": excel_fn,
|
||||
}
|
||||
return eval_filenames
|
||||
|
||||
def save_evaluations(
|
||||
self,
|
||||
benchmark: BenchMarkNames,
|
||||
@@ -393,19 +448,17 @@ class PixelLayoutEvaluator(BaseEvaluator):
|
||||
excel_fn,
|
||||
)
|
||||
|
||||
def _compute_document_confusion_matrix(
|
||||
def _submit_document_evaluation(
|
||||
self,
|
||||
executor: Executor,
|
||||
doc_id: str,
|
||||
true_doc: DoclingDocument,
|
||||
pred_doc: DoclingDocument,
|
||||
) -> Tuple[
|
||||
Dict[int, np.ndarray], # page_no -> page confusion matrix
|
||||
int, # document num_pixels
|
||||
Dict[int, int], # page_no -> page num_pixels
|
||||
]:
|
||||
) -> list[Future]:
|
||||
r"""
|
||||
Compute the confusion matrix for the given documents.
|
||||
This is the sum of the confusion matrices of the document pages.
|
||||
Submit the document for evaluation and return a future for each page
|
||||
"""
|
||||
futures: list[Future] = []
|
||||
|
||||
# Collect all DocItems by page for both GT and predictions
|
||||
true_pages_to_objects = self._collect_items_by_page(true_doc)
|
||||
@@ -416,13 +469,6 @@ class PixelLayoutEvaluator(BaseEvaluator):
|
||||
pred_pages = set(pred_pages_to_objects.keys())
|
||||
_log.debug(f"GT pages: {sorted(gt_pages)}, Pred pages: {sorted(pred_pages)}")
|
||||
|
||||
matrix_categories_ids: List[int] = list(self._matrix_id_to_name.keys())
|
||||
page_confusion_matrices: Dict[int, np.ndarray] = (
|
||||
{}
|
||||
) # page_no -> page confusion_matrix
|
||||
all_pages_pixels: Dict[int, int] = {} # page_no -> page num_pixels
|
||||
doc_pixels = 0
|
||||
|
||||
for page_no in sorted(gt_pages):
|
||||
page_size = true_doc.pages[page_no].size
|
||||
pg_width = math.ceil(page_size.width)
|
||||
@@ -444,41 +490,39 @@ class PixelLayoutEvaluator(BaseEvaluator):
|
||||
doc=pred_doc,
|
||||
)
|
||||
|
||||
# TODO: Parallelize the confusion matrix over the pages
|
||||
# Compute the confusion matrix
|
||||
gt_binary = self._mlcm.make_binary_representation(
|
||||
pg_width, pg_height, gt_layouts
|
||||
# Submit the page for computation
|
||||
futures.append(
|
||||
executor.submit(
|
||||
evaluate_page,
|
||||
self._mlcm,
|
||||
doc_id,
|
||||
page_no,
|
||||
pg_width,
|
||||
pg_height,
|
||||
self._matrix_id_to_name,
|
||||
gt_layouts,
|
||||
pred_layouts,
|
||||
)
|
||||
)
|
||||
preds_binary = self._mlcm.make_binary_representation(
|
||||
pg_width, pg_height, pred_layouts
|
||||
)
|
||||
page_confusion_matrix = self._mlcm.generate_confusion_matrix(
|
||||
gt_binary, preds_binary, matrix_categories_ids
|
||||
)
|
||||
page_pixels = pg_width * pg_height
|
||||
doc_pixels += page_pixels
|
||||
all_pages_pixels[page_no] = page_pixels
|
||||
page_confusion_matrices[page_no] = page_confusion_matrix
|
||||
else:
|
||||
# No prediction data for this page
|
||||
if (
|
||||
self._missing_prediction_strategy
|
||||
== MissingPredictionStrategy.PENALIZE
|
||||
):
|
||||
gt_binary = self._mlcm.make_binary_representation(
|
||||
pg_width, pg_height, gt_layouts
|
||||
# Submit the page for computation
|
||||
futures.append(
|
||||
executor.submit(
|
||||
evaluate_page,
|
||||
self._mlcm,
|
||||
doc_id,
|
||||
page_no,
|
||||
pg_width,
|
||||
pg_height,
|
||||
self._matrix_id_to_name,
|
||||
gt_layouts,
|
||||
)
|
||||
)
|
||||
|
||||
# Make an all-one binary representation for the prediction and evaluate as usual
|
||||
preds_binary = np.ones((pg_height, pg_width), dtype=np.uint64)
|
||||
page_confusion_matrix = self._mlcm.generate_confusion_matrix(
|
||||
gt_binary, preds_binary, matrix_categories_ids
|
||||
)
|
||||
|
||||
page_pixels = pg_width * pg_height
|
||||
doc_pixels += page_pixels
|
||||
all_pages_pixels[page_no] = page_pixels
|
||||
page_confusion_matrices[page_no] = page_confusion_matrix
|
||||
elif (
|
||||
self._missing_prediction_strategy
|
||||
== MissingPredictionStrategy.IGNORE
|
||||
@@ -489,7 +533,8 @@ class PixelLayoutEvaluator(BaseEvaluator):
|
||||
raise ValueError(
|
||||
f"Unknown missing prediction strategy: {self._missing_prediction_strategy}"
|
||||
)
|
||||
return page_confusion_matrices, doc_pixels, all_pages_pixels
|
||||
|
||||
return futures
|
||||
|
||||
def _get_page_layout_resolution(
|
||||
self,
|
||||
|
||||
@@ -8,6 +8,8 @@ import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
from pydantic import BaseModel, model_validator
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class DatasetStatistics(BaseModel):
|
||||
total: int
|
||||
@@ -82,7 +84,9 @@ def compute_stats(
|
||||
mean: float = statistics.mean(values) if len(values) > 0 else -1
|
||||
median: float = statistics.median(values) if len(values) > 0 else -1
|
||||
std: float = statistics.stdev(values) if len(values) > 1 else 0.0
|
||||
logging.info(f"total: {total}, mean: {mean}, median: {median}, std: {std}")
|
||||
_log.debug(
|
||||
f"Compute statistics: total: {total}, mean: {mean}, median: {median}, std: {std}"
|
||||
)
|
||||
|
||||
max_value = 1.0
|
||||
if not max_value_is_one and len(values) > 0:
|
||||
@@ -90,7 +94,7 @@ def compute_stats(
|
||||
|
||||
# Compute the histogram
|
||||
hist, bins = np.histogram(values, bins=nr_bins, range=(0, max_value))
|
||||
logging.info(f"#-hist: {len(hist)}, #-bins: {len(bins)}")
|
||||
_log.debug(f"Compute statistics: hist: {len(hist)}, #-bins: {len(bins)}")
|
||||
|
||||
return DatasetStatistics(
|
||||
total=total, mean=mean, median=median, std=std, hist=hist, bins=bins
|
||||
|
||||
@@ -49,7 +49,7 @@ evaluate() {
|
||||
}
|
||||
|
||||
|
||||
visualize() {
|
||||
visualize_predictions() {
|
||||
local pred_dir save_dir modality
|
||||
pred_dir="$1"
|
||||
save_dir="$2"
|
||||
@@ -71,10 +71,27 @@ visualize() {
|
||||
--output-dir "${save_dir}"
|
||||
}
|
||||
|
||||
|
||||
# Run "docling-eval visualize" on DPBench for every modality in MODALITIES.
#   $1: directory with the predictions (passed as --input-dir)
#   $2: root directory of the evaluations (passed as --output-dir)
visualize_evaluations() {
    local pred_dir eval_root modality
    pred_dir="$1"
    eval_root="$2"

    for modality in "${MODALITIES[@]}"; do
        # The message names the action that actually runs (visualize, not evaluate)
        echo "Visualize: modality: ${modality} for evaluations: ${eval_root}"
        uv run docling-eval visualize \
            --benchmark DPBench \
            --modality "${modality}" \
            --input-dir "${pred_dir}" \
            --output-dir "${eval_root}"
    done
}
|
||||
|
||||
###########################################################################################
|
||||
# Main
|
||||
#
|
||||
|
||||
#########################################
|
||||
# Predictions
|
||||
|
||||
# json predictions
|
||||
@@ -95,8 +112,13 @@ evaluate \
|
||||
scratch/DPBench/external_predictions_yaml
|
||||
|
||||
|
||||
#########################################
|
||||
# Visualisations
|
||||
visualize \
|
||||
visualize_predictions \
|
||||
scratch/DPBench/predicted_documents/json \
|
||||
scratch/DPBench/external_predictions_visualisations
|
||||
|
||||
visualize_evaluations \
|
||||
scratch/DPBench/predicted_documents/doctag \
|
||||
scratch/DPBench/external_predictions_doctags
|
||||
|
||||
|
||||
Reference in New Issue
Block a user