Files
Nikos Livathinos 53dbd955ae feat: Extend the evaluators to support external predictions stored in files (#185)
* chore: Move the teds.py inside the subdir evaluators/table

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>

* feat: Introduce the external_predictions_path in BaseEvaluator and dummy entries in all evaluators.
Extend the CLI to support the --external-predictions-path

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>

* feat: Extend test_dataset_builder.py to save document predictions in various formats

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>

* feat: Extend MarkDownTextEvaluator to support external_predictions_path. Add unit test

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>

* feat: Extend LayoutEvaluator to support external_predictions_path. Add unit test.

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>

* fix: Add missing pytest dependencies in tests

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>

* fix: Fix loading the external predictions in LayoutEvaluator

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>

* feat: Introduce external predictions in DocStructureEvaluator. Add unit test.

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>

* feat: Extend the TableEvaluator to support external predictions. Add unit test

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>

* feat: Extend the KeyValueEvaluator to support external predictions. Add unit test.

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>

* feat: Extend the PixelLayoutEvaluator to support external predictions. Add unit test

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>

* feat: Extend the BboxTextEvaluator to support external predictions. Add unit test

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>

* feat: Disable the OCREvaluator when using the external predictions

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>

* fix: Fixing guard for external predictions in TimingsEvaluator, ReadingOrderEvaluator. Fix main

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>

* fix: Export the doctag files with the correct file extension

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>

* feat: Refactor the ExternalDoclingDocumentLoader to properly load a DoclingDocument from doctags and
the GT image.
- Introduce the staticmethod load_doctags() which covers all cases on page image loading.
- Refactor the FilePredictionProvider to use the load_doctags() from ExternalDoclingDocumentLoader.
- Refactor all evaluators to use the new ExternalDoclingDocumentLoader.

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>

* chore: Rename code file as external_docling_document_loader.py

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>

* fix: Fix typo

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>

* feat: Introduce examples of how to evaluate with external predictions through the API and the CLI.

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>

---------

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>
2025-12-08 16:51:45 +01:00

148 lines
4.6 KiB
Python

import argparse
import logging
from pathlib import Path
from typing import List, Optional
from docling_eval.aggregations.consolidator import Consolidator
from docling_eval.aggregations.multi_evalutor import MultiEvaluator
from docling_eval.datamodels.types import (
BenchMarkNames,
ConsolidationFormats,
EvaluationModality,
PredictionProviderType,
)
# Module logger, plus a WARNING threshold on the "docling" logger
_log = logging.getLogger(__name__)
logging.getLogger("docling").setLevel(logging.WARNING)
def evaluate(
    root_dir: Path,
    benchmarks: List[BenchMarkNames],
    experiments: List[str],
    modalities: List[EvaluationModality],
):
    r"""
    Run the multi-evaluation rooted at `root_dir`.

    A MultiEvaluator is built on `root_dir` and then invoked with the
    experiments, the benchmarks and the modalities (in that order).
    Whatever the evaluator returns is not used here.
    """
    multi_evaluator = MultiEvaluator(root_dir)

    _log.info("Evaluating...")
    multi_evaluator(experiments, benchmarks, modalities)
    _log.info("Finish evaluation")
def consolidate(
    working_dir: Path,
):
    r"""
    Consolidate the multi-evaluation found in `working_dir`.

    The multi-evaluation is loaded through MultiEvaluator.load_multi_evaluation
    and handed to a Consolidator created on `working_dir / "consolidation"`,
    once for the Excel format and once for the Latex format.
    """
    loaded_evaluation = MultiEvaluator.load_multi_evaluation(working_dir)
    run_consolidation = Consolidator(working_dir / "consolidation")

    # One consolidation pass per output format: Excel first, then Latex
    for output_format in (ConsolidationFormats.EXCEL, ConsolidationFormats.LATEX):
        _log.info("Consolidating in %s...", output_format.value)
        run_consolidation(loaded_evaluation, consolidation_format=output_format)

    _log.info("Finish consolidation")
def _split_csv(raw: Optional[str]) -> List[str]:
    r"""Split a comma separated CLI value, trimming blanks and dropping empty items."""
    if not raw:
        return []
    trimmed = (item.strip() for item in raw.split(","))
    return [item for item in trimmed if item]


def main(args):
    r"""
    Dispatch the requested task on the working directory.

    `args` must expose the attributes `task`, `working_dir`, `benchmarks`,
    `experiments_or_providers` and `modalities`. The last three are comma
    separated strings (or None); surrounding whitespace and empty items
    are ignored, so "a, b" and "a,b" are equivalent.

    Tasks:
    - "evaluate": run evaluate()
    - "consolidate": run consolidate()
    - "both": run evaluate() and then consolidate()

    "evaluate" and "both" need non-empty benchmarks, experiments/providers and
    modalities; otherwise an error is logged and nothing is run. An unknown
    task is logged as an error. The function always returns None.
    """
    task = args.task
    working_dir = Path(args.working_dir)

    # The enum constructors receive the trimmed names; whatever they raise
    # for an unknown name propagates to the caller.
    benchmarks = [BenchMarkNames(x) for x in _split_csv(args.benchmarks)]
    experiments_or_providers = _split_csv(args.experiments_or_providers)
    modalities = [EvaluationModality(x) for x in _split_csv(args.modalities)]

    if task not in ("evaluate", "consolidate", "both"):
        _log.error("Unsupported task: %s", task)
        return

    if task in ("evaluate", "both"):
        # Single validation shared by both tasks that run an evaluation
        if not (benchmarks and experiments_or_providers and modalities):
            _log.error("Required Benchmarks/Experiments/Modalities")
            return
        evaluate(working_dir, benchmarks, experiments_or_providers, modalities)

    if task in ("consolidate", "both"):
        consolidate(working_dir)
if __name__ == "__main__":
    # The description is printed verbatim (RawTextHelpFormatter), so the
    # indentation of the directory tree below is part of the help output.
    description = """
    Running multi-evaluation and consolidation inside a working directory and generate matrix reports
    The working directory must have the structure:
    .
    ├── consolidation
    │   └── consolidation_matrix.xlsx
    └── <benchmark_name>
        ├── gt_dataset [Dir with dataset in parquet format with the ground truth DoclingDocuments]
        ├── <experiment_name1> [It can be the name of a provider or anything else]
        │   ├── eval_dataset
        │   └── evaluations
        │       ├── <modality1>
        │       │   └── evaluation_<benchmark>_<modality1>.json
        │       └── <modality2>
        │           └── evaluation_<benchmark>_<modality2>.json
        └── <experiment_name2> [It can be the name of a provider or anything else]
            ├── eval_dataset
            └── evaluations
                └── <modality1>
                    └── evaluation_<benchmark>_<modality1>.json
    """
    parser = argparse.ArgumentParser(
        description=description, formatter_class=argparse.RawTextHelpFormatter
    )
    parser.add_argument(
        "-t",
        "--task",
        required=True,
        # Reject unknown tasks at parse time (usage message, non-zero exit)
        choices=["evaluate", "consolidate", "both"],
        help="One of ['evaluate', 'consolidate', 'both']",
    )
    parser.add_argument(
        "-d",
        "--working_dir",
        required=True,
        help="Working directory",
    )
    parser.add_argument(
        "-b",
        "--benchmarks",
        required=False,
        default=None,
        help=f"Evaluate: Comma separated list of {[x.value for x in BenchMarkNames]}",
    )
    parser.add_argument(
        "-e",
        "--experiments_or_providers",
        required=False,
        default=None,
        help="Evaluate: Comma separated list of experiments or providers names.",
    )
    parser.add_argument(
        "-m",
        "--modalities",
        required=False,
        default=None,
        help=f"Evaluate: Comma separated list of {[x.value for x in EvaluationModality]}",
    )

    args = parser.parse_args()
    main(args)