Files
samiuc 17e9fde84f feat: Update OCREvaluator with additional metrics (#78)
* Add README for Docling-DPBench

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* feat: Update OCREvaluator with additional metrics

* fix: bug fix

* add edit-distance lib

* update pure ocr metrics

* Establish SegmentedPage support in DatasetRecord and DatasetRecordWithPrediction

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Add SegmentedPage usage to PixParse dataset provider

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* add pure ocr metrics

* refactor: update dependencies

* fix dependencies and build errors

* feat: add optype and scipy-stubs packages

* fix: fix type error

* fix package name

* fix bugs and add funsd ocr test

* fix type error

* finalize changes

* fix build errors

* fix: ignore edit_distance missing import

* Add functionality to merge cells in Google OCR prediction (#103)

* feat: add global_merge function in google prediction provider for word cell merging

* address review comment

* remove unused imports

* address review comments and remove dictionary conversions

---------

Co-authored-by: samiullahchattha <Sami.Ullah1@ibm.com>

* refactor and address review comments

* fix regression bug

* refactor code and reduce metrics to three

* make ocr classes private

* fix type error

* refactor: update geometry utils to use BoundingBox and TextCell

Signed-off-by: samiullahchattha <Sami.Ullah1@ibm.com>

* refactor: rename metrics variables for consistency and clarity

Signed-off-by: samiullahchattha <Sami.Ullah1@ibm.com>

* Update lock for docling-core

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

---------

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Signed-off-by: samiuc <sami.ullah.chat@gmail.com>
Signed-off-by: samiullahchattha <Sami.Ullah1@ibm.com>
Co-authored-by: Christoph Auer <cau@zurich.ibm.com>
Co-authored-by: samiullahchattha <Sami.Ullah1@ibm.com>
2025-06-02 14:48:32 +02:00

99 lines
3.0 KiB
Python

import logging
import os
import shutil
from pathlib import Path
import pytest
from docling_eval.dataset_builders.dataset_builder import (
BaseEvaluationDatasetBuilder,
S3Source,
)
# True when the "CI" environment variable is set to a non-empty value.
IS_CI = bool(os.environ.get("CI"))

# Module-level logger named after this module.
_log = logging.getLogger(__name__)
@pytest.mark.skipif(
    IS_CI, reason="Skipping test in CI if the dataset in cos is very large."
)
def test_s3source():
    """Check where ``BaseEvaluationDatasetBuilder`` places data pulled via ``S3Source``.

    The connection settings are read from the environment variables
    ``S3_ENDPOINT``, ``S3_ACCESS_KEY``, ``S3_SECRET_KEY``, ``S3_BUCKET`` and
    ``S3_KEY_PREFIX``; the referenced bucket/prefix must contain some data.

    Two scenarios are covered:

    1. ``target`` and ``dataset_local_path`` are both given: the local dataset
       path must end up non-empty and the target directory must not exist.
    2. Only ``target`` is given: the builder must return
       ``target / "source_data"`` and that directory must be non-empty.

    Raises:
        ValueError: If any of the required environment variables is unset or
            empty.
    """
    scratch_root = Path("./scratch/s3source")
    # Location for GT+Predictions on the dataset.
    target_path = scratch_root / "evaluation_data"
    # Location the dataset is downloaded to.
    dataset_local_path = scratch_root / "data_from_cos"

    def _wipe_scratch_dirs():
        # Remove leftovers so each scenario starts from a clean slate.
        for stale_dir in (target_path, dataset_local_path):
            if os.path.exists(stale_dir):
                shutil.rmtree(stale_dir)

    _wipe_scratch_dirs()

    # COS (S3) connection settings; every one of them is mandatory.
    endpoint = os.environ.get("S3_ENDPOINT")
    if not endpoint:
        raise ValueError("Please set the S3_ENDPOINT environment variable")

    access_key = os.environ.get("S3_ACCESS_KEY")
    if not access_key:
        raise ValueError("Please set the S3_ACCESS_KEY environment variable")

    secret_key = os.environ.get("S3_SECRET_KEY")
    if not secret_key:
        raise ValueError("Please set the S3_SECRET_KEY environment variable")

    bucket = os.environ.get("S3_BUCKET")
    if not bucket:
        raise ValueError("Please set the S3_BUCKET environment variable")

    key_prefix = os.environ.get("S3_KEY_PREFIX")
    if not key_prefix:
        raise ValueError("Please set the S3_KEY_PREFIX environment variable")

    s3_source = S3Source(
        endpoint=endpoint,
        access_key=access_key,
        secret_key=secret_key,
        bucket=bucket,
        key_prefix=key_prefix,
        overwrite_downloads=True,
    )

    # Scenario 1: separate target and dataset_local_path.
    builder_with_local_path = BaseEvaluationDatasetBuilder(
        name="s3_dataset",
        dataset_source=s3_source,
        target=target_path,
        dataset_local_path=dataset_local_path,
        end_index=-1,
    )
    output_dir = builder_with_local_path.retrieve_input_dataset()
    assert output_dir is not None

    downloaded_entries = os.listdir(dataset_local_path)
    assert (
        len(downloaded_entries) > 0
    ), f"The directory {dataset_local_path} is empty."

    target_was_created = os.path.exists(target_path)
    assert (
        not target_was_created
    ), f"Target directory {target_path} should NOT exist."

    # Scenario 2: only target is specified.
    _wipe_scratch_dirs()
    builder_target_only = BaseEvaluationDatasetBuilder(
        name="s3_dataset",
        dataset_source=s3_source,
        target=target_path,
        end_index=-1,
    )
    output_dir = builder_target_only.retrieve_input_dataset()
    assert output_dir == target_path / "source_data"
    assert len(os.listdir(output_dir)) > 0, f"The directory {output_dir} is empty."