mirror of
https://github.com/docling-project/docling-eval.git
synced 2026-05-17 13:10:47 +00:00
17e9fde84f
* Add README for Docling-DPBench Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * feat: Update OCREvaluator with additional metrics * fix: bug fix * add edit-distance lib * update pure ocr metrics * Establish SegmentedPage support in DatasetRecord and DatasetRecordWithPrediction Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Add SegmentedPage usage to PixParse dataset provider Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * add pure ocr metrics * refactor: update dependencies * fix dependencies and build errors * feat: add optype and scipy-stubs packages * fix: fix type error * fix package name * fix bugs and add funsd ocr test * fix type error * finalize changes * fix build errors * fix: ignore edit_distance missing import * Add functionality to merge cells in Google OCR prediction (#103) * feat: add global_merge function in google prediction provider for word cell merging * address review comment * remove unused imports * address review comments and remove dictionary conversions --------- Co-authored-by: samiullahchattha <Sami.Ullah1@ibm.com> * refactor and address review comments * fix regression bug * refactor code and reduce metrics to three * make ocr classes private * fix type error * refactor: update geometry utils to use BoundingBox and TextCell Signed-off-by: samiullahchattha <Sami.Ullah1@ibm.com> * refactor: rename metrics variables for consistency and clarity Signed-off-by: samiullahchattha <Sami.Ullah1@ibm.com> * Update lock for docling-core Signed-off-by: Christoph Auer <cau@zurich.ibm.com> --------- Signed-off-by: Christoph Auer <cau@zurich.ibm.com> Signed-off-by: samiuc <sami.ullah.chat@gmail.com> Signed-off-by: samiullahchattha <Sami.Ullah1@ibm.com> Co-authored-by: Christoph Auer <cau@zurich.ibm.com> Co-authored-by: samiullahchattha <Sami.Ullah1@ibm.com>
99 lines
3.0 KiB
Python
99 lines
3.0 KiB
Python
import logging
|
|
import os
|
|
import shutil
|
|
from pathlib import Path
|
|
|
|
import pytest
|
|
|
|
from docling_eval.dataset_builders.dataset_builder import (
|
|
BaseEvaluationDatasetBuilder,
|
|
S3Source,
|
|
)
|
|
|
|
# True when the "CI" environment variable is set to a non-empty value.
IS_CI = bool(os.environ.get("CI"))

# Logger named after this module.
_log = logging.getLogger(__name__)
|
|
|
|
|
|
@pytest.mark.skipif(
    IS_CI, reason="Skipping test in CI if the dataset in cos is very large."
)
def test_s3source():
    """Check input-dataset retrieval through an ``S3Source`` in two setups.

    The connection settings come from the ``S3_ENDPOINT``, ``S3_ACCESS_KEY``,
    ``S3_SECRET_KEY``, ``S3_BUCKET`` and ``S3_KEY_PREFIX`` environment
    variables; the bucket must already contain some data.

    Scenarios:
        1. ``target`` and ``dataset_local_path`` are both passed to the
           builder: ``dataset_local_path`` must end up non-empty and the
           ``target`` directory must not exist afterwards.
        2. Only ``target`` is passed: the returned directory must be
           ``target / "source_data"`` and must be non-empty.

    Raises:
        ValueError: If any of the five environment variables is unset or
            empty.
    """
    # An unset variable becomes "", which fails the emptiness check below
    # exactly like a missing value would.
    s3_endpoint = os.environ.get("S3_ENDPOINT", "")
    s3_access_key = os.environ.get("S3_ACCESS_KEY", "")
    s3_secret_key = os.environ.get("S3_SECRET_KEY", "")
    s3_bucket = os.environ.get("S3_BUCKET", "")
    s3_key_prefix = os.environ.get("S3_KEY_PREFIX", "")

    root_dir = Path("./scratch/s3source")
    # Directory handed to the builder as ``target``.
    target_path = root_dir / "evaluation_data"
    # Directory handed to the builder as ``dataset_local_path``.
    dataset_local_path = root_dir / "data_from_cos"

    def _wipe_scratch_dirs():
        # Remove leftovers so that every scenario starts from a clean state.
        for stale_dir in (target_path, dataset_local_path):
            if os.path.exists(stale_dir):
                shutil.rmtree(stale_dir)

    _wipe_scratch_dirs()

    # Fail on the first missing setting, checked in a fixed order.
    required_settings = (
        (s3_endpoint, "Please set the S3_ENDPOINT environment variable"),
        (s3_access_key, "Please set the S3_ACCESS_KEY environment variable"),
        (s3_secret_key, "Please set the S3_SECRET_KEY environment variable"),
        (s3_bucket, "Please set the S3_BUCKET environment variable"),
        (s3_key_prefix, "Please set the S3_KEY_PREFIX environment variable"),
    )
    for setting_value, complaint in required_settings:
        if not setting_value:
            raise ValueError(complaint)

    dataset_source = S3Source(
        endpoint=s3_endpoint,
        access_key=s3_access_key,
        secret_key=s3_secret_key,
        bucket=s3_bucket,
        key_prefix=s3_key_prefix,
        overwrite_downloads=True,
    )

    # Scenario 1: separate target and dataset_local_path.
    split_paths_builder = BaseEvaluationDatasetBuilder(
        name="s3_dataset",
        dataset_source=dataset_source,
        target=target_path,
        dataset_local_path=dataset_local_path,
        end_index=-1,
    )
    retrieved_dir = split_paths_builder.retrieve_input_dataset()
    assert retrieved_dir is not None

    assert os.listdir(
        dataset_local_path
    ), f"The directory {dataset_local_path} is empty."

    assert not os.path.exists(
        target_path
    ), f"Target directory {target_path} should NOT exist."

    # Scenario 2: only target is specified; start again from clean directories.
    _wipe_scratch_dirs()

    target_only_builder = BaseEvaluationDatasetBuilder(
        name="s3_dataset",
        dataset_source=dataset_source,
        target=target_path,
        end_index=-1,
    )
    output_dir = target_only_builder.retrieve_input_dataset()
    assert output_dir == target_path / "source_data"

    assert os.listdir(output_dir), f"The directory {output_dir} is empty."
|