mirror of
https://github.com/docling-project/docling-parse.git
synced 2026-05-17 13:10:49 +00:00
27 lines
753 B
Python
27 lines
753 B
Python
from __future__ import annotations
|
|
|
|
from pathlib import Path
|
|
|
|
from huggingface_hub import snapshot_download
|
|
|
|
# Hugging Face Hub dataset repository that hosts the regression test data.
HF_DATASET_REPO_ID = "docling-project/regression-dataset-for-docling-parse"
# Exact dataset commit to fetch, pinned so every run sees the same files.
HF_DATASET_REVISION = "96c606ee4d7a8db8170fef213ed2113cdfeeebce"

# Absolute, symlink-resolved directory that contains this module.
TESTS_DIR = Path(__file__).resolve().parents[0]
# Local directory the dataset snapshot is downloaded into.
TEST_DATA_DIR = TESTS_DIR.joinpath("data")
|
|
|
|
|
|
def ensure_test_data_downloaded(force: bool = False) -> Path:
    """Make sure the regression dataset exists locally and return its path.

    The directory ``TEST_DATA_DIR`` is created if needed. When it already
    holds data and ``force`` is false, nothing is downloaded. Otherwise the
    pinned revision ``HF_DATASET_REVISION`` of ``HF_DATASET_REPO_ID`` is
    fetched into ``TEST_DATA_DIR`` with ``snapshot_download``.

    Args:
        force: When true, skip the "already present" shortcut and pass
            ``force_download=True`` to ``snapshot_download``.

    Returns:
        ``TEST_DATA_DIR``, the directory the dataset lives in.

    Raises:
        Any exception raised by ``Path.mkdir`` or ``snapshot_download`` is
        propagated unchanged; nothing is caught here.
    """
    TEST_DATA_DIR.mkdir(parents=True, exist_ok=True)

    if not force:
        # Only visible entries count as data. Hidden bookkeeping entries must
        # not be mistaken for a populated dataset, otherwise the download
        # would be skipped forever. Examples: the ``.cache`` metadata folder
        # that huggingface_hub is documented to keep inside ``local_dir``
        # (it can be left behind by an interrupted download), or a tracked
        # ``.gitignore``/``.gitkeep``.
        has_data = any(
            not entry.name.startswith(".") for entry in TEST_DATA_DIR.iterdir()
        )
        if has_data:
            return TEST_DATA_DIR

    snapshot_download(
        repo_id=HF_DATASET_REPO_ID,
        repo_type="dataset",
        revision=HF_DATASET_REVISION,
        local_dir=str(TEST_DATA_DIR),
        force_download=force,
    )
    return TEST_DATA_DIR
|