mirror of
https://github.com/docling-project/docling-parse.git
synced 2026-05-17 13:10:49 +00:00
42274119b0
Signed-off-by: Peter Staar <taa@zurich.ibm.com>
27 lines
753 B
Python
27 lines
753 B
Python
from __future__ import annotations
|
|
|
|
from pathlib import Path
|
|
|
|
from huggingface_hub import snapshot_download
|
|
|
|
# Hugging Face Hub dataset repository that holds the regression test data.
HF_DATASET_REPO_ID = "docling-project/regression-dataset-for-docling-parse"

# Revision handed to ``snapshot_download`` (looks like a full commit hash, so
# the fetched data should not drift when the dataset repo gets new commits).
HF_DATASET_REVISION = "5d7c3d7b575397ca5b2a943171b0da4fe08c5a5b"

# Absolute, symlink-resolved directory that contains this file.
TESTS_DIR = Path(__file__).resolve().parent

# Local directory the dataset snapshot is materialised into.
TEST_DATA_DIR = TESTS_DIR.joinpath("data")
|
|
|
|
|
|
def ensure_test_data_downloaded(force: bool = False) -> Path:
    """Make sure the regression dataset is available in ``TEST_DATA_DIR``.

    The directory is created if it does not exist yet. The dataset snapshot
    (``HF_DATASET_REPO_ID`` at ``HF_DATASET_REVISION``) is downloaded into it
    when the directory has no entries, or unconditionally when ``force`` is
    true.

    Args:
        force: Skip the "already populated" shortcut and forward
            ``force_download=True`` to ``snapshot_download``.

    Returns:
        ``TEST_DATA_DIR``, whether or not a download took place.
    """
    data_dir = TEST_DATA_DIR
    data_dir.mkdir(parents=True, exist_ok=True)

    # NOTE(review): "populated" means "contains at least one entry" and nothing
    # more -- this does not verify that the content is complete or that it
    # matches HF_DATASET_REVISION. Confirm that is the intended cache policy.
    # The directory is only inspected when ``force`` is false.
    if force or next(data_dir.iterdir(), None) is None:
        snapshot_download(
            repo_id=HF_DATASET_REPO_ID,
            repo_type="dataset",
            revision=HF_DATASET_REVISION,
            local_dir=str(data_dir),
            force_download=force,
        )

    return data_dir
|