mirror of
https://github.com/docling-project/docling-core.git
synced 2026-05-17 13:10:44 +00:00
c73904e68e
* Added ruff to dev dependencies Signed-off-by: Florian Schwarb <florian.schwarb@gmail.com> * Added ruff settings to pyproject.toml as in docling Signed-off-by: Florian Schwarb <florian.schwarb@gmail.com> * Cleanup uf pyproject.toml Signed-off-by: Florian Schwarb <florian.schwarb@gmail.com> * Copied settings for ruff pre-commit hooks from docling Signed-off-by: Florian Schwarb <florian.schwarb@gmail.com> * Excluded test/data/** from ruff formatting / linting Signed-off-by: Florian Schwarb <florian.schwarb@gmail.com> * ruff format Signed-off-by: Florian Schwarb <florian.schwarb@gmail.com> * Added some ignore statements to pyproject.toml such that ruff check raises fewer issues Signed-off-by: Florian Schwarb <florian.schwarb@gmail.com> * ruff check --fix Signed-off-by: Florian Schwarb <florian.schwarb@gmail.com> * Ignored some more rules Signed-off-by: Florian Schwarb <florian.schwarb@gmail.com> * Fixed the rest of the errors that would only concern 1 - 3 files Signed-off-by: Florian Schwarb <florian.schwarb@gmail.com> * Added another ignore related to df for DataFrame names Signed-off-by: Florian Schwarb <florian.schwarb@gmail.com> * Modified CONTRIBUTING.md such that black / isort are replaced by ruff Signed-off-by: Florian Schwarb <florian.schwarb@gmail.com> * Added UP045 to ignore list such that Optional[...] does not raise Signed-off-by: Florian Schwarb <florian.schwarb@gmail.com> * Moved .flake8 configs to pyproject.toml Signed-off-by: Florian Schwarb <florian.schwarb@gmail.com> * Moved autoflake to be used with ruff Signed-off-by: Florian Schwarb <florian.schwarb@gmail.com> * Moved all .flake8 settings to pyproject.toml to be compatible with ruff (i.e. 
no separate [tool.flake8] section Signed-off-by: Florian Schwarb <florian.schwarb@gmail.com> * Removed flake8 from .pre-commit hooks Signed-off-by: Florian Schwarb <florian.schwarb@gmail.com> * Applied ruff format (again); formatted some files as the line-length = 120 equals now what was set for the .flake8 settings Signed-off-by: Florian Schwarb <florian.schwarb@gmail.com> * Set max-complexity to 30 (as was originally) in the pyproject.toml as one linting check would fail Signed-off-by: Florian Schwarb <florian.schwarb@gmail.com> * Adding PD901 to ignore list such that pre-commit hooks run fully again Signed-off-by: Florian Schwarb <florian.schwarb@gmail.com> * Replaced dtype | None syntax by Optional[dtype] in remaining places Signed-off-by: Florian Schwarb <florian.schwarb@gmail.com> * chore: fix 'test' ref in pyproject Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * style: remove typing List, Set, Tuple, Dict Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * style: remove UP015 check from ignore list Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * style: remove UP034 check from ignore list Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * style: normalize dashes in comments and docstrings Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * style: remove PD901 check from ignore list Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * style: remove C403 check from ignore list Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * style: remove C403, C413, C416 check from ignore list Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * style: remove E203, F811 check from ignore list Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> --------- Signed-off-by: Florian Schwarb <florian.schwarb@gmail.com> Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> Co-authored-by: Florian Schwarb <florian.schwarb@gmail.com> Co-authored-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
150 lines
5.8 KiB
Python
150 lines
5.8 KiB
Python
"""Tests for AzureDocSerializer."""
|
|
|
|
import json
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
from docling_core.transforms.serializer.azure import AzureDocSerializer, AzureParams
|
|
from docling_core.types.doc.base import BoundingBox, CoordOrigin, Size
|
|
from docling_core.types.doc.document import (
|
|
DocItemLabel,
|
|
DoclingDocument,
|
|
ProvenanceItem,
|
|
)
|
|
|
|
from .test_data_gen_flag import GEN_TEST_DATA
|
|
|
|
|
|
def _assert_json_like_equal(a: Any, b: Any, eps: float = 1e-3, path: str = "$") -> None:
|
|
"""Recursively assert two JSON-like structures are equal with float tolerance.
|
|
|
|
Rules:
|
|
- Dicts: same keys; compare values recursively
|
|
- Lists: same length; compare elements in order
|
|
- Numbers: ints compare by ==; floats (or int-vs-float) compare within eps
|
|
- Strings/bools/null: compare by ==
|
|
|
|
The `path` argument is used to provide helpful assertion locations.
|
|
"""
|
|
# Handle dicts
|
|
if isinstance(a, dict) and isinstance(b, dict):
|
|
a_keys, b_keys = set(a.keys()), set(b.keys())
|
|
assert a_keys == b_keys, f"Key mismatch at {path}: {a_keys ^ b_keys}"
|
|
for k in sorted(a_keys):
|
|
_assert_json_like_equal(a[k], b[k], eps=eps, path=f"{path}.{k}")
|
|
return
|
|
|
|
# Handle lists
|
|
if isinstance(a, list) and isinstance(b, list):
|
|
assert len(a) == len(b), f"List length mismatch at {path}: {len(a)} != {len(b)}"
|
|
for i, (ai, bi) in enumerate(zip(a, b)):
|
|
_assert_json_like_equal(ai, bi, eps=eps, path=f"{path}[{i}]")
|
|
return
|
|
|
|
# Handle numbers (int/float)
|
|
num_types = (int, float)
|
|
if isinstance(a, num_types) and isinstance(b, num_types):
|
|
# If either is float, compare with tolerance; if both int, exact match
|
|
if isinstance(a, float) or isinstance(b, float):
|
|
diff = abs(float(a) - float(b))
|
|
assert diff <= eps, f"Float mismatch at {path}: {a} != {b} (diff={diff}, eps={eps})"
|
|
else:
|
|
assert a == b, f"Int mismatch at {path}: {a} != {b}"
|
|
return
|
|
|
|
# Handle strings, bools, and None
|
|
if isinstance(a, (str, bool)) or a is None:
|
|
assert a == b, f"Value mismatch at {path}: {a!r} != {b!r}"
|
|
return
|
|
|
|
# Fallback: types must match and then equality
|
|
assert type(a) is type(b), f"Type mismatch at {path}: {type(a)} != {type(b)}"
|
|
assert a == b, f"Mismatch at {path}: {a!r} != {b!r}"
|
|
|
|
|
|
def _verify_json(exp_file: Path, actual_json: str) -> None:
    """Check an Azure JSON string against a ground-truth file, or regenerate it.

    When `GEN_TEST_DATA` is truthy the ground-truth file is overwritten with
    `actual_json` plus a trailing newline and no comparison takes place.
    Otherwise both texts are parsed as JSON and compared structurally via
    `_assert_json_like_equal` with a float tolerance of 1e-3.
    """
    if GEN_TEST_DATA:
        # Generation mode: persist the serialized string exactly as produced.
        exp_file.write_text(actual_json + "\n", encoding="utf-8")
        return

    # Verification mode: compare parsed structures rather than raw text.
    reference = json.loads(exp_file.read_text(encoding="utf-8").rstrip())
    produced = json.loads(actual_json)
    _assert_json_like_equal(reference, produced, eps=1e-3)
|
|
|
|
|
|
def test_azure_serialize_activities_doc():
    """Load activities.json, serialize it to Azure JSON and compare with ground truth."""
    source_path = Path("./test/data/doc/activities.json")
    document = DoclingDocument.load_from_json(source_path)

    serializer = AzureDocSerializer(doc=document, params=AzureParams(indent=2))
    actual_json = serializer.serialize().text

    # Structural sanity checks: the top level is a dict holding these list sections.
    data = json.loads(actual_json)
    assert isinstance(data, dict)
    for section in ("pages", "tables", "figures", "paragraphs"):
        assert section in data and isinstance(data[section], list)

    # The ground-truth file sits next to the source with a `.gt.azure.json` suffix.
    ground_truth = source_path.with_suffix(".gt.azure.json")
    _verify_json(exp_file=ground_truth, actual_json=actual_json)
|
|
|
|
|
|
def test_azure_serialize_construct_doc_minimal_prov(sample_doc: DoclingDocument):
    """Serialize a constructed document to Azure JSON after adding minimal provenance.

    The sample_doc fixture comes without provenance or pages, so this test adds
    one page when none exists and gives a few items a simple bounding box. That
    lets the Azure JSON output include paragraphs/tables/pictures with
    boundingRegions.
    """
    # Guarantee that the document has at least one page.
    if not sample_doc.pages:
        sample_doc.add_page(page_no=1, size=Size(width=600.0, height=800.0), image=None)

    def _attach_bbox(target, left=10.0, top=10.0, right=200.0, bottom=40.0):
        """Give `target` one TOPLEFT bbox provenance entry unless it already has some."""
        if target.prov:
            return
        # Anchor the provenance on the lowest page number in the document.
        lowest_page = min(sample_doc.pages.keys())
        box = BoundingBox(l=left, t=top, r=right, b=bottom, coord_origin=CoordOrigin.TOPLEFT)
        target.prov = [ProvenanceItem(page_no=lowest_page, bbox=box, charspan=(0, 0))]

    # Among the first three text items, only these labels receive provenance.
    eligible_labels = {
        DocItemLabel.TITLE,
        DocItemLabel.TEXT,
        DocItemLabel.SECTION_HEADER,
    }
    for text_item in sample_doc.texts[:3]:
        if text_item.label in eligible_labels:
            _attach_bbox(text_item)

    # First table, if the document has one.
    if sample_doc.tables:
        _attach_bbox(sample_doc.tables[0], left=20.0, top=80.0, right=300.0, bottom=200.0)

    # First picture, if the document has one.
    if sample_doc.pictures:
        _attach_bbox(sample_doc.pictures[0], left=320.0, top=80.0, right=500.0, bottom=220.0)

    serializer = AzureDocSerializer(doc=sample_doc, params=AzureParams(indent=2))
    actual_json = serializer.serialize().text

    # Basic structure check before comparing against the ground truth.
    data = json.loads(actual_json)
    assert isinstance(data, dict)
    assert "pages" in data
    assert isinstance(data["pages"], list)
    assert len(data["pages"]) >= 1
    assert "paragraphs" in data
    assert isinstance(data["paragraphs"], list)

    _verify_json(
        exp_file=Path("./test/data/doc/constructed.gt.azure.json"),
        actual_json=actual_json,
    )
|