Files
docling-core/test/test_base.py
Florian Schwarb c73904e68e style: replace black, isort, flake8 and autoflake with ruff (#456)
* Added ruff to dev dependencies

Signed-off-by: Florian Schwarb <florian.schwarb@gmail.com>

* Added ruff settings to pyproject.toml as in docling

Signed-off-by: Florian Schwarb <florian.schwarb@gmail.com>

* Cleanup of pyproject.toml

Signed-off-by: Florian Schwarb <florian.schwarb@gmail.com>

* Copied settings for ruff pre-commit hooks from docling

Signed-off-by: Florian Schwarb <florian.schwarb@gmail.com>

* Excluded test/data/** from ruff formatting / linting

Signed-off-by: Florian Schwarb <florian.schwarb@gmail.com>

* ruff format

Signed-off-by: Florian Schwarb <florian.schwarb@gmail.com>

* Added some ignore statements to pyproject.toml such that ruff check raises fewer issues

Signed-off-by: Florian Schwarb <florian.schwarb@gmail.com>

* ruff check --fix

Signed-off-by: Florian Schwarb <florian.schwarb@gmail.com>

* Ignored some more rules

Signed-off-by: Florian Schwarb <florian.schwarb@gmail.com>

* Fixed the rest of the errors that would only concern 1 - 3 files

Signed-off-by: Florian Schwarb <florian.schwarb@gmail.com>

* Added another ignore related to df for DataFrame names

Signed-off-by: Florian Schwarb <florian.schwarb@gmail.com>

* Modified CONTRIBUTING.md such that black / isort are replaced by ruff

Signed-off-by: Florian Schwarb <florian.schwarb@gmail.com>

* Added UP045 to ignore list such that Optional[...] does not raise

Signed-off-by: Florian Schwarb <florian.schwarb@gmail.com>

* Moved .flake8 configs to pyproject.toml

Signed-off-by: Florian Schwarb <florian.schwarb@gmail.com>

* Moved autoflake to be used with ruff

Signed-off-by: Florian Schwarb <florian.schwarb@gmail.com>

* Moved all .flake8 settings to pyproject.toml to be compatible with ruff (i.e. no separate [tool.flake8] section)

Signed-off-by: Florian Schwarb <florian.schwarb@gmail.com>

* Removed flake8 from .pre-commit hooks

Signed-off-by: Florian Schwarb <florian.schwarb@gmail.com>

* Applied ruff format (again); formatted some files as the line-length = 120 equals now what was set for the .flake8 settings

Signed-off-by: Florian Schwarb <florian.schwarb@gmail.com>

* Set max-complexity to 30 (as was originally) in the pyproject.toml as one linting check would fail

Signed-off-by: Florian Schwarb <florian.schwarb@gmail.com>

* Adding PD901 to ignore list such that pre-commit hooks run fully again

Signed-off-by: Florian Schwarb <florian.schwarb@gmail.com>

* Replaced dtype | None syntax by Optional[dtype] in remaining places

Signed-off-by: Florian Schwarb <florian.schwarb@gmail.com>

* chore: fix 'test' ref in pyproject

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

* style: remove typing List, Set, Tuple, Dict

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

* style: remove UP015 check from ignore list

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

* style: remove UP034 check from ignore list

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

* style: normalize dashes in comments and docstrings

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

* style: remove PD901 check from ignore list

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

* style: remove C403 check from ignore list

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

* style: remove C403, C413, C416 check from ignore list

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

* style: remove E203, F811 check from ignore list

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

---------

Signed-off-by: Florian Schwarb <florian.schwarb@gmail.com>
Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
Co-authored-by: Florian Schwarb <florian.schwarb@gmail.com>
Co-authored-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
2026-01-13 17:03:10 +01:00

264 lines
9.1 KiB
Python

"""Test the pydantic models in module data_types.base.py."""
import json
from datetime import datetime, timezone
from typing import Literal
import pytest
from pydantic import BaseModel, ValidationError
from docling_core.types.base import (
CollectionDocumentInfo,
CollectionInfo,
CollectionRecordInfo,
FileInfoObject,
Identifier,
Log,
StrictDateTime,
)
from docling_core.types.legacy_doc.document import CCSDocumentDescription
from docling_core.types.rec.record import RecordDescription
def test_identifier():
    """Validate data with Identifier model.

    Covers the alias-aware dictionary and JSON exports, the JSON schema stored in
    ``test/data/json_schemas/base_identifier.json``, the parametrized
    ``Identifier[...]`` form, and the validation errors expected for invalid input.
    """
    gold_dict = {"type": "id", "value": "abc", "_name": "id#abc"}
    data = Identifier(type="id", value="abc", _name="id#abc")

    # dict(): important to set by_alias=True, if the model has aliases
    assert data.model_dump(by_alias=True) == gold_dict
    assert data.model_dump_json(by_alias=True, indent=2) == json.dumps(gold_dict, indent=2)

    # schema_json(): no need to set by_alias since it is True by the default
    with open("test/data/json_schemas/base_identifier.json", encoding="utf-8") as tf:
        gold_json = json.load(tf)
    assert Identifier.model_json_schema() == gold_json

    # Parametrized model: the type field is restricted to the given literals.
    gold_dict = {"type": "id", "value": "ABC", "_name": "id#abc"}
    valid_keys = Literal["id", "doi", "uid"]
    data = Identifier[valid_keys](type="id", value="ABC", _name="id#abc")
    assert data.model_dump(by_alias=True) == gold_dict
    assert data.model_dump(by_alias=True, exclude_unset=True) == gold_dict

    with pytest.raises(ValidationError, match="type"):
        Identifier[valid_keys](type="arxivid", value="ABC", _name="arxivid#abc")

    with pytest.raises(ValidationError, match="concatenation"):
        Identifier[str](type="id", value="ABC", _name="id#ABC")

    # One raising call per context: inside a single pytest.raises block nothing
    # after the first raising statement is executed, so a second call placed in
    # the same block would never be checked.
    with pytest.raises(ValidationError, match="required"):
        Identifier[str](type="id", value="abc")
    with pytest.raises(ValidationError, match="required"):
        Identifier[str](value="abc")

    with pytest.raises(ValidationError, match="_name"):
        Identifier[str](type="id", value="abc", _name=None)

    with pytest.raises(ValidationError, match="type"):
        Identifier[str](type=None, value="abc", _name="abc")

    with pytest.raises(ValidationError, match="comment"):
        Identifier[str](type="id", value="abc", _name="id#abc", comment="OK")
def test_log():
    """Check construction, export options and JSON schema of the Log model."""
    stamp = "2021-11-03T04:42:54.844631+00:00"

    # The date field is given both as a datetime object and as a timestamp string.
    Log(agent="CXS", type="annotation", date=datetime.now())
    Log(task="run 3", agent="CXS", type="annotation", comment="UCMI 3.10", date=stamp)

    expected = {"agent": "CXS", "type": "parsing", "date": stamp}
    entry = Log(task=None, agent="CXS", type="parsing", date=stamp)

    # A plain export keeps the None values; exclude_none=True gives the clean form.
    assert data_differs(entry.model_dump(), expected) if False else entry.model_dump() != expected
    assert entry.model_dump(exclude_none=True, by_alias=True) == expected

    # Optional parameters left unset are exported too unless exclude_unset=True is used.
    assert Log(**expected).model_dump() != expected

    clean_exports = (
        {"exclude_unset": True, "by_alias": True},
        # children of AliasModel export alias field names without by_alias
        {"exclude_unset": True},
        # ***Best practice***: exclude_unset=True, exclude_none=True, by_alias=True
        {"exclude_unset": True, "exclude_none": True, "by_alias": True},
    )
    for options in clean_exports:
        assert Log(**expected).model_dump(**options) == expected

    with open("test/data/json_schemas/base_log.json", encoding="utf-8") as schema_file:
        expected_schema = json.load(schema_file)
    assert Log.model_json_schema() == expected_schema

    # A numeric date is rejected.
    with pytest.raises(ValidationError, match="Value type must be a datetime or a non-numeric string"):
        Log(agent="CXS", type="annotation", date=123456789)
def test_file_info_object():
    """Validate data with FileInfoObject model.

    Checks that dictionary and JSON exports use the alias field names, and that an
    instance can be created with the Python field names.
    """
    gold_dict = {
        "filename": "document.pdf",
        "filename-prov": "http:www.ibm.com",
        "document-hash": "PnNF3Fhr22nJH4a",
    }
    data = FileInfoObject(**gold_dict)

    # dictionaries and JSON exports need to explicitly use aliases, but children from AliasModel don't.
    assert data.model_dump(by_alias=True) == gold_dict
    assert data.model_dump() == gold_dict

    gold_dict.pop("filename-prov")
    exported_json = FileInfoObject(**gold_dict).model_dump_json(exclude_unset=True, exclude_none=True)
    # Compare the parsed JSON rather than the raw text: json.dumps() puts spaces
    # after the separators by default, so a string comparison would depend on
    # formatting and not on content.
    assert json.loads(exported_json) == gold_dict

    # creating an instance with input variables requires the use of field names. Since
    # document-hash is an invalid function parameter name, 'populate_by_name' needs to
    # be set to True in model definition. For convenience, inherit from AliasModel.
    FileInfoObject(filename="document.pdf", document_hash="PnNF3Fhr22nJH4a")
def test_collection_info():
    """Exercise CollectionInfo with valid, invalid and None-valued input."""
    base_fields = {"name": "patent USPTO", "type": "Document", "version": "3.2.0"}
    clean_export = {"by_alias": True, "exclude_unset": True, "exclude_none": True}

    # Test 1: a complete, valid collection is exported unchanged
    valid = {**base_fields, "alias": ["patent"]}
    info = CollectionInfo(**valid)
    assert info.model_dump(exclude_unset=True, exclude_none=True) == valid

    # Test 2: an unsupported collection type is rejected
    invalid = {**base_fields, "type": "experiment", "alias": ["simulation"]}
    with pytest.raises(ValidationError, match="type"):
        CollectionInfo(**invalid)

    # Test 3: an explicit None alias is dropped from the clean export
    with_none = {**base_fields, "alias": None}
    without_alias = dict(base_fields)

    exported = CollectionInfo(**with_none).model_dump(**clean_export)
    assert exported != with_none
    assert exported == without_alias

    exported = CollectionInfo(**without_alias).model_dump(**clean_export)
    assert exported == without_alias
def test_collection_document_info():
    """Exercise CollectionDocumentInfo on its own and inside a document description."""
    collection = {
        "name": "patent USPTO",
        "type": "Document",
        "version": "3.2.0",
        "alias": ["patent"],
    }
    exported = CollectionDocumentInfo(**collection).model_dump(
        by_alias=True, exclude_unset=True, exclude_none=True
    )
    assert exported == collection

    # within dictionary
    parsing_log = {
        "date": "2021-11-03T04:42:54.844631+00:00",
        "agent": "CXS",
        "type": "parsing",
    }
    description = {"logs": [parsing_log], "collection": dict(collection)}
    CCSDocumentDescription(**description)

    # the same description with a record-typed collection must be refused
    wrong_type = {**description, "collection": {**collection, "type": "Record"}}
    with pytest.raises(ValidationError, match="collection\\.type"):
        CCSDocumentDescription(**wrong_type)
def test_collection_record_info():
    """Exercise CollectionRecordInfo on its own and inside a record description."""
    collection = {
        "name": "PubChem",
        "type": "Record",
        "version": "3.2.0",
        "alias": ["chemical", "Material Sciences"],
    }
    exported = CollectionRecordInfo(**collection).model_dump(
        by_alias=True, exclude_unset=True, exclude_none=True
    )
    assert exported == collection

    # within dictionary
    parsing_log = {
        "date": "2021-11-03T04:42:54.844631+00:00",
        "agent": "CXS",
        "type": "parsing",
    }
    description = {"logs": [parsing_log], "collection": dict(collection)}
    RecordDescription(**description)

    # a document-typed collection and a lower-case "record" must both be refused
    for bad_type in ("Document", "record"):
        description["collection"]["type"] = bad_type
        with pytest.raises(ValidationError, match="collection\\.type"):
            RecordDescription(**description)
def test_strict_date_time() -> None:
    """Check which inputs a StrictDateTime field accepts and how they are rendered."""

    class Model(BaseModel):
        published: StrictDateTime

    # allowed formats: a datetime object ...
    Model(published=datetime.now(tz=timezone.utc))

    # ... and strings, paired with the ISO rendering expected after parsing
    accepted = (
        ("2022-12-01T03:49:20.724435+00:00", "2022-12-01T03:49:20.724435+00:00"),
        ("2022-12-01T03:49:20.724435+03:00", "2022-12-01T03:49:20.724435+03:00"),
        ("2022-12-01T03:49:20.724435Z", "2022-12-01T03:49:20.724435+00:00"),
        ("2022-12-01T03:49:20", "2022-12-01T03:49:20"),
        ("2022-12-01", "2022-12-01T00:00:00"),
    )
    for raw, rendered in accepted:
        parsed = Model(published=raw)
        assert parsed.published.isoformat() == rendered

    # invalid formats: a time without a date, and a numeric timestamp
    for rejected in ("03:49:20", 1679616000.0):
        with pytest.raises(ValidationError, match="published"):
            Model(published=rejected)