mirror of
https://github.com/docling-project/docling-core.git
synced 2026-05-17 13:10:44 +00:00
feat: add 'meta' field to DoclingDocument at root level
Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
This commit is contained in:
@@ -2609,6 +2609,10 @@ class DoclingDocument(BaseModel):
|
||||
# This is optional, e.g. a DoclingDocument could also be entirely
|
||||
# generated from synthetic data.
|
||||
)
|
||||
meta: Annotated[
|
||||
BaseMeta | None,
|
||||
Field(description="Metadata for the document, such as summaries or other custom metadata fields."),
|
||||
] = None
|
||||
|
||||
furniture: Annotated[GroupItem, Field(deprecated=True)] = GroupItem(
|
||||
name="_root_",
|
||||
|
||||
@@ -3752,6 +3752,18 @@
|
||||
],
|
||||
"default": null
|
||||
},
|
||||
"meta": {
|
||||
"anyOf": [
|
||||
{
|
||||
"$ref": "#/$defs/BaseMeta"
|
||||
},
|
||||
{
|
||||
"type": "null"
|
||||
}
|
||||
],
|
||||
"default": null,
|
||||
"description": "Metadata for the document, such as summaries or other custom metadata fields."
|
||||
},
|
||||
"furniture": {
|
||||
"$ref": "#/$defs/GroupItem",
|
||||
"default": {
|
||||
|
||||
+6
@@ -18,6 +18,12 @@ furniture:
|
||||
self_ref: '#/furniture'
|
||||
groups: []
|
||||
key_value_items: []
|
||||
meta:
|
||||
summary:
|
||||
confidence: 0.98
|
||||
text: This is a document-level summary describing the entire document.
|
||||
my_corp__doc_category: technical_report
|
||||
my_corp__doc_version: '1.0'
|
||||
name: dummy_doc
|
||||
origin:
|
||||
binary_hash: 7954723514066505909
|
||||
|
||||
@@ -18,6 +18,12 @@ furniture:
|
||||
self_ref: '#/furniture'
|
||||
groups: []
|
||||
key_value_items: []
|
||||
meta:
|
||||
my_corp__doc_category: technical_report
|
||||
my_corp__doc_version: '1.0'
|
||||
summary:
|
||||
confidence: 0.98
|
||||
text: This is a document-level summary describing the entire document.
|
||||
name: dummy_doc
|
||||
origin:
|
||||
binary_hash: 7954723514066505909
|
||||
|
||||
+27
-4
@@ -142,7 +142,7 @@ def test_md_ser_default():
|
||||
with open(exp_file, "w", encoding="utf-8") as f:
|
||||
f.write(actual)
|
||||
else:
|
||||
with open(exp_file, "r", encoding="utf-8") as f:
|
||||
with open(exp_file, encoding="utf-8") as f:
|
||||
expected = f.read()
|
||||
assert actual == expected
|
||||
|
||||
@@ -183,7 +183,7 @@ def test_md_ser_allowed_meta_names():
|
||||
with open(exp_file, "w", encoding="utf-8") as f:
|
||||
f.write(actual)
|
||||
else:
|
||||
with open(exp_file, "r", encoding="utf-8") as f:
|
||||
with open(exp_file, encoding="utf-8") as f:
|
||||
expected = f.read()
|
||||
assert actual == expected
|
||||
|
||||
@@ -224,7 +224,7 @@ def test_md_ser_without_non_meta():
|
||||
with open(exp_file, "w", encoding="utf-8") as f:
|
||||
f.write(actual)
|
||||
else:
|
||||
with open(exp_file, "r", encoding="utf-8") as f:
|
||||
with open(exp_file, encoding="utf-8") as f:
|
||||
expected = f.read()
|
||||
assert actual == expected
|
||||
|
||||
@@ -276,6 +276,29 @@ def test_ser_custom_meta_serializer():
|
||||
with open(exp_file, "w", encoding="utf-8") as f:
|
||||
f.write(actual)
|
||||
else:
|
||||
with open(exp_file, "r", encoding="utf-8") as f:
|
||||
with open(exp_file, encoding="utf-8") as f:
|
||||
expected = f.read()
|
||||
assert actual == expected
|
||||
|
||||
|
||||
def test_document_level_metadata() -> None:
    """Check that root-level ``meta`` is populated when loading the YAML fixture.

    Loads ``dummy_doc_with_meta.yaml`` and asserts, in order: the
    document-level summary (text and confidence), the custom document-level
    fields, and the summary attached to the text item at index 1.
    """
    yaml_path = Path("test/data/doc/dummy_doc_with_meta.yaml")
    doc = DoclingDocument.load_from_yaml(filename=yaml_path)

    # Document-level metadata: the root `meta` field and its summary must be set.
    doc_meta = doc.meta
    assert doc_meta is not None
    doc_summary = doc_meta.summary
    assert doc_summary is not None
    assert doc_summary.text == "This is a document-level summary describing the entire document."
    assert doc_summary.confidence == 0.98

    # Custom (prefixed) fields stored next to the summary at document level.
    custom_fields = doc_meta.get_custom_part()
    assert custom_fields["my_corp__doc_category"] == "technical_report"
    assert custom_fields["my_corp__doc_version"] == "1.0"

    # Item-level metadata must keep working alongside the document-level one.
    title_item = doc.texts[1]  # the title item, per the fixture
    title_meta = title_item.meta
    assert title_meta is not None
    title_summary = title_meta.summary
    assert title_summary is not None
    assert title_summary.text == "This is a title."
|
||||
|
||||
Reference in New Issue
Block a user