mirror of
https://github.com/docling-project/docling-core.git
synced 2026-05-17 13:10:44 +00:00
00c3bb223d
Signed-off-by: Panos Vagenas <pva@zurich.ibm.com>
1543 lines
51 KiB
Python
1543 lines
51 KiB
Python
"""Unit tests for Doclang create_closing_token helper."""
|
|
|
|
from itertools import chain
|
|
from pathlib import Path
|
|
from typing import Optional
|
|
|
|
import pytest
|
|
|
|
from docling_core.experimental.doclang import (
|
|
ContentType,
|
|
EscapeMode,
|
|
DoclangDocSerializer,
|
|
DoclangParams,
|
|
DoclangVocabulary,
|
|
LayerMode,
|
|
WrapMode,
|
|
)
|
|
from docling_core.types.doc import (
|
|
BoundingBox,
|
|
CodeLanguageLabel,
|
|
CoordOrigin,
|
|
DescriptionMetaField,
|
|
DocItemLabel,
|
|
DoclingDocument,
|
|
Formatting,
|
|
PictureClassificationLabel,
|
|
PictureClassificationMetaField,
|
|
PictureClassificationPrediction,
|
|
PictureMeta,
|
|
ProvenanceItem,
|
|
Script,
|
|
Size,
|
|
SummaryMetaField,
|
|
TableData,
|
|
TabularChartMetaField,
|
|
)
|
|
from docling_core.types.doc.base import ImageRefMode
|
|
from docling_core.types.doc.document import ContentLayer, GraphCell, GraphData, GraphLink, ImageRef, RichTableCell, TableCell
|
|
from docling_core.types.doc.labels import GraphCellLabel, GraphLinkLabel
|
|
from test.test_serialization import verify
|
|
from test.test_data_gen_flag import GEN_TEST_DATA
|
|
|
|
|
|
def add_texts_section(doc: DoclingDocument):
|
|
doc.add_text(label=DocItemLabel.TEXT, text="Simple text")
|
|
inline1 = doc.add_inline_group()
|
|
doc.add_text(
|
|
label=DocItemLabel.TEXT,
|
|
text="Here a code snippet: ",
|
|
parent=inline1,
|
|
)
|
|
doc.add_code(
|
|
text="help()",
|
|
parent=inline1,
|
|
code_language=CodeLanguageLabel.PYTHON,
|
|
)
|
|
doc.add_text(
|
|
label=DocItemLabel.TEXT,
|
|
text=" (to be shown)",
|
|
parent=inline1,
|
|
)
|
|
|
|
def add_list_section(doc: DoclingDocument):
|
|
doc.add_page(page_no=1, size=Size(width=100, height=100), image=None)
|
|
prov = ProvenanceItem(
|
|
page_no=1,
|
|
bbox=BoundingBox.from_tuple((1, 2, 3, 4), origin=CoordOrigin.BOTTOMLEFT),
|
|
charspan=(0, 2),
|
|
)
|
|
lg = doc.add_list_group()
|
|
|
|
doc.add_list_item(text="foo", parent=lg)
|
|
doc.add_list_item(text="bar", parent=lg)
|
|
|
|
# just inline group with a formula
|
|
li = doc.add_list_item(text="", parent=lg)
|
|
inline = doc.add_inline_group(parent=li)
|
|
doc.add_text(
|
|
label=DocItemLabel.TEXT,
|
|
text="Here a formula: ",
|
|
parent=inline,
|
|
)
|
|
doc.add_formula(text="E=mc^2 ", parent=inline)
|
|
doc.add_text(
|
|
label=DocItemLabel.TEXT,
|
|
text="in line",
|
|
parent=inline,
|
|
)
|
|
|
|
# just inline group with formatted span
|
|
li = doc.add_list_item(text="", parent=lg)
|
|
inline = doc.add_inline_group(parent=li)
|
|
doc.add_text(
|
|
label=DocItemLabel.TEXT,
|
|
text="Here a ",
|
|
parent=inline,
|
|
)
|
|
doc.add_text(
|
|
label=DocItemLabel.TEXT,
|
|
text="bold",
|
|
parent=inline,
|
|
formatting=Formatting(bold=True),
|
|
)
|
|
doc.add_text(
|
|
label=DocItemLabel.TEXT,
|
|
text=" text",
|
|
parent=inline,
|
|
)
|
|
|
|
li = doc.add_list_item(text="will contain sublist", parent=lg)
|
|
lg_sub = doc.add_list_group(parent=li)
|
|
doc.add_list_item(text="sublist item 1", parent=lg_sub)
|
|
doc.add_list_item(text="sublist item 2", parent=lg_sub)
|
|
|
|
li = doc.add_list_item(text="", parent=lg, prov=prov)
|
|
inline = doc.add_inline_group(parent=li)
|
|
doc.add_text(
|
|
label=DocItemLabel.TEXT,
|
|
text="Here a ",
|
|
parent=inline,
|
|
)
|
|
doc.add_text(
|
|
label=DocItemLabel.TEXT,
|
|
text="both bold and italicized",
|
|
parent=inline,
|
|
formatting=Formatting(bold=True, italic=True),
|
|
)
|
|
doc.add_text(
|
|
label=DocItemLabel.TEXT,
|
|
text=" text and a sublist:",
|
|
parent=inline,
|
|
)
|
|
lg_sub = doc.add_list_group(parent=li)
|
|
doc.add_list_item(text="sublist item a", parent=lg_sub)
|
|
doc.add_list_item(text="sublist item b", parent=lg_sub)
|
|
|
|
doc.add_list_item(text="final element", parent=lg)
|
|
|
|
# ===============================
|
|
# Doclang unit-tests
|
|
# ===============================
|
|
|
|
|
|
def test_create_closing_token_from_opening_tag_simple():
|
|
assert DoclangVocabulary.create_closing_token(token="<text>") == "</text>"
|
|
assert (
|
|
DoclangVocabulary.create_closing_token(token='\n <heading level="2"> ')
|
|
== "</heading>"
|
|
)
|
|
assert (
|
|
DoclangVocabulary.create_closing_token(token=' <list ordered="true"> ')
|
|
== "</list>"
|
|
)
|
|
|
|
|
|
def test_create_closing_token_returns_existing_closing():
|
|
assert DoclangVocabulary.create_closing_token(token="</text>") == "</text>"
|
|
|
|
|
|
@pytest.mark.parametrize(
|
|
"bad",
|
|
[
|
|
"<br/>",
|
|
'<location value="3"/>',
|
|
'<hour value="1"/>',
|
|
'<thread id="abc"/>',
|
|
],
|
|
)
|
|
def test_create_closing_token_rejects_self_closing(bad):
|
|
with pytest.raises(ValueError):
|
|
DoclangVocabulary.create_closing_token(token=bad)
|
|
|
|
|
|
@pytest.mark.parametrize(
|
|
"bad",
|
|
[
|
|
"text", # not a tag
|
|
"<text", # incomplete
|
|
"<text/>", # self-closing form of non-self-closing token
|
|
"</ unknown >", # malformed closing
|
|
"<unknown>", # unknown token
|
|
],
|
|
)
|
|
def test_create_closing_token_invalid_inputs(bad):
|
|
with pytest.raises(ValueError):
|
|
DoclangVocabulary.create_closing_token(token=bad)
|
|
|
|
|
|
# ===============================
|
|
# Doclang tests
|
|
# ===============================
|
|
|
|
|
|
def serialize_doclang(doc: DoclingDocument, params: Optional[DoclangParams] = None) -> str:
|
|
ser = DoclangDocSerializer(doc=doc, params=params or DoclangParams())
|
|
return ser.serialize().text
|
|
|
|
|
|
def test_list_items_not_double_wrapped_when_no_content():
|
|
doc = DoclingDocument(name="t")
|
|
lst = doc.add_list_group()
|
|
doc.add_list_item("Item A", parent=lst)
|
|
doc.add_list_item("Item B", parent=lst)
|
|
|
|
txt = serialize_doclang(doc, params=DoclangParams(content_types=set()))
|
|
exp_txt = """
|
|
<doclang version="1.0.0">
|
|
<list ordered="false">
|
|
<list_text></list_text>
|
|
<list_text></list_text>
|
|
</list>
|
|
</doclang>
|
|
"""
|
|
assert txt.strip() == exp_txt.strip()
|
|
|
|
|
|
def test_doclang():
|
|
src = Path("./test/data/doc/ddoc_0.json")
|
|
doc = DoclingDocument.load_from_json(src)
|
|
|
|
# Human readable, indented and with content
|
|
params = DoclangParams()
|
|
|
|
ser = DoclangDocSerializer(doc=doc, params=params)
|
|
actual = ser.serialize().text
|
|
|
|
verify(exp_file=src.with_suffix(".v0.gt.dclg.xml"), actual=actual)
|
|
|
|
# Human readable, indented but without content
|
|
ser = DoclangDocSerializer(
|
|
doc=doc,
|
|
params=DoclangParams(
|
|
content_types={ContentType.TABLE},
|
|
),
|
|
)
|
|
actual = ser.serialize().text
|
|
|
|
verify(exp_file=src.with_suffix(".v1.gt.dclg.xml"), actual=actual)
|
|
|
|
# Machine readable, not indented and without content
|
|
ser = DoclangDocSerializer(
|
|
doc=doc,
|
|
params=DoclangParams(
|
|
pretty_indentation=None,
|
|
content_types={ContentType.TABLE},
|
|
),
|
|
)
|
|
actual = ser.serialize().text
|
|
|
|
verify(exp_file=src.with_suffix(".v2.gt.dclg.xml"), actual=actual)
|
|
|
|
|
|
def test_doclang_meta():
|
|
src = Path("./test/data/doc/dummy_doc_with_meta.yaml")
|
|
doc = DoclingDocument.load_from_yaml(src)
|
|
|
|
ser = DoclangDocSerializer(
|
|
doc=doc,
|
|
params=DoclangParams(image_mode=ImageRefMode.EMBEDDED),
|
|
)
|
|
actual = ser.serialize().text
|
|
verify(exp_file=src.with_suffix(".gt.dclg.xml"), actual=actual)
|
|
|
|
|
|
def test_doclang_crop_embedded():
|
|
src = Path("./test/data/doc/activities_simplified.yaml")
|
|
doc = DoclingDocument.load_from_yaml(src)
|
|
|
|
serializer = DoclangDocSerializer(
|
|
doc=doc,
|
|
params=DoclangParams(image_mode=ImageRefMode.EMBEDDED),
|
|
)
|
|
actual = serializer.serialize().text
|
|
|
|
# verifying everything except base64 data as the latter seems to be flaky across runs/platforms
|
|
exp_prefix = """
|
|
<doclang version="1.0.0">
|
|
<floating_group class="picture">
|
|
<picture>
|
|
<meta>
|
|
<classification>Other</classification>
|
|
</meta>
|
|
<location value="43"/>
|
|
<location value="117"/>
|
|
<location value="172"/>
|
|
<location value="208"/>
|
|
<uri>data:image/png;base64,
|
|
""".strip()
|
|
assert actual.startswith(exp_prefix)
|
|
|
|
exp_suffix = """
|
|
</uri>
|
|
</picture>
|
|
</floating_group>
|
|
</doclang>
|
|
""".strip()
|
|
assert actual.endswith(exp_suffix)
|
|
|
|
def test_doclang_crop_placeholder():
|
|
src = Path("./test/data/doc/activities_simplified.yaml")
|
|
doc = DoclingDocument.load_from_yaml(src)
|
|
|
|
serializer = DoclangDocSerializer(
|
|
doc=doc,
|
|
params=DoclangParams(image_mode=ImageRefMode.PLACEHOLDER),
|
|
)
|
|
actual = serializer.serialize().text
|
|
exp_file = src.parent / f"{src.stem}_cropped_placeholder.dclg.xml"
|
|
verify(exp_file=exp_file, actual=actual)
|
|
|
|
def _create_escape_test_doc(inp_doc: DoclingDocument):
|
|
doc = inp_doc.model_copy(deep=True)
|
|
doc.add_text(label=DocItemLabel.TEXT, text="Simple text")
|
|
doc.add_text(label=DocItemLabel.TEXT, text=" 4 leading spaces, 1 trailing ")
|
|
doc.add_text(label=DocItemLabel.TEXT, text="Some 'single' quotes")
|
|
doc.add_text(label=DocItemLabel.TEXT, text='Some "double" quotes')
|
|
text_item = doc.add_text(label=DocItemLabel.TEXT, text="An ampersand: &")
|
|
text_item.meta = PictureMeta(
|
|
summary=SummaryMetaField(text="Summary with <tags> & ampersands"),
|
|
description=DescriptionMetaField(text="Description content"),
|
|
)
|
|
doc.add_code(text="0 == 0")
|
|
doc.add_code(text=" 1 leading space, 4 trailing ")
|
|
doc.add_code(text="0 < 1")
|
|
doc.add_code(text="42 == 42", code_language=CodeLanguageLabel.PYTHON)
|
|
doc.add_code(text="42 < 1337", code_language=CodeLanguageLabel.PYTHON)
|
|
|
|
td = TableData(num_cols=2)
|
|
td.add_row(["Foo", "Bar"])
|
|
td.add_row(["Header & Title", "Value > 100"])
|
|
td.add_row(["<script>", "A & B"])
|
|
td.add_row(["Only", "<second>"])
|
|
doc.add_table(data=td)
|
|
|
|
# test combination of formatting and special characters
|
|
doc.add_text(label=DocItemLabel.TEXT, text="0 < 1")
|
|
doc.add_text(
|
|
label=DocItemLabel.TEXT,
|
|
text="0 < 42",
|
|
formatting=Formatting(bold=True, italic=True),
|
|
)
|
|
|
|
return doc
|
|
|
|
|
|
def test_cdata_always(sample_doc: DoclingDocument):
|
|
"""Test cdata_always mode."""
|
|
doc = _create_escape_test_doc(sample_doc)
|
|
serializer = DoclangDocSerializer(
|
|
doc=doc,
|
|
params=DoclangParams(
|
|
escape_mode=EscapeMode.CDATA_ALWAYS,
|
|
image_mode=ImageRefMode.EMBEDDED,
|
|
),
|
|
)
|
|
ser_res = serializer.serialize()
|
|
ser_txt = ser_res.text
|
|
|
|
exp_file = Path("./test/data/doc/cdata_always.gt.dclg.xml")
|
|
verify(exp_file=exp_file, actual=ser_txt)
|
|
|
|
|
|
def test_cdata_when_needed(sample_doc: DoclingDocument):
|
|
"""Test cdata_when_needed mode."""
|
|
doc = _create_escape_test_doc(sample_doc)
|
|
serializer = DoclangDocSerializer(
|
|
doc=doc,
|
|
params=DoclangParams(
|
|
escape_mode=EscapeMode.CDATA_WHEN_NEEDED,
|
|
image_mode=ImageRefMode.EMBEDDED,
|
|
),
|
|
)
|
|
ser_res = serializer.serialize()
|
|
ser_txt = ser_res.text
|
|
exp_file = Path("./test/data/doc/cdata_when_needed.gt.dclg.xml")
|
|
verify(exp_file=exp_file, actual=ser_txt)
|
|
|
|
|
|
def test_strikethrough_formatting():
|
|
"""Test strikethrough formatting serialization."""
|
|
doc = DoclingDocument(name="test")
|
|
formatting = Formatting(strikethrough=True)
|
|
doc.add_text(label=DocItemLabel.TEXT, text="Strike text", formatting=formatting)
|
|
|
|
result = serialize_doclang(
|
|
doc, params=DoclangParams(add_location=False)
|
|
)
|
|
assert "<strikethrough>Strike text</strikethrough>" in result
|
|
|
|
|
|
def test_subscript_formatting():
|
|
"""Test subscript formatting serialization."""
|
|
doc = DoclingDocument(name="test")
|
|
formatting = Formatting(script=Script.SUB)
|
|
doc.add_text(label=DocItemLabel.TEXT, text="H2O", formatting=formatting)
|
|
|
|
result = serialize_doclang(
|
|
doc, params=DoclangParams(add_location=False)
|
|
)
|
|
assert "<subscript>H2O</subscript>" in result
|
|
|
|
|
|
def test_superscript_formatting():
|
|
"""Test superscript formatting serialization."""
|
|
doc = DoclingDocument(name="test")
|
|
formatting = Formatting(script=Script.SUPER)
|
|
doc.add_text(label=DocItemLabel.TEXT, text="x^2", formatting=formatting)
|
|
|
|
result = serialize_doclang(
|
|
doc, params=DoclangParams(add_location=False)
|
|
)
|
|
assert "<superscript>x^2</superscript>" in result
|
|
|
|
|
|
def test_combined_formatting():
|
|
"""Test combined formatting (bold + italic)."""
|
|
doc = DoclingDocument(name="test")
|
|
formatting = Formatting(bold=True, italic=True)
|
|
doc.add_text(label=DocItemLabel.TEXT, text="Bold and italic", formatting=formatting)
|
|
|
|
result = serialize_doclang(
|
|
doc, params=DoclangParams(add_location=False)
|
|
)
|
|
# When both bold and italic are applied, they should be nested
|
|
assert "<bold>" in result
|
|
assert "<italic>" in result
|
|
assert "Bold and italic" in result
|
|
|
|
|
|
|
|
|
|
def _create_content_filtering_doc(inp_doc: DoclingDocument):
|
|
doc = inp_doc.model_copy(deep=True)
|
|
doc.add_page(page_no=1, size=Size(width=100, height=100), image=None)
|
|
prov = ProvenanceItem(
|
|
page_no=1,
|
|
bbox=BoundingBox.from_tuple((1, 2, 3, 4), origin=CoordOrigin.BOTTOMLEFT),
|
|
charspan=(0, 2),
|
|
)
|
|
pic = doc.add_picture(
|
|
caption=doc.add_text(label=DocItemLabel.CAPTION, text="Picture Caption")
|
|
)
|
|
pic.prov = [prov]
|
|
pic.meta = PictureMeta(
|
|
summary=SummaryMetaField(text="Picture Summary"),
|
|
description=DescriptionMetaField(text="Picture Description"),
|
|
)
|
|
|
|
chart = doc.add_picture(
|
|
caption=doc.add_text(label=DocItemLabel.CAPTION, text="Picture Caption")
|
|
)
|
|
chart.prov = [prov]
|
|
chart.meta = PictureMeta(
|
|
summary=SummaryMetaField(text="Picture Summary"),
|
|
description=DescriptionMetaField(text="Picture Description"),
|
|
classification=PictureClassificationMetaField(
|
|
predictions=[
|
|
PictureClassificationPrediction(
|
|
class_name=PictureClassificationLabel.PIE_CHART.value,
|
|
confidence=1.0,
|
|
)
|
|
]
|
|
),
|
|
)
|
|
chart_data = TableData(num_cols=2)
|
|
chart_data.add_row(["Foo", "Bar"])
|
|
chart_data.add_row(["One", "Two"])
|
|
chart.meta.tabular_chart = TabularChartMetaField(
|
|
title="Chart Title",
|
|
chart_data=chart_data,
|
|
)
|
|
doc.add_code(text="0 == 0")
|
|
doc.add_code(text="with location", prov=prov)
|
|
|
|
return doc
|
|
|
|
|
|
def test_handwritten_text_label(doc_with_handwritten: DoclingDocument):
|
|
result = doc_with_handwritten.export_to_doclang()
|
|
exp_file = Path("./test/data/doc/handwritten_text.gt.dclg.xml")
|
|
verify(exp_file=exp_file, actual=result)
|
|
|
|
|
|
def test_content_allow_all_types(sample_doc: DoclingDocument):
|
|
doc = _create_content_filtering_doc(sample_doc)
|
|
serializer = DoclangDocSerializer(
|
|
doc=doc,
|
|
params=DoclangParams(
|
|
content_types=set(ContentType),
|
|
image_mode=ImageRefMode.EMBEDDED,
|
|
),
|
|
)
|
|
ser_txt = serializer.serialize().text
|
|
|
|
exp_file = Path("./test/data/doc/content_all.gt.dclg.xml")
|
|
verify(exp_file=exp_file, actual=ser_txt)
|
|
|
|
|
|
def test_content_allow_no_types(sample_doc: DoclingDocument):
|
|
doc = _create_content_filtering_doc(sample_doc)
|
|
serializer = DoclangDocSerializer(
|
|
doc=doc,
|
|
params=DoclangParams(
|
|
content_types=set(),
|
|
image_mode=ImageRefMode.EMBEDDED,
|
|
),
|
|
)
|
|
ser_txt = serializer.serialize().text
|
|
exp_file = Path("./test/data/doc/content_none.gt.dclg.xml")
|
|
verify(exp_file=exp_file, actual=ser_txt)
|
|
|
|
|
|
def test_content_allow_specific_types(sample_doc: DoclingDocument):
|
|
doc = _create_content_filtering_doc(sample_doc)
|
|
serializer = DoclangDocSerializer(
|
|
doc=doc,
|
|
params=DoclangParams(
|
|
content_types={
|
|
ContentType.PICTURE,
|
|
ContentType.TABLE,
|
|
ContentType.TABLE_CELL,
|
|
ContentType.REF_CAPTION,
|
|
ContentType.TEXT_CODE,
|
|
},
|
|
image_mode=ImageRefMode.EMBEDDED,
|
|
),
|
|
)
|
|
ser_txt = serializer.serialize().text
|
|
exp_file = Path("./test/data/doc/content_specific.gt.dclg.xml")
|
|
verify(exp_file=exp_file, actual=ser_txt)
|
|
|
|
|
|
def test_content_block_specific_types(sample_doc: DoclingDocument):
|
|
doc = _create_content_filtering_doc(sample_doc)
|
|
blocked_types = {
|
|
ContentType.TABLE,
|
|
ContentType.TEXT_CODE,
|
|
}
|
|
serializer = DoclangDocSerializer(
|
|
doc=doc,
|
|
params=DoclangParams(
|
|
content_types={ct for ct in ContentType if ct not in blocked_types},
|
|
image_mode=ImageRefMode.EMBEDDED,
|
|
),
|
|
)
|
|
ser_txt = serializer.serialize().text
|
|
exp_file = Path("./test/data/doc/content_block_specific.gt.dclg.xml")
|
|
verify(exp_file=exp_file, actual=ser_txt)
|
|
|
|
|
|
def test_inline_group():
|
|
doc = DoclingDocument(name="test")
|
|
doc.add_page(page_no=1, size=Size(width=100, height=100), image=None)
|
|
prov = ProvenanceItem(
|
|
page_no=1,
|
|
bbox=BoundingBox.from_tuple((1, 2, 3, 4), origin=CoordOrigin.BOTTOMLEFT),
|
|
charspan=(0, 2),
|
|
)
|
|
|
|
parent_txt = doc.add_text(label=DocItemLabel.TEXT, text="", prov=prov)
|
|
simple_inline_gr = doc.add_inline_group(parent=parent_txt)
|
|
doc.add_text(label=DocItemLabel.TEXT, text="One", parent=simple_inline_gr)
|
|
doc.add_text(
|
|
label=DocItemLabel.TEXT,
|
|
text="Two",
|
|
parent=simple_inline_gr,
|
|
formatting=Formatting(bold=True),
|
|
)
|
|
doc.add_text(label=DocItemLabel.TEXT, text="Three", parent=simple_inline_gr)
|
|
|
|
li_inline_gr = doc.add_list_group()
|
|
doc.add_list_item(text="Item 1", parent=li_inline_gr)
|
|
li2 = doc.add_list_item(text="", parent=li_inline_gr)
|
|
li2_inline_gr = doc.add_inline_group(parent=li2)
|
|
doc.add_text(label=DocItemLabel.TEXT, text="Four", parent=li2_inline_gr)
|
|
doc.add_text(
|
|
label=DocItemLabel.TEXT,
|
|
text="Five",
|
|
parent=li2_inline_gr,
|
|
formatting=Formatting(bold=True),
|
|
)
|
|
doc.add_text(label=DocItemLabel.TEXT, text="Six", parent=li2_inline_gr)
|
|
|
|
ser = DoclangDocSerializer(
|
|
doc=doc,
|
|
params=DoclangParams(),
|
|
)
|
|
ser_res = ser.serialize()
|
|
ser_txt = ser_res.text
|
|
exp_file = Path("./test/data/doc/inline_group.gt.dclg.xml")
|
|
verify(exp_file=exp_file, actual=ser_txt)
|
|
|
|
|
|
def test_mini_inline():
|
|
doc = DoclingDocument(name="test")
|
|
ul = doc.add_list_group()
|
|
li = doc.add_list_item(text="", parent=ul)
|
|
inl = doc.add_inline_group(parent=li)
|
|
doc.add_text(label=DocItemLabel.TEXT, text="foo", parent=inl)
|
|
doc.add_text(
|
|
label=DocItemLabel.TEXT,
|
|
text="bar",
|
|
parent=inl,
|
|
formatting=Formatting(bold=True),
|
|
)
|
|
ser = DoclangDocSerializer(
|
|
doc=doc,
|
|
params=DoclangParams(),
|
|
)
|
|
ser_res = ser.serialize()
|
|
ser_txt = ser_res.text
|
|
exp_file = Path("./test/data/doc/mini_inline.gt.dclg.xml")
|
|
verify(exp_file=exp_file, actual=ser_txt)
|
|
|
|
def _create_wrapping_test_doc():
|
|
doc = DoclingDocument(name="test")
|
|
doc.add_page(page_no=1, size=Size(width=100, height=100), image=None)
|
|
prov = ProvenanceItem(
|
|
page_no=1,
|
|
bbox=BoundingBox.from_tuple((1, 2, 3, 4), origin=CoordOrigin.BOTTOMLEFT),
|
|
charspan=(0, 2),
|
|
)
|
|
doc.add_text(label=DocItemLabel.TEXT, text="simple")
|
|
doc.add_text(label=DocItemLabel.TEXT, text=" leading")
|
|
doc.add_text(label=DocItemLabel.TEXT, text="trailing ")
|
|
doc.add_text(label=DocItemLabel.TEXT, text="< special")
|
|
doc.add_text(label=DocItemLabel.TEXT, text=" leading and < special")
|
|
|
|
doc.add_text(label=DocItemLabel.TEXT, text="w/prov simple", prov=prov)
|
|
doc.add_text(label=DocItemLabel.TEXT, text=" w/prov leading", prov=prov)
|
|
doc.add_text(label=DocItemLabel.TEXT, text="w/prov trailing ", prov=prov)
|
|
doc.add_text(label=DocItemLabel.TEXT, text="w/prov < special", prov=prov)
|
|
doc.add_text(label=DocItemLabel.TEXT, text=" w/prov leading and < special", prov=prov)
|
|
|
|
return doc
|
|
|
|
def test_content_wrapping_mode_when_needed():
|
|
doc = _create_wrapping_test_doc()
|
|
ser = DoclangDocSerializer(
|
|
doc=doc,
|
|
params=DoclangParams(
|
|
content_wrapping_mode=WrapMode.WRAP_WHEN_NEEDED,
|
|
),
|
|
)
|
|
ser_res = ser.serialize()
|
|
ser_txt = ser_res.text
|
|
exp_file = Path("./test/data/doc/wrapping_when_needed.gt.dclg.xml")
|
|
verify(exp_file=exp_file, actual=ser_txt)
|
|
|
|
def test_content_wrapping_mode_always():
|
|
doc = _create_wrapping_test_doc()
|
|
ser = DoclangDocSerializer(
|
|
doc=doc,
|
|
params=DoclangParams(
|
|
content_wrapping_mode=WrapMode.WRAP_ALWAYS,
|
|
),
|
|
)
|
|
ser_res = ser.serialize()
|
|
ser_txt = ser_res.text
|
|
exp_file = Path("./test/data/doc/wrapping_always.gt.dclg.xml")
|
|
verify(exp_file=exp_file, actual=ser_txt)
|
|
|
|
def test_default_mode():
|
|
doc = DoclingDocument(name="test")
|
|
add_texts_section(doc)
|
|
add_list_section(doc)
|
|
|
|
ser = DoclangDocSerializer(doc=doc)
|
|
ser_res = ser.serialize()
|
|
ser_txt = ser_res.text
|
|
exp_file = Path("./test/data/doc/default_mode.gt.dclg.xml")
|
|
verify(exp_file=exp_file, actual=ser_txt)
|
|
|
|
def test_vlm_mode():
|
|
doc = DoclingDocument(name="test")
|
|
add_texts_section(doc)
|
|
add_list_section(doc)
|
|
|
|
ser = DoclangDocSerializer(
|
|
doc=doc,
|
|
params=DoclangParams(
|
|
pretty_indentation=None,
|
|
escape_mode=EscapeMode.CDATA_ALWAYS,
|
|
content_wrapping_mode=WrapMode.WRAP_ALWAYS,
|
|
),
|
|
)
|
|
ser_res = ser.serialize()
|
|
ser_txt = ser_res.text
|
|
exp_file = Path("./test/data/doc/vlm_mode.gt.dclg.xml")
|
|
verify(exp_file=exp_file, actual=ser_txt)
|
|
|
|
def test_rich_cells(rich_table_doc):
|
|
ser = DoclangDocSerializer(
|
|
doc=rich_table_doc,
|
|
params=DoclangParams(),
|
|
)
|
|
ser_res = ser.serialize()
|
|
ser_txt = ser_res.text
|
|
exp_file = Path("./test/data/doc/rich_table.out.dclg.xml")
|
|
verify(exp_file=exp_file, actual=ser_txt)
|
|
|
|
|
|
def _create_simple_prov_doc():
|
|
doc = DoclingDocument(name="")
|
|
doc.add_page(page_no=1, size=Size(width=100, height=100), image=None)
|
|
prov = ProvenanceItem(
|
|
page_no=1,
|
|
bbox=BoundingBox.from_tuple((1, 2, 3, 4), origin=CoordOrigin.BOTTOMLEFT),
|
|
charspan=(0, 2),
|
|
)
|
|
doc.add_text(label=DocItemLabel.TEXT, text="Hello", prov=prov)
|
|
doc.add_text(label=DocItemLabel.TEXT, text="World", prov=prov)
|
|
return doc
|
|
|
|
def test_checkboxes():
|
|
doc = DoclingDocument(name="")
|
|
doc.add_text(label=DocItemLabel.CHECKBOX_UNSELECTED, text="TODO")
|
|
doc.add_text(label=DocItemLabel.CHECKBOX_SELECTED, text="DONE")
|
|
ser = DoclangDocSerializer(doc=doc)
|
|
ser_res = ser.serialize()
|
|
ser_txt = ser_res.text
|
|
exp_file = Path("./test/data/doc/checkboxes.out.dclg.xml")
|
|
verify(exp_file=exp_file, actual=ser_txt)
|
|
|
|
def test_def_prov_512():
|
|
doc = _create_simple_prov_doc()
|
|
ser = DoclangDocSerializer(
|
|
doc=doc,
|
|
params=DoclangParams(
|
|
xsize=512,
|
|
ysize=512,
|
|
),
|
|
)
|
|
ser_res = ser.serialize()
|
|
ser_txt = ser_res.text
|
|
exp_file = Path("./test/data/doc/simple_prov_res_512.out.dclg.xml")
|
|
verify(exp_file=exp_file, actual=ser_txt)
|
|
|
|
|
|
def test_def_prov_256():
|
|
doc = _create_simple_prov_doc()
|
|
ser = DoclangDocSerializer(
|
|
doc=doc,
|
|
params=DoclangParams(
|
|
xsize=256,
|
|
ysize=256,
|
|
),
|
|
)
|
|
ser_res = ser.serialize()
|
|
ser_txt = ser_res.text
|
|
exp_file = Path("./test/data/doc/simple_prov_res_256.out.dclg.xml")
|
|
verify(exp_file=exp_file, actual=ser_txt)
|
|
|
|
def test_chart():
|
|
doc = DoclingDocument.load_from_json("./test/data/doc/barchart.json")
|
|
ser = DoclangDocSerializer(
|
|
doc=doc,
|
|
)
|
|
ser_res = ser.serialize()
|
|
ser_txt = ser_res.text
|
|
exp_file = Path("./test/data/doc/barchart.out.dclg.xml")
|
|
verify(exp_file=exp_file, actual=ser_txt)
|
|
|
|
|
|
def _verify_doc(doc: DoclingDocument, exp_json: Path):
|
|
if GEN_TEST_DATA:
|
|
doc.save_as_json(filename=exp_json)
|
|
else:
|
|
exp_doc = DoclingDocument.load_from_json(filename=exp_json)
|
|
assert doc == exp_doc
|
|
|
|
def test_kv():
|
|
doc = DoclingDocument(name="")
|
|
kvm = doc.add_field_region()
|
|
|
|
doc.add_field_heading(text="KV heading", parent=kvm)
|
|
|
|
kve = doc.add_field_item(parent=kvm)
|
|
doc.add_field_key(text="simple key", parent=kve)
|
|
doc.add_field_value(text="simple value", parent=kve)
|
|
|
|
doc.add_field_heading(level=2, text="KV sub-heading", parent=kvm)
|
|
|
|
# inlined key-value pair (outer is <text>...</text>)
|
|
# TODO: possibly support outer bounding box
|
|
inl = doc.add_inline_group(parent=kvm)
|
|
kve = doc.add_field_item(parent=inl)
|
|
doc.add_field_key(text="my inline key1: ", parent=kve)
|
|
doc.add_field_value(text="my inline value1", parent=kve, kind="fillable")
|
|
|
|
# # inlined key-value pair (outer is <kv_entry>...</kv_entry>)
|
|
# # TODO: possibly support outer bounding box
|
|
# kve = doc.add_field_item(parent=kvm)
|
|
# inl = doc.add_inline_group(parent=kve)
|
|
# doc.add_field_key(text="my inline key2: ", parent=inl)
|
|
# doc.add_field_value(text="my inline value2", parent=inl, kind="fillable")
|
|
|
|
kve = doc.add_field_item(parent=kvm)
|
|
doc.add_field_key(text="name", parent=kve)
|
|
doc.add_field_value(text="John Doe", parent=kve, kind="fillable")
|
|
doc.add_field_value(text="Max Mustermann", parent=kve, kind="fillable")
|
|
|
|
kk = doc.add_field_value(text="", parent=kve, kind="fillable")
|
|
opt_vis = doc.add_inline_group(parent=kk)
|
|
doc.add_text(label=DocItemLabel.CHECKBOX_UNSELECTED, text="", parent=opt_vis)
|
|
doc.add_text(label=DocItemLabel.TEXT, text="Clark ", parent=opt_vis)
|
|
doc.add_text(label=DocItemLabel.TEXT, text="Kent", parent=opt_vis, formatting=Formatting(bold=True))
|
|
doc.add_field_hint(text="Select this if you are a Superman fan", parent=opt_vis)
|
|
|
|
doc.add_field_value(text="", parent=kve)
|
|
|
|
# inlined form inputs
|
|
# TODO: add support for outer bounding box
|
|
inl = doc.add_inline_group(parent=kve)
|
|
|
|
doc.add_text(label=DocItemLabel.TEXT, text="My first input ", parent=inl)
|
|
doc.add_field_value(text="", parent=inl, kind="fillable")
|
|
doc.add_text(label=DocItemLabel.TEXT, text=" and my second input ", parent=inl)
|
|
doc.add_field_value(text="m", parent=inl, kind="fillable")
|
|
|
|
kv_entry_3 = doc.add_field_item(parent=kvm)
|
|
doc.add_field_key(text="I am in the country as a: ", parent=kv_entry_3)
|
|
|
|
vis = doc.add_field_value(text="", parent=kv_entry_3, kind="fillable")
|
|
opt_vis = doc.add_inline_group(parent=vis)
|
|
doc.add_text(label=DocItemLabel.CHECKBOX_UNSELECTED, text="Visitor", parent=opt_vis)
|
|
|
|
std = doc.add_field_value(text="", parent=kv_entry_3, kind="fillable")
|
|
opt_std = doc.add_inline_group(parent=std)
|
|
doc.add_text(label=DocItemLabel.CHECKBOX_UNSELECTED, text=" Student", parent=opt_std)
|
|
|
|
oth = doc.add_field_value(text="", parent=kv_entry_3, kind="fillable")
|
|
opt_oth = doc.add_inline_group(parent=oth)
|
|
doc.add_text(label=DocItemLabel.CHECKBOX_UNSELECTED, text="Other (Specify)", parent=opt_oth)
|
|
|
|
doc.add_field_value(text="", parent=kv_entry_3, kind="fillable")
|
|
|
|
doc.add_text(label=DocItemLabel.TEXT, text="Some final stuff.")
|
|
doc.add_text(label=DocItemLabel.TEXT, text="The end.")
|
|
|
|
exp_json = Path("./test/data/doc/kv.out.json")
|
|
_verify_doc(doc=doc, exp_json=exp_json)
|
|
|
|
ser_txt = doc.export_to_doclang()
|
|
exp_file = Path("./test/data/doc/kv.out.dclg.xml")
|
|
verify(exp_file=exp_file, actual=ser_txt)
|
|
|
|
|
|
|
|
def test_kv_invoice():
|
|
doc = DoclingDocument(name="")
|
|
doc.add_page(page_no=1, size=Size(width=100, height=100), image=None)
|
|
prov = ProvenanceItem(
|
|
page_no=1,
|
|
bbox=BoundingBox.from_tuple((1, 2, 3, 4), origin=CoordOrigin.BOTTOMLEFT),
|
|
charspan=(0, 2),
|
|
)
|
|
# prov = None
|
|
image_uri = "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAIAAAACCAIAAAD91JpzAAAAC0lEQVR4nGNgQAYAAA4AAamRc7EAAAAASUVORK5CYII="
|
|
|
|
# first key-value map
|
|
kvm = doc.add_field_region(prov=prov)
|
|
|
|
# inlined key-value pair
|
|
kve = doc.add_field_item(parent=kvm)
|
|
kvk = doc.add_field_key(text="", parent=kve)
|
|
doc.add_picture(
|
|
parent=kvk,
|
|
image=ImageRef(
|
|
mimetype="image/png",
|
|
uri=image_uri,
|
|
dpi=300,
|
|
size=Size(width=100, height=100),
|
|
),
|
|
)
|
|
doc.add_field_value(text="+123-456-7890", parent=kve)
|
|
|
|
|
|
# another inlined key-value pair
|
|
kve = doc.add_field_item(parent=kvm)
|
|
kvk = doc.add_field_key(text="", parent=kve)
|
|
doc.add_picture(
|
|
parent=kvk,
|
|
image=ImageRef(
|
|
mimetype="image/png",
|
|
uri=image_uri,
|
|
dpi=300,
|
|
size=Size(width=100, height=100),
|
|
),
|
|
)
|
|
doc.add_field_value(text="hello@example.com", parent=kve)
|
|
|
|
# second key-value map
|
|
kvm = doc.add_field_region()
|
|
|
|
# inlined key-value pair
|
|
inl_outer = doc.add_inline_group(parent=kvm)
|
|
kve = doc.add_field_item(parent=inl_outer)
|
|
doc.add_field_key(text="Invoice No: ", parent=kve)
|
|
doc.add_field_value(text="222", parent=kve)
|
|
|
|
# another inlined key-value pair
|
|
inl_outer = doc.add_inline_group(parent=kvm)
|
|
kve = doc.add_field_item(parent=inl_outer)
|
|
doc.add_field_key(text="Date: ", parent=kve)
|
|
doc.add_field_value(text="02 May, 2021", parent=kve)
|
|
|
|
# a last key-value map
|
|
kvm = doc.add_field_region()
|
|
kve = doc.add_field_item(parent=kvm)
|
|
doc.add_field_key(text="Administrator", parent=kve, prov=prov)
|
|
doc.add_field_value(text="John Doe", parent=kve, prov=prov)
|
|
|
|
exp_json = Path("./test/data/doc/kv_invoice.out.json")
|
|
_verify_doc(doc=doc, exp_json=exp_json)
|
|
|
|
ser_txt = doc.export_to_doclang()
|
|
exp_file = Path("./test/data/doc/kv_invoice.out.dclg.xml")
|
|
verify(exp_file=exp_file, actual=ser_txt)
|
|
|
|
|
|
|
|
|
|
def test_kv_advanced_inline():
|
|
doc = DoclingDocument(name="")
|
|
doc.add_page(page_no=1, size=Size(width=100, height=100), image=None)
|
|
prov = ProvenanceItem(
|
|
page_no=1,
|
|
bbox=BoundingBox.from_tuple((1, 2, 3, 4), origin=CoordOrigin.BOTTOMLEFT),
|
|
charspan=(0, 2),
|
|
)
|
|
prov = None
|
|
image_uri = "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAIAAAACCAIAAAD91JpzAAAAC0lEQVR4nGNgQAYAAA4AAamRc7EAAAAASUVORK5CYII="
|
|
|
|
# first key-value map
|
|
kvm = doc.add_field_region()
|
|
|
|
# inlined key-value pair
|
|
inl_outer = doc.add_inline_group(parent=kvm)
|
|
doc.add_text(label=DocItemLabel.TEXT, text="This certificate applies to ", parent=inl_outer)
|
|
|
|
kve = doc.add_field_item(parent=inl_outer)
|
|
doc.add_field_value(text="", parent=kve, kind="fillable")
|
|
doc.add_text(label=DocItemLabel.TEXT, text=" percent of Buyer's purchases from ", parent=inl_outer)
|
|
|
|
kve = doc.add_field_item(parent=inl_outer)
|
|
doc.add_field_value(text="", parent=kve, kind="fillable")
|
|
doc.add_text(label=DocItemLabel.TEXT, text=" (name, address, and employer idenficiation number of seller) as follows (complete as applicable): ", parent=inl_outer)
|
|
|
|
kve = doc.add_field_item(parent=inl_outer)
|
|
doc.add_field_value(text="", parent=kve, kind="fillable")
|
|
doc.add_text(label=DocItemLabel.TEXT, text=".", parent=inl_outer)
|
|
|
|
exp_json = Path("./test/data/doc/kv_advanced_inline.out.json")
|
|
_verify_doc(doc=doc, exp_json=exp_json)
|
|
|
|
ser_txt = doc.export_to_doclang()
|
|
exp_file = Path("./test/data/doc/kv_advanced_inline.out.dclg.xml")
|
|
verify(exp_file=exp_file, actual=ser_txt)
|
|
|
|
def test_kv_nested():
|
|
doc = DoclingDocument(name="")
|
|
doc.add_page(page_no=1, size=Size(width=100, height=100), image=None)
|
|
prov = ProvenanceItem(
|
|
page_no=1,
|
|
bbox=BoundingBox.from_tuple((1, 2, 3, 4), origin=CoordOrigin.BOTTOMLEFT),
|
|
charspan=(0, 2),
|
|
)
|
|
# prov = None
|
|
image_uri = "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAIAAAACCAIAAAD91JpzAAAAC0lEQVR4nGNgQAYAAA4AAamRc7EAAAAASUVORK5CYII="
|
|
|
|
# first key-value map
|
|
kvm = doc.add_field_region(prov=prov)
|
|
|
|
kve = doc.add_field_item(parent=kvm)
|
|
doc.add_field_key(text="A", parent=kve)
|
|
kvv = doc.add_field_value(text="", parent=kve)
|
|
kvm_inner = doc.add_field_region(parent=kvv)
|
|
kve_inner = doc.add_field_item(parent=kvm_inner)
|
|
doc.add_marker(text="1.", parent=kve_inner)
|
|
doc.add_field_key(text="AA", parent=kve_inner)
|
|
doc.add_field_hint(text="Some explanation for key AA", parent=kve_inner)
|
|
doc.add_field_value(text="AAA", parent=kve_inner)
|
|
doc.add_field_hint(text="Some explanation for value AAA", parent=kve_inner)
|
|
doc.add_field_value(text="AAB", parent=kve_inner)
|
|
doc.add_field_hint(text="Some explanation for value AAB", parent=kve_inner)
|
|
kve_inner = doc.add_field_item(parent=kvm_inner)
|
|
doc.add_marker(text="2.", parent=kve_inner)
|
|
doc.add_field_key(text="AB", parent=kve_inner)
|
|
doc.add_field_value(text="ABA", parent=kve_inner)
|
|
doc.add_field_value(text="ABB", parent=kve_inner)
|
|
|
|
exp_json = Path("./test/data/doc/kv_nested.out.json")
|
|
_verify_doc(doc=doc, exp_json=exp_json)
|
|
|
|
ser_txt = doc.export_to_doclang()
|
|
exp_file = Path("./test/data/doc/kv_nested.out.dclg.xml")
|
|
verify(exp_file=exp_file, actual=ser_txt)
|
|
|
|
def test_kv_form_with_table():
|
|
doc = DoclingDocument(name="")
|
|
doc.add_page(page_no=1, size=Size(width=100, height=100), image=None)
|
|
prov = ProvenanceItem(
|
|
page_no=1,
|
|
bbox=BoundingBox.from_tuple((1, 2, 3, 4), origin=CoordOrigin.BOTTOMLEFT),
|
|
charspan=(0, 2),
|
|
)
|
|
prov = None
|
|
image_uri = "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAIAAAACCAIAAAD91JpzAAAAC0lEQVR4nGNgQAYAAA4AAamRc7EAAAAASUVORK5CYII="
|
|
|
|
# first key-value map
|
|
kvm = doc.add_field_region()
|
|
|
|
# table
|
|
|
|
table_vals = [
|
|
["Description of property", "Cost or other basis, plus improvements and expense of sale", "Gain or loss"],
|
|
["" , "gain", "150,997"],
|
|
["", "loss", "114,676"],
|
|
]
|
|
num_rows = len(table_vals)
|
|
num_cols = len(table_vals[0])
|
|
table = doc.add_table(data=TableData(num_rows=num_rows, num_cols=num_cols), parent=kvm)
|
|
|
|
for i in range(num_rows):
|
|
for j in range(num_cols):
|
|
if i == 0: # headers
|
|
cell = TableCell(
|
|
start_row_offset_idx=i,
|
|
end_row_offset_idx=i + 1,
|
|
start_col_offset_idx=j,
|
|
end_col_offset_idx=j + 1,
|
|
text=table_vals[i][j],
|
|
column_header=True,
|
|
)
|
|
else:
|
|
kve = doc.add_field_item(parent=table)
|
|
doc.add_field_value(text=table_vals[i][j], parent=kve, kind="fillable")
|
|
cell = RichTableCell(
|
|
start_row_offset_idx=i,
|
|
end_row_offset_idx=i + 1,
|
|
start_col_offset_idx=j,
|
|
end_col_offset_idx=j + 1,
|
|
text="",
|
|
ref=kve.get_ref(),
|
|
)
|
|
doc.add_table_cell(table_item=table, cell=cell)
|
|
|
|
exp_json = Path("./test/data/doc/kv_form_with_table.out.json")
|
|
_verify_doc(doc=doc, exp_json=exp_json)
|
|
|
|
ser_txt = doc.export_to_doclang()
|
|
|
|
exp_file = Path("./test/data/doc/kv_form_with_table.out.dclg.xml")
|
|
verify(exp_file=exp_file, actual=ser_txt)
|
|
|
|
|
|
def test_kv_migration_self_contained_scenario():
|
|
doc = DoclingDocument(name="")
|
|
doc.add_page(page_no=1, size=Size(width=100, height=100), image=None)
|
|
prov = ProvenanceItem(
|
|
page_no=1,
|
|
bbox=BoundingBox.from_tuple((1, 2, 3, 4), origin=CoordOrigin.BOTTOMLEFT),
|
|
charspan=(0, 2),
|
|
)
|
|
doc.add_text(label=DocItemLabel.TEXT, text="Hello, world!")
|
|
doc.add_key_values(
|
|
graph=GraphData(
|
|
cells=[
|
|
# both TO_VALUE & TO_KEY links:
|
|
GraphCell(
|
|
label=GraphCellLabel.KEY,
|
|
cell_id=0,
|
|
text="Common name",
|
|
orig="Common name",
|
|
),
|
|
GraphCell(
|
|
label=GraphCellLabel.VALUE,
|
|
cell_id=1,
|
|
text="Duck",
|
|
orig="Duck",
|
|
),
|
|
|
|
# TO_PARENT & TO_CHILD links:
|
|
GraphCell(
|
|
label=GraphCellLabel.KEY,
|
|
cell_id=2,
|
|
text="Anatoidea",
|
|
orig="Anatoidea",
|
|
),
|
|
GraphCell(
|
|
label=GraphCellLabel.VALUE,
|
|
cell_id=3,
|
|
text="Anatidae",
|
|
orig="Anatidae",
|
|
),
|
|
|
|
# multiple TO_VALUE links:
|
|
GraphCell(
|
|
label=GraphCellLabel.KEY,
|
|
cell_id=4,
|
|
text="Distribution package",
|
|
orig="Distribution package",
|
|
),
|
|
GraphCell(
|
|
label=GraphCellLabel.VALUE,
|
|
cell_id=5,
|
|
text="docling",
|
|
orig="docling",
|
|
),
|
|
GraphCell(
|
|
label=GraphCellLabel.VALUE,
|
|
cell_id=6,
|
|
text="docling-core",
|
|
orig="docling-core",
|
|
prov=prov,
|
|
),
|
|
],
|
|
links=[
|
|
GraphLink(
|
|
label=GraphLinkLabel.TO_VALUE,
|
|
source_cell_id=0,
|
|
target_cell_id=1,
|
|
),
|
|
GraphLink(label=GraphLinkLabel.TO_KEY, source_cell_id=1, target_cell_id=0),
|
|
GraphLink(label=GraphLinkLabel.TO_CHILD, source_cell_id=2, target_cell_id=3),
|
|
GraphLink(label=GraphLinkLabel.TO_PARENT, source_cell_id=3, target_cell_id=2),
|
|
GraphLink(label=GraphLinkLabel.TO_VALUE, source_cell_id=4, target_cell_id=5),
|
|
GraphLink(label=GraphLinkLabel.TO_VALUE, source_cell_id=4, target_cell_id=6),
|
|
],
|
|
),
|
|
)
|
|
doc.add_text(label=DocItemLabel.TEXT, text="Some more text...", parent=doc.body)
|
|
doc.add_form(
|
|
graph=GraphData(
|
|
cells=[
|
|
# both TO_VALUE & TO_KEY links:
|
|
GraphCell(
|
|
label=GraphCellLabel.KEY,
|
|
cell_id=0,
|
|
text="Color",
|
|
orig="Color",
|
|
),
|
|
GraphCell(
|
|
label=GraphCellLabel.VALUE,
|
|
cell_id=1,
|
|
text="Orange",
|
|
orig="Orange",
|
|
),
|
|
GraphCell(
|
|
label=GraphCellLabel.VALUE,
|
|
cell_id=2,
|
|
text="Black",
|
|
orig="Black",
|
|
),
|
|
GraphCell(
|
|
label=GraphCellLabel.VALUE,
|
|
cell_id=3,
|
|
text="White",
|
|
orig="White",
|
|
),
|
|
],
|
|
links=[
|
|
GraphLink(label=GraphLinkLabel.TO_VALUE, source_cell_id=0, target_cell_id=1),
|
|
GraphLink(label=GraphLinkLabel.TO_VALUE, source_cell_id=0, target_cell_id=2),
|
|
GraphLink(label=GraphLinkLabel.TO_VALUE, source_cell_id=0, target_cell_id=3),
|
|
GraphLink(label=GraphLinkLabel.TO_KEY, source_cell_id=3, target_cell_id=0),
|
|
],
|
|
),
|
|
prov=prov,
|
|
)
|
|
doc.add_text(label=DocItemLabel.TEXT, text="The end.", parent=doc.body)
|
|
|
|
exp_json = Path("./test/data/doc/kv_pre_migration.out.json")
|
|
_verify_doc(doc=doc, exp_json=exp_json)
|
|
|
|
doc._migrate_to_field_regions()
|
|
|
|
exp_json = Path("./test/data/doc/kv_post_migration.out.json")
|
|
_verify_doc(doc=doc, exp_json=exp_json)
|
|
|
|
ser_txt = doc.export_to_doclang()
|
|
exp_file = Path("./test/data/doc/kv_migration.out.dclg.xml")
|
|
verify(exp_file=exp_file, actual=ser_txt)
|
|
|
|
def test_kv_migration_annot_scenario():
|
|
roots = [
|
|
"./test/data/doc/kv",
|
|
"./test/data/doc/doclang_ref",
|
|
]
|
|
for subdir in chain.from_iterable([Path(root).iterdir() for root in roots]):
|
|
if not subdir.is_dir():
|
|
continue
|
|
input_json = subdir / "input.json"
|
|
if not input_json.exists():
|
|
continue
|
|
doc = DoclingDocument.load_from_json(input_json)
|
|
if GEN_TEST_DATA:
|
|
modes = {
|
|
# "ro": "reading_order",
|
|
"kv": "key_value",
|
|
}
|
|
for mode_kw in modes:
|
|
pages = doc.get_visualization(viz_mode=modes[mode_kw])
|
|
for page_no, page in pages.items():
|
|
page.save(str(subdir / f"input_{mode_kw}_p{page_no}.png"))
|
|
doc._migrate_to_field_regions()
|
|
exp_json = subdir / "output.json"
|
|
_verify_doc(doc=doc, exp_json=exp_json)
|
|
ser_txt = doc.export_to_doclang()
|
|
exp_file = subdir / "output.dclg.xml"
|
|
verify(exp_file=exp_file, actual=ser_txt)
|
|
|
|
ser = DoclangDocSerializer(
|
|
doc=doc,
|
|
params=DoclangParams(
|
|
add_content=False,
|
|
),
|
|
)
|
|
ser_txt = ser.serialize().text
|
|
exp_file = subdir / "output_no_content.dclg.xml"
|
|
verify(exp_file=exp_file, actual=ser_txt)
|
|
|
|
if GEN_TEST_DATA:
|
|
modes = {
|
|
# "ro": "reading_order",
|
|
"kv": "key_value",
|
|
}
|
|
for mode_kw in modes:
|
|
pages = doc.get_visualization(viz_mode=modes[mode_kw])
|
|
for page_no, page in pages.items():
|
|
page.save(str(subdir / f"output_{mode_kw}_p{page_no}.png"))
|
|
|
|
|
|
# ===============================
|
|
# suppress_empty_elements tests
|
|
# ===============================
|
|
|
|
_SUPPRESS_PARAMS = DoclangParams(
|
|
suppress_empty_elements=True,
|
|
add_location=False,
|
|
content_types=set(), # no content → forces items empty
|
|
)
|
|
|
|
|
|
def test_suppress_empty_text_item():
|
|
"""An empty text item is omitted when suppress_empty_elements is True."""
|
|
doc = DoclingDocument(name="test")
|
|
doc.add_text(label=DocItemLabel.TEXT, text="")
|
|
|
|
result = serialize_doclang(doc, params=_SUPPRESS_PARAMS)
|
|
# The empty <text></text> must not appear
|
|
assert "<text" not in result
|
|
# Document root must still be present
|
|
assert "<doclang" in result
|
|
|
|
|
|
def test_empty_text_item_preserved_by_default():
|
|
"""Without suppress_empty_elements the empty tag pair is emitted."""
|
|
doc = DoclingDocument(name="test")
|
|
doc.add_text(label=DocItemLabel.TEXT, text="")
|
|
|
|
default_params = DoclangParams(
|
|
add_location=False,
|
|
content_types=set(),
|
|
)
|
|
result = serialize_doclang(doc, params=default_params)
|
|
assert "<text></text>" in result
|
|
|
|
|
|
def test_suppress_empty_heading():
|
|
"""An empty heading is suppressed."""
|
|
doc = DoclingDocument(name="test")
|
|
doc.add_heading(text="", level=2)
|
|
|
|
result = serialize_doclang(doc, params=_SUPPRESS_PARAMS)
|
|
assert "<heading" not in result
|
|
assert "</heading>" not in result
|
|
|
|
|
|
def test_suppress_empty_code():
|
|
"""An empty code block is suppressed."""
|
|
doc = DoclingDocument(name="test")
|
|
doc.add_code(text="")
|
|
|
|
result = serialize_doclang(doc, params=_SUPPRESS_PARAMS)
|
|
assert "<code" not in result
|
|
|
|
|
|
def test_suppress_empty_picture():
|
|
"""A picture with no content, no caption, no footnotes is suppressed."""
|
|
doc = DoclingDocument(name="test")
|
|
doc.add_picture()
|
|
|
|
result = serialize_doclang(doc, params=_SUPPRESS_PARAMS)
|
|
assert "<floating_group" not in result
|
|
assert "<picture" not in result
|
|
|
|
|
|
def test_empty_picture_preserved_by_default():
|
|
"""Without suppress_empty_elements the empty floating group is emitted."""
|
|
doc = DoclingDocument(name="test")
|
|
doc.add_picture()
|
|
|
|
default_params = DoclangParams(
|
|
add_location=False,
|
|
content_types=set(),
|
|
)
|
|
result = serialize_doclang(doc, params=default_params)
|
|
assert "<floating_group" in result
|
|
|
|
|
|
def test_suppress_empty_table():
|
|
"""A table with no data, no caption, no footnotes is suppressed."""
|
|
doc = DoclingDocument(name="test")
|
|
doc.add_table(data=TableData())
|
|
|
|
result = serialize_doclang(doc, params=_SUPPRESS_PARAMS)
|
|
assert "<floating_group" not in result
|
|
assert "<otsl" not in result
|
|
|
|
|
|
def test_empty_table_preserved_by_default():
|
|
"""Without suppress_empty_elements the empty table group is emitted."""
|
|
doc = DoclingDocument(name="test")
|
|
doc.add_table(data=TableData())
|
|
|
|
default_params = DoclangParams(
|
|
add_location=False,
|
|
content_types=set(),
|
|
)
|
|
result = serialize_doclang(doc, params=default_params)
|
|
assert "<floating_group" in result
|
|
|
|
|
|
def test_suppress_empty_inline_group():
|
|
"""An inline group whose children are all empty is suppressed."""
|
|
doc = DoclingDocument(name="test")
|
|
inl = doc.add_inline_group()
|
|
doc.add_text(label=DocItemLabel.TEXT, text="", parent=inl)
|
|
|
|
result = serialize_doclang(doc, params=_SUPPRESS_PARAMS)
|
|
# The inline group emits <text> wrapper when unwrapped; both should vanish
|
|
assert "<text" not in result
|
|
|
|
|
|
def test_suppress_list_with_all_empty_children():
|
|
"""A list group whose children all produce empty text is auto-suppressed.
|
|
|
|
The list serializer already skips the <list> wrapper when no child text
|
|
is produced, so suppressing individual empty <list_text> items causes
|
|
the parent <list> to vanish too.
|
|
"""
|
|
doc = DoclingDocument(name="test")
|
|
lst = doc.add_list_group()
|
|
doc.add_list_item(text="", parent=lst)
|
|
doc.add_list_item(text="", parent=lst)
|
|
|
|
result = serialize_doclang(doc, params=_SUPPRESS_PARAMS)
|
|
assert "<list" not in result
|
|
assert "<list_text" not in result
|
|
|
|
|
|
def test_suppress_list_keeps_nonempty_items():
|
|
"""Only empty list items are suppressed; non-empty ones remain."""
|
|
doc = DoclingDocument(name="test")
|
|
lst = doc.add_list_group()
|
|
doc.add_list_item(text="", parent=lst)
|
|
doc.add_list_item(text="Keep me", parent=lst)
|
|
doc.add_list_item(text="", parent=lst)
|
|
|
|
params = DoclangParams(
|
|
suppress_empty_elements=True,
|
|
add_location=False,
|
|
)
|
|
result = serialize_doclang(doc, params=params)
|
|
assert "<list " in result
|
|
assert result.count("<list_text>") == 1
|
|
assert "Keep me" in result
|
|
|
|
|
|
def test_suppress_mixed_content():
|
|
"""A document with a mix of empty and non-empty items.
|
|
|
|
Empty items are suppressed, non-empty ones remain.
|
|
"""
|
|
doc = DoclingDocument(name="test")
|
|
doc.add_text(label=DocItemLabel.TEXT, text="") # suppressed
|
|
doc.add_text(label=DocItemLabel.TEXT, text="Visible paragraph") # kept
|
|
doc.add_picture() # suppressed (empty picture)
|
|
doc.add_heading(text="Visible Heading", level=1) # kept
|
|
doc.add_code(text="") # suppressed
|
|
|
|
params = DoclangParams(
|
|
suppress_empty_elements=True,
|
|
add_location=False,
|
|
)
|
|
result = serialize_doclang(doc, params=params)
|
|
assert result.count("<text>") == 1
|
|
assert "Visible paragraph" in result
|
|
assert "<floating_group" not in result
|
|
assert '<heading level="1">' in result
|
|
assert "Visible Heading" in result
|
|
assert "<code" not in result
|
|
|
|
|
|
def test_suppress_does_not_affect_nonempty():
|
|
"""Suppression flag has no effect on items that carry content."""
|
|
doc = DoclingDocument(name="test")
|
|
doc.add_text(label=DocItemLabel.TEXT, text="Hello")
|
|
doc.add_heading(text="World", level=1)
|
|
|
|
params = DoclangParams(
|
|
suppress_empty_elements=True,
|
|
add_location=False,
|
|
)
|
|
result = serialize_doclang(doc, params=params)
|
|
assert "<text>Hello</text>" in result
|
|
assert '<heading level="1">World</heading>' in result
|
|
|
|
|
|
def test_suppress_nested_section_with_empty_children():
|
|
"""A section containing only empty elements should still emit the section
|
|
(sections are grouping tokens and not subject to content-level suppression),
|
|
but all its empty children should be suppressed.
|
|
"""
|
|
from docling_core.types.doc import GroupLabel
|
|
|
|
doc = DoclingDocument(name="test")
|
|
sec = doc.add_group(label=GroupLabel.SECTION, name="empty_sec")
|
|
doc.add_text(label=DocItemLabel.TEXT, text="", parent=sec)
|
|
doc.add_code(text="", parent=sec)
|
|
|
|
result = serialize_doclang(doc, params=_SUPPRESS_PARAMS)
|
|
# Section grouping wrapper may or may not remain (depends on serializer),
|
|
# but importantly no <text> or <code> tags appear
|
|
assert "<text" not in result
|
|
assert "<code" not in result
|
|
|
|
|
|
def test_suppress_empty_caption_and_footnote_on_picture():
|
|
"""A picture with an empty caption and empty footnote is suppressed when
|
|
suppress_empty_elements is True and there is no other content.
|
|
"""
|
|
doc = DoclingDocument(name="test")
|
|
cap = doc.add_text(label=DocItemLabel.CAPTION, text="")
|
|
doc.add_picture(caption=cap)
|
|
|
|
result = serialize_doclang(doc, params=_SUPPRESS_PARAMS)
|
|
assert "<floating_group" not in result
|
|
assert "<picture" not in result
|
|
|
|
|
|
def test_suppress_empty_picture_with_nonempty_caption():
|
|
"""A picture with a non-empty caption should still be emitted even when
|
|
suppress_empty_elements is True, because the composed_inner is non-empty.
|
|
"""
|
|
doc = DoclingDocument(name="test")
|
|
cap = doc.add_text(label=DocItemLabel.CAPTION, text="My Figure")
|
|
doc.add_picture(caption=cap)
|
|
|
|
params = DoclangParams(
|
|
suppress_empty_elements=True,
|
|
add_location=False,
|
|
)
|
|
result = serialize_doclang(doc, params=params)
|
|
assert "<floating_group" in result
|
|
assert "My Figure" in result
|
|
|
|
|
|
def test_layer_minimal_mode(doc_with_layers):
|
|
"""Test MINIMAL mode omits default layer, includes non-default."""
|
|
params = DoclangParams(layer_mode=LayerMode.MINIMAL)
|
|
ser = DoclangDocSerializer(doc=doc_with_layers, params=params)
|
|
ser_txt = ser.serialize().text
|
|
|
|
exp_file = Path("./test/data/doc/layer_minimal_mode.dclg.xml")
|
|
verify(exp_file=exp_file, actual=ser_txt)
|
|
|
|
|
|
def test_layer_always_mode(doc_with_layers):
|
|
"""Test ALWAYS mode includes layer element for all items."""
|
|
params = DoclangParams(layer_mode=LayerMode.ALWAYS)
|
|
ser = DoclangDocSerializer(doc=doc_with_layers, params=params)
|
|
ser_txt = ser.serialize().text
|
|
|
|
exp_file = Path("./test/data/doc/layer_always_mode.dclg.xml")
|
|
verify(exp_file=exp_file, actual=ser_txt)
|
|
|
|
|
|
def test_layer_filter_body_only(doc_with_layers):
|
|
"""Test that layers parameter filters content to only show specified layers."""
|
|
# Serialize with only body layer
|
|
params = DoclangParams(
|
|
layers={ContentLayer.BODY},
|
|
)
|
|
ser = DoclangDocSerializer(doc=doc_with_layers, params=params)
|
|
ser_txt = ser.serialize().text
|
|
|
|
exp_file = Path("./test/data/doc/layer_only_body.dclg.xml")
|
|
verify(exp_file=exp_file, actual=ser_txt)
|
|
|
|
|
|
def test_newline_to_br():
|
|
|
|
code = """
|
|
|
|
|
|
import pytest
|
|
|
|
from docling_core.experimental.doclang import (
|
|
ContentType,
|
|
EscapeMode,
|
|
DoclangDocSerializer,
|
|
DoclangParams,
|
|
DoclangVocabulary,
|
|
LayerMode,
|
|
WrapMode,
|
|
)
|
|
"""
|
|
|
|
"""Test that newlines survive serialization and deserialization roundtrip."""
|
|
from docling_core.experimental.doclang import DoclangDeserializer
|
|
from docling_core.types.doc import TextItem
|
|
|
|
# Create a document with newlines
|
|
doc = DoclingDocument(name="")
|
|
doc.add_text(label=DocItemLabel.TEXT, text="foo\nbar")
|
|
|
|
inl = doc.add_inline_group()
|
|
doc.add_text(label=DocItemLabel.TEXT, text="eins\n", parent=inl)
|
|
doc.add_text(label=DocItemLabel.TEXT, text=" zwei\n ", parent=inl)
|
|
doc.add_text(label=DocItemLabel.TEXT, text="drei", parent=inl, formatting=Formatting(bold=True))
|
|
|
|
doc.add_code(text=code)
|
|
|
|
# NOTE: this particular case seems bit brittle as to how it's preserved by XML tooling
|
|
doc.add_text(label=DocItemLabel.TEXT, text="\n")
|
|
|
|
ser_txt = doc.export_to_doclang()
|
|
exp_file = Path("./test/data/doc/newline_to_br.dclg.xml")
|
|
verify(exp_file=exp_file, actual=ser_txt)
|