Files
docling-core/test/test_serialization.py
Panos Vagenas d8a5256b2c feat: add table annotations (#304)
* feat: add table annotations

Signed-off-by: Panos Vagenas <pva@zurich.ibm.com>

* refactor annotation types

Signed-off-by: Panos Vagenas <pva@zurich.ibm.com>

* expand to HTML

Signed-off-by: Panos Vagenas <pva@zurich.ibm.com>

* introduce annotation serializer

Signed-off-by: Panos Vagenas <pva@zurich.ibm.com>

* Update dummy_doc.yaml

Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>

---------

Signed-off-by: Panos Vagenas <pva@zurich.ibm.com>
Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
2025-06-05 15:38:46 +02:00

427 lines
12 KiB
Python

"""Test serialization."""
from pathlib import Path
from typing import Any
from typing_extensions import override
from docling_core.transforms.serializer.base import (
BaseDocSerializer,
SerializationResult,
)
from docling_core.transforms.serializer.common import _DEFAULT_LABELS, create_ser_result
from docling_core.transforms.serializer.html import (
HTMLDocSerializer,
HTMLOutputStyle,
HTMLParams,
)
from docling_core.transforms.serializer.markdown import (
MarkdownDocSerializer,
MarkdownParams,
MarkdownTableSerializer,
_get_annotation_ser_result,
)
from docling_core.transforms.visualizer.layout_visualizer import LayoutVisualizer
from docling_core.types.doc.base import ImageRefMode
from docling_core.types.doc.document import DoclingDocument, MiscAnnotation, TableItem
from docling_core.types.doc.labels import DocItemLabel
from .test_data_gen_flag import GEN_TEST_DATA
class CustomAnnotationTableSerializer(MarkdownTableSerializer):
    """Table serializer rendering misc annotations ahead of the Markdown table."""

    @override
    def serialize(
        self,
        *,
        item: TableItem,
        doc_serializer: BaseDocSerializer,
        doc: DoclingDocument,
        **kwargs: Any,
    ) -> SerializationResult:
        """Serialize the table's misc annotations followed by the table itself.

        When ``include_annotations`` is set on the ``MarkdownParams`` built
        from ``kwargs``, every ``MiscAnnotation`` of ``item`` is rendered as
        one ``key: value`` line per content entry. The table is then
        serialized by the parent class with ``include_annotations`` forced to
        ``False``, and the texts of all parts are joined with two newlines.
        """
        md_params = MarkdownParams(**kwargs)
        parts: list[SerializationResult] = []

        if md_params.include_annotations:
            for annotation in item.annotations:
                if not isinstance(annotation, MiscAnnotation):
                    continue
                # custom serialization logic:
                content = annotation.content
                parts.append(
                    _get_annotation_ser_result(
                        ann_kind=annotation.kind,
                        ann_text="\n".join(f"{key}: {content[key]}" for key in content),
                        mark_annotation=md_params.mark_annotations,
                        doc_item=item,
                    )
                )

        # reusing the existing result (excluding the annotations):
        table_kwargs = {**kwargs, "include_annotations": False}
        parts.append(
            super().serialize(
                item=item,
                doc_serializer=doc_serializer,
                doc=doc,
                **table_kwargs,
            )
        )

        joined_text = "\n\n".join(part.text for part in parts)
        return create_ser_result(text=joined_text, span_source=parts)
def verify(exp_file: Path, actual: str):
    """Compare ``actual`` with the content of ``exp_file``, or regenerate the file.

    With ``GEN_TEST_DATA`` set, ``actual`` plus a trailing newline is written
    to ``exp_file``. Otherwise the file is read, stripped of trailing
    whitespace, and asserted equal to ``actual``.
    """
    if not GEN_TEST_DATA:
        with open(exp_file, "r", encoding="utf-8") as gt_file:
            expected = gt_file.read().rstrip()
        assert expected == actual
        return

    with open(exp_file, "w", encoding="utf-8") as gt_file:
        gt_file.write(f"{actual}\n")
def test_md_cross_page_list_page_break():
    """Markdown export of activities.json with a page-break placeholder."""
    json_path = Path("./test/data/doc/activities.json")
    document = DoclingDocument.load_from_json(json_path)

    # default label set with PICTURE removed
    md_params = MarkdownParams(
        image_mode=ImageRefMode.PLACEHOLDER,
        image_placeholder="<!-- image -->",
        page_break_placeholder="<!-- page break -->",
        labels=_DEFAULT_LABELS - {DocItemLabel.PICTURE},
    )
    serializer = MarkdownDocSerializer(doc=document, params=md_params)

    markdown = serializer.serialize().text
    gt_path = json_path.parent / f"{json_path.stem}.gt.md"
    verify(exp_file=gt_path, actual=markdown)
def test_md_cross_page_list_page_break_none():
    """Markdown export of activities.json with ``page_break_placeholder=None``."""
    json_path = Path("./test/data/doc/activities.json")
    document = DoclingDocument.load_from_json(json_path)

    # default label set with PICTURE removed
    md_params = MarkdownParams(
        image_mode=ImageRefMode.PLACEHOLDER,
        image_placeholder="<!-- image -->",
        page_break_placeholder=None,
        labels=_DEFAULT_LABELS - {DocItemLabel.PICTURE},
    )
    serializer = MarkdownDocSerializer(doc=document, params=md_params)

    markdown = serializer.serialize().text
    gt_path = json_path.parent / f"{json_path.stem}_pb_none.gt.md"
    verify(exp_file=gt_path, actual=markdown)
def test_md_cross_page_list_page_break_empty():
    """Markdown export of activities.json with an empty page-break placeholder."""
    json_path = Path("./test/data/doc/activities.json")
    document = DoclingDocument.load_from_json(json_path)

    # default label set with PICTURE removed
    md_params = MarkdownParams(
        image_mode=ImageRefMode.PLACEHOLDER,
        image_placeholder="<!-- image -->",
        page_break_placeholder="",
        labels=_DEFAULT_LABELS - {DocItemLabel.PICTURE},
    )
    serializer = MarkdownDocSerializer(doc=document, params=md_params)

    markdown = serializer.serialize().text
    gt_path = json_path.parent / f"{json_path.stem}_pb_empty.gt.md"
    verify(exp_file=gt_path, actual=markdown)
def test_md_cross_page_list_page_break_non_empty():
    """Markdown export of activities.json with a ``<!-- page-break -->`` placeholder."""
    json_path = Path("./test/data/doc/activities.json")
    document = DoclingDocument.load_from_json(json_path)

    # default label set with PICTURE removed
    md_params = MarkdownParams(
        image_mode=ImageRefMode.PLACEHOLDER,
        image_placeholder="<!-- image -->",
        page_break_placeholder="<!-- page-break -->",
        labels=_DEFAULT_LABELS - {DocItemLabel.PICTURE},
    )
    serializer = MarkdownDocSerializer(doc=document, params=md_params)

    markdown = serializer.serialize().text
    gt_path = json_path.parent / f"{json_path.stem}_pb_non_empty.gt.md"
    verify(exp_file=gt_path, actual=markdown)
def test_md_cross_page_list_page_break_p2():
    """Markdown export of activities.json restricted to ``pages={2}``."""
    json_path = Path("./test/data/doc/activities.json")
    document = DoclingDocument.load_from_json(json_path)

    md_params = MarkdownParams(
        image_mode=ImageRefMode.PLACEHOLDER,
        image_placeholder="<!-- image -->",
        page_break_placeholder=None,
        pages={2},
    )
    serializer = MarkdownDocSerializer(doc=document, params=md_params)

    markdown = serializer.serialize().text
    gt_path = json_path.parent / f"{json_path.stem}_p2.gt.md"
    verify(exp_file=gt_path, actual=markdown)
def test_html_charts():
    """HTML export of barchart.json with placeholder image mode."""
    json_path = Path("./test/data/doc/barchart.json")
    document = DoclingDocument.load_from_json(json_path)

    html_params = HTMLParams(image_mode=ImageRefMode.PLACEHOLDER)
    serializer = HTMLDocSerializer(doc=document, params=html_params)

    html = serializer.serialize().text
    gt_path = json_path.parent / f"{json_path.stem}.gt.html"
    verify(exp_file=gt_path, actual=html)
def test_md_charts():
    """Markdown export of barchart.json with placeholder image mode."""
    json_path = Path("./test/data/doc/barchart.json")
    document = DoclingDocument.load_from_json(json_path)

    md_params = MarkdownParams(image_mode=ImageRefMode.PLACEHOLDER)
    serializer = MarkdownDocSerializer(doc=document, params=md_params)

    markdown = serializer.serialize().text
    gt_path = json_path.parent / f"{json_path.stem}.gt.md"
    verify(exp_file=gt_path, actual=markdown)
def test_html_cross_page_list_page_break():
    """HTML export of activities.json with placeholder image mode."""
    json_path = Path("./test/data/doc/activities.json")
    document = DoclingDocument.load_from_json(json_path)

    html_params = HTMLParams(image_mode=ImageRefMode.PLACEHOLDER)
    serializer = HTMLDocSerializer(doc=document, params=html_params)

    html = serializer.serialize().text
    gt_path = json_path.parent / f"{json_path.stem}.gt.html"
    verify(exp_file=gt_path, actual=html)
def test_html_cross_page_list_page_break_p1():
    """HTML export of activities.json restricted to ``pages={1}``."""
    json_path = Path("./test/data/doc/activities.json")
    document = DoclingDocument.load_from_json(json_path)

    html_params = HTMLParams(
        image_mode=ImageRefMode.PLACEHOLDER,
        pages={1},
    )
    serializer = HTMLDocSerializer(doc=document, params=html_params)

    html = serializer.serialize().text
    gt_path = json_path.parent / f"{json_path.stem}_p1.gt.html"
    verify(exp_file=gt_path, actual=html)
def test_html_cross_page_list_page_break_p2():
    """HTML export of activities.json restricted to ``pages={2}``."""
    json_path = Path("./test/data/doc/activities.json")
    document = DoclingDocument.load_from_json(json_path)

    html_params = HTMLParams(
        image_mode=ImageRefMode.PLACEHOLDER,
        pages={2},
    )
    serializer = HTMLDocSerializer(doc=document, params=html_params)

    html = serializer.serialize().text
    gt_path = json_path.parent / f"{json_path.stem}_p2.gt.html"
    verify(exp_file=gt_path, actual=html)
def test_md_pb_placeholder_and_page_filter():
    """Markdown export combining a page-break placeholder with a page filter."""
    json_path = Path("./test/data/doc/2408.09869v3_enriched.json")
    document = DoclingDocument.load_from_json(json_path)

    # NOTE ambiguous case
    md_params = MarkdownParams(
        page_break_placeholder="<!-- page break -->",
        pages={3, 4, 6},
    )
    serializer = MarkdownDocSerializer(doc=document, params=md_params)

    markdown = serializer.serialize().text
    gt_path = json_path.parent / f"{json_path.stem}.gt.md"
    verify(exp_file=gt_path, actual=markdown)
def test_md_include_annotations_false():
    """Markdown export with the custom table serializer and annotations disabled."""
    json_path = Path("./test/data/doc/2408.09869v3_enriched.json")
    document = DoclingDocument.load_from_json(json_path)

    md_params = MarkdownParams(
        include_annotations=False,
        pages={1, 5},
    )
    serializer = MarkdownDocSerializer(
        doc=document,
        table_serializer=CustomAnnotationTableSerializer(),
        params=md_params,
    )

    markdown = serializer.serialize().text
    gt_name = f"{json_path.stem}_p1_include_annotations_false.gt.md"
    verify(exp_file=json_path.parent / gt_name, actual=markdown)
def test_md_mark_annotations_false():
    """Markdown export with annotations included but ``mark_annotations=False``."""
    json_path = Path("./test/data/doc/2408.09869v3_enriched.json")
    document = DoclingDocument.load_from_json(json_path)

    md_params = MarkdownParams(
        include_annotations=True,
        mark_annotations=False,
        pages={1, 5},
    )
    serializer = MarkdownDocSerializer(
        doc=document,
        table_serializer=CustomAnnotationTableSerializer(),
        params=md_params,
    )

    markdown = serializer.serialize().text
    gt_name = f"{json_path.stem}_p1_mark_annotations_false.gt.md"
    verify(exp_file=json_path.parent / gt_name, actual=markdown)
def test_md_mark_annotations_true():
    """Markdown export with annotations included and ``mark_annotations=True``."""
    json_path = Path("./test/data/doc/2408.09869v3_enriched.json")
    document = DoclingDocument.load_from_json(json_path)

    md_params = MarkdownParams(
        include_annotations=True,
        mark_annotations=True,
        pages={1, 5},
    )
    serializer = MarkdownDocSerializer(
        doc=document,
        table_serializer=CustomAnnotationTableSerializer(),
        params=md_params,
    )

    markdown = serializer.serialize().text
    gt_name = f"{json_path.stem}_p1_mark_annotations_true.gt.md"
    verify(exp_file=json_path.parent / gt_name, actual=markdown)
def test_html_split_page():
    """HTML export in split-page style with embedded images."""
    json_path = Path("./test/data/doc/2408.09869v3_enriched.json")
    document = DoclingDocument.load_from_json(json_path)

    html_params = HTMLParams(
        image_mode=ImageRefMode.EMBEDDED,
        output_style=HTMLOutputStyle.SPLIT_PAGE,
    )
    serializer = HTMLDocSerializer(doc=document, params=html_params)

    html = serializer.serialize().text
    gt_path = json_path.parent / f"{json_path.stem}_split.gt.html"
    verify(exp_file=gt_path, actual=html)
def test_html_split_page_p2():
    """HTML export in split-page style restricted to ``pages={2}``."""
    json_path = Path("./test/data/doc/2408.09869v3_enriched.json")
    document = DoclingDocument.load_from_json(json_path)

    html_params = HTMLParams(
        image_mode=ImageRefMode.EMBEDDED,
        output_style=HTMLOutputStyle.SPLIT_PAGE,
        pages={2},
    )
    serializer = HTMLDocSerializer(doc=document, params=html_params)

    html = serializer.serialize().text
    gt_path = json_path.parent / f"{json_path.stem}_split_p2.gt.html"
    verify(exp_file=gt_path, actual=html)
def test_html_split_page_p2_with_visualizer():
    """Split-page HTML export of page 2 using a ``LayoutVisualizer``.

    The output is not pinned to a ground-truth file; the test only checks
    that an embedded PNG figure is present and that the result differs from
    the stored export produced without a visualizer.
    """
    src = Path("./test/data/doc/2408.09869v3_enriched.json")
    doc = DoclingDocument.load_from_json(src)

    ser = HTMLDocSerializer(
        doc=doc,
        params=HTMLParams(
            image_mode=ImageRefMode.EMBEDDED,
            output_style=HTMLOutputStyle.SPLIT_PAGE,
            pages={2},
        ),
    )
    ser_res = ser.serialize(
        visualizer=LayoutVisualizer(),
    )
    actual = ser_res.text

    # pinning the result with visualizer appeared flaky, so at least ensure it contains
    # a figure (for the page) and that it is different than without visualizer:
    assert '<figure><img src="data:image/png;base64' in actual
    file_without_viz = src.parent / f"{src.stem}_split_p2.gt.html"
    # Read as UTF-8 explicitly: verify() writes the ground-truth files as
    # UTF-8, so relying on the platform's locale encoding here could fail or
    # mis-decode on systems whose default is not UTF-8.
    with open(file_without_viz, "r", encoding="utf-8") as f:
        data_without_viz = f.read()
    assert actual.strip() != data_without_viz.strip()
def test_html_split_page_no_page_breaks():
    """Split-page HTML export of 2408.09869_p1.json with embedded images."""
    json_path = Path("./test/data/doc/2408.09869_p1.json")
    document = DoclingDocument.load_from_json(json_path)

    html_params = HTMLParams(
        image_mode=ImageRefMode.EMBEDDED,
        output_style=HTMLOutputStyle.SPLIT_PAGE,
    )
    serializer = HTMLDocSerializer(doc=document, params=html_params)

    html = serializer.serialize().text
    gt_path = json_path.parent / f"{json_path.stem}_split.gt.html"
    verify(exp_file=gt_path, actual=html)
def test_html_include_annotations_false():
    """HTML export of page 1 with ``include_annotations=False``."""
    json_path = Path("./test/data/doc/2408.09869v3_enriched.json")
    document = DoclingDocument.load_from_json(json_path)

    html_params = HTMLParams(
        image_mode=ImageRefMode.PLACEHOLDER,
        include_annotations=False,
        pages={1},
        html_head="<head></head>",  # keeping test output minimal
    )
    serializer = HTMLDocSerializer(doc=document, params=html_params)

    html = serializer.serialize().text
    gt_name = f"{json_path.stem}_p1_include_annotations_false.gt.html"
    verify(exp_file=json_path.parent / gt_name, actual=html)
def test_html_include_annotations_true():
    """HTML export of page 1 with ``include_annotations=True``."""
    json_path = Path("./test/data/doc/2408.09869v3_enriched.json")
    document = DoclingDocument.load_from_json(json_path)

    html_params = HTMLParams(
        image_mode=ImageRefMode.PLACEHOLDER,
        include_annotations=True,
        pages={1},
        html_head="<head></head>",  # keeping test output minimal
    )
    serializer = HTMLDocSerializer(doc=document, params=html_params)

    html = serializer.serialize().text
    gt_name = f"{json_path.stem}_p1_include_annotations_true.gt.html"
    verify(exp_file=json_path.parent / gt_name, actual=html)