Files
docling-core/test/test_serialization.py
Cesar Berrospi Ramis 8881d0430b feat: add WebVTT export and save functionality (#523)
* feat(vtt): export and save to WebVTT format

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

* fix(vtt): omit empty blocks in parsing

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

* test(vtt): add tests for exporting and saving to WebVTT

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

---------

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
2026-03-11 15:55:31 +01:00

696 lines
21 KiB
Python

"""Test serialization."""
from pathlib import Path
import pytest
from docling_core.transforms.serializer.common import _DEFAULT_LABELS
from docling_core.transforms.serializer.html import (
HTMLDocSerializer,
HTMLOutputStyle,
HTMLParams,
)
from docling_core.transforms.serializer.markdown import (
MarkdownDocSerializer,
MarkdownParams,
MarkdownTableSerializer,
OrigListItemMarkerMode,
)
from docling_core.transforms.serializer.webvtt import WebVTTDocSerializer, WebVTTParams
from docling_core.transforms.visualizer.layout_visualizer import LayoutVisualizer
from docling_core.types.doc.base import ImageRefMode
from docling_core.types.doc.document import (
DescriptionAnnotation,
DoclingDocument,
RefItem,
TableCell,
TableData,
TextItem,
)
from docling_core.types.doc.labels import DocItemLabel
from .test_data_gen_flag import GEN_TEST_DATA
def verify(exp_file: Path, actual: str):
if GEN_TEST_DATA:
with open(exp_file, "w", encoding="utf-8") as f:
f.write(f"{actual}\n")
else:
with open(exp_file, encoding="utf-8") as f:
expected = f.read().rstrip()
# Normalize platform-dependent quote escaping for DocTags outputs
name = exp_file.name
if name.endswith((".dt", ".idt", ".idt.xml")):
def _normalize_quotes(s: str) -> str:
return s.replace("&quot;", '"').replace("&#34;", '"')
expected = _normalize_quotes(expected)
actual = _normalize_quotes(actual)
assert actual == expected
# ===============================
# Markdown tests
# ===============================
def test_md_cross_page_list_page_break():
src = Path("./test/data/doc/activities.json")
doc = DoclingDocument.load_from_json(src)
ser = MarkdownDocSerializer(
doc=doc,
params=MarkdownParams(
image_mode=ImageRefMode.PLACEHOLDER,
image_placeholder="<!-- image -->",
page_break_placeholder="<!-- page break -->",
labels=_DEFAULT_LABELS - {DocItemLabel.PICTURE},
),
)
actual = ser.serialize().text
verify(exp_file=src.with_suffix(".gt.md"), actual=actual)
def test_md_checkboxes():
src = Path("./test/data/doc/checkboxes.json")
doc = DoclingDocument.load_from_json(src)
ser = MarkdownDocSerializer(
doc=doc,
params=MarkdownParams(
image_mode=ImageRefMode.PLACEHOLDER,
image_placeholder="<!-- image -->",
page_break_placeholder="<!-- page break -->",
labels=_DEFAULT_LABELS - {DocItemLabel.PICTURE},
),
)
actual = ser.serialize().text
verify(exp_file=src.parent / f"{src.stem}.gt.md", actual=actual)
def test_md_cross_page_list_page_break_none():
src = Path("./test/data/doc/activities.json")
doc = DoclingDocument.load_from_json(src)
ser = MarkdownDocSerializer(
doc=doc,
params=MarkdownParams(
image_mode=ImageRefMode.PLACEHOLDER,
image_placeholder="<!-- image -->",
page_break_placeholder=None,
labels=_DEFAULT_LABELS - {DocItemLabel.PICTURE},
),
)
actual = ser.serialize().text
verify(exp_file=src.parent / f"{src.stem}_pb_none.gt.md", actual=actual)
def test_md_cross_page_list_page_break_empty():
src = Path("./test/data/doc/activities.json")
doc = DoclingDocument.load_from_json(src)
ser = MarkdownDocSerializer(
doc=doc,
params=MarkdownParams(
image_mode=ImageRefMode.PLACEHOLDER,
image_placeholder="<!-- image -->",
page_break_placeholder="",
labels=_DEFAULT_LABELS - {DocItemLabel.PICTURE},
),
)
actual = ser.serialize().text
verify(exp_file=src.parent / f"{src.stem}_pb_empty.gt.md", actual=actual)
def test_md_cross_page_list_page_break_non_empty():
src = Path("./test/data/doc/activities.json")
doc = DoclingDocument.load_from_json(src)
ser = MarkdownDocSerializer(
doc=doc,
params=MarkdownParams(
image_mode=ImageRefMode.PLACEHOLDER,
image_placeholder="<!-- image -->",
page_break_placeholder="<!-- page-break -->",
labels=_DEFAULT_LABELS - {DocItemLabel.PICTURE},
),
)
actual = ser.serialize().text
verify(exp_file=src.parent / f"{src.stem}_pb_non_empty.gt.md", actual=actual)
def test_md_cross_page_list_page_break_p2():
src = Path("./test/data/doc/activities.json")
doc = DoclingDocument.load_from_json(src)
ser = MarkdownDocSerializer(
doc=doc,
params=MarkdownParams(
image_mode=ImageRefMode.PLACEHOLDER,
image_placeholder="<!-- image -->",
page_break_placeholder=None,
pages={2},
),
)
actual = ser.serialize().text
verify(exp_file=src.parent / f"{src.stem}_p2.gt.md", actual=actual)
def test_md_charts():
src = Path("./test/data/doc/barchart.json")
doc = DoclingDocument.load_from_json(src)
ser = MarkdownDocSerializer(
doc=doc,
params=MarkdownParams(
image_mode=ImageRefMode.PLACEHOLDER,
),
)
actual = ser.serialize().text
verify(exp_file=src.with_suffix(".gt.md"), actual=actual)
def test_md_inline_and_formatting():
src = Path("./test/data/doc/inline_and_formatting.yaml")
doc = DoclingDocument.load_from_yaml(src)
ser = MarkdownDocSerializer(
doc=doc,
params=MarkdownParams(
image_mode=ImageRefMode.PLACEHOLDER,
),
)
actual = ser.serialize().text
verify(exp_file=src.with_suffix(".gt.md"), actual=actual)
def test_md_pb_placeholder_and_page_filter():
src = Path("./test/data/doc/2408.09869v3_enriched.json")
doc = DoclingDocument.load_from_json(src)
# NOTE ambiguous case
ser = MarkdownDocSerializer(
doc=doc,
params=MarkdownParams(
page_break_placeholder="<!-- page break -->",
pages={3, 4, 6},
),
)
actual = ser.serialize().text
verify(exp_file=src.with_suffix(".gt.md"), actual=actual)
def test_md_list_item_markers(sample_doc):
root_dir = Path("./test/data/doc")
for mode in OrigListItemMarkerMode:
for valid in [False, True]:
ser = MarkdownDocSerializer(
doc=sample_doc,
params=MarkdownParams(
orig_list_item_marker_mode=mode,
ensure_valid_list_item_marker=valid,
),
)
actual = ser.serialize().text
verify(
root_dir / f"constructed_mode_{str(mode.value).lower()}_valid_{str(valid).lower()}.gt.md",
actual=actual,
)
def test_md_mark_meta_true():
src = Path("./test/data/doc/2408.09869v3_enriched.json")
doc = DoclingDocument.load_from_json(src)
ser = MarkdownDocSerializer(
doc=doc,
params=MarkdownParams(
mark_meta=True,
pages={1, 5},
),
)
actual = ser.serialize().text
verify(
exp_file=src.parent / f"{src.stem}_p1_mark_meta_true.gt.md",
actual=actual,
)
def test_md_mark_meta_false():
src = Path("./test/data/doc/2408.09869v3_enriched.json")
doc = DoclingDocument.load_from_json(src)
ser = MarkdownDocSerializer(
doc=doc,
params=MarkdownParams(
mark_meta=False,
pages={1, 5},
),
)
actual = ser.serialize().text
verify(
exp_file=src.parent / f"{src.stem}_p1_mark_meta_false.gt.md",
actual=actual,
)
def test_md_legacy_annotations_mark_true(sample_doc):
exp_file = Path("./test/data/doc/constructed_legacy_annot_mark_true.gt.md")
with pytest.warns(DeprecationWarning):
sample_doc.tables[0].annotations.append(
DescriptionAnnotation(text="This is a description of table 1.", provenance="foo")
)
ser = MarkdownDocSerializer(
doc=sample_doc,
params=MarkdownParams(
mark_annotations=True,
),
)
actual = ser.serialize().text
verify(
exp_file=exp_file,
actual=actual,
)
def test_md_legacy_annotations_mark_false(sample_doc):
exp_file = Path("./test/data/doc/constructed_legacy_annot_mark_false.gt.md")
with pytest.warns(DeprecationWarning):
sample_doc.tables[0].annotations.append(
DescriptionAnnotation(text="This is a description of table 1.", provenance="foo")
)
ser = MarkdownDocSerializer(
doc=sample_doc,
params=MarkdownParams(
mark_annotations=False,
),
)
actual = ser.serialize().text
verify(
exp_file=exp_file,
actual=actual,
)
def test_md_nested_lists():
src = Path("./test/data/doc/polymers.json")
doc = DoclingDocument.load_from_json(src)
ser = MarkdownDocSerializer(doc=doc)
actual = ser.serialize().text
verify(exp_file=src.with_suffix(".gt.md"), actual=actual)
def test_md_rich_table(rich_table_doc):
exp_file = Path("./test/data/doc/rich_table.gt.md")
ser = MarkdownDocSerializer(doc=rich_table_doc)
actual = ser.serialize().text
verify(exp_file=exp_file, actual=actual)
def test_md_single_row_table():
exp_file = Path("./test/data/doc/single_row_table.gt.md")
words = ["foo", "bar"]
doc = DoclingDocument(name="")
row_idx = 0
table = doc.add_table(data=TableData(num_rows=1, num_cols=len(words)))
for col_idx, word in enumerate(words):
doc.add_table_cell(
table_item=table,
cell=TableCell(
start_row_offset_idx=row_idx,
end_row_offset_idx=row_idx + 1,
start_col_offset_idx=col_idx,
end_col_offset_idx=col_idx + 1,
text=word,
),
)
ser = MarkdownDocSerializer(doc=doc)
actual = ser.serialize().text
verify(exp_file=exp_file, actual=actual)
def test_md_pipe_in_table():
doc = DoclingDocument(name="Pipe in Table")
table = doc.add_table(data=TableData(num_rows=1, num_cols=1))
# TODO: properly handle nested tables, for now just escape the pipe
doc.add_table_cell(
table,
TableCell(
start_row_offset_idx=0,
end_row_offset_idx=1,
start_col_offset_idx=0,
end_col_offset_idx=1,
text="Fruits | Veggies",
)
)
ser = doc.export_to_markdown()
assert ser == "| Fruits &#124; Veggies |\n|-------------------------|"
def test_md_compact_table():
"""Test compact table format removes padding and uses minimal separators."""
# Test the _compact_table method directly
padded_table = """| item | qty | description |
| ------ | ----: | :-------------------: |
| spam | 42 | A canned meat product |
| eggs | 451 | Fresh farm eggs |
| bacon | 0 | Out of stock |"""
expected_compact = """| item | qty | description |
| - | -: | :-: |
| spam | 42 | A canned meat product |
| eggs | 451 | Fresh farm eggs |
| bacon | 0 | Out of stock |"""
compact_result = MarkdownTableSerializer._compact_table(padded_table)
assert compact_result == expected_compact
# Verify space savings
assert len(compact_result) < len(padded_table)
def test_md_traverse_pictures():
"""Test traverse_pictures parameter to include text inside PictureItems."""
doc = DoclingDocument(name="Test Document")
doc.add_text(label=DocItemLabel.PARAGRAPH, text="Text before picture")
picture = doc.add_picture()
# Manually add a text item as child of picture
text_in_picture = TextItem(
self_ref=f"#/texts/{len(doc.texts)}",
parent=RefItem(cref=picture.self_ref),
label=DocItemLabel.PARAGRAPH,
text="Text inside picture",
orig="Text inside picture",
)
doc.texts.append(text_in_picture)
picture.children.append(RefItem(cref=text_in_picture.self_ref))
doc.add_text(label=DocItemLabel.PARAGRAPH, text="Text after picture")
# Test with traverse_pictures=False (default)
ser_no_traverse = MarkdownDocSerializer(
doc=doc,
params=MarkdownParams(
image_mode=ImageRefMode.PLACEHOLDER,
image_placeholder="<!-- image -->",
traverse_pictures=False,
),
)
result_no_traverse = ser_no_traverse.serialize().text
# Should NOT contain text inside picture
assert "Text before picture" in result_no_traverse
assert "Text after picture" in result_no_traverse
assert "Text inside picture" not in result_no_traverse
assert "<!-- image -->" in result_no_traverse
# Test with traverse_pictures=True
ser_with_traverse = MarkdownDocSerializer(
doc=doc,
params=MarkdownParams(
image_mode=ImageRefMode.PLACEHOLDER,
image_placeholder="<!-- image -->",
traverse_pictures=True,
),
)
result_with_traverse = ser_with_traverse.serialize().text
# Should contain text inside picture
assert "Text before picture" in result_with_traverse
assert "Text after picture" in result_with_traverse
assert "Text inside picture" in result_with_traverse
assert "<!-- image -->" in result_with_traverse
# ===============================
# HTML tests
# ===============================
def test_html_charts():
src = Path("./test/data/doc/barchart.json")
doc = DoclingDocument.load_from_json(src)
ser = HTMLDocSerializer(
doc=doc,
params=HTMLParams(
image_mode=ImageRefMode.PLACEHOLDER,
),
)
actual = ser.serialize().text
verify(exp_file=src.with_suffix(".gt.html"), actual=actual)
def test_html_cross_page_list_page_break():
src = Path("./test/data/doc/activities.json")
doc = DoclingDocument.load_from_json(src)
ser = HTMLDocSerializer(
doc=doc,
params=HTMLParams(
image_mode=ImageRefMode.PLACEHOLDER,
),
)
actual = ser.serialize().text
verify(exp_file=src.with_suffix(".gt.html"), actual=actual)
def test_html_cross_page_list_page_break_p1():
src = Path("./test/data/doc/activities.json")
doc = DoclingDocument.load_from_json(src)
ser = HTMLDocSerializer(
doc=doc,
params=HTMLParams(
image_mode=ImageRefMode.PLACEHOLDER,
pages={1},
),
)
actual = ser.serialize().text
verify(exp_file=src.parent / f"{src.stem}_p1.gt.html", actual=actual)
def test_html_cross_page_list_page_break_p2():
src = Path("./test/data/doc/activities.json")
doc = DoclingDocument.load_from_json(src)
ser = HTMLDocSerializer(
doc=doc,
params=HTMLParams(
image_mode=ImageRefMode.PLACEHOLDER,
pages={2},
),
)
actual = ser.serialize().text
verify(exp_file=src.parent / f"{src.stem}_p2.gt.html", actual=actual)
def test_html_split_page():
src = Path("./test/data/doc/2408.09869v3_enriched.json")
doc = DoclingDocument.load_from_json(src)
ser = HTMLDocSerializer(
doc=doc,
params=HTMLParams(
image_mode=ImageRefMode.EMBEDDED,
output_style=HTMLOutputStyle.SPLIT_PAGE,
),
)
actual = ser.serialize().text
verify(exp_file=src.parent / f"{src.stem}_split.gt.html", actual=actual)
def test_html_split_page_p2():
src = Path("./test/data/doc/2408.09869v3_enriched.json")
doc = DoclingDocument.load_from_json(src)
ser = HTMLDocSerializer(
doc=doc,
params=HTMLParams(
image_mode=ImageRefMode.EMBEDDED,
output_style=HTMLOutputStyle.SPLIT_PAGE,
pages={2},
),
)
actual = ser.serialize().text
verify(exp_file=src.parent / f"{src.stem}_split_p2.gt.html", actual=actual)
def test_html_split_page_p2_with_visualizer():
src = Path("./test/data/doc/2408.09869v3_enriched.json")
doc = DoclingDocument.load_from_json(src)
ser = HTMLDocSerializer(
doc=doc,
params=HTMLParams(
image_mode=ImageRefMode.EMBEDDED,
output_style=HTMLOutputStyle.SPLIT_PAGE,
pages={2},
),
)
ser_res = ser.serialize(
visualizer=LayoutVisualizer(),
)
actual = ser_res.text
# pinning the result with visualizer appeared flaky, so at least ensure it contains
# a figure (for the page) and that it is different than without visualizer:
assert '<figure><img src="data:image/png;base64' in actual
file_without_viz = src.parent / f"{src.stem}_split_p2.gt.html"
with open(file_without_viz) as f:
data_without_viz = f.read()
assert actual.strip() != data_without_viz.strip()
def test_html_split_page_no_page_breaks():
src = Path("./test/data/doc/2408.09869_p1.json")
doc = DoclingDocument.load_from_json(src)
ser = HTMLDocSerializer(
doc=doc,
params=HTMLParams(
image_mode=ImageRefMode.EMBEDDED,
output_style=HTMLOutputStyle.SPLIT_PAGE,
),
)
actual = ser.serialize().text
verify(exp_file=src.parent / f"{src.stem}_split.gt.html", actual=actual)
def test_html_include_annotations_false():
src = Path("./test/data/doc/2408.09869v3_enriched.json")
doc = DoclingDocument.load_from_json(src)
ser = HTMLDocSerializer(
doc=doc,
params=HTMLParams(
image_mode=ImageRefMode.PLACEHOLDER,
include_annotations=False,
pages={1},
html_head="<head></head>", # keeping test output minimal
),
)
actual = ser.serialize().text
verify(
exp_file=src.parent / f"{src.stem}_p1_include_annotations_false.gt.html",
actual=actual,
)
def test_html_include_annotations_true():
src = Path("./test/data/doc/2408.09869v3_enriched.json")
doc = DoclingDocument.load_from_json(src)
ser = HTMLDocSerializer(
doc=doc,
params=HTMLParams(
image_mode=ImageRefMode.PLACEHOLDER,
include_annotations=True,
pages={1},
html_head="<head></head>", # keeping test output minimal
),
)
actual = ser.serialize().text
verify(
exp_file=src.parent / f"{src.stem}_p1_include_annotations_true.gt.html",
actual=actual,
)
def test_html_list_item_markers(sample_doc):
root_dir = Path("./test/data/doc")
for orig in [False, True]:
ser = HTMLDocSerializer(
doc=sample_doc,
params=HTMLParams(
show_original_list_item_marker=orig,
),
)
actual = ser.serialize().text
verify(
root_dir / f"constructed_orig_{str(orig).lower()}.gt.html",
actual=actual,
)
def test_html_nested_lists():
src = Path("./test/data/doc/polymers.json")
doc = DoclingDocument.load_from_json(src)
ser = HTMLDocSerializer(doc=doc)
actual = ser.serialize().text
verify(exp_file=src.with_suffix(".gt.html"), actual=actual)
def test_html_rich_table(rich_table_doc):
exp_file = Path("./test/data/doc/rich_table.gt.html")
ser = HTMLDocSerializer(doc=rich_table_doc)
actual = ser.serialize().text
verify(exp_file=exp_file, actual=actual)
def test_html_inline_and_formatting():
src = Path("./test/data/doc/inline_and_formatting.yaml")
doc = DoclingDocument.load_from_yaml(src)
ser = HTMLDocSerializer(doc=doc)
actual = ser.serialize().text
verify(exp_file=src.with_suffix(".gt.html"), actual=actual)
# ===============================
# WebVTT tests
# ===============================
@pytest.mark.parametrize(
"example_num",
[1, 2, 3, 4, 5],
)
def test_webvtt(example_num):
src = Path(f"test/data/doc/webvtt_example_{example_num:02d}.json")
doc = DoclingDocument.load_from_json(src)
ser = WebVTTDocSerializer(doc=doc)
actual = ser.serialize().text
verify(exp_file=src.with_suffix(".gt.vtt"), actual=actual)
def test_webvtt_params():
"""Test WebVTT serialization with WebVTTParams."""
src = Path("./test/data/doc/webvtt_example_01.json")
doc = DoclingDocument.load_from_json(src)
# Test with omit_hours_if_zero=True
ser = WebVTTDocSerializer(doc=doc, params=WebVTTParams(omit_hours_if_zero=True))
actual = ser.serialize().text
assert "00:11.000 --> 00:13.000" in actual
# Test with omit_voice_end=True
ser = WebVTTDocSerializer(doc=doc, params=WebVTTParams(omit_voice_end=True))
actual = ser.serialize().text
assert "</v>" not in actual
# Test with both parameters enabled
ser = WebVTTDocSerializer(
doc=doc,
params=WebVTTParams(omit_hours_if_zero=True, omit_voice_end=True)
)
actual = ser.serialize().text
assert "00:11.000 --> 00:13.000" in actual
assert "</v>" not in actual
ser_default = WebVTTDocSerializer(doc=doc, params=WebVTTParams())
actual_default = ser_default.serialize().text
assert len(actual) <= len(actual_default) or actual != actual_default