diff --git a/docling_core/transforms/serializer/markdown.py b/docling_core/transforms/serializer/markdown.py index ab7acfd7..aefa7d70 100644 --- a/docling_core/transforms/serializer/markdown.py +++ b/docling_core/transforms/serializer/markdown.py @@ -69,6 +69,68 @@ from docling_core.types.doc.document import ( ) +def _cell_content_has_table(item: NodeItem, doc: DoclingDocument) -> bool: + """Return True if *item* is, or has a descendant that is, a TableItem.""" + if isinstance(item, TableItem): + return True + elif isinstance(item, NodeItem): + for child_ref in item.children: + if _cell_content_has_table(child_ref.resolve(doc=doc), doc): + return True + return False + + +def _mark_subtree_visited( + item: NodeItem, + doc: DoclingDocument, + visited: set[str], +) -> None: + """Recursively add *item* and all its descendants to *visited*. + + When a nested table inside a RichTableCell is flattened, its items are + never passed through the normal serialize() path that would mark them + visited. Calling this keeps the visited set consistent so the document + serializer does not emit those items again at the top level. + """ + if isinstance(item, NodeItem): + visited.add(item.self_ref) + for child_ref in item.children: + _mark_subtree_visited(child_ref.resolve(doc=doc), doc, visited) + + +def _collect_subtree_text(item: NodeItem, doc: DoclingDocument) -> str: + """Collect all text from *item*'s subtree, flattening nested tables. + + Returns a space-joined string of every piece of text found so that the + content of a nested table is preserved in a flat, readable form. + + For TableItems the text is pulled from ``data.grid`` cells directly; + children are *not* recursed into because they duplicate the grid content + for RichTableCells. For all other items, ``.text`` is collected and + children are visited recursively. + """ + parts: list[str] = [] + + if isinstance(item, TableItem): + for row in item.data.grid: + for cell in row: + if cell.text: + parts.append(cell.text) + return " ".join(parts) + + if isinstance(item, TextItem) and item.text: + parts.append(item.text) + + if isinstance(item, NodeItem): + for child_ref in item.children: + child = child_ref.resolve(doc=doc) + child_text = _collect_subtree_text(child, doc) + if child_text: + parts.append(child_text) + + return " ".join(parts) + + class OrigListItemMarkerMode(str, Enum): """Display mode for original list item marker.""" @@ -429,6 +491,14 @@ class MarkdownTableSerializer(BaseTableSerializer): **kwargs: Any, ) -> SerializationResult: """Serializes the passed item.""" + if kwargs.get("_nested_in_table"): + visited: set[str] = kwargs.get("visited") or set() + _mark_subtree_visited(item, doc, visited) + return create_ser_result( + text=_collect_subtree_text(item, doc), + span_source=item, + ) + params = MarkdownParams(**kwargs) res_parts: list[SerializationResult] = [] @@ -448,19 +518,23 @@ class MarkdownTableSerializer(BaseTableSerializer): if ann_res.text: res_parts.append(ann_res) - rows = [ - [ - # make sure that md tables are not broken due to newline or pipe chars in the text - # TODO: escape pipe characters also in RichTableCell once nested tables are properly handled - ( - doc_serializer.serialize(item=col.ref.resolve(doc=doc), **kwargs).text.replace("\n", " ") - if isinstance(col, RichTableCell) - else col.text.replace("\n", " ").replace("|", "|") - ) - for col in row - ] - for row in item.data.grid - ] + rows = [] + for row in item.data.grid: + rendered_row = [] + for col in row: + if isinstance(col, RichTableCell): + ref_item = col.ref.resolve(doc=doc) + inner_kwargs = {**kwargs, "_nested_in_table": True} + cell_text = doc_serializer.serialize( + item=ref_item, + **inner_kwargs, + ).text + else: + cell_text = col.text or "" + # Newlines and pipes must be escaped in every cell so the + # markdown table stays valid. + rendered_row.append(cell_text.replace("\n", " ").replace("|", "|")) + rows.append(rendered_row) if len(rows) > 0: try: table_text = tabulate(rows[1:], headers=rows[0], tablefmt="github") diff --git a/test/data/doc/rich_table.gt.md b/test/data/doc/rich_table.gt.md index ba6973ac..824ba36c 100644 --- a/test/data/doc/rich_table.gt.md +++ b/test/data/doc/rich_table.gt.md @@ -1,8 +1,8 @@ # Rich tables -| cell 0,0 | cell 0,1 | -|--------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| cell 1,0 | *text in italic* | -| - list item 1 - list item 2 | cell 2,1 | -| cell 3,0 | | inner cell 0,0 | inner cell 0,1 | inner cell 0,2 | |------------------|------------------|------------------| | inner cell 1,0 | inner cell 1,1 | inner cell 1,2 | | -| Some text in a generic group. More text in the group. | cell 4,1 | +| cell 0,0 | cell 0,1 | +|--------------------------------------------------------|-------------------------------------------------------------------------------------------| +| cell 1,0 | *text in italic* | +| - list item 1 - list item 2 | cell 2,1 | +| cell 3,0 | inner cell 0,0 inner cell 0,1 inner cell 0,2 inner cell 1,0 inner cell 1,1 inner cell 1,2 | +| Some text in a generic group. More text in the group. | cell 4,1 | diff --git a/test/test_serialization.py b/test/test_serialization.py index 227b14bc..0307dc3b 100644 --- a/test/test_serialization.py +++ b/test/test_serialization.py @@ -1,5 +1,6 @@ """Test serialization.""" +import threading from pathlib import Path import pytest @@ -15,6 +16,7 @@ from docling_core.transforms.serializer.markdown import ( MarkdownParams, MarkdownTableSerializer, OrigListItemMarkerMode, + _cell_content_has_table, ) from docling_core.transforms.serializer.webvtt import WebVTTDocSerializer, WebVTTParams from docling_core.transforms.visualizer.layout_visualizer import LayoutVisualizer @@ -23,6 +25,7 @@ from docling_core.types.doc.document import ( DescriptionAnnotation, DoclingDocument, RefItem, + RichTableCell, TableCell, TableData, TextItem, @@ -335,6 +338,7 @@ def test_md_single_row_table(): actual = ser.serialize().text verify(exp_file=exp_file, actual=actual) + def test_md_pipe_in_table(): doc = DoclingDocument(name="Pipe in Table") table = doc.add_table(data=TableData(num_rows=1, num_cols=1)) @@ -347,12 +351,110 @@ def test_md_pipe_in_table(): start_col_offset_idx=0, end_col_offset_idx=1, text="Fruits | Veggies", - ) + ), ) ser = doc.export_to_markdown() assert ser == "| Fruits | Veggies |\n|-------------------------|" +def test_cell_content_has_table_detects_descendant_table(): + """Ensure nested tables are detected through non-table parent nodes.""" + doc = DoclingDocument(name="descendant_table") + wrapper = doc.add_group() + nested_table = doc.add_table(data=TableData(num_rows=1, num_cols=1), parent=wrapper) + doc.add_table_cell( + nested_table, + TableCell( + text="inner", + start_row_offset_idx=0, + end_row_offset_idx=1, + start_col_offset_idx=0, + end_col_offset_idx=1, + ), + ) + + assert _cell_content_has_table(wrapper, doc) + + +def _build_nested_rich_table_doc(depth: int) -> DoclingDocument: + """Build a document with `depth` levels of nested RichTableCell tables. + + Each level is a 1x2 table whose first cell is a RichTableCell referencing + the next-level table, and whose second cell is a plain TableCell. + This is the structure produced by the HTML backend for Wikipedia clade tables. + """ + doc = DoclingDocument(name="nested_tables") + + def _add_level(parent, remaining: int): + table = doc.add_table(data=TableData(num_rows=1, num_cols=2), parent=parent) + if remaining > 0: + nested = _add_level(table, remaining - 1) + rich_cell: TableCell = RichTableCell( + ref=nested.get_ref(), + text="rich", + start_row_offset_idx=0, + end_row_offset_idx=1, + start_col_offset_idx=0, + end_col_offset_idx=1, + ) + else: + rich_cell = TableCell( + text="leaf", + start_row_offset_idx=0, + end_row_offset_idx=1, + start_col_offset_idx=0, + end_col_offset_idx=1, + ) + doc.add_table_cell(table, rich_cell) + doc.add_table_cell( + table, + TableCell( + text="plain", + start_row_offset_idx=0, + end_row_offset_idx=1, + start_col_offset_idx=1, + end_col_offset_idx=2, + ), + ) + return table + + _add_level(doc.body, depth) + return doc + + +def test_md_nested_rich_table_no_hang(): + """Regression: export_to_markdown() must not hang on nested RichTableCells. + + When a RichTableCell's content contains a nested table, the + ``_nested_in_table`` flag passed through kwargs causes + MarkdownTableSerializer to flatten the inner table instead of + re-entering the full table serializer recursively. Without this + guard every level of nesting re-enters the table serializer, causing + exponential string growth and an indefinite hang. + """ + doc = _build_nested_rich_table_doc(depth=5) + + result: list[str] = [] + + def _run() -> None: + result.append(doc.export_to_markdown()) + + t = threading.Thread(target=_run, daemon=True) + t.start() + t.join(timeout=5.0) + + assert not t.is_alive(), "export_to_markdown() hung on a document with nested RichTableCells." + assert result, "export_to_markdown() produced no output" + + # The outer table must be a valid 2-column markdown table. + # Without the pipe-escaping fix, inner-table pipes would leak into the outer + # table and produce dozens of phantom columns. + table_rows = [line for line in result[0].splitlines() if line.startswith("|")] + assert table_rows, "Expected at least one markdown table row in output" + col_counts = {line.count("|") - 1 for line in table_rows} + assert col_counts == {2}, f"Outer table must have exactly 2 columns throughout; got column counts: {col_counts}" + + def test_md_compact_table(): """Test compact table format removes padding and uses minimal separators."""