mirror of
https://github.com/docling-project/docling-core.git
synced 2026-05-17 13:10:44 +00:00
fix: prevent hang in export_to_markdown() on nested RichTableCells (#525)
* fix: prevent hang in export_to_markdown() on nested RichTableCells
When a table cell contains another table (via RichTableCell.ref pointing
to a nested table item), MarkdownTableSerializer recursively invoked
itself through doc_serializer.serialize(). Each level of nesting caused
the outer tabulate() call to receive cell strings that were already
formatted markdown tables, which then got |-encoded and re-processed.
On documents with many such cells (e.g. Wikipedia pages with taxobox or
classification tables containing hyperlink-rich cells), this caused
exponential string growth, eventually exhausting memory or hanging.
Fix: add two module-level helpers:
_cell_content_has_table(item, doc) — walks the item subtree and
returns True if any node is a TableItem.
_mark_subtree_visited(item, doc, visited) — adds all nodes in the
subtree to the shared 'visited' set that the document serializer uses
to prevent double-emitting items.
In MarkdownTableSerializer.serialize(), when a RichTableCell's content
contains a nested table, call _mark_subtree_visited() on it and then use
col.text (the precomputed plain-text fallback) instead of calling
doc_serializer.serialize(). This avoids the recursive MarkdownTableSerializer
call entirely, and keeps the visited set consistent so the nested table is
not emitted again as a separate top-level item.
For RichTableCells whose content does NOT contain a nested table (e.g.
italic text, lists, hyperlinks), doc_serializer.serialize() is called as
before, so inline Markdown formatting is preserved.
Bug introduced in docling-core v2.46.0 (commit 1d04154) when RichTableCell
markdown serialization was first added.
Signed-off-by: Ivan Traus <ivan@liminary.io>
* style: apply ruff formatting to markdown serializer
Keep the branch green in local and CI pre-commit runs by applying Ruff's required formatting in the nested rich table serializer path.
Signed-off-by: Ivan Traus <ivan@liminary.io>
* test(markdown): cover descendant table detection in rich-cell helper
Signed-off-by: Ivan Traus <ivan@liminary.io>
* fix(markdown): preserve nested table content as flattened text
Instead of falling back to the opaque col.text placeholder when a
RichTableCell contains a nested table, walk the subtree and collect
text from all grid cells and doc items. This preserves the actual
inner table content in the markdown output while still avoiding the
recursive doc_serializer.serialize() call that caused exponential
string growth and the OOM hang.
Signed-off-by: Ivan Traus <ivan@liminary.io>
* fix(markdown): use TextItem instead of DocItem in subtree text collector
Signed-off-by: Ivan Traus <ivan@liminary.io>
* refactor(markdown): flatten only nested tables, preserve sibling rich formatting
Address review feedback: replace the all-or-nothing _cell_content_has_table
branch with a _nested_in_table flag propagated through kwargs, so only
MarkdownTableSerializer flattens when nested — sibling rich elements (italic,
lists, etc.) keep their markdown formatting. Also replace getattr with
isinstance(NodeItem) + dot notation and remove string annotations.
Signed-off-by: Ivan Traus <ivan@liminary.io>
---------
Signed-off-by: Ivan Traus <ivan@liminary.io>
This commit is contained in:
@@ -69,6 +69,68 @@ from docling_core.types.doc.document import (
|
||||
)
|
||||
|
||||
|
||||
def _cell_content_has_table(item: NodeItem, doc: DoclingDocument) -> bool:
|
||||
"""Return True if *item* is, or has a descendant that is, a TableItem."""
|
||||
if isinstance(item, TableItem):
|
||||
return True
|
||||
elif isinstance(item, NodeItem):
|
||||
for child_ref in item.children:
|
||||
if _cell_content_has_table(child_ref.resolve(doc=doc), doc):
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def _mark_subtree_visited(
|
||||
item: NodeItem,
|
||||
doc: DoclingDocument,
|
||||
visited: set[str],
|
||||
) -> None:
|
||||
"""Recursively add *item* and all its descendants to *visited*.
|
||||
|
||||
When a nested table inside a RichTableCell is flattened, its items are
|
||||
never passed through the normal serialize() path that would mark them
|
||||
visited. Calling this keeps the visited set consistent so the document
|
||||
serializer does not emit those items again at the top level.
|
||||
"""
|
||||
if isinstance(item, NodeItem):
|
||||
visited.add(item.self_ref)
|
||||
for child_ref in item.children:
|
||||
_mark_subtree_visited(child_ref.resolve(doc=doc), doc, visited)
|
||||
|
||||
|
||||
def _collect_subtree_text(item: NodeItem, doc: DoclingDocument) -> str:
|
||||
"""Collect all text from *item*'s subtree, flattening nested tables.
|
||||
|
||||
Returns a space-joined string of every piece of text found so that the
|
||||
content of a nested table is preserved in a flat, readable form.
|
||||
|
||||
For TableItems the text is pulled from ``data.grid`` cells directly;
|
||||
children are *not* recursed into because they duplicate the grid content
|
||||
for RichTableCells. For all other items, ``.text`` is collected and
|
||||
children are visited recursively.
|
||||
"""
|
||||
parts: list[str] = []
|
||||
|
||||
if isinstance(item, TableItem):
|
||||
for row in item.data.grid:
|
||||
for cell in row:
|
||||
if cell.text:
|
||||
parts.append(cell.text)
|
||||
return " ".join(parts)
|
||||
|
||||
if isinstance(item, TextItem) and item.text:
|
||||
parts.append(item.text)
|
||||
|
||||
if isinstance(item, NodeItem):
|
||||
for child_ref in item.children:
|
||||
child = child_ref.resolve(doc=doc)
|
||||
child_text = _collect_subtree_text(child, doc)
|
||||
if child_text:
|
||||
parts.append(child_text)
|
||||
|
||||
return " ".join(parts)
|
||||
|
||||
|
||||
class OrigListItemMarkerMode(str, Enum):
|
||||
"""Display mode for original list item marker."""
|
||||
|
||||
@@ -429,6 +491,14 @@ class MarkdownTableSerializer(BaseTableSerializer):
|
||||
**kwargs: Any,
|
||||
) -> SerializationResult:
|
||||
"""Serializes the passed item."""
|
||||
if kwargs.get("_nested_in_table"):
|
||||
visited: set[str] = kwargs.get("visited") or set()
|
||||
_mark_subtree_visited(item, doc, visited)
|
||||
return create_ser_result(
|
||||
text=_collect_subtree_text(item, doc),
|
||||
span_source=item,
|
||||
)
|
||||
|
||||
params = MarkdownParams(**kwargs)
|
||||
res_parts: list[SerializationResult] = []
|
||||
|
||||
@@ -448,19 +518,23 @@ class MarkdownTableSerializer(BaseTableSerializer):
|
||||
if ann_res.text:
|
||||
res_parts.append(ann_res)
|
||||
|
||||
rows = [
|
||||
[
|
||||
# make sure that md tables are not broken due to newline or pipe chars in the text
|
||||
# TODO: escape pipe characters also in RichTableCell once nested tables are properly handled
|
||||
(
|
||||
doc_serializer.serialize(item=col.ref.resolve(doc=doc), **kwargs).text.replace("\n", " ")
|
||||
if isinstance(col, RichTableCell)
|
||||
else col.text.replace("\n", " ").replace("|", "|")
|
||||
)
|
||||
for col in row
|
||||
]
|
||||
for row in item.data.grid
|
||||
]
|
||||
rows = []
|
||||
for row in item.data.grid:
|
||||
rendered_row = []
|
||||
for col in row:
|
||||
if isinstance(col, RichTableCell):
|
||||
ref_item = col.ref.resolve(doc=doc)
|
||||
inner_kwargs = {**kwargs, "_nested_in_table": True}
|
||||
cell_text = doc_serializer.serialize(
|
||||
item=ref_item,
|
||||
**inner_kwargs,
|
||||
).text
|
||||
else:
|
||||
cell_text = col.text or ""
|
||||
# Newlines and pipes must be escaped in every cell so the
|
||||
# markdown table stays valid.
|
||||
rendered_row.append(cell_text.replace("\n", " ").replace("|", "|"))
|
||||
rows.append(rendered_row)
|
||||
if len(rows) > 0:
|
||||
try:
|
||||
table_text = tabulate(rows[1:], headers=rows[0], tablefmt="github")
|
||||
|
||||
Vendored
+6
-6
@@ -1,8 +1,8 @@
|
||||
# Rich tables
|
||||
|
||||
| cell 0,0 | cell 0,1 |
|
||||
|--------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
||||
| cell 1,0 | *text in italic* |
|
||||
| - list item 1 - list item 2 | cell 2,1 |
|
||||
| cell 3,0 | | inner cell 0,0 | inner cell 0,1 | inner cell 0,2 | |------------------|------------------|------------------| | inner cell 1,0 | inner cell 1,1 | inner cell 1,2 | |
|
||||
| Some text in a generic group. More text in the group. | cell 4,1 |
|
||||
| cell 0,0 | cell 0,1 |
|
||||
|--------------------------------------------------------|-------------------------------------------------------------------------------------------|
|
||||
| cell 1,0 | *text in italic* |
|
||||
| - list item 1 - list item 2 | cell 2,1 |
|
||||
| cell 3,0 | inner cell 0,0 inner cell 0,1 inner cell 0,2 inner cell 1,0 inner cell 1,1 inner cell 1,2 |
|
||||
| Some text in a generic group. More text in the group. | cell 4,1 |
|
||||
|
||||
+103
-1
@@ -1,5 +1,6 @@
|
||||
"""Test serialization."""
|
||||
|
||||
import threading
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
@@ -15,6 +16,7 @@ from docling_core.transforms.serializer.markdown import (
|
||||
MarkdownParams,
|
||||
MarkdownTableSerializer,
|
||||
OrigListItemMarkerMode,
|
||||
_cell_content_has_table,
|
||||
)
|
||||
from docling_core.transforms.serializer.webvtt import WebVTTDocSerializer, WebVTTParams
|
||||
from docling_core.transforms.visualizer.layout_visualizer import LayoutVisualizer
|
||||
@@ -23,6 +25,7 @@ from docling_core.types.doc.document import (
|
||||
DescriptionAnnotation,
|
||||
DoclingDocument,
|
||||
RefItem,
|
||||
RichTableCell,
|
||||
TableCell,
|
||||
TableData,
|
||||
TextItem,
|
||||
@@ -335,6 +338,7 @@ def test_md_single_row_table():
|
||||
actual = ser.serialize().text
|
||||
verify(exp_file=exp_file, actual=actual)
|
||||
|
||||
|
||||
def test_md_pipe_in_table():
|
||||
doc = DoclingDocument(name="Pipe in Table")
|
||||
table = doc.add_table(data=TableData(num_rows=1, num_cols=1))
|
||||
@@ -347,12 +351,110 @@ def test_md_pipe_in_table():
|
||||
start_col_offset_idx=0,
|
||||
end_col_offset_idx=1,
|
||||
text="Fruits | Veggies",
|
||||
)
|
||||
),
|
||||
)
|
||||
ser = doc.export_to_markdown()
|
||||
assert ser == "| Fruits | Veggies |\n|-------------------------|"
|
||||
|
||||
|
||||
def test_cell_content_has_table_detects_descendant_table():
|
||||
"""Ensure nested tables are detected through non-table parent nodes."""
|
||||
doc = DoclingDocument(name="descendant_table")
|
||||
wrapper = doc.add_group()
|
||||
nested_table = doc.add_table(data=TableData(num_rows=1, num_cols=1), parent=wrapper)
|
||||
doc.add_table_cell(
|
||||
nested_table,
|
||||
TableCell(
|
||||
text="inner",
|
||||
start_row_offset_idx=0,
|
||||
end_row_offset_idx=1,
|
||||
start_col_offset_idx=0,
|
||||
end_col_offset_idx=1,
|
||||
),
|
||||
)
|
||||
|
||||
assert _cell_content_has_table(wrapper, doc)
|
||||
|
||||
|
||||
def _build_nested_rich_table_doc(depth: int) -> DoclingDocument:
|
||||
"""Build a document with `depth` levels of nested RichTableCell tables.
|
||||
|
||||
Each level is a 1x2 table whose first cell is a RichTableCell referencing
|
||||
the next-level table, and whose second cell is a plain TableCell.
|
||||
This is the structure produced by the HTML backend for Wikipedia clade tables.
|
||||
"""
|
||||
doc = DoclingDocument(name="nested_tables")
|
||||
|
||||
def _add_level(parent, remaining: int):
|
||||
table = doc.add_table(data=TableData(num_rows=1, num_cols=2), parent=parent)
|
||||
if remaining > 0:
|
||||
nested = _add_level(table, remaining - 1)
|
||||
rich_cell: TableCell = RichTableCell(
|
||||
ref=nested.get_ref(),
|
||||
text="rich",
|
||||
start_row_offset_idx=0,
|
||||
end_row_offset_idx=1,
|
||||
start_col_offset_idx=0,
|
||||
end_col_offset_idx=1,
|
||||
)
|
||||
else:
|
||||
rich_cell = TableCell(
|
||||
text="leaf",
|
||||
start_row_offset_idx=0,
|
||||
end_row_offset_idx=1,
|
||||
start_col_offset_idx=0,
|
||||
end_col_offset_idx=1,
|
||||
)
|
||||
doc.add_table_cell(table, rich_cell)
|
||||
doc.add_table_cell(
|
||||
table,
|
||||
TableCell(
|
||||
text="plain",
|
||||
start_row_offset_idx=0,
|
||||
end_row_offset_idx=1,
|
||||
start_col_offset_idx=1,
|
||||
end_col_offset_idx=2,
|
||||
),
|
||||
)
|
||||
return table
|
||||
|
||||
_add_level(doc.body, depth)
|
||||
return doc
|
||||
|
||||
|
||||
def test_md_nested_rich_table_no_hang():
|
||||
"""Regression: export_to_markdown() must not hang on nested RichTableCells.
|
||||
|
||||
When a RichTableCell's content contains a nested table, the
|
||||
``_nested_in_table`` flag passed through kwargs causes
|
||||
MarkdownTableSerializer to flatten the inner table instead of
|
||||
re-entering the full table serializer recursively. Without this
|
||||
guard every level of nesting re-enters the table serializer, causing
|
||||
exponential string growth and an indefinite hang.
|
||||
"""
|
||||
doc = _build_nested_rich_table_doc(depth=5)
|
||||
|
||||
result: list[str] = []
|
||||
|
||||
def _run() -> None:
|
||||
result.append(doc.export_to_markdown())
|
||||
|
||||
t = threading.Thread(target=_run, daemon=True)
|
||||
t.start()
|
||||
t.join(timeout=5.0)
|
||||
|
||||
assert not t.is_alive(), "export_to_markdown() hung on a document with nested RichTableCells."
|
||||
assert result, "export_to_markdown() produced no output"
|
||||
|
||||
# The outer table must be a valid 2-column markdown table.
|
||||
# Without the pipe-escaping fix, inner-table pipes would leak into the outer
|
||||
# table and produce dozens of phantom columns.
|
||||
table_rows = [line for line in result[0].splitlines() if line.startswith("|")]
|
||||
assert table_rows, "Expected at least one markdown table row in output"
|
||||
col_counts = {line.count("|") - 1 for line in table_rows}
|
||||
assert col_counts == {2}, f"Outer table must have exactly 2 columns throughout; got column counts: {col_counts}"
|
||||
|
||||
|
||||
def test_md_compact_table():
|
||||
"""Test compact table format removes padding and uses minimal separators."""
|
||||
|
||||
|
||||
Reference in New Issue
Block a user