feat: add rich table cells (#368)

* feat: add rich table cells

Signed-off-by: Panos Vagenas <pva@zurich.ibm.com>

* propagate cell text resolution, cover row deletions

Signed-off-by: Panos Vagenas <pva@zurich.ibm.com>

* add doctags, fix referential integrity, expand tests, reenable mypy

Signed-off-by: Panos Vagenas <pva@zurich.ibm.com>

* bump DoclingDocument version

Signed-off-by: Panos Vagenas <pva@zurich.ibm.com>

* simplify / remove serialize_cell

Signed-off-by: Panos Vagenas <pva@zurich.ibm.com>

* update rich table cell refs in doc indexing

Signed-off-by: Panos Vagenas <pva@zurich.ibm.com>

* update notebook

Signed-off-by: Panos Vagenas <pva@zurich.ibm.com>

* expose new classes in `docling_core.types.doc`

Signed-off-by: Panos Vagenas <pva@zurich.ibm.com>

---------

Signed-off-by: Panos Vagenas <pva@zurich.ibm.com>
This commit is contained in:
Panos Vagenas
2025-08-27 10:07:47 +02:00
committed by GitHub
parent e9d4bd944c
commit 1d04154378
49 changed files with 2728 additions and 58 deletions
@@ -359,6 +359,7 @@ class DocSerializer(BaseModel, BaseDocSerializer):
item=item,
doc_serializer=self,
doc=self.doc,
visited=my_visited,
**my_kwargs,
)
elif isinstance(item, PictureItem):
@@ -157,6 +157,7 @@ class DocTagsTableSerializer(BaseTableSerializer):
item: TableItem,
doc_serializer: BaseDocSerializer,
doc: DoclingDocument,
visited: Optional[set[str]] = None,
**kwargs: Any,
) -> SerializationResult:
"""Serializes the passed item."""
@@ -179,6 +180,7 @@ class DocTagsTableSerializer(BaseTableSerializer):
add_cell_text=params.add_table_cell_text,
xsize=params.xsize,
ysize=params.ysize,
visited=visited,
)
res_parts.append(create_ser_result(text=otsl_text, span_source=item))
+13 -2
View File
@@ -65,6 +65,7 @@ from docling_core.types.doc.document import (
PictureItem,
PictureMoleculeData,
PictureTabularChartData,
RichTableCell,
SectionHeaderItem,
TableCell,
TableItem,
@@ -356,6 +357,7 @@ class HTMLTableSerializer(BaseTableSerializer):
if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
body = ""
span_source: Union[DocItem, list[SerializationResult]] = []
for i in range(nrows):
body += "<tr>"
@@ -376,7 +378,16 @@ class HTMLTableSerializer(BaseTableSerializer):
if colstart != j:
continue
content = html.escape(cell.text.strip())
if isinstance(cell, RichTableCell):
ser_res = doc_serializer.serialize(
item=cell.ref.resolve(doc=doc), **kwargs
)
content = ser_res.text
span_source = [ser_res]
else:
content = html.escape(cell.text.strip())
span_source = item
celltag = "td"
if cell.column_header or cell.row_header or cell.row_section:
celltag = "th"
@@ -396,7 +407,7 @@ class HTMLTableSerializer(BaseTableSerializer):
if body:
body = f"<tbody>{body}</tbody>"
res_parts.append(create_ser_result(text=body, span_source=item))
res_parts.append(create_ser_result(text=body, span_source=span_source))
text_res = "".join([r.text for r in res_parts])
text_res = f"<table>{text_res}</table>" if text_res else ""
@@ -55,6 +55,7 @@ from docling_core.types.doc.document import (
PictureItem,
PictureMoleculeData,
PictureTabularChartData,
RichTableCell,
SectionHeaderItem,
TableItem,
TextItem,
@@ -320,7 +321,13 @@ class MarkdownTableSerializer(BaseTableSerializer):
[
# make sure that md tables are not broken
# due to newline chars in the text
col.text.replace("\n", " ")
(
doc_serializer.serialize(
item=col.ref.resolve(doc=doc), **kwargs
).text
if isinstance(col, RichTableCell)
else col.text
).replace("\n", " ")
for col in row
]
for row in item.data.grid
+2
View File
@@ -7,6 +7,7 @@
from .base import BoundingBox, CoordOrigin, ImageRefMode, Size
from .document import (
AnyTableCell,
BaseAnnotation,
ChartBar,
ChartLine,
@@ -52,6 +53,7 @@ from .document import (
PictureTabularChartData,
ProvenanceItem,
RefItem,
RichTableCell,
Script,
SectionHeaderItem,
TableCell,
+113 -21
View File
@@ -34,7 +34,7 @@ from pydantic import (
validate_call,
)
from tabulate import tabulate
from typing_extensions import Annotated, Self, deprecated
from typing_extensions import Annotated, Self, deprecated, override
from docling_core.search.package import VERSION_PATTERN
from docling_core.types.base import _JSON_POINTER_REGEX
@@ -60,7 +60,7 @@ _logger = logging.getLogger(__name__)
Uint64 = typing.Annotated[int, Field(ge=0, le=(2**64 - 1))]
LevelNumber = typing.Annotated[int, Field(ge=1, le=100)]
CURRENT_VERSION: Final = "1.5.0"
CURRENT_VERSION: Final = "1.6.0"
DEFAULT_EXPORT_LABELS = {
DocItemLabel.TITLE,
@@ -325,7 +325,7 @@ class TableCell(BaseModel):
in data
):
return data
text = data["bbox"].get("token", "")
text = data.get("bbox", {}).get("token", "")
if not len(text):
text_cells = data.pop("text_cell_bboxes", None)
if text_cells:
@@ -337,11 +337,37 @@ class TableCell(BaseModel):
return data
# Text accessor for a plain cell: returns the stored `text` as-is.
# `doc` and `kwargs` are accepted but not used in this body; RichTableCell
# overrides this method and does use them.
def _get_text(self, doc: Optional["DoclingDocument"] = None, **kwargs: Any) -> str:
return self.text
class RichTableCell(TableCell):
    """RichTableCell."""

    # Reference to the document item that holds this cell's content; it is
    # resolved against a document whenever the cell text is requested.
    ref: "RefItem"

    @override
    def _get_text(self, doc: Optional["DoclingDocument"] = None, **kwargs: Any) -> str:
        """Return the cell text by serializing the referenced item.

        :param doc: document used to resolve ``ref``; when it is ``None`` a
            fixed placeholder is returned instead.
        :param kwargs: forwarded unchanged to the serializer's ``serialize`` call.
        :returns: the serialized text of the referenced item, or
            ``"<!-- rich cell -->"`` when no document was supplied.
        """
        # Kept as a function-level import, exactly as before
        # (presumably to avoid a circular import -- confirm before hoisting).
        from docling_core.transforms.serializer.markdown import MarkdownDocSerializer

        # Without a document the reference cannot be resolved.
        if doc is None:
            return "<!-- rich cell -->"

        md_serializer = MarkdownDocSerializer(doc=doc)
        rendered = md_serializer.serialize(item=self.ref.resolve(doc=doc), **kwargs)
        return rendered.text
# Cell type accepted in `TableData.table_cells`: either a rich or a plain cell.
# With `union_mode="left_to_right"` pydantic tries the union members in the
# listed order, so `RichTableCell` (which additionally requires `ref`) is
# attempted first and `TableCell` is the fallback.
AnyTableCell = Annotated[
Union[RichTableCell, TableCell],
Field(union_mode="left_to_right"),
]
class TableData(BaseModel): # TBD
"""BaseTableData."""
table_cells: List[TableCell] = []
table_cells: List[AnyTableCell] = []
num_rows: int = 0
num_cols: int = 0
@@ -380,7 +406,9 @@ class TableData(BaseModel): # TBD
return table_data
def remove_rows(self, indices: List[int]) -> List[List[TableCell]]:
def remove_rows(
self, indices: List[int], doc: Optional["DoclingDocument"] = None
) -> List[List[TableCell]]:
"""Remove rows from the table by their indices.
:param indices: List[int]: A list of indices of the rows to remove. (Starting from 0)
@@ -392,6 +420,7 @@ class TableData(BaseModel): # TBD
indices = sorted(indices, reverse=True)
refs_to_remove = []
all_removed_cells = []
for row_index in indices:
if row_index < 0 or row_index >= self.num_rows:
@@ -403,6 +432,10 @@ class TableData(BaseModel): # TBD
end_idx = start_idx + self.num_cols
removed_cells = self.table_cells[start_idx:end_idx]
for cell in removed_cells:
if isinstance(cell, RichTableCell):
refs_to_remove.append(cell.ref)
# Remove the cells from the table
self.table_cells = self.table_cells[:start_idx] + self.table_cells[end_idx:]
@@ -417,9 +450,18 @@ class TableData(BaseModel): # TBD
all_removed_cells.append(removed_cells)
if refs_to_remove:
if doc is None:
_logger.warning(
"When table contains rich cells, `doc` argument must be provided, "
"otherwise rich cell content will be left dangling."
)
else:
doc._delete_items(refs_to_remove)
return all_removed_cells
def pop_row(self) -> List[TableCell]:
def pop_row(self, doc: Optional["DoclingDocument"] = None) -> List[TableCell]:
"""Remove and return the last row from the table.
:returns: List[TableCell]: A list of TableCell objects representing the popped row.
@@ -427,16 +469,18 @@ class TableData(BaseModel): # TBD
if self.num_rows == 0:
raise IndexError("Cannot pop from an empty table.")
return self.remove_row(self.num_rows - 1)
return self.remove_row(self.num_rows - 1, doc=doc)
def remove_row(self, row_index: int) -> List[TableCell]:
def remove_row(
self, row_index: int, doc: Optional["DoclingDocument"] = None
) -> List[TableCell]:
"""Remove a row from the table by its index.
:param row_index: int: The index of the row to remove. (Starting from 0)
:returns: List[TableCell]: A list of TableCell objects representing the removed row.
"""
return self.remove_rows([row_index])[0]
return self.remove_rows([row_index], doc=doc)[0]
def insert_rows(
self, row_index: int, rows: List[List[str]], after: bool = False
@@ -1509,8 +1553,15 @@ class TableItem(FloatingItem):
annotations: List[TableAnnotationType] = []
def export_to_dataframe(self) -> pd.DataFrame:
def export_to_dataframe(
self, doc: Optional["DoclingDocument"] = None
) -> pd.DataFrame:
"""Export the table as a Pandas DataFrame."""
if doc is None:
_logger.warning(
"Usage of TableItem.export_to_dataframe() without `doc` argument is deprecated."
)
if self.data.num_rows == 0 or self.data.num_cols == 0:
return pd.DataFrame()
@@ -1539,14 +1590,15 @@ class TableItem(FloatingItem):
columns = ["" for _ in range(self.data.num_cols)]
for i in range(num_headers):
for j, cell in enumerate(self.data.grid[i]):
col_name = cell.text
col_name = cell._get_text(doc=doc)
if columns[j] != "":
col_name = f".{col_name}"
columns[j] += col_name
# Create table data
table_data = [
[cell.text for cell in row] for row in self.data.grid[num_headers:]
[cell._get_text(doc=doc) for cell in row]
for row in self.data.grid[num_headers:]
]
# Create DataFrame
@@ -1577,7 +1629,7 @@ class TableItem(FloatingItem):
# make sure that md tables are not broken
# due to newline chars in the text
text = col.text
text = col._get_text(doc=doc)
text = text.replace("\n", " ")
tmp.append(text)
@@ -1623,6 +1675,7 @@ class TableItem(FloatingItem):
add_cell_text: bool = True,
xsize: int = 500,
ysize: int = 500,
**kwargs: Any,
) -> str:
"""Export the table as OTSL."""
# Possible OTSL tokens...
@@ -1652,7 +1705,7 @@ class TableItem(FloatingItem):
for i in range(nrows):
for j in range(ncols):
cell: TableCell = self.data.grid[i][j]
content = cell.text.strip()
content = cell._get_text(doc=doc, **kwargs).strip()
rowspan, rowstart = (
cell.row_span,
cell.start_row_offset_idx,
@@ -2304,6 +2357,15 @@ class DoclingDocument(BaseModel):
refs_to_be_deleted=refs_to_be_deleted,
lookup=lookup,
)
if isinstance(node, TableItem):
for cell in node.data.table_cells:
if isinstance(cell, RichTableCell):
path = cell.ref._split_ref_to_path()
cell.ref = self._update_ref_with_lookup(
item_label=path[1],
item_index=int(path[2]),
lookup=lookup,
)
# Update the self_ref reference
if node.parent is not None:
@@ -3945,16 +4007,22 @@ class DoclingDocument(BaseModel):
"""num_pages."""
return len(self.pages.values())
def validate_tree(self, root) -> bool:
def validate_tree(self, root: NodeItem) -> bool:
"""validate_tree."""
res = []
for child_ref in root.children:
child = child_ref.resolve(self)
if child.parent.resolve(self) != root:
if child.parent.resolve(self) != root or not self.validate_tree(child):
return False
res.append(self.validate_tree(child))
return all(res) or len(res) == 0
if isinstance(root, TableItem):
for cell in root.data.table_cells:
if isinstance(cell, RichTableCell) and (
(par_ref := cell.ref.resolve(self).parent) is None
or par_ref.resolve(self) != root
):
return False
return True
def iterate_items(
self,
@@ -3963,7 +4031,7 @@ class DoclingDocument(BaseModel):
traverse_pictures: bool = False,
page_no: Optional[int] = None,
included_content_layers: Optional[set[ContentLayer]] = None,
_level: int = 0, # fixed parameter, carries through the node nesting level
_level: int = 0, # deprecated
) -> typing.Iterable[Tuple[NodeItem, int]]: # tuple of node and level
"""Iterate elements with level."""
for item, stack in self._iterate_items_with_stack(
@@ -5324,7 +5392,9 @@ class DoclingDocument(BaseModel):
grid.append([])
for j, cell in enumerate(row):
if j < 10:
text = get_text(text=cell.text, max_text_len=16)
text = get_text(
cell._get_text(doc=self), max_text_len=16
)
grid[-1].append(text)
result.append("\n" + tabulate(grid) + "\n")
@@ -5588,6 +5658,16 @@ class DoclingDocument(BaseModel):
)
break
# update rich table cells references:
if isinstance(parent_item, TableItem):
for cell in parent_item.data.table_cells:
if (
isinstance(cell, RichTableCell)
and cell.ref.cref == item.self_ref
):
cell.ref.cref = new_cref
break
elif num_components == 2 and path_components[1] == "body":
parent_item = self._body
else:
@@ -5676,6 +5756,18 @@ class DoclingDocument(BaseModel):
elif isinstance(item, ListItem):
validate_list_item(self, item)
def add_table_cell(self, table_item: TableItem, cell: TableCell) -> None:
    """Append ``cell`` to the cells of ``table_item``.

    For a ``RichTableCell`` the referenced item is resolved against this
    document first; if it is a ``NodeItem`` it must already have
    ``table_item`` as its parent.

    :param table_item: table whose ``data.table_cells`` list receives the cell.
    :param cell: plain or rich cell to append.
    :raises ValueError: if a rich cell references a node that has no parent
        or whose parent is not ``table_item``.
    """
    if isinstance(cell, RichTableCell):
        target = cell.ref.resolve(doc=self)
        # Reject the cell unless the referenced node hangs under this table.
        if isinstance(target, NodeItem) and not (
            target.parent and target.parent.cref == table_item.self_ref
        ):
            raise ValueError(
                f"Trying to add cell with another parent {target.parent} to {table_item.self_ref}"
            )
    table_item.data.table_cells.append(cell)
# deprecated aliases (kept for backwards compatibility):
BasePictureData = BaseAnnotation
+1 -1
View File
@@ -252,7 +252,7 @@ def docling_document_to_legacy(doc: DoclingDocument, fallback_filaname: str = "f
spans = list(_make_spans(cell, item))
table_data[i][j] = GlmTableCell(
text=cell.text,
text=cell._get_text(doc=doc),
bbox=(
cell.bbox.as_tuple()
if cell.bbox is not None
+83 -2
View File
@@ -1721,6 +1721,80 @@
"title": "RefItem",
"type": "object"
},
"RichTableCell": {
"description": "RichTableCell.",
"properties": {
"bbox": {
"anyOf": [
{
"$ref": "#/$defs/BoundingBox"
},
{
"type": "null"
}
],
"default": null
},
"row_span": {
"default": 1,
"title": "Row Span",
"type": "integer"
},
"col_span": {
"default": 1,
"title": "Col Span",
"type": "integer"
},
"start_row_offset_idx": {
"title": "Start Row Offset Idx",
"type": "integer"
},
"end_row_offset_idx": {
"title": "End Row Offset Idx",
"type": "integer"
},
"start_col_offset_idx": {
"title": "Start Col Offset Idx",
"type": "integer"
},
"end_col_offset_idx": {
"title": "End Col Offset Idx",
"type": "integer"
},
"text": {
"title": "Text",
"type": "string"
},
"column_header": {
"default": false,
"title": "Column Header",
"type": "boolean"
},
"row_header": {
"default": false,
"title": "Row Header",
"type": "boolean"
},
"row_section": {
"default": false,
"title": "Row Section",
"type": "boolean"
},
"ref": {
"$ref": "#/$defs/RefItem"
}
},
"required": [
"start_row_offset_idx",
"end_row_offset_idx",
"start_col_offset_idx",
"end_col_offset_idx",
"text",
"ref"
],
"title": "RichTableCell",
"type": "object"
},
"Script": {
"description": "Text script position.",
"enum": [
@@ -1923,7 +1997,14 @@
"table_cells": {
"default": [],
"items": {
"$ref": "#/$defs/TableCell"
"anyOf": [
{
"$ref": "#/$defs/RichTableCell"
},
{
"$ref": "#/$defs/TableCell"
}
]
},
"title": "Table Cells",
"type": "array"
@@ -2264,7 +2345,7 @@
"type": "string"
},
"version": {
"default": "1.5.0",
"default": "1.6.0",
"pattern": "^(?P<major>0|[1-9]\\d*)\\.(?P<minor>0|[1-9]\\d*)\\.(?P<patch>0|[1-9]\\d*)(?:-(?P<prerelease>(?:0|[1-9]\\d*|\\d*[a-zA-Z-][0-9a-zA-Z-]*)(?:\\.(?:0|[1-9]\\d*|\\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?(?:\\+(?P<buildmetadata>[0-9a-zA-Z-]+(?:\\.[0-9a-zA-Z-]+)*))?$",
"title": "Version",
"type": "string"
+595
View File
@@ -0,0 +1,595 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "81ee4096",
"metadata": {},
"outputs": [],
"source": [
"from docling_core.types.doc import DoclingDocument, TableData, TableCell, RichTableCell, DocItemLabel\n",
"\n",
"doc = DoclingDocument(name=\"\")\n",
"doc.add_text(label=DocItemLabel.TITLE, text=\"Rich tables\")\n",
"\n",
"table_item = doc.add_table(\n",
" data=TableData(\n",
" num_rows=3,\n",
" num_cols=2,\n",
" ),\n",
")\n",
"\n",
"rich_item = doc.add_list_group(parent=table_item)\n",
"doc.add_list_item(parent=rich_item, text=\"list item 1\")\n",
"doc.add_list_item(parent=rich_item, text=\"list item 2\")\n",
"\n",
"for i in range(table_item.data.num_rows):\n",
" for j in range(table_item.data.num_cols):\n",
" if i == 1 and j == 1:\n",
" cell = RichTableCell(\n",
" start_row_offset_idx=i,\n",
" end_row_offset_idx=i + 1,\n",
" start_col_offset_idx=j,\n",
" end_col_offset_idx=j + 1,\n",
" ref=rich_item.get_ref(),\n",
" )\n",
" else:\n",
" cell = TableCell(\n",
" start_row_offset_idx=i,\n",
" end_row_offset_idx=i + 1,\n",
" start_col_offset_idx=j,\n",
" end_col_offset_idx=j + 1,\n",
" text=f\"cell {i},{j}\",\n",
" )\n",
" doc.add_table_cell(table_item=table_item, cell=cell)"
]
},
{
"cell_type": "markdown",
"id": "7ef93338",
"metadata": {},
"source": [
"## Doc-level exports"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "ba4d71d9",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"# Rich tables\n",
"\n",
"| cell 0,0 | cell 0,1 |\n",
"|------------|-----------------------------|\n",
"| cell 1,0 | - list item 1 - list item 2 |\n",
"| cell 2,0 | cell 2,1 |\n"
]
}
],
"source": [
"print(doc.export_to_markdown())"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "8a528e15",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"em;\n",
" }\n",
"</style>\n",
"</head>\n",
"<body>\n",
"<div class='page'>\n",
"<h1>Rich tables</h1>\n",
"<table><tbody><tr><td>cell 0,0</td><td>cell 0,1</td></tr><tr><td>cell 1,0</td><td><ul>\n",
"<li>list item 1</li>\n",
"<li>list item 2</li>\n",
"</ul></td></tr><tr><td>cell 2,0</td><td>cell 2,1</td></tr></tbody></table>\n",
"</div>\n",
"</body>\n",
"</html>\n"
]
}
],
"source": [
"print(doc.export_to_html()[-300:])"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "e9ddfa73",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<doctag><title>Rich tables</title>\n",
"<otsl><fcel>cell 0,0<fcel>cell 0,1<nl><fcel>cell 1,0<fcel>- list item 1\n",
"- list item 2<nl><fcel>cell 2,0<fcel>cell 2,1<nl></otsl>\n",
"</doctag>\n"
]
}
],
"source": [
"print(doc.export_to_doctags())"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "04b08710",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"item-0 at level 0: unspecified: group _root_\n",
" item-1 at level 1: title: Rich tables\n",
" item-2 at level 1: table with [3x2]\n",
" item-3 at level 2: list: group group\n",
" item-4 at level 3: list_item: list item 1\n",
" item-5 at level 3: list_item: list item 2\n"
]
}
],
"source": [
"print(doc._export_to_indented_text())"
]
},
{
"cell_type": "markdown",
"id": "42efa550",
"metadata": {},
"source": [
"## Item-level exports"
]
},
{
"cell_type": "markdown",
"id": "4b1de5bf",
"metadata": {},
"source": [
"### With document reference"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "5fea4de1",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"| cell 0,0 | cell 0,1 |\n",
"|------------|-----------------------------|\n",
"| cell 1,0 | - list item 1 - list item 2 |\n",
"| cell 2,0 | cell 2,1 |\n"
]
}
],
"source": [
"print(doc.tables[0].export_to_markdown(doc=doc))"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "c6d8dec4",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<table><tbody><tr><td>cell 0,0</td><td>cell 0,1</td></tr><tr><td>cell 1,0</td><td><ul>\n",
"<li>list item 1</li>\n",
"<li>list item 2</li>\n",
"</ul></td></tr><tr><td>cell 2,0</td><td>cell 2,1</td></tr></tbody></table>\n"
]
}
],
"source": [
"print(doc.tables[0].export_to_html(doc=doc))"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "5aaecb64",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<otsl><fcel>cell 0,0<fcel>cell 0,1<nl><fcel>cell 1,0<fcel>- list item 1\n",
"- list item 2<nl><fcel>cell 2,0<fcel>cell 2,1<nl></otsl>\n"
]
}
],
"source": [
"print(doc.tables[0].export_to_doctags(doc=doc))\n"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "6092fbb6",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<fcel>cell 0,0<fcel>cell 0,1<nl><fcel>cell 1,0<fcel>- list item 1\n",
"- list item 2<nl><fcel>cell 2,0<fcel>cell 2,1<nl>\n"
]
}
],
"source": [
"print(doc.tables[0].export_to_otsl(doc=doc))"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "46c2de7e",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>0</th>\n",
" <th>1</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>cell 0,0</td>\n",
" <td>cell 0,1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>cell 1,0</td>\n",
" <td>- list item 1\\n- list item 2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>cell 2,0</td>\n",
" <td>cell 2,1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" 0 1\n",
"0 cell 0,0 cell 0,1\n",
"1 cell 1,0 - list item 1\\n- list item 2\n",
"2 cell 2,0 cell 2,1"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"display(doc.tables[0].export_to_dataframe(doc=doc))"
]
},
{
"cell_type": "markdown",
"id": "d8e7ba5b",
"metadata": {},
"source": [
"### Without document reference"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "5098f6a5",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Usage of TableItem.export_to_markdown() without `doc` argument is deprecated.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"| cell 0,0 | cell 0,1 |\n",
"|------------|--------------------|\n",
"| cell 1,0 | <!-- rich cell --> |\n",
"| cell 2,0 | cell 2,1 |\n"
]
}
],
"source": [
"print(doc.tables[0].export_to_markdown())"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "6d1c43e5",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Usage of TableItem.export_to_html() without `doc` argument is deprecated.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"print(doc.tables[0].export_to_html())"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "89c9a3c4",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Usage of TableItem.export_to_dataframe() without `doc` argument is deprecated.\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>0</th>\n",
" <th>1</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>cell 0,0</td>\n",
" <td>cell 0,1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>cell 1,0</td>\n",
" <td>&lt;!-- rich cell --&gt;</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>cell 2,0</td>\n",
" <td>cell 2,1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" 0 1\n",
"0 cell 0,0 cell 0,1\n",
"1 cell 1,0 <!-- rich cell -->\n",
"2 cell 2,0 cell 2,1"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"display(doc.tables[0].export_to_dataframe())"
]
},
{
"cell_type": "markdown",
"id": "32db55c8",
"metadata": {},
"source": [
"## Row operations"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "7c419c46",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"# Rich tables\n",
"\n",
"| cell 0,0 | cell 0,1 |\n",
"|------------|-----------------------------|\n",
"| foo | bar |\n",
"| cell 1,0 | - list item 1 - list item 2 |\n",
"| cell 2,0 | cell 2,1 |\n"
]
}
],
"source": [
"doc.tables[0].data.insert_row(\n",
" row_index=1,\n",
" after=False,\n",
" row=[\"foo\", \"bar\"],\n",
")\n",
"print(doc.export_to_markdown())"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "7a56d5fa",
"metadata": {},
"outputs": [],
"source": [
"backup_doc = doc.model_copy(deep=True)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "31fcb3f7",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"When table contains rich cells, `doc` argument must be provided, otherwise rich cell content will be left dangling.\n"
]
}
],
"source": [
"_ = doc.tables[0].data.remove_row(row_index=2)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "cb5827fd",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"# Rich tables\n",
"\n",
"| cell 0,0 | cell 0,1 |\n",
"|------------|------------|\n",
"| foo | bar |\n",
"| cell 2,0 | cell 2,1 |\n",
"\n",
"- list item 1\n",
"- list item 2\n"
]
}
],
"source": [
"print(doc.export_to_markdown())"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "4b3f75ba",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"# Rich tables\n",
"\n",
"| cell 0,0 | cell 0,1 |\n",
"|------------|------------|\n",
"| foo | bar |\n",
"| cell 2,0 | cell 2,1 |\n"
]
}
],
"source": [
"backup_doc.tables[0].data.remove_row(row_index=2, doc=backup_doc)\n",
"\n",
"print(backup_doc.export_to_markdown())"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4c77b2c9",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
+1 -1
View File
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.5.0",
"version": "1.6.0",
"name": "Document",
"furniture": {
"self_ref": "#/furniture",
+1 -1
View File
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.5.0",
"version": "1.6.0",
"name": "Document",
"furniture": {
"self_ref": "#/furniture",
+1 -1
View File
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.5.0",
"version": "1.6.0",
"name": "2501.17887v1 + Untitled 1 + 2311.18481v1",
"furniture": {
"self_ref": "#/furniture",
+1 -1
View File
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.5.0",
"version": "1.6.0",
"name": "Untitled 1",
"furniture": {
"self_ref": "#/furniture",
+1 -1
View File
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.5.0",
"version": "1.6.0",
"name": "Untitled 1",
"furniture": {
"self_ref": "#/furniture",
+1 -1
View File
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.5.0",
"version": "1.6.0",
"name": "Untitled 1",
"furniture": {
"self_ref": "#/furniture",
+1 -1
View File
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.5.0",
"version": "1.6.0",
"name": "Untitled 1",
"furniture": {
"self_ref": "#/furniture",
+1 -1
View File
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.5.0",
"version": "1.6.0",
"name": "Untitled 1",
"furniture": {
"self_ref": "#/furniture",
+1 -1
View File
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.5.0",
"version": "1.6.0",
"name": "Untitled 1",
"furniture": {
"self_ref": "#/furniture",
+1 -1
View File
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.5.0",
"version": "1.6.0",
"name": "Untitled 1",
"furniture": {
"self_ref": "#/furniture",
+1 -1
View File
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.5.0",
"version": "1.6.0",
"name": "Untitled 1",
"furniture": {
"self_ref": "#/furniture",
+1 -1
View File
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.5.0",
"version": "1.6.0",
"name": "Untitled 1",
"furniture": {
"self_ref": "#/furniture",
+1 -1
View File
@@ -1097,4 +1097,4 @@ texts:
prov: []
self_ref: '#/texts/55'
text: The end.
version: 1.5.0
version: 1.6.0
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.5.0",
"version": "1.6.0",
"name": "Untitled 1",
"furniture": {
"self_ref": "#/furniture",
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.5.0",
"version": "1.6.0",
"name": "Untitled 1",
"furniture": {
"self_ref": "#/furniture",
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.5.0",
"version": "1.6.0",
"name": "Untitled 1",
"furniture": {
"self_ref": "#/furniture",
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.5.0",
"version": "1.6.0",
"name": "Untitled 1",
"furniture": {
"self_ref": "#/furniture",
+1 -1
View File
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.5.0",
"version": "1.6.0",
"name": "Untitled 1",
"furniture": {
"self_ref": "#/furniture",
+1 -1
View File
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.5.0",
"version": "1.6.0",
"name": "Untitled 1",
"furniture": {
"self_ref": "#/furniture",
+1 -1
View File
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.5.0",
"version": "1.6.0",
"name": "Untitled 1",
"furniture": {
"self_ref": "#/furniture",
+1 -1
View File
@@ -1097,4 +1097,4 @@ texts:
prov: []
self_ref: '#/texts/55'
text: The end.
version: 1.5.0
version: 1.6.0
+1 -1
View File
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.5.0",
"version": "1.6.0",
"name": "Untitled 1",
"furniture": {
"self_ref": "#/furniture",
+1 -1
View File
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.5.0",
"version": "1.6.0",
"name": "Document",
"furniture": {
"self_ref": "#/furniture",
+1 -1
View File
@@ -237,4 +237,4 @@ texts:
self_ref: '#/texts/3'
text: 'Figure 1: Four examples of complex page layouts across different document
categories'
version: 1.5.0
version: 1.6.0
+1 -1
View File
@@ -81,4 +81,4 @@ texts:
prov: []
self_ref: '#/texts/3'
text: there
version: 1.5.0
version: 1.6.0
+1 -1
View File
@@ -81,4 +81,4 @@ texts:
prov: []
self_ref: '#/texts/3'
text: foo
version: 1.5.0
version: 1.6.0
+1 -1
View File
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.5.0",
"version": "1.6.0",
"name": "Document",
"furniture": {
"self_ref": "#/furniture",
+1 -1
View File
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.5.0",
"version": "1.6.0",
"name": "Document",
"furniture": {
"self_ref": "#/furniture",
+1 -1
View File
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.5.0",
"version": "1.6.0",
"name": "Document",
"furniture": {
"self_ref": "#/furniture",
+134
View File
@@ -0,0 +1,134 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8"/>
<title>Docling Document</title>
<meta name="generator" content="Docling HTML Serializer"/>
<style>
html {
background-color: #f5f5f5;
font-family: Arial, sans-serif;
line-height: 1.6;
}
body {
max-width: 800px;
margin: 0 auto;
padding: 2rem;
background-color: white;
box-shadow: 0 0 10px rgba(0,0,0,0.1);
}
h1, h2, h3, h4, h5, h6 {
color: #333;
margin-top: 1.5em;
margin-bottom: 0.5em;
}
h1 {
font-size: 2em;
border-bottom: 1px solid #eee;
padding-bottom: 0.3em;
}
table {
border-collapse: collapse;
margin: 1em 0;
width: 100%;
}
th, td {
border: 1px solid #ddd;
padding: 8px;
text-align: left;
}
th {
background-color: #f2f2f2;
font-weight: bold;
}
figure {
margin: 1.5em 0;
text-align: center;
}
figcaption {
color: #666;
font-style: italic;
margin-top: 0.5em;
}
img {
max-width: 100%;
height: auto;
}
pre {
background-color: #f6f8fa;
border-radius: 3px;
padding: 1em;
overflow: auto;
}
code {
font-family: monospace;
background-color: #f6f8fa;
padding: 0.2em 0.4em;
border-radius: 3px;
}
pre code {
background-color: transparent;
padding: 0;
}
.formula {
text-align: center;
padding: 0.5em;
margin: 1em 0;
background-color: #f9f9f9;
}
.formula-not-decoded {
text-align: center;
padding: 0.5em;
margin: 1em 0;
background: repeating-linear-gradient(
45deg,
#f0f0f0,
#f0f0f0 10px,
#f9f9f9 10px,
#f9f9f9 20px
);
}
.page-break {
page-break-after: always;
border-top: 1px dashed #ccc;
margin: 2em 0;
}
.key-value-region {
background-color: #f9f9f9;
padding: 1em;
border-radius: 4px;
margin: 1em 0;
}
.key-value-region dt {
font-weight: bold;
}
.key-value-region dd {
margin-left: 1em;
margin-bottom: 0.5em;
}
.form-container {
border: 1px solid #ddd;
padding: 1em;
border-radius: 4px;
margin: 1em 0;
}
.form-item {
margin-bottom: 0.5em;
}
.image-classification {
font-size: 0.9em;
color: #666;
margin-top: 0.5em;
}
</style>
</head>
<body>
<div class='page'>
<h1>Rich tables</h1>
<table><tbody><tr><td>cell 0,0</td><td>cell 0,1</td></tr><tr><td>cell 1,0</td><td><em><p>text in italic</p></em></td></tr><tr><td><ul>
<li>list item 1</li>
<li>list item 2</li>
</ul></td><td>cell 2,1</td></tr><tr><td>cell 3,0</td><td><table><tbody><tr><td>inner cell 0,0</td><td>inner cell 0,1</td><td>inner cell 0,2</td></tr><tr><td>inner cell 1,0</td><td>inner cell 1,1</td><td>inner cell 1,2</td></tr></tbody></table></td></tr></tbody></table>
</div>
</body>
</html>
+7
View File
@@ -0,0 +1,7 @@
# Rich tables
| cell 0,0 | cell 0,1 |
|-----------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| cell 1,0 | *text in italic* |
| - list item 1 - list item 2 | cell 2,1 |
| cell 3,0 | | inner cell 0,0 | inner cell 0,1 | inner cell 0,2 | |------------------|------------------|------------------| | inner cell 1,0 | inner cell 1,1 | inner cell 1,2 | |
+6
View File
@@ -0,0 +1,6 @@
<doctag><title>Rich tables</title>
<otsl><fcel>cell 0,0<fcel>cell 0,1<nl><fcel>cell 1,0<fcel>*text in italic*<nl><fcel>- list item 1
- list item 2<fcel>cell 2,1<nl><fcel>cell 3,0<fcel>| inner cell 0,0 | inner cell 0,1 | inner cell 0,2 |
|------------------|------------------|------------------|
| inner cell 1,0 | inner cell 1,1 | inner cell 1,2 |<nl></otsl>
</doctag>
+400
View File
@@ -0,0 +1,400 @@
body:
children:
- $ref: '#/texts/0'
- $ref: '#/tables/0'
content_layer: body
label: unspecified
name: _root_
self_ref: '#/body'
form_items: []
furniture:
children: []
content_layer: furniture
label: unspecified
name: _root_
self_ref: '#/furniture'
groups:
- children:
- $ref: '#/texts/2'
- $ref: '#/texts/3'
content_layer: body
label: list
name: group
parent:
$ref: '#/tables/0'
self_ref: '#/groups/0'
key_value_items: []
name: ''
pages: {}
pictures: []
schema_name: DoclingDocument
tables:
- annotations: []
captions: []
children:
- $ref: '#/texts/1'
- $ref: '#/groups/0'
- $ref: '#/tables/1'
content_layer: body
data:
grid:
- - col_span: 1
column_header: false
end_col_offset_idx: 1
end_row_offset_idx: 1
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 0
start_row_offset_idx: 0
text: cell 0,0
- col_span: 1
column_header: false
end_col_offset_idx: 2
end_row_offset_idx: 1
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 1
start_row_offset_idx: 0
text: cell 0,1
- - col_span: 1
column_header: false
end_col_offset_idx: 1
end_row_offset_idx: 2
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 0
start_row_offset_idx: 1
text: cell 1,0
- col_span: 1
column_header: false
end_col_offset_idx: 2
end_row_offset_idx: 2
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 1
start_row_offset_idx: 1
text: ''
- - col_span: 1
column_header: false
end_col_offset_idx: 1
end_row_offset_idx: 3
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 0
start_row_offset_idx: 2
text: ''
- col_span: 1
column_header: false
end_col_offset_idx: 2
end_row_offset_idx: 3
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 1
start_row_offset_idx: 2
text: cell 2,1
- - col_span: 1
column_header: false
end_col_offset_idx: 1
end_row_offset_idx: 4
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 0
start_row_offset_idx: 3
text: cell 3,0
- col_span: 1
column_header: false
end_col_offset_idx: 2
end_row_offset_idx: 4
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 1
start_row_offset_idx: 3
text: ''
num_cols: 2
num_rows: 4
table_cells:
- col_span: 1
column_header: false
end_col_offset_idx: 1
end_row_offset_idx: 1
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 0
start_row_offset_idx: 0
text: cell 0,0
- col_span: 1
column_header: false
end_col_offset_idx: 2
end_row_offset_idx: 1
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 1
start_row_offset_idx: 0
text: cell 0,1
- col_span: 1
column_header: false
end_col_offset_idx: 1
end_row_offset_idx: 2
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 0
start_row_offset_idx: 1
text: cell 1,0
- col_span: 1
column_header: false
end_col_offset_idx: 2
end_row_offset_idx: 2
ref:
$ref: '#/texts/1'
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 1
start_row_offset_idx: 1
text: ''
- col_span: 1
column_header: false
end_col_offset_idx: 1
end_row_offset_idx: 3
ref:
$ref: '#/groups/0'
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 0
start_row_offset_idx: 2
text: ''
- col_span: 1
column_header: false
end_col_offset_idx: 2
end_row_offset_idx: 3
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 1
start_row_offset_idx: 2
text: cell 2,1
- col_span: 1
column_header: false
end_col_offset_idx: 1
end_row_offset_idx: 4
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 0
start_row_offset_idx: 3
text: cell 3,0
- col_span: 1
column_header: false
end_col_offset_idx: 2
end_row_offset_idx: 4
ref:
$ref: '#/tables/1'
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 1
start_row_offset_idx: 3
text: ''
footnotes: []
label: table
parent:
$ref: '#/body'
prov: []
references: []
self_ref: '#/tables/0'
- annotations: []
captions: []
children: []
content_layer: body
data:
grid:
- - col_span: 1
column_header: false
end_col_offset_idx: 1
end_row_offset_idx: 1
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 0
start_row_offset_idx: 0
text: inner cell 0,0
- col_span: 1
column_header: false
end_col_offset_idx: 2
end_row_offset_idx: 1
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 1
start_row_offset_idx: 0
text: inner cell 0,1
- col_span: 1
column_header: false
end_col_offset_idx: 3
end_row_offset_idx: 1
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 2
start_row_offset_idx: 0
text: inner cell 0,2
- - col_span: 1
column_header: false
end_col_offset_idx: 1
end_row_offset_idx: 2
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 0
start_row_offset_idx: 1
text: inner cell 1,0
- col_span: 1
column_header: false
end_col_offset_idx: 2
end_row_offset_idx: 2
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 1
start_row_offset_idx: 1
text: inner cell 1,1
- col_span: 1
column_header: false
end_col_offset_idx: 3
end_row_offset_idx: 2
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 2
start_row_offset_idx: 1
text: inner cell 1,2
num_cols: 3
num_rows: 2
table_cells:
- col_span: 1
column_header: false
end_col_offset_idx: 1
end_row_offset_idx: 1
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 0
start_row_offset_idx: 0
text: inner cell 0,0
- col_span: 1
column_header: false
end_col_offset_idx: 2
end_row_offset_idx: 1
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 1
start_row_offset_idx: 0
text: inner cell 0,1
- col_span: 1
column_header: false
end_col_offset_idx: 3
end_row_offset_idx: 1
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 2
start_row_offset_idx: 0
text: inner cell 0,2
- col_span: 1
column_header: false
end_col_offset_idx: 1
end_row_offset_idx: 2
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 0
start_row_offset_idx: 1
text: inner cell 1,0
- col_span: 1
column_header: false
end_col_offset_idx: 2
end_row_offset_idx: 2
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 1
start_row_offset_idx: 1
text: inner cell 1,1
- col_span: 1
column_header: false
end_col_offset_idx: 3
end_row_offset_idx: 2
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 2
start_row_offset_idx: 1
text: inner cell 1,2
footnotes: []
label: table
parent:
$ref: '#/tables/0'
prov: []
references: []
self_ref: '#/tables/1'
texts:
- children: []
content_layer: body
label: title
orig: Rich tables
parent:
$ref: '#/body'
prov: []
self_ref: '#/texts/0'
text: Rich tables
- children: []
content_layer: body
formatting:
bold: false
italic: true
script: baseline
strikethrough: false
underline: false
label: text
orig: text in italic
parent:
$ref: '#/tables/0'
prov: []
self_ref: '#/texts/1'
text: text in italic
- children: []
content_layer: body
enumerated: false
label: list_item
marker: ''
orig: list item 1
parent:
$ref: '#/groups/0'
prov: []
self_ref: '#/texts/2'
text: list item 1
- children: []
content_layer: body
enumerated: false
label: list_item
marker: ''
orig: list item 2
parent:
$ref: '#/groups/0'
prov: []
self_ref: '#/texts/3'
text: list item 2
version: 1.6.0
+227
View File
@@ -0,0 +1,227 @@
body:
children:
- $ref: '#/texts/0'
- $ref: '#/tables/0'
content_layer: body
label: unspecified
name: _root_
self_ref: '#/body'
form_items: []
furniture:
children: []
content_layer: furniture
label: unspecified
name: _root_
self_ref: '#/furniture'
groups: []
key_value_items: []
name: ''
pages: {}
pictures: []
schema_name: DoclingDocument
tables:
- annotations: []
captions: []
children:
- $ref: '#/texts/1'
content_layer: body
data:
grid:
- - col_span: 1
column_header: false
end_col_offset_idx: 1
end_row_offset_idx: 1
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 0
start_row_offset_idx: 0
text: cell 0,0
- col_span: 1
column_header: false
end_col_offset_idx: 2
end_row_offset_idx: 1
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 1
start_row_offset_idx: 0
text: cell 0,1
- - col_span: 1
column_header: false
end_col_offset_idx: 1
end_row_offset_idx: 2
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 0
start_row_offset_idx: 1
text: cell 1,0
- col_span: 1
column_header: false
end_col_offset_idx: 2
end_row_offset_idx: 2
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 1
start_row_offset_idx: 1
text: ''
- - col_span: 1
column_header: false
end_col_offset_idx: 1
end_row_offset_idx: 3
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 0
start_row_offset_idx: 2
text: cell 2,0
- col_span: 1
column_header: false
end_col_offset_idx: 2
end_row_offset_idx: 3
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 1
start_row_offset_idx: 2
text: cell 2,1
- - col_span: 1
column_header: false
end_col_offset_idx: 1
end_row_offset_idx: 4
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 0
start_row_offset_idx: 3
text: cell 3,0
- col_span: 1
column_header: false
end_col_offset_idx: 2
end_row_offset_idx: 4
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 1
start_row_offset_idx: 3
text: cell 3,1
num_cols: 2
num_rows: 4
table_cells:
- col_span: 1
column_header: false
end_col_offset_idx: 1
end_row_offset_idx: 1
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 0
start_row_offset_idx: 0
text: cell 0,0
- col_span: 1
column_header: false
end_col_offset_idx: 2
end_row_offset_idx: 1
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 1
start_row_offset_idx: 0
text: cell 0,1
- col_span: 1
column_header: false
end_col_offset_idx: 1
end_row_offset_idx: 2
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 0
start_row_offset_idx: 1
text: cell 1,0
- col_span: 1
column_header: false
end_col_offset_idx: 2
end_row_offset_idx: 2
ref:
$ref: '#/texts/1'
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 1
start_row_offset_idx: 1
text: ''
- col_span: 1
column_header: false
end_col_offset_idx: 1
end_row_offset_idx: 3
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 0
start_row_offset_idx: 2
text: cell 2,0
- col_span: 1
column_header: false
end_col_offset_idx: 2
end_row_offset_idx: 3
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 1
start_row_offset_idx: 2
text: cell 2,1
- col_span: 1
column_header: false
end_col_offset_idx: 1
end_row_offset_idx: 4
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 0
start_row_offset_idx: 3
text: cell 3,0
- col_span: 1
column_header: false
end_col_offset_idx: 2
end_row_offset_idx: 4
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 1
start_row_offset_idx: 3
text: cell 3,1
footnotes: []
label: table
parent:
$ref: '#/body'
prov: []
references: []
self_ref: '#/tables/0'
texts:
- children: []
content_layer: body
label: title
orig: Rich tables
parent:
$ref: '#/body'
prov: []
self_ref: '#/texts/0'
text: Rich tables
- children: []
content_layer: body
formatting:
bold: false
italic: true
script: baseline
strikethrough: false
underline: false
label: text
orig: text in italic
parent:
$ref: '#/tables/0'
prov: []
self_ref: '#/texts/1'
text: text in italic
version: 1.6.0
+237
View File
@@ -0,0 +1,237 @@
body:
children:
- $ref: '#/texts/0'
- $ref: '#/texts/2'
- $ref: '#/tables/0'
content_layer: body
label: unspecified
name: _root_
self_ref: '#/body'
form_items: []
furniture:
children: []
content_layer: furniture
label: unspecified
name: _root_
self_ref: '#/furniture'
groups: []
key_value_items: []
name: ''
pages: {}
pictures: []
schema_name: DoclingDocument
tables:
- annotations: []
captions: []
children:
- $ref: '#/texts/1'
content_layer: body
data:
grid:
- - col_span: 1
column_header: false
end_col_offset_idx: 1
end_row_offset_idx: 1
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 0
start_row_offset_idx: 0
text: cell 0,0
- col_span: 1
column_header: false
end_col_offset_idx: 2
end_row_offset_idx: 1
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 1
start_row_offset_idx: 0
text: cell 0,1
- - col_span: 1
column_header: false
end_col_offset_idx: 1
end_row_offset_idx: 2
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 0
start_row_offset_idx: 1
text: cell 1,0
- col_span: 1
column_header: false
end_col_offset_idx: 2
end_row_offset_idx: 2
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 1
start_row_offset_idx: 1
text: ''
- - col_span: 1
column_header: false
end_col_offset_idx: 1
end_row_offset_idx: 3
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 0
start_row_offset_idx: 2
text: cell 2,0
- col_span: 1
column_header: false
end_col_offset_idx: 2
end_row_offset_idx: 3
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 1
start_row_offset_idx: 2
text: cell 2,1
- - col_span: 1
column_header: false
end_col_offset_idx: 1
end_row_offset_idx: 4
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 0
start_row_offset_idx: 3
text: cell 3,0
- col_span: 1
column_header: false
end_col_offset_idx: 2
end_row_offset_idx: 4
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 1
start_row_offset_idx: 3
text: cell 3,1
num_cols: 2
num_rows: 4
table_cells:
- col_span: 1
column_header: false
end_col_offset_idx: 1
end_row_offset_idx: 1
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 0
start_row_offset_idx: 0
text: cell 0,0
- col_span: 1
column_header: false
end_col_offset_idx: 2
end_row_offset_idx: 1
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 1
start_row_offset_idx: 0
text: cell 0,1
- col_span: 1
column_header: false
end_col_offset_idx: 1
end_row_offset_idx: 2
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 0
start_row_offset_idx: 1
text: cell 1,0
- col_span: 1
column_header: false
end_col_offset_idx: 2
end_row_offset_idx: 2
ref:
$ref: '#/texts/1'
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 1
start_row_offset_idx: 1
text: ''
- col_span: 1
column_header: false
end_col_offset_idx: 1
end_row_offset_idx: 3
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 0
start_row_offset_idx: 2
text: cell 2,0
- col_span: 1
column_header: false
end_col_offset_idx: 2
end_row_offset_idx: 3
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 1
start_row_offset_idx: 2
text: cell 2,1
- col_span: 1
column_header: false
end_col_offset_idx: 1
end_row_offset_idx: 4
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 0
start_row_offset_idx: 3
text: cell 3,0
- col_span: 1
column_header: false
end_col_offset_idx: 2
end_row_offset_idx: 4
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 1
start_row_offset_idx: 3
text: cell 3,1
footnotes: []
label: table
parent:
$ref: '#/body'
prov: []
references: []
self_ref: '#/tables/0'
texts:
- children: []
content_layer: body
label: title
orig: Rich tables
parent:
$ref: '#/body'
prov: []
self_ref: '#/texts/0'
text: Rich tables
- children: []
content_layer: body
formatting:
bold: false
italic: true
script: baseline
strikethrough: false
underline: false
label: text
orig: text in italic
parent:
$ref: '#/tables/0'
prov: []
self_ref: '#/texts/1'
text: text in italic
- children: []
content_layer: body
label: text
orig: text before
parent:
$ref: '#/body'
prov: []
self_ref: '#/texts/2'
text: text before
version: 1.6.0
+237
View File
@@ -0,0 +1,237 @@
body:
children:
- $ref: '#/texts/0'
- $ref: '#/texts/1'
- $ref: '#/tables/0'
content_layer: body
label: unspecified
name: _root_
self_ref: '#/body'
form_items: []
furniture:
children: []
content_layer: furniture
label: unspecified
name: _root_
self_ref: '#/furniture'
groups: []
key_value_items: []
name: ''
pages: {}
pictures: []
schema_name: DoclingDocument
tables:
- annotations: []
captions: []
children:
- $ref: '#/texts/2'
content_layer: body
data:
grid:
- - col_span: 1
column_header: false
end_col_offset_idx: 1
end_row_offset_idx: 1
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 0
start_row_offset_idx: 0
text: cell 0,0
- col_span: 1
column_header: false
end_col_offset_idx: 2
end_row_offset_idx: 1
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 1
start_row_offset_idx: 0
text: cell 0,1
- - col_span: 1
column_header: false
end_col_offset_idx: 1
end_row_offset_idx: 2
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 0
start_row_offset_idx: 1
text: cell 1,0
- col_span: 1
column_header: false
end_col_offset_idx: 2
end_row_offset_idx: 2
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 1
start_row_offset_idx: 1
text: ''
- - col_span: 1
column_header: false
end_col_offset_idx: 1
end_row_offset_idx: 3
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 0
start_row_offset_idx: 2
text: cell 2,0
- col_span: 1
column_header: false
end_col_offset_idx: 2
end_row_offset_idx: 3
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 1
start_row_offset_idx: 2
text: cell 2,1
- - col_span: 1
column_header: false
end_col_offset_idx: 1
end_row_offset_idx: 4
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 0
start_row_offset_idx: 3
text: cell 3,0
- col_span: 1
column_header: false
end_col_offset_idx: 2
end_row_offset_idx: 4
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 1
start_row_offset_idx: 3
text: cell 3,1
num_cols: 2
num_rows: 4
table_cells:
- col_span: 1
column_header: false
end_col_offset_idx: 1
end_row_offset_idx: 1
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 0
start_row_offset_idx: 0
text: cell 0,0
- col_span: 1
column_header: false
end_col_offset_idx: 2
end_row_offset_idx: 1
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 1
start_row_offset_idx: 0
text: cell 0,1
- col_span: 1
column_header: false
end_col_offset_idx: 1
end_row_offset_idx: 2
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 0
start_row_offset_idx: 1
text: cell 1,0
- col_span: 1
column_header: false
end_col_offset_idx: 2
end_row_offset_idx: 2
ref:
$ref: '#/texts/2'
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 1
start_row_offset_idx: 1
text: ''
- col_span: 1
column_header: false
end_col_offset_idx: 1
end_row_offset_idx: 3
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 0
start_row_offset_idx: 2
text: cell 2,0
- col_span: 1
column_header: false
end_col_offset_idx: 2
end_row_offset_idx: 3
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 1
start_row_offset_idx: 2
text: cell 2,1
- col_span: 1
column_header: false
end_col_offset_idx: 1
end_row_offset_idx: 4
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 0
start_row_offset_idx: 3
text: cell 3,0
- col_span: 1
column_header: false
end_col_offset_idx: 2
end_row_offset_idx: 4
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 1
start_row_offset_idx: 3
text: cell 3,1
footnotes: []
label: table
parent:
$ref: '#/body'
prov: []
references: []
self_ref: '#/tables/0'
texts:
- children: []
content_layer: body
label: title
orig: Rich tables
parent:
$ref: '#/body'
prov: []
self_ref: '#/texts/0'
text: Rich tables
- children: []
content_layer: body
label: text
orig: text before
parent:
$ref: '#/body'
prov: []
self_ref: '#/texts/1'
text: text before
- children: []
content_layer: body
formatting:
bold: false
italic: true
script: baseline
strikethrough: false
underline: false
label: text
orig: text in italic
parent:
$ref: '#/tables/0'
prov: []
self_ref: '#/texts/2'
text: text in italic
version: 1.6.0
+390
View File
@@ -0,0 +1,390 @@
body:
children:
- $ref: '#/tables/0'
content_layer: body
label: unspecified
name: _root_
self_ref: '#/body'
form_items: []
furniture:
children: []
content_layer: furniture
label: unspecified
name: _root_
self_ref: '#/furniture'
groups:
- children:
- $ref: '#/texts/1'
- $ref: '#/texts/2'
content_layer: body
label: list
name: group
parent:
$ref: '#/tables/0'
self_ref: '#/groups/0'
key_value_items: []
name: ''
pages: {}
pictures: []
schema_name: DoclingDocument
tables:
- annotations: []
captions: []
children:
- $ref: '#/texts/0'
- $ref: '#/groups/0'
- $ref: '#/tables/1'
content_layer: body
data:
grid:
- - col_span: 1
column_header: false
end_col_offset_idx: 1
end_row_offset_idx: 1
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 0
start_row_offset_idx: 0
text: cell 0,0
- col_span: 1
column_header: false
end_col_offset_idx: 2
end_row_offset_idx: 1
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 1
start_row_offset_idx: 0
text: cell 0,1
- - col_span: 1
column_header: false
end_col_offset_idx: 1
end_row_offset_idx: 2
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 0
start_row_offset_idx: 1
text: cell 1,0
- col_span: 1
column_header: false
end_col_offset_idx: 2
end_row_offset_idx: 2
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 1
start_row_offset_idx: 1
text: ''
- - col_span: 1
column_header: false
end_col_offset_idx: 1
end_row_offset_idx: 3
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 0
start_row_offset_idx: 2
text: ''
- col_span: 1
column_header: false
end_col_offset_idx: 2
end_row_offset_idx: 3
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 1
start_row_offset_idx: 2
text: cell 2,1
- - col_span: 1
column_header: false
end_col_offset_idx: 1
end_row_offset_idx: 4
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 0
start_row_offset_idx: 3
text: cell 3,0
- col_span: 1
column_header: false
end_col_offset_idx: 2
end_row_offset_idx: 4
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 1
start_row_offset_idx: 3
text: ''
num_cols: 2
num_rows: 4
table_cells:
- col_span: 1
column_header: false
end_col_offset_idx: 1
end_row_offset_idx: 1
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 0
start_row_offset_idx: 0
text: cell 0,0
- col_span: 1
column_header: false
end_col_offset_idx: 2
end_row_offset_idx: 1
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 1
start_row_offset_idx: 0
text: cell 0,1
- col_span: 1
column_header: false
end_col_offset_idx: 1
end_row_offset_idx: 2
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 0
start_row_offset_idx: 1
text: cell 1,0
- col_span: 1
column_header: false
end_col_offset_idx: 2
end_row_offset_idx: 2
ref:
$ref: '#/texts/0'
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 1
start_row_offset_idx: 1
text: ''
- col_span: 1
column_header: false
end_col_offset_idx: 1
end_row_offset_idx: 3
ref:
$ref: '#/groups/0'
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 0
start_row_offset_idx: 2
text: ''
- col_span: 1
column_header: false
end_col_offset_idx: 2
end_row_offset_idx: 3
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 1
start_row_offset_idx: 2
text: cell 2,1
- col_span: 1
column_header: false
end_col_offset_idx: 1
end_row_offset_idx: 4
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 0
start_row_offset_idx: 3
text: cell 3,0
- col_span: 1
column_header: false
end_col_offset_idx: 2
end_row_offset_idx: 4
ref:
$ref: '#/tables/1'
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 1
start_row_offset_idx: 3
text: ''
footnotes: []
label: table
parent:
$ref: '#/body'
prov: []
references: []
self_ref: '#/tables/0'
- annotations: []
captions: []
children: []
content_layer: body
data:
grid:
- - col_span: 1
column_header: false
end_col_offset_idx: 1
end_row_offset_idx: 1
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 0
start_row_offset_idx: 0
text: inner cell 0,0
- col_span: 1
column_header: false
end_col_offset_idx: 2
end_row_offset_idx: 1
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 1
start_row_offset_idx: 0
text: inner cell 0,1
- col_span: 1
column_header: false
end_col_offset_idx: 3
end_row_offset_idx: 1
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 2
start_row_offset_idx: 0
text: inner cell 0,2
- - col_span: 1
column_header: false
end_col_offset_idx: 1
end_row_offset_idx: 2
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 0
start_row_offset_idx: 1
text: inner cell 1,0
- col_span: 1
column_header: false
end_col_offset_idx: 2
end_row_offset_idx: 2
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 1
start_row_offset_idx: 1
text: inner cell 1,1
- col_span: 1
column_header: false
end_col_offset_idx: 3
end_row_offset_idx: 2
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 2
start_row_offset_idx: 1
text: inner cell 1,2
num_cols: 3
num_rows: 2
table_cells:
- col_span: 1
column_header: false
end_col_offset_idx: 1
end_row_offset_idx: 1
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 0
start_row_offset_idx: 0
text: inner cell 0,0
- col_span: 1
column_header: false
end_col_offset_idx: 2
end_row_offset_idx: 1
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 1
start_row_offset_idx: 0
text: inner cell 0,1
- col_span: 1
column_header: false
end_col_offset_idx: 3
end_row_offset_idx: 1
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 2
start_row_offset_idx: 0
text: inner cell 0,2
- col_span: 1
column_header: false
end_col_offset_idx: 1
end_row_offset_idx: 2
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 0
start_row_offset_idx: 1
text: inner cell 1,0
- col_span: 1
column_header: false
end_col_offset_idx: 2
end_row_offset_idx: 2
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 1
start_row_offset_idx: 1
text: inner cell 1,1
- col_span: 1
column_header: false
end_col_offset_idx: 3
end_row_offset_idx: 2
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 2
start_row_offset_idx: 1
text: inner cell 1,2
footnotes: []
label: table
parent:
$ref: '#/tables/0'
prov: []
references: []
self_ref: '#/tables/1'
texts:
- children: []
content_layer: body
formatting:
bold: false
italic: true
script: baseline
strikethrough: false
underline: false
label: text
orig: text in italic
parent:
$ref: '#/tables/0'
prov: []
self_ref: '#/texts/0'
text: text in italic
- children: []
content_layer: body
enumerated: false
label: list_item
marker: ''
orig: list item 1
parent:
$ref: '#/groups/0'
prov: []
self_ref: '#/texts/1'
text: list item 1
- children: []
content_layer: body
enumerated: false
label: list_item
marker: ''
orig: list item 2
parent:
$ref: '#/groups/0'
prov: []
self_ref: '#/texts/2'
text: list item 2
version: 1.6.0
+1 -1
View File
@@ -6618,4 +6618,4 @@ texts:
text: '23. Zhong, X., Tang, J., Yepes, A.J.: Publaynet: largest dataset ever for
document layout analysis. In: 2019 International Conference on Document Analysis
and Recognition (ICDAR). pp. 1015-1022. IEEE (2019)'
version: 1.5.0
version: 1.6.0
+213
View File
@@ -33,6 +33,7 @@ from docling_core.types.doc.document import ( # BoundingBox,
PictureItem,
ProvenanceItem,
RefItem,
RichTableCell,
Script,
SectionHeaderItem,
Size,
@@ -2092,3 +2093,215 @@ def test_group_without_children():
bad_doc.add_group()
with pytest.raises(ValueError):
bad_doc._validate_rules()
def _construct_rich_table_doc():
    """Build a document whose 4x2 table mixes plain and rich cells.

    Three cells of the outer table are ``RichTableCell`` instances, each
    referencing an item added with the outer table as its parent:

    * (1, 1) -> an italic text item,
    * (2, 0) -> a list group holding two list items,
    * (3, 1) -> a nested 2x3 table filled with plain cells.

    Every other cell is a plain ``TableCell`` with text ``"cell <row>,<col>"``.

    Returns:
        The populated ``DoclingDocument``.
    """
    doc = DoclingDocument(name="")
    doc.add_text(label=DocItemLabel.TITLE, text="Rich tables")

    outer_table = doc.add_table(
        data=TableData(
            num_rows=4,
            num_cols=2,
        ),
    )

    # Rich content: each item is created as a child of the outer table.
    italic_text = doc.add_text(
        parent=outer_table,
        text="text in italic",
        label=DocItemLabel.TEXT,
        formatting=Formatting(italic=True),
    )

    list_group = doc.add_list_group(parent=outer_table)
    for item_text in ("list item 1", "list item 2"):
        doc.add_list_item(parent=list_group, text=item_text)

    inner_table = doc.add_table(
        data=TableData(num_rows=2, num_cols=3), parent=outer_table
    )
    for row in range(inner_table.data.num_rows):
        for col in range(inner_table.data.num_cols):
            inner_cell = TableCell(
                text=f"inner cell {row},{col}",
                start_row_offset_idx=row,
                end_row_offset_idx=row + 1,
                start_col_offset_idx=col,
                end_col_offset_idx=col + 1,
            )
            doc.add_table_cell(table_item=inner_table, cell=inner_cell)

    # Grid position -> item the rich cell at that position refers to.
    rich_items = {
        (1, 1): italic_text,
        (2, 0): list_group,
        (3, 1): inner_table,
    }

    for row in range(outer_table.data.num_rows):
        for col in range(outer_table.data.num_cols):
            referenced = rich_items.get((row, col))
            if referenced is None:
                cell = TableCell(
                    start_row_offset_idx=row,
                    end_row_offset_idx=row + 1,
                    start_col_offset_idx=col,
                    end_col_offset_idx=col + 1,
                    text=f"cell {row},{col}",
                )
            else:
                cell = RichTableCell(
                    start_row_offset_idx=row,
                    end_row_offset_idx=row + 1,
                    start_col_offset_idx=col,
                    end_col_offset_idx=col + 1,
                    ref=referenced.get_ref(),
                )
            doc.add_table_cell(table_item=outer_table, cell=cell)

    return doc
def test_rich_tables():
    """Compare the rich-table document with its stored YAML fixture."""
    expected_path = Path("test/data/doc/rich_table.out.yaml")
    actual_doc = _construct_rich_table_doc()

    # Regenerate the fixture first when test-data generation is switched on.
    if GEN_TEST_DATA:
        actual_doc.save_as_yaml(expected_path)

    assert actual_doc == DoclingDocument.load_from_yaml(expected_path)
def test_doc_manipulation_with_rich_tables():
    """Delete the first text item and compare the result with its fixture."""
    expected_path = Path("test/data/doc/rich_table_post_text_del.out.yaml")

    actual_doc = _construct_rich_table_doc()
    first_text = actual_doc.texts[0]
    actual_doc.delete_items(node_items=[first_text])

    # Regenerate the fixture first when test-data generation is switched on.
    if GEN_TEST_DATA:
        actual_doc.save_as_yaml(expected_path)

    assert actual_doc == DoclingDocument.load_from_yaml(expected_path)
def test_invalid_rich_table_doc():
    """Expect ValueError for a rich cell whose item is not a table child.

    The referenced text item is parented to the document body. The test
    expects ``add_table_cell()`` to raise for that cell, and expects
    ``validate_document()`` to raise once the cell has been appended to the
    table data directly.
    """
    doc = DoclingDocument(name="")
    table = doc.add_table(data=TableData(num_rows=2, num_cols=2))
    stray_item = doc.add_text(
        text="rich item",
        label=DocItemLabel.TEXT,
        parent=doc.body,  # not the table item
    )

    for row in range(table.data.num_rows):
        for col in range(table.data.num_cols):
            if (row, col) != (1, 1):
                cell = TableCell(
                    text=f"cell {row},{col}",
                    start_row_offset_idx=row,
                    end_row_offset_idx=row + 1,
                    start_col_offset_idx=col,
                    end_col_offset_idx=col + 1,
                )
            else:
                cell = RichTableCell(
                    start_row_offset_idx=row,
                    end_row_offset_idx=row + 1,
                    start_col_offset_idx=col,
                    end_col_offset_idx=col + 1,
                    ref=stray_item.get_ref(),
                )

                # ensure add_table_cell() raises:
                with pytest.raises(ValueError):
                    doc.add_table_cell(table_item=table, cell=cell)

            # discouraged but technically possible:
            table.data.table_cells.append(cell)

    # ensure validate_document() raises:
    with pytest.raises(ValueError):
        DoclingDocument.validate_document(doc)
def test_rich_table_item_insertion_normalization():
    """Snapshot a rich-table document around a sibling insertion.

    A document with a title and a 4x2 table is built, where cell (1, 1) is a
    rich cell referencing an italic text item parented to the table. The
    document is compared with a YAML snapshot at three points: before the
    insertion, after inserting a text item before the table, and after
    ``_normalize_references()``. Each snapshot file is (re)written first when
    ``GEN_TEST_DATA`` is truthy.
    """
    doc = DoclingDocument(name="")
    doc.add_text(label=DocItemLabel.TITLE, text="Rich tables")

    table_item = doc.add_table(data=TableData(num_rows=4, num_cols=2))
    rich_item = doc.add_text(
        parent=table_item,
        text="text in italic",
        label=DocItemLabel.TEXT,
        formatting=Formatting(italic=True),
    )

    for row in range(table_item.data.num_rows):
        for col in range(table_item.data.num_cols):
            if (row, col) != (1, 1):
                cell = TableCell(
                    start_row_offset_idx=row,
                    end_row_offset_idx=row + 1,
                    start_col_offset_idx=col,
                    end_col_offset_idx=col + 1,
                    text=f"cell {row},{col}",
                )
            else:
                cell = RichTableCell(
                    start_row_offset_idx=row,
                    end_row_offset_idx=row + 1,
                    start_col_offset_idx=col,
                    end_col_offset_idx=col + 1,
                    ref=rich_item.get_ref(),
                )
            doc.add_table_cell(table_item=table_item, cell=cell)

    def _check_snapshot(snapshot_path):
        # Optionally regenerate the snapshot, then compare the current
        # document state with what is stored on disk.
        if GEN_TEST_DATA:
            doc.save_as_yaml(snapshot_path)
        assert doc == DoclingDocument.load_from_yaml(snapshot_path)

    # state before insert:
    _check_snapshot(Path("test/data/doc/rich_table_item_ins_norm_1.out.yaml"))

    text_before = TextItem(
        self_ref="#",
        text="text before",
        orig="text before",
        label=DocItemLabel.TEXT,
    )
    doc.insert_item_before_sibling(new_item=text_before, sibling=table_item)

    # state after insert (prior to normalization):
    _check_snapshot(Path("test/data/doc/rich_table_item_ins_norm_2.out.yaml"))

    doc._normalize_references()

    # state after insert (after normalization):
    _check_snapshot(Path("test/data/doc/rich_table_item_ins_norm_3.out.yaml"))
+29 -1
View File
@@ -29,7 +29,7 @@ from docling_core.types.doc.document import DoclingDocument, MiscAnnotation, Tab
from docling_core.types.doc.labels import DocItemLabel
from .test_data_gen_flag import GEN_TEST_DATA
from .test_docling_doc import _construct_doc
from .test_docling_doc import _construct_doc, _construct_rich_table_doc
class CustomAnnotationTableSerializer(MarkdownTableSerializer):
@@ -361,6 +361,15 @@ def test_md_nested_lists():
verify(exp_file=src.parent / f"{src.stem}.gt.md", actual=actual)
def test_md_rich_table():
    """Serialize the rich-table document to Markdown and hand it to ``verify``.

    ``verify`` receives the serialized text together with the path of the
    expected-output file.
    """
    serializer = MarkdownDocSerializer(doc=_construct_rich_table_doc())
    verify(
        exp_file=Path("./test/data/doc/rich_table.gt.md"),
        actual=serializer.serialize().text,
    )
def test_html_split_page():
src = Path("./test/data/doc/2408.09869v3_enriched.json")
doc = DoclingDocument.load_from_json(src)
@@ -500,6 +509,15 @@ def test_html_nested_lists():
verify(exp_file=src.parent / f"{src.stem}.gt.html", actual=actual)
def test_html_rich_table():
    """Serialize the rich-table document to HTML and hand it to ``verify``.

    ``verify`` receives the serialized text together with the path of the
    expected-output file.
    """
    serializer = HTMLDocSerializer(doc=_construct_rich_table_doc())
    verify(
        exp_file=Path("./test/data/doc/rich_table.gt.html"),
        actual=serializer.serialize().text,
    )
def test_doctags_inline_loc_tags():
src = Path("./test/data/doc/2408.09869v3_enriched.json")
doc = DoclingDocument.load_from_json(src)
@@ -507,3 +525,13 @@ def test_doctags_inline_loc_tags():
ser = DocTagsDocSerializer(doc=doc)
actual = ser.serialize().text
verify(exp_file=src.parent / f"{src.stem}.out.dt", actual=actual)
def test_doctags_rich_table():
    """Serialize the rich-table document to DocTags and hand it to ``verify``.

    ``verify`` receives the serialized text together with the path of the
    expected-output file.
    """
    serializer = DocTagsDocSerializer(doc=_construct_rich_table_doc())
    verify(
        exp_file=Path("./test/data/doc/rich_table.out.dt"),
        actual=serializer.serialize().text,
    )