mirror of
https://github.com/docling-project/docling-core.git
synced 2026-05-17 13:10:44 +00:00
feat: add rich table cells (#368)
* feat: add rich table cells Signed-off-by: Panos Vagenas <pva@zurich.ibm.com> * propagate cell text resolution, cover row deletions Signed-off-by: Panos Vagenas <pva@zurich.ibm.com> * add doctags, fix referential integrity, expand tests, reenable mypy Signed-off-by: Panos Vagenas <pva@zurich.ibm.com> * bump DoclingDocument version Signed-off-by: Panos Vagenas <pva@zurich.ibm.com> * simplify / remove serialize_cell Signed-off-by: Panos Vagenas <pva@zurich.ibm.com> * update rich table cell refs in doc indexing Signed-off-by: Panos Vagenas <pva@zurich.ibm.com> * update notebook Signed-off-by: Panos Vagenas <pva@zurich.ibm.com> * expose new classes in `docling_core.types.doc` Signed-off-by: Panos Vagenas <pva@zurich.ibm.com> --------- Signed-off-by: Panos Vagenas <pva@zurich.ibm.com>
This commit is contained in:
@@ -359,6 +359,7 @@ class DocSerializer(BaseModel, BaseDocSerializer):
|
||||
item=item,
|
||||
doc_serializer=self,
|
||||
doc=self.doc,
|
||||
visited=my_visited,
|
||||
**my_kwargs,
|
||||
)
|
||||
elif isinstance(item, PictureItem):
|
||||
|
||||
@@ -157,6 +157,7 @@ class DocTagsTableSerializer(BaseTableSerializer):
|
||||
item: TableItem,
|
||||
doc_serializer: BaseDocSerializer,
|
||||
doc: DoclingDocument,
|
||||
visited: Optional[set[str]] = None,
|
||||
**kwargs: Any,
|
||||
) -> SerializationResult:
|
||||
"""Serializes the passed item."""
|
||||
@@ -179,6 +180,7 @@ class DocTagsTableSerializer(BaseTableSerializer):
|
||||
add_cell_text=params.add_table_cell_text,
|
||||
xsize=params.xsize,
|
||||
ysize=params.ysize,
|
||||
visited=visited,
|
||||
)
|
||||
res_parts.append(create_ser_result(text=otsl_text, span_source=item))
|
||||
|
||||
|
||||
@@ -65,6 +65,7 @@ from docling_core.types.doc.document import (
|
||||
PictureItem,
|
||||
PictureMoleculeData,
|
||||
PictureTabularChartData,
|
||||
RichTableCell,
|
||||
SectionHeaderItem,
|
||||
TableCell,
|
||||
TableItem,
|
||||
@@ -356,6 +357,7 @@ class HTMLTableSerializer(BaseTableSerializer):
|
||||
|
||||
if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
|
||||
body = ""
|
||||
span_source: Union[DocItem, list[SerializationResult]] = []
|
||||
|
||||
for i in range(nrows):
|
||||
body += "<tr>"
|
||||
@@ -376,7 +378,16 @@ class HTMLTableSerializer(BaseTableSerializer):
|
||||
if colstart != j:
|
||||
continue
|
||||
|
||||
content = html.escape(cell.text.strip())
|
||||
if isinstance(cell, RichTableCell):
|
||||
ser_res = doc_serializer.serialize(
|
||||
item=cell.ref.resolve(doc=doc), **kwargs
|
||||
)
|
||||
content = ser_res.text
|
||||
span_source = [ser_res]
|
||||
else:
|
||||
content = html.escape(cell.text.strip())
|
||||
span_source = item
|
||||
|
||||
celltag = "td"
|
||||
if cell.column_header or cell.row_header or cell.row_section:
|
||||
celltag = "th"
|
||||
@@ -396,7 +407,7 @@ class HTMLTableSerializer(BaseTableSerializer):
|
||||
|
||||
if body:
|
||||
body = f"<tbody>{body}</tbody>"
|
||||
res_parts.append(create_ser_result(text=body, span_source=item))
|
||||
res_parts.append(create_ser_result(text=body, span_source=span_source))
|
||||
|
||||
text_res = "".join([r.text for r in res_parts])
|
||||
text_res = f"<table>{text_res}</table>" if text_res else ""
|
||||
|
||||
@@ -55,6 +55,7 @@ from docling_core.types.doc.document import (
|
||||
PictureItem,
|
||||
PictureMoleculeData,
|
||||
PictureTabularChartData,
|
||||
RichTableCell,
|
||||
SectionHeaderItem,
|
||||
TableItem,
|
||||
TextItem,
|
||||
@@ -320,7 +321,13 @@ class MarkdownTableSerializer(BaseTableSerializer):
|
||||
[
|
||||
# make sure that md tables are not broken
|
||||
# due to newline chars in the text
|
||||
col.text.replace("\n", " ")
|
||||
(
|
||||
doc_serializer.serialize(
|
||||
item=col.ref.resolve(doc=doc), **kwargs
|
||||
).text
|
||||
if isinstance(col, RichTableCell)
|
||||
else col.text
|
||||
).replace("\n", " ")
|
||||
for col in row
|
||||
]
|
||||
for row in item.data.grid
|
||||
|
||||
@@ -7,6 +7,7 @@
|
||||
|
||||
from .base import BoundingBox, CoordOrigin, ImageRefMode, Size
|
||||
from .document import (
|
||||
AnyTableCell,
|
||||
BaseAnnotation,
|
||||
ChartBar,
|
||||
ChartLine,
|
||||
@@ -52,6 +53,7 @@ from .document import (
|
||||
PictureTabularChartData,
|
||||
ProvenanceItem,
|
||||
RefItem,
|
||||
RichTableCell,
|
||||
Script,
|
||||
SectionHeaderItem,
|
||||
TableCell,
|
||||
|
||||
@@ -34,7 +34,7 @@ from pydantic import (
|
||||
validate_call,
|
||||
)
|
||||
from tabulate import tabulate
|
||||
from typing_extensions import Annotated, Self, deprecated
|
||||
from typing_extensions import Annotated, Self, deprecated, override
|
||||
|
||||
from docling_core.search.package import VERSION_PATTERN
|
||||
from docling_core.types.base import _JSON_POINTER_REGEX
|
||||
@@ -60,7 +60,7 @@ _logger = logging.getLogger(__name__)
|
||||
|
||||
Uint64 = typing.Annotated[int, Field(ge=0, le=(2**64 - 1))]
|
||||
LevelNumber = typing.Annotated[int, Field(ge=1, le=100)]
|
||||
CURRENT_VERSION: Final = "1.5.0"
|
||||
CURRENT_VERSION: Final = "1.6.0"
|
||||
|
||||
DEFAULT_EXPORT_LABELS = {
|
||||
DocItemLabel.TITLE,
|
||||
@@ -325,7 +325,7 @@ class TableCell(BaseModel):
|
||||
in data
|
||||
):
|
||||
return data
|
||||
text = data["bbox"].get("token", "")
|
||||
text = data.get("bbox", {}).get("token", "")
|
||||
if not len(text):
|
||||
text_cells = data.pop("text_cell_bboxes", None)
|
||||
if text_cells:
|
||||
@@ -337,11 +337,37 @@ class TableCell(BaseModel):
|
||||
|
||||
return data
|
||||
|
||||
def _get_text(self, doc: Optional["DoclingDocument"] = None, **kwargs: Any) -> str:
|
||||
return self.text
|
||||
|
||||
|
||||
class RichTableCell(TableCell):
|
||||
"""RichTableCell."""
|
||||
|
||||
ref: "RefItem"
|
||||
|
||||
@override
|
||||
def _get_text(self, doc: Optional["DoclingDocument"] = None, **kwargs: Any) -> str:
|
||||
from docling_core.transforms.serializer.markdown import MarkdownDocSerializer
|
||||
|
||||
if doc is not None:
|
||||
doc_serializer = MarkdownDocSerializer(doc=doc)
|
||||
ser_res = doc_serializer.serialize(item=self.ref.resolve(doc=doc), **kwargs)
|
||||
return ser_res.text
|
||||
else:
|
||||
return "<!-- rich cell -->"
|
||||
|
||||
|
||||
AnyTableCell = Annotated[
|
||||
Union[RichTableCell, TableCell],
|
||||
Field(union_mode="left_to_right"),
|
||||
]
|
||||
|
||||
|
||||
class TableData(BaseModel): # TBD
|
||||
"""BaseTableData."""
|
||||
|
||||
table_cells: List[TableCell] = []
|
||||
table_cells: List[AnyTableCell] = []
|
||||
num_rows: int = 0
|
||||
num_cols: int = 0
|
||||
|
||||
@@ -380,7 +406,9 @@ class TableData(BaseModel): # TBD
|
||||
|
||||
return table_data
|
||||
|
||||
def remove_rows(self, indices: List[int]) -> List[List[TableCell]]:
|
||||
def remove_rows(
|
||||
self, indices: List[int], doc: Optional["DoclingDocument"] = None
|
||||
) -> List[List[TableCell]]:
|
||||
"""Remove rows from the table by their indices.
|
||||
|
||||
:param indices: List[int]: A list of indices of the rows to remove. (Starting from 0)
|
||||
@@ -392,6 +420,7 @@ class TableData(BaseModel): # TBD
|
||||
|
||||
indices = sorted(indices, reverse=True)
|
||||
|
||||
refs_to_remove = []
|
||||
all_removed_cells = []
|
||||
for row_index in indices:
|
||||
if row_index < 0 or row_index >= self.num_rows:
|
||||
@@ -403,6 +432,10 @@ class TableData(BaseModel): # TBD
|
||||
end_idx = start_idx + self.num_cols
|
||||
removed_cells = self.table_cells[start_idx:end_idx]
|
||||
|
||||
for cell in removed_cells:
|
||||
if isinstance(cell, RichTableCell):
|
||||
refs_to_remove.append(cell.ref)
|
||||
|
||||
# Remove the cells from the table
|
||||
self.table_cells = self.table_cells[:start_idx] + self.table_cells[end_idx:]
|
||||
|
||||
@@ -417,9 +450,18 @@ class TableData(BaseModel): # TBD
|
||||
|
||||
all_removed_cells.append(removed_cells)
|
||||
|
||||
if refs_to_remove:
|
||||
if doc is None:
|
||||
_logger.warning(
|
||||
"When table contains rich cells, `doc` argument must be provided, "
|
||||
"otherwise rich cell content will be left dangling."
|
||||
)
|
||||
else:
|
||||
doc._delete_items(refs_to_remove)
|
||||
|
||||
return all_removed_cells
|
||||
|
||||
def pop_row(self) -> List[TableCell]:
|
||||
def pop_row(self, doc: Optional["DoclingDocument"] = None) -> List[TableCell]:
|
||||
"""Remove and return the last row from the table.
|
||||
|
||||
:returns: List[TableCell]: A list of TableCell objects representing the popped row.
|
||||
@@ -427,16 +469,18 @@ class TableData(BaseModel): # TBD
|
||||
if self.num_rows == 0:
|
||||
raise IndexError("Cannot pop from an empty table.")
|
||||
|
||||
return self.remove_row(self.num_rows - 1)
|
||||
return self.remove_row(self.num_rows - 1, doc=doc)
|
||||
|
||||
def remove_row(self, row_index: int) -> List[TableCell]:
|
||||
def remove_row(
|
||||
self, row_index: int, doc: Optional["DoclingDocument"] = None
|
||||
) -> List[TableCell]:
|
||||
"""Remove a row from the table by its index.
|
||||
|
||||
:param row_index: int: The index of the row to remove. (Starting from 0)
|
||||
|
||||
:returns: List[TableCell]: A list of TableCell objects representing the removed row.
|
||||
"""
|
||||
return self.remove_rows([row_index])[0]
|
||||
return self.remove_rows([row_index], doc=doc)[0]
|
||||
|
||||
def insert_rows(
|
||||
self, row_index: int, rows: List[List[str]], after: bool = False
|
||||
@@ -1509,8 +1553,15 @@ class TableItem(FloatingItem):
|
||||
|
||||
annotations: List[TableAnnotationType] = []
|
||||
|
||||
def export_to_dataframe(self) -> pd.DataFrame:
|
||||
def export_to_dataframe(
|
||||
self, doc: Optional["DoclingDocument"] = None
|
||||
) -> pd.DataFrame:
|
||||
"""Export the table as a Pandas DataFrame."""
|
||||
if doc is None:
|
||||
_logger.warning(
|
||||
"Usage of TableItem.export_to_dataframe() without `doc` argument is deprecated."
|
||||
)
|
||||
|
||||
if self.data.num_rows == 0 or self.data.num_cols == 0:
|
||||
return pd.DataFrame()
|
||||
|
||||
@@ -1539,14 +1590,15 @@ class TableItem(FloatingItem):
|
||||
columns = ["" for _ in range(self.data.num_cols)]
|
||||
for i in range(num_headers):
|
||||
for j, cell in enumerate(self.data.grid[i]):
|
||||
col_name = cell.text
|
||||
col_name = cell._get_text(doc=doc)
|
||||
if columns[j] != "":
|
||||
col_name = f".{col_name}"
|
||||
columns[j] += col_name
|
||||
|
||||
# Create table data
|
||||
table_data = [
|
||||
[cell.text for cell in row] for row in self.data.grid[num_headers:]
|
||||
[cell._get_text(doc=doc) for cell in row]
|
||||
for row in self.data.grid[num_headers:]
|
||||
]
|
||||
|
||||
# Create DataFrame
|
||||
@@ -1577,7 +1629,7 @@ class TableItem(FloatingItem):
|
||||
|
||||
# make sure that md tables are not broken
|
||||
# due to newline chars in the text
|
||||
text = col.text
|
||||
text = col._get_text(doc=doc)
|
||||
text = text.replace("\n", " ")
|
||||
tmp.append(text)
|
||||
|
||||
@@ -1623,6 +1675,7 @@ class TableItem(FloatingItem):
|
||||
add_cell_text: bool = True,
|
||||
xsize: int = 500,
|
||||
ysize: int = 500,
|
||||
**kwargs: Any,
|
||||
) -> str:
|
||||
"""Export the table as OTSL."""
|
||||
# Possible OTSL tokens...
|
||||
@@ -1652,7 +1705,7 @@ class TableItem(FloatingItem):
|
||||
for i in range(nrows):
|
||||
for j in range(ncols):
|
||||
cell: TableCell = self.data.grid[i][j]
|
||||
content = cell.text.strip()
|
||||
content = cell._get_text(doc=doc, **kwargs).strip()
|
||||
rowspan, rowstart = (
|
||||
cell.row_span,
|
||||
cell.start_row_offset_idx,
|
||||
@@ -2304,6 +2357,15 @@ class DoclingDocument(BaseModel):
|
||||
refs_to_be_deleted=refs_to_be_deleted,
|
||||
lookup=lookup,
|
||||
)
|
||||
if isinstance(node, TableItem):
|
||||
for cell in node.data.table_cells:
|
||||
if isinstance(cell, RichTableCell):
|
||||
path = cell.ref._split_ref_to_path()
|
||||
cell.ref = self._update_ref_with_lookup(
|
||||
item_label=path[1],
|
||||
item_index=int(path[2]),
|
||||
lookup=lookup,
|
||||
)
|
||||
|
||||
# Update the self_ref reference
|
||||
if node.parent is not None:
|
||||
@@ -3945,16 +4007,22 @@ class DoclingDocument(BaseModel):
|
||||
"""num_pages."""
|
||||
return len(self.pages.values())
|
||||
|
||||
def validate_tree(self, root) -> bool:
|
||||
def validate_tree(self, root: NodeItem) -> bool:
|
||||
"""validate_tree."""
|
||||
res = []
|
||||
for child_ref in root.children:
|
||||
child = child_ref.resolve(self)
|
||||
if child.parent.resolve(self) != root:
|
||||
if child.parent.resolve(self) != root or not self.validate_tree(child):
|
||||
return False
|
||||
res.append(self.validate_tree(child))
|
||||
|
||||
return all(res) or len(res) == 0
|
||||
if isinstance(root, TableItem):
|
||||
for cell in root.data.table_cells:
|
||||
if isinstance(cell, RichTableCell) and (
|
||||
(par_ref := cell.ref.resolve(self).parent) is None
|
||||
or par_ref.resolve(self) != root
|
||||
):
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
def iterate_items(
|
||||
self,
|
||||
@@ -3963,7 +4031,7 @@ class DoclingDocument(BaseModel):
|
||||
traverse_pictures: bool = False,
|
||||
page_no: Optional[int] = None,
|
||||
included_content_layers: Optional[set[ContentLayer]] = None,
|
||||
_level: int = 0, # fixed parameter, carries through the node nesting level
|
||||
_level: int = 0, # deprecated
|
||||
) -> typing.Iterable[Tuple[NodeItem, int]]: # tuple of node and level
|
||||
"""Iterate elements with level."""
|
||||
for item, stack in self._iterate_items_with_stack(
|
||||
@@ -5324,7 +5392,9 @@ class DoclingDocument(BaseModel):
|
||||
grid.append([])
|
||||
for j, cell in enumerate(row):
|
||||
if j < 10:
|
||||
text = get_text(text=cell.text, max_text_len=16)
|
||||
text = get_text(
|
||||
cell._get_text(doc=self), max_text_len=16
|
||||
)
|
||||
grid[-1].append(text)
|
||||
|
||||
result.append("\n" + tabulate(grid) + "\n")
|
||||
@@ -5588,6 +5658,16 @@ class DoclingDocument(BaseModel):
|
||||
)
|
||||
break
|
||||
|
||||
# update rich table cells references:
|
||||
if isinstance(parent_item, TableItem):
|
||||
for cell in parent_item.data.table_cells:
|
||||
if (
|
||||
isinstance(cell, RichTableCell)
|
||||
and cell.ref.cref == item.self_ref
|
||||
):
|
||||
cell.ref.cref = new_cref
|
||||
break
|
||||
|
||||
elif num_components == 2 and path_components[1] == "body":
|
||||
parent_item = self._body
|
||||
else:
|
||||
@@ -5676,6 +5756,18 @@ class DoclingDocument(BaseModel):
|
||||
elif isinstance(item, ListItem):
|
||||
validate_list_item(self, item)
|
||||
|
||||
def add_table_cell(self, table_item: TableItem, cell: TableCell) -> None:
|
||||
"""Add a table cell to the table."""
|
||||
if isinstance(cell, RichTableCell):
|
||||
item = cell.ref.resolve(doc=self)
|
||||
if isinstance(item, NodeItem) and (
|
||||
(not item.parent) or item.parent.cref != table_item.self_ref
|
||||
):
|
||||
raise ValueError(
|
||||
f"Trying to add cell with another parent {item.parent} to {table_item.self_ref}"
|
||||
)
|
||||
table_item.data.table_cells.append(cell)
|
||||
|
||||
|
||||
# deprecated aliases (kept for backwards compatibility):
|
||||
BasePictureData = BaseAnnotation
|
||||
|
||||
@@ -252,7 +252,7 @@ def docling_document_to_legacy(doc: DoclingDocument, fallback_filaname: str = "f
|
||||
|
||||
spans = list(_make_spans(cell, item))
|
||||
table_data[i][j] = GlmTableCell(
|
||||
text=cell.text,
|
||||
text=cell._get_text(doc=doc),
|
||||
bbox=(
|
||||
cell.bbox.as_tuple()
|
||||
if cell.bbox is not None
|
||||
|
||||
@@ -1721,6 +1721,80 @@
|
||||
"title": "RefItem",
|
||||
"type": "object"
|
||||
},
|
||||
"RichTableCell": {
|
||||
"description": "RichTableCell.",
|
||||
"properties": {
|
||||
"bbox": {
|
||||
"anyOf": [
|
||||
{
|
||||
"$ref": "#/$defs/BoundingBox"
|
||||
},
|
||||
{
|
||||
"type": "null"
|
||||
}
|
||||
],
|
||||
"default": null
|
||||
},
|
||||
"row_span": {
|
||||
"default": 1,
|
||||
"title": "Row Span",
|
||||
"type": "integer"
|
||||
},
|
||||
"col_span": {
|
||||
"default": 1,
|
||||
"title": "Col Span",
|
||||
"type": "integer"
|
||||
},
|
||||
"start_row_offset_idx": {
|
||||
"title": "Start Row Offset Idx",
|
||||
"type": "integer"
|
||||
},
|
||||
"end_row_offset_idx": {
|
||||
"title": "End Row Offset Idx",
|
||||
"type": "integer"
|
||||
},
|
||||
"start_col_offset_idx": {
|
||||
"title": "Start Col Offset Idx",
|
||||
"type": "integer"
|
||||
},
|
||||
"end_col_offset_idx": {
|
||||
"title": "End Col Offset Idx",
|
||||
"type": "integer"
|
||||
},
|
||||
"text": {
|
||||
"title": "Text",
|
||||
"type": "string"
|
||||
},
|
||||
"column_header": {
|
||||
"default": false,
|
||||
"title": "Column Header",
|
||||
"type": "boolean"
|
||||
},
|
||||
"row_header": {
|
||||
"default": false,
|
||||
"title": "Row Header",
|
||||
"type": "boolean"
|
||||
},
|
||||
"row_section": {
|
||||
"default": false,
|
||||
"title": "Row Section",
|
||||
"type": "boolean"
|
||||
},
|
||||
"ref": {
|
||||
"$ref": "#/$defs/RefItem"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"start_row_offset_idx",
|
||||
"end_row_offset_idx",
|
||||
"start_col_offset_idx",
|
||||
"end_col_offset_idx",
|
||||
"text",
|
||||
"ref"
|
||||
],
|
||||
"title": "RichTableCell",
|
||||
"type": "object"
|
||||
},
|
||||
"Script": {
|
||||
"description": "Text script position.",
|
||||
"enum": [
|
||||
@@ -1923,7 +1997,14 @@
|
||||
"table_cells": {
|
||||
"default": [],
|
||||
"items": {
|
||||
"$ref": "#/$defs/TableCell"
|
||||
"anyOf": [
|
||||
{
|
||||
"$ref": "#/$defs/RichTableCell"
|
||||
},
|
||||
{
|
||||
"$ref": "#/$defs/TableCell"
|
||||
}
|
||||
]
|
||||
},
|
||||
"title": "Table Cells",
|
||||
"type": "array"
|
||||
@@ -2264,7 +2345,7 @@
|
||||
"type": "string"
|
||||
},
|
||||
"version": {
|
||||
"default": "1.5.0",
|
||||
"default": "1.6.0",
|
||||
"pattern": "^(?P<major>0|[1-9]\\d*)\\.(?P<minor>0|[1-9]\\d*)\\.(?P<patch>0|[1-9]\\d*)(?:-(?P<prerelease>(?:0|[1-9]\\d*|\\d*[a-zA-Z-][0-9a-zA-Z-]*)(?:\\.(?:0|[1-9]\\d*|\\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?(?:\\+(?P<buildmetadata>[0-9a-zA-Z-]+(?:\\.[0-9a-zA-Z-]+)*))?$",
|
||||
"title": "Version",
|
||||
"type": "string"
|
||||
|
||||
@@ -0,0 +1,595 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "81ee4096",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from docling_core.types.doc import DoclingDocument, TableData, TableCell, RichTableCell, DocItemLabel\n",
|
||||
"\n",
|
||||
"doc = DoclingDocument(name=\"\")\n",
|
||||
"doc.add_text(label=DocItemLabel.TITLE, text=\"Rich tables\")\n",
|
||||
"\n",
|
||||
"table_item = doc.add_table(\n",
|
||||
" data=TableData(\n",
|
||||
" num_rows=3,\n",
|
||||
" num_cols=2,\n",
|
||||
" ),\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"rich_item = doc.add_list_group(parent=table_item)\n",
|
||||
"doc.add_list_item(parent=rich_item, text=\"list item 1\")\n",
|
||||
"doc.add_list_item(parent=rich_item, text=\"list item 2\")\n",
|
||||
"\n",
|
||||
"for i in range(table_item.data.num_rows):\n",
|
||||
" for j in range(table_item.data.num_cols):\n",
|
||||
" if i == 1 and j == 1:\n",
|
||||
" cell = RichTableCell(\n",
|
||||
" start_row_offset_idx=i,\n",
|
||||
" end_row_offset_idx=i + 1,\n",
|
||||
" start_col_offset_idx=j,\n",
|
||||
" end_col_offset_idx=j + 1,\n",
|
||||
" ref=rich_item.get_ref(),\n",
|
||||
" )\n",
|
||||
" else:\n",
|
||||
" cell = TableCell(\n",
|
||||
" start_row_offset_idx=i,\n",
|
||||
" end_row_offset_idx=i + 1,\n",
|
||||
" start_col_offset_idx=j,\n",
|
||||
" end_col_offset_idx=j + 1,\n",
|
||||
" text=f\"cell {i},{j}\",\n",
|
||||
" )\n",
|
||||
" doc.add_table_cell(table_item=table_item, cell=cell)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "7ef93338",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Doc-level exports"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "ba4d71d9",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"# Rich tables\n",
|
||||
"\n",
|
||||
"| cell 0,0 | cell 0,1 |\n",
|
||||
"|------------|-----------------------------|\n",
|
||||
"| cell 1,0 | - list item 1 - list item 2 |\n",
|
||||
"| cell 2,0 | cell 2,1 |\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(doc.export_to_markdown())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "8a528e15",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"em;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"</head>\n",
|
||||
"<body>\n",
|
||||
"<div class='page'>\n",
|
||||
"<h1>Rich tables</h1>\n",
|
||||
"<table><tbody><tr><td>cell 0,0</td><td>cell 0,1</td></tr><tr><td>cell 1,0</td><td><ul>\n",
|
||||
"<li>list item 1</li>\n",
|
||||
"<li>list item 2</li>\n",
|
||||
"</ul></td></tr><tr><td>cell 2,0</td><td>cell 2,1</td></tr></tbody></table>\n",
|
||||
"</div>\n",
|
||||
"</body>\n",
|
||||
"</html>\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(doc.export_to_html()[-300:])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "e9ddfa73",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"<doctag><title>Rich tables</title>\n",
|
||||
"<otsl><fcel>cell 0,0<fcel>cell 0,1<nl><fcel>cell 1,0<fcel>- list item 1\n",
|
||||
"- list item 2<nl><fcel>cell 2,0<fcel>cell 2,1<nl></otsl>\n",
|
||||
"</doctag>\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(doc.export_to_doctags())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "04b08710",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"item-0 at level 0: unspecified: group _root_\n",
|
||||
" item-1 at level 1: title: Rich tables\n",
|
||||
" item-2 at level 1: table with [3x2]\n",
|
||||
" item-3 at level 2: list: group group\n",
|
||||
" item-4 at level 3: list_item: list item 1\n",
|
||||
" item-5 at level 3: list_item: list item 2\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(doc._export_to_indented_text())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "42efa550",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Item-level exports"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "4b1de5bf",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### With document reference"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "5fea4de1",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"| cell 0,0 | cell 0,1 |\n",
|
||||
"|------------|-----------------------------|\n",
|
||||
"| cell 1,0 | - list item 1 - list item 2 |\n",
|
||||
"| cell 2,0 | cell 2,1 |\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(doc.tables[0].export_to_markdown(doc=doc))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "c6d8dec4",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"<table><tbody><tr><td>cell 0,0</td><td>cell 0,1</td></tr><tr><td>cell 1,0</td><td><ul>\n",
|
||||
"<li>list item 1</li>\n",
|
||||
"<li>list item 2</li>\n",
|
||||
"</ul></td></tr><tr><td>cell 2,0</td><td>cell 2,1</td></tr></tbody></table>\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(doc.tables[0].export_to_html(doc=doc))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"id": "5aaecb64",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"<otsl><fcel>cell 0,0<fcel>cell 0,1<nl><fcel>cell 1,0<fcel>- list item 1\n",
|
||||
"- list item 2<nl><fcel>cell 2,0<fcel>cell 2,1<nl></otsl>\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(doc.tables[0].export_to_doctags(doc=doc))\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"id": "6092fbb6",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"<fcel>cell 0,0<fcel>cell 0,1<nl><fcel>cell 1,0<fcel>- list item 1\n",
|
||||
"- list item 2<nl><fcel>cell 2,0<fcel>cell 2,1<nl>\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(doc.tables[0].export_to_otsl(doc=doc))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"id": "46c2de7e",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>0</th>\n",
|
||||
" <th>1</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>cell 0,0</td>\n",
|
||||
" <td>cell 0,1</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>cell 1,0</td>\n",
|
||||
" <td>- list item 1\\n- list item 2</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>cell 2,0</td>\n",
|
||||
" <td>cell 2,1</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" 0 1\n",
|
||||
"0 cell 0,0 cell 0,1\n",
|
||||
"1 cell 1,0 - list item 1\\n- list item 2\n",
|
||||
"2 cell 2,0 cell 2,1"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"display(doc.tables[0].export_to_dataframe(doc=doc))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "d8e7ba5b",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Without document reference"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"id": "5098f6a5",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Usage of TableItem.export_to_markdown() without `doc` argument is deprecated.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"| cell 0,0 | cell 0,1 |\n",
|
||||
"|------------|--------------------|\n",
|
||||
"| cell 1,0 | <!-- rich cell --> |\n",
|
||||
"| cell 2,0 | cell 2,1 |\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(doc.tables[0].export_to_markdown())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"id": "6d1c43e5",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Usage of TableItem.export_to_html() without `doc` argument is deprecated.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(doc.tables[0].export_to_html())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"id": "89c9a3c4",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Usage of TableItem.export_to_dataframe() without `doc` argument is deprecated.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>0</th>\n",
|
||||
" <th>1</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>cell 0,0</td>\n",
|
||||
" <td>cell 0,1</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>cell 1,0</td>\n",
|
||||
" <td><!-- rich cell --></td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>cell 2,0</td>\n",
|
||||
" <td>cell 2,1</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" 0 1\n",
|
||||
"0 cell 0,0 cell 0,1\n",
|
||||
"1 cell 1,0 <!-- rich cell -->\n",
|
||||
"2 cell 2,0 cell 2,1"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"display(doc.tables[0].export_to_dataframe())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "32db55c8",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Row operations"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"id": "7c419c46",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"# Rich tables\n",
|
||||
"\n",
|
||||
"| cell 0,0 | cell 0,1 |\n",
|
||||
"|------------|-----------------------------|\n",
|
||||
"| foo | bar |\n",
|
||||
"| cell 1,0 | - list item 1 - list item 2 |\n",
|
||||
"| cell 2,0 | cell 2,1 |\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"doc.tables[0].data.insert_row(\n",
|
||||
" row_index=1,\n",
|
||||
" after=False,\n",
|
||||
" row=[\"foo\", \"bar\"],\n",
|
||||
")\n",
|
||||
"print(doc.export_to_markdown())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"id": "7a56d5fa",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"backup_doc = doc.model_copy(deep=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"id": "31fcb3f7",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"When table contains rich cells, `doc` argument must be provided, otherwise rich cell content will be left dangling.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"_ = doc.tables[0].data.remove_row(row_index=2)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 17,
|
||||
"id": "cb5827fd",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"# Rich tables\n",
|
||||
"\n",
|
||||
"| cell 0,0 | cell 0,1 |\n",
|
||||
"|------------|------------|\n",
|
||||
"| foo | bar |\n",
|
||||
"| cell 2,0 | cell 2,1 |\n",
|
||||
"\n",
|
||||
"- list item 1\n",
|
||||
"- list item 2\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(doc.export_to_markdown())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 18,
|
||||
"id": "4b3f75ba",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"# Rich tables\n",
|
||||
"\n",
|
||||
"| cell 0,0 | cell 0,1 |\n",
|
||||
"|------------|------------|\n",
|
||||
"| foo | bar |\n",
|
||||
"| cell 2,0 | cell 2,1 |\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"backup_doc.tables[0].data.remove_row(row_index=2, doc=backup_doc)\n",
|
||||
"\n",
|
||||
"print(backup_doc.export_to_markdown())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "4c77b2c9",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": ".venv",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.12.11"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
+1
-1
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"schema_name": "DoclingDocument",
|
||||
"version": "1.5.0",
|
||||
"version": "1.6.0",
|
||||
"name": "Document",
|
||||
"furniture": {
|
||||
"self_ref": "#/furniture",
|
||||
|
||||
+1
-1
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"schema_name": "DoclingDocument",
|
||||
"version": "1.5.0",
|
||||
"version": "1.6.0",
|
||||
"name": "Document",
|
||||
"furniture": {
|
||||
"self_ref": "#/furniture",
|
||||
|
||||
Vendored
+1
-1
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"schema_name": "DoclingDocument",
|
||||
"version": "1.5.0",
|
||||
"version": "1.6.0",
|
||||
"name": "2501.17887v1 + Untitled 1 + 2311.18481v1",
|
||||
"furniture": {
|
||||
"self_ref": "#/furniture",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"schema_name": "DoclingDocument",
|
||||
"version": "1.5.0",
|
||||
"version": "1.6.0",
|
||||
"name": "Untitled 1",
|
||||
"furniture": {
|
||||
"self_ref": "#/furniture",
|
||||
|
||||
+1
-1
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"schema_name": "DoclingDocument",
|
||||
"version": "1.5.0",
|
||||
"version": "1.6.0",
|
||||
"name": "Untitled 1",
|
||||
"furniture": {
|
||||
"self_ref": "#/furniture",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"schema_name": "DoclingDocument",
|
||||
"version": "1.5.0",
|
||||
"version": "1.6.0",
|
||||
"name": "Untitled 1",
|
||||
"furniture": {
|
||||
"self_ref": "#/furniture",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"schema_name": "DoclingDocument",
|
||||
"version": "1.5.0",
|
||||
"version": "1.6.0",
|
||||
"name": "Untitled 1",
|
||||
"furniture": {
|
||||
"self_ref": "#/furniture",
|
||||
|
||||
+1
-1
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"schema_name": "DoclingDocument",
|
||||
"version": "1.5.0",
|
||||
"version": "1.6.0",
|
||||
"name": "Untitled 1",
|
||||
"furniture": {
|
||||
"self_ref": "#/furniture",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"schema_name": "DoclingDocument",
|
||||
"version": "1.5.0",
|
||||
"version": "1.6.0",
|
||||
"name": "Untitled 1",
|
||||
"furniture": {
|
||||
"self_ref": "#/furniture",
|
||||
|
||||
+1
-1
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"schema_name": "DoclingDocument",
|
||||
"version": "1.5.0",
|
||||
"version": "1.6.0",
|
||||
"name": "Untitled 1",
|
||||
"furniture": {
|
||||
"self_ref": "#/furniture",
|
||||
|
||||
+1
-1
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"schema_name": "DoclingDocument",
|
||||
"version": "1.5.0",
|
||||
"version": "1.6.0",
|
||||
"name": "Untitled 1",
|
||||
"furniture": {
|
||||
"self_ref": "#/furniture",
|
||||
|
||||
+1
-1
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"schema_name": "DoclingDocument",
|
||||
"version": "1.5.0",
|
||||
"version": "1.6.0",
|
||||
"name": "Untitled 1",
|
||||
"furniture": {
|
||||
"self_ref": "#/furniture",
|
||||
|
||||
+1
-1
@@ -1097,4 +1097,4 @@ texts:
|
||||
prov: []
|
||||
self_ref: '#/texts/55'
|
||||
text: The end.
|
||||
version: 1.5.0
|
||||
version: 1.6.0
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"schema_name": "DoclingDocument",
|
||||
"version": "1.5.0",
|
||||
"version": "1.6.0",
|
||||
"name": "Untitled 1",
|
||||
"furniture": {
|
||||
"self_ref": "#/furniture",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"schema_name": "DoclingDocument",
|
||||
"version": "1.5.0",
|
||||
"version": "1.6.0",
|
||||
"name": "Untitled 1",
|
||||
"furniture": {
|
||||
"self_ref": "#/furniture",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"schema_name": "DoclingDocument",
|
||||
"version": "1.5.0",
|
||||
"version": "1.6.0",
|
||||
"name": "Untitled 1",
|
||||
"furniture": {
|
||||
"self_ref": "#/furniture",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"schema_name": "DoclingDocument",
|
||||
"version": "1.5.0",
|
||||
"version": "1.6.0",
|
||||
"name": "Untitled 1",
|
||||
"furniture": {
|
||||
"self_ref": "#/furniture",
|
||||
|
||||
+1
-1
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"schema_name": "DoclingDocument",
|
||||
"version": "1.5.0",
|
||||
"version": "1.6.0",
|
||||
"name": "Untitled 1",
|
||||
"furniture": {
|
||||
"self_ref": "#/furniture",
|
||||
|
||||
+1
-1
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"schema_name": "DoclingDocument",
|
||||
"version": "1.5.0",
|
||||
"version": "1.6.0",
|
||||
"name": "Untitled 1",
|
||||
"furniture": {
|
||||
"self_ref": "#/furniture",
|
||||
|
||||
+1
-1
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"schema_name": "DoclingDocument",
|
||||
"version": "1.5.0",
|
||||
"version": "1.6.0",
|
||||
"name": "Untitled 1",
|
||||
"furniture": {
|
||||
"self_ref": "#/furniture",
|
||||
|
||||
+1
-1
@@ -1097,4 +1097,4 @@ texts:
|
||||
prov: []
|
||||
self_ref: '#/texts/55'
|
||||
text: The end.
|
||||
version: 1.5.0
|
||||
version: 1.6.0
|
||||
|
||||
+1
-1
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"schema_name": "DoclingDocument",
|
||||
"version": "1.5.0",
|
||||
"version": "1.6.0",
|
||||
"name": "Untitled 1",
|
||||
"furniture": {
|
||||
"self_ref": "#/furniture",
|
||||
|
||||
Vendored
+1
-1
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"schema_name": "DoclingDocument",
|
||||
"version": "1.5.0",
|
||||
"version": "1.6.0",
|
||||
"name": "Document",
|
||||
"furniture": {
|
||||
"self_ref": "#/furniture",
|
||||
|
||||
Vendored
+1
-1
@@ -237,4 +237,4 @@ texts:
|
||||
self_ref: '#/texts/3'
|
||||
text: 'Figure 1: Four examples of complex page layouts across different document
|
||||
categories'
|
||||
version: 1.5.0
|
||||
version: 1.6.0
|
||||
|
||||
+1
-1
@@ -81,4 +81,4 @@ texts:
|
||||
prov: []
|
||||
self_ref: '#/texts/3'
|
||||
text: there
|
||||
version: 1.5.0
|
||||
version: 1.6.0
|
||||
|
||||
+1
-1
@@ -81,4 +81,4 @@ texts:
|
||||
prov: []
|
||||
self_ref: '#/texts/3'
|
||||
text: foo
|
||||
version: 1.5.0
|
||||
version: 1.6.0
|
||||
|
||||
Vendored
+1
-1
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"schema_name": "DoclingDocument",
|
||||
"version": "1.5.0",
|
||||
"version": "1.6.0",
|
||||
"name": "Document",
|
||||
"furniture": {
|
||||
"self_ref": "#/furniture",
|
||||
|
||||
+1
-1
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"schema_name": "DoclingDocument",
|
||||
"version": "1.5.0",
|
||||
"version": "1.6.0",
|
||||
"name": "Document",
|
||||
"furniture": {
|
||||
"self_ref": "#/furniture",
|
||||
|
||||
+1
-1
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"schema_name": "DoclingDocument",
|
||||
"version": "1.5.0",
|
||||
"version": "1.6.0",
|
||||
"name": "Document",
|
||||
"furniture": {
|
||||
"self_ref": "#/furniture",
|
||||
|
||||
Vendored
+134
@@ -0,0 +1,134 @@
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<meta charset="UTF-8"/>
|
||||
<title>Docling Document</title>
|
||||
<meta name="generator" content="Docling HTML Serializer"/>
|
||||
<style>
|
||||
html {
|
||||
background-color: #f5f5f5;
|
||||
font-family: Arial, sans-serif;
|
||||
line-height: 1.6;
|
||||
}
|
||||
body {
|
||||
max-width: 800px;
|
||||
margin: 0 auto;
|
||||
padding: 2rem;
|
||||
background-color: white;
|
||||
box-shadow: 0 0 10px rgba(0,0,0,0.1);
|
||||
}
|
||||
h1, h2, h3, h4, h5, h6 {
|
||||
color: #333;
|
||||
margin-top: 1.5em;
|
||||
margin-bottom: 0.5em;
|
||||
}
|
||||
h1 {
|
||||
font-size: 2em;
|
||||
border-bottom: 1px solid #eee;
|
||||
padding-bottom: 0.3em;
|
||||
}
|
||||
table {
|
||||
border-collapse: collapse;
|
||||
margin: 1em 0;
|
||||
width: 100%;
|
||||
}
|
||||
th, td {
|
||||
border: 1px solid #ddd;
|
||||
padding: 8px;
|
||||
text-align: left;
|
||||
}
|
||||
th {
|
||||
background-color: #f2f2f2;
|
||||
font-weight: bold;
|
||||
}
|
||||
figure {
|
||||
margin: 1.5em 0;
|
||||
text-align: center;
|
||||
}
|
||||
figcaption {
|
||||
color: #666;
|
||||
font-style: italic;
|
||||
margin-top: 0.5em;
|
||||
}
|
||||
img {
|
||||
max-width: 100%;
|
||||
height: auto;
|
||||
}
|
||||
pre {
|
||||
background-color: #f6f8fa;
|
||||
border-radius: 3px;
|
||||
padding: 1em;
|
||||
overflow: auto;
|
||||
}
|
||||
code {
|
||||
font-family: monospace;
|
||||
background-color: #f6f8fa;
|
||||
padding: 0.2em 0.4em;
|
||||
border-radius: 3px;
|
||||
}
|
||||
pre code {
|
||||
background-color: transparent;
|
||||
padding: 0;
|
||||
}
|
||||
.formula {
|
||||
text-align: center;
|
||||
padding: 0.5em;
|
||||
margin: 1em 0;
|
||||
background-color: #f9f9f9;
|
||||
}
|
||||
.formula-not-decoded {
|
||||
text-align: center;
|
||||
padding: 0.5em;
|
||||
margin: 1em 0;
|
||||
background: repeating-linear-gradient(
|
||||
45deg,
|
||||
#f0f0f0,
|
||||
#f0f0f0 10px,
|
||||
#f9f9f9 10px,
|
||||
#f9f9f9 20px
|
||||
);
|
||||
}
|
||||
.page-break {
|
||||
page-break-after: always;
|
||||
border-top: 1px dashed #ccc;
|
||||
margin: 2em 0;
|
||||
}
|
||||
.key-value-region {
|
||||
background-color: #f9f9f9;
|
||||
padding: 1em;
|
||||
border-radius: 4px;
|
||||
margin: 1em 0;
|
||||
}
|
||||
.key-value-region dt {
|
||||
font-weight: bold;
|
||||
}
|
||||
.key-value-region dd {
|
||||
margin-left: 1em;
|
||||
margin-bottom: 0.5em;
|
||||
}
|
||||
.form-container {
|
||||
border: 1px solid #ddd;
|
||||
padding: 1em;
|
||||
border-radius: 4px;
|
||||
margin: 1em 0;
|
||||
}
|
||||
.form-item {
|
||||
margin-bottom: 0.5em;
|
||||
}
|
||||
.image-classification {
|
||||
font-size: 0.9em;
|
||||
color: #666;
|
||||
margin-top: 0.5em;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<div class='page'>
|
||||
<h1>Rich tables</h1>
|
||||
<table><tbody><tr><td>cell 0,0</td><td>cell 0,1</td></tr><tr><td>cell 1,0</td><td><em><p>text in italic</p></em></td></tr><tr><td><ul>
|
||||
<li>list item 1</li>
|
||||
<li>list item 2</li>
|
||||
</ul></td><td>cell 2,1</td></tr><tr><td>cell 3,0</td><td><table><tbody><tr><td>inner cell 0,0</td><td>inner cell 0,1</td><td>inner cell 0,2</td></tr><tr><td>inner cell 1,0</td><td>inner cell 1,1</td><td>inner cell 1,2</td></tr></tbody></table></td></tr></tbody></table>
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
Vendored
+7
@@ -0,0 +1,7 @@
|
||||
# Rich tables
|
||||
|
||||
| cell 0,0 | cell 0,1 |
|
||||
|-----------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
||||
| cell 1,0 | *text in italic* |
|
||||
| - list item 1 - list item 2 | cell 2,1 |
|
||||
| cell 3,0 | | inner cell 0,0 | inner cell 0,1 | inner cell 0,2 | |------------------|------------------|------------------| | inner cell 1,0 | inner cell 1,1 | inner cell 1,2 | |
|
||||
Vendored
+6
@@ -0,0 +1,6 @@
|
||||
<doctag><title>Rich tables</title>
|
||||
<otsl><fcel>cell 0,0<fcel>cell 0,1<nl><fcel>cell 1,0<fcel>*text in italic*<nl><fcel>- list item 1
|
||||
- list item 2<fcel>cell 2,1<nl><fcel>cell 3,0<fcel>| inner cell 0,0 | inner cell 0,1 | inner cell 0,2 |
|
||||
|------------------|------------------|------------------|
|
||||
| inner cell 1,0 | inner cell 1,1 | inner cell 1,2 |<nl></otsl>
|
||||
</doctag>
|
||||
Vendored
+400
@@ -0,0 +1,400 @@
|
||||
body:
|
||||
children:
|
||||
- $ref: '#/texts/0'
|
||||
- $ref: '#/tables/0'
|
||||
content_layer: body
|
||||
label: unspecified
|
||||
name: _root_
|
||||
self_ref: '#/body'
|
||||
form_items: []
|
||||
furniture:
|
||||
children: []
|
||||
content_layer: furniture
|
||||
label: unspecified
|
||||
name: _root_
|
||||
self_ref: '#/furniture'
|
||||
groups:
|
||||
- children:
|
||||
- $ref: '#/texts/2'
|
||||
- $ref: '#/texts/3'
|
||||
content_layer: body
|
||||
label: list
|
||||
name: group
|
||||
parent:
|
||||
$ref: '#/tables/0'
|
||||
self_ref: '#/groups/0'
|
||||
key_value_items: []
|
||||
name: ''
|
||||
pages: {}
|
||||
pictures: []
|
||||
schema_name: DoclingDocument
|
||||
tables:
|
||||
- annotations: []
|
||||
captions: []
|
||||
children:
|
||||
- $ref: '#/texts/1'
|
||||
- $ref: '#/groups/0'
|
||||
- $ref: '#/tables/1'
|
||||
content_layer: body
|
||||
data:
|
||||
grid:
|
||||
- - col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 1
|
||||
end_row_offset_idx: 1
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 0
|
||||
start_row_offset_idx: 0
|
||||
text: cell 0,0
|
||||
- col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 2
|
||||
end_row_offset_idx: 1
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 1
|
||||
start_row_offset_idx: 0
|
||||
text: cell 0,1
|
||||
- - col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 1
|
||||
end_row_offset_idx: 2
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 0
|
||||
start_row_offset_idx: 1
|
||||
text: cell 1,0
|
||||
- col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 2
|
||||
end_row_offset_idx: 2
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 1
|
||||
start_row_offset_idx: 1
|
||||
text: ''
|
||||
- - col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 1
|
||||
end_row_offset_idx: 3
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 0
|
||||
start_row_offset_idx: 2
|
||||
text: ''
|
||||
- col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 2
|
||||
end_row_offset_idx: 3
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 1
|
||||
start_row_offset_idx: 2
|
||||
text: cell 2,1
|
||||
- - col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 1
|
||||
end_row_offset_idx: 4
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 0
|
||||
start_row_offset_idx: 3
|
||||
text: cell 3,0
|
||||
- col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 2
|
||||
end_row_offset_idx: 4
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 1
|
||||
start_row_offset_idx: 3
|
||||
text: ''
|
||||
num_cols: 2
|
||||
num_rows: 4
|
||||
table_cells:
|
||||
- col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 1
|
||||
end_row_offset_idx: 1
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 0
|
||||
start_row_offset_idx: 0
|
||||
text: cell 0,0
|
||||
- col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 2
|
||||
end_row_offset_idx: 1
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 1
|
||||
start_row_offset_idx: 0
|
||||
text: cell 0,1
|
||||
- col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 1
|
||||
end_row_offset_idx: 2
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 0
|
||||
start_row_offset_idx: 1
|
||||
text: cell 1,0
|
||||
- col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 2
|
||||
end_row_offset_idx: 2
|
||||
ref:
|
||||
$ref: '#/texts/1'
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 1
|
||||
start_row_offset_idx: 1
|
||||
text: ''
|
||||
- col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 1
|
||||
end_row_offset_idx: 3
|
||||
ref:
|
||||
$ref: '#/groups/0'
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 0
|
||||
start_row_offset_idx: 2
|
||||
text: ''
|
||||
- col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 2
|
||||
end_row_offset_idx: 3
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 1
|
||||
start_row_offset_idx: 2
|
||||
text: cell 2,1
|
||||
- col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 1
|
||||
end_row_offset_idx: 4
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 0
|
||||
start_row_offset_idx: 3
|
||||
text: cell 3,0
|
||||
- col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 2
|
||||
end_row_offset_idx: 4
|
||||
ref:
|
||||
$ref: '#/tables/1'
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 1
|
||||
start_row_offset_idx: 3
|
||||
text: ''
|
||||
footnotes: []
|
||||
label: table
|
||||
parent:
|
||||
$ref: '#/body'
|
||||
prov: []
|
||||
references: []
|
||||
self_ref: '#/tables/0'
|
||||
- annotations: []
|
||||
captions: []
|
||||
children: []
|
||||
content_layer: body
|
||||
data:
|
||||
grid:
|
||||
- - col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 1
|
||||
end_row_offset_idx: 1
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 0
|
||||
start_row_offset_idx: 0
|
||||
text: inner cell 0,0
|
||||
- col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 2
|
||||
end_row_offset_idx: 1
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 1
|
||||
start_row_offset_idx: 0
|
||||
text: inner cell 0,1
|
||||
- col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 3
|
||||
end_row_offset_idx: 1
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 2
|
||||
start_row_offset_idx: 0
|
||||
text: inner cell 0,2
|
||||
- - col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 1
|
||||
end_row_offset_idx: 2
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 0
|
||||
start_row_offset_idx: 1
|
||||
text: inner cell 1,0
|
||||
- col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 2
|
||||
end_row_offset_idx: 2
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 1
|
||||
start_row_offset_idx: 1
|
||||
text: inner cell 1,1
|
||||
- col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 3
|
||||
end_row_offset_idx: 2
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 2
|
||||
start_row_offset_idx: 1
|
||||
text: inner cell 1,2
|
||||
num_cols: 3
|
||||
num_rows: 2
|
||||
table_cells:
|
||||
- col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 1
|
||||
end_row_offset_idx: 1
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 0
|
||||
start_row_offset_idx: 0
|
||||
text: inner cell 0,0
|
||||
- col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 2
|
||||
end_row_offset_idx: 1
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 1
|
||||
start_row_offset_idx: 0
|
||||
text: inner cell 0,1
|
||||
- col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 3
|
||||
end_row_offset_idx: 1
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 2
|
||||
start_row_offset_idx: 0
|
||||
text: inner cell 0,2
|
||||
- col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 1
|
||||
end_row_offset_idx: 2
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 0
|
||||
start_row_offset_idx: 1
|
||||
text: inner cell 1,0
|
||||
- col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 2
|
||||
end_row_offset_idx: 2
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 1
|
||||
start_row_offset_idx: 1
|
||||
text: inner cell 1,1
|
||||
- col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 3
|
||||
end_row_offset_idx: 2
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 2
|
||||
start_row_offset_idx: 1
|
||||
text: inner cell 1,2
|
||||
footnotes: []
|
||||
label: table
|
||||
parent:
|
||||
$ref: '#/tables/0'
|
||||
prov: []
|
||||
references: []
|
||||
self_ref: '#/tables/1'
|
||||
texts:
|
||||
- children: []
|
||||
content_layer: body
|
||||
label: title
|
||||
orig: Rich tables
|
||||
parent:
|
||||
$ref: '#/body'
|
||||
prov: []
|
||||
self_ref: '#/texts/0'
|
||||
text: Rich tables
|
||||
- children: []
|
||||
content_layer: body
|
||||
formatting:
|
||||
bold: false
|
||||
italic: true
|
||||
script: baseline
|
||||
strikethrough: false
|
||||
underline: false
|
||||
label: text
|
||||
orig: text in italic
|
||||
parent:
|
||||
$ref: '#/tables/0'
|
||||
prov: []
|
||||
self_ref: '#/texts/1'
|
||||
text: text in italic
|
||||
- children: []
|
||||
content_layer: body
|
||||
enumerated: false
|
||||
label: list_item
|
||||
marker: ''
|
||||
orig: list item 1
|
||||
parent:
|
||||
$ref: '#/groups/0'
|
||||
prov: []
|
||||
self_ref: '#/texts/2'
|
||||
text: list item 1
|
||||
- children: []
|
||||
content_layer: body
|
||||
enumerated: false
|
||||
label: list_item
|
||||
marker: ''
|
||||
orig: list item 2
|
||||
parent:
|
||||
$ref: '#/groups/0'
|
||||
prov: []
|
||||
self_ref: '#/texts/3'
|
||||
text: list item 2
|
||||
version: 1.6.0
|
||||
+227
@@ -0,0 +1,227 @@
|
||||
body:
|
||||
children:
|
||||
- $ref: '#/texts/0'
|
||||
- $ref: '#/tables/0'
|
||||
content_layer: body
|
||||
label: unspecified
|
||||
name: _root_
|
||||
self_ref: '#/body'
|
||||
form_items: []
|
||||
furniture:
|
||||
children: []
|
||||
content_layer: furniture
|
||||
label: unspecified
|
||||
name: _root_
|
||||
self_ref: '#/furniture'
|
||||
groups: []
|
||||
key_value_items: []
|
||||
name: ''
|
||||
pages: {}
|
||||
pictures: []
|
||||
schema_name: DoclingDocument
|
||||
tables:
|
||||
- annotations: []
|
||||
captions: []
|
||||
children:
|
||||
- $ref: '#/texts/1'
|
||||
content_layer: body
|
||||
data:
|
||||
grid:
|
||||
- - col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 1
|
||||
end_row_offset_idx: 1
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 0
|
||||
start_row_offset_idx: 0
|
||||
text: cell 0,0
|
||||
- col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 2
|
||||
end_row_offset_idx: 1
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 1
|
||||
start_row_offset_idx: 0
|
||||
text: cell 0,1
|
||||
- - col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 1
|
||||
end_row_offset_idx: 2
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 0
|
||||
start_row_offset_idx: 1
|
||||
text: cell 1,0
|
||||
- col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 2
|
||||
end_row_offset_idx: 2
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 1
|
||||
start_row_offset_idx: 1
|
||||
text: ''
|
||||
- - col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 1
|
||||
end_row_offset_idx: 3
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 0
|
||||
start_row_offset_idx: 2
|
||||
text: cell 2,0
|
||||
- col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 2
|
||||
end_row_offset_idx: 3
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 1
|
||||
start_row_offset_idx: 2
|
||||
text: cell 2,1
|
||||
- - col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 1
|
||||
end_row_offset_idx: 4
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 0
|
||||
start_row_offset_idx: 3
|
||||
text: cell 3,0
|
||||
- col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 2
|
||||
end_row_offset_idx: 4
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 1
|
||||
start_row_offset_idx: 3
|
||||
text: cell 3,1
|
||||
num_cols: 2
|
||||
num_rows: 4
|
||||
table_cells:
|
||||
- col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 1
|
||||
end_row_offset_idx: 1
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 0
|
||||
start_row_offset_idx: 0
|
||||
text: cell 0,0
|
||||
- col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 2
|
||||
end_row_offset_idx: 1
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 1
|
||||
start_row_offset_idx: 0
|
||||
text: cell 0,1
|
||||
- col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 1
|
||||
end_row_offset_idx: 2
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 0
|
||||
start_row_offset_idx: 1
|
||||
text: cell 1,0
|
||||
- col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 2
|
||||
end_row_offset_idx: 2
|
||||
ref:
|
||||
$ref: '#/texts/1'
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 1
|
||||
start_row_offset_idx: 1
|
||||
text: ''
|
||||
- col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 1
|
||||
end_row_offset_idx: 3
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 0
|
||||
start_row_offset_idx: 2
|
||||
text: cell 2,0
|
||||
- col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 2
|
||||
end_row_offset_idx: 3
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 1
|
||||
start_row_offset_idx: 2
|
||||
text: cell 2,1
|
||||
- col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 1
|
||||
end_row_offset_idx: 4
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 0
|
||||
start_row_offset_idx: 3
|
||||
text: cell 3,0
|
||||
- col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 2
|
||||
end_row_offset_idx: 4
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 1
|
||||
start_row_offset_idx: 3
|
||||
text: cell 3,1
|
||||
footnotes: []
|
||||
label: table
|
||||
parent:
|
||||
$ref: '#/body'
|
||||
prov: []
|
||||
references: []
|
||||
self_ref: '#/tables/0'
|
||||
texts:
|
||||
- children: []
|
||||
content_layer: body
|
||||
label: title
|
||||
orig: Rich tables
|
||||
parent:
|
||||
$ref: '#/body'
|
||||
prov: []
|
||||
self_ref: '#/texts/0'
|
||||
text: Rich tables
|
||||
- children: []
|
||||
content_layer: body
|
||||
formatting:
|
||||
bold: false
|
||||
italic: true
|
||||
script: baseline
|
||||
strikethrough: false
|
||||
underline: false
|
||||
label: text
|
||||
orig: text in italic
|
||||
parent:
|
||||
$ref: '#/tables/0'
|
||||
prov: []
|
||||
self_ref: '#/texts/1'
|
||||
text: text in italic
|
||||
version: 1.6.0
|
||||
+237
@@ -0,0 +1,237 @@
|
||||
body:
|
||||
children:
|
||||
- $ref: '#/texts/0'
|
||||
- $ref: '#/texts/2'
|
||||
- $ref: '#/tables/0'
|
||||
content_layer: body
|
||||
label: unspecified
|
||||
name: _root_
|
||||
self_ref: '#/body'
|
||||
form_items: []
|
||||
furniture:
|
||||
children: []
|
||||
content_layer: furniture
|
||||
label: unspecified
|
||||
name: _root_
|
||||
self_ref: '#/furniture'
|
||||
groups: []
|
||||
key_value_items: []
|
||||
name: ''
|
||||
pages: {}
|
||||
pictures: []
|
||||
schema_name: DoclingDocument
|
||||
tables:
|
||||
- annotations: []
|
||||
captions: []
|
||||
children:
|
||||
- $ref: '#/texts/1'
|
||||
content_layer: body
|
||||
data:
|
||||
grid:
|
||||
- - col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 1
|
||||
end_row_offset_idx: 1
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 0
|
||||
start_row_offset_idx: 0
|
||||
text: cell 0,0
|
||||
- col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 2
|
||||
end_row_offset_idx: 1
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 1
|
||||
start_row_offset_idx: 0
|
||||
text: cell 0,1
|
||||
- - col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 1
|
||||
end_row_offset_idx: 2
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 0
|
||||
start_row_offset_idx: 1
|
||||
text: cell 1,0
|
||||
- col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 2
|
||||
end_row_offset_idx: 2
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 1
|
||||
start_row_offset_idx: 1
|
||||
text: ''
|
||||
- - col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 1
|
||||
end_row_offset_idx: 3
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 0
|
||||
start_row_offset_idx: 2
|
||||
text: cell 2,0
|
||||
- col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 2
|
||||
end_row_offset_idx: 3
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 1
|
||||
start_row_offset_idx: 2
|
||||
text: cell 2,1
|
||||
- - col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 1
|
||||
end_row_offset_idx: 4
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 0
|
||||
start_row_offset_idx: 3
|
||||
text: cell 3,0
|
||||
- col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 2
|
||||
end_row_offset_idx: 4
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 1
|
||||
start_row_offset_idx: 3
|
||||
text: cell 3,1
|
||||
num_cols: 2
|
||||
num_rows: 4
|
||||
table_cells:
|
||||
- col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 1
|
||||
end_row_offset_idx: 1
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 0
|
||||
start_row_offset_idx: 0
|
||||
text: cell 0,0
|
||||
- col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 2
|
||||
end_row_offset_idx: 1
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 1
|
||||
start_row_offset_idx: 0
|
||||
text: cell 0,1
|
||||
- col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 1
|
||||
end_row_offset_idx: 2
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 0
|
||||
start_row_offset_idx: 1
|
||||
text: cell 1,0
|
||||
- col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 2
|
||||
end_row_offset_idx: 2
|
||||
ref:
|
||||
$ref: '#/texts/1'
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 1
|
||||
start_row_offset_idx: 1
|
||||
text: ''
|
||||
- col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 1
|
||||
end_row_offset_idx: 3
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 0
|
||||
start_row_offset_idx: 2
|
||||
text: cell 2,0
|
||||
- col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 2
|
||||
end_row_offset_idx: 3
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 1
|
||||
start_row_offset_idx: 2
|
||||
text: cell 2,1
|
||||
- col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 1
|
||||
end_row_offset_idx: 4
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 0
|
||||
start_row_offset_idx: 3
|
||||
text: cell 3,0
|
||||
- col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 2
|
||||
end_row_offset_idx: 4
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 1
|
||||
start_row_offset_idx: 3
|
||||
text: cell 3,1
|
||||
footnotes: []
|
||||
label: table
|
||||
parent:
|
||||
$ref: '#/body'
|
||||
prov: []
|
||||
references: []
|
||||
self_ref: '#/tables/0'
|
||||
texts:
|
||||
- children: []
|
||||
content_layer: body
|
||||
label: title
|
||||
orig: Rich tables
|
||||
parent:
|
||||
$ref: '#/body'
|
||||
prov: []
|
||||
self_ref: '#/texts/0'
|
||||
text: Rich tables
|
||||
- children: []
|
||||
content_layer: body
|
||||
formatting:
|
||||
bold: false
|
||||
italic: true
|
||||
script: baseline
|
||||
strikethrough: false
|
||||
underline: false
|
||||
label: text
|
||||
orig: text in italic
|
||||
parent:
|
||||
$ref: '#/tables/0'
|
||||
prov: []
|
||||
self_ref: '#/texts/1'
|
||||
text: text in italic
|
||||
- children: []
|
||||
content_layer: body
|
||||
label: text
|
||||
orig: text before
|
||||
parent:
|
||||
$ref: '#/body'
|
||||
prov: []
|
||||
self_ref: '#/texts/2'
|
||||
text: text before
|
||||
version: 1.6.0
|
||||
+237
@@ -0,0 +1,237 @@
|
||||
body:
|
||||
children:
|
||||
- $ref: '#/texts/0'
|
||||
- $ref: '#/texts/1'
|
||||
- $ref: '#/tables/0'
|
||||
content_layer: body
|
||||
label: unspecified
|
||||
name: _root_
|
||||
self_ref: '#/body'
|
||||
form_items: []
|
||||
furniture:
|
||||
children: []
|
||||
content_layer: furniture
|
||||
label: unspecified
|
||||
name: _root_
|
||||
self_ref: '#/furniture'
|
||||
groups: []
|
||||
key_value_items: []
|
||||
name: ''
|
||||
pages: {}
|
||||
pictures: []
|
||||
schema_name: DoclingDocument
|
||||
tables:
|
||||
- annotations: []
|
||||
captions: []
|
||||
children:
|
||||
- $ref: '#/texts/2'
|
||||
content_layer: body
|
||||
data:
|
||||
grid:
|
||||
- - col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 1
|
||||
end_row_offset_idx: 1
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 0
|
||||
start_row_offset_idx: 0
|
||||
text: cell 0,0
|
||||
- col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 2
|
||||
end_row_offset_idx: 1
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 1
|
||||
start_row_offset_idx: 0
|
||||
text: cell 0,1
|
||||
- - col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 1
|
||||
end_row_offset_idx: 2
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 0
|
||||
start_row_offset_idx: 1
|
||||
text: cell 1,0
|
||||
- col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 2
|
||||
end_row_offset_idx: 2
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 1
|
||||
start_row_offset_idx: 1
|
||||
text: ''
|
||||
- - col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 1
|
||||
end_row_offset_idx: 3
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 0
|
||||
start_row_offset_idx: 2
|
||||
text: cell 2,0
|
||||
- col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 2
|
||||
end_row_offset_idx: 3
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 1
|
||||
start_row_offset_idx: 2
|
||||
text: cell 2,1
|
||||
- - col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 1
|
||||
end_row_offset_idx: 4
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 0
|
||||
start_row_offset_idx: 3
|
||||
text: cell 3,0
|
||||
- col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 2
|
||||
end_row_offset_idx: 4
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 1
|
||||
start_row_offset_idx: 3
|
||||
text: cell 3,1
|
||||
num_cols: 2
|
||||
num_rows: 4
|
||||
table_cells:
|
||||
- col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 1
|
||||
end_row_offset_idx: 1
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 0
|
||||
start_row_offset_idx: 0
|
||||
text: cell 0,0
|
||||
- col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 2
|
||||
end_row_offset_idx: 1
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 1
|
||||
start_row_offset_idx: 0
|
||||
text: cell 0,1
|
||||
- col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 1
|
||||
end_row_offset_idx: 2
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 0
|
||||
start_row_offset_idx: 1
|
||||
text: cell 1,0
|
||||
- col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 2
|
||||
end_row_offset_idx: 2
|
||||
ref:
|
||||
$ref: '#/texts/2'
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 1
|
||||
start_row_offset_idx: 1
|
||||
text: ''
|
||||
- col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 1
|
||||
end_row_offset_idx: 3
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 0
|
||||
start_row_offset_idx: 2
|
||||
text: cell 2,0
|
||||
- col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 2
|
||||
end_row_offset_idx: 3
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 1
|
||||
start_row_offset_idx: 2
|
||||
text: cell 2,1
|
||||
- col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 1
|
||||
end_row_offset_idx: 4
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 0
|
||||
start_row_offset_idx: 3
|
||||
text: cell 3,0
|
||||
- col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 2
|
||||
end_row_offset_idx: 4
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 1
|
||||
start_row_offset_idx: 3
|
||||
text: cell 3,1
|
||||
footnotes: []
|
||||
label: table
|
||||
parent:
|
||||
$ref: '#/body'
|
||||
prov: []
|
||||
references: []
|
||||
self_ref: '#/tables/0'
|
||||
texts:
|
||||
- children: []
|
||||
content_layer: body
|
||||
label: title
|
||||
orig: Rich tables
|
||||
parent:
|
||||
$ref: '#/body'
|
||||
prov: []
|
||||
self_ref: '#/texts/0'
|
||||
text: Rich tables
|
||||
- children: []
|
||||
content_layer: body
|
||||
label: text
|
||||
orig: text before
|
||||
parent:
|
||||
$ref: '#/body'
|
||||
prov: []
|
||||
self_ref: '#/texts/1'
|
||||
text: text before
|
||||
- children: []
|
||||
content_layer: body
|
||||
formatting:
|
||||
bold: false
|
||||
italic: true
|
||||
script: baseline
|
||||
strikethrough: false
|
||||
underline: false
|
||||
label: text
|
||||
orig: text in italic
|
||||
parent:
|
||||
$ref: '#/tables/0'
|
||||
prov: []
|
||||
self_ref: '#/texts/2'
|
||||
text: text in italic
|
||||
version: 1.6.0
|
||||
+390
@@ -0,0 +1,390 @@
|
||||
body:
|
||||
children:
|
||||
- $ref: '#/tables/0'
|
||||
content_layer: body
|
||||
label: unspecified
|
||||
name: _root_
|
||||
self_ref: '#/body'
|
||||
form_items: []
|
||||
furniture:
|
||||
children: []
|
||||
content_layer: furniture
|
||||
label: unspecified
|
||||
name: _root_
|
||||
self_ref: '#/furniture'
|
||||
groups:
|
||||
- children:
|
||||
- $ref: '#/texts/1'
|
||||
- $ref: '#/texts/2'
|
||||
content_layer: body
|
||||
label: list
|
||||
name: group
|
||||
parent:
|
||||
$ref: '#/tables/0'
|
||||
self_ref: '#/groups/0'
|
||||
key_value_items: []
|
||||
name: ''
|
||||
pages: {}
|
||||
pictures: []
|
||||
schema_name: DoclingDocument
|
||||
tables:
|
||||
- annotations: []
|
||||
captions: []
|
||||
children:
|
||||
- $ref: '#/texts/0'
|
||||
- $ref: '#/groups/0'
|
||||
- $ref: '#/tables/1'
|
||||
content_layer: body
|
||||
data:
|
||||
grid:
|
||||
- - col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 1
|
||||
end_row_offset_idx: 1
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 0
|
||||
start_row_offset_idx: 0
|
||||
text: cell 0,0
|
||||
- col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 2
|
||||
end_row_offset_idx: 1
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 1
|
||||
start_row_offset_idx: 0
|
||||
text: cell 0,1
|
||||
- - col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 1
|
||||
end_row_offset_idx: 2
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 0
|
||||
start_row_offset_idx: 1
|
||||
text: cell 1,0
|
||||
- col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 2
|
||||
end_row_offset_idx: 2
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 1
|
||||
start_row_offset_idx: 1
|
||||
text: ''
|
||||
- - col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 1
|
||||
end_row_offset_idx: 3
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 0
|
||||
start_row_offset_idx: 2
|
||||
text: ''
|
||||
- col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 2
|
||||
end_row_offset_idx: 3
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 1
|
||||
start_row_offset_idx: 2
|
||||
text: cell 2,1
|
||||
- - col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 1
|
||||
end_row_offset_idx: 4
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 0
|
||||
start_row_offset_idx: 3
|
||||
text: cell 3,0
|
||||
- col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 2
|
||||
end_row_offset_idx: 4
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 1
|
||||
start_row_offset_idx: 3
|
||||
text: ''
|
||||
num_cols: 2
|
||||
num_rows: 4
|
||||
table_cells:
|
||||
- col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 1
|
||||
end_row_offset_idx: 1
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 0
|
||||
start_row_offset_idx: 0
|
||||
text: cell 0,0
|
||||
- col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 2
|
||||
end_row_offset_idx: 1
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 1
|
||||
start_row_offset_idx: 0
|
||||
text: cell 0,1
|
||||
- col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 1
|
||||
end_row_offset_idx: 2
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 0
|
||||
start_row_offset_idx: 1
|
||||
text: cell 1,0
|
||||
- col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 2
|
||||
end_row_offset_idx: 2
|
||||
ref:
|
||||
$ref: '#/texts/0'
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 1
|
||||
start_row_offset_idx: 1
|
||||
text: ''
|
||||
- col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 1
|
||||
end_row_offset_idx: 3
|
||||
ref:
|
||||
$ref: '#/groups/0'
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 0
|
||||
start_row_offset_idx: 2
|
||||
text: ''
|
||||
- col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 2
|
||||
end_row_offset_idx: 3
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 1
|
||||
start_row_offset_idx: 2
|
||||
text: cell 2,1
|
||||
- col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 1
|
||||
end_row_offset_idx: 4
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 0
|
||||
start_row_offset_idx: 3
|
||||
text: cell 3,0
|
||||
- col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 2
|
||||
end_row_offset_idx: 4
|
||||
ref:
|
||||
$ref: '#/tables/1'
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 1
|
||||
start_row_offset_idx: 3
|
||||
text: ''
|
||||
footnotes: []
|
||||
label: table
|
||||
parent:
|
||||
$ref: '#/body'
|
||||
prov: []
|
||||
references: []
|
||||
self_ref: '#/tables/0'
|
||||
- annotations: []
|
||||
captions: []
|
||||
children: []
|
||||
content_layer: body
|
||||
data:
|
||||
grid:
|
||||
- - col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 1
|
||||
end_row_offset_idx: 1
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 0
|
||||
start_row_offset_idx: 0
|
||||
text: inner cell 0,0
|
||||
- col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 2
|
||||
end_row_offset_idx: 1
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 1
|
||||
start_row_offset_idx: 0
|
||||
text: inner cell 0,1
|
||||
- col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 3
|
||||
end_row_offset_idx: 1
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 2
|
||||
start_row_offset_idx: 0
|
||||
text: inner cell 0,2
|
||||
- - col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 1
|
||||
end_row_offset_idx: 2
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 0
|
||||
start_row_offset_idx: 1
|
||||
text: inner cell 1,0
|
||||
- col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 2
|
||||
end_row_offset_idx: 2
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 1
|
||||
start_row_offset_idx: 1
|
||||
text: inner cell 1,1
|
||||
- col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 3
|
||||
end_row_offset_idx: 2
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 2
|
||||
start_row_offset_idx: 1
|
||||
text: inner cell 1,2
|
||||
num_cols: 3
|
||||
num_rows: 2
|
||||
table_cells:
|
||||
- col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 1
|
||||
end_row_offset_idx: 1
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 0
|
||||
start_row_offset_idx: 0
|
||||
text: inner cell 0,0
|
||||
- col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 2
|
||||
end_row_offset_idx: 1
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 1
|
||||
start_row_offset_idx: 0
|
||||
text: inner cell 0,1
|
||||
- col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 3
|
||||
end_row_offset_idx: 1
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 2
|
||||
start_row_offset_idx: 0
|
||||
text: inner cell 0,2
|
||||
- col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 1
|
||||
end_row_offset_idx: 2
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 0
|
||||
start_row_offset_idx: 1
|
||||
text: inner cell 1,0
|
||||
- col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 2
|
||||
end_row_offset_idx: 2
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 1
|
||||
start_row_offset_idx: 1
|
||||
text: inner cell 1,1
|
||||
- col_span: 1
|
||||
column_header: false
|
||||
end_col_offset_idx: 3
|
||||
end_row_offset_idx: 2
|
||||
row_header: false
|
||||
row_section: false
|
||||
row_span: 1
|
||||
start_col_offset_idx: 2
|
||||
start_row_offset_idx: 1
|
||||
text: inner cell 1,2
|
||||
footnotes: []
|
||||
label: table
|
||||
parent:
|
||||
$ref: '#/tables/0'
|
||||
prov: []
|
||||
references: []
|
||||
self_ref: '#/tables/1'
|
||||
texts:
|
||||
- children: []
|
||||
content_layer: body
|
||||
formatting:
|
||||
bold: false
|
||||
italic: true
|
||||
script: baseline
|
||||
strikethrough: false
|
||||
underline: false
|
||||
label: text
|
||||
orig: text in italic
|
||||
parent:
|
||||
$ref: '#/tables/0'
|
||||
prov: []
|
||||
self_ref: '#/texts/0'
|
||||
text: text in italic
|
||||
- children: []
|
||||
content_layer: body
|
||||
enumerated: false
|
||||
label: list_item
|
||||
marker: ''
|
||||
orig: list item 1
|
||||
parent:
|
||||
$ref: '#/groups/0'
|
||||
prov: []
|
||||
self_ref: '#/texts/1'
|
||||
text: list item 1
|
||||
- children: []
|
||||
content_layer: body
|
||||
enumerated: false
|
||||
label: list_item
|
||||
marker: ''
|
||||
orig: list item 2
|
||||
parent:
|
||||
$ref: '#/groups/0'
|
||||
prov: []
|
||||
self_ref: '#/texts/2'
|
||||
text: list item 2
|
||||
version: 1.6.0
|
||||
+1
-1
@@ -6618,4 +6618,4 @@ texts:
|
||||
text: '23. Zhong, X., Tang, J., Yepes, A.J.: Publaynet: largest dataset ever for
|
||||
document layout analysis. In: 2019 International Conference on Document Analysis
|
||||
and Recognition (ICDAR). pp. 1015-1022. IEEE (2019)'
|
||||
version: 1.5.0
|
||||
version: 1.6.0
|
||||
|
||||
@@ -33,6 +33,7 @@ from docling_core.types.doc.document import ( # BoundingBox,
|
||||
PictureItem,
|
||||
ProvenanceItem,
|
||||
RefItem,
|
||||
RichTableCell,
|
||||
Script,
|
||||
SectionHeaderItem,
|
||||
Size,
|
||||
@@ -2092,3 +2093,215 @@ def test_group_without_children():
|
||||
bad_doc.add_group()
|
||||
with pytest.raises(ValueError):
|
||||
bad_doc._validate_rules()
|
||||
|
||||
|
||||
def _construct_rich_table_doc():
    """Build a document whose 4x2 table contains three rich cells.

    The outer table is the parent of an italic text item, a list group with
    two list items, and a nested 2x3 table. Cells (1, 1), (2, 0) and (3, 1)
    of the outer table reference those items, in that order; every other
    cell is a plain ``TableCell`` with the text ``"cell <row>,<col>"``.

    Returns:
        The populated ``DoclingDocument``.
    """
    doc = DoclingDocument(name="")
    doc.add_text(label=DocItemLabel.TITLE, text="Rich tables")

    outer_table = doc.add_table(data=TableData(num_rows=4, num_cols=2))

    italic_text = doc.add_text(
        parent=outer_table,
        text="text in italic",
        label=DocItemLabel.TEXT,
        formatting=Formatting(italic=True),
    )

    list_group = doc.add_list_group(parent=outer_table)
    for item_text in ("list item 1", "list item 2"):
        doc.add_list_item(parent=list_group, text=item_text)

    inner_table = doc.add_table(
        data=TableData(num_rows=2, num_cols=3), parent=outer_table
    )
    for row in range(inner_table.data.num_rows):
        for col in range(inner_table.data.num_cols):
            doc.add_table_cell(
                table_item=inner_table,
                cell=TableCell(
                    text=f"inner cell {row},{col}",
                    start_row_offset_idx=row,
                    end_row_offset_idx=row + 1,
                    start_col_offset_idx=col,
                    end_col_offset_idx=col + 1,
                ),
            )

    # Outer-table grid positions that point at a rich item instead of text.
    rich_items = {
        (1, 1): italic_text,
        (2, 0): list_group,
        (3, 1): inner_table,
    }

    for row in range(outer_table.data.num_rows):
        for col in range(outer_table.data.num_cols):
            # Every cell spans exactly one row and one column.
            span = dict(
                start_row_offset_idx=row,
                end_row_offset_idx=row + 1,
                start_col_offset_idx=col,
                end_col_offset_idx=col + 1,
            )
            rich_item = rich_items.get((row, col))
            if rich_item is None:
                cell = TableCell(text=f"cell {row},{col}", **span)
            else:
                cell = RichTableCell(ref=rich_item.get_ref(), **span)
            doc.add_table_cell(table_item=outer_table, cell=cell)

    return doc
|
||||
|
||||
|
||||
def test_rich_tables():
    """Compare the rich-table document with its stored YAML snapshot."""
    doc = _construct_rich_table_doc()
    snapshot_path = Path("test/data/doc/rich_table.out.yaml")

    # With GEN_TEST_DATA set, the snapshot is written before it is read back.
    if GEN_TEST_DATA:
        doc.save_as_yaml(snapshot_path)

    assert doc == DoclingDocument.load_from_yaml(snapshot_path)
|
||||
|
||||
|
||||
def test_doc_manipulation_with_rich_tables():
    """Delete the first text item of the rich-table document and compare with the snapshot."""
    doc = _construct_rich_table_doc()

    first_text = doc.texts[0]
    doc.delete_items(node_items=[first_text])

    snapshot_path = Path("test/data/doc/rich_table_post_text_del.out.yaml")
    # With GEN_TEST_DATA set, the snapshot is written before it is read back.
    if GEN_TEST_DATA:
        doc.save_as_yaml(snapshot_path)

    assert doc == DoclingDocument.load_from_yaml(snapshot_path)
|
||||
|
||||
|
||||
def test_invalid_rich_table_doc():
    """Check that a rich cell referencing an item outside the table is rejected.

    The referenced text item is parented by the document body rather than by
    the table. ``add_table_cell()`` is expected to raise ``ValueError`` for
    that cell; the cell is then appended to ``table_cells`` directly, after
    which ``validate_document()`` is expected to raise ``ValueError`` too.
    """
    doc = DoclingDocument(name="")
    table_item = doc.add_table(data=TableData(num_rows=2, num_cols=2))
    stray_item = doc.add_text(
        text="rich item",
        label=DocItemLabel.TEXT,
        parent=doc.body,  # not the table item
    )

    for row in range(table_item.data.num_rows):
        for col in range(table_item.data.num_cols):
            span = dict(
                start_row_offset_idx=row,
                end_row_offset_idx=row + 1,
                start_col_offset_idx=col,
                end_col_offset_idx=col + 1,
            )
            if (row, col) != (1, 1):
                table_cell = TableCell(text=f"cell {row},{col}", **span)
            else:
                table_cell = RichTableCell(ref=stray_item.get_ref(), **span)

                # ensure add_table_cell() raises:
                with pytest.raises(ValueError):
                    doc.add_table_cell(table_item=table_item, cell=table_cell)

            # discouraged but technically possible:
            table_item.data.table_cells.append(table_cell)

    # ensure validate_document() raises:
    with pytest.raises(ValueError):
        DoclingDocument.validate_document(doc)
|
||||
|
||||
|
||||
def test_rich_table_item_insertion_normalization():
    """Snapshot a rich-table document around an insertion and a normalization.

    Three YAML snapshots are compared: the document as built, the document
    after a text item is inserted before the table, and the document after
    ``_normalize_references()`` has been called.
    """
    doc = DoclingDocument(name="")
    doc.add_text(label=DocItemLabel.TITLE, text="Rich tables")

    table_item = doc.add_table(data=TableData(num_rows=4, num_cols=2))

    rich_item = doc.add_text(
        parent=table_item,
        text="text in italic",
        label=DocItemLabel.TEXT,
        formatting=Formatting(italic=True),
    )

    def _assert_matches_snapshot(snapshot_path):
        # With GEN_TEST_DATA set, the snapshot is written before it is read back.
        if GEN_TEST_DATA:
            doc.save_as_yaml(snapshot_path)
        assert doc == DoclingDocument.load_from_yaml(snapshot_path)

    for row in range(table_item.data.num_rows):
        for col in range(table_item.data.num_cols):
            span = dict(
                start_row_offset_idx=row,
                end_row_offset_idx=row + 1,
                start_col_offset_idx=col,
                end_col_offset_idx=col + 1,
            )
            # Only cell (1, 1) is rich; all others carry plain text.
            if (row, col) == (1, 1):
                cell = RichTableCell(ref=rich_item.get_ref(), **span)
            else:
                cell = TableCell(text=f"cell {row},{col}", **span)
            doc.add_table_cell(table_item=table_item, cell=cell)

    # state before insert:
    _assert_matches_snapshot(Path("test/data/doc/rich_table_item_ins_norm_1.out.yaml"))

    text_before = TextItem(
        self_ref="#",
        text="text before",
        orig="text before",
        label=DocItemLabel.TEXT,
    )
    doc.insert_item_before_sibling(new_item=text_before, sibling=table_item)

    # state after insert (prior to normalization):
    _assert_matches_snapshot(Path("test/data/doc/rich_table_item_ins_norm_2.out.yaml"))

    doc._normalize_references()

    # state after insert (after normalization):
    _assert_matches_snapshot(Path("test/data/doc/rich_table_item_ins_norm_3.out.yaml"))
|
||||
|
||||
@@ -29,7 +29,7 @@ from docling_core.types.doc.document import DoclingDocument, MiscAnnotation, Tab
|
||||
from docling_core.types.doc.labels import DocItemLabel
|
||||
|
||||
from .test_data_gen_flag import GEN_TEST_DATA
|
||||
from .test_docling_doc import _construct_doc
|
||||
from .test_docling_doc import _construct_doc, _construct_rich_table_doc
|
||||
|
||||
|
||||
class CustomAnnotationTableSerializer(MarkdownTableSerializer):
|
||||
@@ -361,6 +361,15 @@ def test_md_nested_lists():
|
||||
verify(exp_file=src.parent / f"{src.stem}.gt.md", actual=actual)
|
||||
|
||||
|
||||
def test_md_rich_table():
    """Serialize the rich-table document to Markdown and verify it against the ground truth."""
    serializer = MarkdownDocSerializer(doc=_construct_rich_table_doc())
    verify(
        exp_file=Path("./test/data/doc/rich_table.gt.md"),
        actual=serializer.serialize().text,
    )
|
||||
|
||||
|
||||
def test_html_split_page():
|
||||
src = Path("./test/data/doc/2408.09869v3_enriched.json")
|
||||
doc = DoclingDocument.load_from_json(src)
|
||||
@@ -500,6 +509,15 @@ def test_html_nested_lists():
|
||||
verify(exp_file=src.parent / f"{src.stem}.gt.html", actual=actual)
|
||||
|
||||
|
||||
def test_html_rich_table():
    """Serialize the rich-table document to HTML and verify it against the ground truth."""
    serializer = HTMLDocSerializer(doc=_construct_rich_table_doc())
    verify(
        exp_file=Path("./test/data/doc/rich_table.gt.html"),
        actual=serializer.serialize().text,
    )
|
||||
|
||||
|
||||
def test_doctags_inline_loc_tags():
|
||||
src = Path("./test/data/doc/2408.09869v3_enriched.json")
|
||||
doc = DoclingDocument.load_from_json(src)
|
||||
@@ -507,3 +525,13 @@ def test_doctags_inline_loc_tags():
|
||||
ser = DocTagsDocSerializer(doc=doc)
|
||||
actual = ser.serialize().text
|
||||
verify(exp_file=src.parent / f"{src.stem}.out.dt", actual=actual)
|
||||
|
||||
|
||||
def test_doctags_rich_table():
    """Serialize the rich-table document to DocTags and verify it against the expected file."""
    serializer = DocTagsDocSerializer(doc=_construct_rich_table_doc())
    verify(
        exp_file=Path("./test/data/doc/rich_table.out.dt"),
        actual=serializer.serialize().text,
    )
|
||||
|
||||
Reference in New Issue
Block a user