working on Forms

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
Peter Staar
2025-09-22 11:34:15 +02:00
parent 072e6fb1a4
commit 566a0dcd29
4 changed files with 70 additions and 37 deletions
+29 -31
View File
@@ -1188,6 +1188,7 @@ class TextItem(DocItem):
DocItemLabel.REFERENCE,
DocItemLabel.TEXT,
DocItemLabel.EMPTY_VALUE,
DocItemLabel.FORM_KEY,
]
orig: str # untreated representation
@@ -1920,59 +1921,55 @@ class KeyValueItem(FloatingItem):
text = serializer.serialize(item=self).text
return text
class CheckboxItem(ListItem):
    """CheckboxItem: a list item that also carries a checked state."""

    # Label is pinned to DocItemLabel.CHECKBOX for this item type.
    label: typing.Literal[DocItemLabel.CHECKBOX] = DocItemLabel.CHECKBOX
    # Whether the checkbox is ticked; defaults to False.
    checked: bool = False
"""
class FormHeaderItem(SectionHeaderItem):
"""FormHeaderItem."""
label: typing.Literal[DocItemLabel.FORM_HEADER] = DocItemLabel.FORM_HEADER
class FormTextItem(TextItem):
"""FormTextItem."""
label: typing.Literal[DocItemLabel.FORM_TEXT] = DocItemLabel.FORM_TEXT
class FormListItem(TextItem):
"""
class FormListItem(DocItem):
"""FormListItem."""
label: typing.Literal[DocItemLabel.FORM_ITEM] = DocItemLabel.FORM_ITEM
label: typing.Literal[DocItemLabel.FORM_LISTITEM] = DocItemLabel.FORM_LISTITEM
marker: Optional[TextItem]
marker: Optional[TextItem] = None
key: TextItem
value: TextItem
def add_value(self, item: Union[CheckboxItem, ListItem, TextItem]) -> NodeItem:
    """Attach ``item`` as a value of this form list item.

    Sets the item's parent to this node's reference and records the item in
    ``self.children`` by reference, the same way ``FormItem.add`` does.

    Args:
        item: The checkbox, list item or text item holding the value.

    Returns:
        The same ``item`` that was passed in.
    """
    item.parent = self.get_ref()
    # Store the child's reference rather than the object itself, so this
    # method stays consistent with FormItem.add.
    self.children.append(item.get_ref())
    return item
class FormItem(FloatingItem):
"""FormItem."""
label: typing.Literal[DocItemLabel.FORM] = DocItemLabel.FORM
def add(self, item: Union["FormItem", FormHeaderItem, FormTextItem, FormListItem]):
return
def add_form(self, item: "FormItem") -> NodeItem:
item.parent = self.cref
self.children.append(item)
def add(self, item: Union["FormItem", SectionHeaderItem, TextItem, FormListItem]) -> NodeItem:
item.parent = self.get_ref()
self.children.append(item.get_ref())
return item
def add_form_item(self, item: FormItem):
item.parent = self.cref
self.children.append(item)
def add_listitem(self, doc: DoclingDocument, prov: Optional[ProvenanceItem] = None) -> NodeItem:
li = FormListItem(self_ref=self.get_ref())
return item
def add_form_text(self, item: FormTextItem):
item.parent = self.cref
self.children.append(item)
return item
def add_form_header(self, item: FormHeaderItem):
item.parent = self.cref
self.children.append(item)
return item
ContentItem = Annotated[
Union[
@@ -1985,6 +1982,7 @@ ContentItem = Annotated[
PictureItem,
TableItem,
KeyValueItem,
FormItem,
],
Field(discriminator="label"),
]
+4 -3
View File
@@ -31,9 +31,10 @@ class DocItemLabel(str, Enum):
HANDWRITTEN_TEXT = "handwritten_text"
EMPTY_VALUE = "empty_value" # used for empty value fields in fillable forms
FORM_HEADER = "form_header"
FORM_ITEM = "form_item"
FORM_TEXT = "form_text"
# FORM_HEADER = "form_header"
FORM_KEY = "form_key"
FORM_LISTITEM = "form_listitem"
CHECKBOX = "checkbox"
# Additional labels for markup-based formats (e.g. HTML, Word)
PARAGRAPH = "paragraph"
+1 -1
View File
@@ -24,7 +24,7 @@ graph:
source_cell_id: 1
target_cell_id: 0
image: null
label: form
label: key_value_region
parent: null
prov: []
references: []
+36 -2
View File
@@ -14,6 +14,7 @@ from pydantic import AnyUrl, ValidationError
from docling_core.types.doc.base import BoundingBox, CoordOrigin, ImageRefMode, Size
from docling_core.types.doc.document import ( # BoundingBox,
CURRENT_VERSION,
CheckboxItem,
CodeItem,
ContentLayer,
DocItem,
@@ -42,6 +43,8 @@ from docling_core.types.doc.document import ( # BoundingBox,
TableItem,
TextItem,
TitleItem,
CheckboxItem,
FormListItem,
)
from docling_core.types.doc.labels import (
DocItemLabel,
@@ -491,6 +494,7 @@ def test_docitems():
elif dc is FormItem:
"""
graph = GraphData(
cells=[
GraphCell(
@@ -524,7 +528,31 @@ def test_docitems():
self_ref="#",
)
verify(dc, obj)
"""
key_name = TextItem(text="name", orig="name", self_ref="#", label=DocItemLabel.FORM_KEY)
val_name = TextItem(text="John Doe", orig="name", self_ref="#", label=DocItemLabel.TEXT)
form_item_name = FormListItem(key=key_name, self_ref="#")
form_item_name.add_value(val_name)
key_age = TextItem(text="Age", orig="Age", self_ref="#", label=DocItemLabel.FORM_KEY)
cb_age_0 = CheckboxItem(checked=True, text="0-20", orig="0-20", self_ref="#")
cb_age_1 = CheckboxItem(checked=False, text="20-40", orig="20-40", self_ref="#")
val_age = TextItem(text="other", orig="other", self_ref="#", label=DocItemLabel.TEXT)
form_item_age = FormListItem(key=key_age, self_ref="#") #, value=[cb_age_0, cb_age_1, val_age])
for _ in [cb_age_0, cb_age_1, val_age]:
form_item_age.add_value(_)
form = FormItem(self_ref="#")
form.add(form_item_name)
form.add(form_item_age)
verify(dc, obj)
elif dc is TitleItem:
obj = dc(
text="whatever",
@@ -571,8 +599,12 @@ def test_docitems():
text="E=mc^2",
)
verify(dc, obj)
elif dc is GraphData: # we skip this on purpose
elif dc is CheckboxItem: # we skip this on purpose
continue
elif dc is FormListItem: # we skip this on purpose
continue
elif dc is GraphData: # we skip this on purpose
continue
else:
raise RuntimeError(f"New derived class detected {dc.__name__}")
@@ -1002,8 +1034,10 @@ def _construct_doc() -> DoclingDocument:
doc.add_key_values(graph=graph)
doc.add_form(graph=graph)
form_1 = doc.add_form(graph=graph)
form_1_item_1 = form_1.add_listitem(key="Name")
inline_fmt = doc.add_inline_group()
doc.add_text(
label=DocItemLabel.TEXT, text="Some formatting chops:", parent=inline_fmt