diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py index 447278e6..29d64c5a 100644 --- a/docling_core/types/doc/document.py +++ b/docling_core/types/doc/document.py @@ -1188,6 +1188,7 @@ class TextItem(DocItem): DocItemLabel.REFERENCE, DocItemLabel.TEXT, DocItemLabel.EMPTY_VALUE, + DocItemLabel.FORM_KEY, ] orig: str # untreated representation @@ -1920,59 +1921,55 @@ class KeyValueItem(FloatingItem): text = serializer.serialize(item=self).text return text +class CheckboxItem(ListItem): + """FormTextItem.""" + + label: typing.Literal[DocItemLabel.CHECKBOX] = DocItemLabel.CHECKBOX + checked: bool = False + +""" class FormHeaderItem(SectionHeaderItem): - """FormHeaderItem.""" label: typing.Literal[DocItemLabel.FORM_HEADER] = DocItemLabel.FORM_HEADER class FormTextItem(TextItem): - """FormTextItem.""" label: typing.Literal[DocItemLabel.FORM_TEXT] = DocItemLabel.FORM_TEXT - -class FormListItem(TextItem): +""" + +class FormListItem(DocItem): """FormListItem.""" - label: typing.Literal[DocItemLabel.FORM_ITEM] = DocItemLabel.FORM_ITEM + label: typing.Literal[DocItemLabel.FORM_LISTITEM] = DocItemLabel.FORM_LISTITEM - marker: Optional[TextItem] + marker: Optional[TextItem] = None key: TextItem - value: TextItem + def add_value(self, item: Union[CheckboxItem, ListItem, TextItem]) -> NodeItem: + item.parent = self.get_ref() + self.children.append(item) + + return item + + + class FormItem(FloatingItem): """FormItem.""" label: typing.Literal[DocItemLabel.FORM] = DocItemLabel.FORM - def add(self, item: Union["FormItem", FormHeaderItem, FormTextItem, FormListItem]): - return - - def add_form(self, item: "FormItem") -> NodeItem: - item.parent = self.cref - self.children.append(item) - + def add(self, item: Union["FormItem", SectionHeaderItem, TextItem, FormListItem]) -> NodeItem: + item.parent = self.get_ref() + self.children.append(item.get_ref()) + return item - def add_form_item(self, item: FormItem): - item.parent = self.cref - self.children.append(item) - + def add_listitem(self, doc: DoclingDocument, prov: Optional[ProvenanceItem] = None) -> NodeItem: + li = FormListItem(self_ref=self.get_ref()) return item - - def add_form_text(self, item: FormTextItem): - item.parent = self.cref - self.children.append(item) - - return item - - def add_form_header(self, item: FormHeaderItem): - item.parent = self.cref - self.children.append(item) - - return item - + ContentItem = Annotated[ Union[ @@ -1985,6 +1982,7 @@ ContentItem = Annotated[ PictureItem, TableItem, KeyValueItem, + FormItem, ], Field(discriminator="label"), ] diff --git a/docling_core/types/doc/labels.py b/docling_core/types/doc/labels.py index 17f6d1cf..69454895 100644 --- a/docling_core/types/doc/labels.py +++ b/docling_core/types/doc/labels.py @@ -31,9 +31,10 @@ class DocItemLabel(str, Enum): HANDWRITTEN_TEXT = "handwritten_text" EMPTY_VALUE = "empty_value" # used for empty value fields in fillable forms - FORM_HEADER = "form_header" - FORM_ITEM = "form_item" - FORM_TEXT = "form_text" + # FORM_HEADER = "form_header" + FORM_KEY = "form_key" + FORM_LISTITEM = "form_listitem" + CHECKBOX = "checkbox" # Additional labels for markup-based formats (e.g. HTML, Word) PARAGRAPH = "paragraph" diff --git a/test/data/docling_document/unit/FormItem.yaml b/test/data/docling_document/unit/FormItem.yaml index af7a61e1..219e951e 100644 --- a/test/data/docling_document/unit/FormItem.yaml +++ b/test/data/docling_document/unit/FormItem.yaml @@ -24,7 +24,7 @@ graph: source_cell_id: 1 target_cell_id: 0 image: null -label: form +label: key_value_region parent: null prov: [] references: [] diff --git a/test/test_docling_doc.py b/test/test_docling_doc.py index 50c263a9..de71fb1c 100644 --- a/test/test_docling_doc.py +++ b/test/test_docling_doc.py @@ -14,6 +14,7 @@ from pydantic import AnyUrl, ValidationError from docling_core.types.doc.base import BoundingBox, CoordOrigin, ImageRefMode, Size from docling_core.types.doc.document import ( # BoundingBox, CURRENT_VERSION, + CheckboxItem, CodeItem, ContentLayer, DocItem, @@ -42,6 +43,8 @@ from docling_core.types.doc.document import ( # BoundingBox, TableItem, TextItem, TitleItem, + CheckboxItem, + FormListItem, ) from docling_core.types.doc.labels import ( DocItemLabel, @@ -491,6 +494,7 @@ def test_docitems(): elif dc is FormItem: + """ graph = GraphData( cells=[ GraphCell( @@ -524,7 +528,31 @@ def test_docitems(): self_ref="#", ) verify(dc, obj) + """ + key_name = TextItem(text="name", orig="name", self_ref="#", label=DocItemLabel.FORM_KEY) + val_name = TextItem(text="John Doe", orig="name", self_ref="#", label=DocItemLabel.TEXT) + + form_item_name = FormListItem(key=key_name, self_ref="#") + form_item_name.add_value(val_name) + + key_age = TextItem(text="Age", orig="Age", self_ref="#", label=DocItemLabel.FORM_KEY) + + cb_age_0 = CheckboxItem(checked=True, text="0-20", orig="0-20", self_ref="#") + cb_age_1 = CheckboxItem(checked=False, text="20-40", orig="20-40", self_ref="#") + val_age = TextItem(text="other", orig="other", self_ref="#", label=DocItemLabel.TEXT) + + form_item_age = FormListItem(key=key_age, self_ref="#") #, value=[cb_age_0, cb_age_1, val_age]) + for _ in [cb_age_0, cb_age_1, val_age]: + form_item_age.add_value(_) + + form = FormItem(self_ref="#") + + form.add(form_item_name) + form.add(form_item_age) + + verify(dc, obj) + elif dc is TitleItem: obj = dc( text="whatever", @@ -571,8 +599,12 @@ def test_docitems(): text="E=mc^2", ) verify(dc, obj) - elif dc is GraphData: # we skip this on purpose + elif dc is CheckboxItem: # we skip this on purpose + continue + elif dc is FormListItem: # we skip this on purpose continue + elif dc is GraphData: # we skip this on purpose + continue else: raise RuntimeError(f"New derived class detected {dc.__name__}") @@ -1002,8 +1034,10 @@ def _construct_doc() -> DoclingDocument: doc.add_key_values(graph=graph) - doc.add_form(graph=graph) + form_1 = doc.add_form(graph=graph) + form_1_item_1 = form_1.add_listitem(key="Name") + inline_fmt = doc.add_inline_group() doc.add_text( label=DocItemLabel.TEXT, text="Some formatting chops:", parent=inline_fmt