From 8076e30969e647df451a30e494d89c3ef4ffe99b Mon Sep 17 00:00:00 2001 From: Christoph Auer Date: Tue, 25 Mar 2025 10:58:55 +0100 Subject: [PATCH] fix: Build lines from word level cells Signed-off-by: Christoph Auer --- docling_parse/pdf_parser.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/docling_parse/pdf_parser.py b/docling_parse/pdf_parser.py index 05524f6..f40401d 100644 --- a/docling_parse/pdf_parser.py +++ b/docling_parse/pdf_parser.py @@ -367,10 +367,13 @@ class PdfDocument: if len(segmented_page.textline_cells) > 0: return + self._create_word_cells(segmented_page, _loglevel) + sanitizer = pdf_sanitizer(level=_loglevel) char_data = [] - for item in segmented_page.char_cells: + # Note: We build the lines from the word level cells. + for item in segmented_page.word_cells: item_dict = item.model_dump(mode="json", by_alias=True, exclude_none=True) # TODO changing representation for the C++ parser, need to update on C++ code.