mirror of
https://github.com/docling-project/docling.git
synced 2026-05-17 13:10:38 +00:00
0c317060cf
* fix(docx): preserve custom numbering text prefix in list markers
* DCO Remediation Commit for Brighton <brighton@Brightons-MacBook-Air.local>
I, Brighton <brighton@Brightons-MacBook-Air.local>, hereby add my Signed-off-by to this commit: b92e32597b
Signed-off-by: Brighton <brighton@Brightons-MacBook-Air.local>
* fix: resolve lint errors in test (unused vars, import order)
Signed-off-by: Brighton <brighton@Brightons-MacBook-Air.local>
* style: fix ruff lint and format
Signed-off-by: Brighton <brighton@Brightons-MacBook-Air.local>
* refactor: move import re to top of file
Signed-off-by: Brighton <brighton@Brightons-MacBook-Air.local>
---------
Signed-off-by: Brighton <brighton@Brightons-MacBook-Air.local>
Co-authored-by: Brighton <brighton@Brightons-MacBook-Air.local>
794 lines
27 KiB
Python
794 lines
27 KiB
Python
import logging
|
|
import os
|
|
from pathlib import Path
|
|
from types import SimpleNamespace
|
|
|
|
import pytest
|
|
from docling_core.types.doc import DocItemLabel, GroupItem
|
|
from lxml import etree
|
|
|
|
import docling.backend.msword_backend as msword_backend_module
|
|
from docling.backend.docx.drawingml.utils import get_libreoffice_cmd
|
|
from docling.backend.msword_backend import MsWordDocumentBackend
|
|
from docling.datamodel.base_models import InputFormat
|
|
from docling.datamodel.document import (
|
|
ConversionResult,
|
|
DoclingDocument,
|
|
InputDocument,
|
|
SectionHeaderItem,
|
|
TextItem,
|
|
)
|
|
from docling.document_converter import DocumentConverter
|
|
|
|
from .test_data_gen_flag import GEN_TEST_DATA
|
|
from .verify_utils import verify_document, verify_export
|
|
|
|
# Module-level logger, named after this module.
_log = logging.getLogger(__name__)
|
|
|
|
# Forwarded as ``generate=`` to the verify_export / verify_document helpers below.
GENERATE = GEN_TEST_DATA
|
|
# True when the ``CI`` environment variable is set to a non-empty value.
IS_CI = bool(os.getenv("CI"))
|
|
|
|
|
|
@pytest.fixture(scope="module")
def docx_paths() -> list[Path]:
    """Return every ``.docx`` file below ``./tests/data/docx/`` in sorted order.

    The directory is searched recursively and is resolved relative to the
    current working directory.
    """
    return sorted(Path("./tests/data/docx/").rglob("*.docx"))
|
|
|
|
|
|
def get_converter():
    """Build a ``DocumentConverter`` whose only allowed input format is DOCX."""
    return DocumentConverter(allowed_formats=[InputFormat.DOCX])
|
|
|
|
|
|
@pytest.fixture(scope="module")
def backend(docx_paths) -> MsWordDocumentBackend:
    """Provide the backend that ``InputDocument`` creates for the first DOCX file.

    The fixture is module-scoped, so all tests requesting it receive the same
    backend object.
    """
    first_docx = docx_paths[0]
    input_document = InputDocument(
        path_or_stream=first_docx,
        format=InputFormat.DOCX,
        backend=MsWordDocumentBackend,
    )
    return input_document._backend
|
|
|
|
|
|
@pytest.fixture(scope="module")
def documents(docx_paths) -> list[tuple[Path, DoclingDocument]]:
    """Convert every DOCX test file once and pair it with its ground-truth path.

    Args:
        docx_paths: DOCX files to convert (the ``docx_paths`` fixture).

    Returns:
        One ``(gt_path, doc)`` tuple per input file. ``gt_path`` is built as
        ``<two directories above the file>/groundtruth/docling_v2/<file name>``
        and ``doc`` is the document produced by the converter.
    """
    # The list holds tuples, matching both the return annotation and the
    # ``append`` call below.
    documents: list[tuple[Path, DoclingDocument]] = []

    converter = get_converter()

    for docx_path in docx_paths:
        _log.debug("converting %s", docx_path)

        gt_path = (
            docx_path.parent.parent / "groundtruth" / "docling_v2" / docx_path.name
        )

        conv_result: ConversionResult = converter.convert(docx_path)
        doc: DoclingDocument = conv_result.document

        # Name the file that was actually converted, not the ground-truth path.
        assert doc, f"Failed to convert document from file {docx_path}"
        documents.append((gt_path, doc))

    return documents
|
|
|
|
|
|
def _test_e2e_docx_conversions_impl(docx_paths: list[tuple[Path, DoclingDocument]]):
    """Check the exports of each converted document with the verify helpers.

    ``docx_paths`` holds ``(path, document)`` pairs. For every pair the markdown
    export, the indented-text export and the document itself are passed to the
    verify helpers together with ``<path>.md``, ``<path>.itxt`` and
    ``<path>.json``. For two specific files the HTML export is checked against
    ``<path>.html`` as well.
    """
    # Probe for LibreOffice; any exception from the lookup counts as "missing".
    try:
        libreoffice_cmd = get_libreoffice_cmd(raise_if_unavailable=True)
    except Exception:
        has_libreoffice = False
    else:
        has_libreoffice = libreoffice_cmd is not None

    html_checked_files = {"word_tables.docx", "docx_rich_cells.docx"}

    for docx_path, doc in docx_paths:
        # Outside CI, drawingml.docx is skipped when LibreOffice is unavailable.
        is_drawingml = docx_path.name == "drawingml.docx"
        if is_drawingml and not (IS_CI or has_libreoffice):
            print(f"Skipping {docx_path} because no Libreoffice is installed.")
            continue

        markdown_ok = verify_export(
            doc.export_to_markdown(), str(docx_path) + ".md", generate=GENERATE
        )
        assert markdown_ok, f"export to markdown failed on {docx_path}"

        indented_text: str = doc._export_to_indented_text(
            max_text_len=70, explicit_tables=False
        )
        indented_ok = verify_export(
            indented_text, str(docx_path) + ".itxt", generate=GENERATE
        )
        assert indented_ok, f"export to indented-text failed on {docx_path}"

        document_ok = verify_document(doc, str(docx_path) + ".json", generate=GENERATE)
        assert document_ok, f"DoclingDocument verification failed on {docx_path}"

        if docx_path.name not in html_checked_files:
            continue

        html_ok = verify_export(
            pred_text=doc.export_to_html(),
            gtfile=str(docx_path) + ".html",
            generate=GENERATE,
        )
        assert html_ok, f"export to html failed on {docx_path}"
|
|
|
|
|
|
# File excluded from test_e2e_docx_conversions and covered by the xfail-marked
# test_textbox_conversion instead.
flaky_file = "textbox.docx"
|
|
|
|
|
|
def test_e2e_docx_conversions(documents):
    """Run the end-to-end export checks on every document except the flaky one."""
    stable_entries = [entry for entry in documents if entry[0].name != flaky_file]
    _test_e2e_docx_conversions_impl(stable_entries)
|
|
|
|
|
|
@pytest.mark.xfail(strict=False)
def test_textbox_conversion(documents):
    """Run the end-to-end export checks on the flaky textbox document only.

    Marked as a non-strict expected failure.
    """
    flaky_entries = [entry for entry in documents if entry[0].name == flaky_file]
    _test_e2e_docx_conversions_impl(flaky_entries)
|
|
|
|
|
|
@pytest.mark.xfail(strict=False)
def test_textbox_extraction(documents):
    """Check that the text of a known textbox in ``textbox.docx`` is extracted.

    Marked as a non-strict expected failure.
    """
    name = "textbox.docx"
    doc = next(item[1] for item in documents if item[0].name == name)

    # Verify if a particular textbox content is extracted.
    # Items yielded by iterate_items() are not guaranteed to carry a ``text``
    # attribute; treat missing or empty text as a non-match instead of raising,
    # so an error on an unrelated item cannot hide the real result.
    textbox_found = any(
        (getattr(item, "text", None) or "")[:30] == """Suggested Reportable Symptoms:"""
        for item, _ in doc.iterate_items()
    )
    assert textbox_found
|
|
|
|
|
|
def test_heading_levels(documents):
    """Verify the levels of two known section headers in ``word_sample.docx``."""
    target_name = "word_sample.docx"
    doc = next(entry[1] for entry in documents if entry[0].name == target_name)

    # Header text -> level it must be reported with.
    expected_levels = {"Let\u2019s swim!": 1, "Let\u2019s eat": 2}
    seen_headers = set()

    for item, _ in doc.iterate_items():
        if not isinstance(item, SectionHeaderItem):
            continue
        if item.text in expected_levels:
            seen_headers.add(item.text)
            assert item.level == expected_levels[item.text]

    # Both headers must have been encountered.
    assert seen_headers == set(expected_levels)
|
|
|
|
|
|
def test_text_after_image_anchors(documents):
    """Check that the text following each image anchor ends up as a text item."""
    target_name = "word_image_anchors.docx"
    doc = next(entry[1] for entry in documents if entry[0].name == target_name)

    expected_texts = {
        "This is test 1",
        "0:08\nCorrect, he is not.",
        "This is test 2",
        "0:16\nYeah, exactly.",
    }
    parsed_texts = {
        item.text for item, _ in doc.iterate_items() if isinstance(item, TextItem)
    }

    # Every expected text must appear among the parsed text items.
    assert expected_texts <= parsed_texts
|
|
|
|
|
|
def test_is_rich_table_cell(docx_paths):
    """Compare ``_is_rich_table_cell`` with hand-labelled cells of ``docx_rich_cells.docx``."""
    name = "docx_rich_cells.docx"
    path = next(candidate for candidate in docx_paths if candidate.name == name)

    in_doc = InputDocument(
        path_or_stream=path,
        format=InputFormat.DOCX,
        backend=MsWordDocumentBackend,
        filename=name,
    )
    backend = MsWordDocumentBackend(in_doc=in_doc, path_or_stream=path)

    # Expected flags, grouped by the table each comment names, in visiting order.
    expected_groups: list[list[bool]] = [
        # table: Table with rich cells
        [False, False, True, True, True, True, True, False],
        # table: Table with nested table
        [False, False, False, True, True, True],
        # table: Table with pictures
        [False, False, False, True, True, False],
        # table: Lists with same numId in different cells
        [True, True],
        # table: Lists with different numIds in different cells
        [True, True],
        # table: Multiple columns with lists
        [True, True, True, True],
        # table: Mixed content - list and regular text in different cells
        [True, False],
    ]
    expected_flags = iter([flag for group in expected_groups for flag in group])

    for idx_t, table in enumerate(backend.docx_obj.tables):
        for idx_r, row in enumerate(table.rows):
            for idx_c, cell in enumerate(row.cells):
                expected = next(expected_flags)
                actual = backend._is_rich_table_cell(cell)
                assert expected == actual, (
                    f"Wrong cell type in table {idx_t}, row {idx_r}, col {idx_c} "
                    f"with text: {cell.text}"
                )
|
|
|
|
|
|
def test_add_header_footer(documents):
    """Test the function _add_header_footer."""
    target_name = "unit_test_formatting.docx"
    doc = next(entry[1] for entry in documents if entry[0].name == target_name)

    group_items = [group for group in doc.groups if isinstance(group, GroupItem)]
    headers: list[GroupItem] = [g for g in group_items if g.name == "page header"]
    footers: list[GroupItem] = [g for g in group_items if g.name == "page footer"]

    assert len(headers) == 2, "Expected 2 different headers"
    assert len(footers) == 2, "Expected 2 different footers"

    first_header, second_header = headers[0], headers[1]
    assert len(first_header.children) == 1, "First page header should have 1 paragraph"
    assert len(second_header.children) == 2, (
        "Second page header should have 2 paragraphs"
    )

    first_footer, second_footer = footers[0], footers[1]
    assert len(first_footer.children) == 1, "First page footer should have 1 paragraph"
    assert len(second_footer.children) == 4, (
        "Second page footer should have 3 paragraphs and 1 picture"
    )
|
|
|
|
|
|
def test_handle_pictures(documents):
    """Test the function _handle_pictures."""
    target_name = "docx_grouped_images.docx"
    doc = next(entry[1] for entry in documents if entry[0].name == target_name)

    pictures = doc.pictures
    assert len(pictures) == 6

    # Consecutive picture pairs must share a parent of the listed type.
    expected_parents = ((0, GroupItem), (2, GroupItem), (4, SectionHeaderItem))
    for first_idx, parent_type in expected_parents:
        assert isinstance(pictures[first_idx].parent.resolve(doc), parent_type)
        assert pictures[first_idx].parent == pictures[first_idx + 1].parent
|
|
|
|
|
|
def test_comments_extraction(documents):
    """Test the function _add_comments for extracting Word document comments."""
    target_name = "word_comments.docx"
    doc = next(entry[1] for entry in documents if entry[0].name == target_name)

    # Comment groups are the GroupItems whose name starts with "comment-".
    comment_groups: list[GroupItem] = [
        group
        for group in doc.groups
        if isinstance(group, GroupItem) and group.name.startswith("comment-")
    ]
    assert len(comment_groups) == 3, "Expected 3 comments in the document"

    # Text of every item that sits in the "notes" content layer.
    comment_texts = [
        text_item.text
        for text_item in doc.texts
        if getattr(text_item, "content_layer", None) == "notes"
    ]

    def _mentioned(fragment: str) -> bool:
        return any(fragment in text for text in comment_texts)

    # Author information in the new format
    assert _mentioned("author: John Reviewer (JR)"), (
        "Expected 'author: John Reviewer (JR)' in comments"
    )
    assert _mentioned("author: Jane Editor (JE)"), (
        "Expected 'author: Jane Editor (JE)' in comments"
    )

    # Comment bodies
    assert _mentioned("sample reviewer comment"), "Expected comment text content"
    assert _mentioned("Another comment by a different reviewer"), (
        "Expected second comment text content"
    )

    # Every comment group must be in the notes layer as well.
    for group in comment_groups:
        assert group.content_layer == "notes", (
            "Comments should be in NOTES content layer"
        )
|
|
|
|
|
|
@pytest.mark.parametrize(
    "style_label,expected_label,expected_level",
    [
        ("Heading 1", "Heading", 1),
        ("Heading 2", "Heading", 2),
        ("Heading 9", "Heading", 9),
        ("Heading 0", "Heading", 1),  # Custom style - level 0 should be clamped to 1
        ("1 Heading", "Heading", 1),  # Number before text
        ("0 Heading", "Heading", 1),  # Zero before text should be clamped to 1
    ],
)
def test_get_heading_and_level(docx_paths, style_label, expected_label, expected_level):
    """Check ``_get_heading_and_level`` on heading style names, including level 0."""
    # Any existing docx file works; only a backend instance is needed.
    backend = InputDocument(
        path_or_stream=docx_paths[0],
        format=InputFormat.DOCX,
        backend=MsWordDocumentBackend,
    )._backend

    heading_info = backend._get_heading_and_level(style_label)
    label, level = heading_info

    assert label == expected_label, (
        f"Expected label '{expected_label}' for '{style_label}', got '{label}'"
    )
    assert level == expected_level, (
        f"Expected level {expected_level} for '{style_label}', got {level}"
    )
|
|
|
|
|
|
def test_get_outline_level_from_style():
    """Check the level ``_get_outline_level_from_style`` reports for known paragraphs.

    Relies on the layout of word_sample.docx:
    - paragraph 5, "Let's swim!", styled "Heading 1" -> expected level 1
    - paragraph 15, "Let's eat", styled "Heading 2" -> expected level 2
    - paragraph 0, a non-heading paragraph -> expected None

    The expected levels follow from OOXML outlineLvl being 0-indexed, i.e. the
    method is expected to return outlineLvl + 1.
    """
    from docx import Document

    docx_path = Path("./tests/data/docx/word_sample.docx")
    backend = InputDocument(
        path_or_stream=docx_path,
        format=InputFormat.DOCX,
        backend=MsWordDocumentBackend,
    )._backend
    paragraphs = Document(docx_path).paragraphs

    # (paragraph index, expected text, expected style name, expected level)
    heading_cases = (
        (5, "Let\u2019s swim!", "Heading 1", 1),
        (15, "Let\u2019s eat", "Heading 2", 2),
    )
    for index, text, style_name, level in heading_cases:
        paragraph = paragraphs[index]
        assert paragraph.text == text, "Test document structure changed"
        assert paragraph.style.name == style_name
        assert backend._get_outline_level_from_style(paragraph) == level

    # The first paragraph is not a heading, so no level is expected.
    plain_paragraph = paragraphs[0]
    assert "heading" not in plain_paragraph.style.name.lower()
    assert backend._get_outline_level_from_style(plain_paragraph) is None
|
|
|
|
|
|
@pytest.mark.parametrize(
    "style_label,expected_label,expected_level",
    [
        ("Normal", "Normal", None),  # Non-heading style
        ("Title", "Title", None),  # Non-heading style
        ("CustomStyle", "CustomStyle", None),  # Non-heading style
    ],
)
def test_get_heading_and_level_non_heading(
    docx_paths, style_label, expected_label, expected_level
):
    """Check that non-heading style names come back unchanged with no level."""
    backend = InputDocument(
        path_or_stream=docx_paths[0],
        format=InputFormat.DOCX,
        backend=MsWordDocumentBackend,
    )._backend

    heading_info = backend._get_heading_and_level(style_label)
    label, level = heading_info

    assert label == expected_label
    assert level == expected_level
|
|
|
|
|
|
def test_external_image_references():
    """Check that a .docx with an external image reference converts with a warning.

    Documents saved from web browsers often reference images externally
    (TargetMode="External", pointing at URLs or file:// paths) instead of
    embedding them in word/media/. This used to trigger a ValueError in
    python-docx:
    "target_part property on _Relationship is undefined when target mode is External"

    See: https://github.com/docling-project/docling/issues/3113
    """
    docx_path = Path("./tests/data/docx/docx_external_image.docx")
    assert docx_path.exists(), f"Test file not found: {docx_path}"

    converter = get_converter()
    with pytest.warns(UserWarning, match="Skipping external image reference"):
        doc = converter.convert(docx_path).document

    # Conversion must yield a document rather than crash.
    assert doc is not None

    # The surrounding text must survive even though the image is skipped.
    md = doc.export_to_markdown()
    for expected_fragment in (
        "Test Document with External Image",
        "text before the image",
        "after the external image",
    ):
        assert expected_fragment in md
|
|
|
|
|
|
def test_inline_sdt_references(tmp_path):
    """Check that inline SDT citation blocks survive DOCX paragraph conversion."""
    from docx import Document
    from docx.oxml import OxmlElement
    from docx.oxml.ns import qn

    def _append_citation(paragraph, text: str):
        # <w:sdtPr><w:tag w:val="..."/></w:sdtPr>
        tag = OxmlElement("w:tag")
        tag.set(qn("w:val"), "MENDELEY_CITATION_v3_test")
        properties = OxmlElement("w:sdtPr")
        properties.append(tag)

        # <w:sdtContent><w:r><w:t>text</w:t></w:r></w:sdtContent>
        text_node = OxmlElement("w:t")
        text_node.text = text
        run = OxmlElement("w:r")
        run.append(text_node)
        content = OxmlElement("w:sdtContent")
        content.append(run)

        sdt = OxmlElement("w:sdt")
        sdt.append(properties)
        sdt.append(content)
        paragraph._p.append(sdt)

    doc = Document()

    # Paragraph with a citation embedded between two ordinary runs.
    mixed_paragraph = doc.add_paragraph()
    mixed_paragraph.add_run("Impact ")
    _append_citation(mixed_paragraph, "(Hagman G 1984)")
    mixed_paragraph.add_run(". After.")

    # Paragraph consisting of nothing but a citation.
    citation_only_paragraph = doc.add_paragraph()
    _append_citation(citation_only_paragraph, "(Standalone citation)")

    docx_path = tmp_path / "inline_sdt_reference.docx"
    doc.save(docx_path)

    markdown = get_converter().convert(docx_path).document.export_to_markdown()

    assert "Impact (Hagman G 1984). After." in markdown
    assert "(Standalone citation)" in markdown
|
|
|
|
|
|
def test_list_counter_and_enum_marker(docx_paths):
    """Exercise list counters: increment, sub-level reset, markers, sequence reset."""
    backend = InputDocument(
        path_or_stream=docx_paths[0],
        format=InputFormat.DOCX,
        backend=MsWordDocumentBackend,
    )._backend

    # Basic increment: (ilvl, value expected from the next call) for numid 1.
    for ilvl, expected_count in ((0, 1), (0, 2), (1, 1), (1, 2), (1, 3)):
        assert backend._get_list_counter(1, ilvl) == expected_count

    # Advancing the parent level resets the sub-levels.
    backend._get_list_counter(1, 2)  # (1,2) = 1
    backend._get_list_counter(1, 0)  # (1,0) = 3, resets lvl 1 and 2
    for ilvl in (1, 2):
        assert backend.list_counters[(1, ilvl)] == 0
    assert backend._get_list_counter(1, 1) == 1  # restarts from 1

    # Hierarchical enum markers built from a fixed counter state.
    for key, value in {(1, 0): 2, (1, 1): 3, (1, 2): 1}.items():
        backend.list_counters[key] = value
    marker_cases = (
        (1, 0, "2."),
        (1, 1, "2.3."),
        (1, 2, "2.3.1."),
        (99, 0, "1."),  # missing counter defaults to 1
    )
    for numid, ilvl, expected_marker in marker_cases:
        assert backend._build_enum_marker(numid, ilvl) == expected_marker

    # Resetting the sequence of numid 1 must leave numid 2 alone.
    backend._get_list_counter(2, 0)  # (2,0) = 1
    backend._reset_list_counters_for_new_sequence(1)
    assert backend.list_counters[(1, 0)] == 0
    assert backend.list_counters[(1, 1)] == 0
    assert backend.list_counters[(2, 0)] == 1  # unaffected
|
|
|
|
|
|
def test_custom_numbering_format_markers(tmp_path):
    """Check that an lvlText template such as 'Proposal %1:' shapes the marker.

    Word numbering definitions may carry custom text in lvlText, for example
    'Proposal %1:' or 'Observation %1:'. The marker is expected to keep that
    surrounding text and replace %N with the counter value of level N.
    """
    from docx import Document
    from docx.oxml import OxmlElement
    from docx.oxml.ns import qn

    def _element(tag: str, attributes: dict[str, str]):
        # Create an OOXML element and set its namespaced attributes.
        node = OxmlElement(tag)
        for attr_name, attr_value in attributes.items():
            node.set(qn(attr_name), attr_value)
        return node

    doc = Document()
    numbering = doc.part.numbering_part.element

    # abstractNum 100 whose level 0 uses lvlText="Proposal %1:"
    lvl = _element("w:lvl", {"w:ilvl": "0"})
    for child in (
        _element("w:start", {"w:val": "1"}),
        _element("w:numFmt", {"w:val": "decimal"}),
        _element("w:lvlText", {"w:val": "Proposal %1:"}),
    ):
        lvl.append(child)
    abstract_num = _element("w:abstractNum", {"w:abstractNumId": "100"})
    abstract_num.append(lvl)
    numbering.append(abstract_num)

    # num 200 pointing at abstractNum 100
    num_elem = _element("w:num", {"w:numId": "200"})
    num_elem.append(_element("w:abstractNumId", {"w:val": "100"}))
    numbering.append(num_elem)

    # Round-trip through a file so the backend loads the definitions.
    docx_path = tmp_path / "custom_numbering.docx"
    doc.save(str(docx_path))

    backend = InputDocument(
        path_or_stream=docx_path,
        format=InputFormat.DOCX,
        backend=MsWordDocumentBackend,
    )._backend

    # Custom template: the counter value is substituted into the text.
    for counter_value, expected_marker in ((1, "Proposal 1:"), (3, "Proposal 3:")):
        backend.list_counters[(200, 0)] = counter_value
        assert backend._build_enum_marker(200, 0) == expected_marker

    # Plain numeric markers still work (no text prefix).
    backend.list_counters[(1, 0)] = 5
    assert backend._build_enum_marker(1, 0) == "5."

    # Hierarchical markers still work.
    backend.list_counters[(1, 0)] = 2
    backend.list_counters[(1, 1)] = 3
    assert backend._build_enum_marker(1, 1) == "2.3."
|
|
|
|
|
|
def test_handle_equations_in_text_returns_original_text_on_mismatch(
    backend, monkeypatch
):
    """The passed text comes back untouched when it differs from the run text."""
    # <p><r><t>alpha</t></r><oMath/></p>
    paragraph = etree.Element("p")
    run_text = etree.SubElement(etree.SubElement(paragraph, "r"), "t")
    run_text.text = "alpha"
    etree.SubElement(paragraph, "oMath")

    monkeypatch.setattr(msword_backend_module, "oMath2Latex", lambda _: "x")

    # "beta" does not match the element's own text "alpha".
    result = backend._handle_equations_in_text(element=paragraph, text="beta")
    text, equations = result

    assert text == "beta"
    assert equations == []
|
|
|
|
|
|
def test_handle_equations_in_text_skips_empty_substrings(backend, monkeypatch):
    """An empty run before the equation must not disturb the rebuilt text."""

    def _add_run(parent, content: str) -> None:
        # Append <r><t>content</t></r> to ``parent``.
        text_node = etree.SubElement(etree.SubElement(parent, "r"), "t")
        text_node.text = content

    expected_equation = backend.equation_bookends.format(EQ="x")

    # <p><r><t></t></r><oMath/><r><t>tail</t></r></p>
    paragraph = etree.Element("p")
    _add_run(paragraph, "")
    etree.SubElement(paragraph, "oMath")
    _add_run(paragraph, "tail")

    monkeypatch.setattr(msword_backend_module, "oMath2Latex", lambda _: "x")

    result = backend._handle_equations_in_text(element=paragraph, text="tail")
    text, equations = result

    assert equations == [expected_equation]
    assert text == f"{expected_equation}tail"
|
|
|
|
|
|
def test_handle_text_elements_returns_empty_refs_when_text_is_none(
    backend, monkeypatch
):
    """No refs are returned when equation handling yields ``None`` as text."""

    def _no_text(element, text):
        # Stand-in for _handle_equations_in_text reporting no text at all.
        return None, []

    monkeypatch.setattr(backend, "_handle_equations_in_text", _no_text)

    first_paragraph_element = backend.docx_obj.paragraphs[0]._element
    refs = backend._handle_text_elements(
        first_paragraph_element, DoclingDocument(name="test")
    )

    assert refs == []
|
|
|
|
|
|
def test_handle_text_elements_heading_defaults_to_non_numbered_when_style_missing(
    backend, monkeypatch
):
    """A heading whose style object has no attributes is added as non-numbered."""
    recorded: dict[str, tuple[int, str, bool]] = {}

    class FakeParagraph:
        # Replacement for the backend module's Paragraph; the style is empty.
        def __init__(self, element, docx_obj):
            self.text = "Heading text"
            self.style = SimpleNamespace()

    def fake_add_heading(doc, level, text, is_numbered_style):
        # Remember the arguments of the most recent call.
        recorded["heading"] = (level, text, is_numbered_style)
        return []

    monkeypatch.setattr(msword_backend_module, "Paragraph", FakeParagraph)

    # Backend collaborators are stubbed so that only the heading path is taken.
    backend_stubs = {
        "_get_paragraph_elements": lambda paragraph: [],
        "_handle_equations_in_text": lambda element, text: (text, []),
        "_get_comment_ids_for_element": lambda element: [],
        "_get_label_and_level": lambda paragraph: ("Heading", 1),
        "_get_numId_and_ilvl": lambda paragraph: (None, None),
        "_add_heading": fake_add_heading,
    }
    for attribute, stub in backend_stubs.items():
        monkeypatch.setattr(backend, attribute, stub)

    refs = backend._handle_text_elements(object(), DoclingDocument(name="test"))

    assert refs == []
    assert recorded["heading"] == (1, "Heading text", False)
|
|
|
|
|
|
def test_handle_text_elements_inline_equations_stop_when_text_is_consumed(
    backend, monkeypatch
):
    """Two refs result when two equations are reported but the text holds only one."""
    first_equation = backend.equation_bookends.format(EQ="a")
    second_equation = backend.equation_bookends.format(EQ="b")

    class FakeParagraph:
        # Replacement for the backend module's Paragraph; the style is empty.
        def __init__(self, element, docx_obj):
            self.text = "inline eq"
            self.style = SimpleNamespace()

    def fake_handle_equations(element, text):
        # The returned text contains the first equation only.
        return first_equation, [first_equation, second_equation]

    monkeypatch.setattr(msword_backend_module, "Paragraph", FakeParagraph)

    backend_stubs = {
        "_get_paragraph_elements": lambda paragraph: [],
        "_handle_equations_in_text": fake_handle_equations,
        "_get_comment_ids_for_element": lambda element: [],
        "_get_label_and_level": lambda paragraph: ("Normal", None),
        "_get_numId_and_ilvl": lambda paragraph: (None, None),
        "_prev_numid": lambda: None,
        "_get_level": lambda: 1,
    }
    for attribute, stub in backend_stubs.items():
        monkeypatch.setattr(backend, attribute, stub)
    backend.parents[0] = None

    refs = backend._handle_text_elements(object(), DoclingDocument(name="test"))

    assert len(refs) == 2
|
|
|
|
|
|
def test_checkbox_detection_and_parsing(documents):
    """Check that checkboxes in a DOCX file are detected with the right labels."""
    name = "docx_checkboxes.docx"
    doc = next((entry[1] for entry in documents if entry[0].name == name), None)
    if doc is None:
        pytest.skip(f"Test file not found: {name}")

    checkbox_labels = (
        DocItemLabel.CHECKBOX_SELECTED,
        DocItemLabel.CHECKBOX_UNSELECTED,
    )
    checkbox_items = [entry for entry in doc.texts if entry.label in checkbox_labels]
    assert checkbox_items, "No checkboxes found in the document"

    # Both checkbox states must be present.
    labels_seen = [entry.label for entry in checkbox_items]
    assert DocItemLabel.CHECKBOX_SELECTED in labels_seen, (
        "No selected checkboxes found"
    )
    assert DocItemLabel.CHECKBOX_UNSELECTED in labels_seen, (
        "No unselected checkboxes found"
    )

    # Each of these words must occur in at least one checkbox text.
    checkbox_texts = [entry.text for entry in checkbox_items]
    for keyword in ("Design", "Implementation", "Documentation"):
        assert any(keyword in text for text in checkbox_texts), (
            "Expected checkbox text not found"
        )
|
|
|
|
|
|
def test_checkbox_labels_in_tables(documents):
    """Check that at least one checkbox mentioning a known food item is parsed."""
    name = "docx_checkboxes.docx"
    doc = next((entry[1] for entry in documents if entry[0].name == name), None)
    if doc is None:
        pytest.skip(f"Test file not found: {name}")

    checkbox_labels = (
        DocItemLabel.CHECKBOX_SELECTED,
        DocItemLabel.CHECKBOX_UNSELECTED,
    )
    food_items = [
        "Orange juice",
        "Tea",
        "Coffee",
        "Milk",
        "Water",
        "Scramble eggs",
        "Porridge",
        "Bread",
        "Croissant",
    ]

    def _mentions_food(text) -> bool:
        return any(food in text for food in food_items)

    found_food_checkboxes = [
        entry
        for entry in doc.texts
        if entry.label in checkbox_labels and _mentions_food(entry.text)
    ]

    assert found_food_checkboxes, "No checkboxes found in table cells"
|