mirror of
https://github.com/docling-project/docling.git
synced 2026-05-17 13:10:38 +00:00
132 lines
4.1 KiB
Python
132 lines
4.1 KiB
Python
from pathlib import Path
|
|
|
|
from docx import Document
|
|
from docx.enum.style import WD_STYLE_TYPE
|
|
from docx.oxml import OxmlElement
|
|
from docx.oxml.ns import qn
|
|
|
|
from docling.document_converter import DocumentConverter
|
|
|
|
|
|
def _add_numbering_definition(doc: Document, abstract_id: str, num_id: str) -> None:
|
|
numbering_part = doc.part.numbering_part
|
|
numbering_xml = numbering_part._element
|
|
|
|
abstract_num = OxmlElement("w:abstractNum")
|
|
abstract_num.set(qn("w:abstractNumId"), abstract_id)
|
|
|
|
lvl0 = OxmlElement("w:lvl")
|
|
lvl0.set(qn("w:ilvl"), "0")
|
|
start0 = OxmlElement("w:start")
|
|
start0.set(qn("w:val"), "3")
|
|
lvl0.append(start0)
|
|
num_fmt0 = OxmlElement("w:numFmt")
|
|
num_fmt0.set(qn("w:val"), "decimal")
|
|
lvl0.append(num_fmt0)
|
|
lvl_text0 = OxmlElement("w:lvlText")
|
|
lvl_text0.set(qn("w:val"), "%1")
|
|
lvl0.append(lvl_text0)
|
|
abstract_num.append(lvl0)
|
|
|
|
lvl1 = OxmlElement("w:lvl")
|
|
lvl1.set(qn("w:ilvl"), "1")
|
|
start1 = OxmlElement("w:start")
|
|
start1.set(qn("w:val"), "1")
|
|
lvl1.append(start1)
|
|
num_fmt1 = OxmlElement("w:numFmt")
|
|
num_fmt1.set(qn("w:val"), "decimal")
|
|
lvl1.append(num_fmt1)
|
|
lvl_text1 = OxmlElement("w:lvlText")
|
|
lvl_text1.set(qn("w:val"), "%1.%2")
|
|
lvl1.append(lvl_text1)
|
|
abstract_num.append(lvl1)
|
|
|
|
numbering_xml.insert(0, abstract_num)
|
|
|
|
num = OxmlElement("w:num")
|
|
num.set(qn("w:numId"), num_id)
|
|
abstract_num_id = OxmlElement("w:abstractNumId")
|
|
abstract_num_id.set(qn("w:val"), abstract_id)
|
|
num.append(abstract_num_id)
|
|
numbering_xml.append(num)
|
|
|
|
|
|
def _add_num_pr_to_paragraph(paragraph, num_id: str, ilvl: str) -> None:
|
|
ppr = paragraph._p.get_or_add_pPr()
|
|
num_pr = OxmlElement("w:numPr")
|
|
ilvl_elem = OxmlElement("w:ilvl")
|
|
ilvl_elem.set(qn("w:val"), ilvl)
|
|
num_pr.append(ilvl_elem)
|
|
num_id_elem = OxmlElement("w:numId")
|
|
num_id_elem.set(qn("w:val"), num_id)
|
|
num_pr.append(num_id_elem)
|
|
ppr.append(num_pr)
|
|
|
|
|
|
def _add_num_pr_to_style(style, num_id: str, ilvl: str) -> None:
|
|
style_element = style._element
|
|
ppr = style_element.find("w:pPr", namespaces=style_element.nsmap)
|
|
if ppr is None:
|
|
ppr = OxmlElement("w:pPr")
|
|
style_element.append(ppr)
|
|
num_pr = OxmlElement("w:numPr")
|
|
ilvl_elem = OxmlElement("w:ilvl")
|
|
ilvl_elem.set(qn("w:val"), ilvl)
|
|
num_pr.append(ilvl_elem)
|
|
num_id_elem = OxmlElement("w:numId")
|
|
num_id_elem.set(qn("w:val"), num_id)
|
|
num_pr.append(num_id_elem)
|
|
ppr.append(num_pr)
|
|
|
|
|
|
def _build_docx(path: Path, use_style_inheritance: bool) -> None:
|
|
doc = Document()
|
|
_add_numbering_definition(doc, abstract_id="1", num_id="10")
|
|
|
|
if use_style_inheritance:
|
|
base_style = doc.styles["Heading 2"]
|
|
_add_num_pr_to_style(base_style, num_id="10", ilvl="1")
|
|
derived_style = doc.styles.add_style("Heading 2a", WD_STYLE_TYPE.PARAGRAPH)
|
|
derived_style.base_style = base_style
|
|
style = derived_style
|
|
apply_num_pr = False
|
|
else:
|
|
style = doc.styles["Heading 2"]
|
|
apply_num_pr = True
|
|
|
|
headings = ["Introduction", "Solution Overview", "Requirements"]
|
|
for heading in headings:
|
|
p = doc.add_paragraph()
|
|
p.style = style
|
|
if apply_num_pr:
|
|
_add_num_pr_to_paragraph(p, num_id="10", ilvl="1")
|
|
p.add_run(heading)
|
|
|
|
doc.save(str(path))
|
|
|
|
|
|
def _convert_to_markdown(docx_path: Path) -> str:
|
|
converter = DocumentConverter()
|
|
result = converter.convert(docx_path)
|
|
return result.document.export_to_markdown()
|
|
|
|
|
|
def _assert_numbering(md: str) -> None:
|
|
assert "3.1 Introduction" in md
|
|
assert "3.2 Solution Overview" in md
|
|
assert "3.3 Requirements" in md
|
|
|
|
|
|
def test_docx_numbering_paragraph_numpr(tmp_path: Path) -> None:
|
|
docx_path = tmp_path / "scenario_a.docx"
|
|
_build_docx(docx_path, use_style_inheritance=False)
|
|
md = _convert_to_markdown(docx_path)
|
|
_assert_numbering(md)
|
|
|
|
|
|
def test_docx_numbering_style_inheritance(tmp_path: Path) -> None:
|
|
docx_path = tmp_path / "scenario_b.docx"
|
|
_build_docx(docx_path, use_style_inheritance=True)
|
|
md = _convert_to_markdown(docx_path)
|
|
_assert_numbering(md)
|