mirror of
https://github.com/docling-project/docling.git
synced 2026-05-17 13:10:38 +00:00
72942486ff
* fix(pptx): skip malformed picture shapes instead of aborting conversion MsPowerpointDocumentBackend._handle_pictures reads embedded image bytes via python-pptx's shape.image accessor. On PPTX files with slightly malformed <p:pic> shapes, shape.image raises three exceptions that the existing (UnidentifiedImageError, OSError, ValueError) clause does not catch, so one bad picture aborts conversion of the entire presentation: - InvalidXmlError when <p:blipFill> is missing - KeyError when <a:blip r:embed> points to an unknown relationship - AttributeError when the embedded part's content-type isn't an image These files open normally in Keynote and Google Drive, so the backend should handle them as gracefully as it already handles truncated or unreadable image payloads. This follows the same pattern as #2914, which extended the same except tuple with ValueError to handle linked (external) image references. The three cases above are the remaining shape.image failure modes that still escape. Extend the except tuple to cover the three cases and log the same warning used for other unreadable images, leaving the rest of the presentation to convert normally. Add a regression fixture with one malformed picture per failure mode plus a focused test. Fixes #3371 Signed-off-by: pateltejas <tejas226@hotmail.com> * refactor(pptx): use warnings.warn for malformed picture skips Address PR review feedback: use Python's warnings module with UserWarning to signal the skip to callers instead of logging.Logger.warning, matching the pattern used in msword_backend for "Skipping external image reference". This makes the skip visible via standard warning filters and catchable in tests. Update the regression test to assert the warning is emitted via pytest.warns, which also suppresses the message during the test run so it doesn't clutter suite output. Signed-off-by: pateltejas <tejas226@hotmail.com> --------- Signed-off-by: pateltejas <tejas226@hotmail.com>
125 lines
4.1 KiB
Python
125 lines
4.1 KiB
Python
from pathlib import Path
|
|
|
|
import pytest
|
|
|
|
from docling.datamodel.base_models import InputFormat
|
|
from docling.datamodel.document import ConversionResult, DoclingDocument
|
|
from docling.document_converter import DocumentConverter
|
|
|
|
from .test_data_gen_flag import GEN_TEST_DATA
|
|
from .verify_utils import verify_document, verify_export
|
|
|
|
# Flag forwarded as the last argument to verify_export / verify_document below.
# NOTE(review): presumably toggles regeneration of the ground-truth files
# instead of comparing against them -- confirm in test_data_gen_flag.
GENERATE = GEN_TEST_DATA
|
|
|
|
|
|
def get_pptx_paths() -> list[Path]:
    """Return every ``*.pptx`` file below ``./tests/data/pptx/``, sorted.

    The directory is resolved relative to the current working directory and
    is searched recursively (sub-directories included). Sorting makes the
    order in which callers iterate over the fixtures deterministic.
    """
    directory = Path("./tests/data/pptx/")
    return sorted(directory.rglob("*.pptx"))
|
|
|
|
|
|
def get_converter() -> DocumentConverter:
    """Build a ``DocumentConverter`` whose allowed formats are PPTX only."""
    return DocumentConverter(allowed_formats=[InputFormat.PPTX])
|
|
|
|
|
|
def test_e2e_pptx_conversions():
    """Convert every PPTX fixture and check its exports against ground truth.

    For each file returned by ``get_pptx_paths()`` the converted document is
    verified in three forms: Markdown (``.md``), indented text (``.itxt``)
    and the document itself (``.json``). The ground-truth files live in
    ``<fixture dir>/../groundtruth/docling_v2/`` and are named after the
    fixture file.
    """
    pptx_paths = get_pptx_paths()
    # The fixture directory is relative to the working directory; without
    # this guard the loop below would silently verify nothing when the test
    # is launched from a different directory.
    assert pptx_paths, "no PPTX fixtures found under ./tests/data/pptx/"
    converter = get_converter()

    for pptx_path in pptx_paths:
        gt_path = (
            pptx_path.parent.parent / "groundtruth" / "docling_v2" / pptx_path.name
        )

        conv_result: ConversionResult = converter.convert(pptx_path)

        doc: DoclingDocument = conv_result.document

        pred_md: str = doc.export_to_markdown()
        assert verify_export(pred_md, str(gt_path) + ".md", GENERATE), "export to md"

        pred_itxt: str = doc._export_to_indented_text(
            max_text_len=70, explicit_tables=False
        )
        assert verify_export(pred_itxt, str(gt_path) + ".itxt", GENERATE), (
            "export to indented-text"
        )

        assert verify_document(doc, str(gt_path) + ".json", GENERATE), (
            "document document"
        )
|
|
|
|
|
|
def test_pptx_unrecognized_shape_type():
    """PPTX with a <p:sp> that has no geometry should not crash.

    python-pptx raises NotImplementedError from Shape.shape_type for shapes
    that aren't placeholders, autoshapes, textboxes, or freeforms. The
    backend should skip the unrecognized shape gracefully and still extract
    text from the rest of the presentation.

    Ref: https://github.com/docling-project/docling/issues/3308
    """
    converter = get_converter()
    pptx_path = Path("./tests/data/pptx/powerpoint_unrecognized_shape.pptx")

    conv_result: ConversionResult = converter.convert(pptx_path)
    doc: DoclingDocument = conv_result.document

    pred_md = doc.export_to_markdown()

    # Normal slide content should still be extracted
    assert "Q3 Revenue Summary" in pred_md
    assert "Enterprise segment" in pred_md
    assert "Key Metrics" in pred_md
    assert "Next Steps" in pred_md
|
|
|
|
|
|
def test_pptx_malformed_picture_shapes():
    """PPTX with malformed <p:pic> shapes should not crash conversion.

    python-pptx's shape.image accessor raises three distinct exceptions on
    picture shapes that slip past other tools' parsers (Keynote/Google Drive
    open these files fine): InvalidXmlError when <p:blipFill> is missing,
    KeyError when <a:blip r:embed> points at an unknown relationship, and
    AttributeError when the embedded part's content-type isn't an image.

    The backend should skip each malformed picture with a warning and still
    extract text from the slides.
    """
    converter = get_converter()
    pptx_path = Path("./tests/data/pptx/powerpoint_malformed_pictures.pptx")

    # pytest.warns both asserts that the skip warning is emitted and keeps it
    # from being reported in the test-run warning summary.
    with pytest.warns(UserWarning, match="Skipping malformed picture shape"):
        conv_result: ConversionResult = converter.convert(pptx_path)

    doc: DoclingDocument = conv_result.document

    pred_md = doc.export_to_markdown()
    assert "Slide With Missing BlipFill" in pred_md
    assert "Slide With Dangling Rel" in pred_md
    assert "Slide With Wrong Content Type" in pred_md
|
|
|
|
|
|
def test_pptx_page_range():
    """Converting with ``page_range=(2, 2)`` should yield only slide 2.

    The input must still report its full page count (3), while the resulting
    document holds a single page keyed ``2`` and its Markdown export contains
    the second slide's title but none of the text checked for the other
    slides.
    """
    converter = get_converter()
    pptx_path = Path("./tests/data/pptx/powerpoint_sample.pptx")

    conv_result: ConversionResult = converter.convert(pptx_path, page_range=(2, 2))

    assert conv_result.input.page_count == 3
    assert conv_result.document.num_pages() == 1
    assert list(conv_result.document.pages.keys()) == [2]

    pred_md = conv_result.document.export_to_markdown()
    assert "Second slide title" in pred_md
    assert "Test Table Slide" not in pred_md
    assert "List item4" not in pred_md
|