Files
docling/tests/test_backend_pptx.py
pateltejas 72942486ff fix(pptx): skip malformed picture shapes instead of aborting conversion (#3372)
* fix(pptx): skip malformed picture shapes instead of aborting conversion

MsPowerpointDocumentBackend._handle_pictures reads embedded image bytes via python-pptx's shape.image accessor. On PPTX files with slightly malformed <p:pic> shapes, shape.image raises three exceptions that the existing (UnidentifiedImageError, OSError, ValueError) clause does not catch, so one bad picture aborts conversion of the entire presentation:

- InvalidXmlError when <p:blipFill> is missing
- KeyError when <a:blip r:embed> points to an unknown relationship
- AttributeError when the embedded part's content-type isn't an image

These files open normally in Keynote and Google Drive, so the backend should handle them as gracefully as it already handles truncated or unreadable image payloads.

This follows the same pattern as #2914, which extended the same except tuple with ValueError to handle linked (external) image references. The three cases above are the remaining shape.image failure modes that still escape that except clause and propagate out of _handle_pictures.

Extend the except tuple to cover the three cases and log the same warning used for other unreadable images, leaving the rest of the presentation to convert normally. Add a regression fixture with one malformed picture per failure mode plus a focused test.

Fixes #3371

Signed-off-by: pateltejas <tejas226@hotmail.com>

* refactor(pptx): use warnings.warn for malformed picture skips

Address PR review feedback: use Python's warnings module with UserWarning to signal the skip to callers instead of logging.Logger.warning, matching the pattern used in msword_backend for "Skipping external image reference". This makes the skip visible via standard warning filters and catchable in tests.

Update the regression test to assert the warning is emitted via pytest.warns, which also suppresses the message during the test run so it doesn't clutter suite output.

Signed-off-by: pateltejas <tejas226@hotmail.com>

---------

Signed-off-by: pateltejas <tejas226@hotmail.com>
2026-04-29 08:29:08 +02:00

125 lines
4.1 KiB
Python

from pathlib import Path
import pytest
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import ConversionResult, DoclingDocument
from docling.document_converter import DocumentConverter
from .test_data_gen_flag import GEN_TEST_DATA
from .verify_utils import verify_document, verify_export
# Module-level alias of the GEN_TEST_DATA flag imported from test_data_gen_flag.
# It is forwarded as the last argument to verify_export / verify_document.
# NOTE(review): presumably it switches those helpers from comparing against the
# ground-truth files to regenerating them - confirm in verify_utils.
GENERATE = GEN_TEST_DATA
def get_pptx_paths():
    """Return all ``*.pptx`` files under ``./tests/data/pptx/`` as a sorted list.

    The search is recursive and the directory is resolved relative to the
    current working directory.
    """
    return sorted(Path("./tests/data/pptx/").rglob("*.pptx"))
def get_converter():
    """Build a ``DocumentConverter`` whose allowed formats are PPTX only."""
    return DocumentConverter(allowed_formats=[InputFormat.PPTX])
def test_e2e_pptx_conversions():
    """Convert every PPTX fixture and check its exports against ground truth.

    For each path returned by ``get_pptx_paths`` the Markdown export, the
    indented-text export, and the document itself are passed to
    ``verify_export`` / ``verify_document`` together with the matching
    ``.md`` / ``.itxt`` / ``.json`` ground-truth path.
    """
    fixtures = get_pptx_paths()
    doc_converter = get_converter()

    for fixture in fixtures:
        # Ground-truth files live in "groundtruth/docling_v2" under the
        # fixture's grandparent directory and reuse the fixture's file name
        # with an extra suffix appended.
        gt_prefix = str(
            fixture.parent.parent / "groundtruth" / "docling_v2" / fixture.name
        )

        result: ConversionResult = doc_converter.convert(fixture)
        document: DoclingDocument = result.document

        markdown: str = document.export_to_markdown()
        assert verify_export(markdown, f"{gt_prefix}.md", GENERATE), "export to md"

        indented: str = document._export_to_indented_text(
            max_text_len=70, explicit_tables=False
        )
        assert verify_export(indented, f"{gt_prefix}.itxt", GENERATE), (
            "export to indented-text"
        )

        assert verify_document(document, f"{gt_prefix}.json", GENERATE), (
            "document document"
        )
def test_pptx_unrecognized_shape_type():
    """A ``<p:sp>`` without geometry must not abort PPTX conversion.

    For shapes that are not placeholders, autoshapes, textboxes, or
    freeforms, python-pptx raises NotImplementedError from
    ``Shape.shape_type``. The backend is expected to skip such a shape
    gracefully and still extract the text of the rest of the presentation.

    Ref: https://github.com/docling-project/docling/issues/3308
    """
    doc_converter = get_converter()
    source = Path("./tests/data/pptx/powerpoint_unrecognized_shape.pptx")

    result: ConversionResult = doc_converter.convert(source)
    document: DoclingDocument = result.document
    markdown = document.export_to_markdown()

    # Regular slide content must still be present in the export.
    expected_snippets = (
        "Q3 Revenue Summary",
        "Enterprise segment",
        "Key Metrics",
        "Next Steps",
    )
    for snippet in expected_snippets:
        assert snippet in markdown
def test_pptx_malformed_picture_shapes():
    """Malformed ``<p:pic>`` shapes must be skipped, not abort conversion.

    python-pptx's ``shape.image`` accessor raises three distinct exceptions
    on picture shapes that other tools tolerate (Keynote and Google Drive
    open these files fine):

    - InvalidXmlError when ``<p:blipFill>`` is missing,
    - KeyError when ``<a:blip r:embed>`` points at an unknown relationship,
    - AttributeError when the embedded part's content-type isn't an image.

    The backend should emit a warning for each malformed picture and still
    extract the text of the slides.
    """
    doc_converter = get_converter()
    source = Path("./tests/data/pptx/powerpoint_malformed_pictures.pptx")

    # Only the conversion itself is expected to raise the skip warning.
    with pytest.warns(UserWarning, match="Skipping malformed picture shape"):
        result: ConversionResult = doc_converter.convert(source)

    document: DoclingDocument = result.document
    markdown = document.export_to_markdown()

    # One slide title per failure mode in the fixture.
    expected_titles = (
        "Slide With Missing BlipFill",
        "Slide With Dangling Rel",
        "Slide With Wrong Content Type",
    )
    for title in expected_titles:
        assert title in markdown
def test_pptx_page_range():
    """Converting with ``page_range=(2, 2)`` keeps only page 2 in the document."""
    doc_converter = get_converter()
    source = Path("./tests/data/pptx/powerpoint_sample.pptx")
    result = doc_converter.convert(source, page_range=(2, 2))

    # The input still reports three pages; the document holds just page 2.
    assert result.input.page_count == 3
    document = result.document
    assert document.num_pages() == 1
    assert list(document.pages.keys()) == [2]

    markdown = document.export_to_markdown()
    assert "Second slide title" in markdown
    # Text expected to be absent when only page 2 is converted.
    for absent in ("Test Table Slide", "List item4"):
        assert absent not in markdown