mirror of
https://github.com/docling-project/docling.git
synced 2026-05-17 13:10:38 +00:00
fix(pptx): handle picture shapes with external image references
When processing PowerPoint files containing picture shapes that reference
external images (rather than embedded images), the python-pptx library
raises a ValueError("no embedded image") when accessing the `image`
property.
Previously, this caused the entire document conversion to fail because:
1. The `hasattr(shape, "image")` check at line 690 invokes the `image`
property getter, which raises ValueError; `hasattr` suppresses only
AttributeError, so the ValueError propagated to the caller
2. The exception handler in `_handle_pictures()` only caught
UnidentifiedImageError and OSError, not ValueError
This fix:
- Removes the unnecessary hasattr check since we already verify the
shape type is MSO_SHAPE_TYPE.PICTURE
- Adds ValueError to the exception handler in `_handle_pictures()` so
that picture shapes with external references are gracefully skipped
with a warning instead of crashing the pipeline
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
committed by
Cesar Berrospi Ramis
parent
0602a7cdab
commit
e69779e07b
@@ -590,8 +590,8 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
||||
caption=None,
|
||||
prov=prov,
|
||||
)
|
||||
- except (UnidentifiedImageError, OSError) as e:
|
||||
- _log.warning(f"Warning: image cannot be loaded by Pillow: {e}")
|
||||
+ except (UnidentifiedImageError, OSError, ValueError) as e:
|
||||
+ _log.warning(f"Warning: image cannot be loaded: {e}")
|
||||
return
|
||||
|
||||
def _handle_tables(self, shape, parent_slide, slide_ind, doc, slide_size):
|
||||
@@ -687,10 +687,9 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
||||
self._handle_tables(shape, parent_slide, slide_ind, doc, slide_size)
|
||||
if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
|
||||
# Handle Pictures
|
||||
- if hasattr(shape, "image"):
|
||||
- self._handle_pictures(
|
||||
- shape, parent_slide, slide_ind, doc, slide_size
|
||||
- )
|
||||
+ self._handle_pictures(
|
||||
+ shape, parent_slide, slide_ind, doc, slide_size
|
||||
+ )
|
||||
# If shape doesn't have any text, move on to the next shape
|
||||
if not hasattr(shape, "text"):
|
||||
return
|
||||
|
||||
Reference in New Issue
Block a user