Files
docling/tests/test_failed_pages.py
jhchoi1182 1f914826bb fix: add failed pages to DoclingDocument for page break consistency (#2939)
* fix: add failed pages to DoclingDocument for page break consistency

When some PDF pages fail to parse, they were not added to
DoclingDocument.pages, causing page break markers to be incorrect
during export. This adds failed/skipped pages with their size info
(if available) to maintain correct page numbering and structure.

- Add _add_failed_pages_to_document() method in StandardPdfPipeline
- Add test cases for failed page handling
- Add test cases for normal page handling (regression test)
- Add test PDF files

Signed-off-by: jhchoi1182 <jhchoi1182@gmail.com>

* fix: ensure resource cleanup and simplify type hints

- Wrap page_backend usage in try-finally to guarantee unload (prevents resource leaks).
- Simplify redundant 'float | None | None' type hint.

Signed-off-by: jhchoi1182 <jhchoi1182@gmail.com>

* fix: add groundtruth for normal_4pages.pdf and exclude failing PDFs from e2e test

Signed-off-by: jhchoi1182 <jhchoi1182@gmail.com>

* fix: ensure correct status assertion for failed pages in tests

Signed-off-by: jhchoi1182 <jhchoi1182@gmail.com>

---------

Signed-off-by: jhchoi1182 <jhchoi1182@gmail.com>
2026-02-13 13:35:35 +01:00

218 lines
7.5 KiB
Python

"""Tests for failed page handling in StandardPdfPipeline.
These tests verify that when some PDF pages fail to parse, they are still
added to DoclingDocument.pages to maintain correct page numbering and
ensure page break markers are generated correctly during export.
Related: https://github.com/docling-project/docling-core/pull/466
"""
from pathlib import Path
import pytest
from docling.datamodel.base_models import ConversionStatus, InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
@pytest.fixture
def skipped_1page_path():
return Path("./tests/data/pdf/skipped_1page.pdf")
@pytest.fixture
def skipped_2pages_path():
return Path("./tests/data/pdf/skipped_2pages.pdf")
@pytest.fixture
def normal_4pages_path():
return Path("./tests/data/pdf/normal_4pages.pdf")
def test_normal_pages_all_present(normal_4pages_path):
"""Test that all pages are present in DoclingDocument.pages for a normal PDF."""
converter = DocumentConverter(
format_options={
InputFormat.PDF: PdfFormatOption(
pipeline_cls=StandardPdfPipeline,
pipeline_options=PdfPipelineOptions(
do_ocr=False,
do_table_structure=False,
),
)
}
)
result = converter.convert(normal_4pages_path, raises_on_error=False)
# Document should succeed completely
assert result.status == ConversionStatus.SUCCESS, (
f"Expected SUCCESS status, got: {result.status}"
)
# Get expected page count from input
expected_page_count = result.input.page_count
# DoclingDocument.pages should contain all pages
assert result.document is not None, "Document should not be None"
actual_page_count = len(result.document.pages)
assert actual_page_count == expected_page_count, (
f"DoclingDocument.pages should contain all {expected_page_count} pages, "
f"but got {actual_page_count}"
)
# Verify all page numbers are present
expected_page_nos = set(range(1, expected_page_count + 1))
actual_page_nos = set(result.document.pages.keys())
assert actual_page_nos == expected_page_nos, (
f"Missing page numbers in DoclingDocument.pages. "
f"Expected: {expected_page_nos}, Got: {actual_page_nos}"
)
# No errors should be recorded
assert len(result.errors) == 0, (
f"No errors should be recorded for normal PDF, but got: {result.errors}"
)
def test_failed_pages_added_to_document_1page(skipped_1page_path):
"""Test that a single failed page is added to DoclingDocument.pages."""
converter = DocumentConverter(
format_options={
InputFormat.PDF: PdfFormatOption(
pipeline_cls=StandardPdfPipeline,
pipeline_options=PdfPipelineOptions(
do_ocr=False,
do_table_structure=False,
),
)
}
)
result = converter.convert(skipped_1page_path, raises_on_error=False)
# Document should have partial success due to failed page(s)
assert result.status == ConversionStatus.PARTIAL_SUCCESS, (
f"Unexpected status: {result.status}"
)
# Get expected page count from input
expected_page_count = result.input.page_count
# DoclingDocument.pages should contain all pages (including failed ones)
assert result.document is not None, "Document should not be None"
actual_page_count = len(result.document.pages)
assert actual_page_count == expected_page_count, (
f"DoclingDocument.pages should contain all {expected_page_count} pages "
f"(including failed ones), but got {actual_page_count}"
)
# Verify all page numbers are present
expected_page_nos = set(range(1, expected_page_count + 1))
actual_page_nos = set(result.document.pages.keys())
assert actual_page_nos == expected_page_nos, (
f"Missing page numbers in DoclingDocument.pages. "
f"Expected: {expected_page_nos}, Got: {actual_page_nos}"
)
def test_failed_pages_added_to_document_2pages(skipped_2pages_path):
"""Test that multiple failed pages are added to DoclingDocument.pages."""
converter = DocumentConverter(
format_options={
InputFormat.PDF: PdfFormatOption(
pipeline_cls=StandardPdfPipeline,
pipeline_options=PdfPipelineOptions(
do_ocr=False,
do_table_structure=False,
),
)
}
)
result = converter.convert(skipped_2pages_path, raises_on_error=False)
# Document should have partial success due to failed page(s)
assert result.status == ConversionStatus.PARTIAL_SUCCESS, (
f"Unexpected status: {result.status}"
)
# Get expected page count from input
expected_page_count = result.input.page_count
# DoclingDocument.pages should contain all pages (including failed ones)
assert result.document is not None, "Document should not be None"
actual_page_count = len(result.document.pages)
assert actual_page_count == expected_page_count, (
f"DoclingDocument.pages should contain all {expected_page_count} pages "
f"(including failed ones), but got {actual_page_count}"
)
# Verify all page numbers are present
expected_page_nos = set(range(1, expected_page_count + 1))
actual_page_nos = set(result.document.pages.keys())
assert actual_page_nos == expected_page_nos, (
f"Missing page numbers in DoclingDocument.pages. "
f"Expected: {expected_page_nos}, Got: {actual_page_nos}"
)
def test_failed_pages_have_size_info(skipped_1page_path):
"""Test that failed pages have size information when available."""
converter = DocumentConverter(
format_options={
InputFormat.PDF: PdfFormatOption(
pipeline_cls=StandardPdfPipeline,
pipeline_options=PdfPipelineOptions(
do_ocr=False,
do_table_structure=False,
),
)
}
)
result = converter.convert(skipped_1page_path, raises_on_error=False)
assert result.document is not None, "Document should not be None"
# All pages should have size information
for page_no, page_item in result.document.pages.items():
assert page_item.size is not None, (
f"Page {page_no} should have size information"
)
# Size should be valid (either from backend or default 0.0)
assert page_item.size.width >= 0, f"Page {page_no} width should be >= 0"
assert page_item.size.height >= 0, f"Page {page_no} height should be >= 0"
def test_errors_recorded_for_failed_pages(skipped_1page_path):
"""Test that errors are recorded in conv_res.errors for failed pages."""
converter = DocumentConverter(
format_options={
InputFormat.PDF: PdfFormatOption(
pipeline_cls=StandardPdfPipeline,
pipeline_options=PdfPipelineOptions(
do_ocr=False,
do_table_structure=False,
),
)
}
)
result = converter.convert(skipped_1page_path, raises_on_error=False)
# If status is PARTIAL_SUCCESS, there should be errors recorded
if result.status == ConversionStatus.PARTIAL_SUCCESS:
assert len(result.errors) > 0, (
"PARTIAL_SUCCESS status should have errors recorded"
)