mirror of
https://github.com/docling-project/docling.git
synced 2026-05-17 13:10:38 +00:00
1f914826bb
* fix: add failed pages to DoclingDocument for page break consistency When some PDF pages fail to parse, they were not added to DoclingDocument.pages, causing page break markers to be incorrect during export. This adds failed/skipped pages with their size info (if available) to maintain correct page numbering and structure. - Add _add_failed_pages_to_document() method in StandardPdfPipeline - Add test cases for failed page handling - Add test cases for normal page handling (regression test) - Add test PDF files Signed-off-by: jhchoi1182 <jhchoi1182@gmail.com> * fix: ensure resource cleanup and simplify type hints - Wrap page_backend usage in try-finally to guarantee unload (prevents resource leaks). - Simplify redundant 'float | None | None' type hint. Signed-off-by: jhchoi1182 <jhchoi1182@gmail.com> * fix: add groundtruth for normal_4pages.pdf and exclude failing PDFs from e2e test Signed-off-by: jhchoi1182 <jhchoi1182@gmail.com> * fix: ensure correct status assertion for failed pages in tests Signed-off-by: jhchoi1182 <jhchoi1182@gmail.com> --------- Signed-off-by: jhchoi1182 <jhchoi1182@gmail.com>
218 lines
7.5 KiB
Python
218 lines
7.5 KiB
Python
"""Tests for failed page handling in StandardPdfPipeline.

These tests verify that when some PDF pages fail to parse, they are still
added to DoclingDocument.pages to maintain correct page numbering and
ensure page break markers are generated correctly during export.

Related: https://github.com/docling-project/docling-core/pull/466
"""
|
|
|
|
from pathlib import Path
|
|
|
|
import pytest
|
|
|
|
from docling.datamodel.base_models import ConversionStatus, InputFormat
|
|
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
|
from docling.document_converter import DocumentConverter, PdfFormatOption
|
|
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
|
|
|
|
|
|
@pytest.fixture
def skipped_1page_path():
    """Provide the location of ``skipped_1page.pdf``.

    The tests in this module that use this file expect its conversion to
    end with a PARTIAL_SUCCESS status.
    """
    # Relative path: resolved against the directory the tests are run from.
    pdf_path = Path("./tests/data/pdf/skipped_1page.pdf")
    return pdf_path
|
|
|
|
|
|
@pytest.fixture
def skipped_2pages_path():
    """Provide the location of ``skipped_2pages.pdf``.

    The test in this module that uses this file expects its conversion to
    end with a PARTIAL_SUCCESS status.
    """
    # Relative path: resolved against the directory the tests are run from.
    pdf_path = Path("./tests/data/pdf/skipped_2pages.pdf")
    return pdf_path
|
|
|
|
|
|
@pytest.fixture
def normal_4pages_path():
    """Provide the location of ``normal_4pages.pdf``.

    The test in this module that uses this file expects its conversion to
    end with a SUCCESS status and no recorded errors.
    """
    # Relative path: resolved against the directory the tests are run from.
    pdf_path = Path("./tests/data/pdf/normal_4pages.pdf")
    return pdf_path
|
|
|
|
|
|
def test_normal_pages_all_present(normal_4pages_path):
    """Check that a normal PDF exposes every page in DoclingDocument.pages.

    Regression guard: the conversion must report SUCCESS, the document must
    hold pages 1..N where N is the input's page count, and no errors may be
    recorded.
    """
    # Build the converter step by step; OCR and table structure are disabled.
    pdf_pipeline_options = PdfPipelineOptions(do_ocr=False, do_table_structure=False)
    pdf_option = PdfFormatOption(
        pipeline_cls=StandardPdfPipeline,
        pipeline_options=pdf_pipeline_options,
    )
    converter = DocumentConverter(format_options={InputFormat.PDF: pdf_option})

    # raises_on_error=False: failures are inspected on the result object
    # instead of being raised.
    result = converter.convert(normal_4pages_path, raises_on_error=False)

    # The whole document is expected to convert cleanly.
    assert result.status == ConversionStatus.SUCCESS, (
        f"Expected SUCCESS status, got: {result.status}"
    )

    # The input document reports the reference page count.
    expected_page_count = result.input.page_count

    # Every page of the input must show up in the converted document.
    assert result.document is not None, "Document should not be None"
    pages = result.document.pages
    actual_page_count = len(pages)

    assert actual_page_count == expected_page_count, (
        f"DoclingDocument.pages should contain all {expected_page_count} pages, "
        f"but got {actual_page_count}"
    )

    # Page numbers are 1-based and must be exactly 1..expected_page_count.
    expected_page_nos = set(range(1, expected_page_count + 1))
    actual_page_nos = set(pages)

    assert actual_page_nos == expected_page_nos, (
        f"Missing page numbers in DoclingDocument.pages. "
        f"Expected: {expected_page_nos}, Got: {actual_page_nos}"
    )

    # A clean conversion leaves the error list empty.
    error_count = len(result.errors)
    assert error_count == 0, (
        f"No errors should be recorded for normal PDF, but got: {result.errors}"
    )
|
|
|
|
|
|
def test_failed_pages_added_to_document_1page(skipped_1page_path):
    """Check that a single failed page still appears in DoclingDocument.pages.

    The conversion must report PARTIAL_SUCCESS, and the document must hold
    pages 1..N where N is the input's page count.
    """
    # Build the converter step by step; OCR and table structure are disabled.
    pdf_pipeline_options = PdfPipelineOptions(do_ocr=False, do_table_structure=False)
    pdf_option = PdfFormatOption(
        pipeline_cls=StandardPdfPipeline,
        pipeline_options=pdf_pipeline_options,
    )
    converter = DocumentConverter(format_options={InputFormat.PDF: pdf_option})

    # raises_on_error=False: failures are inspected on the result object
    # instead of being raised.
    result = converter.convert(skipped_1page_path, raises_on_error=False)

    # The failed page(s) are expected to downgrade the status to partial success.
    assert result.status == ConversionStatus.PARTIAL_SUCCESS, (
        f"Unexpected status: {result.status}"
    )

    # The input document reports the reference page count.
    expected_page_count = result.input.page_count

    # Failed pages count too: the document must list every input page.
    assert result.document is not None, "Document should not be None"
    pages = result.document.pages
    actual_page_count = len(pages)

    assert actual_page_count == expected_page_count, (
        f"DoclingDocument.pages should contain all {expected_page_count} pages "
        f"(including failed ones), but got {actual_page_count}"
    )

    # Page numbers are 1-based and must be exactly 1..expected_page_count.
    expected_page_nos = set(range(1, expected_page_count + 1))
    actual_page_nos = set(pages)

    assert actual_page_nos == expected_page_nos, (
        f"Missing page numbers in DoclingDocument.pages. "
        f"Expected: {expected_page_nos}, Got: {actual_page_nos}"
    )
|
|
|
|
|
|
def test_failed_pages_added_to_document_2pages(skipped_2pages_path):
    """Check that multiple failed pages still appear in DoclingDocument.pages.

    The conversion must report PARTIAL_SUCCESS, and the document must hold
    pages 1..N where N is the input's page count.
    """
    # Build the converter step by step; OCR and table structure are disabled.
    pdf_pipeline_options = PdfPipelineOptions(do_ocr=False, do_table_structure=False)
    pdf_option = PdfFormatOption(
        pipeline_cls=StandardPdfPipeline,
        pipeline_options=pdf_pipeline_options,
    )
    converter = DocumentConverter(format_options={InputFormat.PDF: pdf_option})

    # raises_on_error=False: failures are inspected on the result object
    # instead of being raised.
    result = converter.convert(skipped_2pages_path, raises_on_error=False)

    # The failed page(s) are expected to downgrade the status to partial success.
    assert result.status == ConversionStatus.PARTIAL_SUCCESS, (
        f"Unexpected status: {result.status}"
    )

    # The input document reports the reference page count.
    expected_page_count = result.input.page_count

    # Failed pages count too: the document must list every input page.
    assert result.document is not None, "Document should not be None"
    pages = result.document.pages
    actual_page_count = len(pages)

    assert actual_page_count == expected_page_count, (
        f"DoclingDocument.pages should contain all {expected_page_count} pages "
        f"(including failed ones), but got {actual_page_count}"
    )

    # Page numbers are 1-based and must be exactly 1..expected_page_count.
    expected_page_nos = set(range(1, expected_page_count + 1))
    actual_page_nos = set(pages)

    assert actual_page_nos == expected_page_nos, (
        f"Missing page numbers in DoclingDocument.pages. "
        f"Expected: {expected_page_nos}, Got: {actual_page_nos}"
    )
|
|
|
|
|
|
def test_failed_pages_have_size_info(skipped_1page_path):
    """Check that every page of the converted document carries a size.

    Each entry of DoclingDocument.pages must have a non-None ``size`` whose
    width and height are both non-negative.
    """
    # Build the converter step by step; OCR and table structure are disabled.
    pdf_pipeline_options = PdfPipelineOptions(do_ocr=False, do_table_structure=False)
    pdf_option = PdfFormatOption(
        pipeline_cls=StandardPdfPipeline,
        pipeline_options=pdf_pipeline_options,
    )
    converter = DocumentConverter(format_options={InputFormat.PDF: pdf_option})

    # raises_on_error=False: failures are inspected on the result object
    # instead of being raised.
    result = converter.convert(skipped_1page_path, raises_on_error=False)

    assert result.document is not None, "Document should not be None"

    # Walk over every page entry, failed or not, and validate its size.
    for page_no, page_item in result.document.pages.items():
        page_size = page_item.size
        assert page_size is not None, (
            f"Page {page_no} should have size information"
        )
        # A valid size is non-negative (either from backend or default 0.0).
        assert page_size.width >= 0, f"Page {page_no} width should be >= 0"
        assert page_size.height >= 0, f"Page {page_no} height should be >= 0"
|
|
|
|
|
|
def test_errors_recorded_for_failed_pages(skipped_1page_path):
    """Test that errors are recorded in conv_res.errors for failed pages.

    The status is asserted unconditionally before the error list is checked.
    Guarding the error check behind ``if status == PARTIAL_SUCCESS`` would let
    the test pass without verifying anything whenever the conversion ended in
    a different status.
    """
    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_cls=StandardPdfPipeline,
                pipeline_options=PdfPipelineOptions(
                    do_ocr=False,
                    do_table_structure=False,
                ),
            )
        }
    )

    # raises_on_error=False: failures are inspected on the result object
    # instead of being raised.
    result = converter.convert(skipped_1page_path, raises_on_error=False)

    # Same expectation as the other tests using this file: the failed page
    # downgrades the conversion to partial success.
    assert result.status == ConversionStatus.PARTIAL_SUCCESS, (
        f"Unexpected status: {result.status}"
    )

    # A partially successful conversion must record at least one error.
    assert len(result.errors) > 0, (
        "PARTIAL_SUCCESS status should have errors recorded"
    )
|