Updated the tests and DoclingPdfRenderer

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
Peter Staar
2026-04-16 10:08:43 +02:00
parent 0288f6ea85
commit f60c12cd88
6 changed files with 25462 additions and 1844 deletions
+125
View File
@@ -1159,3 +1159,128 @@ class DoclingThreadedPdfRenderer:
PdfPageRenderResult: wraps doc_key, page_number, success, and get_image().
"""
return PdfPageRenderResult(self._renderer.get_task())
class PdfRenderDocument:
    """Render-on-demand view of a single PDF document.

    Combines an already loaded parser document (used for the page count
    and for unloading) with the configuration needed to build a
    ``DoclingThreadedPdfRenderer``. Render results are cached in a dict
    keyed by 1-based page number and filled the first time a page is
    requested.
    """

    def __init__(
        self,
        *,
        path_or_stream: Union[Path, bytes],
        parser_doc: PdfDocument,
        renderer_config: ThreadedPdfRendererConfig,
        decode_config: DecodePageConfig,
        render_config: RenderConfig,
        password: Optional[str] = None,
    ):
        """Remember the source and configuration; nothing is rendered here.

        Args:
            path_or_stream: Either a filesystem path or the raw PDF bytes.
            parser_doc: Parser-side document; queried for the page count
                and unloaded together with this object.
            renderer_config: Passed to the threaded renderer on creation.
            decode_config: Passed to the threaded renderer on creation.
            render_config: Passed to the threaded renderer on creation.
            password: Optional password handed to the renderer's ``load``.
        """
        # Cache of render results, keyed by 1-based page number.
        self._pages: Dict[int, PdfPageRenderResult] = {}

        self._parser_doc = parser_doc
        self._path_or_stream = path_or_stream
        self._password = password

        self._renderer_config = renderer_config
        self._decode_config = decode_config
        self._render_config = render_config

    def _make_renderer(self) -> "DoclingThreadedPdfRenderer":
        """Create a new threaded renderer from the stored configuration."""
        settings = {
            "renderer_config": self._renderer_config,
            "decode_config": self._decode_config,
            "render_config": self._render_config,
        }
        return DoclingThreadedPdfRenderer(**settings)

    def _load_source(self, renderer: "DoclingThreadedPdfRenderer") -> str:
        """Load the stored source into ``renderer`` and return its result.

        A ``Path`` is passed through unchanged; anything else is treated
        as raw bytes and wrapped in a ``BytesIO`` first.
        """
        source = self._path_or_stream
        if not isinstance(source, Path):
            source = BytesIO(source)
        return renderer.load(source, password=self._password)

    def _render_all_pages(self) -> None:
        """Fill the page cache by draining a freshly created renderer.

        Does nothing when the cache already holds as many entries as the
        document has pages.

        Raises:
            RuntimeError: If a result for this document reports failure.
        """
        cache_is_complete = len(self._pages) == self.number_of_pages()
        if cache_is_complete:
            return

        renderer = self._make_renderer()
        doc_key = self._load_source(renderer)

        while renderer.has_tasks():
            task = renderer.get_task()
            if task.doc_key != doc_key:
                # Result belongs to a different document key; ignore it.
                continue
            if not task.success:
                raise RuntimeError(
                    f"Failed to render page {task.page_number + 1}: {task.error()}"
                )
            # The cache is keyed one above the page number the result reports.
            self._pages[task.page_number + 1] = task

    def number_of_pages(self) -> int:
        """Return the page count reported by the parser document."""
        page_count = self._parser_doc.number_of_pages()
        return page_count

    def get_page(self, page_no: int) -> PdfPageRenderResult:
        """Return the render result for the 1-based page ``page_no``.

        Triggers rendering of the document when the page is not cached.

        Raises:
            ValueError: If ``page_no`` lies outside ``1..number_of_pages()``.
        """
        total = self.number_of_pages()
        if page_no < 1 or page_no > total:
            raise ValueError(f"incorrect page_no: {page_no} (min:1, max:{total})")

        if page_no not in self._pages:
            self._render_all_pages()
        return self._pages[page_no]

    def iterate_pages(self) -> Iterator[Tuple[int, PdfPageRenderResult]]:
        """Yield ``(page_no, render_result)`` pairs in ascending page order."""
        self._render_all_pages()
        page_numbers = range(1, self.number_of_pages() + 1)
        yield from ((page_no, self._pages[page_no]) for page_no in page_numbers)

    def unload(self) -> bool:
        """Drop all cached results and unload the parser document.

        Returns:
            Whatever the parser document's ``unload()`` returns.
        """
        self._pages.clear()
        unloaded = self._parser_doc.unload()
        return unloaded
class DoclingPdfRenderer:
    """Front end that loads a PDF and returns a ``PdfRenderDocument``.

    Owns one ``DoclingPdfParser`` plus a ``ThreadedPdfRendererConfig``
    fixed to a single thread and a single concurrent result. The decode
    and render configurations are shared by every document loaded
    through this instance.
    """

    def __init__(
        self,
        loglevel: str = "fatal",
        decode_config: Optional[DecodePageConfig] = None,
        render_config: Optional[RenderConfig] = None,
    ):
        """Create the parser and the renderer configuration.

        Args:
            loglevel: Log level given to both the parser and the
                renderer configuration.
            decode_config: Decode settings; a default ``DecodePageConfig``
                is created when omitted.
            render_config: Render settings; a default ``RenderConfig`` is
                created when omitted.
        """
        self._loglevel = loglevel
        self._parser = DoclingPdfParser(loglevel=loglevel)
        self._renderer_config = ThreadedPdfRendererConfig(
            loglevel=loglevel,
            threads=1,
            max_concurrent_results=1,
        )
        # Compare against None explicitly so that a caller-supplied config
        # is never replaced just because it happens to evaluate as falsy.
        self._decode_config = (
            decode_config if decode_config is not None else DecodePageConfig()
        )
        self._render_config = (
            render_config if render_config is not None else RenderConfig()
        )

    @staticmethod
    def _as_render_source(
        path_or_stream: Union[str, Path, BytesIO],
    ) -> Union[Path, bytes]:
        """Convert the caller's input to what ``PdfRenderDocument`` stores.

        A ``str`` becomes a ``Path``, a ``Path`` is returned unchanged and
        a ``BytesIO`` is snapshotted with ``getvalue()``, which returns
        the whole buffer regardless of the current stream position.

        Raises:
            TypeError: If the input is none of the supported types.
        """
        if isinstance(path_or_stream, str):
            return Path(path_or_stream)
        if isinstance(path_or_stream, Path):
            return path_or_stream
        if isinstance(path_or_stream, BytesIO):
            return path_or_stream.getvalue()
        raise TypeError(
            f"Expected str, Path, or BytesIO, got {type(path_or_stream)}"
        )

    def load(
        self,
        path_or_stream: Union[str, Path, BytesIO],
        lazy: bool = True,
        boundary_type: PdfPageBoundaryType = PdfPageBoundaryType.CROP_BOX,
        password: Optional[str] = None,
    ) -> PdfRenderDocument:
        """Load a PDF and wrap it for on-demand rendering.

        Args:
            path_or_stream: Path (as ``str`` or ``Path``) or an in-memory
                ``BytesIO`` holding the PDF.
            lazy: Forwarded to the parser's ``load``.
            boundary_type: Forwarded to the parser's ``load``.
            password: Forwarded to the parser's ``load`` and stored on the
                returned document for the renderer.

        Returns:
            A ``PdfRenderDocument`` bound to this instance's renderer,
            decode and render configurations.

        Raises:
            TypeError: If ``path_or_stream`` has an unsupported type.
        """
        # Validate and normalise before touching the parser, so that an
        # unsupported input fails with TypeError and no parser document
        # is loaded and then abandoned.
        source = self._as_render_source(path_or_stream)

        parser_doc = self._parser.load(
            path_or_stream=path_or_stream,
            lazy=lazy,
            boundary_type=boundary_type,
            password=password,
        )
        return PdfRenderDocument(
            path_or_stream=source,
            parser_doc=parser_doc,
            renderer_config=self._renderer_config,
            decode_config=self._decode_config,
            render_config=self._render_config,
            password=password,
        )
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
+68 -24
View File
@@ -1,21 +1,28 @@
#!/usr/bin/env python
import glob
import hashlib
import json
import os
from pathlib import Path
from docling_parse.pdf_parser import DecodePageConfig, DoclingPdfParser, PdfDocument
from docling_parse.pdf_parser import (
DecodePageConfig,
DoclingPdfRenderer,
PdfRenderDocument,
)
GENERATE = False
GROUNDTRUTH_RENDERER_FOLDER = "tests/data/groundtruth_renderer"
REGRESSION_FOLDER = "tests/data/regression/*.pdf"
RENDER_CASES = {
"font_01.pdf": [1],
"rotated_page_01.pdf": [1],
"fillable_form.pdf": [1],
"indexed_iccbased.pdf": [1],
"rotated_image.pdf": [1],
PAGE_RESTRICTIONS = {
"deep-mediabox-inheritance.pdf": [2],
"font_06.pdf": [1],
"font_07.pdf": [1],
"font_08.pdf": [1],
"font_09.pdf": [1],
"font_10.pdf": [1],
}
BITMAP_RESTRICTIONS = {
@@ -23,6 +30,7 @@ BITMAP_RESTRICTIONS = {
1: [1, 5, 10, 15],
},
}
MAX_BITMAPS_PER_PAGE = 5
def _round_floats(obj, ndigits=3):
@@ -47,6 +55,10 @@ def _bitmap_json_path(pdf_name: str, page_no: int, bitmap_index: int) -> Path:
return Path(f"{_page_prefix(pdf_name, page_no)}.bitmap_{bitmap_index}.json")
def _full_page_png_path(pdf_name: str, page_no: int) -> Path:
    """Return the ``<page prefix>.full_page.png`` path for one page.

    The prefix comes from ``_page_prefix(pdf_name, page_no)``.
    """
    prefix = _page_prefix(pdf_name, page_no)
    return Path(f"{prefix}.full_page.png")
def _write_json(path: Path, payload) -> None:
path.parent.mkdir(parents=True, exist_ok=True)
with open(path, "w", encoding="utf-8") as fw:
@@ -58,14 +70,26 @@ def _load_json(path: Path):
return json.load(fr)
def _artifact_basename(pdf_name: str, page_no: int, bitmap_index: int, extension: str) -> str:
def _artifact_basename(
pdf_name: str, page_no: int, bitmap_index: int, extension: str
) -> str:
return f"{pdf_name}.page_no_{page_no}.bitmap_{bitmap_index}{extension}"
def _selected_bitmap_indices(
    pdf_name: str, page_no: int, num_bitmaps: int
) -> set[int]:
    """Return the 1-based bitmap indices to process for one page.

    When ``BITMAP_RESTRICTIONS`` lists indices for this document and
    page, the first ``MAX_BITMAPS_PER_PAGE`` of them are used. Otherwise
    the indices ``1..min(num_bitmaps, MAX_BITMAPS_PER_PAGE)`` are used.
    """
    per_document = BITMAP_RESTRICTIONS.get(pdf_name, {})
    explicit = per_document.get(page_no)
    if explicit is not None:
        return set(explicit[:MAX_BITMAPS_PER_PAGE])

    upper = min(num_bitmaps, MAX_BITMAPS_PER_PAGE)
    return set(range(1, upper + 1))
def _export_or_verify_bitmaps(pdf_name: str, page_no: int, bitmaps) -> None:
selected = _selected_bitmap_indices(pdf_name, page_no, len(bitmaps))
for bitmap_index, bitmap in enumerate(bitmaps, start=1):
allowed = BITMAP_RESTRICTIONS.get(pdf_name, {}).get(page_no)
if allowed is not None and bitmap_index not in allowed:
if bitmap_index not in selected:
continue
raw_sha256 = hashlib.sha256(bitmap["raw_data"]).hexdigest()
@@ -104,31 +128,50 @@ def _export_or_verify_bitmaps(pdf_name: str, page_no: int, bitmaps) -> None:
), f"bitmap artifact bytes mismatch for {artifact_path}"
def test_render_reference_documents():
parser = DoclingPdfParser(loglevel="fatal")
def _export_full_page_png(pdf_name: str, page_no: int, image) -> None:
    """Save the full-page image as PNG unless a file already exists.

    Nothing is written when the target file is already present or when
    ``image`` is ``None``; an existing file is never overwritten.

    Args:
        pdf_name: Document name used to derive the output path.
        page_no: Page number used to derive the output path.
        image: Object with a ``save(path, format=...)`` method
            (presumably a PIL image; confirm against the renderer), or
            ``None``.
    """
    target = _full_page_png_path(pdf_name, page_no)
    if target.exists() or image is None:
        return

    target.parent.mkdir(parents=True, exist_ok=True)
    image.save(target, format="PNG")
def test_render_reference_documents():
config = DecodePageConfig()
config.page_boundary = "crop_box"
config.do_sanitization = False
config.keep_glyphs = True
config.keep_qpdf_warnings = False
renderer = DoclingPdfRenderer(loglevel="fatal", decode_config=config)
results = []
for pdf_name, page_numbers in RENDER_CASES.items():
pdf_path = os.path.join("tests/data/regression", pdf_name)
pdf_paths = sorted(glob.glob(REGRESSION_FOLDER))
assert len(pdf_paths) > 0, "len(pdf_paths)==0 -> nothing to test"
pdf_doc: PdfDocument = parser.load(path_or_stream=pdf_path, lazy=True)
for pdf_path in pdf_paths:
pdf_name = os.path.basename(pdf_path)
pdf_doc: PdfRenderDocument = renderer.load(path_or_stream=pdf_path, lazy=True)
assert pdf_doc is not None
for page_no in page_numbers:
for page_no in range(1, pdf_doc.number_of_pages() + 1):
if (
pdf_name in PAGE_RESTRICTIONS
and page_no not in PAGE_RESTRICTIONS[pdf_name]
):
continue
try:
page_decoder = pdf_doc._parser.get_page_decoder(
key=pdf_doc._key,
page=page_no - 1,
config=config,
)
assert page_decoder is not None, f"failed to decode {pdf_name}@{page_no}"
render_result = pdf_doc.get_page(page_no)
assert (
render_result is not None
), f"failed to render {pdf_name}@{page_no}"
page_decoder, timings = render_result.get()
instructions = page_decoder.export_render_instructions_json()
instruction_path = _instruction_path(pdf_name, page_no)
@@ -143,12 +186,13 @@ def test_render_reference_documents():
bitmap_artifacts = page_decoder.export_bitmap_artifacts()
_export_or_verify_bitmaps(pdf_name, page_no, bitmap_artifacts)
_export_full_page_png(pdf_name, page_no, render_result.get_image())
results.append((pdf_name, page_no, True, ""))
except Exception as exc:
results.append((pdf_name, page_no, False, str(exc)))
finally:
pdf_doc.unload_pages(page_range=(page_no, page_no + 1))
pdf_doc.unload()
failed = [(doc, page, err) for doc, page, ok, err in results if not ok]
assert not failed, f"{len(failed)} page(s) failed: " + ", ".join(