Files
docling/tests/test_options.py
Nikos Livathinos 8b67fae687 feat: Extend the kserve-triton OCR model to have multi-lingual support (#3368)
* chore: Update .gitignore with local dirs of AI agents

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>

* feat: Extend KserveV2OcrModel and kserve_v2_grpc.py to support the new version of Triton-RapidOCR
model where the language is the first input parameter:
- The gRPC client has been extended to encode BYTE input, needed for String types.
- An additional test verifies that BYTE encoding/decoding works correctly.

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>

* feat: Add test for the KServe-Triton integration: WIP
- The test currently supports only the gRPC KServe client
- Extend the ground-truth test data.

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>

* fix: Simplify code in kserve test

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>

* chore: Rename test file

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>

* feat: Extend the kserve_v2 implementation to support binary data in the HTTP interface.
- Decouple functions for binary encoding/decoding inside the kserve_v2_utils.py and share for both HTTP and gRPC.
- Introduce use_binary_data init parameter in KserveV2OptionsMixin
- Improve tests

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>

* fix: Put back the field grpc_use_binary_data of KserveV2OptionsMixin as a deprecated alias to use_binary_data

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>

---------

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>
2026-04-28 16:00:57 +02:00

289 lines
9.5 KiB
Python

import os
from pathlib import Path
from unittest.mock import Mock, patch
import pytest
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
from docling.datamodel.base_models import ConversionStatus, InputFormat, QualityGrade
from docling.datamodel.document import ConversionResult
from docling.datamodel.image_classification_engine_options import (
ApiKserveV2ImageClassificationEngineOptions,
)
from docling.datamodel.pipeline_options import (
PdfPipelineOptions,
TableFormerMode,
)
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.legacy_standard_pdf_pipeline import LegacyStandardPdfPipeline
@pytest.fixture
def test_doc_path():
    """Provide the path of the sample PDF shared by the conversion tests."""
    sample_pdf = Path("./tests/data/pdf/2206.01062.pdf")
    return sample_pdf
def get_converters_with_table_options():
    """Yield one PDF converter per table-structure configuration.

    Every converter has OCR switched off and table-structure recognition
    switched on. Cell matching is the outer dimension (True, then False) and
    the TableFormer mode the inner one (FAST, then ACCURATE), so four
    converters are yielded in total, each built lazily when requested.
    """
    combinations = [
        (use_cell_matching, tableformer_mode)
        for use_cell_matching in (True, False)
        for tableformer_mode in (TableFormerMode.FAST, TableFormerMode.ACCURATE)
    ]
    for use_cell_matching, tableformer_mode in combinations:
        options = PdfPipelineOptions()
        options.do_ocr = False
        options.do_table_structure = True

        table_options = options.table_structure_options
        table_options.do_cell_matching = use_cell_matching
        table_options.mode = tableformer_mode

        pdf_format = PdfFormatOption(pipeline_options=options)
        yield DocumentConverter(format_options={InputFormat.PDF: pdf_format})
def test_accelerator_options():
    """Check how AcceleratorOptions resolves num_threads and device.

    Covers the built-in defaults, explicit constructor arguments, the
    OMP_NUM_THREADS / DOCLING_NUM_THREADS / DOCLING_DEVICE environment
    variables, the expected precedence between them, and invalid values.
    """
    env_names = ("OMP_NUM_THREADS", "DOCLING_NUM_THREADS", "DOCLING_DEVICE")

    # patch.dict snapshots os.environ on entry and restores it on exit, so the
    # variables set below never leak into other tests, even when an assertion
    # fails half-way through.
    with patch.dict(os.environ):
        # Start from a clean slate so the default checks do not depend on the
        # environment the test suite happens to be launched from.
        for name in env_names:
            os.environ.pop(name, None)

        # Check the default options
        ao = AcceleratorOptions()
        assert ao.num_threads == 4, "Wrong default num_threads"
        assert ao.device == AcceleratorDevice.AUTO, "Wrong default device"

        # Use API
        ao2 = AcceleratorOptions(num_threads=2, device=AcceleratorDevice.MPS)
        ao3 = AcceleratorOptions(num_threads=3, device=AcceleratorDevice.CUDA)
        ao4 = AcceleratorOptions(num_threads=4, device=AcceleratorDevice.XPU)
        assert ao2.num_threads == 2
        assert ao2.device == AcceleratorDevice.MPS
        assert ao3.num_threads == 3
        assert ao3.device == AcceleratorDevice.CUDA
        assert ao4.num_threads == 4
        assert ao4.device == AcceleratorDevice.XPU

        # Use envvars (regular + alternative) and default values.
        # A fresh instance is constructed after each environment change
        # instead of re-running __init__ on an existing object.
        os.environ["OMP_NUM_THREADS"] = "1"
        ao = AcceleratorOptions()
        assert ao.num_threads == 1
        assert ao.device == AcceleratorDevice.AUTO

        os.environ["DOCLING_DEVICE"] = "cpu"
        ao = AcceleratorOptions()
        assert ao.device == AcceleratorDevice.CPU
        assert ao.num_threads == 1

        # Use envvars and override in init: explicit arguments are expected
        # to win over the environment.
        os.environ["DOCLING_DEVICE"] = "cpu"
        ao5 = AcceleratorOptions(num_threads=5, device=AcceleratorDevice.MPS)
        assert ao5.num_threads == 5
        assert ao5.device == AcceleratorDevice.MPS

        # Use regular and alternative envvar: with OMP_NUM_THREADS still "1",
        # DOCLING_NUM_THREADS is expected to take precedence.
        os.environ["DOCLING_NUM_THREADS"] = "2"
        ao6 = AcceleratorOptions()
        assert ao6.num_threads == 2
        assert ao6.device == AcceleratorDevice.CPU

        # Use wrong values: an unknown device name must be rejected.
        # The concrete exception type is not pinned here, matching the
        # original broad check.
        os.environ["DOCLING_DEVICE"] = "wrong"
        with pytest.raises(Exception) as exc_info:
            AcceleratorOptions()
        print(exc_info.value)

        # Use misformatted alternative envvar: the defaults are expected.
        del os.environ["DOCLING_NUM_THREADS"]
        del os.environ["DOCLING_DEVICE"]
        os.environ["OMP_NUM_THREADS"] = "wrong"
        ao7 = AcceleratorOptions()
        assert ao7.num_threads == 4
        assert ao7.device == AcceleratorDevice.AUTO
def test_kserve_v2_binary_data_deprecated_alias():
    """Check that grpc_use_binary_data still works as a deprecated alias.

    The alias must feed use_binary_data, warn with a deprecation when read,
    stay out of model_dump(), and lose against an explicit use_binary_data.
    """
    endpoint = "localhost:8001"

    # Only the deprecated name is supplied: its value must land in use_binary_data.
    alias_only = ApiKserveV2ImageClassificationEngineOptions(
        url=endpoint, grpc_use_binary_data=False
    )
    assert alias_only.use_binary_data is False
    with pytest.deprecated_call(match="deprecated; use use_binary_data instead"):
        assert alias_only.grpc_use_binary_data is False
    dumped = alias_only.model_dump()
    assert "grpc_use_binary_data" not in dumped

    # Both names are supplied with conflicting values: the new name wins.
    both_given = ApiKserveV2ImageClassificationEngineOptions(
        url=endpoint, use_binary_data=True, grpc_use_binary_data=False
    )
    assert both_given.use_binary_data is True
def test_e2e_conversions(test_doc_path):
    """Convert the sample PDF with every table-option converter and expect success."""
    converters = get_converters_with_table_options()
    for table_converter in converters:
        print(f"converting {test_doc_path}")
        outcome: ConversionResult = table_converter.convert(test_doc_path)
        assert outcome.status == ConversionStatus.SUCCESS
def test_page_range(test_doc_path):
    """Check page_range handling for an in-range and an out-of-range page.

    Requesting only page 9 must succeed, report a page count of 9 for the
    input and produce a one-page document. Requesting page 10 with
    raises_on_error=False must end in a FAILURE status.
    """
    converter = DocumentConverter()

    in_range: ConversionResult = converter.convert(test_doc_path, page_range=(9, 9))
    assert in_range.status == ConversionStatus.SUCCESS
    assert in_range.input.page_count == 9
    assert in_range.document.num_pages() == 1

    out_of_range: ConversionResult = converter.convert(
        test_doc_path, page_range=(10, 10), raises_on_error=False
    )
    assert out_of_range.status == ConversionStatus.FAILURE
def test_document_timeout(test_doc_path):
    """Check that document_timeout=1 yields PARTIAL_SUCCESS for both pipelines.

    The same conversion is run twice: first with the default pipeline class of
    PdfFormatOption, then with LegacyStandardPdfPipeline passed explicitly.
    """
    # An empty mapping leaves pipeline_cls at its default for the first run.
    for extra_format_kwargs in ({}, {"pipeline_cls": LegacyStandardPdfPipeline}):
        pdf_format = PdfFormatOption(
            pipeline_options=PdfPipelineOptions(document_timeout=1),
            **extra_format_kwargs,
        )
        converter = DocumentConverter(format_options={InputFormat.PDF: pdf_format})
        result = converter.convert(test_doc_path)
        assert result.status == ConversionStatus.PARTIAL_SUCCESS, (
            "Expected document timeout to be used"
        )
def test_ocr_coverage_threshold(test_doc_path):
    """Check that a very high bitmap_area_threshold leaves the document without texts.

    OCR is enabled with bitmap_area_threshold set to 1.1 and the scanned sample
    PDF is converted; the resulting document must contain no text items.
    """
    ocr_pipeline_options = PdfPipelineOptions()
    ocr_pipeline_options.do_ocr = True
    ocr_pipeline_options.ocr_options.bitmap_area_threshold = 1.1

    pdf_format = PdfFormatOption(pipeline_options=ocr_pipeline_options)
    converter = DocumentConverter(format_options={InputFormat.PDF: pdf_format})

    # The path supplied by the fixture is not converted here; this test uses
    # the scanned sample instead.
    scanned_pdf = Path("./tests/data_scanned/ocr_test.pdf")
    doc_result: ConversionResult = converter.convert(scanned_pdf)

    # this should have generated no results, since we set a very high threshold
    assert len(doc_result.document.texts) == 0
def test_parser_backends(test_doc_path):
    """Convert the code-and-formula sample with each PDF backend and expect success.

    OCR and table-structure recognition are disabled; the backends exercised
    are DoclingParseDocumentBackend and PyPdfiumDocumentBackend, in that order.
    """
    shared_options = PdfPipelineOptions()
    shared_options.do_ocr = False
    shared_options.do_table_structure = False

    # The path supplied by the fixture is not converted here; every backend
    # runs on this sample.
    sample_pdf = Path("./tests/data/pdf/code_and_formula.pdf")

    for backend_cls in (DoclingParseDocumentBackend, PyPdfiumDocumentBackend):
        pdf_format = PdfFormatOption(
            pipeline_options=shared_options,
            backend=backend_cls,
        )
        converter = DocumentConverter(format_options={InputFormat.PDF: pdf_format})
        outcome: ConversionResult = converter.convert(sample_pdf)
        assert outcome.status == ConversionStatus.SUCCESS
def test_pipeline_cache_after_initialize(test_doc_path):
    """Check that convert() reuses the pipeline cached by initialize_pipeline().

    Regression test for #3109: code_formula_options were mutated in-place
    during pipeline initialization, which changed the options hash and caused
    a cache miss when convert() was called afterwards.
    """
    options = PdfPipelineOptions()
    options.do_ocr = False
    options.do_table_structure = False

    pdf_format = PdfFormatOption(pipeline_options=options)
    converter = DocumentConverter(format_options={InputFormat.PDF: pdf_format})

    converter.initialize_pipeline(InputFormat.PDF)
    cached_after_init = len(converter._get_initialized_pipelines())
    assert cached_after_init == 1

    # A second cache entry here would mean convert() built a new pipeline.
    converter.convert(test_doc_path)
    cached_after_convert = len(converter._get_initialized_pipelines())
    assert cached_after_convert == 1, (
        "Pipeline should be reused from cache, not re-initialized"
    )
def test_confidence(test_doc_path):
    """Check that pages 6-9 of the sample PDF get EXCELLENT mean and low grades."""
    result: ConversionResult = DocumentConverter().convert(
        test_doc_path, page_range=(6, 9)
    )
    expected_grade = QualityGrade.EXCELLENT
    assert result.confidence.mean_grade == expected_grade
    assert result.confidence.low_grade == expected_grade
def test_pipeline_cache_with_chart_extraction():
    """Check that enabling chart extraction does not invalidate the pipeline cache.

    Guards against a bug where enabling chart extraction mutated the shared
    pipeline_options, changing its hash and causing an unnecessary
    re-initialization. The chart-extraction model and the picture classifier
    are replaced by mocks for the duration of the test.
    """
    chart_model_target = (
        "docling.pipeline.base_pipeline.ChartExtractionModelGraniteVisionV4"
    )
    classifier_target = "docling.pipeline.base_pipeline.DocumentPictureClassifier"

    chart_options = PdfPipelineOptions()
    chart_options.do_chart_extraction = True

    with patch(chart_model_target) as mock_chart:
        with patch(classifier_target) as mock_classifier:
            # Instantiating either patched class returns a mock with enabled=True.
            mock_chart.return_value = Mock(enabled=True)
            mock_classifier.return_value = Mock(enabled=True)

            pdf_format = PdfFormatOption(pipeline_options=chart_options)
            converter = DocumentConverter(
                format_options={InputFormat.PDF: pdf_format}
            )

            converter.initialize_pipeline(InputFormat.PDF)
            cached_after_init = len(converter._get_initialized_pipelines())
            assert cached_after_init == 1

            # Requesting the pipeline again must hit the existing cache entry.
            converter._get_pipeline(InputFormat.PDF)
            cached_after_lookup = len(converter._get_initialized_pipelines())
            assert cached_after_lookup == 1, (
                "Pipeline should be reused from cache, not re-initialized"
            )