import os
from pathlib import Path
from unittest.mock import Mock, patch

import pytest

from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
from docling.datamodel.base_models import ConversionStatus, InputFormat, QualityGrade
from docling.datamodel.document import ConversionResult
from docling.datamodel.image_classification_engine_options import (
    ApiKserveV2ImageClassificationEngineOptions,
)
from docling.datamodel.pipeline_options import (
    PdfPipelineOptions,
    TableFormerMode,
)
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.legacy_standard_pdf_pipeline import LegacyStandardPdfPipeline


@pytest.fixture
def test_doc_path():
    """Return the path of the default PDF document used by these tests."""
    return Path("./tests/data/pdf/2206.01062.pdf")


def get_converters_with_table_options():
    """Yield one PDF converter per (cell matching, TableFormer mode) combination.

    OCR is disabled and table structure recognition is enabled for every
    yielded converter.
    """
    for cell_matching in [True, False]:
        for mode in [TableFormerMode.FAST, TableFormerMode.ACCURATE]:
            pipeline_options = PdfPipelineOptions()
            pipeline_options.do_ocr = False
            pipeline_options.do_table_structure = True
            pipeline_options.table_structure_options.do_cell_matching = cell_matching
            pipeline_options.table_structure_options.mode = mode

            converter = DocumentConverter(
                format_options={
                    InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
                }
            )

            yield converter


def test_accelerator_options():
    """Check AcceleratorOptions defaults, explicit arguments and envvar handling.

    All environment changes happen inside ``patch.dict(os.environ)`` so the
    process environment is restored when the test ends, whether it passes or
    fails. Without this, the variables set here would leak into later tests.
    """
    with patch.dict(os.environ):
        # Start from a known state so the default checks do not depend on
        # whatever the caller's shell or CI happens to export.
        for name in ("OMP_NUM_THREADS", "DOCLING_NUM_THREADS", "DOCLING_DEVICE"):
            os.environ.pop(name, None)

        # Check the default options
        ao = AcceleratorOptions()
        assert ao.num_threads == 4, "Wrong default num_threads"
        assert ao.device == AcceleratorDevice.AUTO, "Wrong default device"

        # Use API
        ao2 = AcceleratorOptions(num_threads=2, device=AcceleratorDevice.MPS)
        ao3 = AcceleratorOptions(num_threads=3, device=AcceleratorDevice.CUDA)
        ao4 = AcceleratorOptions(num_threads=4, device=AcceleratorDevice.XPU)
        assert ao2.num_threads == 2
        assert ao2.device == AcceleratorDevice.MPS
        assert ao3.num_threads == 3
        assert ao3.device == AcceleratorDevice.CUDA
        assert ao4.num_threads == 4
        assert ao4.device == AcceleratorDevice.XPU

        # Use envvars (regular + alternative) and default values
        os.environ["OMP_NUM_THREADS"] = "1"
        ao.__init__()  # re-run initialisation so the new environment is read
        assert ao.num_threads == 1
        assert ao.device == AcceleratorDevice.AUTO
        os.environ["DOCLING_DEVICE"] = "cpu"
        ao.__init__()
        assert ao.device == AcceleratorDevice.CPU
        assert ao.num_threads == 1

        # Use envvars and override in init
        os.environ["DOCLING_DEVICE"] = "cpu"
        ao5 = AcceleratorOptions(num_threads=5, device=AcceleratorDevice.MPS)
        assert ao5.num_threads == 5
        assert ao5.device == AcceleratorDevice.MPS

        # Use regular and alternative envvar
        os.environ["DOCLING_NUM_THREADS"] = "2"
        ao6 = AcceleratorOptions()
        assert ao6.num_threads == 2
        assert ao6.device == AcceleratorDevice.CPU

        # Use wrong values
        os.environ["DOCLING_DEVICE"] = "wrong"
        # The concrete exception type is not visible from this file, so the
        # check stays as broad as the original ``except Exception``.
        with pytest.raises(Exception):
            ao5.__init__()

        # Use misformatted alternative envvar
        del os.environ["DOCLING_NUM_THREADS"]
        del os.environ["DOCLING_DEVICE"]
        os.environ["OMP_NUM_THREADS"] = "wrong"
        ao7 = AcceleratorOptions()
        assert ao7.num_threads == 4
        assert ao7.device == AcceleratorDevice.AUTO


def test_kserve_v2_binary_data_deprecated_alias():
    """Check the deprecated ``grpc_use_binary_data`` alias of ``use_binary_data``."""
    # The alias alone populates the new option.
    options = ApiKserveV2ImageClassificationEngineOptions(
        url="localhost:8001",
        grpc_use_binary_data=False,
    )
    assert options.use_binary_data is False
    with pytest.deprecated_call(match="deprecated; use use_binary_data instead"):
        assert options.grpc_use_binary_data is False
    assert "grpc_use_binary_data" not in options.model_dump()

    # When both are given, the new option wins over the alias.
    options = ApiKserveV2ImageClassificationEngineOptions(
        url="localhost:8001",
        use_binary_data=True,
        grpc_use_binary_data=False,
    )
    assert options.use_binary_data is True


def test_e2e_conversions(test_doc_path):
    """Convert the test document with every table-option converter variant."""
    for converter in get_converters_with_table_options():
        print(f"converting {test_doc_path}")

        doc_result: ConversionResult = converter.convert(test_doc_path)

        assert doc_result.status == ConversionStatus.SUCCESS


def test_page_range(test_doc_path):
    """Check conversion of a one-page range and failure for page_range=(10, 10)."""
    converter = DocumentConverter()

    last_page_result: ConversionResult = converter.convert(
        test_doc_path, page_range=(9, 9)
    )
    assert last_page_result.status == ConversionStatus.SUCCESS
    assert last_page_result.input.page_count == 9
    assert last_page_result.document.num_pages() == 1

    # raises_on_error=False makes the failure show up in the status instead
    # of as an exception.
    beyond_end_result: ConversionResult = converter.convert(
        test_doc_path, page_range=(10, 10), raises_on_error=False
    )
    assert beyond_end_result.status == ConversionStatus.FAILURE


def test_document_timeout(test_doc_path):
    """Expect PARTIAL_SUCCESS with document_timeout=1 for both PDF pipelines."""
    # Pipeline class left at the PdfFormatOption default.
    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_options=PdfPipelineOptions(document_timeout=1)
            )
        }
    )
    result = converter.convert(test_doc_path)
    assert result.status == ConversionStatus.PARTIAL_SUCCESS, (
        "Expected document timeout to be used"
    )

    # Same check with the legacy pipeline selected explicitly.
    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_options=PdfPipelineOptions(document_timeout=1),
                pipeline_cls=LegacyStandardPdfPipeline,
            )
        }
    )
    result = converter.convert(test_doc_path)
    assert result.status == ConversionStatus.PARTIAL_SUCCESS, (
        "Expected document timeout to be used"
    )


def test_ocr_coverage_threshold(test_doc_path):
    """Expect no text items when the OCR bitmap area threshold is set to 1.1."""
    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_ocr = True
    pipeline_options.ocr_options.bitmap_area_threshold = 1.1

    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_options=pipeline_options,
            )
        }
    )

    # This test uses a scanned document instead of the fixture-provided path.
    test_doc_path = Path("./tests/data_scanned/ocr_test.pdf")
    doc_result: ConversionResult = converter.convert(test_doc_path)

    # this should have generated no results, since we set a very high threshold
    assert len(doc_result.document.texts) == 0


def test_parser_backends(test_doc_path):
    """Convert a document with each PDF backend, OCR and table structure off."""
    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_ocr = False
    pipeline_options.do_table_structure = False

    # This test uses its own document instead of the fixture-provided path;
    # the path is the same for every backend, so it is set once.
    test_doc_path = Path("./tests/data/pdf/code_and_formula.pdf")

    for backend_t in [
        DoclingParseDocumentBackend,
        PyPdfiumDocumentBackend,
    ]:
        converter = DocumentConverter(
            format_options={
                InputFormat.PDF: PdfFormatOption(
                    pipeline_options=pipeline_options,
                    backend=backend_t,
                )
            }
        )

        doc_result: ConversionResult = converter.convert(test_doc_path)

        assert doc_result.status == ConversionStatus.SUCCESS


def test_pipeline_cache_after_initialize(test_doc_path):
    """Test that initialize_pipeline caches correctly and convert reuses the cache.

    Regression test for #3109: code_formula_options were mutated in-place
    during pipeline initialization, changing the options hash and causing
    a cache miss when convert() was called afterwards.
    """
    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_ocr = False
    pipeline_options.do_table_structure = False

    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_options=pipeline_options,
            )
        }
    )

    converter.initialize_pipeline(InputFormat.PDF)
    assert len(converter._get_initialized_pipelines()) == 1

    converter.convert(test_doc_path)
    assert len(converter._get_initialized_pipelines()) == 1, (
        "Pipeline should be reused from cache, not re-initialized"
    )


def test_confidence(test_doc_path):
    """Expect EXCELLENT mean and low confidence grades for pages 6 to 9."""
    converter = DocumentConverter()
    doc_result: ConversionResult = converter.convert(test_doc_path, page_range=(6, 9))

    assert doc_result.confidence.mean_grade == QualityGrade.EXCELLENT
    assert doc_result.confidence.low_grade == QualityGrade.EXCELLENT


def test_pipeline_cache_with_chart_extraction():
    """Test that chart extraction doesn't cause pipeline cache invalidation.

    Verifies the fix for a bug where enabling chart extraction mutated
    shared pipeline_options, changing its hash and causing unnecessary
    re-initialization.
    """
    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_chart_extraction = True

    # Both model classes are replaced with mocks for the duration of the test.
    with (
        patch(
            "docling.pipeline.base_pipeline.ChartExtractionModelGraniteVisionV4"
        ) as mock_chart,
        patch(
            "docling.pipeline.base_pipeline.DocumentPictureClassifier"
        ) as mock_classifier,
    ):
        mock_chart.return_value = Mock(enabled=True)
        mock_classifier.return_value = Mock(enabled=True)

        converter = DocumentConverter(
            format_options={
                InputFormat.PDF: PdfFormatOption(
                    pipeline_options=pipeline_options,
                )
            }
        )

        converter.initialize_pipeline(InputFormat.PDF)
        assert len(converter._get_initialized_pipelines()) == 1

        converter._get_pipeline(InputFormat.PDF)
        assert len(converter._get_initialized_pipelines()) == 1, (
            "Pipeline should be reused from cache, not re-initialized"
        )