From ab6aa050be2c106ef4e3664ba0d4ab3fbc45097b Mon Sep 17 00:00:00 2001 From: Pragnya Khandelwal Date: Sun, 17 May 2026 12:05:46 +0530 Subject: [PATCH] fix: fail on empty markdown export (#3429) * fix: fail on empty markdown export Signed-off-by: Pragnya * chore: format files Signed-off-by: Pragnya * test: cover markdown export stat failures Signed-off-by: Pragnya * refactor: move cli export helpers Signed-off-by: Pragnya * test: cover export utils split list Signed-off-by: Pragnya --------- Signed-off-by: Pragnya --- docling/cli/export_utils.py | 37 +++++++++++ docling/cli/main.py | 55 ++++++++--------- tests/test_cli.py | 119 +++++++++++++++++++++++++++++++++++- 3 files changed, 182 insertions(+), 29 deletions(-) create mode 100644 docling/cli/export_utils.py diff --git a/docling/cli/export_utils.py b/docling/cli/export_utils.py new file mode 100644 index 00000000..b353256c --- /dev/null +++ b/docling/cli/export_utils.py @@ -0,0 +1,37 @@ +import re +from pathlib import Path + +from docling_core.types.doc import ImageRefMode + +from docling.datamodel.base_models import OutputFormat + +_OUTPUT_FORMATS_NOT_SUPPORTING_IMAGE_EMBEDDING = frozenset( + { + OutputFormat.TEXT, + OutputFormat.DOCTAGS, + OutputFormat.VTT, + } +) + + +def _should_generate_export_images( + image_export_mode: ImageRefMode, + to_formats: list[OutputFormat], +) -> bool: + return image_export_mode != ImageRefMode.PLACEHOLDER and any( + to_format not in _OUTPUT_FORMATS_NOT_SUPPORTING_IMAGE_EMBEDDING + for to_format in to_formats + ) + + +def _split_list(raw: str | None) -> list[str] | None: + if raw is None: + return None + return re.split(r"[;,]", raw) + + +def _is_empty_output(path: Path) -> bool: + try: + return not path.exists() or path.stat().st_size == 0 + except OSError: + return True diff --git a/docling/cli/main.py b/docling/cli/main.py index 14a026a9..aecff4e8 100644 --- a/docling/cli/main.py +++ b/docling/cli/main.py @@ -44,6 +44,11 @@ from docling.backend.image_backend import ImageDocumentBackend from docling.backend.mets_gbs_backend import MetsGbsDocumentBackend from docling.backend.pdf_backend import PdfDocumentBackend from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend +from docling.cli.export_utils import ( + _is_empty_output, + _should_generate_export_images, + _split_list, +) from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions from docling.datamodel.asr_model_specs import ( WHISPER_BASE, @@ -69,6 +74,8 @@ from docling.datamodel.asr_model_specs import ( from docling.datamodel.backend_options import LatexBackendOptions, PdfBackendOptions from docling.datamodel.base_models import ( ConversionStatus, + DoclingComponentType, + ErrorItem, FormatToExtensions, InputFormat, OutputFormat, @@ -239,8 +246,8 @@ def export_documents( failure_count = 0 for conv_res in conv_results: - if conv_res.status == ConversionStatus.SUCCESS: - success_count += 1 + doc_failed = conv_res.status != ConversionStatus.SUCCESS + if not doc_failed: doc_filename = conv_res.input.file.stem # Export JSON format: @@ -310,6 +317,21 @@ def export_documents( conv_res.document.save_as_markdown( filename=fname, image_mode=image_export_mode ) + if _is_empty_output(fname): + error_message = ( + "Markdown export produced empty output for " + f"{conv_res.input.file.name}" + ) + _log.error(error_message) + conv_res.errors.append( + ErrorItem( + component_type=DoclingComponentType.DOC_ASSEMBLER, + module_name="export_documents", + error_message=error_message, + ) + ) + conv_res.status = ConversionStatus.FAILURE + doc_failed = True # Export Document Tags format: if export_doctags: @@ -367,7 +389,7 @@ def export_documents( r = TimingsT.dump_json(conv_res.timings, indent=2) fp.write(r) - else: + if doc_failed: _log.warning(f"Document {conv_res.input.file} failed to convert.") if _log.isEnabledFor(logging.INFO): for err in conv_res.errors: @@ -376,37 +398,14 @@ def export_documents( f"Module: {err.module_name}, Message: {err.error_message}" ) failure_count += 1 + else: + success_count += 1 _log.info( f"Processed {success_count + failure_count} docs, of which {failure_count} failed" ) -def _split_list(raw: str | None) -> list[str] | None: - if raw is None: - return None - return re.split(r"[;,]", raw) - - -_OUTPUT_FORMATS_NOT_SUPPORTING_IMAGE_EMBEDDING = frozenset( - { - OutputFormat.TEXT, - OutputFormat.DOCTAGS, - OutputFormat.VTT, - } -) - - -def _should_generate_export_images( - image_export_mode: ImageRefMode, - to_formats: list[OutputFormat], -) -> bool: - return image_export_mode != ImageRefMode.PLACEHOLDER and any( - to_format not in _OUTPUT_FORMATS_NOT_SUPPORTING_IMAGE_EMBEDDING - for to_format in to_formats - ) - - @app.command(no_args_is_help=True) def convert( # noqa: C901 input_sources: Annotated[ diff --git a/tests/test_cli.py b/tests/test_cli.py index 64506fd2..08b1e0ae 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -4,7 +4,8 @@ import pytest from docling_core.types.doc import ImageRefMode from typer.testing import CliRunner -from docling.cli.main import _should_generate_export_images, app +from docling.cli.export_utils import _should_generate_export_images, _split_list +from docling.cli.main import app from docling.datamodel.base_models import OutputFormat runner = CliRunner() @@ -30,6 +31,117 @@ def test_cli_convert(tmp_path): assert converted.exists() +def test_export_documents_marks_empty_markdown_as_failure(tmp_path): + from docling.cli.main import export_documents + from docling.datamodel.base_models import ConversionStatus, InputFormat + from docling.datamodel.document import ( + ConversionResult, + InputDocument, + _DummyBackend, + ) + + input_path = tmp_path / "input.pdf" + input_path.write_bytes(b"%PDF-1.4") + + input_doc = InputDocument( + path_or_stream=input_path, + format=InputFormat.PDF, + backend=_DummyBackend, + ) + + conv_res = ConversionResult(input=input_doc) + conv_res.status = ConversionStatus.SUCCESS + + class DummyDocument: + def save_as_markdown(self, *, filename, image_mode): + Path(filename).write_text("") + + conv_res.document = DummyDocument() + + output_dir = tmp_path / "out" + output_dir.mkdir() + + export_documents( + [conv_res], + output_dir=output_dir, + export_json=False, + export_yaml=False, + export_html=False, + export_html_split_page=False, + show_layout=False, + export_md=True, + export_txt=False, + export_doctags=False, + export_vtt=False, + print_timings=False, + export_timings=False, + image_export_mode=ImageRefMode.PLACEHOLDER, + ) + + assert conv_res.status == ConversionStatus.FAILURE + assert conv_res.errors + + +def test_export_documents_marks_stat_errors_as_failure(tmp_path, monkeypatch): + from docling.cli.main import export_documents + from docling.datamodel.base_models import ConversionStatus, InputFormat + from docling.datamodel.document import ( + ConversionResult, + InputDocument, + _DummyBackend, + ) + + input_path = tmp_path / "input.pdf" + input_path.write_bytes(b"%PDF-1.4") + + input_doc = InputDocument( + path_or_stream=input_path, + format=InputFormat.PDF, + backend=_DummyBackend, + ) + + conv_res = ConversionResult(input=input_doc) + conv_res.status = ConversionStatus.SUCCESS + + class DummyDocument: + def save_as_markdown(self, *, filename, image_mode): + Path(filename).write_text("ok") + + conv_res.document = DummyDocument() + + output_dir = tmp_path / "out" + output_dir.mkdir() + + original_stat = Path.stat + + def _raise_for_markdown(self): + if self.name == "input.md": + raise OSError("stat failed") + return original_stat(self) + + monkeypatch.setattr(Path, "stat", _raise_for_markdown) + + export_documents( + [conv_res], + output_dir=output_dir, + export_json=False, + export_yaml=False, + export_html=False, + export_html_split_page=False, + show_layout=False, + export_md=True, + export_txt=False, + export_doctags=False, + export_vtt=False, + print_timings=False, + export_timings=False, + image_export_mode=ImageRefMode.PLACEHOLDER, + ) + + assert conv_res.status == ConversionStatus.FAILURE + assert conv_res.errors + + @pytest.mark.parametrize( ("image_export_mode", "to_formats", "expected"), [ @@ -59,6 +171,11 @@ def test_image_export_policy_covers_all_output_formats(): assert image_export_formats | non_image_export_formats == set(OutputFormat) +def test_split_list_handles_none_and_delimiters(): + assert _split_list(None) is None + assert _split_list("a,b;c") == ["a", "b", "c"] + + def test_cli_audio_auto_detection(tmp_path): """Test that CLI automatically detects audio files and sets ASR pipeline.""" from docling.datamodel.base_models import FormatToExtensions, InputFormat