fix: fail on empty markdown export (#3429)

* fix: fail on empty markdown export

Signed-off-by: Pragnya <prag1704@gmail.com>

* chore: format files

Signed-off-by: Pragnya <prag1704@gmail.com>

* test: cover markdown export stat failures

Signed-off-by: Pragnya <prag1704@gmail.com>

* refactor: move cli export helpers

Signed-off-by: Pragnya <prag1704@gmail.com>

* test: cover export utils split list

Signed-off-by: Pragnya <prag1704@gmail.com>

---------

Signed-off-by: Pragnya <prag1704@gmail.com>
This commit is contained in:
Pragnya Khandelwal
2026-05-17 12:05:46 +05:30
committed by GitHub
parent 038b9916bc
commit ab6aa050be
3 changed files with 182 additions and 29 deletions
+37
View File
@@ -0,0 +1,37 @@
import re
from pathlib import Path
from docling_core.types.doc import ImageRefMode
from docling.datamodel.base_models import OutputFormat
# Output formats that _should_generate_export_images() ignores when deciding
# whether export images are needed: a request containing only these formats
# never triggers image generation.
_OUTPUT_FORMATS_NOT_SUPPORTING_IMAGE_EMBEDDING = frozenset(
{
OutputFormat.TEXT,
OutputFormat.DOCTAGS,
OutputFormat.VTT,
}
)
def _should_generate_export_images(
    image_export_mode: ImageRefMode,
    to_formats: list[OutputFormat],
) -> bool:
    """Tell whether images have to be generated for the requested export.

    Args:
        image_export_mode: How images are referenced in the exported files.
        to_formats: The output formats requested for this run.

    Returns:
        ``True`` when the mode is anything other than
        ``ImageRefMode.PLACEHOLDER`` and at least one requested format is not
        listed in ``_OUTPUT_FORMATS_NOT_SUPPORTING_IMAGE_EMBEDDING``;
        ``False`` otherwise (including when ``to_formats`` is empty).
    """
    # Placeholder mode never needs image data, whatever the formats are.
    if image_export_mode == ImageRefMode.PLACEHOLDER:
        return False

    for requested_format in to_formats:
        if requested_format not in _OUTPUT_FORMATS_NOT_SUPPORTING_IMAGE_EMBEDDING:
            return True
    return False
def _split_list(raw: str | None) -> list[str] | None:
if raw is None:
return None
return re.split(r"[;,]", raw)
def _is_empty_output(path: Path) -> bool:
    """Report whether an exported file is missing, unreadable, or empty.

    Args:
        path: Location of the file an exporter was expected to write.

    Returns:
        ``True`` if the file has a size of zero bytes or if its metadata
        cannot be read at all (it does not exist, ``stat`` fails, or the path
        cannot be encoded); ``False`` if it exists with a non-zero size.
    """
    try:
        # A single stat() covers both questions: a missing file raises
        # FileNotFoundError (an OSError), so a separate exists() probe would
        # only add a second system call and a check-then-use window.
        return path.stat().st_size == 0
    except (OSError, ValueError):
        # ValueError is raised for paths that cannot be passed to the OS
        # (e.g. an embedded null byte); treat those like any other
        # unreadable output.
        return True
+27 -28
View File
@@ -44,6 +44,11 @@ from docling.backend.image_backend import ImageDocumentBackend
from docling.backend.mets_gbs_backend import MetsGbsDocumentBackend
from docling.backend.pdf_backend import PdfDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.cli.export_utils import (
_is_empty_output,
_should_generate_export_images,
_split_list,
)
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
from docling.datamodel.asr_model_specs import (
WHISPER_BASE,
@@ -69,6 +74,8 @@ from docling.datamodel.asr_model_specs import (
from docling.datamodel.backend_options import LatexBackendOptions, PdfBackendOptions
from docling.datamodel.base_models import (
ConversionStatus,
DoclingComponentType,
ErrorItem,
FormatToExtensions,
InputFormat,
OutputFormat,
@@ -239,8 +246,8 @@ def export_documents(
failure_count = 0
for conv_res in conv_results:
if conv_res.status == ConversionStatus.SUCCESS:
success_count += 1
doc_failed = conv_res.status != ConversionStatus.SUCCESS
if not doc_failed:
doc_filename = conv_res.input.file.stem
# Export JSON format:
@@ -310,6 +317,21 @@ def export_documents(
conv_res.document.save_as_markdown(
filename=fname, image_mode=image_export_mode
)
if _is_empty_output(fname):
error_message = (
"Markdown export produced empty output for "
f"{conv_res.input.file.name}"
)
_log.error(error_message)
conv_res.errors.append(
ErrorItem(
component_type=DoclingComponentType.DOC_ASSEMBLER,
module_name="export_documents",
error_message=error_message,
)
)
conv_res.status = ConversionStatus.FAILURE
doc_failed = True
# Export Document Tags format:
if export_doctags:
@@ -367,7 +389,7 @@ def export_documents(
r = TimingsT.dump_json(conv_res.timings, indent=2)
fp.write(r)
else:
if doc_failed:
_log.warning(f"Document {conv_res.input.file} failed to convert.")
if _log.isEnabledFor(logging.INFO):
for err in conv_res.errors:
@@ -376,37 +398,14 @@ def export_documents(
f"Module: {err.module_name}, Message: {err.error_message}"
)
failure_count += 1
else:
success_count += 1
_log.info(
f"Processed {success_count + failure_count} docs, of which {failure_count} failed"
)
def _split_list(raw: str | None) -> list[str] | None:
if raw is None:
return None
return re.split(r"[;,]", raw)
_OUTPUT_FORMATS_NOT_SUPPORTING_IMAGE_EMBEDDING = frozenset(
{
OutputFormat.TEXT,
OutputFormat.DOCTAGS,
OutputFormat.VTT,
}
)
def _should_generate_export_images(
image_export_mode: ImageRefMode,
to_formats: list[OutputFormat],
) -> bool:
return image_export_mode != ImageRefMode.PLACEHOLDER and any(
to_format not in _OUTPUT_FORMATS_NOT_SUPPORTING_IMAGE_EMBEDDING
for to_format in to_formats
)
@app.command(no_args_is_help=True)
def convert( # noqa: C901
input_sources: Annotated[
+118 -1
View File
@@ -4,7 +4,8 @@ import pytest
from docling_core.types.doc import ImageRefMode
from typer.testing import CliRunner
from docling.cli.main import _should_generate_export_images, app
from docling.cli.export_utils import _should_generate_export_images, _split_list
from docling.cli.main import app
from docling.datamodel.base_models import OutputFormat
runner = CliRunner()
@@ -30,6 +31,117 @@ def test_cli_convert(tmp_path):
assert converted.exists()
def test_export_documents_marks_empty_markdown_as_failure(tmp_path):
    """A zero-byte Markdown export turns a successful result into a failure."""
    from docling.cli.main import export_documents
    from docling.datamodel.base_models import ConversionStatus, InputFormat
    from docling.datamodel.document import (
        ConversionResult,
        InputDocument,
        _DummyBackend,
    )

    class DummyDocument:
        """Stand-in document whose Markdown export writes an empty file."""

        def save_as_markdown(self, *, filename, image_mode):
            Path(filename).write_text("")

    # Minimal input file so that an InputDocument can be built for it.
    source_pdf = tmp_path / "input.pdf"
    source_pdf.write_bytes(b"%PDF-1.4")

    result = ConversionResult(
        input=InputDocument(
            path_or_stream=source_pdf,
            format=InputFormat.PDF,
            backend=_DummyBackend,
        )
    )
    result.status = ConversionStatus.SUCCESS
    result.document = DummyDocument()

    export_dir = tmp_path / "out"
    export_dir.mkdir()

    # Only the Markdown export is enabled; every other output is switched off.
    export_documents(
        [result],
        output_dir=export_dir,
        export_json=False,
        export_yaml=False,
        export_html=False,
        export_html_split_page=False,
        show_layout=False,
        export_md=True,
        export_txt=False,
        export_doctags=False,
        export_vtt=False,
        print_timings=False,
        export_timings=False,
        image_export_mode=ImageRefMode.PLACEHOLDER,
    )

    assert result.status == ConversionStatus.FAILURE
    assert result.errors
def test_export_documents_marks_stat_errors_as_failure(tmp_path, monkeypatch):
    """A Markdown export whose file cannot be stat'ed is reported as a failure.

    The stand-in document writes non-empty Markdown, but ``Path.stat`` is
    patched to raise ``OSError`` for that file, so the emptiness check in
    ``export_documents`` cannot read its size.
    """
    from docling.cli.main import export_documents
    from docling.datamodel.base_models import ConversionStatus, InputFormat
    from docling.datamodel.document import (
        ConversionResult,
        InputDocument,
        _DummyBackend,
    )

    input_path = tmp_path / "input.pdf"
    input_path.write_bytes(b"%PDF-1.4")

    input_doc = InputDocument(
        path_or_stream=input_path,
        format=InputFormat.PDF,
        backend=_DummyBackend,
    )
    conv_res = ConversionResult(input=input_doc)
    conv_res.status = ConversionStatus.SUCCESS

    class DummyDocument:
        """Stand-in document whose Markdown export writes real content."""

        def save_as_markdown(self, *, filename, image_mode):
            Path(filename).write_text("ok")

    conv_res.document = DummyDocument()
    output_dir = tmp_path / "out"
    output_dir.mkdir()

    original_stat = Path.stat

    def _raise_for_markdown(self, *args, **kwargs):
        # Accept and forward any extra arguments: pathlib helpers may call
        # stat(follow_symlinks=...) (Path.exists does so on some Python
        # versions), and a stub taking only ``self`` would then raise
        # TypeError instead of the OSError this test wants to simulate.
        if self.name == "input.md":
            raise OSError("stat failed")
        return original_stat(self, *args, **kwargs)

    monkeypatch.setattr(Path, "stat", _raise_for_markdown)

    export_documents(
        [conv_res],
        output_dir=output_dir,
        export_json=False,
        export_yaml=False,
        export_html=False,
        export_html_split_page=False,
        show_layout=False,
        export_md=True,
        export_txt=False,
        export_doctags=False,
        export_vtt=False,
        print_timings=False,
        export_timings=False,
        image_export_mode=ImageRefMode.PLACEHOLDER,
    )

    assert conv_res.status == ConversionStatus.FAILURE
    assert conv_res.errors
@pytest.mark.parametrize(
("image_export_mode", "to_formats", "expected"),
[
@@ -59,6 +171,11 @@ def test_image_export_policy_covers_all_output_formats():
assert image_export_formats | non_image_export_formats == set(OutputFormat)
def test_split_list_handles_none_and_delimiters():
    """``_split_list`` passes ``None`` through and splits on ``,`` and ``;``."""
    missing = _split_list(None)
    assert missing is None

    parts = _split_list("a,b;c")
    assert parts == ["a", "b", "c"]
def test_cli_audio_auto_detection(tmp_path):
"""Test that CLI automatically detects audio files and sets ASR pipeline."""
from docling.datamodel.base_models import FormatToExtensions, InputFormat