Files
docling/tests/test_cli.py
T
Pragnya Khandelwal ab6aa050be fix: fail on empty markdown export (#3429)
* fix: fail on empty markdown export

Signed-off-by: Pragnya <prag1704@gmail.com>

* chore: format files

Signed-off-by: Pragnya <prag1704@gmail.com>

* test: cover markdown export stat failures

Signed-off-by: Pragnya <prag1704@gmail.com>

* refactor: move cli export helpers

Signed-off-by: Pragnya <prag1704@gmail.com>

* test: cover export utils split list

Signed-off-by: Pragnya <prag1704@gmail.com>

---------

Signed-off-by: Pragnya <prag1704@gmail.com>
2026-05-17 08:35:46 +02:00

242 lines
7.1 KiB
Python

from pathlib import Path
import pytest
from docling_core.types.doc import ImageRefMode
from typer.testing import CliRunner
from docling.cli.export_utils import _should_generate_export_images, _split_list
from docling.cli.main import app
from docling.datamodel.base_models import OutputFormat
runner = CliRunner()
def test_cli_help():
result = runner.invoke(app, ["--help"])
assert result.exit_code == 0
def test_cli_version():
result = runner.invoke(app, ["--version"])
assert result.exit_code == 0
def test_cli_convert(tmp_path):
source = "./tests/data/pdf/2305.03393v1-pg9.pdf"
output = tmp_path / "out"
output.mkdir()
result = runner.invoke(app, [source, "--output", str(output)])
assert result.exit_code == 0
converted = output / f"{Path(source).stem}.md"
assert converted.exists()
def test_export_documents_marks_empty_markdown_as_failure(tmp_path):
from docling.cli.main import export_documents
from docling.datamodel.base_models import ConversionStatus, InputFormat
from docling.datamodel.document import (
ConversionResult,
InputDocument,
_DummyBackend,
)
input_path = tmp_path / "input.pdf"
input_path.write_bytes(b"%PDF-1.4")
input_doc = InputDocument(
path_or_stream=input_path,
format=InputFormat.PDF,
backend=_DummyBackend,
)
conv_res = ConversionResult(input=input_doc)
conv_res.status = ConversionStatus.SUCCESS
class DummyDocument:
def save_as_markdown(self, *, filename, image_mode):
Path(filename).write_text("")
conv_res.document = DummyDocument()
output_dir = tmp_path / "out"
output_dir.mkdir()
export_documents(
[conv_res],
output_dir=output_dir,
export_json=False,
export_yaml=False,
export_html=False,
export_html_split_page=False,
show_layout=False,
export_md=True,
export_txt=False,
export_doctags=False,
export_vtt=False,
print_timings=False,
export_timings=False,
image_export_mode=ImageRefMode.PLACEHOLDER,
)
assert conv_res.status == ConversionStatus.FAILURE
assert conv_res.errors
def test_export_documents_marks_stat_errors_as_failure(tmp_path, monkeypatch):
from docling.cli.main import export_documents
from docling.datamodel.base_models import ConversionStatus, InputFormat
from docling.datamodel.document import (
ConversionResult,
InputDocument,
_DummyBackend,
)
input_path = tmp_path / "input.pdf"
input_path.write_bytes(b"%PDF-1.4")
input_doc = InputDocument(
path_or_stream=input_path,
format=InputFormat.PDF,
backend=_DummyBackend,
)
conv_res = ConversionResult(input=input_doc)
conv_res.status = ConversionStatus.SUCCESS
class DummyDocument:
def save_as_markdown(self, *, filename, image_mode):
Path(filename).write_text("ok")
conv_res.document = DummyDocument()
output_dir = tmp_path / "out"
output_dir.mkdir()
original_stat = Path.stat
def _raise_for_markdown(self):
if self.name == "input.md":
raise OSError("stat failed")
return original_stat(self)
monkeypatch.setattr(Path, "stat", _raise_for_markdown)
export_documents(
[conv_res],
output_dir=output_dir,
export_json=False,
export_yaml=False,
export_html=False,
export_html_split_page=False,
show_layout=False,
export_md=True,
export_txt=False,
export_doctags=False,
export_vtt=False,
print_timings=False,
export_timings=False,
image_export_mode=ImageRefMode.PLACEHOLDER,
)
assert conv_res.status == ConversionStatus.FAILURE
assert conv_res.errors
@pytest.mark.parametrize(
("image_export_mode", "to_formats", "expected"),
[
(ImageRefMode.PLACEHOLDER, [OutputFormat.JSON], False),
(ImageRefMode.EMBEDDED, [OutputFormat.TEXT, OutputFormat.DOCTAGS], False),
(ImageRefMode.EMBEDDED, [OutputFormat.MARKDOWN], True),
(
ImageRefMode.EMBEDDED,
[OutputFormat.TEXT, OutputFormat.MARKDOWN],
True,
),
],
)
def test_should_generate_export_images(image_export_mode, to_formats, expected):
assert _should_generate_export_images(image_export_mode, to_formats) is expected
def test_image_export_policy_covers_all_output_formats():
non_image_export_formats = {
OutputFormat.TEXT,
OutputFormat.DOCTAGS,
OutputFormat.VTT,
}
image_export_formats = set(OutputFormat) - non_image_export_formats
assert image_export_formats.isdisjoint(non_image_export_formats)
assert image_export_formats | non_image_export_formats == set(OutputFormat)
def test_split_list_handles_none_and_delimiters():
assert _split_list(None) is None
assert _split_list("a,b;c") == ["a", "b", "c"]
def test_cli_audio_auto_detection(tmp_path):
"""Test that CLI automatically detects audio files and sets ASR pipeline."""
from docling.datamodel.base_models import FormatToExtensions, InputFormat
# Create a dummy audio file for testing
audio_file = tmp_path / "test_audio.mp3"
audio_file.write_bytes(b"dummy audio content")
output = tmp_path / "out"
output.mkdir()
# Test that audio file triggers ASR pipeline auto-detection
result = runner.invoke(app, [str(audio_file), "--output", str(output)])
# The command should succeed (even if ASR fails due to dummy content)
# The key is that it should attempt ASR processing, not standard processing
assert (
result.exit_code == 0 or result.exit_code == 1
) # Allow for ASR processing failure
def test_cli_explicit_pipeline_not_overridden(tmp_path):
"""Test that explicit pipeline choice is not overridden by audio auto-detection."""
from docling.datamodel.base_models import FormatToExtensions, InputFormat
# Create a dummy audio file for testing
audio_file = tmp_path / "test_audio.mp3"
audio_file.write_bytes(b"dummy audio content")
output = tmp_path / "out"
output.mkdir()
# Test that explicit --pipeline STANDARD is not overridden
result = runner.invoke(
app, [str(audio_file), "--output", str(output), "--pipeline", "standard"]
)
# Should still use standard pipeline despite audio file
assert (
result.exit_code == 0 or result.exit_code == 1
) # Allow for processing failure
def test_cli_audio_extensions_coverage():
"""Test that all audio extensions from FormatToExtensions are covered."""
from docling.datamodel.base_models import FormatToExtensions, InputFormat
# Verify that the centralized audio extensions include all expected formats
audio_extensions = FormatToExtensions[InputFormat.AUDIO]
expected_extensions = [
"wav",
"mp3",
"m4a",
"aac",
"ogg",
"flac",
"mp4",
"avi",
"mov",
]
for ext in expected_extensions:
assert ext in audio_extensions, (
f"Audio extension {ext} not found in FormatToExtensions[InputFormat.AUDIO]"
)