mirror of
https://github.com/docling-project/docling.git
synced 2026-05-17 13:10:38 +00:00
46763a195c
fix(cli): clarify help text Fix confusing CLI help output by correcting wording and aligning the source argument name. Signed-off-by: Jefsky <hwj3344@hotmail.com> Co-authored-by: Cursor <cursoragent@cursor.com>
247 lines
7.3 KiB
Python
247 lines
7.3 KiB
Python
from pathlib import Path
|
|
|
|
import pytest
|
|
from docling_core.types.doc import ImageRefMode
|
|
from typer.testing import CliRunner
|
|
|
|
from docling.cli.export_utils import _should_generate_export_images, _split_list
|
|
from docling.cli.main import app
|
|
from docling.datamodel.base_models import OutputFormat
|
|
|
|
runner = CliRunner()
|
|
|
|
|
|
def test_cli_help():
|
|
result = runner.invoke(app, ["--help"])
|
|
assert result.exit_code == 0
|
|
assert "Input formats to" in result.output
|
|
assert "all supported" in result.output
|
|
assert "layout clusters" in result.output
|
|
assert "layour" not in result.output
|
|
assert "input_sources" not in result.output
|
|
|
|
|
|
def test_cli_version():
|
|
result = runner.invoke(app, ["--version"])
|
|
assert result.exit_code == 0
|
|
|
|
|
|
def test_cli_convert(tmp_path):
|
|
source = "./tests/data/pdf/2305.03393v1-pg9.pdf"
|
|
output = tmp_path / "out"
|
|
output.mkdir()
|
|
result = runner.invoke(app, [source, "--output", str(output)])
|
|
assert result.exit_code == 0
|
|
converted = output / f"{Path(source).stem}.md"
|
|
assert converted.exists()
|
|
|
|
|
|
def test_export_documents_marks_empty_markdown_as_failure(tmp_path):
|
|
from docling.cli.main import export_documents
|
|
from docling.datamodel.base_models import ConversionStatus, InputFormat
|
|
from docling.datamodel.document import (
|
|
ConversionResult,
|
|
InputDocument,
|
|
_DummyBackend,
|
|
)
|
|
|
|
input_path = tmp_path / "input.pdf"
|
|
input_path.write_bytes(b"%PDF-1.4")
|
|
|
|
input_doc = InputDocument(
|
|
path_or_stream=input_path,
|
|
format=InputFormat.PDF,
|
|
backend=_DummyBackend,
|
|
)
|
|
|
|
conv_res = ConversionResult(input=input_doc)
|
|
conv_res.status = ConversionStatus.SUCCESS
|
|
|
|
class DummyDocument:
|
|
def save_as_markdown(self, *, filename, image_mode):
|
|
Path(filename).write_text("")
|
|
|
|
conv_res.document = DummyDocument()
|
|
|
|
output_dir = tmp_path / "out"
|
|
output_dir.mkdir()
|
|
|
|
export_documents(
|
|
[conv_res],
|
|
output_dir=output_dir,
|
|
export_json=False,
|
|
export_yaml=False,
|
|
export_html=False,
|
|
export_html_split_page=False,
|
|
show_layout=False,
|
|
export_md=True,
|
|
export_txt=False,
|
|
export_doctags=False,
|
|
export_vtt=False,
|
|
print_timings=False,
|
|
export_timings=False,
|
|
image_export_mode=ImageRefMode.PLACEHOLDER,
|
|
)
|
|
|
|
assert conv_res.status == ConversionStatus.FAILURE
|
|
assert conv_res.errors
|
|
|
|
|
|
def test_export_documents_marks_stat_errors_as_failure(tmp_path, monkeypatch):
|
|
from docling.cli.main import export_documents
|
|
from docling.datamodel.base_models import ConversionStatus, InputFormat
|
|
from docling.datamodel.document import (
|
|
ConversionResult,
|
|
InputDocument,
|
|
_DummyBackend,
|
|
)
|
|
|
|
input_path = tmp_path / "input.pdf"
|
|
input_path.write_bytes(b"%PDF-1.4")
|
|
|
|
input_doc = InputDocument(
|
|
path_or_stream=input_path,
|
|
format=InputFormat.PDF,
|
|
backend=_DummyBackend,
|
|
)
|
|
|
|
conv_res = ConversionResult(input=input_doc)
|
|
conv_res.status = ConversionStatus.SUCCESS
|
|
|
|
class DummyDocument:
|
|
def save_as_markdown(self, *, filename, image_mode):
|
|
Path(filename).write_text("ok")
|
|
|
|
conv_res.document = DummyDocument()
|
|
|
|
output_dir = tmp_path / "out"
|
|
output_dir.mkdir()
|
|
|
|
original_stat = Path.stat
|
|
|
|
def _raise_for_markdown(self):
|
|
if self.name == "input.md":
|
|
raise OSError("stat failed")
|
|
return original_stat(self)
|
|
|
|
monkeypatch.setattr(Path, "stat", _raise_for_markdown)
|
|
|
|
export_documents(
|
|
[conv_res],
|
|
output_dir=output_dir,
|
|
export_json=False,
|
|
export_yaml=False,
|
|
export_html=False,
|
|
export_html_split_page=False,
|
|
show_layout=False,
|
|
export_md=True,
|
|
export_txt=False,
|
|
export_doctags=False,
|
|
export_vtt=False,
|
|
print_timings=False,
|
|
export_timings=False,
|
|
image_export_mode=ImageRefMode.PLACEHOLDER,
|
|
)
|
|
|
|
assert conv_res.status == ConversionStatus.FAILURE
|
|
assert conv_res.errors
|
|
|
|
|
|
@pytest.mark.parametrize(
|
|
("image_export_mode", "to_formats", "expected"),
|
|
[
|
|
(ImageRefMode.PLACEHOLDER, [OutputFormat.JSON], False),
|
|
(ImageRefMode.EMBEDDED, [OutputFormat.TEXT, OutputFormat.DOCTAGS], False),
|
|
(ImageRefMode.EMBEDDED, [OutputFormat.MARKDOWN], True),
|
|
(
|
|
ImageRefMode.EMBEDDED,
|
|
[OutputFormat.TEXT, OutputFormat.MARKDOWN],
|
|
True,
|
|
),
|
|
],
|
|
)
|
|
def test_should_generate_export_images(image_export_mode, to_formats, expected):
|
|
assert _should_generate_export_images(image_export_mode, to_formats) is expected
|
|
|
|
|
|
def test_image_export_policy_covers_all_output_formats():
|
|
non_image_export_formats = {
|
|
OutputFormat.TEXT,
|
|
OutputFormat.DOCTAGS,
|
|
OutputFormat.VTT,
|
|
}
|
|
image_export_formats = set(OutputFormat) - non_image_export_formats
|
|
|
|
assert image_export_formats.isdisjoint(non_image_export_formats)
|
|
assert image_export_formats | non_image_export_formats == set(OutputFormat)
|
|
|
|
|
|
def test_split_list_handles_none_and_delimiters():
|
|
assert _split_list(None) is None
|
|
assert _split_list("a,b;c") == ["a", "b", "c"]
|
|
|
|
|
|
def test_cli_audio_auto_detection(tmp_path):
|
|
"""Test that CLI automatically detects audio files and sets ASR pipeline."""
|
|
from docling.datamodel.base_models import FormatToExtensions, InputFormat
|
|
|
|
# Create a dummy audio file for testing
|
|
audio_file = tmp_path / "test_audio.mp3"
|
|
audio_file.write_bytes(b"dummy audio content")
|
|
|
|
output = tmp_path / "out"
|
|
output.mkdir()
|
|
|
|
# Test that audio file triggers ASR pipeline auto-detection
|
|
result = runner.invoke(app, [str(audio_file), "--output", str(output)])
|
|
# The command should succeed (even if ASR fails due to dummy content)
|
|
# The key is that it should attempt ASR processing, not standard processing
|
|
assert (
|
|
result.exit_code == 0 or result.exit_code == 1
|
|
) # Allow for ASR processing failure
|
|
|
|
|
|
def test_cli_explicit_pipeline_not_overridden(tmp_path):
|
|
"""Test that explicit pipeline choice is not overridden by audio auto-detection."""
|
|
from docling.datamodel.base_models import FormatToExtensions, InputFormat
|
|
|
|
# Create a dummy audio file for testing
|
|
audio_file = tmp_path / "test_audio.mp3"
|
|
audio_file.write_bytes(b"dummy audio content")
|
|
|
|
output = tmp_path / "out"
|
|
output.mkdir()
|
|
|
|
# Test that explicit --pipeline STANDARD is not overridden
|
|
result = runner.invoke(
|
|
app, [str(audio_file), "--output", str(output), "--pipeline", "standard"]
|
|
)
|
|
# Should still use standard pipeline despite audio file
|
|
assert (
|
|
result.exit_code == 0 or result.exit_code == 1
|
|
) # Allow for processing failure
|
|
|
|
|
|
def test_cli_audio_extensions_coverage():
|
|
"""Test that all audio extensions from FormatToExtensions are covered."""
|
|
from docling.datamodel.base_models import FormatToExtensions, InputFormat
|
|
|
|
# Verify that the centralized audio extensions include all expected formats
|
|
audio_extensions = FormatToExtensions[InputFormat.AUDIO]
|
|
expected_extensions = [
|
|
"wav",
|
|
"mp3",
|
|
"m4a",
|
|
"aac",
|
|
"ogg",
|
|
"flac",
|
|
"mp4",
|
|
"avi",
|
|
"mov",
|
|
]
|
|
|
|
for ext in expected_extensions:
|
|
assert ext in audio_extensions, (
|
|
f"Audio extension {ext} not found in FormatToExtensions[InputFormat.AUDIO]"
|
|
)
|