mirror of
https://github.com/docling-project/docling.git
synced 2026-05-17 13:10:38 +00:00
5473e07450
* fix(cli): avoid generating images for non-image exports
  Signed-off-by: Hassan Raza <raihassanraza10@gmail.com>
* test(cli): cover output image export policy
  Signed-off-by: Hassan Raza <raihassanraza10@gmail.com>
* fix(cli): use denylist for image export gating
  Signed-off-by: Hassan Raza <raihassanraza10@gmail.com>
* fix(cli): clarify image export mode help text
  Signed-off-by: Hassan Raza <raihassanraza10@gmail.com>
---------
Signed-off-by: Hassan Raza <raihassanraza10@gmail.com>
125 lines
4.0 KiB
Python
125 lines
4.0 KiB
Python
from pathlib import Path
|
|
|
|
import pytest
|
|
from docling_core.types.doc import ImageRefMode
|
|
from typer.testing import CliRunner
|
|
|
|
from docling.cli.main import _should_generate_export_images, app
|
|
from docling.datamodel.base_models import OutputFormat
|
|
|
|
# Shared Typer test runner; the tests in this module call runner.invoke(app, [...])
# and inspect the exit code of the returned result.
runner = CliRunner()
|
|
|
|
|
|
def test_cli_help():
    """Invoking the CLI with ``--help`` must exit with status 0."""
    help_args = ["--help"]
    outcome = runner.invoke(app, help_args)
    assert outcome.exit_code == 0
|
|
|
|
|
|
def test_cli_version():
    """Invoking the CLI with ``--version`` must exit with status 0."""
    version_args = ["--version"]
    outcome = runner.invoke(app, version_args)
    assert outcome.exit_code == 0
|
|
|
|
|
|
def test_cli_convert(tmp_path):
    """Convert a sample PDF and expect a ``.md`` file in the output directory.

    The source path is relative, so the test relies on being run from the
    repository root.
    """
    source = "./tests/data/pdf/2305.03393v1-pg9.pdf"
    out_dir = tmp_path / "out"
    out_dir.mkdir()

    cli_args = [source, "--output", str(out_dir)]
    outcome = runner.invoke(app, cli_args)
    assert outcome.exit_code == 0

    # The expected export is named after the stem of the input file.
    expected_md = out_dir / f"{Path(source).stem}.md"
    assert expected_md.exists()
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("image_export_mode", "to_formats", "expected"),
    [
        # Placeholder mode with a JSON export: no images expected.
        pytest.param(ImageRefMode.PLACEHOLDER, [OutputFormat.JSON], False),
        # Embedded mode, but only TEXT and DOCTAGS requested: no images expected.
        pytest.param(
            ImageRefMode.EMBEDDED,
            [OutputFormat.TEXT, OutputFormat.DOCTAGS],
            False,
        ),
        # Embedded mode with a Markdown export: images expected.
        pytest.param(ImageRefMode.EMBEDDED, [OutputFormat.MARKDOWN], True),
        # Embedded mode, TEXT mixed with MARKDOWN: images expected.
        pytest.param(
            ImageRefMode.EMBEDDED,
            [OutputFormat.TEXT, OutputFormat.MARKDOWN],
            True,
        ),
    ],
)
def test_should_generate_export_images(image_export_mode, to_formats, expected):
    """Check the image-generation decision for each mode/format combination.

    The helper must return exactly the boolean ``expected`` (identity check,
    not mere truthiness).
    """
    decision = _should_generate_export_images(image_export_mode, to_formats)
    assert decision is expected
|
|
|
|
|
|
def test_image_export_policy_covers_all_output_formats():
    """Check the image-export policy against every ``OutputFormat`` member.

    Each output format is classified as either a non-image export (listed
    explicitly below) or an image export (everything else), and the CLI helper
    is queried once per format so that a newly added ``OutputFormat`` member is
    exercised automatically.
    """
    non_image_export_formats = {
        OutputFormat.TEXT,
        OutputFormat.DOCTAGS,
        OutputFormat.VTT,
    }
    image_export_formats = set(OutputFormat) - non_image_export_formats

    # Partition sanity checks. These hold by construction of
    # ``image_export_formats`` and therefore cannot fail on their own.
    assert image_export_formats.isdisjoint(non_image_export_formats)
    assert image_export_formats | non_image_export_formats == set(OutputFormat)

    # Exercise the real policy so the test can actually detect a regression.
    # NOTE(review): assumes the CLI treats exactly TEXT, DOCTAGS and VTT as
    # non-image exports -- confirm against docling.cli.main.
    for output_format in OutputFormat:
        expected = output_format in image_export_formats
        decision = _should_generate_export_images(
            ImageRefMode.EMBEDDED, [output_format]
        )
        assert decision is expected, (
            f"Unexpected image export decision for output format {output_format}"
        )
|
|
|
|
|
|
def test_cli_audio_auto_detection(tmp_path):
    """Smoke-test converting an ``.mp3`` file without naming a pipeline.

    The input is a dummy file with an audio extension, intended to exercise
    the CLI's audio auto-detection path. Only the exit code is checked: the
    run may succeed (0) or fail while processing the dummy content (1). This
    test does not itself verify which pipeline was selected.
    """
    # Create a dummy audio file for testing
    audio_file = tmp_path / "test_audio.mp3"
    audio_file.write_bytes(b"dummy audio content")

    output = tmp_path / "out"
    output.mkdir()

    result = runner.invoke(app, [str(audio_file), "--output", str(output)])

    # Allow for ASR processing failure on the dummy content; any other exit
    # code (e.g. a usage error) is treated as a test failure.
    assert result.exit_code in (0, 1)
|
|
|
|
|
|
def test_cli_explicit_pipeline_not_overridden(tmp_path):
    """Smoke-test converting an ``.mp3`` file with ``--pipeline standard``.

    The explicit pipeline choice is meant to take precedence over audio
    auto-detection. Only the exit code is checked: the run may succeed (0) or
    fail while processing the dummy content (1). This test does not itself
    verify which pipeline was used.
    """
    # Create a dummy audio file for testing
    audio_file = tmp_path / "test_audio.mp3"
    audio_file.write_bytes(b"dummy audio content")

    output = tmp_path / "out"
    output.mkdir()

    # Pass the pipeline explicitly alongside the audio input.
    result = runner.invoke(
        app, [str(audio_file), "--output", str(output), "--pipeline", "standard"]
    )

    # Allow for processing failure on the dummy content; any other exit code
    # (e.g. a usage error) is treated as a test failure.
    assert result.exit_code in (0, 1)
|
|
|
|
|
|
def test_cli_audio_extensions_coverage():
    """Every expected audio/video extension must be registered for ``InputFormat.AUDIO``."""
    from docling.datamodel.base_models import FormatToExtensions, InputFormat

    # Extensions registered centrally for the audio input format.
    registered = FormatToExtensions[InputFormat.AUDIO]

    required = (
        "wav",
        "mp3",
        "m4a",
        "aac",
        "ogg",
        "flac",
        "mp4",
        "avi",
        "mov",
    )

    # Check one extension at a time so a failure names the first missing one.
    for ext in required:
        assert ext in registered, (
            f"Audio extension {ext} not found in FormatToExtensions[InputFormat.AUDIO]"
        )
|