mirror of
https://github.com/docling-project/docling.git
synced 2026-05-17 13:10:38 +00:00
fix: fail on empty markdown export (#3429)
* fix: fail on empty markdown export Signed-off-by: Pragnya <prag1704@gmail.com> * chore: format files Signed-off-by: Pragnya <prag1704@gmail.com> * test: cover markdown export stat failures Signed-off-by: Pragnya <prag1704@gmail.com> * refactor: move cli export helpers Signed-off-by: Pragnya <prag1704@gmail.com> * test: cover export utils split list Signed-off-by: Pragnya <prag1704@gmail.com> --------- Signed-off-by: Pragnya <prag1704@gmail.com>
This commit is contained in:
committed by
GitHub
parent
038b9916bc
commit
ab6aa050be
@@ -0,0 +1,37 @@
|
||||
import re
|
||||
from pathlib import Path
|
||||
|
||||
from docling_core.types.doc import ImageRefMode
|
||||
|
||||
from docling.datamodel.base_models import OutputFormat
|
||||
|
||||
# Output formats treated as unable to embed images. When only these formats
# are requested, _should_generate_export_images() reports that no export
# images are needed.
_OUTPUT_FORMATS_NOT_SUPPORTING_IMAGE_EMBEDDING = frozenset(
    (
        OutputFormat.TEXT,
        OutputFormat.DOCTAGS,
        OutputFormat.VTT,
    )
)
|
||||
|
||||
|
||||
def _should_generate_export_images(
    image_export_mode: ImageRefMode,
    to_formats: list[OutputFormat],
) -> bool:
    """Decide whether images have to be generated for the requested exports.

    Args:
        image_export_mode: Image reference mode selected for the export.
        to_formats: Output formats that were requested.

    Returns:
        True when the mode is anything other than ``ImageRefMode.PLACEHOLDER``
        and at least one requested format is not listed in
        ``_OUTPUT_FORMATS_NOT_SUPPORTING_IMAGE_EMBEDDING``; False otherwise
        (including when ``to_formats`` is empty).
    """
    # Placeholder mode never needs image data, whatever the formats are.
    if image_export_mode == ImageRefMode.PLACEHOLDER:
        return False

    # One format outside the "no embedding" set is enough.
    for requested_format in to_formats:
        if requested_format not in _OUTPUT_FORMATS_NOT_SUPPORTING_IMAGE_EMBEDDING:
            return True
    return False
|
||||
|
||||
|
||||
def _split_list(raw: str | None) -> list[str] | None:
|
||||
if raw is None:
|
||||
return None
|
||||
return re.split(r"[;,]", raw)
|
||||
|
||||
|
||||
def _is_empty_output(path: Path) -> bool:
|
||||
try:
|
||||
return not path.exists() or path.stat().st_size == 0
|
||||
except OSError:
|
||||
return True
|
||||
+27
-28
@@ -44,6 +44,11 @@ from docling.backend.image_backend import ImageDocumentBackend
|
||||
from docling.backend.mets_gbs_backend import MetsGbsDocumentBackend
|
||||
from docling.backend.pdf_backend import PdfDocumentBackend
|
||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||
from docling.cli.export_utils import (
|
||||
_is_empty_output,
|
||||
_should_generate_export_images,
|
||||
_split_list,
|
||||
)
|
||||
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
|
||||
from docling.datamodel.asr_model_specs import (
|
||||
WHISPER_BASE,
|
||||
@@ -69,6 +74,8 @@ from docling.datamodel.asr_model_specs import (
|
||||
from docling.datamodel.backend_options import LatexBackendOptions, PdfBackendOptions
|
||||
from docling.datamodel.base_models import (
|
||||
ConversionStatus,
|
||||
DoclingComponentType,
|
||||
ErrorItem,
|
||||
FormatToExtensions,
|
||||
InputFormat,
|
||||
OutputFormat,
|
||||
@@ -239,8 +246,8 @@ def export_documents(
|
||||
failure_count = 0
|
||||
|
||||
for conv_res in conv_results:
|
||||
if conv_res.status == ConversionStatus.SUCCESS:
|
||||
success_count += 1
|
||||
doc_failed = conv_res.status != ConversionStatus.SUCCESS
|
||||
if not doc_failed:
|
||||
doc_filename = conv_res.input.file.stem
|
||||
|
||||
# Export JSON format:
|
||||
@@ -310,6 +317,21 @@ def export_documents(
|
||||
conv_res.document.save_as_markdown(
|
||||
filename=fname, image_mode=image_export_mode
|
||||
)
|
||||
if _is_empty_output(fname):
|
||||
error_message = (
|
||||
"Markdown export produced empty output for "
|
||||
f"{conv_res.input.file.name}"
|
||||
)
|
||||
_log.error(error_message)
|
||||
conv_res.errors.append(
|
||||
ErrorItem(
|
||||
component_type=DoclingComponentType.DOC_ASSEMBLER,
|
||||
module_name="export_documents",
|
||||
error_message=error_message,
|
||||
)
|
||||
)
|
||||
conv_res.status = ConversionStatus.FAILURE
|
||||
doc_failed = True
|
||||
|
||||
# Export Document Tags format:
|
||||
if export_doctags:
|
||||
@@ -367,7 +389,7 @@ def export_documents(
|
||||
r = TimingsT.dump_json(conv_res.timings, indent=2)
|
||||
fp.write(r)
|
||||
|
||||
else:
|
||||
if doc_failed:
|
||||
_log.warning(f"Document {conv_res.input.file} failed to convert.")
|
||||
if _log.isEnabledFor(logging.INFO):
|
||||
for err in conv_res.errors:
|
||||
@@ -376,37 +398,14 @@ def export_documents(
|
||||
f"Module: {err.module_name}, Message: {err.error_message}"
|
||||
)
|
||||
failure_count += 1
|
||||
else:
|
||||
success_count += 1
|
||||
|
||||
_log.info(
|
||||
f"Processed {success_count + failure_count} docs, of which {failure_count} failed"
|
||||
)
|
||||
|
||||
|
||||
def _split_list(raw: str | None) -> list[str] | None:
|
||||
if raw is None:
|
||||
return None
|
||||
return re.split(r"[;,]", raw)
|
||||
|
||||
|
||||
# Output formats treated as unable to embed images. When only these formats
# are requested, _should_generate_export_images() reports that no export
# images are needed.
_OUTPUT_FORMATS_NOT_SUPPORTING_IMAGE_EMBEDDING = frozenset(
    (
        OutputFormat.TEXT,
        OutputFormat.DOCTAGS,
        OutputFormat.VTT,
    )
)
|
||||
|
||||
|
||||
def _should_generate_export_images(
    image_export_mode: ImageRefMode,
    to_formats: list[OutputFormat],
) -> bool:
    """Decide whether images have to be generated for the requested exports.

    Args:
        image_export_mode: Image reference mode selected for the export.
        to_formats: Output formats that were requested.

    Returns:
        True when the mode is anything other than ``ImageRefMode.PLACEHOLDER``
        and at least one requested format is not listed in
        ``_OUTPUT_FORMATS_NOT_SUPPORTING_IMAGE_EMBEDDING``; False otherwise
        (including when ``to_formats`` is empty).
    """
    # Placeholder mode never needs image data, whatever the formats are.
    if image_export_mode == ImageRefMode.PLACEHOLDER:
        return False

    # One format outside the "no embedding" set is enough.
    for requested_format in to_formats:
        if requested_format not in _OUTPUT_FORMATS_NOT_SUPPORTING_IMAGE_EMBEDDING:
            return True
    return False
|
||||
|
||||
|
||||
@app.command(no_args_is_help=True)
|
||||
def convert( # noqa: C901
|
||||
input_sources: Annotated[
|
||||
|
||||
+118
-1
@@ -4,7 +4,8 @@ import pytest
|
||||
from docling_core.types.doc import ImageRefMode
|
||||
from typer.testing import CliRunner
|
||||
|
||||
from docling.cli.main import _should_generate_export_images, app
|
||||
from docling.cli.export_utils import _should_generate_export_images, _split_list
|
||||
from docling.cli.main import app
|
||||
from docling.datamodel.base_models import OutputFormat
|
||||
|
||||
runner = CliRunner()
|
||||
@@ -30,6 +31,117 @@ def test_cli_convert(tmp_path):
|
||||
assert converted.exists()
|
||||
|
||||
|
||||
def test_export_documents_marks_empty_markdown_as_failure(tmp_path):
    """A zero-byte Markdown export must turn a successful result into a failure."""
    from docling.cli.main import export_documents
    from docling.datamodel.base_models import ConversionStatus, InputFormat
    from docling.datamodel.document import (
        ConversionResult,
        InputDocument,
        _DummyBackend,
    )

    class EmptyMarkdownDocument:
        """Stand-in document whose Markdown export writes an empty file."""

        def save_as_markdown(self, *, filename, image_mode):
            Path(filename).write_text("")

    # Stub input file carrying only a PDF header.
    source_pdf = tmp_path / "input.pdf"
    source_pdf.write_bytes(b"%PDF-1.4")

    result = ConversionResult(
        input=InputDocument(
            path_or_stream=source_pdf,
            format=InputFormat.PDF,
            backend=_DummyBackend,
        )
    )
    result.status = ConversionStatus.SUCCESS
    result.document = EmptyMarkdownDocument()

    destination = tmp_path / "out"
    destination.mkdir()

    # Only the Markdown exporter is switched on.
    export_options = {
        "export_json": False,
        "export_yaml": False,
        "export_html": False,
        "export_html_split_page": False,
        "show_layout": False,
        "export_md": True,
        "export_txt": False,
        "export_doctags": False,
        "export_vtt": False,
        "print_timings": False,
        "export_timings": False,
        "image_export_mode": ImageRefMode.PLACEHOLDER,
    }
    export_documents([result], output_dir=destination, **export_options)

    assert result.status == ConversionStatus.FAILURE
    assert result.errors
|
||||
|
||||
|
||||
def test_export_documents_marks_stat_errors_as_failure(tmp_path, monkeypatch):
    """If the written Markdown file cannot be stat'ed, the export must fail.

    The document writes non-empty Markdown, but ``Path.stat`` is patched to
    raise ``OSError`` for ``input.md``; the result is expected to end up with
    ``ConversionStatus.FAILURE`` and at least one recorded error.
    """
    from docling.cli.main import export_documents
    from docling.datamodel.base_models import ConversionStatus, InputFormat
    from docling.datamodel.document import (
        ConversionResult,
        InputDocument,
        _DummyBackend,
    )

    input_path = tmp_path / "input.pdf"
    input_path.write_bytes(b"%PDF-1.4")

    input_doc = InputDocument(
        path_or_stream=input_path,
        format=InputFormat.PDF,
        backend=_DummyBackend,
    )

    conv_res = ConversionResult(input=input_doc)
    conv_res.status = ConversionStatus.SUCCESS

    class DummyDocument:
        """Stand-in document whose Markdown export writes non-empty content."""

        def save_as_markdown(self, *, filename, image_mode):
            Path(filename).write_text("ok")

    conv_res.document = DummyDocument()

    output_dir = tmp_path / "out"
    output_dir.mkdir()

    original_stat = Path.stat

    def _raise_for_markdown(self, *args, **kwargs):
        # Accept and forward every argument: Path.stat takes a
        # ``follow_symlinks`` keyword, and pathlib helpers such as
        # Path.exists() pass it on some Python versions. A stub that only
        # takes ``self`` would raise TypeError there instead of the OSError
        # this test is meant to simulate.
        if self.name == "input.md":
            raise OSError("stat failed")
        return original_stat(self, *args, **kwargs)

    monkeypatch.setattr(Path, "stat", _raise_for_markdown)

    export_documents(
        [conv_res],
        output_dir=output_dir,
        export_json=False,
        export_yaml=False,
        export_html=False,
        export_html_split_page=False,
        show_layout=False,
        export_md=True,
        export_txt=False,
        export_doctags=False,
        export_vtt=False,
        print_timings=False,
        export_timings=False,
        image_export_mode=ImageRefMode.PLACEHOLDER,
    )

    assert conv_res.status == ConversionStatus.FAILURE
    assert conv_res.errors
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("image_export_mode", "to_formats", "expected"),
|
||||
[
|
||||
@@ -59,6 +171,11 @@ def test_image_export_policy_covers_all_output_formats():
|
||||
assert image_export_formats | non_image_export_formats == set(OutputFormat)
|
||||
|
||||
|
||||
def test_split_list_handles_none_and_delimiters():
    """``_split_list`` splits on both ',' and ';' and passes ``None`` through."""
    mixed_delimiters = _split_list("a,b;c")
    assert mixed_delimiters == ["a", "b", "c"]
    assert _split_list(None) is None
|
||||
|
||||
|
||||
def test_cli_audio_auto_detection(tmp_path):
|
||||
"""Test that CLI automatically detects audio files and sets ASR pipeline."""
|
||||
from docling.datamodel.base_models import FormatToExtensions, InputFormat
|
||||
|
||||
Reference in New Issue
Block a user