# Mirror of https://github.com/docling-project/docling.git
# Synced 2026-05-17 13:10:38 +00:00, commit 60fc517af0
# chore: Condensing the LaTeX test backend into multiple files
# Signed-off-by: Aditya Sasidhar <telikicherlaadityasasidhar@gmail.com>
# 350 lines, 11 KiB, Python
from io import BytesIO
|
|
from pathlib import Path
|
|
|
|
import pytest
|
|
from docling_core.types.doc import DocItemLabel, GroupLabel
|
|
|
|
from docling.backend.latex_backend import LatexDocumentBackend
|
|
from docling.datamodel.backend_options import LatexBackendOptions
|
|
from docling.datamodel.base_models import InputFormat
|
|
from docling.datamodel.document import ConversionResult, DoclingDocument, InputDocument
|
|
from docling.document_converter import DocumentConverter
|
|
|
|
from ..test_data_gen_flag import GEN_TEST_DATA
|
|
from ..verify_utils import verify_document, verify_export
|
|
|
|
# Shared test-data flag; forwarded as the ``generate`` argument of the verify helpers.
GENERATE = GEN_TEST_DATA
# Relative location of the LaTeX sample inputs (resolved against the working directory).
LATEX_DATA_DIR = Path("./tests/data/latex/")
|
|
|
|
|
|
def test_latex_basic_conversion():
    """Convert a minimal article and check the section header and body text."""
    source = b"""
\\documentclass{article}
\\begin{document}
\\section{Introduction}
Hello World.
\\end{document}
"""
    input_doc = InputDocument(
        path_or_stream=BytesIO(source),
        format=InputFormat.LATEX,
        backend=LatexDocumentBackend,
        filename="test.tex",
    )
    converted = LatexDocumentBackend(
        in_doc=input_doc, path_or_stream=BytesIO(source)
    ).convert()

    assert len(converted.texts) > 0

    # Split the text items into section headers and everything else.
    section_headers = []
    body_items = []
    for item in converted.texts:
        if item.label == DocItemLabel.SECTION_HEADER:
            section_headers.append(item)
        else:
            body_items.append(item)

    assert len(section_headers) == 1
    assert section_headers[0].text == "Introduction"
    assert "Hello World" in body_items[0].text
|
|
|
|
|
|
def test_latex_preamble_filter():
    """Check that the preamble title is kept while package commands are dropped."""
    source = b"""
\\documentclass{article}
\\usepackage{test}
\\title{Ignored Title}
\\begin{document}
Real Content
\\end{document}
"""
    input_doc = InputDocument(
        path_or_stream=BytesIO(source),
        format=InputFormat.LATEX,
        backend=LatexDocumentBackend,
        filename="test.tex",
    )
    converter_backend = LatexDocumentBackend(
        in_doc=input_doc, path_or_stream=BytesIO(source)
    )
    markdown = converter_backend.convert().export_to_markdown()

    # Preamble metadata (title, author, date) is extracted, following
    # pandoc's approach; only package commands are expected to be filtered.
    for expected in ("Real Content", "Ignored Title"):
        assert expected in markdown
    assert "usepackage" not in markdown
|
|
|
|
|
|
def test_latex_escaped_chars():
    """Check that escaped percent and dollar signs stay attached to their text."""
    source = b"""
\\documentclass{article}
\\begin{document}
value is 23\\% which is high.
Costs \\$100.
\\end{document}
"""
    input_doc = InputDocument(
        path_or_stream=BytesIO(source),
        format=InputFormat.LATEX,
        backend=LatexDocumentBackend,
        filename="test.tex",
    )
    converted = LatexDocumentBackend(
        in_doc=input_doc, path_or_stream=BytesIO(source)
    ).convert()

    # Only plain text and paragraph items contribute to the joined string.
    wanted_labels = (DocItemLabel.TEXT, DocItemLabel.PARAGRAPH)
    joined = " ".join(
        item.text for item in converted.texts if item.label in wanted_labels
    )

    # The number and the percent sign must not be split apart.
    assert any(variant in joined for variant in ("23%", "23\\%"))
    # The text following the escaped percent must still be present.
    assert "which is high" in joined
    assert any(variant in joined for variant in ("$100", "\\$100"))
|
|
|
|
|
|
def test_latex_is_valid():
    """Check is_valid() for a well-formed document and a whitespace-only one."""
    cases = (
        # (raw bytes, filename, expected is_valid() result)
        (
            b"\\documentclass{article}\\begin{document}Content\\end{document}",
            "test.tex",
            True,
        ),
        (b" ", "empty.tex", False),
    )
    for raw, name, expected in cases:
        input_doc = InputDocument(
            path_or_stream=BytesIO(raw),
            format=InputFormat.LATEX,
            backend=LatexDocumentBackend,
            filename=name,
        )
        candidate = LatexDocumentBackend(
            in_doc=input_doc, path_or_stream=BytesIO(raw)
        )
        assert candidate.is_valid() is expected
|
|
|
|
|
|
def test_latex_supports_pagination():
    """Check that the backend class reports no pagination support."""
    paginated = LatexDocumentBackend.supports_pagination()
    assert paginated is False
|
|
|
|
|
|
def test_latex_supported_formats():
    """Check that LATEX is among the formats the backend class advertises."""
    assert InputFormat.LATEX in LatexDocumentBackend.supported_formats()
|
|
|
|
|
|
def test_latex_file_path_loading(tmp_path):
    """Convert a document given as a filesystem path rather than a BytesIO stream."""
    tex_path = tmp_path / "test.tex"
    tex_source = r"""
\documentclass{article}
\begin{document}
File content here.
\end{document}
"""
    tex_path.write_text(tex_source)

    input_doc = InputDocument(
        path_or_stream=tex_path,
        format=InputFormat.LATEX,
        backend=LatexDocumentBackend,
        filename="test.tex",
    )
    converted = LatexDocumentBackend(
        in_doc=input_doc, path_or_stream=tex_path
    ).convert()

    assert "File content here" in converted.export_to_markdown()
|
|
|
|
|
|
def test_latex_no_document_env():
    """Convert LaTeX that has no document environment and check content survives."""
    source = b"""
\\section{Direct Section}
Some direct content without document environment.
"""
    input_doc = InputDocument(
        path_or_stream=BytesIO(source),
        format=InputFormat.LATEX,
        backend=LatexDocumentBackend,
        filename="test.tex",
    )
    converted = LatexDocumentBackend(
        in_doc=input_doc, path_or_stream=BytesIO(source)
    ).convert()

    markdown = converted.export_to_markdown()
    # Either the heading or the body text is enough to pass.
    assert any(
        fragment in markdown for fragment in ("Direct Section", "direct content")
    )
|
|
|
|
|
|
def get_latex_converter():
    """Return a DocumentConverter whose allowed formats contain only LaTeX."""
    return DocumentConverter(allowed_formats=[InputFormat.LATEX])
|
|
|
|
|
|
def test_e2e_latex_conversions(latex_paths):
    """Convert every LaTeX sample and compare its exports against ground truth.

    For each path the Markdown export, the indented-text export and the
    document itself are checked with the verify helpers, in that order.
    The test is skipped when ``latex_paths`` is empty.
    """
    if not latex_paths:
        pytest.skip("No LaTeX test files found")

    converter = get_latex_converter()
    groundtruth_dir = LATEX_DATA_DIR.parent / "groundtruth" / "docling_v2"

    for latex_path in latex_paths:
        # Files directly inside the data directory keep their own name; files
        # in other directories are prefixed with their parent directory name.
        in_data_root = latex_path.parent.resolve() == LATEX_DATA_DIR.resolve()
        gt_name = (
            latex_path.name
            if in_data_root
            else f"{latex_path.parent.name}_{latex_path.name}"
        )
        gt_prefix = str(groundtruth_dir / gt_name)

        result: ConversionResult = converter.convert(latex_path)
        document: DoclingDocument = result.document

        markdown: str = document.export_to_markdown()
        assert verify_export(markdown, gt_prefix + ".md", generate=GENERATE), (
            f"Markdown export mismatch for {latex_path}"
        )

        indented: str = document._export_to_indented_text(
            max_text_len=70, explicit_tables=False
        )
        assert verify_export(indented, gt_prefix + ".itxt", generate=GENERATE), (
            f"Indented text export mismatch for {latex_path}"
        )

        assert verify_document(document, gt_prefix + ".json", GENERATE), (
            f"Document JSON mismatch for {latex_path}"
        )
|
|
|
|
|
|
def test_latex_document_with_leading_comments():
    """Check that a document beginning with comment lines still converts."""
    source = b"""% This is a leading comment
% Another comment line
\\documentclass{article}
\\begin{document}
\\section{Test Section}
This is test content.
\\end{document}
"""
    input_doc = InputDocument(
        path_or_stream=BytesIO(source),
        format=InputFormat.LATEX,
        backend=LatexDocumentBackend,
        filename="test.tex",
    )
    converted = LatexDocumentBackend(
        in_doc=input_doc, path_or_stream=BytesIO(source)
    ).convert()

    # Conversion is expected to succeed and yield text despite the comments.
    assert len(converted.texts) > 0
    markdown = converted.export_to_markdown()
    for expected in ("Test Section", "test content"):
        assert expected in markdown
|
|
|
|
|
|
def test_latex_filecontents_ignored():
    """Check that the body of a filecontents environment is left out of the output."""
    source = b"""
\\documentclass{article}
\\begin{filecontents}{sample.bib}
@article{test, author={A}, title={B}}
\\end{filecontents}
\\begin{document}
Actual content.
\\end{document}
"""
    input_doc = InputDocument(
        path_or_stream=BytesIO(source),
        format=InputFormat.LATEX,
        backend=LatexDocumentBackend,
        filename="test.tex",
    )
    converted = LatexDocumentBackend(
        in_doc=input_doc, path_or_stream=BytesIO(source)
    ).convert()

    markdown = converted.export_to_markdown()
    assert "Actual content" in markdown
    # The embedded bibliography entry must not leak into the export.
    assert "@article" not in markdown
|
|
|
|
|
|
def test_latex_convert_error_fallback():
    """Check convert() still returns a document when _do_parse_and_process raises."""
    source = b"\\documentclass{article}\\begin{document}Hello\\end{document}"
    input_doc = InputDocument(
        path_or_stream=BytesIO(source),
        format=InputFormat.LATEX,
        backend=LatexDocumentBackend,
        filename="test.tex",
    )
    failing_backend = LatexDocumentBackend(
        in_doc=input_doc,
        path_or_stream=BytesIO(source),
        options=LatexBackendOptions(parse_timeout=0.05),
    )

    def _fail_parse(doc):
        raise RuntimeError("Simulated parse failure")

    # Replace the parse step on this instance so that it always raises.
    failing_backend._do_parse_and_process = _fail_parse  # type: ignore[method-assign]
    result = failing_backend.convert()
    assert result is not None
|
|
|
|
|
|
def test_latex_input_cycle_detection(tmp_path):
    """Convert two files that include each other and check conversion still finishes."""
    # a.tex pulls in b.tex, which in turn pulls a.tex back in.
    first = tmp_path / "a.tex"
    second = tmp_path / "b.tex"
    first.write_text(
        "\\documentclass{article}\\begin{document}A content\\input{b}\\end{document}"
    )
    second.write_text("B content\\input{a}")

    input_doc = InputDocument(
        path_or_stream=first,
        format=InputFormat.LATEX,
        backend=LatexDocumentBackend,
        filename="a.tex",
    )
    cyclic_backend = LatexDocumentBackend(in_doc=input_doc, path_or_stream=first)

    # The conversion must return instead of recursing without end.
    markdown = cyclic_backend.convert().export_to_markdown()
    assert "A content" in markdown
|
|
|
|
|
|
def test_latex_author_date():
    """Check that the author and date text appear in the Markdown export."""
    source = b"""
\\documentclass{article}
\\begin{document}
\\title{My Paper}
\\author{Jane Doe}
\\date{January 2025}
Some content.
\\end{document}
"""
    input_doc = InputDocument(
        path_or_stream=BytesIO(source),
        format=InputFormat.LATEX,
        backend=LatexDocumentBackend,
        filename="test.tex",
    )
    converted = LatexDocumentBackend(
        in_doc=input_doc, path_or_stream=BytesIO(source)
    ).convert()

    markdown = converted.export_to_markdown()
    for expected in ("Jane Doe", "January 2025"):
        assert expected in markdown
|