docling/tests/test_backend_msexcel.py

import logging
from io import BytesIO
from pathlib import Path

import pytest
from openpyxl import load_workbook

from docling.backend.msexcel_backend import MsExcelDocumentBackend
from docling.datamodel.backend_options import MsExcelBackendOptions
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import ConversionResult, DoclingDocument, InputDocument
from docling.document_converter import DocumentConverter, ExcelFormatOption

from .test_data_gen_flag import GEN_TEST_DATA
from .verify_utils import verify_document, verify_export

_log = logging.getLogger(__name__)

GENERATE = GEN_TEST_DATA


def get_excel_paths():
    # Define the directory you want to search
    directory = Path("./tests/data/xlsx/")

    # List all Excel files in the directory and its subdirectories
    excel_files = sorted(directory.rglob("*.xlsx")) + sorted(directory.rglob("*.xlsm"))
    return excel_files


def get_converter():
    converter = DocumentConverter(allowed_formats=[InputFormat.XLSX])

    return converter


@pytest.fixture(scope="module")
def documents() -> list[tuple[Path, DoclingDocument]]:
    documents: list[dict[Path, DoclingDocument]] = []

    excel_paths = get_excel_paths()
    converter = get_converter()

    for excel_path in excel_paths:
        _log.debug(f"converting {excel_path}")

        gt_path = (
            excel_path.parent.parent / "groundtruth" / "docling_v2" / excel_path.name
        )

        conv_result: ConversionResult = converter.convert(excel_path)

        doc: DoclingDocument = conv_result.document

        assert doc, f"Failed to convert document from file {gt_path}"
        documents.append((gt_path, doc))

    return documents


def test_e2e_excel_conversions(documents) -> None:
    for gt_path, doc in documents:
        pred_md: str = doc.export_to_markdown()
        assert verify_export(pred_md, str(gt_path) + ".md"), "export to md"

        pred_itxt: str = doc._export_to_indented_text(
            max_text_len=70, explicit_tables=False
        )
        assert verify_export(pred_itxt, str(gt_path) + ".itxt"), (
            "export to indented-text"
        )

        assert verify_document(doc, str(gt_path) + ".json", GENERATE), (
            "document document"
        )


def test_pages(documents) -> None:
    """Test the page count and page size of converted documents.

    Args:
        documents: The paths and converted documents.
    """
    # number of pages from the backend method
    path = next(item for item in get_excel_paths() if item.stem == "xlsx_01")
    in_doc = InputDocument(
        path_or_stream=path,
        format=InputFormat.XLSX,
        filename=path.stem,
        backend=MsExcelDocumentBackend,
    )
    backend = MsExcelDocumentBackend(in_doc=in_doc, path_or_stream=path)
    assert backend.page_count() == 4

    # number of pages from the converted document
    doc = next(item for path, item in documents if path.stem == "xlsx_01")
    assert len(doc.pages) == 4

    # page sizes as number of cells
    assert doc.pages.get(1).size.as_tuple() == (3.0, 7.0)
    assert doc.pages.get(2).size.as_tuple() == (9.0, 18.0)
    assert doc.pages.get(3).size.as_tuple() == (13.0, 36.0)
    assert doc.pages.get(4).size.as_tuple() == (0.0, 0.0)


def test_chartsheet(documents) -> None:
    """Test the conversion of Chartsheets.

    Args:
        documents: The paths and converted documents.
    """
    doc = next(item for path, item in documents if path.stem == "xlsx_03_chartsheet")
    assert len(doc.pages) == 2

    # Chartseet content is for now ignored
    assert doc.groups[1].name == "sheet: Duck Chart"
    assert doc.pages[2].size.height == 0
    assert doc.pages[2].size.width == 0


def test_chartsheet_data_values(documents) -> None:
    """Test that data values are extracted correctly from xlsx_03_chartsheet.

    This test verifies that calculated values (not formulas) are returned.
    The file contains duck observations with year 2024 having a total of 310 ducks.
    We need to verify that both 2024 and 310 appear in the parsed data.

    Args:
        documents: The paths and converted documents.
    """
    doc = next(item for path, item in documents if path.stem == "xlsx_03_chartsheet")

    # Find all tables
    tables = list(doc.tables)
    assert len(tables) > 0, "Should have at least one table"

    # Look for a table that has the year 2024 in it
    table_with_2024 = None
    row_index_of_2024 = None

    for table in tables:
        for cell in table.data.table_cells:
            if cell.text == "2024":
                table_with_2024 = table
                row_index_of_2024 = cell.start_row_offset_idx
                break
        if table_with_2024:
            break

    assert table_with_2024 is not None, "Should find a table containing year 2024"
    assert row_index_of_2024 is not None, "Should find row index for 2024"

    # Now verify that the value 310 exists in the document
    # (it may be in the same table or a different table due to how the parser splits tables)
    found_310 = False
    for table in tables:
        for cell in table.data.table_cells:
            if cell.text == "310":
                found_310 = True
                break
        if found_310:
            break

    assert found_310, "Should find the value 310 (total ducks for 2024) in the document"


def test_inflated_rows_handling(documents) -> None:
    """Test that files with inflated max_row are handled correctly.

    xlsx_04_inflated.xlsx has inflated max_row (1,048,496) but only 7 rows of actual data.
    This test verifies that our backend correctly identifies true data bounds.
    """
    # First, verify the file has inflated max_row using openpyxl directly
    path = next(item for item in get_excel_paths() if item.stem == "xlsx_04_inflated")

    wb = load_workbook(path)
    ws = wb.active
    reported_max_row = ws.max_row

    # Assert that openpyxl reports inflated max_row
    assert reported_max_row > 100000, (
        f"xlsx_04_inflated.xlsx should have inflated max_row (expected >100k, got {reported_max_row:,}). "
        f"This test file is designed to verify proper handling of Excel files with inflated row counts."
    )

    _log.info(
        f"xlsx_04_inflated.xlsx - Openpyxl reported max_row: {reported_max_row:,}"
    )

    # Now test that our backend handles it correctly
    in_doc = InputDocument(
        path_or_stream=path,
        format=InputFormat.XLSX,
        filename=path.stem,
        backend=MsExcelDocumentBackend,
    )
    backend = MsExcelDocumentBackend(in_doc=in_doc, path_or_stream=path)

    # Verify backend detects correct number of pages (should be 4, like test-01)
    page_count = backend.page_count()
    assert page_count == 4, (
        f"Backend should detect 4 pages (same as test-01), got {page_count}"
    )

    # Verify converted document has correct pages
    doc = next(item for path, item in documents if path.stem == "xlsx_04_inflated")
    assert len(doc.pages) == 4, f"Document should have 4 pages, got {len(doc.pages)}"

    # Verify page sizes match expected dimensions (same as test-01)
    # These should reflect actual data, not inflated row counts
    assert doc.pages.get(1).size.as_tuple() == (3.0, 7.0), (
        f"Page 1 should be 3x7 cells, got {doc.pages.get(1).size.as_tuple()}"
    )
    assert doc.pages.get(2).size.as_tuple() == (9.0, 18.0), (
        f"Page 2 should be 9x18 cells, got {doc.pages.get(2).size.as_tuple()}"
    )
    assert doc.pages.get(3).size.as_tuple() == (13.0, 36.0), (
        f"Page 3 should be 13x36 cells, got {doc.pages.get(3).size.as_tuple()}"
    )
    assert doc.pages.get(4).size.as_tuple() == (0.0, 0.0), (
        f"Page 4 should be 0x0 cells (empty), got {doc.pages.get(4).size.as_tuple()}"
    )

    _log.info(
        f"✓ Successfully handled inflated max_row: "
        f"reported {reported_max_row:,} rows, "
        f"correctly processed as {page_count} pages with proper dimensions"
    )


def test_table_with_title():
    """Test that singleton cells with non-numeric content are treated as TextItem.

    When treat_singleton_as_text option is enabled, 1x1 tables containing non-numeric
    text should be converted to TextItem instead of TableItem. This test verifies that
    xlsx_05_table_with_title.xlsx is correctly parsed with this option.
    """
    path = next(
        item for item in get_excel_paths() if item.stem == "xlsx_05_table_with_title"
    )

    # Create converter with treat_singleton_as_text=True
    options = MsExcelBackendOptions(treat_singleton_as_text=True)
    format_options = {InputFormat.XLSX: ExcelFormatOption(backend_options=options)}
    converter = DocumentConverter(
        allowed_formats=[InputFormat.XLSX], format_options=format_options
    )

    conv_result: ConversionResult = converter.convert(path)
    doc: DoclingDocument = conv_result.document

    # With treat_singleton_as_text=True, the singleton title cell should be a TextItem
    texts = list(doc.texts)
    tables = list(doc.tables)

    assert len(texts) == 1, f"Should have 1 text item (the title), got {len(texts)}"
    assert len(tables) == 1, f"Should have 1 table, got {len(tables)}"

    # Verify the text item contains the title
    assert texts[0].text == "Number of freshwater ducks per year", (
        f"Text should be 'Number of freshwater ducks per year', got '{texts[0].text}'"
    )

    # Verify table dimensions
    table = tables[0]
    assert table.data.num_rows == 7, (
        f"Table should have 7 rows, got {table.data.num_rows}"
    )
    assert table.data.num_cols == 2, (
        f"Table should have 2 columns, got {table.data.num_cols}"
    )


def test_bytesio_stream():
    """Test that Excel files can be loaded from BytesIO streams.

    This test verifies that the BytesIO code path in the backend is working correctly,
    ensuring that data_only=True is applied when loading workbooks from streams.
    """
    # Get a test Excel file
    path = next(item for item in get_excel_paths() if item.stem == "xlsx_01")

    # Load the file into a BytesIO stream
    buf = BytesIO(path.open("rb").read())

    # Create an InputDocument with the BytesIO stream
    in_doc = InputDocument(
        path_or_stream=buf,
        format=InputFormat.XLSX,
        filename=path.stem,
        backend=MsExcelDocumentBackend,
    )

    # Initialize the backend with the BytesIO stream
    backend = MsExcelDocumentBackend(in_doc=in_doc, path_or_stream=buf)

    # Verify the backend is valid
    assert backend.is_valid(), "Backend should be valid when loaded from BytesIO"

    # Verify page count matches expected value
    assert backend.page_count() == 4, "Should detect 4 pages from BytesIO stream"

    # Convert the document
    doc = backend.convert()

    # Verify the document was converted successfully
    assert doc is not None, "Document should be converted from BytesIO stream"
    assert len(doc.pages) == 4, "Document should have 4 pages"

    # Verify page sizes match expected dimensions
    assert doc.pages.get(1).size.as_tuple() == (3.0, 7.0)
    assert doc.pages.get(2).size.as_tuple() == (9.0, 18.0)
    assert doc.pages.get(3).size.as_tuple() == (13.0, 36.0)
    assert doc.pages.get(4).size.as_tuple() == (0.0, 0.0)


def test_edge_cases_merging() -> None:
    """Test that split tables are correctly merged using the region growing algorithm.

    Verifies:
    - Sheet 1 (missing_header): 1 table (Standard case)
    - Sheet 2 (Attached_left): 1 MERGED table (The critical fix!)
    - Sheet 3 (Diagonal): 2 separate tables (Correctly separated)
    """
    path = next(
        item for item in get_excel_paths() if item.stem == "xlsx_06_edge_cases_"
    )

    if not path.exists():
        pytest.skip(f"Test file {path} not found.")

    converter = DocumentConverter(allowed_formats=[InputFormat.XLSX])
    conv_result = converter.convert(path)
    doc = conv_result.document

    # Organize tables by Page Number (1-based index)
    tables_by_page = {}
    for table in doc.tables:
        p_no = table.prov[0].page_no
        if p_no not in tables_by_page:
            tables_by_page[p_no] = []
        tables_by_page[p_no].append(table)

    # Page 1: Standard table
    assert len(tables_by_page.get(1, [])) == 1, "Page 1 should have 1 table"

    # Page 2: The 'Attached left' case.
    # SUCCESS CONDITION: It is 1 single table.
    # (If the fix failed, this would be 2 tables).
    assert len(tables_by_page.get(2, [])) == 1, (
        f"Page 2 (Attached Left) should be 1 merged table, but found {len(tables_by_page.get(2, []))}"
    )

    # Page 3: Diagonal case.
    # These are physically separated by empty space, so they should remain 2 tables.
    assert len(tables_by_page.get(3, [])) == 2, (
        "Page 3 (Diagonal) should have 2 separate tables"
    )


def test_gap_tolerance_comparison() -> None:
    """Test the effect of gap_tolerance on table detection.

    Target: excel-tests.xlsx (Page 1), 'Power system' table.
    Structure: Col A ("1") | Col B (Empty) | Col C ("Rated system voltage")

    Verifies:
    1. Tolerance 0 (Default): The gap causes a split. The main data table starts at Col C.
    2. Tolerance 1: The gap is bridged. The table merges with Col A, starting at Col A.
    """
    path = next(
        item for item in get_excel_paths() if item.stem == "xlsx_07_gap_tolerance_"
    )
    if not path.exists():
        pytest.skip("Test file not found")

    # --- Helper to get the start column of the "Rated system voltage" table ---
    def get_table_start_col(tolerance: int) -> int:
        options = MsExcelBackendOptions(gap_tolerance=tolerance)
        format_options = {InputFormat.XLSX: ExcelFormatOption(backend_options=options)}

        converter = DocumentConverter(
            allowed_formats=[InputFormat.XLSX], format_options=format_options
        )
        doc = converter.convert(path).document
        print(doc)

        for table in doc.tables:
            # Check for unique text in the main body of the table
            texts = {cell.text for cell in table.data.table_cells}
            if "Rated system voltage" in texts:
                # Return the leftmost column index (0-based)
                return table.prov[0].bbox.l

        pytest.fail(f"Could not find 'Power system' table with tolerance={tolerance}")

    # --- ASSERTION 1: Strict Behavior (gap_tolerance=0) ---
    # The empty Col B should split the table.
    # The text "Rated system voltage" is in Col C (Index 2).
    start_col_strict = get_table_start_col(0)
    assert start_col_strict == 2, (
        f"Default (0) tolerance should split the table. "
        f"Expected start at Col C (2), got {start_col_strict}"
    )

    # --- ASSERTION 2: Merged Behavior (gap_tolerance=1) ---
    # The empty Col B should be ignored.
    # The table should merge left to include "1" in Col A (Index 0).
    start_col_merged = get_table_start_col(1)
    assert start_col_merged == 0, (
        f"Tolerance 1 should merge the table. "
        f"Expected start at Col A (0), got {start_col_merged}"
    )


def test_one_cell_anchor_image():
    """Test that images with OneCellAnchor are positioned correctly.

    OneCellAnchor images (the default when inserting images in Excel) should
    use the anchor cell as the bounding box origin, not default to (0,0,0,0).
    """
    path = next(
        item for item in get_excel_paths() if item.stem == "xlsx_08_one_cell_anchor"
    )

    converter = get_converter()
    conv_result = converter.convert(path)
    doc = conv_result.document

    pictures = list(doc.pictures)
    assert len(pictures) == 1, f"Should have 1 picture, got {len(pictures)}"

    prov = pictures[0].prov[0]
    # Image was placed at cell D2 (col=3, row=1 in 0-based)
    assert prov.bbox.l == 3.0, f"Image left should be 3.0 (col D), got {prov.bbox.l}"
    assert prov.bbox.t == 1.0, f"Image top should be 1.0 (row 2), got {prov.bbox.t}"
    assert prov.bbox.r == 4.0, f"Image right should be 4.0, got {prov.bbox.r}"
    assert prov.bbox.b == 2.0, f"Image bottom should be 2.0, got {prov.bbox.b}"