import base64 import os import threading import time from io import BytesIO from pathlib import Path, PurePath from unittest.mock import Mock, mock_open, patch import pytest import requests from bs4 import BeautifulSoup from docling_core.types.doc import PictureItem, RichTableCell from docling_core.types.doc.document import ContentLayer from pydantic import AnyUrl, ValidationError from docling.backend.html_backend import HTMLDocumentBackend, _validate_url_safety from docling.datamodel.backend_options import HTMLBackendOptions from docling.datamodel.base_models import InputFormat from docling.datamodel.document import ( ConversionResult, DoclingDocument, InputDocument, SectionHeaderItem, ) from docling.document_converter import DocumentConverter, HTMLFormatOption from docling.exceptions import OperationNotAllowed from .test_data_gen_flag import GEN_TEST_DATA from .verify_utils import verify_document, verify_export GENERATE = GEN_TEST_DATA def test_html_backend_options(): options = HTMLBackendOptions() assert options.kind == "html" assert not options.fetch_images assert options.source_uri is None url = "http://example.com" source_location = AnyUrl(url=url) options = HTMLBackendOptions(source_uri=source_location) assert options.source_uri == source_location source_location = PurePath("/local/path/to/file.html") options = HTMLBackendOptions(source_uri=source_location) assert options.source_uri == source_location with pytest.raises(ValidationError, match="Input is not a valid path"): HTMLBackendOptions(source_uri=12345) def test_resolve_relative_path(): html_path = Path("./tests/data/html/example_01.html") in_doc = InputDocument( path_or_stream=html_path, format=InputFormat.HTML, backend=HTMLDocumentBackend, filename="test", ) html_doc = HTMLDocumentBackend(path_or_stream=html_path, in_doc=in_doc) html_doc.base_path = "/local/path/to/file.html" relative_path = "subdir/another.html" expected_abs_loc = "/local/path/to/subdir/another.html" assert html_doc._resolve_relative_path(relative_path) == expected_abs_loc absolute_path = "/absolute/path/to/file.html" with pytest.raises( ValueError, match="Absolute paths are not allowed with local base_path" ): html_doc._resolve_relative_path(absolute_path) html_doc.base_path = "http://my_host.com" protocol_relative_url = "//example.com/file.html" expected_abs_loc = "https://example.com/file.html" assert html_doc._resolve_relative_path(protocol_relative_url) == expected_abs_loc html_doc.base_path = "http://example.com" remote_relative_path = "subdir/file.html" expected_abs_loc = "http://example.com/subdir/file.html" assert html_doc._resolve_relative_path(remote_relative_path) == expected_abs_loc html_doc.base_path = "http://example.com" remote_relative_path = "https://my_host.com/my_page.html" expected_abs_loc = "https://my_host.com/my_page.html" assert html_doc._resolve_relative_path(remote_relative_path) == expected_abs_loc html_doc.base_path = "http://example.com" remote_relative_path = "/static/images/my_image.png" expected_abs_loc = "http://example.com/static/images/my_image.png" assert html_doc._resolve_relative_path(remote_relative_path) == expected_abs_loc # when base_path is None, paths pass through unchanged # (validation happens in _load_image_data for actual file access) html_doc.base_path = None # Paths pass through _resolve_relative_path unchanged assert html_doc._resolve_relative_path("subdir/file.html") == "subdir/file.html" # Remote URLs also pass through remote_url = "https://example.com/file.html" assert html_doc._resolve_relative_path(remote_url) == remote_url # Fragment-only hrefs must pass through unchanged html_doc.base_path = "/local/path/to/file.html" assert html_doc._resolve_relative_path("#section1") == "#section1" assert html_doc._resolve_relative_path("#") == "#" html_doc.base_path = "http://example.com/page.html" assert html_doc._resolve_relative_path("#section1") == "#section1" html_doc.base_path = None assert html_doc._resolve_relative_path("#section1") == "#section1" def test_heading_levels(): in_path = Path("tests/data/html/wiki_duck.html") in_doc = InputDocument( path_or_stream=in_path, format=InputFormat.HTML, backend=HTMLDocumentBackend, ) backend = HTMLDocumentBackend( in_doc=in_doc, path_or_stream=in_path, ) doc = backend.convert() found_lvl_1 = found_lvl_2 = False for item, _ in doc.iterate_items(): if isinstance(item, SectionHeaderItem): if item.text == "Etymology": found_lvl_1 = True # h2 becomes level 1 because of h1 as title assert item.level == 1 elif item.text == "Feeding": found_lvl_2 = True # h3 becomes level 2 because of h1 as title assert item.level == 2 assert found_lvl_1 and found_lvl_2 def test_ordered_lists(): test_set: list[tuple[bytes, str]] = [] test_set.append( ( b"
  1. 1st item
  2. 2nd item
", "1. 1st item\n2. 2nd item", ) ) test_set.append( ( b'
  1. 1st item
  2. 2nd item
', "1. 1st item\n2. 2nd item", ) ) test_set.append( ( b'
  1. 1st item
  2. 2nd item
', "2. 1st item\n3. 2nd item", ) ) test_set.append( ( b'
  1. 1st item
  2. 2nd item
', "0. 1st item\n1. 2nd item", ) ) test_set.append( ( b'
  1. 1st item
  2. 2nd item
', "1. 1st item\n2. 2nd item", ) ) test_set.append( ( b'
  1. 1st item
  2. 2nd item
', "1. 1st item\n2. 2nd item", ) ) for idx, pair in enumerate(test_set): in_doc = InputDocument( path_or_stream=BytesIO(pair[0]), format=InputFormat.HTML, backend=HTMLDocumentBackend, filename="test", ) backend = HTMLDocumentBackend( in_doc=in_doc, path_or_stream=BytesIO(pair[0]), ) doc: DoclingDocument = backend.convert() assert doc assert doc.export_to_markdown() == pair[1], f"Error in case {idx}" def test_unicode_characters(): raw_html = "

Hello World!

".encode() # noqa: RUF001 in_doc = InputDocument( path_or_stream=BytesIO(raw_html), format=InputFormat.HTML, backend=HTMLDocumentBackend, filename="test", ) backend = HTMLDocumentBackend( in_doc=in_doc, path_or_stream=BytesIO(raw_html), ) doc: DoclingDocument = backend.convert() assert doc.texts[0].text == "Hello World!" def test_extract_parent_hyperlinks(): html_path = Path("./tests/data/html/hyperlink_04.html") in_doc = InputDocument( path_or_stream=html_path, format=InputFormat.HTML, backend=HTMLDocumentBackend, filename="test", ) backend = HTMLDocumentBackend( in_doc=in_doc, path_or_stream=html_path, ) div_tag = backend.soup.find("div") a_tag = backend.soup.find("a") annotated_text_list = backend._extract_text_and_hyperlink_recursively( div_tag, find_parent_annotation=True ) assert str(annotated_text_list[0].hyperlink) == a_tag.get("href") @pytest.fixture(scope="module") def html_paths() -> list[Path]: # Define the directory you want to search directory = Path("./tests/data/html/") # List all HTML files in the directory and its subdirectories html_files = sorted(directory.rglob("*.html")) return html_files def get_converter(): converter = DocumentConverter(allowed_formats=[InputFormat.HTML]) return converter def test_e2e_html_conversions(html_paths): converter = get_converter() for html_path in html_paths: gt_path = ( html_path.parent.parent / "groundtruth" / "docling_v2" / html_path.name ) conv_result: ConversionResult = converter.convert(html_path) doc: DoclingDocument = conv_result.document pred_md: str = doc.export_to_markdown() assert verify_export(pred_md, str(gt_path) + ".md", generate=GENERATE), ( "export to md" ) pred_itxt: str = doc._export_to_indented_text( max_text_len=70, explicit_tables=False ) assert verify_export(pred_itxt, str(gt_path) + ".itxt", generate=GENERATE), ( "export to indented-text" ) assert verify_document(doc, str(gt_path) + ".json", GENERATE) @patch("docling.backend.html_backend.requests.get") @patch("docling.backend.html_backend.open", new_callable=mock_open) def test_e2e_html_conversion_with_images(mock_local, mock_remote): source = "tests/data/html/example_01.html" image_path = "tests/data/html/example_image_01.png" with open(image_path, "rb") as f: img_bytes = f.read() # fetching image locally mock_local.return_value.__enter__.return_value = BytesIO(img_bytes) backend_options = HTMLBackendOptions( enable_local_fetch=True, fetch_images=True, source_uri=source ) converter = DocumentConverter( allowed_formats=[InputFormat.HTML], format_options={ InputFormat.HTML: HTMLFormatOption(backend_options=backend_options) }, ) res_local = converter.convert(source) mock_local.assert_called_once() assert res_local.document num_pic: int = 0 for element, _ in res_local.document.iterate_items(): if isinstance(element, PictureItem): assert element.image num_pic += 1 assert num_pic == 1, "No embedded picture was found in the converted file" # fetching image remotely - need to mock Session.get instead of requests.get with patch( "docling.backend.html_backend.requests.Session.get" ) as mocked_session_get: mock_resp = Mock() mock_resp.status_code = 200 mock_resp.headers = {} mock_resp.raise_for_status = Mock() mock_resp.iter_content = Mock(return_value=[img_bytes]) mock_resp.is_redirect = False mock_resp.is_permanent_redirect = False mocked_session_get.return_value = mock_resp source_location = "https://example.com/example_01.html" backend_options = HTMLBackendOptions( enable_remote_fetch=True, fetch_images=True, source_uri=source_location ) converter = DocumentConverter( allowed_formats=[InputFormat.HTML], format_options={ InputFormat.HTML: HTMLFormatOption(backend_options=backend_options) }, ) res_remote = converter.convert(source) # Verify the session.get was called assert mocked_session_get.call_count == 1 call_args = mocked_session_get.call_args assert call_args[0][0] == "https://example.com/example_image_01.png" assert call_args[1]["stream"] is True assert call_args[1]["headers"] == {"Range": "bytes=0-20971519"} assert call_args[1]["timeout"] == (5, 30) assert res_remote.document num_pic = 0 for element, _ in res_remote.document.iterate_items(): if isinstance(element, PictureItem): assert element.image assert element.image.mimetype == "image/png" num_pic += 1 assert num_pic == 1, "No embedded picture was found in the converted file" # both methods should generate the same DoclingDocument assert res_remote.document == res_local.document # checking exported formats gt_path = ( "tests/data/groundtruth/docling_v2/" + str(Path(source).stem) + "_images.html" ) pred_md: str = res_local.document.export_to_markdown() assert verify_export(pred_md, gt_path + ".md", generate=GENERATE) assert verify_document(res_local.document, gt_path + ".json", GENERATE) def test_html_furniture(): raw_html = ( b"

Initial content with some bold text

" b"

Main Heading

" b"

Some Content

" b"
KeyValue
Section From TR Rich Section From TR
Section From TD Normal Cell
""" in_doc = InputDocument( path_or_stream=BytesIO(raw_html), format=InputFormat.HTML, backend=HTMLDocumentBackend, filename="test_row_section.html", ) backend = HTMLDocumentBackend( in_doc=in_doc, path_or_stream=BytesIO(raw_html), ) doc: DoclingDocument = backend.convert() cells = doc.tables[0].data.table_cells cells_by_text = {cell.text: cell for cell in cells} assert cells_by_text["Section From TR"].row_section is True assert cells_by_text["Section From TD"].row_section is True assert cells_by_text["Normal Cell"].row_section is False rich_section_cell = cells_by_text["Rich Section From TR"] assert isinstance(rich_section_cell, RichTableCell) assert rich_section_cell.row_section is True data_fix_par = [ ( "

Text

Heading

More text

", "

Text

Heading

More text

", ), ( "

Some text

A heading

More text

", "

Some text

A heading

More text

", ), ( "

Some text

A heading

Italics

", "

Some text

A heading

Italics

", ), ( "

Some text

Another paragraph

More text

", "

Some text

Another paragraph

More text

", ), ( "

" "
NameAge
Alice29
Bob34

", "" "
NameAge
Alice29
Bob34
", ), ] @pytest.mark.parametrize("html,expected", data_fix_par) def test_fix_invalid_paragraph_structure(html, expected): """Test the function _fix_invalid_paragraph_structure.""" soup = BeautifulSoup(html, "html.parser") HTMLDocumentBackend._fix_invalid_paragraph_structure(soup) assert str(soup) == expected def test_e2e_inline_group_in_table_cell(html_paths): """Regression: InlineGroup in table cell must not cause content duplication.""" name = "html_inline_group_in_table_cell.html" path = next(item for item in html_paths if item.name == name) converter = DocumentConverter() result = converter.convert(path) assert result.document is not None md = result.document.export_to_markdown() assert isinstance(md, str) assert len(md) > 0 assert "Page A" in md assert "Page B" in md assert md.count("Page A") == 1 assert md.count("Page B") == 1 def _build_large_rich_table_html( num_tables: int = 10, rows_per_table: int = 20 ) -> bytes: """Build a synthetic HTML page with many tables whose cells have multiple hyperlinks.""" parts = [""] for t in range(num_tables): parts.append( f"

Table {t}

" ) for r in range(rows_per_table): cell_a = ( f"" ) cell_b = ( f"" ) parts.append(f"{cell_a}{cell_b}") parts.append("
NameLinks

" f'Link {t}-{r}-0, ' f'Link {t}-{r}-1, ' f'Link {t}-{r}-2' f"

" f'B-Link {t}-{r}-0 and ' f'B-Link {t}-{r}-1' f"

") parts.append("") return "\n".join(parts).encode() def test_e2e_rich_table_oom_regression(): """Regression: orphaned InlineGroups must not cause OOM on pages with many rich cells.""" num_tables, rows_per_table = 30, 20 html_bytes = _build_large_rich_table_html( num_tables=num_tables, rows_per_table=rows_per_table ) in_doc = InputDocument( path_or_stream=BytesIO(html_bytes), format=InputFormat.HTML, backend=HTMLDocumentBackend, filename="rich_table_oom_test.html", ) backend = HTMLDocumentBackend( in_doc=in_doc, path_or_stream=BytesIO(html_bytes), ) doc: DoclingDocument = backend.convert() assert doc is not None, "Conversion returned None" result: list[str] = [] def _run() -> None: result.append(doc.export_to_markdown()) t = threading.Thread(target=_run, daemon=True) t0 = time.monotonic() t.start() t.join(timeout=15.0) elapsed = time.monotonic() - t0 assert not t.is_alive(), ( f"export_to_markdown() hung after {elapsed:.1f}s on rich table cells." ) assert result, "export_to_markdown() produced no output" md = result[0] assert isinstance(md, str) and len(md) > 0 max_expected_chars = num_tables * rows_per_table * 2 * 128 * 3 assert len(md) <= max_expected_chars, ( f"Markdown output is suspiciously large ({len(md):,} chars > {max_expected_chars:,})." ) def _build_nested_clade_html(depth: int) -> bytes: """Build nested-table HTML with one per level, mirroring Wikipedia cladograms.""" def _inner(lvl: int) -> str: img = f'' if lvl == depth - 1: return f"
{img}
" return f"
{img}{_inner(lvl + 1)}
" return f"

Cladogram

{_inner(0)}".encode() def test_nested_table_images_no_quadratic_pictures(): """Regression: nested tables must produce exactly one PictureItem per .""" DEPTH = 15 html_bytes = _build_nested_clade_html(DEPTH) from bs4 import BeautifulSoup as _BS soup = _BS(html_bytes, "html.parser") num_img_tags = len(soup.find_all("img")) assert num_img_tags == DEPTH, "fixture sanity check" in_doc = InputDocument( path_or_stream=BytesIO(html_bytes), format=InputFormat.HTML, backend=HTMLDocumentBackend, filename="nested_clade_imgs.html", ) backend = HTMLDocumentBackend( in_doc=in_doc, path_or_stream=BytesIO(html_bytes), ) doc: DoclingDocument = backend.convert() num_pictures = sum( 1 for item, _ in doc.iterate_items() if isinstance(item, PictureItem) ) assert num_pictures == DEPTH, ( f"Expected {DEPTH} PictureItems (one per ), got {num_pictures}." ) t0 = time.time() md = doc.export_to_markdown() elapsed = time.time() - t0 assert isinstance(md, str) and len(md) > 0 assert elapsed < 5.0, f"export_to_markdown() took {elapsed:.2f}s; should be < 5s" def test_validate_url_safety_rejects_private_ips(): """Test that private and restricted IP addresses are rejected.""" with pytest.raises(ValueError, match="Access to restricted IP address"): _validate_url_safety("http://127.0.0.1/file") with pytest.raises(ValueError, match="Access to restricted IP address"): _validate_url_safety("http://10.0.0.1/file") with pytest.raises(ValueError, match="Access to restricted IP address"): _validate_url_safety("http://192.168.1.1/file") with pytest.raises(ValueError, match="Access to restricted IP address"): _validate_url_safety("http://172.16.0.1/file") with pytest.raises(ValueError, match="Access to restricted IP address"): _validate_url_safety("http://169.254.169.254/metadata") def test_load_image_data_enforces_size_limit(monkeypatch): """Test that image downloads are capped at the size limit.""" class MockResponse: def __init__(self, content_size): self.status_code = 200 self.headers = {"content-length": str(content_size)} self._content_size = content_size def raise_for_status(self): pass def iter_content(self, chunk_size=8192): remaining = self._content_size while remaining > 0: chunk_len = min(chunk_size, remaining) yield b"x" * chunk_len remaining -= chunk_len html_path = Path("./tests/data/html/example_01.html") in_doc = InputDocument( path_or_stream=html_path, format=InputFormat.HTML, backend=HTMLDocumentBackend, filename="test", ) backend = HTMLDocumentBackend( in_doc=in_doc, path_or_stream=html_path, options=HTMLBackendOptions(enable_remote_fetch=True), ) oversized_response = MockResponse(25 * 1024 * 1024) # 25 MB, exceeds 20 MB limit monkeypatch.setattr( requests.Session, "get", lambda *args, **kwargs: oversized_response ) with pytest.raises(ValueError, match="Resource size exceeds limit"): backend._load_image_data("http://example.com/huge_image.png") def test_load_image_data_enforces_data_uri_size_limit(): """Test that base64 data URIs are capped at the size limit.""" html_path = Path("./tests/data/html/example_01.html") in_doc = InputDocument( path_or_stream=html_path, format=InputFormat.HTML, backend=HTMLDocumentBackend, filename="test", ) backend = HTMLDocumentBackend( in_doc=in_doc, path_or_stream=html_path, options=HTMLBackendOptions(), ) oversized_data = b"x" * (21 * 1024 * 1024) encoded = base64.b64encode(oversized_data).decode() data_uri = f"data:image/png;base64,{encoded}" with pytest.raises(ValueError, match="exceeds size limit"): backend._load_image_data(data_uri) def test_anchor_fragment_links_with_source_uri(): """Fragment-only hrefs must not be mangled when source_uri is set.""" html_path = Path("tests/data/html/hyperlink_06.html") in_doc = InputDocument( path_or_stream=html_path, format=InputFormat.HTML, backend=HTMLDocumentBackend, filename="test", ) backend = HTMLDocumentBackend( in_doc=in_doc, path_or_stream=html_path, options=HTMLBackendOptions(source_uri=PurePath(str(html_path.resolve()))), ) doc = backend.convert() md = doc.export_to_markdown() # Fragment links preserved assert "[Section 2](#section-2)" in md assert "[top link](#)" in md # External links still work (regression check) assert ( "[Example](https://example.com)" in md or "[Example](https://example.com/)" in md ) def test_path_traversal_blocked_in_resolve_relative_path(): """Test that path traversal attempts are blocked.""" html_path = Path("./tests/data/html/example_01.html") options = HTMLBackendOptions(enable_local_fetch=True, fetch_images=True) in_doc = InputDocument( path_or_stream=html_path, format=InputFormat.HTML, backend=HTMLDocumentBackend, filename="test", ) html_doc = HTMLDocumentBackend( path_or_stream=html_path, in_doc=in_doc, options=options ) html_doc.base_path = "/tmp/docs/report.html" # Path traversal with ../ blocked with pytest.raises(ValueError, match="Path traversal blocked"): html_doc._resolve_relative_path("../../../../../../../etc/something") with pytest.raises(ValueError, match="Path traversal blocked"): html_doc._resolve_relative_path("subdir/../../../../../../etc/something") # Valid relative paths work result = html_doc._resolve_relative_path("images/photo.png") assert "/tmp/docs/images/photo.png" in result assert "etc" not in result # Absolute paths blocked with local base_path with pytest.raises( ValueError, match="Absolute paths are not allowed with local base_path" ): html_doc._resolve_relative_path("/absolute/path/to/file.html") # file:// URIs blocked with pytest.raises( ValueError, match="Absolute paths are not allowed with local base_path" ): html_doc._resolve_relative_path("file:///etc/something") # Windows absolute paths blocked with local base_path (forward slashes) with pytest.raises( ValueError, match="Absolute paths are not allowed with local base_path" ): html_doc._resolve_relative_path("C:/Windows/System32/config/sam") with pytest.raises( ValueError, match="Absolute paths are not allowed with local base_path" ): html_doc._resolve_relative_path("D:/sensitive/data.txt") # Windows absolute paths with backslashes (native Windows separator) with pytest.raises( ValueError, match="Absolute paths are not allowed with local base_path" ): html_doc._resolve_relative_path(r"C:\Windows\System32\config\sam") with pytest.raises( ValueError, match="Absolute paths are not allowed with local base_path" ): html_doc._resolve_relative_path(r"D:\Users\Foo\Documents\something.txt") # Hypothetical single-letter URI schemes (c://, z://) should be rejected as URIs with pytest.raises(ValueError, match="Invalid base_path format"): html_doc.base_path = "c://example.com/path" html_doc._resolve_relative_path("image.png") # Reset base_path for remaining tests html_doc.base_path = "/tmp/docs/report.html" # Filesystem access blocked when base_path is None html_doc.base_path = None # Paths pass through unchanged for hyperlinks assert ( html_doc._resolve_relative_path("../../../etc/something") == "../../../etc/something" ) assert html_doc._resolve_relative_path("/etc/something") == "/etc/something" assert html_doc._resolve_relative_path("image.png") == "image.png" # But file access is blocked with pytest.raises( OperationNotAllowed, match="Local file access requires base_path" ): html_doc._load_image_data("../../../etc/something") with pytest.raises( OperationNotAllowed, match="Local file access requires base_path" ): html_doc._load_image_data("/etc/something") with pytest.raises( OperationNotAllowed, match="Local file access requires base_path" ): html_doc._load_image_data("image.png") def test_valid_local_paths_still_work(): """Test that valid paths within the base directory still work.""" html_path = Path("./tests/data/html/example_01.html").resolve() options = HTMLBackendOptions(enable_local_fetch=True, fetch_images=True) in_doc = InputDocument( path_or_stream=html_path, format=InputFormat.HTML, backend=HTMLDocumentBackend, filename="test", ) html_doc = HTMLDocumentBackend( path_or_stream=html_path, in_doc=in_doc, options=options ) html_doc.base_path = str(html_path) resolved = html_doc._resolve_relative_path("example_image_01.png") assert "tests/data/html" in resolved assert "example_image_01.png" in resolved