Files
docling-core/test/test_profiler.py
Cesar Berrospi Ramis af50f1cb07 feat: profile a document or collection (#511)
* feat: profile a document or collection

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

* chore(profiler): add deciles and histograms

Add deciles and histograms to the Docling collection statistics.
Add an example script to plot histograms.

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

* chore(profiler): add option to plot log frequencies in histogram

Add the option to plot the histogram frequencies in logarithmic scale.
Extend README with documentation on the document profiler.

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

* test(profiler): cover missing lines in doc_profiler with tests

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

---------

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
2026-03-13 13:36:38 +01:00

402 lines
14 KiB
Python

"""Tests for document profiler."""
import json
from pathlib import Path
import pytest
from docling_core.transforms.profiler import DocumentProfiler
from docling_core.types.doc import BoundingBox, DoclingDocument, ProvenanceItem
from docling_core.types.doc.document import DocumentOrigin, PageItem, Size, TableData
from docling_core.types.doc.labels import DocItemLabel
def test_profile_empty_document():
    """Profiling a document without pages or items yields all-zero statistics."""
    empty_doc = DoclingDocument(name="Empty Document")

    result = DocumentProfiler.profile_document(empty_doc)

    assert result.name == "Empty Document"
    # No origin was supplied, so no MIME type can be reported.
    assert result.origin_mimetype is None

    # Every integer counter must be zero for an empty document.
    zero_counters = (
        "num_pages",
        "num_tables",
        "num_pictures",
        "num_texts",
        "num_key_value_items",
        "num_form_items",
        "total_items",
    )
    for counter in zero_counters:
        assert getattr(result, counter) == 0, counter

    assert result.avg_items_per_page == 0.0


def test_profile_simple_document():
    """Profile a small document holding pages, texts, one table and one picture."""
    pdf_origin = DocumentOrigin(
        mimetype="application/pdf",
        binary_hash=12345,
        filename="test.pdf",
    )
    simple_doc = DoclingDocument(name="Simple Document", origin=pdf_origin)

    # Two pages of identical size.
    for page_no in (1, 2):
        simple_doc.pages[page_no] = PageItem(
            page_no=page_no, size=Size(width=612, height=792)
        )

    # Three text items; the last one is labelled as a section header.
    text_items = (
        (DocItemLabel.TEXT, "Text 1"),
        (DocItemLabel.TEXT, "Text 2"),
        (DocItemLabel.SECTION_HEADER, "Section"),
    )
    for label, content in text_items:
        simple_doc.add_text(label=label, text=content, orig=content)

    # One table and one picture complete the five items.
    simple_doc.add_table(data=TableData(num_rows=2, num_cols=2))
    simple_doc.add_picture()

    result = DocumentProfiler.profile_document(simple_doc)

    assert result.name == "Simple Document"
    assert result.origin_mimetype == "application/pdf"
    assert result.num_pages == 2
    assert result.num_texts == 3
    assert result.num_section_headers == 1
    assert result.num_tables == 1
    assert result.num_pictures == 1
    assert result.total_items == 5
    # 5 items spread over 2 pages.
    assert result.avg_items_per_page == 2.5


def test_profile_document_with_pictures_for_ocr():
    """Count the pictures whose page coverage makes them OCR candidates.

    A single 1000 x 1000 page receives three square pictures anchored at the
    origin. Assuming coverage is bounding-box area divided by page area, the
    squares cover roughly 9.998%, 1.999% and 5.00001% of the page.
    """
    ocr_doc = DoclingDocument(name="Document with Pictures for OCR")
    ocr_doc.pages[1] = PageItem(page_no=1, size=Size(width=1000, height=1000))

    def place_square_picture(side):
        """Add a picture on page 1 whose bounding box is a side x side square."""
        box = BoundingBox(l=0, t=0, r=side, b=side)
        ocr_doc.add_picture(
            prov=ProvenanceItem(page_no=1, bbox=box, charspan=(0, 0))
        )

    place_square_picture(316.2)  # large: 316.2**2 = 99982.44, just under 10%
    place_square_picture(141.4)  # small: 141.4**2 = 19993.96, just under 2%
    place_square_picture(223.607)  # medium: 223.607**2 ~ 50000.09, just over 5%

    # Default threshold: the large and the medium picture are expected to
    # qualify, the small one is not.
    default_stats = DocumentProfiler.profile_document(ocr_doc)
    assert default_stats.num_pictures == 3
    assert default_stats.num_pictures_for_ocr == 2

    # 10% threshold: even the large picture (~9.998%) stays below it.
    strict_stats = DocumentProfiler.profile_document(
        ocr_doc, bitmap_coverage_threshold=0.10
    )
    assert strict_stats.num_pictures_for_ocr == 0

    # 2% threshold: large and medium qualify, the small picture (~1.999%)
    # still falls short.
    lenient_stats = DocumentProfiler.profile_document(
        ocr_doc, bitmap_coverage_threshold=0.02
    )
    assert lenient_stats.num_pictures_for_ocr == 2


def test_profile_collection_empty():
    """Profiling an empty list of documents yields zeroed aggregate statistics."""
    result = DocumentProfiler.profile_collection([])

    # Totals and averages.
    for total in ("num_documents", "total_pages", "total_tables", "total_pictures"):
        assert getattr(result, total) == 0, total
    assert result.avg_items_per_document == 0.0
    assert result.avg_items_per_page == 0.0

    # Nine deciles (d1..d9), all zero.
    no_deciles = [0.0] * 9
    assert result.deciles_pages == no_deciles
    assert result.deciles_tables == no_deciles

    # The histogram carries no bins at all.
    page_histogram = result.histogram_pages
    assert page_histogram.bins == []
    assert page_histogram.frequencies == []
    assert page_histogram.bin_width == 0.0


def test_profile_collection_single_document():
    """Profile a collection that consists of exactly one document."""
    only_doc = DoclingDocument(name="Single Doc")
    only_doc.pages[1] = PageItem(page_no=1, size=Size(width=612, height=792))
    only_doc.add_text(label=DocItemLabel.PARAGRAPH, text="Text", orig="Text")
    only_doc.add_table(data=TableData(num_rows=1, num_cols=1))
    only_doc.add_picture()

    # The document is handed over directly, not wrapped in a list.
    result = DocumentProfiler.profile_collection(only_doc)

    assert result.num_documents == 1
    for total in ("total_pages", "total_tables", "total_pictures", "total_texts"):
        assert getattr(result, total) == 1, total

    # With a single sample, min, max, median and mean coincide and the
    # spread is zero.
    assert result.min_pages == 1
    assert result.max_pages == 1
    assert result.deciles_pages[4] == 1.0  # index 4 holds d5, the median
    assert result.mean_pages == 1.0
    assert result.std_pages == 0.0

    # A histogram must have been produced.
    page_histogram = result.histogram_pages
    assert len(page_histogram.bins) > 0
    assert len(page_histogram.frequencies) > 0


def test_profile_collection_multiple_documents():
    """Profile three documents and verify the aggregated statistics.

    Per-document content:
      Doc1: 2 pages, 1 table, 2 pictures, 2 texts (PDF)
      Doc2: 5 pages, 3 tables, 1 picture, 10 texts (PDF)
      Doc3: 1 page, 0 tables, 5 pictures, 2 texts (HTML)
    """

    def build_doc(
        name, mimetype, binary_hash, filename, page_count, table_count, picture_count, texts
    ):
        """Create a document with the requested pages, tables, pictures and texts."""
        document = DoclingDocument(
            name=name,
            origin=DocumentOrigin(
                mimetype=mimetype, binary_hash=binary_hash, filename=filename
            ),
        )
        for page_no in range(1, page_count + 1):
            document.pages[page_no] = PageItem(
                page_no=page_no, size=Size(width=612, height=792)
            )
        for _ in range(table_count):
            document.add_table(data=TableData(num_rows=1, num_cols=1))
        for _ in range(picture_count):
            document.add_picture()
        for content in texts:
            document.add_text(label=DocItemLabel.TEXT, text=content, orig=content)
        return document

    collection = [
        build_doc(
            name="Doc1",
            mimetype="application/pdf",
            binary_hash=1,
            filename="doc1.pdf",
            page_count=2,
            table_count=1,
            picture_count=2,
            texts=["Text 1", "Text 2"],
        ),
        build_doc(
            name="Doc2",
            mimetype="application/pdf",
            binary_hash=2,
            filename="doc2.pdf",
            page_count=5,
            table_count=3,
            picture_count=1,
            texts=[f"Text {i}" for i in range(10)],
        ),
        build_doc(
            name="Doc3",
            mimetype="text/html",
            binary_hash=3,
            filename="doc3.html",
            page_count=1,
            table_count=0,
            picture_count=5,
            texts=["T1", "T2"],
        ),
    ]

    summary = DocumentProfiler.profile_collection(
        collection, include_individual_stats=True
    )

    # Totals across the collection.
    assert summary.num_documents == 3
    assert summary.total_pages == 8  # 2 + 5 + 1
    assert summary.total_tables == 4  # 1 + 3 + 0
    assert summary.total_pictures == 8  # 2 + 1 + 5
    assert summary.total_texts == 14  # 2 + 10 + 2

    # Pages: samples are [2, 5, 1].
    assert summary.min_pages == 1
    assert summary.max_pages == 5
    assert summary.deciles_pages[4] == 2.0  # index 4 holds d5, the median
    assert summary.mean_pages == pytest.approx(8 / 3)
    assert summary.std_pages > 0
    # Deciles are stored as [d1, ..., d9] and must be non-decreasing.
    page_deciles = summary.deciles_pages
    assert page_deciles[0] <= page_deciles[4] <= page_deciles[8]
    assert len(summary.histogram_pages.bins) > 0
    assert len(summary.histogram_pages.frequencies) > 0

    # Tables: samples are [1, 3, 0].
    assert summary.min_tables == 0
    assert summary.max_tables == 3
    assert summary.deciles_tables[4] == 1.0
    assert summary.mean_tables == pytest.approx(4 / 3)
    assert len(summary.histogram_tables.bins) > 0

    # Pictures: samples are [2, 1, 5].
    assert summary.min_pictures == 1
    assert summary.max_pictures == 5
    assert summary.deciles_pictures[4] == 2.0
    assert summary.mean_pictures == pytest.approx(8 / 3)
    assert len(summary.histogram_pictures.bins) > 0

    # Texts: samples are [2, 10, 2].
    assert summary.min_texts == 2
    assert summary.max_texts == 10
    assert summary.deciles_texts[4] == 2.0
    assert summary.mean_texts == pytest.approx(14 / 3)
    assert len(summary.histogram_texts.bins) > 0

    # Individual statistics were requested, one entry per document.
    assert len(summary.document_stats) == 3

    # MIME type distribution.
    mimetypes = summary.mimetype_distribution
    assert mimetypes["application/pdf"] == 2
    assert mimetypes["text/html"] == 1

    # Derived values: 14 texts + 4 tables + 8 pictures = 26 items.
    assert summary.total_items == 26
    assert summary.avg_items_per_document == pytest.approx(26 / 3)
    assert summary.avg_items_per_page == pytest.approx(26 / 8)


def test_profile_collection_with_iterator():
    """Profile a collection that is supplied lazily through a generator."""

    def make_doc(index):
        """Build a one-page document holding a single text item."""
        document = DoclingDocument(name=f"Doc{index}")
        document.pages[1] = PageItem(page_no=1, size=Size(width=612, height=792))
        document.add_text(
            label=DocItemLabel.TEXT, text=f"Text {index}", orig=f"Text {index}"
        )
        return document

    # A generator expression: documents are only created while being consumed.
    lazy_docs = (make_doc(index) for index in range(3))

    result = DocumentProfiler.profile_collection(lazy_docs)

    assert result.num_documents == 3
    assert result.total_pages == 3
    assert result.total_texts == 3


def test_profile_collection_without_individual_stats():
    """Per-document statistics are omitted when include_individual_stats is False."""
    empty_docs = []
    for index in range(3):
        empty_docs.append(DoclingDocument(name=f"Doc{index}"))

    result = DocumentProfiler.profile_collection(
        empty_docs, include_individual_stats=False
    )

    assert len(result.document_stats) == 0


def test_statistics_serialization():
    """Document and collection statistics both round-trip through JSON."""
    source_doc = DoclingDocument(name="Test Doc")
    source_doc.pages[1] = PageItem(page_no=1, size=Size(width=612, height=792))
    source_doc.add_text(label=DocItemLabel.TEXT, text="Text", orig="Text")

    # Statistics of a single document.
    document_stats = DocumentProfiler.profile_document(source_doc)
    document_payload = json.loads(document_stats.model_dump_json())
    assert document_payload["name"] == "Test Doc"
    assert document_payload["num_pages"] == 1
    assert document_payload["total_items"] == 1

    # Statistics of a one-document collection.
    collection_stats = DocumentProfiler.profile_collection([source_doc])
    collection_payload = json.loads(collection_stats.model_dump_json())
    assert collection_payload["num_documents"] == 1
    assert collection_payload["total_pages"] == 1


def test_profile_real_document():
    """Profile a real document loaded from the repository's test data.

    The data file is looked up relative to this test module first, so the test
    is not silently skipped when pytest is started from a directory other than
    the repository root. The original working-directory-relative location is
    kept as a fallback. The test is skipped only if neither location exists.
    """
    filename = "2408.09869v3_enriched.json"
    candidates = (
        # Anchored to this file: independent of the current working directory.
        Path(__file__).resolve().parent / "data" / "doc" / filename,
        # Original lookup, relative to the current working directory.
        Path("./test/data/doc") / filename,
    )
    test_file = next((path for path in candidates if path.exists()), None)
    if test_file is None:
        pytest.skip("Test file not found")

    doc = DoclingDocument.load_from_json(test_file)
    stats = DocumentProfiler.profile_document(doc)

    # Basic sanity checks: the statistics must mirror the loaded document.
    assert stats.name == doc.name
    assert stats.num_pages == len(doc.pages)
    assert stats.num_tables == len(doc.tables)
    assert stats.num_pictures == len(doc.pictures)
    assert stats.num_texts == len(doc.texts)
    assert stats.total_items > 0


def test_label_specific_counts():
    """Text items are counted separately for each label of interest."""
    labelled_doc = DoclingDocument(name="Label Test")

    # Seven text items: 1 header, 3 list items, 1 code, 1 formula, 1 plain text.
    labelled_texts = [
        (DocItemLabel.SECTION_HEADER, "Section"),
        (DocItemLabel.LIST_ITEM, "Item 1"),
        (DocItemLabel.LIST_ITEM, "Item 2"),
        (DocItemLabel.LIST_ITEM, "Item 3"),
        (DocItemLabel.CODE, "code"),
        (DocItemLabel.FORMULA, "x=y"),
        (DocItemLabel.TEXT, "Text"),
    ]
    for label, content in labelled_texts:
        labelled_doc.add_text(label=label, text=content, orig=content)

    result = DocumentProfiler.profile_document(labelled_doc)

    assert result.num_section_headers == 1
    assert result.num_list_items == 3
    assert result.num_code_items == 1
    assert result.num_formulas == 1
    # All seven items count as texts, whatever their label.
    assert result.num_texts == 7


def test_profile_sample_document(sample_doc):
    """Profile the ``sample_doc`` fixture and cross-check it against the document."""
    profile = DocumentProfiler.profile_document(sample_doc)

    # Basic properties; the fixture is expected to define no pages.
    assert profile.name == "Untitled 1"
    assert profile.num_pages == 0

    # Item counts must match the lengths of the document's own containers.
    expected_counts = {
        "num_texts": len(sample_doc.texts),
        "num_tables": len(sample_doc.tables),
        "num_pictures": len(sample_doc.pictures),
        "num_key_value_items": len(sample_doc.key_value_items),
        "num_form_items": len(sample_doc.form_items),
    }
    for field, expected in expected_counts.items():
        assert getattr(profile, field) == expected, field

    # The fixture is expected to contain at least one item of each of these
    # labels.
    for field in (
        "num_section_headers",
        "num_list_items",
        "num_code_items",
        "num_formulas",
    ):
        assert getattr(profile, field) > 0, field

    # total_items is the sum of the five item counters.
    assert profile.total_items > 0
    assert profile.total_items == sum(
        getattr(profile, field) for field in expected_counts
    )

    # Without pages the per-page average is reported as zero.
    assert profile.avg_items_per_page == 0.0


def test_calculate_deciles_empty():
    """``_calculate_deciles`` returns nine zeros when given no data."""
    deciles = DocumentProfiler._calculate_deciles([])

    assert deciles == [0.0] * 9


def test_calculate_histogram_empty():
    """``_calculate_histogram`` returns an empty histogram when given no data."""
    histogram = DocumentProfiler._calculate_histogram([])

    assert histogram.bins == []
    assert histogram.frequencies == []
    assert histogram.bin_width == 0.0