feat: profile a document or collection (#511)

* feat: profile a document or collection

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

* chore(profiler): add deciles and histograms

Add deciles and histograms to the Docling collection statistics.
Add an example script to plot histograms.

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

* chore(profiler): add option to plot log frequencies in histogram

Add the option to plot the histogram frequencies in logarithmic scale.
Extend README with documentation on the document profiler.

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

* test(profiler): cover missing lines in doc_profiler with tests

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

---------

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
This commit is contained in:
Cesar Berrospi Ramis
2026-03-13 13:36:38 +01:00
committed by GitHub
parent b435090fdf
commit af50f1cb07
7 changed files with 1680 additions and 0 deletions
+11
View File
@@ -72,6 +72,17 @@ different use cases.
- [Hybrid chunking example](https://docling-project.github.io/docling/examples/hybrid_chunking/)
- [Advanced chunking and serialization](https://docling-project.github.io/docling/examples/advanced_chunking_and_serialization/)
### Profiling
The Profiling API enables extraction of comprehensive statistics from DoclingDocument objects,
both for individual documents and collections. It provides metrics on document structure
(pages, tables, pictures, text items) along with statistical distributions (deciles, histograms)
and visualization capabilities for analyzing document collections at scale.
👉 More details:
- [Document profiling example](./examples/document_profiling.py)
- [Collection statistics visualization](./examples/visualize_collection_stats.py)
## Contributing
Please read [Contributing to Docling Core](./CONTRIBUTING.md) for details.
@@ -0,0 +1,17 @@
"""Document profiling and statistics module."""
from docling_core.transforms.profiler.doc_profiler import (
CollectionStats,
DecilesT,
DocumentProfiler,
DocumentStats,
Histogram,
)
__all__ = [
"CollectionStats",
"DecilesT",
"DocumentProfiler",
"DocumentStats",
"Histogram",
]
@@ -0,0 +1,425 @@
"""Document profiler for extracting statistics from DoclingDocument objects."""
import statistics
from collections.abc import Iterable
from typing import Annotated
import numpy as np
from annotated_types import Len
from pydantic import BaseModel, Field, computed_field
from typing_extensions import TypeAliasType
from docling_core.types.doc import DoclingDocument
from docling_core.types.doc.labels import DocItemLabel
# Named alias (rather than a bare Annotated) so the nine-element constraint is
# declared once and reused by every `deciles_*` field of CollectionStats.
# Len(min_length=9, max_length=9) pins the list to exactly nine entries.
DecilesT = TypeAliasType("DecilesT", Annotated[list[float], Len(max_length=9, min_length=9)])
"""Type alias for deciles: list of 9 floats representing 1st through 9th deciles (10th, 20th, ..., 90th percentiles)."""
class Histogram(BaseModel):
    """Histogram representation with bins and frequencies."""

    # `bins` stores edges, not centers. When filled by
    # DocumentProfiler._calculate_histogram (np.histogram) it therefore holds
    # one more entry than `frequencies`.
    bins: Annotated[list[float], Field(description="Histogram bin edges")] = []
    frequencies: Annotated[list[int], Field(description="Frequency count for each bin")] = []
    # Distance between the first two edges; remains 0.0 for an empty histogram.
    bin_width: Annotated[float, Field(description="Width of each bin")] = 0.0
class DocumentStats(BaseModel):
    """Statistics for a single DoclingDocument."""

    # Only `name` is required; every counter defaults to zero.
    name: Annotated[str, Field(description="Document name")]
    num_pages: Annotated[int, Field(description="Number of pages in the document")] = 0
    num_tables: Annotated[int, Field(description="Number of tables in the document")] = 0
    num_pictures: Annotated[int, Field(description="Number of pictures in the document")] = 0
    num_texts: Annotated[int, Field(description="Number of text items in the document")] = 0
    num_key_value_items: Annotated[int, Field(description="Number of key-value items in the document")] = 0
    num_form_items: Annotated[int, Field(description="Number of form items in the document")] = 0
    # Label-specific counts
    # DocumentProfiler.profile_document tallies these from `doc.texts`, so they
    # are subsets of `num_texts`, not additional items.
    num_section_headers: Annotated[int, Field(description="Number of section headers")] = 0
    num_list_items: Annotated[int, Field(description="Number of list items")] = 0
    num_code_items: Annotated[int, Field(description="Number of code items")] = 0
    num_formulas: Annotated[int, Field(description="Number of formula items")] = 0
    # Document characteristics
    origin_mimetype: Annotated[str | None, Field(description="Origin MIME type if available")] = None
    num_pictures_for_ocr: Annotated[
        int,
        Field(description="Number of pictures that would trigger OCR based on area coverage threshold"),
    ] = 0

    # NOTE: the docstrings of the computed properties below are left untouched on
    # purpose; pydantic may surface them as field descriptions in the schema.
    @computed_field  # type: ignore[prop-decorator]
    @property
    def total_items(self) -> int:
        """Total number of items in the document."""
        # Label-specific counts are deliberately not added here (see above).
        return self.num_texts + self.num_tables + self.num_pictures + self.num_key_value_items + self.num_form_items

    @computed_field  # type: ignore[prop-decorator]
    @property
    def avg_items_per_page(self) -> float:
        """Average number of items per page."""
        # Guard against division by zero for documents without pages.
        if self.num_pages == 0:
            return 0.0
        return self.total_items / self.num_pages
class CollectionStats(BaseModel):
    """Statistics for a collection of DoclingDocument objects."""

    # Every field has a default, so `CollectionStats()` is a valid "empty
    # collection" result (this is what profile_collection returns for no input).
    num_documents: Annotated[int, Field(description="Total number of documents in the collection")] = 0
    # Page statistics
    # For each metric group: `deciles_*` holds nine values (d1..d9); `std_*` is
    # computed by profile_collection with statistics.stdev (sample standard
    # deviation) and is 0.0 when there are fewer than two documents.
    total_pages: Annotated[int, Field(description="Total number of pages across all documents")] = 0
    min_pages: Annotated[int, Field(description="Minimum number of pages in a document")] = 0
    max_pages: Annotated[int, Field(description="Maximum number of pages in a document")] = 0
    deciles_pages: Annotated[DecilesT, Field(description="Deciles of pages per document")] = [0.0] * 9
    histogram_pages: Annotated[Histogram, Field(description="Histogram of pages per document")] = Histogram()
    mean_pages: Annotated[float, Field(description="Mean number of pages per document")] = 0.0
    std_pages: Annotated[float, Field(description="Standard deviation of pages per document")] = 0.0
    # Table statistics
    total_tables: Annotated[int, Field(description="Total number of tables across all documents")] = 0
    min_tables: Annotated[int, Field(description="Minimum number of tables in a document")] = 0
    max_tables: Annotated[int, Field(description="Maximum number of tables in a document")] = 0
    deciles_tables: Annotated[DecilesT, Field(description="Deciles of tables per document")] = [0.0] * 9
    histogram_tables: Annotated[Histogram, Field(description="Histogram of tables per document")] = Histogram()
    mean_tables: Annotated[float, Field(description="Mean number of tables per document")] = 0.0
    std_tables: Annotated[float, Field(description="Standard deviation of tables per document")] = 0.0
    # Picture statistics
    total_pictures: Annotated[int, Field(description="Total number of pictures across all documents")] = 0
    min_pictures: Annotated[int, Field(description="Minimum number of pictures in a document")] = 0
    max_pictures: Annotated[int, Field(description="Maximum number of pictures in a document")] = 0
    deciles_pictures: Annotated[DecilesT, Field(description="Deciles of pictures per document")] = [0.0] * 9
    histogram_pictures: Annotated[Histogram, Field(description="Histogram of pictures per document")] = Histogram()
    mean_pictures: Annotated[float, Field(description="Mean number of pictures per document")] = 0.0
    std_pictures: Annotated[float, Field(description="Standard deviation of pictures per document")] = 0.0
    # Text statistics
    total_texts: Annotated[int, Field(description="Total number of text items across all documents")] = 0
    min_texts: Annotated[int, Field(description="Minimum number of text items in a document")] = 0
    max_texts: Annotated[int, Field(description="Maximum number of text items in a document")] = 0
    deciles_texts: Annotated[DecilesT, Field(description="Deciles of text items per document")] = [0.0] * 9
    histogram_texts: Annotated[Histogram, Field(description="Histogram of text items per document")] = Histogram()
    mean_texts: Annotated[float, Field(description="Mean number of text items per document")] = 0.0
    std_texts: Annotated[float, Field(description="Standard deviation of text items per document")] = 0.0
    # Additional item statistics
    # Only totals are tracked for these; no min/max/deciles/histogram.
    total_key_value_items: Annotated[int, Field(description="Total number of key-value items")] = 0
    total_form_items: Annotated[int, Field(description="Total number of form items")] = 0
    total_section_headers: Annotated[int, Field(description="Total number of section headers")] = 0
    total_list_items: Annotated[int, Field(description="Total number of list items")] = 0
    total_code_items: Annotated[int, Field(description="Total number of code items")] = 0
    total_formulas: Annotated[int, Field(description="Total number of formula items")] = 0
    # Document characteristics
    # Pictures for OCR statistics
    total_pictures_for_ocr: Annotated[
        int, Field(description="Total number of pictures requiring OCR across all documents")
    ] = 0
    min_pictures_for_ocr: Annotated[
        int, Field(description="Minimum number of pictures requiring OCR in a document")
    ] = 0
    max_pictures_for_ocr: Annotated[
        int, Field(description="Maximum number of pictures requiring OCR in a document")
    ] = 0
    deciles_pictures_for_ocr: Annotated[
        DecilesT, Field(description="Deciles of pictures requiring OCR per document")
    ] = [0.0] * 9
    histogram_pictures_for_ocr: Annotated[
        Histogram, Field(description="Histogram of pictures requiring OCR per document")
    ] = Histogram()
    mean_pictures_for_ocr: Annotated[float, Field(description="Mean number of pictures requiring OCR per document")] = (
        0.0
    )
    std_pictures_for_ocr: Annotated[
        float, Field(description="Standard deviation of pictures requiring OCR per document")
    ] = 0.0
    # MIME type distribution
    # profile_collection skips documents without an origin MIME type, so the
    # counts here may sum to less than `num_documents`.
    mimetype_distribution: Annotated[
        dict[str, int], Field(description="Distribution of MIME types in the collection")
    ] = {}
    # Per-document statistics (optional, for detailed analysis)
    # Left empty unless profile_collection is called with include_individual_stats=True.
    document_stats: Annotated[list[DocumentStats], Field(description="Individual statistics for each document")] = []

    # NOTE: the docstrings of the computed properties below are left untouched on
    # purpose; pydantic may surface them as field descriptions in the schema.
    @computed_field  # type: ignore[prop-decorator]
    @property
    def total_items(self) -> int:
        """Total number of items across all documents."""
        # Mirrors DocumentStats.total_items: label-specific totals are not added.
        return (
            self.total_texts
            + self.total_tables
            + self.total_pictures
            + self.total_key_value_items
            + self.total_form_items
        )

    @computed_field  # type: ignore[prop-decorator]
    @property
    def avg_items_per_document(self) -> float:
        """Average number of items per document."""
        # Guard against division by zero for an empty collection.
        if self.num_documents == 0:
            return 0.0
        return self.total_items / self.num_documents

    @computed_field  # type: ignore[prop-decorator]
    @property
    def avg_items_per_page(self) -> float:
        """Average number of items per page across all documents."""
        # Guard against division by zero when no document has pages.
        if self.total_pages == 0:
            return 0.0
        return self.total_items / self.total_pages
class DocumentProfiler:
    """Compute summary statistics for one DoclingDocument or a collection of them.

    The class is a plain namespace: every method is a static method.
    """

    @staticmethod
    def _calculate_deciles(data: list[int]) -> list[float]:
        """Return the nine deciles of ``data``.

        Args:
            data: List of integer values

        Returns:
            ``[d1, ..., d9]`` as floats, i.e. the 10th, 20th, ..., 90th
            percentiles. Nine zeros are returned for empty input.
        """
        if data:
            percentiles = list(range(10, 100, 10))
            return [float(value) for value in np.percentile(data, percentiles)]
        return [0.0] * 9

    @staticmethod
    def _calculate_histogram(data: list[int], num_bins: int = 10) -> Histogram:
        """Bin ``data`` into a :class:`Histogram`.

        Args:
            data: List of integer values
            num_bins: Number of bins for the histogram (default: 10)

        Returns:
            Histogram with ``num_bins + 1`` edges and ``num_bins`` frequencies,
            or an empty ``Histogram()`` when ``data`` is empty.
        """
        if not data:
            return Histogram()

        counts, edges = np.histogram(data, bins=num_bins)
        # Width is taken from the first pair of edges.
        width = float(edges[1] - edges[0]) if len(edges) > 1 else 0.0

        return Histogram(
            bins=list(map(float, edges)),
            frequencies=list(map(int, counts)),
            bin_width=width,
        )

    @staticmethod
    def profile_document(doc: DoclingDocument, bitmap_coverage_threshold: float = 0.05) -> DocumentStats:
        """Extract statistics from a single DoclingDocument.

        Args:
            doc: The DoclingDocument to profile
            bitmap_coverage_threshold: Fraction of the page area (0-1) a picture
                must cover to be counted as requiring OCR. The comparison is
                inclusive: a picture covering exactly this fraction is counted.
                Default is 0.05 (5% of page area).

        Returns:
            DocumentStats containing the extracted metrics
        """
        # Tally only the text labels that DocumentStats reports individually.
        tracked_labels = (
            DocItemLabel.SECTION_HEADER,
            DocItemLabel.LIST_ITEM,
            DocItemLabel.CODE,
            DocItemLabel.FORMULA,
        )
        per_label = dict.fromkeys(tracked_labels, 0)
        for item in doc.texts:
            if item.label in per_label:
                per_label[item.label] += 1

        # Count (not a percentage) the pictures large enough to trigger OCR.
        ocr_candidates = 0
        for picture in doc.pictures:
            if not picture.prov:
                continue  # no provenance, so no bounding box to measure

            first_prov = picture.prov[0]  # only the first provenance item is considered
            picture_area = first_prov.bbox.width * first_prov.bbox.height

            if first_prov.page_no not in doc.pages:
                continue  # page is unknown, coverage cannot be computed

            page_size = doc.pages[first_prov.page_no].size
            page_area = page_size.width * page_size.height

            if page_area > 0 and picture_area / page_area >= bitmap_coverage_threshold:
                ocr_candidates += 1

        mimetype = doc.origin.mimetype if doc.origin else None

        return DocumentStats(
            name=doc.name,
            num_pages=len(doc.pages),
            num_tables=len(doc.tables),
            num_pictures=len(doc.pictures),
            num_texts=len(doc.texts),
            num_key_value_items=len(doc.key_value_items),
            num_form_items=len(doc.form_items),
            num_section_headers=per_label[DocItemLabel.SECTION_HEADER],
            num_list_items=per_label[DocItemLabel.LIST_ITEM],
            num_code_items=per_label[DocItemLabel.CODE],
            num_formulas=per_label[DocItemLabel.FORMULA],
            origin_mimetype=mimetype,
            num_pictures_for_ocr=ocr_candidates,
        )

    @staticmethod
    def profile_collection(
        documents: Iterable[DoclingDocument] | DoclingDocument,
        include_individual_stats: bool = False,
        bitmap_coverage_threshold: float = 0.05,
        num_bins: int = 10,
    ) -> CollectionStats:
        """Extract statistics from a collection of DoclingDocument objects.

        The input is consumed in a single pass, so a generator can be supplied.

        Args:
            documents: An iterable of DoclingDocument objects, or a single document
            include_individual_stats: Whether to keep each document's
                DocumentStats in the result (useful for detailed analysis but
                increases memory usage)
            bitmap_coverage_threshold: Fraction of the page area (0-1) a picture
                must cover to be counted as requiring OCR (inclusive).
                Default is 0.05 (5% of page area).
            num_bins: Number of bins for histograms. Default is 10.

        Returns:
            CollectionStats containing the aggregated metrics; a default
            ``CollectionStats()`` when no documents were supplied.
        """
        # A lone document is treated as a one-element collection.
        if isinstance(documents, DoclingDocument):
            documents = [documents]

        kept_stats: list[DocumentStats] = []

        # Per-document series feeding min/max/deciles/histogram/mean/std.
        page_counts: list[int] = []
        table_counts: list[int] = []
        picture_counts: list[int] = []
        text_counts: list[int] = []
        ocr_picture_counts: list[int] = []

        # Metrics for which only a grand total is reported.
        key_value_total = 0
        form_total = 0
        section_header_total = 0
        list_item_total = 0
        code_total = 0
        formula_total = 0

        mime_counts: dict[str, int] = {}

        for document in documents:
            single = DocumentProfiler.profile_document(
                document, bitmap_coverage_threshold=bitmap_coverage_threshold
            )
            if include_individual_stats:
                kept_stats.append(single)

            page_counts.append(single.num_pages)
            table_counts.append(single.num_tables)
            picture_counts.append(single.num_pictures)
            text_counts.append(single.num_texts)
            ocr_picture_counts.append(single.num_pictures_for_ocr)

            key_value_total += single.num_key_value_items
            form_total += single.num_form_items
            section_header_total += single.num_section_headers
            list_item_total += single.num_list_items
            code_total += single.num_code_items
            formula_total += single.num_formulas

            # Documents without a MIME type are left out of the distribution.
            mime = single.origin_mimetype
            if mime:
                mime_counts[mime] = mime_counts.get(mime, 0) + 1

        num_documents = len(page_counts)
        if num_documents == 0:
            return CollectionStats()

        def spread(values: list[int]) -> float:
            # statistics.stdev needs at least two data points.
            return statistics.stdev(values) if num_documents > 1 else 0.0

        deciles_of = DocumentProfiler._calculate_deciles
        histogram_of = DocumentProfiler._calculate_histogram

        return CollectionStats(
            num_documents=num_documents,
            # Page statistics
            total_pages=sum(page_counts),
            min_pages=min(page_counts),
            max_pages=max(page_counts),
            deciles_pages=deciles_of(page_counts),
            histogram_pages=histogram_of(page_counts, num_bins=num_bins),
            mean_pages=statistics.mean(page_counts),
            std_pages=spread(page_counts),
            # Table statistics
            total_tables=sum(table_counts),
            min_tables=min(table_counts),
            max_tables=max(table_counts),
            deciles_tables=deciles_of(table_counts),
            histogram_tables=histogram_of(table_counts, num_bins=num_bins),
            mean_tables=statistics.mean(table_counts),
            std_tables=spread(table_counts),
            # Picture statistics
            total_pictures=sum(picture_counts),
            min_pictures=min(picture_counts),
            max_pictures=max(picture_counts),
            deciles_pictures=deciles_of(picture_counts),
            histogram_pictures=histogram_of(picture_counts, num_bins=num_bins),
            mean_pictures=statistics.mean(picture_counts),
            std_pictures=spread(picture_counts),
            # Text statistics
            total_texts=sum(text_counts),
            min_texts=min(text_counts),
            max_texts=max(text_counts),
            deciles_texts=deciles_of(text_counts),
            histogram_texts=histogram_of(text_counts, num_bins=num_bins),
            mean_texts=statistics.mean(text_counts),
            std_texts=spread(text_counts),
            # Additional totals
            total_key_value_items=key_value_total,
            total_form_items=form_total,
            total_section_headers=section_header_total,
            total_list_items=list_item_total,
            total_code_items=code_total,
            total_formulas=formula_total,
            # Pictures for OCR statistics
            total_pictures_for_ocr=sum(ocr_picture_counts),
            min_pictures_for_ocr=min(ocr_picture_counts),
            max_pictures_for_ocr=max(ocr_picture_counts),
            deciles_pictures_for_ocr=deciles_of(ocr_picture_counts),
            histogram_pictures_for_ocr=histogram_of(ocr_picture_counts, num_bins=num_bins),
            mean_pictures_for_ocr=statistics.mean(ocr_picture_counts),
            std_pictures_for_ocr=spread(ocr_picture_counts),
            mimetype_distribution=mime_counts,
            # Stays empty unless include_individual_stats was requested.
            document_stats=kept_stats,
        )
+250
View File
@@ -0,0 +1,250 @@
"""Example usage of the document profiler for extracting statistics."""
import time
from pathlib import Path
from docling_core.transforms.profiler import DocumentProfiler
from docling_core.types.doc import DoclingDocument
def profile_single_document():
    """Example: Profile a single document.

    Loads one DoclingDocument from JSON, profiles it, and prints the resulting
    counts followed by the first 500 characters of the JSON export. Returns
    early (after printing a notice) when the sample file is missing.
    """
    print("=" * 80)
    print("Example 1: Profiling a Single Document")
    print("=" * 80)
    # Load a document
    # The path is relative to the current working directory -- presumably the
    # repository root; verify before running from elsewhere.
    doc_path = Path("./examples/2408.09869v3.json")
    if not doc_path.exists():
        print(f"Document not found: {doc_path}")
        return
    doc = DoclingDocument.load_from_json(doc_path)
    # Profile the document
    stats = DocumentProfiler.profile_document(doc)
    # Print statistics
    print(f"\nDocument: {stats.name}")
    print(f"Pages: {stats.num_pages}")
    print(f"Tables: {stats.num_tables}")
    print(f"Pictures: {stats.num_pictures}")
    print(f"Text items: {stats.num_texts}")
    # The four label counts below are subsets of the text items above.
    print(f" - Section headers: {stats.num_section_headers}")
    print(f" - List items: {stats.num_list_items}")
    print(f" - Code blocks: {stats.num_code_items}")
    print(f" - Formulas: {stats.num_formulas}")
    print(f"\nTotal items: {stats.total_items}")
    print(f"Average items per page: {stats.avg_items_per_page:.2f}")
    print(f"\nOrigin MIME type: {stats.origin_mimetype}")
    print(f"Pictures requiring OCR: {stats.num_pictures_for_ocr}")
    # Export to JSON
    json_output = stats.model_dump_json(indent=2)
    # Truncated so the example output stays readable.
    print(f"\nJSON export (first 500 chars):\n{json_output[:500]}...")
def profile_document_collection():
    """Example: Profile a collection of documents.

    Loads every ``*.json`` file in the test data directory, profiles the loaded
    documents as one collection (keeping per-document statistics), and prints
    the aggregate figures for each metric. Files that fail to load are reported
    and skipped.
    """
    print("\n" + "=" * 80)
    print("Example 2: Profiling a Document Collection")
    print("=" * 80)
    # Load multiple documents
    doc_dir = Path("./test/data/doc")
    if not doc_dir.exists():
        print(f"Directory not found: {doc_dir}")
        return
    # Load all JSON documents
    docs = []
    for json_file in doc_dir.glob("*.json"):
        try:
            doc = DoclingDocument.load_from_json(json_file)
            docs.append(doc)
        except Exception as e:
            # Best effort: report the unreadable file and continue with the rest.
            print(f"Skipping {json_file.name}: {e}")
    if not docs:
        print("No documents found")
        return
    print(f"\nLoaded {len(docs)} documents")
    # Profile the collection
    stats = DocumentProfiler.profile_collection(docs, include_individual_stats=True)
    # Print collection statistics
    # Deciles are indexed from zero: [0] is d1, [4] is d5 (the median), [8] is d9.
    print("\nCollection Statistics:")
    print(f"Number of documents: {stats.num_documents}")
    print("\nPages:")
    print(f" Total: {stats.total_pages}")
    print(f" Min: {stats.min_pages}, Max: {stats.max_pages}")
    print(f" Median (d5): {stats.deciles_pages[4]:.1f}, Mean: {stats.mean_pages:.2f}")
    print(f" Deciles: d1={stats.deciles_pages[0]:.1f}, d5={stats.deciles_pages[4]:.1f}, d9={stats.deciles_pages[8]:.1f}")
    print(f" Std Dev: {stats.std_pages:.2f}")
    # len(bins) counts bin edges, i.e. one more than the number of bins.
    print(f" Histogram bins: {len(stats.histogram_pages.bins)}, bin width: {stats.histogram_pages.bin_width:.2f}")
    print("\nTables:")
    print(f" Total: {stats.total_tables}")
    print(f" Min: {stats.min_tables}, Max: {stats.max_tables}")
    print(f" Median (d5): {stats.deciles_tables[4]:.1f}, Mean: {stats.mean_tables:.2f}")
    print(f" Deciles: d1={stats.deciles_tables[0]:.1f}, d5={stats.deciles_tables[4]:.1f}, d9={stats.deciles_tables[8]:.1f}")
    print(f" Std Dev: {stats.std_tables:.2f}")
    print("\nPictures:")
    print(f" Total: {stats.total_pictures}")
    print(f" Min: {stats.min_pictures}, Max: {stats.max_pictures}")
    print(f" Median (d5): {stats.deciles_pictures[4]:.1f}, Mean: {stats.mean_pictures:.2f}")
    print(f" Deciles: d1={stats.deciles_pictures[0]:.1f}, d5={stats.deciles_pictures[4]:.1f}, d9={stats.deciles_pictures[8]:.1f}")
    print(f" Std Dev: {stats.std_pictures:.2f}")
    print("\nText Items:")
    print(f" Total: {stats.total_texts}")
    print(f" Min: {stats.min_texts}, Max: {stats.max_texts}")
    print(f" Median (d5): {stats.deciles_texts[4]:.1f}, Mean: {stats.mean_texts:.2f}")
    print(f" Deciles: d1={stats.deciles_texts[0]:.1f}, d5={stats.deciles_texts[4]:.1f}, d9={stats.deciles_texts[8]:.1f}")
    print(f" Std Dev: {stats.std_texts:.2f}")
    print("\nPictures Requiring OCR:")
    print(f" Total: {stats.total_pictures_for_ocr}")
    print(f" Min: {stats.min_pictures_for_ocr}, Max: {stats.max_pictures_for_ocr}")
    print(f" Median (d5): {stats.deciles_pictures_for_ocr[4]:.1f}, Mean: {stats.mean_pictures_for_ocr:.2f}")
    print(f" Deciles: d1={stats.deciles_pictures_for_ocr[0]:.1f}, d5={stats.deciles_pictures_for_ocr[4]:.1f}, d9={stats.deciles_pictures_for_ocr[8]:.1f}")
    print(f" Std Dev: {stats.std_pictures_for_ocr:.2f}")
    if stats.mimetype_distribution:
        print("\nMIME Type Distribution:")
        # Sorted by MIME type so the listing is stable between runs.
        for mimetype, count in sorted(stats.mimetype_distribution.items()):
            print(f" {mimetype}: {count}")
    print("\nComputed Metrics:")
    print(f" Total items: {stats.total_items}")
    print(f" Avg items per document: {stats.avg_items_per_document:.2f}")
    print(f" Avg items per page: {stats.avg_items_per_page:.2f}")
    # Show individual document stats
    if stats.document_stats:
        print("\nIndividual Document Statistics:")
        for i, doc_stat in enumerate(stats.document_stats[:3], 1):  # Show first 3
            print(f"\n Document {i}: {doc_stat.name}")
            print(f" Pages: {doc_stat.num_pages}, Tables: {doc_stat.num_tables}, "
                  f"Pictures: {doc_stat.num_pictures}, Texts: {doc_stat.num_texts}")
def profile_with_generator():
    """Example: Profile documents using a generator (memory efficient).

    Documents are loaded lazily, one at a time, and handed to
    ``DocumentProfiler.profile_collection`` without keeping per-document
    statistics. Files that cannot be loaded are reported and skipped.
    """
    print("\n" + "=" * 80)
    print("Example 3: Profiling with Generator (Memory Efficient)")
    print("=" * 80)

    doc_dir = Path("./test/data/doc")
    if not doc_dir.exists():
        print(f"Directory not found: {doc_dir}")
        return

    def document_generator():
        """Generator that yields documents one at a time."""
        for json_file in doc_dir.glob("*.json"):
            try:
                doc = DoclingDocument.load_from_json(json_file)
            except Exception as e:
                # Best effort, as in Example 2: report the file instead of
                # silently dropping it, then move on.
                print(f"Skipping {json_file.name}: {e}")
                continue
            # Yield outside the try block so only loading errors are caught.
            yield doc

    # Profile using generator - documents are not all loaded into memory.
    # perf_counter is monotonic, which makes it the right clock for intervals.
    start_time = time.perf_counter()
    stats = DocumentProfiler.profile_collection(
        document_generator(),
        include_individual_stats=False,  # Don't store individual stats to save memory
    )
    elapsed_time = time.perf_counter() - start_time

    print(f"\nProcessed {stats.num_documents} documents in {elapsed_time:.2f} seconds")
    print(f"Total pages: {stats.total_pages}")
    print(f"Total tables: {stats.total_tables}")
    print(f"Total pictures: {stats.total_pictures}")
    print(f"Mean pages per document: {stats.mean_pages:.2f}")
def export_statistics_report():
    """Example: Export statistics to a JSON report.

    Profiles every loadable ``*.json`` document in the test data directory and
    writes the collection statistics (including per-document statistics) to
    ``./document_statistics_report.json`` in the current working directory.
    Files that cannot be loaded are reported and skipped.
    """
    print("\n" + "=" * 80)
    print("Example 4: Exporting Statistics Report")
    print("=" * 80)

    doc_dir = Path("./test/data/doc")
    if not doc_dir.exists():
        print(f"Directory not found: {doc_dir}")
        return

    # Load documents
    docs = []
    for json_file in doc_dir.glob("*.json"):
        try:
            docs.append(DoclingDocument.load_from_json(json_file))
        except Exception as e:
            # Best effort, as in Example 2: report instead of silently dropping.
            print(f"Skipping {json_file.name}: {e}")

    if not docs:
        print("No documents found")
        return

    # Profile collection
    stats = DocumentProfiler.profile_collection(docs, include_individual_stats=True)

    # Export to JSON file. The encoding is explicit because the report may
    # contain non-ASCII text (e.g. document names), and the platform default
    # encoding is not guaranteed to handle it.
    output_file = Path("./document_statistics_report.json")
    output_file.write_text(stats.model_dump_json(indent=2), encoding="utf-8")

    print(f"\nStatistics report exported to: {output_file}")
    print(f"File size: {output_file.stat().st_size} bytes")

    # Also export as Python dict for further processing
    stats_dict = stats.model_dump()
    print(f"\nStatistics as dict (keys): {list(stats_dict.keys())[:10]}...")
def analyze_document_characteristics():
    """Example: Analyze specific document characteristics.

    Profiles each document individually and lists the (up to five) documents
    with the most pictures that would trigger OCR. Files that cannot be loaded
    or profiled are reported and skipped.
    """
    print("\n" + "=" * 80)
    print("Example 5: Analyzing Document Characteristics")
    print("=" * 80)

    doc_dir = Path("./test/data/doc")
    if not doc_dir.exists():
        print(f"Directory not found: {doc_dir}")
        return

    # Profile each document individually
    ocr_candidate_docs = []
    for json_file in doc_dir.glob("*.json"):
        try:
            doc = DoclingDocument.load_from_json(json_file)
            stats = DocumentProfiler.profile_document(doc)
        except Exception as e:
            # Best effort, as in Example 2: report instead of silently dropping.
            print(f"Skipping {json_file.name}: {e}")
            continue
        if stats.num_pictures_for_ocr > 0:
            ocr_candidate_docs.append((stats.name, stats.num_pictures_for_ocr))

    print(f"\nDocuments with OCR requirements: {len(ocr_candidate_docs)}")
    if ocr_candidate_docs:
        # Highest picture count first; show only the top five.
        for name, count in sorted(ocr_candidate_docs, key=lambda x: x[1], reverse=True)[:5]:
            print(f" - {name}: {count} pictures require OCR")
if __name__ == "__main__":
    # Run every example, in the order they are defined above.
    for example in (
        profile_single_document,
        profile_document_collection,
        profile_with_generator,
        export_statistics_report,
        analyze_document_characteristics,
    ):
        example()

    banner = "=" * 80
    print("\n" + banner)
    print("Examples completed!")
    print(banner)
+319
View File
@@ -0,0 +1,319 @@
"""Visualization utilities for collection statistics.
This module provides utilities for creating charts from CollectionStats data.
Requires matplotlib to be installed (available with 'examples' extra).
Install with: pip install docling-core[examples]
"""
from pathlib import Path
from typing import Literal
try:
import matplotlib.figure
import matplotlib.pyplot as plt
MATPLOTLIB_AVAILABLE = True
except ImportError:
MATPLOTLIB_AVAILABLE = False
from docling_core.transforms.profiler.doc_profiler import CollectionStats, Histogram
class StatsVisualizer:
"""Visualizer for creating charts from CollectionStats data."""
@staticmethod
def _check_matplotlib() -> None:
"""Check if matplotlib is available."""
if not MATPLOTLIB_AVAILABLE:
raise ImportError(
"matplotlib is required for visualization. "
"Install it with: pip install docling-core[examples]"
)
@staticmethod
def plot_histogram(
histogram: Histogram,
title: str = "Distribution",
xlabel: str = "Value",
ylabel: str = "Frequency",
color: str = "steelblue",
figsize: tuple[int, int] = (10, 6),
log_scale: bool = False,
) -> "matplotlib.figure.Figure":
"""Plot a histogram from Histogram data.
Args:
histogram: Histogram object containing bins and frequencies
title: Plot title
xlabel: X-axis label
ylabel: Y-axis label
color: Bar color
figsize: Figure size as (width, height)
log_scale: If True, use logarithmic scale for y-axis (frequency counts)
Returns:
matplotlib Figure object
Raises:
ImportError: If matplotlib is not installed
"""
StatsVisualizer._check_matplotlib()
fig, ax = plt.subplots(figsize=figsize)
# Calculate bin centers for plotting
bins = histogram.bins
frequencies = histogram.frequencies
if len(bins) > 0 and len(frequencies) > 0:
# bins has n+1 edges, frequencies has n values
bin_centers = [(bins[i] + bins[i + 1]) / 2 for i in range(len(frequencies))]
bin_width = histogram.bin_width
ax.bar(bin_centers, frequencies, width=bin_width * 0.9, color=color, edgecolor="black", alpha=0.7)
ax.set_xlabel(xlabel, fontsize=12)
ax.set_ylabel(ylabel, fontsize=12)
ax.set_title(title, fontsize=14, fontweight="bold")
ax.grid(axis="y", alpha=0.3, linestyle="--")
if log_scale:
ax.set_yscale('log')
ax.set_ylabel(f"{ylabel} (log scale)", fontsize=12)
plt.tight_layout()
return fig
@staticmethod
def plot_deciles(
deciles: list[float],
title: str = "Decile Distribution",
ylabel: str = "Value",
color: str = "coral",
figsize: tuple[int, int] = (10, 6),
) -> "matplotlib.figure.Figure":
"""Plot deciles as a line chart.
Args:
deciles: List of 9 decile values [d1, d2, ..., d9] (10th, 20th, ..., 90th percentiles)
title: Plot title
ylabel: Y-axis label
color: Line color
figsize: Figure size as (width, height)
Returns:
matplotlib Figure object
Raises:
ImportError: If matplotlib is not installed
"""
StatsVisualizer._check_matplotlib()
fig, ax = plt.subplots(figsize=figsize)
decile_labels = [1, 2, 3, 4, 5, 6, 7, 8, 9]
percentile_labels = [10, 20, 30, 40, 50, 60, 70, 80, 90]
ax.plot(decile_labels, deciles, marker="o", linewidth=2, markersize=8, color=color)
ax.fill_between(decile_labels, deciles, alpha=0.3, color=color)
# Highlight median (d5 = 50th percentile)
ax.axvline(x=5, color="red", linestyle="--", alpha=0.5, label="Median (d5)")
ax.set_xlabel("Decile", fontsize=12)
ax.set_ylabel(ylabel, fontsize=12)
ax.set_title(title, fontsize=14, fontweight="bold")
ax.set_xticks(decile_labels)
ax.set_xticklabels([f"d{d} (p{p})" for d, p in zip(decile_labels, percentile_labels)])
ax.grid(True, alpha=0.3, linestyle="--")
ax.legend()
plt.tight_layout()
return fig
@staticmethod
def plot_collection_overview(
stats: CollectionStats,
metrics: list[Literal["pages", "tables", "pictures", "texts"]] | None = None,
figsize: tuple[int, int] = (16, 10),
log_scale: bool = False,
) -> "matplotlib.figure.Figure":
"""Create a comprehensive overview plot with multiple histograms.
Args:
stats: CollectionStats object
metrics: List of metrics to plot. If None, plots all available metrics.
figsize: Figure size as (width, height)
log_scale: If True, use logarithmic scale for y-axis (frequency counts)
Returns:
matplotlib Figure object with subplots
Raises:
ImportError: If matplotlib is not installed
"""
StatsVisualizer._check_matplotlib()
if metrics is None:
metrics = ["pages", "tables", "pictures", "texts"]
n_metrics = len(metrics)
n_cols = 2
n_rows = (n_metrics + 1) // 2
fig, axes = plt.subplots(n_rows, n_cols, figsize=figsize)
if n_rows == 1:
axes = axes.reshape(1, -1)
metric_config = {
"pages": {
"histogram": stats.histogram_pages,
"title": "Pages per Document",
"color": "steelblue",
},
"tables": {
"histogram": stats.histogram_tables,
"title": "Tables per Document",
"color": "forestgreen",
},
"pictures": {
"histogram": stats.histogram_pictures,
"title": "Pictures per Document",
"color": "coral",
},
"texts": {
"histogram": stats.histogram_texts,
"title": "Text Items per Document",
"color": "mediumpurple",
},
}
for idx, metric in enumerate(metrics):
row = idx // n_cols
col = idx % n_cols
ax = axes[row, col]
config = metric_config[metric]
histogram = config["histogram"]
bins = histogram.bins
frequencies = histogram.frequencies
if len(bins) > 0 and len(frequencies) > 0:
bin_centers = [(bins[i] + bins[i + 1]) / 2 for i in range(len(frequencies))]
bin_width = histogram.bin_width
ax.bar(
bin_centers,
frequencies,
width=bin_width * 0.9,
color=config["color"],
edgecolor="black",
alpha=0.7,
)
ax.set_xlabel("Count", fontsize=10)
ylabel = "Frequency (log scale)" if log_scale else "Frequency"
ax.set_ylabel(ylabel, fontsize=10)
ax.set_title(config["title"], fontsize=12, fontweight="bold")
ax.grid(axis="y", alpha=0.3, linestyle="--")
if log_scale:
ax.set_yscale("log")
# Hide unused subplots
for idx in range(n_metrics, n_rows * n_cols):
row = idx // n_cols
col = idx % n_cols
axes[row, col].axis("off")
fig.suptitle(
f"Collection Statistics Overview ({stats.num_documents} documents)",
fontsize=16,
fontweight="bold",
)
plt.tight_layout()
return fig
@staticmethod
def plot_deciles_comparison(
    stats: CollectionStats,
    metrics: list[Literal["pages", "tables", "pictures", "texts"]] | None = None,
    figsize: tuple[int, int] = (12, 6),
) -> "matplotlib.figure.Figure":
    """Draw the deciles of several metrics as lines on one shared axis.

    Each selected metric becomes one line with nine points (d1..d9). A dashed
    vertical line marks d5, which the legend labels as the median.

    Args:
        stats: CollectionStats object providing the ``deciles_*`` sequences
        metrics: Names of the metrics to draw. When None, all four metrics
            ("pages", "tables", "pictures", "texts") are drawn.
        figsize: Figure size as (width, height)

    Returns:
        matplotlib Figure object

    Raises:
        ImportError: If matplotlib is not installed
        KeyError: If an entry of ``metrics`` is not one of the four known names
    """
    StatsVisualizer._check_matplotlib()

    selected = ["pages", "tables", "pictures", "texts"] if metrics is None else metrics

    fig, ax = plt.subplots(figsize=figsize)

    # x positions 1..9, one per decile
    positions = list(range(1, 10))

    # metric name -> (decile values, legend text, line colour)
    series = {
        "pages": (stats.deciles_pages, "Pages", "steelblue"),
        "tables": (stats.deciles_tables, "Tables", "forestgreen"),
        "pictures": (stats.deciles_pictures, "Pictures", "coral"),
        "texts": (stats.deciles_texts, "Text Items", "mediumpurple"),
    }

    for name in selected:
        values, legend_text, line_color = series[name]
        ax.plot(
            positions,
            values,
            marker="o",
            linewidth=2,
            markersize=6,
            label=legend_text,
            color=line_color,
        )

    # Highlight the fifth decile
    ax.axvline(x=5, color="red", linestyle="--", alpha=0.3, label="Median (d5)")

    ax.set_xlabel("Decile", fontsize=12)
    ax.set_ylabel("Count", fontsize=12)
    ax.set_title("Decile Comparison Across Metrics", fontsize=14, fontweight="bold")
    ax.set_xticks(positions)
    ax.set_xticklabels([f"d{position}" for position in positions])
    ax.grid(True, alpha=0.3, linestyle="--")
    ax.legend(loc="best")

    plt.tight_layout()
    return fig
@staticmethod
def save_figure(fig: "matplotlib.figure.Figure", filepath: str | Path, dpi: int = 300) -> None:
    """Write a matplotlib figure to disk.

    The figure is saved through ``fig.savefig`` with ``bbox_inches="tight"``.

    Args:
        fig: matplotlib Figure object
        filepath: Output file path (supports .png, .pdf, .svg, etc.)
        dpi: Resolution in dots per inch

    Raises:
        ImportError: If matplotlib is not installed
    """
    StatsVisualizer._check_matplotlib()
    save_options = {"dpi": dpi, "bbox_inches": "tight"}
    fig.savefig(filepath, **save_options)
@staticmethod
def show_figure(fig: "matplotlib.figure.Figure") -> None:
    """Display a matplotlib figure.

    Args:
        fig: matplotlib Figure object

    Raises:
        ImportError: If matplotlib is not installed

    NOTE(review): ``fig`` is not referenced in the body; ``plt.show()`` is
    called without arguments. Confirm whether displaying only ``fig`` was
    intended.
    """
    StatsVisualizer._check_matplotlib()
    plt.show()
+257
View File
@@ -0,0 +1,257 @@
"""Example: Visualizing Collection Statistics with Charts.
This example demonstrates how to use the StatsVisualizer to create
various charts from CollectionStats data.
Requirements:
pip install docling-core[examples] # Includes matplotlib
"""
from pathlib import Path
from stats_visualizer import StatsVisualizer
from docling_core.transforms.profiler import CollectionStats, DocumentProfiler
from docling_core.types.doc import DoclingDocument
def load_documents_and_profile(doc_dir: Path) -> CollectionStats | None:
    """Load every ``*.json`` document in a directory and profile the collection.

    Loading is best-effort: a file that cannot be loaded is skipped, but the
    skip is reported on stdout so that a partially loaded collection is not
    mistaken for a complete one.

    Args:
        doc_dir: Directory containing JSON documents

    Returns:
        CollectionStats object, or None if the directory does not exist or no
        document could be loaded
    """
    if not doc_dir.exists():
        print(f"Directory not found: {doc_dir}")
        return None

    docs = []
    for json_file in doc_dir.glob("*.json"):
        try:
            docs.append(DoclingDocument.load_from_json(json_file))
        except Exception as exc:
            # Keep going with the remaining files, but say which one was
            # dropped and why instead of swallowing the failure silently.
            print(f"Skipping {json_file}: {exc}")

    if not docs:
        print("No documents found")
        return None

    # Profile collection
    stats = DocumentProfiler.profile_collection(docs)
    print(f"Loaded and profiled {stats.num_documents} documents")
    return stats
def visualize_single_histogram(stats: CollectionStats):
    """Example 1: render the pages histogram twice and save both as PNG files.

    The first figure is plotted without the ``log_scale`` argument, the second
    one with ``log_scale=True``.
    """
    banner = "=" * 80
    print("\n" + banner)
    print("Example 1: Single Histogram Plot")
    print(banner)

    # Arguments common to both renderings
    shared = {
        "histogram": stats.histogram_pages,
        "xlabel": "Number of Pages",
        "ylabel": "Number of Documents",
        "color": "steelblue",
    }

    # Linear-scale figure
    linear_path = Path("./pages_histogram.png")
    linear_fig = StatsVisualizer.plot_histogram(
        title="Distribution of Pages per Document",
        **shared,
    )
    StatsVisualizer.save_figure(linear_fig, linear_path)
    print(f"Saved histogram to: {linear_path}")

    # Log-scale figure
    log_path = Path("./pages_histogram_log.png")
    log_fig = StatsVisualizer.plot_histogram(
        title="Distribution of Pages per Document (Log Scale)",
        log_scale=True,
        **shared,
    )
    StatsVisualizer.save_figure(log_fig, log_path)
    print(f"Saved histogram (log scale) to: {log_path}")
def visualize_deciles(stats: CollectionStats):
    """Example 2: plot the table-count deciles and save the figure as a PNG."""
    banner = "=" * 80
    print("\n" + banner)
    print("Example 2: Decile Distribution Plot")
    print(banner)

    target = Path("./tables_deciles.png")

    # Decile plot for the number of tables per document
    tables_fig = StatsVisualizer.plot_deciles(
        deciles=stats.deciles_tables,
        title="Decile Distribution of Tables per Document",
        ylabel="Number of Tables",
        color="forestgreen",
    )

    StatsVisualizer.save_figure(tables_fig, target)
    print(f"Saved decile plot to: {target}")
def visualize_collection_overview(stats: CollectionStats):
    """Example 3: save a four-metric histogram overview, then a log-scale one.

    The first overview is requested without the ``log_scale`` argument, the
    second one with ``log_scale=True``. Both are written as PNG files.
    """
    banner = "=" * 80
    print("\n" + banner)
    print("Example 3: Collection Overview (Multiple Histograms)")
    print(banner)

    metric_names = ("pages", "tables", "pictures", "texts")
    size = (16, 10)

    # Overview without the log_scale flag
    linear_path = Path("./collection_overview.png")
    linear_fig = StatsVisualizer.plot_collection_overview(
        stats=stats,
        metrics=list(metric_names),
        figsize=size,
    )
    StatsVisualizer.save_figure(linear_fig, linear_path)
    print(f"Saved collection overview to: {linear_path}")

    # Same overview with logarithmic frequencies
    log_path = Path("./collection_overview_log.png")
    log_fig = StatsVisualizer.plot_collection_overview(
        stats=stats,
        metrics=list(metric_names),
        figsize=size,
        log_scale=True,
    )
    StatsVisualizer.save_figure(log_fig, log_path)
    print(f"Saved collection overview (log scale) to: {log_path}")
def visualize_deciles_comparison(stats: CollectionStats):
    """Example 4: plot the deciles of all four metrics together and save a PNG."""
    banner = "=" * 80
    print("\n" + banner)
    print("Example 4: Decile Comparison Across Metrics")
    print(banner)

    target = Path("./deciles_comparison.png")

    # One line per metric on a shared axis
    comparison_fig = StatsVisualizer.plot_deciles_comparison(
        stats=stats,
        metrics=["pages", "tables", "pictures", "texts"],
        figsize=(12, 6),
    )

    StatsVisualizer.save_figure(comparison_fig, target)
    print(f"Saved decile comparison to: {target}")
def create_custom_visualization(stats: CollectionStats):
    """Example 5: save three hand-picked charts.

    Produces a log-scale pictures histogram, a text-items decile plot and a
    log-scale overview restricted to pages and tables.
    """
    banner = "=" * 80
    print("\n" + banner)
    print("Example 5: Custom Visualization")
    print(banner)

    # 1) Pictures histogram with logarithmic frequencies
    pictures_fig = StatsVisualizer.plot_histogram(
        histogram=stats.histogram_pictures,
        title="Picture Distribution (Log Scale)",
        xlabel="Pictures per Document",
        ylabel="Frequency",
        color="coral",
        figsize=(10, 6),
        log_scale=True,
    )
    StatsVisualizer.save_figure(pictures_fig, "./pictures_histogram_log.png")
    print("Saved pictures histogram (log scale)")

    # 2) Decile plot for text items
    texts_fig = StatsVisualizer.plot_deciles(
        deciles=stats.deciles_texts,
        title="Text Items Decile Distribution",
        ylabel="Number of Text Items",
        color="mediumpurple",
        figsize=(10, 6),
    )
    StatsVisualizer.save_figure(texts_fig, "./texts_deciles.png")
    print("Saved texts decile plot")

    # 3) Overview limited to two metrics, logarithmic frequencies
    chosen_metrics = ["pages", "tables"]
    overview_fig = StatsVisualizer.plot_collection_overview(
        stats=stats,
        metrics=chosen_metrics,
        figsize=(12, 6),
        log_scale=True,
    )
    StatsVisualizer.save_figure(overview_fig, "./pages_tables_overview_log.png")
    print("Saved pages and tables overview (log scale)")
def display_statistics_summary(stats: CollectionStats):
    """Example 6: print range, median (d5) and mean for each metric.

    The pages section additionally prints the d1, d5 and d9 deciles.
    """
    banner = "=" * 80
    print("\n" + banner)
    print("Example 6: Statistics Summary")
    print(banner)

    print(f"\nCollection Summary ({stats.num_documents} documents):")

    # (section heading, suffix of the stats attributes for that metric)
    sections = (
        ("Pages", "pages"),
        ("Tables", "tables"),
        ("Pictures", "pictures"),
        ("Text Items", "texts"),
    )
    for heading, suffix in sections:
        print(f"\n{heading}:")

        lowest = getattr(stats, f"min_{suffix}")
        highest = getattr(stats, f"max_{suffix}")
        print(f" Range: {lowest} - {highest}")

        deciles = getattr(stats, f"deciles_{suffix}")
        print(f" Median (d5): {deciles[4]:.1f}")

        mean = getattr(stats, f"mean_{suffix}")
        print(f" Mean: {mean:.2f}")

        # Only the pages section lists individual deciles
        if suffix == "pages":
            print(f" Deciles: d1={deciles[0]:.1f}, d5={deciles[4]:.1f}, d9={deciles[8]:.1f}")
if __name__ == "__main__":
    # Script entry point: profile the sample documents once, then run the examples.
    try:
        # Load documents once and profile them
        doc_dir = Path("./test/data/doc")
        stats = load_documents_and_profile(doc_dir)

        if stats is None:
            print("Failed to load documents. Exiting.")
            # Raise SystemExit directly: the `exit` helper is injected by the
            # `site` module for interactive use and is not guaranteed to exist
            # in programs. SystemExit is not an Exception subclass, so the
            # handlers below do not intercept it.
            raise SystemExit(1)

        # Run all examples with the same stats object
        visualize_single_histogram(stats)
        visualize_deciles(stats)
        visualize_collection_overview(stats)
        # visualize_deciles_comparison(stats)
        create_custom_visualization(stats)
        display_statistics_summary(stats)

        print("\n" + "=" * 80)
        print("All visualizations created successfully!")
        print("Check the current directory for generated PNG files.")
        print("=" * 80)

    except ImportError as e:
        # An ImportError here is reported as matplotlib being unavailable
        print(f"\nError: {e}")
        print("\nTo run this example, install matplotlib:")
        print(" pip install docling-core[examples]")
    except Exception as e:
        # Top-level boundary: report the failure together with its traceback
        print(f"\nError: {e}")
        import traceback

        traceback.print_exc()
+401
View File
@@ -0,0 +1,401 @@
"""Tests for document profiler."""
import json
from pathlib import Path
import pytest
from docling_core.transforms.profiler import DocumentProfiler
from docling_core.types.doc import BoundingBox, DoclingDocument, ProvenanceItem
from docling_core.types.doc.document import DocumentOrigin, PageItem, Size, TableData
from docling_core.types.doc.labels import DocItemLabel
def test_profile_empty_document():
    """A document without content yields zero counts and no origin MIME type."""
    stats = DocumentProfiler.profile_document(DoclingDocument(name="Empty Document"))

    assert stats.name == "Empty Document"

    # Every counter is expected to be zero
    zero_counters = (
        "num_pages",
        "num_tables",
        "num_pictures",
        "num_texts",
        "num_key_value_items",
        "num_form_items",
        "total_items",
    )
    for counter in zero_counters:
        assert getattr(stats, counter) == 0, counter

    assert stats.avg_items_per_page == 0.0
    assert stats.origin_mimetype is None
def test_profile_simple_document():
    """Profile a two-page PDF-origin document holding texts, a table and a picture."""
    origin = DocumentOrigin(
        mimetype="application/pdf",
        binary_hash=12345,
        filename="test.pdf",
    )
    doc = DoclingDocument(name="Simple Document", origin=origin)

    # Two pages
    for page_no in (1, 2):
        doc.pages[page_no] = PageItem(page_no=page_no, size=Size(width=612, height=792))

    # Three text items, one of them a section header
    text_items = (
        (DocItemLabel.TEXT, "Text 1"),
        (DocItemLabel.TEXT, "Text 2"),
        (DocItemLabel.SECTION_HEADER, "Section"),
    )
    for label, content in text_items:
        doc.add_text(label=label, text=content, orig=content)

    # One table and one picture
    doc.add_table(data=TableData(num_rows=2, num_cols=2))
    doc.add_picture()

    stats = DocumentProfiler.profile_document(doc)

    assert stats.name == "Simple Document"
    assert stats.num_pages == 2
    assert stats.num_tables == 1
    assert stats.num_pictures == 1
    assert stats.num_texts == 3
    assert stats.num_section_headers == 1
    assert stats.total_items == 5  # 3 texts + 1 table + 1 picture
    assert stats.avg_items_per_page == 2.5  # 5 items over 2 pages
    assert stats.origin_mimetype == "application/pdf"
def test_profile_document_with_pictures_for_ocr():
    """Count the pictures flagged for OCR under different coverage thresholds.

    Three square pictures sit on a 1000 x 1000 page. The percentages in the
    comments are bounding-box area over page area; presumably that is the
    coverage the profiler compares against the threshold (the expected counts
    below are consistent with it).
    """
    doc = DoclingDocument(name="Document with Pictures for OCR")
    doc.pages[1] = PageItem(page_no=1, size=Size(width=1000, height=1000))

    # Side lengths of the squares, in insertion order:
    #   316.2   -> ~9.998% of the page (just under 10%)
    #   141.4   -> ~1.999% of the page (just under 2%)
    #   223.607 -> ~5.00001% of the page (marginally above 5%)
    for side in (316.2, 141.4, 223.607):
        doc.add_picture(
            prov=ProvenanceItem(
                page_no=1,
                bbox=BoundingBox(l=0, t=0, r=side, b=side),
                charspan=(0, 0),
            )
        )

    default_stats = DocumentProfiler.profile_document(doc)
    assert default_stats.num_pictures == 3
    # Default threshold: the ~10% and ~5% pictures qualify, the ~2% one does not
    assert default_stats.num_pictures_for_ocr == 2

    # Threshold of 10%: even the largest picture (~9.998%) stays below it
    strict_stats = DocumentProfiler.profile_document(doc, bitmap_coverage_threshold=0.10)
    assert strict_stats.num_pictures_for_ocr == 0

    # Threshold of 2%: the smallest picture (~1.999%) is still excluded
    lenient_stats = DocumentProfiler.profile_document(doc, bitmap_coverage_threshold=0.02)
    assert lenient_stats.num_pictures_for_ocr == 2
def test_profile_collection_empty():
    """An empty collection yields zero totals, zero deciles and an empty histogram."""
    stats = DocumentProfiler.profile_collection([])

    for counter in ("num_documents", "total_pages", "total_tables", "total_pictures"):
        assert getattr(stats, counter) == 0, counter

    assert stats.avg_items_per_document == 0.0
    assert stats.avg_items_per_page == 0.0

    # Nine zero-valued deciles
    no_deciles = [0.0] * 9
    assert stats.deciles_pages == no_deciles
    assert stats.deciles_tables == no_deciles

    # Empty histogram
    pages_histogram = stats.histogram_pages
    assert pages_histogram.bins == []
    assert pages_histogram.frequencies == []
    assert pages_histogram.bin_width == 0.0
def test_profile_collection_single_document():
    """Profile a collection made of one document, passed as a bare document."""
    only_doc = DoclingDocument(name="Single Doc")
    only_doc.pages[1] = PageItem(page_no=1, size=Size(width=612, height=792))
    only_doc.add_text(label=DocItemLabel.PARAGRAPH, text="Text", orig="Text")
    only_doc.add_table(data=TableData(num_rows=1, num_cols=1))
    only_doc.add_picture()

    # The document itself is passed, not a list wrapping it
    stats = DocumentProfiler.profile_collection(only_doc)

    assert stats.num_documents == 1
    for total in ("total_pages", "total_tables", "total_pictures", "total_texts"):
        assert getattr(stats, total) == 1, total

    # With a single one-page document every page statistic collapses to 1
    assert stats.min_pages == 1
    assert stats.max_pages == 1
    assert stats.deciles_pages[4] == 1.0  # d5 (index 4) is the median
    assert stats.mean_pages == 1.0
    assert stats.std_pages == 0.0

    # The pages histogram is populated
    pages_histogram = stats.histogram_pages
    assert len(pages_histogram.bins) > 0
    assert len(pages_histogram.frequencies) > 0
def test_profile_collection_multiple_documents():
    """Profile three documents of different sizes and check the aggregates."""

    def new_doc(name, mimetype, binary_hash, filename, n_pages):
        # Document with an origin and `n_pages` letter-sized pages
        doc = DoclingDocument(
            name=name,
            origin=DocumentOrigin(mimetype=mimetype, binary_hash=binary_hash, filename=filename),
        )
        for page_no in range(1, n_pages + 1):
            doc.pages[page_no] = PageItem(page_no=page_no, size=Size(width=612, height=792))
        return doc

    def add_texts(doc, contents):
        for content in contents:
            doc.add_text(label=DocItemLabel.TEXT, text=content, orig=content)

    # Document 1: 2 pages, 1 table, 2 pictures, 2 texts
    first = new_doc("Doc1", "application/pdf", 1, "doc1.pdf", 2)
    first.add_table(data=TableData(num_rows=1, num_cols=1))
    for _ in range(2):
        first.add_picture()
    add_texts(first, ["Text 1", "Text 2"])

    # Document 2: 5 pages, 3 tables, 1 picture, 10 texts
    second = new_doc("Doc2", "application/pdf", 2, "doc2.pdf", 5)
    for _ in range(3):
        second.add_table(data=TableData(num_rows=1, num_cols=1))
    second.add_picture()
    add_texts(second, [f"Text {i}" for i in range(10)])

    # Document 3: 1 page, 0 tables, 5 pictures, 2 texts
    third = new_doc("Doc3", "text/html", 3, "doc3.html", 1)
    for _ in range(5):
        third.add_picture()
    add_texts(third, ["T1", "T2"])

    stats = DocumentProfiler.profile_collection([first, second, third], include_individual_stats=True)

    # Basic counts
    assert stats.num_documents == 3
    assert stats.total_pages == 8  # 2 + 5 + 1
    assert stats.total_tables == 4  # 1 + 3 + 0
    assert stats.total_pictures == 8  # 2 + 1 + 5
    assert stats.total_texts == 14  # 2 + 10 + 2

    # Per-metric expectations: (min, max, median = d5 at index 4, mean)
    expected = {
        "pages": (1, 5, 2.0, 8 / 3),
        "tables": (0, 3, 1.0, 4 / 3),
        "pictures": (1, 5, 2.0, 8 / 3),
        "texts": (2, 10, 2.0, 14 / 3),
    }
    for metric, (lowest, highest, median, mean) in expected.items():
        assert getattr(stats, f"min_{metric}") == lowest, metric
        assert getattr(stats, f"max_{metric}") == highest, metric
        assert getattr(stats, f"deciles_{metric}")[4] == median, metric
        assert getattr(stats, f"mean_{metric}") == pytest.approx(mean), metric
        # A histogram exists for every metric
        assert len(getattr(stats, f"histogram_{metric}").bins) > 0, metric

    # Extra page checks: spread, decile ordering [d1 .. d9], histogram counts
    assert stats.std_pages > 0
    assert stats.deciles_pages[0] <= stats.deciles_pages[4] <= stats.deciles_pages[8]
    assert len(stats.histogram_pages.frequencies) > 0

    # Individual document statistics were requested
    assert len(stats.document_stats) == 3

    # MIME type distribution
    assert stats.mimetype_distribution["application/pdf"] == 2
    assert stats.mimetype_distribution["text/html"] == 1

    # Computed fields
    assert stats.total_items == 26  # 14 texts + 4 tables + 8 pictures
    assert stats.avg_items_per_document == pytest.approx(26 / 3)
    assert stats.avg_items_per_page == pytest.approx(26 / 8)
def test_profile_collection_with_iterator():
    """Profile a collection that is supplied lazily through a generator."""

    def make_doc(index):
        # One page and one text item per document
        doc = DoclingDocument(name=f"Doc{index}")
        doc.pages[1] = PageItem(page_no=1, size=Size(width=612, height=792))
        content = f"Text {index}"
        doc.add_text(label=DocItemLabel.TEXT, text=content, orig=content)
        return doc

    lazy_docs = (make_doc(index) for index in range(3))
    stats = DocumentProfiler.profile_collection(lazy_docs)

    assert stats.num_documents == 3
    assert stats.total_pages == 3
    assert stats.total_texts == 3
def test_profile_collection_without_individual_stats():
    """Per-document stats are omitted when include_individual_stats=False."""
    docs = []
    for index in range(3):
        docs.append(DoclingDocument(name=f"Doc{index}"))

    stats = DocumentProfiler.profile_collection(docs, include_individual_stats=False)

    assert len(stats.document_stats) == 0
def test_statistics_serialization():
    """Document and collection statistics both round-trip through JSON."""
    doc = DoclingDocument(name="Test Doc")
    doc.pages[1] = PageItem(page_no=1, size=Size(width=612, height=792))
    doc.add_text(label=DocItemLabel.TEXT, text="Text", orig="Text")

    # Document-level statistics
    doc_stats = DocumentProfiler.profile_document(doc)
    doc_payload = json.loads(doc_stats.model_dump_json())
    assert doc_payload["name"] == "Test Doc"
    assert doc_payload["num_pages"] == 1
    assert doc_payload["total_items"] == 1

    # Collection-level statistics
    coll_stats = DocumentProfiler.profile_collection([doc])
    coll_payload = json.loads(coll_stats.model_dump_json())
    assert coll_payload["num_documents"] == 1
    assert coll_payload["total_pages"] == 1
def test_profile_real_document():
    """Profile a document loaded from the test data; skipped if the file is absent."""
    sample_path = Path("./test/data/doc/2408.09869v3_enriched.json")
    if not sample_path.exists():
        pytest.skip("Test file not found")

    doc = DoclingDocument.load_from_json(sample_path)
    stats = DocumentProfiler.profile_document(doc)

    # The profiled counts must mirror the document's own collections
    assert stats.name == doc.name
    mirrored = (
        ("num_pages", doc.pages),
        ("num_tables", doc.tables),
        ("num_pictures", doc.pictures),
        ("num_texts", doc.texts),
    )
    for counter, items in mirrored:
        assert getattr(stats, counter) == len(items), counter

    assert stats.total_items > 0
def test_label_specific_counts():
    """Each label-specific counter reflects the number of items with that label."""
    doc = DoclingDocument(name="Label Test")

    # (label, text) pairs in insertion order: 1 header, 3 list items,
    # 1 code item, 1 formula and 1 plain text
    labelled_texts = [
        (DocItemLabel.SECTION_HEADER, "Section"),
        (DocItemLabel.LIST_ITEM, "Item 1"),
        (DocItemLabel.LIST_ITEM, "Item 2"),
        (DocItemLabel.LIST_ITEM, "Item 3"),
        (DocItemLabel.CODE, "code"),
        (DocItemLabel.FORMULA, "x=y"),
        (DocItemLabel.TEXT, "Text"),
    ]
    for label, content in labelled_texts:
        doc.add_text(label=label, text=content, orig=content)

    stats = DocumentProfiler.profile_document(doc)

    assert stats.num_section_headers == 1
    assert stats.num_list_items == 3
    assert stats.num_code_items == 1
    assert stats.num_formulas == 1
    assert stats.num_texts == len(labelled_texts) == 7
def test_profile_sample_document(sample_doc):
    """Profile the ``sample_doc`` fixture and cross-check against its contents."""
    stats = DocumentProfiler.profile_document(sample_doc)

    # Basic document properties; the fixture is expected to carry no pages
    assert stats.name == "Untitled 1"
    assert stats.num_pages == 0

    # Item counts must equal the sizes of the fixture's collections
    mirrored = (
        ("num_tables", sample_doc.tables),
        ("num_pictures", sample_doc.pictures),
        ("num_texts", sample_doc.texts),
        ("num_key_value_items", sample_doc.key_value_items),
        ("num_form_items", sample_doc.form_items),
    )
    for counter, items in mirrored:
        assert getattr(stats, counter) == len(items), counter

    # The fixture is expected to contain at least one item of each of these labels
    for counter in ("num_section_headers", "num_list_items", "num_code_items", "num_formulas"):
        assert getattr(stats, counter) > 0, counter

    # total_items is the sum of the five item counters
    assert stats.total_items > 0
    parts = [
        stats.num_texts,
        stats.num_tables,
        stats.num_pictures,
        stats.num_key_value_items,
        stats.num_form_items,
    ]
    assert stats.total_items == sum(parts)

    # No pages, so the per-page average is reported as 0
    assert stats.avg_items_per_page == 0.0
def test_calculate_deciles_empty():
    """_calculate_deciles returns nine zeros for empty input."""
    deciles = DocumentProfiler._calculate_deciles([])
    expected = [0.0] * 9
    assert deciles == expected
def test_calculate_histogram_empty():
    """_calculate_histogram returns an empty histogram for empty input."""
    histogram = DocumentProfiler._calculate_histogram([])

    assert histogram.bins == []
    assert histogram.frequencies == []
    assert histogram.bin_width == 0.0