mirror of
https://github.com/docling-project/docling-core.git
synced 2026-05-17 13:10:44 +00:00
feat: profile a document or collection (#511)
* feat: profile a document or collection Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * chore(profiler): add deciles and histograms Add deciles and histograms to the Docling collection statistics. Add an example script to plot histograms. Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * chore(profiler): add option to plot log frequencies in histogram Add the option to plot the histogram frequencies in logarithmic scale. Extend README with documentation on the document profiler. Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * test(profiler): cover missing lines in doc_profiler with tests Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> --------- Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
This commit is contained in:
committed by
GitHub
parent
b435090fdf
commit
af50f1cb07
@@ -72,6 +72,17 @@ different use cases.
|
||||
- [Hybrid chunking example](https://docling-project.github.io/docling/examples/hybrid_chunking/)
|
||||
- [Advanced chunking and serialization](https://docling-project.github.io/docling/examples/advanced_chunking_and_serialization/)
|
||||
|
||||
### Profiling
|
||||
|
||||
The Profiling API enables extraction of comprehensive statistics from DoclingDocument objects,
|
||||
both for individual documents and collections. It provides metrics on document structure
|
||||
(pages, tables, pictures, text items) along with statistical distributions (deciles, histograms)
|
||||
and visualization capabilities for analyzing document collections at scale.
|
||||
|
||||
👉 More details:
|
||||
- [Document profiling example](./examples/document_profiling.py)
|
||||
- [Collection statistics visualization](./examples/visualize_collection_stats.py)
|
||||
|
||||
## Contributing
|
||||
|
||||
Please read [Contributing to Docling Core](./CONTRIBUTING.md) for details.
|
||||
|
||||
@@ -0,0 +1,17 @@
|
||||
"""Document profiling and statistics module."""
|
||||
|
||||
from docling_core.transforms.profiler.doc_profiler import (
|
||||
CollectionStats,
|
||||
DecilesT,
|
||||
DocumentProfiler,
|
||||
DocumentStats,
|
||||
Histogram,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"CollectionStats",
|
||||
"DecilesT",
|
||||
"DocumentProfiler",
|
||||
"DocumentStats",
|
||||
"Histogram",
|
||||
]
|
||||
@@ -0,0 +1,425 @@
|
||||
"""Document profiler for extracting statistics from DoclingDocument objects."""
|
||||
|
||||
import statistics
|
||||
from collections.abc import Iterable
|
||||
from typing import Annotated
|
||||
|
||||
import numpy as np
|
||||
from annotated_types import Len
|
||||
from pydantic import BaseModel, Field, computed_field
|
||||
from typing_extensions import TypeAliasType
|
||||
|
||||
from docling_core.types.doc import DoclingDocument
|
||||
from docling_core.types.doc.labels import DocItemLabel
|
||||
|
||||
# Named alias so the nine-element constraint appears once in generated schemas.
DecilesT = TypeAliasType(
    "DecilesT",
    Annotated[
        list[float],
        Len(min_length=9, max_length=9),
    ],
)
"""Type alias for deciles: list of 9 floats representing 1st through 9th deciles (10th, 20th, ..., 90th percentiles)."""
|
||||
|
||||
|
||||
class Histogram(BaseModel):
    """Binned frequency counts for a numeric sample.

    A default-constructed instance has no bins, no frequencies and a bin width
    of ``0.0``.
    """

    bins: list[float] = Field(
        default=[],
        description="Histogram bin edges",
    )
    frequencies: list[int] = Field(
        default=[],
        description="Frequency count for each bin",
    )
    bin_width: float = Field(
        default=0.0,
        description="Width of each bin",
    )
|
||||
|
||||
|
||||
class DocumentStats(BaseModel):
    """Per-document counters produced by profiling one DoclingDocument.

    Only ``name`` is required; every counter defaults to ``0`` and
    ``origin_mimetype`` defaults to ``None``. ``total_items`` and
    ``avg_items_per_page`` are computed fields derived from the stored counters.
    """

    name: str = Field(description="Document name")
    num_pages: int = Field(default=0, description="Number of pages in the document")
    num_tables: int = Field(default=0, description="Number of tables in the document")
    num_pictures: int = Field(default=0, description="Number of pictures in the document")
    num_texts: int = Field(default=0, description="Number of text items in the document")
    num_key_value_items: int = Field(
        default=0,
        description="Number of key-value items in the document",
    )
    num_form_items: int = Field(default=0, description="Number of form items in the document")

    # Counts of text items carrying a specific label.
    num_section_headers: int = Field(default=0, description="Number of section headers")
    num_list_items: int = Field(default=0, description="Number of list items")
    num_code_items: int = Field(default=0, description="Number of code items")
    num_formulas: int = Field(default=0, description="Number of formula items")

    # Characteristics of the document as a whole.
    origin_mimetype: str | None = Field(
        default=None,
        description="Origin MIME type if available",
    )
    num_pictures_for_ocr: int = Field(
        default=0,
        description="Number of pictures that would trigger OCR based on area coverage threshold",
    )

    @computed_field  # type: ignore[prop-decorator]
    @property
    def total_items(self) -> int:
        """Sum of text, table, picture, key-value and form item counts."""
        counters = (
            self.num_texts,
            self.num_tables,
            self.num_pictures,
            self.num_key_value_items,
            self.num_form_items,
        )
        return sum(counters)

    @computed_field  # type: ignore[prop-decorator]
    @property
    def avg_items_per_page(self) -> float:
        """Items per page, or ``0.0`` when the document has no pages."""
        pages = self.num_pages
        return 0.0 if pages == 0 else self.total_items / pages
|
||||
|
||||
|
||||
class CollectionStats(BaseModel):
    """Aggregated counters and distributions for a set of DoclingDocument objects.

    For pages, tables, pictures, text items and OCR-candidate pictures the model
    stores the total, minimum, maximum, deciles, histogram, mean and standard
    deviation of the per-document counts. Remaining item kinds are stored as
    totals only. A default-constructed instance holds zeros and empty containers.
    """

    num_documents: int = Field(default=0, description="Total number of documents in the collection")

    # Pages per document
    total_pages: int = Field(default=0, description="Total number of pages across all documents")
    min_pages: int = Field(default=0, description="Minimum number of pages in a document")
    max_pages: int = Field(default=0, description="Maximum number of pages in a document")
    deciles_pages: DecilesT = Field(default=[0.0] * 9, description="Deciles of pages per document")
    histogram_pages: Histogram = Field(default=Histogram(), description="Histogram of pages per document")
    mean_pages: float = Field(default=0.0, description="Mean number of pages per document")
    std_pages: float = Field(default=0.0, description="Standard deviation of pages per document")

    # Tables per document
    total_tables: int = Field(default=0, description="Total number of tables across all documents")
    min_tables: int = Field(default=0, description="Minimum number of tables in a document")
    max_tables: int = Field(default=0, description="Maximum number of tables in a document")
    deciles_tables: DecilesT = Field(default=[0.0] * 9, description="Deciles of tables per document")
    histogram_tables: Histogram = Field(default=Histogram(), description="Histogram of tables per document")
    mean_tables: float = Field(default=0.0, description="Mean number of tables per document")
    std_tables: float = Field(default=0.0, description="Standard deviation of tables per document")

    # Pictures per document
    total_pictures: int = Field(default=0, description="Total number of pictures across all documents")
    min_pictures: int = Field(default=0, description="Minimum number of pictures in a document")
    max_pictures: int = Field(default=0, description="Maximum number of pictures in a document")
    deciles_pictures: DecilesT = Field(default=[0.0] * 9, description="Deciles of pictures per document")
    histogram_pictures: Histogram = Field(default=Histogram(), description="Histogram of pictures per document")
    mean_pictures: float = Field(default=0.0, description="Mean number of pictures per document")
    std_pictures: float = Field(default=0.0, description="Standard deviation of pictures per document")

    # Text items per document
    total_texts: int = Field(default=0, description="Total number of text items across all documents")
    min_texts: int = Field(default=0, description="Minimum number of text items in a document")
    max_texts: int = Field(default=0, description="Maximum number of text items in a document")
    deciles_texts: DecilesT = Field(default=[0.0] * 9, description="Deciles of text items per document")
    histogram_texts: Histogram = Field(default=Histogram(), description="Histogram of text items per document")
    mean_texts: float = Field(default=0.0, description="Mean number of text items per document")
    std_texts: float = Field(default=0.0, description="Standard deviation of text items per document")

    # Item kinds tracked as collection-wide totals only
    total_key_value_items: int = Field(default=0, description="Total number of key-value items")
    total_form_items: int = Field(default=0, description="Total number of form items")
    total_section_headers: int = Field(default=0, description="Total number of section headers")
    total_list_items: int = Field(default=0, description="Total number of list items")
    total_code_items: int = Field(default=0, description="Total number of code items")
    total_formulas: int = Field(default=0, description="Total number of formula items")

    # OCR-candidate pictures per document
    total_pictures_for_ocr: int = Field(
        default=0,
        description="Total number of pictures requiring OCR across all documents",
    )
    min_pictures_for_ocr: int = Field(
        default=0,
        description="Minimum number of pictures requiring OCR in a document",
    )
    max_pictures_for_ocr: int = Field(
        default=0,
        description="Maximum number of pictures requiring OCR in a document",
    )
    deciles_pictures_for_ocr: DecilesT = Field(
        default=[0.0] * 9,
        description="Deciles of pictures requiring OCR per document",
    )
    histogram_pictures_for_ocr: Histogram = Field(
        default=Histogram(),
        description="Histogram of pictures requiring OCR per document",
    )
    mean_pictures_for_ocr: float = Field(
        default=0.0,
        description="Mean number of pictures requiring OCR per document",
    )
    std_pictures_for_ocr: float = Field(
        default=0.0,
        description="Standard deviation of pictures requiring OCR per document",
    )

    # Number of documents seen per origin MIME type
    mimetype_distribution: dict[str, int] = Field(
        default={},
        description="Distribution of MIME types in the collection",
    )

    # Optional per-document breakdown, for detailed analysis
    document_stats: list[DocumentStats] = Field(
        default=[],
        description="Individual statistics for each document",
    )

    @computed_field  # type: ignore[prop-decorator]
    @property
    def total_items(self) -> int:
        """Sum of text, table, picture, key-value and form item totals."""
        totals = (
            self.total_texts,
            self.total_tables,
            self.total_pictures,
            self.total_key_value_items,
            self.total_form_items,
        )
        return sum(totals)

    @computed_field  # type: ignore[prop-decorator]
    @property
    def avg_items_per_document(self) -> float:
        """Items per document, or ``0.0`` for an empty collection."""
        docs = self.num_documents
        return 0.0 if docs == 0 else self.total_items / docs

    @computed_field  # type: ignore[prop-decorator]
    @property
    def avg_items_per_page(self) -> float:
        """Items per page over the whole collection, or ``0.0`` when there are no pages."""
        pages = self.total_pages
        return 0.0 if pages == 0 else self.total_items / pages
|
||||
|
||||
|
||||
class DocumentProfiler:
    """Profiler for extracting statistics from DoclingDocument objects.

    All methods are static; the class only serves as a namespace. Use
    ``profile_document`` for a single document and ``profile_collection`` for an
    iterable of documents (or a single document).
    """

    @staticmethod
    def _calculate_deciles(data: list[int]) -> list[float]:
        """Calculate deciles (1st through 9th) for a list of values.

        Args:
            data: List of integer values

        Returns:
            List of 9 floats representing [d1, d2, d3, d4, d5, d6, d7, d8, d9]
            (10th, 20th, 30th, 40th, 50th, 60th, 70th, 80th, 90th percentiles).
            All zeros when ``data`` is empty.
        """
        if not data:
            return [0.0] * 9

        decile_values = np.percentile(data, [10, 20, 30, 40, 50, 60, 70, 80, 90])
        # Convert numpy scalars to plain floats so the result serializes cleanly.
        return [float(val) for val in decile_values]

    @staticmethod
    def _calculate_histogram(data: list[int], num_bins: int = 10) -> Histogram:
        """Calculate histogram for a list of values.

        Args:
            data: List of integer values
            num_bins: Number of bins for the histogram (default: 10)

        Returns:
            Histogram object with bin edges, per-bin frequencies and the bin
            width. An empty ``Histogram`` when ``data`` is empty.
        """
        if not data:
            return Histogram()

        frequencies, bin_edges = np.histogram(data, bins=num_bins)

        # The width is taken from the first pair of edges.
        bin_width = float(bin_edges[1] - bin_edges[0]) if len(bin_edges) > 1 else 0.0

        return Histogram(
            bins=[float(edge) for edge in bin_edges],
            frequencies=[int(freq) for freq in frequencies],
            bin_width=bin_width,
        )

    @staticmethod
    def _summarize_counts(values: list[int], suffix: str, num_bins: int) -> dict:
        """Build the ``CollectionStats`` keyword arguments for one per-document count.

        Args:
            values: One count per document; must be non-empty.
            suffix: Field-name suffix, e.g. ``"pages"`` yields ``total_pages``,
                ``min_pages``, ``max_pages``, ``deciles_pages``, ``histogram_pages``,
                ``mean_pages`` and ``std_pages``.
            num_bins: Number of histogram bins.

        Returns:
            Mapping from ``CollectionStats`` field name to its value.
        """
        return {
            f"total_{suffix}": sum(values),
            f"min_{suffix}": min(values),
            f"max_{suffix}": max(values),
            f"deciles_{suffix}": DocumentProfiler._calculate_deciles(values),
            f"histogram_{suffix}": DocumentProfiler._calculate_histogram(values, num_bins=num_bins),
            f"mean_{suffix}": statistics.mean(values),
            # The sample standard deviation needs at least two data points.
            f"std_{suffix}": statistics.stdev(values) if len(values) > 1 else 0.0,
        }

    @staticmethod
    def _count_pictures_for_ocr(doc: DoclingDocument, bitmap_coverage_threshold: float) -> int:
        """Count pictures whose bounding box covers enough of their page.

        Only the first provenance item of each picture is considered. Pictures
        without provenance, on a page missing from ``doc.pages``, or on a page
        with non-positive area are not counted.

        Args:
            doc: The document whose pictures are inspected.
            bitmap_coverage_threshold: Minimum picture-area / page-area ratio
                (inclusive) for a picture to be counted.

        Returns:
            Number of pictures at or above the coverage threshold.
        """
        count = 0
        for picture in doc.pictures:
            if not picture.prov:
                continue

            prov = picture.prov[0]
            if prov.page_no not in doc.pages:
                continue

            page_size = doc.pages[prov.page_no].size
            page_area = page_size.width * page_size.height
            if page_area <= 0:
                continue

            # NOTE(review): assumes bbox and page size use the same units - confirm.
            picture_area = prov.bbox.width * prov.bbox.height
            if picture_area / page_area >= bitmap_coverage_threshold:
                count += 1
        return count

    @staticmethod
    def profile_document(doc: DoclingDocument, bitmap_coverage_threshold: float = 0.05) -> DocumentStats:
        """Extract statistics from a single DoclingDocument.

        Args:
            doc: The DoclingDocument to profile
            bitmap_coverage_threshold: Threshold for picture area coverage (0-1) to trigger OCR.
                Pictures with area coverage at or above this threshold are counted as
                requiring OCR. Default is 0.05 (5% of page area).

        Returns:
            DocumentStats containing the extracted metrics
        """
        # Only these labels are reported individually; other labels are still
        # included in ``num_texts``.
        label_counts = {
            DocItemLabel.SECTION_HEADER: 0,
            DocItemLabel.LIST_ITEM: 0,
            DocItemLabel.CODE: 0,
            DocItemLabel.FORMULA: 0,
        }
        for text_item in doc.texts:
            if text_item.label in label_counts:
                label_counts[text_item.label] += 1

        return DocumentStats(
            name=doc.name,
            num_pages=len(doc.pages),
            num_tables=len(doc.tables),
            num_pictures=len(doc.pictures),
            num_texts=len(doc.texts),
            num_key_value_items=len(doc.key_value_items),
            num_form_items=len(doc.form_items),
            num_section_headers=label_counts[DocItemLabel.SECTION_HEADER],
            num_list_items=label_counts[DocItemLabel.LIST_ITEM],
            num_code_items=label_counts[DocItemLabel.CODE],
            num_formulas=label_counts[DocItemLabel.FORMULA],
            origin_mimetype=doc.origin.mimetype if doc.origin else None,
            num_pictures_for_ocr=DocumentProfiler._count_pictures_for_ocr(doc, bitmap_coverage_threshold),
        )

    @staticmethod
    def profile_collection(
        documents: Iterable[DoclingDocument] | DoclingDocument,
        include_individual_stats: bool = False,
        bitmap_coverage_threshold: float = 0.05,
        num_bins: int = 10,
    ) -> CollectionStats:
        """Extract statistics from a collection of DoclingDocument objects.

        The input is consumed in a single pass, so a generator may be supplied.

        Args:
            documents: An iterable of DoclingDocument objects, or a single document
            include_individual_stats: Whether to include individual document statistics
                in the result (useful for detailed analysis but increases memory usage)
            bitmap_coverage_threshold: Threshold for picture area coverage (0-1) to
                trigger OCR. Pictures with area coverage at or above this threshold are
                counted as requiring OCR. Default is 0.05 (5% of page area).
            num_bins: Number of bins for histograms. Default is 10.

        Returns:
            CollectionStats containing the aggregated metrics. A default
            (all-zero) CollectionStats when no documents were supplied.
        """
        # Handle single document case
        if isinstance(documents, DoclingDocument):
            documents = [documents]

        doc_stats_list: list[DocumentStats] = []

        # Per-document counts for which distributions are computed.
        pages_list: list[int] = []
        tables_list: list[int] = []
        pictures_list: list[int] = []
        texts_list: list[int] = []
        pictures_for_ocr_list: list[int] = []

        # Item kinds that are only reported as collection-wide totals.
        total_key_value_items = 0
        total_form_items = 0
        total_section_headers = 0
        total_list_items = 0
        total_code_items = 0
        total_formulas = 0

        mimetype_distribution: dict[str, int] = {}

        for doc in documents:
            doc_stats = DocumentProfiler.profile_document(doc, bitmap_coverage_threshold=bitmap_coverage_threshold)

            if include_individual_stats:
                doc_stats_list.append(doc_stats)

            pages_list.append(doc_stats.num_pages)
            tables_list.append(doc_stats.num_tables)
            pictures_list.append(doc_stats.num_pictures)
            texts_list.append(doc_stats.num_texts)
            pictures_for_ocr_list.append(doc_stats.num_pictures_for_ocr)

            total_key_value_items += doc_stats.num_key_value_items
            total_form_items += doc_stats.num_form_items
            total_section_headers += doc_stats.num_section_headers
            total_list_items += doc_stats.num_list_items
            total_code_items += doc_stats.num_code_items
            total_formulas += doc_stats.num_formulas

            # Documents without a known MIME type are left out of the distribution.
            if doc_stats.origin_mimetype:
                mimetype_distribution[doc_stats.origin_mimetype] = (
                    mimetype_distribution.get(doc_stats.origin_mimetype, 0) + 1
                )

        num_documents = len(pages_list)

        # min()/max()/mean() are undefined on empty input, so return defaults.
        if num_documents == 0:
            return CollectionStats()

        summarize = DocumentProfiler._summarize_counts
        return CollectionStats(
            num_documents=num_documents,
            **summarize(pages_list, "pages", num_bins),
            **summarize(tables_list, "tables", num_bins),
            **summarize(pictures_list, "pictures", num_bins),
            **summarize(texts_list, "texts", num_bins),
            **summarize(pictures_for_ocr_list, "pictures_for_ocr", num_bins),
            total_key_value_items=total_key_value_items,
            total_form_items=total_form_items,
            total_section_headers=total_section_headers,
            total_list_items=total_list_items,
            total_code_items=total_code_items,
            total_formulas=total_formulas,
            mimetype_distribution=mimetype_distribution,
            # Stays empty unless include_individual_stats was requested.
            document_stats=doc_stats_list,
        )
|
||||
@@ -0,0 +1,250 @@
|
||||
"""Example usage of the document profiler for extracting statistics."""
|
||||
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
from docling_core.transforms.profiler import DocumentProfiler
|
||||
from docling_core.types.doc import DoclingDocument
|
||||
|
||||
|
||||
def profile_single_document():
    """Example: load one document from disk, profile it and print the result.

    Prints a notice and returns early when the sample file is missing.
    """
    banner = "=" * 80
    print(banner)
    print("Example 1: Profiling a Single Document")
    print(banner)

    sample_path = Path("./examples/2408.09869v3.json")
    if not sample_path.exists():
        print(f"Document not found: {sample_path}")
        return

    stats = DocumentProfiler.profile_document(DoclingDocument.load_from_json(sample_path))

    # One entry per printed line, in display order.
    report_lines = [
        f"\nDocument: {stats.name}",
        f"Pages: {stats.num_pages}",
        f"Tables: {stats.num_tables}",
        f"Pictures: {stats.num_pictures}",
        f"Text items: {stats.num_texts}",
        f" - Section headers: {stats.num_section_headers}",
        f" - List items: {stats.num_list_items}",
        f" - Code blocks: {stats.num_code_items}",
        f" - Formulas: {stats.num_formulas}",
        f"\nTotal items: {stats.total_items}",
        f"Average items per page: {stats.avg_items_per_page:.2f}",
        f"\nOrigin MIME type: {stats.origin_mimetype}",
        f"Pictures requiring OCR: {stats.num_pictures_for_ocr}",
    ]
    for line in report_lines:
        print(line)

    # Show only the head of the JSON serialization to keep the output short.
    serialized = stats.model_dump_json(indent=2)
    print(f"\nJSON export (first 500 chars):\n{serialized[:500]}...")
|
||||
|
||||
|
||||
def profile_document_collection():
    """Example: profile every JSON document in a directory and print a summary.

    Files that fail to load are reported and skipped. Returns early when the
    directory is missing or no document could be loaded.
    """
    banner = "=" * 80
    print("\n" + banner)
    print("Example 2: Profiling a Document Collection")
    print(banner)

    source_dir = Path("./test/data/doc")
    if not source_dir.exists():
        print(f"Directory not found: {source_dir}")
        return

    loaded = []
    for json_file in source_dir.glob("*.json"):
        try:
            loaded.append(DoclingDocument.load_from_json(json_file))
        except Exception as e:
            print(f"Skipping {json_file.name}: {e}")

    if not loaded:
        print("No documents found")
        return

    print(f"\nLoaded {len(loaded)} documents")

    stats = DocumentProfiler.profile_collection(loaded, include_individual_stats=True)

    print("\nCollection Statistics:")
    print(f"Number of documents: {stats.num_documents}")

    # (heading, field-name suffix, whether to print the histogram line)
    sections = (
        ("Pages", "pages", True),
        ("Tables", "tables", False),
        ("Pictures", "pictures", False),
        ("Text Items", "texts", False),
        ("Pictures Requiring OCR", "pictures_for_ocr", False),
    )
    for heading, suffix, with_histogram in sections:
        total = getattr(stats, f"total_{suffix}")
        lowest = getattr(stats, f"min_{suffix}")
        highest = getattr(stats, f"max_{suffix}")
        deciles = getattr(stats, f"deciles_{suffix}")
        mean = getattr(stats, f"mean_{suffix}")
        std = getattr(stats, f"std_{suffix}")

        print(f"\n{heading}:")
        print(f" Total: {total}")
        print(f" Min: {lowest}, Max: {highest}")
        print(f" Median (d5): {deciles[4]:.1f}, Mean: {mean:.2f}")
        print(f" Deciles: d1={deciles[0]:.1f}, d5={deciles[4]:.1f}, d9={deciles[8]:.1f}")
        print(f" Std Dev: {std:.2f}")
        if with_histogram:
            histogram = getattr(stats, f"histogram_{suffix}")
            print(f" Histogram bins: {len(histogram.bins)}, bin width: {histogram.bin_width:.2f}")

    if stats.mimetype_distribution:
        print("\nMIME Type Distribution:")
        for mimetype, count in sorted(stats.mimetype_distribution.items()):
            print(f" {mimetype}: {count}")

    print("\nComputed Metrics:")
    print(f" Total items: {stats.total_items}")
    print(f" Avg items per document: {stats.avg_items_per_document:.2f}")
    print(f" Avg items per page: {stats.avg_items_per_page:.2f}")

    if stats.document_stats:
        print("\nIndividual Document Statistics:")
        # Only the first three documents are listed.
        for position, entry in enumerate(stats.document_stats[:3], 1):
            print(f"\n Document {position}: {entry.name}")
            counts = (
                f" Pages: {entry.num_pages}, Tables: {entry.num_tables}, "
                f"Pictures: {entry.num_pictures}, Texts: {entry.num_texts}"
            )
            print(counts)
|
||||
|
||||
|
||||
def profile_with_generator():
    """Example: Profile documents using a generator (memory efficient).

    Documents are loaded lazily and handed to the profiler one at a time, so the
    full collection is never held in memory. Files that fail to load are
    reported and skipped. Returns early when the directory is missing.
    """
    print("\n" + "=" * 80)
    print("Example 3: Profiling with Generator (Memory Efficient)")
    print("=" * 80)

    doc_dir = Path("./test/data/doc")
    if not doc_dir.exists():
        print(f"Directory not found: {doc_dir}")
        return

    def document_generator():
        """Yield successfully loaded documents one at a time."""
        for json_file in doc_dir.glob("*.json"):
            try:
                doc = DoclingDocument.load_from_json(json_file)
            except Exception as exc:
                # Report the skip so the document count below is explainable.
                print(f"Skipping {json_file.name}: {exc}")
                continue
            yield doc

    # perf_counter is monotonic, so the measurement is immune to clock changes.
    start_time = time.perf_counter()
    stats = DocumentProfiler.profile_collection(
        document_generator(),
        include_individual_stats=False,  # Don't store individual stats to save memory
    )
    elapsed_time = time.perf_counter() - start_time

    print(f"\nProcessed {stats.num_documents} documents in {elapsed_time:.2f} seconds")
    print(f"Total pages: {stats.total_pages}")
    print(f"Total tables: {stats.total_tables}")
    print(f"Total pictures: {stats.total_pictures}")
    print(f"Mean pages per document: {stats.mean_pages:.2f}")
|
||||
|
||||
|
||||
def export_statistics_report():
    """Example: Export statistics to a JSON report.

    Profiles every loadable JSON document in the sample directory and writes the
    collection statistics to ``./document_statistics_report.json``. Files that
    fail to load are reported and skipped. Returns early when the directory is
    missing or no document could be loaded.
    """
    print("\n" + "=" * 80)
    print("Example 4: Exporting Statistics Report")
    print("=" * 80)

    doc_dir = Path("./test/data/doc")
    if not doc_dir.exists():
        print(f"Directory not found: {doc_dir}")
        return

    # Load documents
    docs = []
    for json_file in doc_dir.glob("*.json"):
        try:
            docs.append(DoclingDocument.load_from_json(json_file))
        except Exception as exc:
            print(f"Skipping {json_file.name}: {exc}")

    if not docs:
        print("No documents found")
        return

    # Profile collection
    stats = DocumentProfiler.profile_collection(docs, include_individual_stats=True)

    # Write with an explicit encoding so the report does not depend on the
    # platform's locale.
    output_file = Path("./document_statistics_report.json")
    output_file.write_text(stats.model_dump_json(indent=2), encoding="utf-8")

    print(f"\nStatistics report exported to: {output_file}")
    print(f"File size: {output_file.stat().st_size} bytes")

    # Also export as Python dict for further processing
    stats_dict = stats.model_dump()
    print(f"\nStatistics as dict (keys): {list(stats_dict.keys())[:10]}...")
|
||||
|
||||
|
||||
def analyze_document_characteristics():
    """Example: Analyze specific document characteristics.

    Profiles each JSON document in the sample directory individually and lists
    the five documents with the most pictures flagged for OCR. Files that fail
    to load or profile are reported and skipped. Returns early when the
    directory is missing.
    """
    print("\n" + "=" * 80)
    print("Example 5: Analyzing Document Characteristics")
    print("=" * 80)

    doc_dir = Path("./test/data/doc")
    if not doc_dir.exists():
        print(f"Directory not found: {doc_dir}")
        return

    # (document name, number of OCR-candidate pictures) per affected document
    ocr_candidate_docs = []

    for json_file in doc_dir.glob("*.json"):
        try:
            doc = DoclingDocument.load_from_json(json_file)
            stats = DocumentProfiler.profile_document(doc)
        except Exception as exc:
            # Report the skip instead of silently hiding broken inputs.
            print(f"Skipping {json_file.name}: {exc}")
            continue

        if stats.num_pictures_for_ocr > 0:
            ocr_candidate_docs.append((stats.name, stats.num_pictures_for_ocr))

    print(f"\nDocuments with OCR requirements: {len(ocr_candidate_docs)}")
    # Sorting an empty list is a no-op, so no emptiness check is needed.
    for name, count in sorted(ocr_candidate_docs, key=lambda x: x[1], reverse=True)[:5]:
        print(f" - {name}: {count} pictures require OCR")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Run every example in order, then print a closing banner.
    examples = (
        profile_single_document,
        profile_document_collection,
        profile_with_generator,
        export_statistics_report,
        analyze_document_characteristics,
    )
    for run_example in examples:
        run_example()

    closing_rule = "=" * 80
    print("\n" + closing_rule)
    print("Examples completed!")
    print(closing_rule)
|
||||
|
||||
@@ -0,0 +1,319 @@
|
||||
"""Visualization utilities for collection statistics.
|
||||
|
||||
This module provides utilities for creating charts from CollectionStats data.
|
||||
Requires matplotlib to be installed (available with 'examples' extra).
|
||||
|
||||
Install with: pip install docling-core[examples]
|
||||
"""
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Literal
|
||||
|
||||
try:
|
||||
import matplotlib.figure
|
||||
import matplotlib.pyplot as plt
|
||||
MATPLOTLIB_AVAILABLE = True
|
||||
except ImportError:
|
||||
MATPLOTLIB_AVAILABLE = False
|
||||
|
||||
from docling_core.transforms.profiler.doc_profiler import CollectionStats, Histogram
|
||||
|
||||
|
||||
class StatsVisualizer:
    """Visualizer for creating charts from CollectionStats data.

    The class is a namespace of static methods. Each plotting method creates
    and returns a new matplotlib Figure and raises ImportError when
    matplotlib is not installed.
    """

    # Decile positions d1..d9 used on the x-axis of the decile plots
    _DECILE_LABELS = list(range(1, 10))
    # Metrics plotted when the caller does not pass ``metrics``
    _DEFAULT_METRICS = ("pages", "tables", "pictures", "texts")

    @staticmethod
    def _check_matplotlib() -> None:
        """Check if matplotlib is available.

        Raises:
            ImportError: If matplotlib could not be imported at module load
        """
        if not MATPLOTLIB_AVAILABLE:
            raise ImportError(
                "matplotlib is required for visualization. "
                "Install it with: pip install docling-core[examples]"
            )

    @staticmethod
    def _draw_histogram_bars(ax: "matplotlib.axes.Axes", histogram: Histogram, color: str) -> None:
        """Draw the bars of a histogram on the given axes.

        Nothing is drawn when the histogram has no bins or no frequencies.

        Args:
            ax: Axes to draw on
            histogram: Histogram object containing bins and frequencies
            color: Bar color
        """
        bins = histogram.bins
        frequencies = histogram.frequencies
        if len(bins) == 0 or len(frequencies) == 0:
            return

        # bins has n+1 edges, frequencies has n values: center each bar between its two edges
        bin_centers = [(bins[i] + bins[i + 1]) / 2 for i in range(len(frequencies))]
        # Bars are drawn at 90% of the bin width to leave a small gap between neighbours
        ax.bar(
            bin_centers,
            frequencies,
            width=histogram.bin_width * 0.9,
            color=color,
            edgecolor="black",
            alpha=0.7,
        )

    @staticmethod
    def plot_histogram(
        histogram: Histogram,
        title: str = "Distribution",
        xlabel: str = "Value",
        ylabel: str = "Frequency",
        color: str = "steelblue",
        figsize: tuple[int, int] = (10, 6),
        log_scale: bool = False,
    ) -> "matplotlib.figure.Figure":
        """Plot a histogram from Histogram data.

        Args:
            histogram: Histogram object containing bins and frequencies
            title: Plot title
            xlabel: X-axis label
            ylabel: Y-axis label; " (log scale)" is appended when log_scale is True
            color: Bar color
            figsize: Figure size as (width, height)
            log_scale: If True, use logarithmic scale for y-axis (frequency counts)

        Returns:
            matplotlib Figure object

        Raises:
            ImportError: If matplotlib is not installed
        """
        StatsVisualizer._check_matplotlib()

        fig, ax = plt.subplots(figsize=figsize)
        StatsVisualizer._draw_histogram_bars(ax, histogram, color)

        ax.set_xlabel(xlabel, fontsize=12)
        ax.set_ylabel(f"{ylabel} (log scale)" if log_scale else ylabel, fontsize=12)
        ax.set_title(title, fontsize=14, fontweight="bold")
        ax.grid(axis="y", alpha=0.3, linestyle="--")

        if log_scale:
            ax.set_yscale("log")

        # Lay out this figure explicitly rather than pyplot's implicit "current" figure
        fig.tight_layout()
        return fig

    @staticmethod
    def plot_deciles(
        deciles: list[float],
        title: str = "Decile Distribution",
        ylabel: str = "Value",
        color: str = "coral",
        figsize: tuple[int, int] = (10, 6),
    ) -> "matplotlib.figure.Figure":
        """Plot deciles as a line chart.

        Args:
            deciles: List of 9 decile values [d1, d2, ..., d9] (10th, 20th, ..., 90th percentiles)
            title: Plot title
            ylabel: Y-axis label
            color: Line color
            figsize: Figure size as (width, height)

        Returns:
            matplotlib Figure object

        Raises:
            ImportError: If matplotlib is not installed
            ValueError: If deciles does not contain exactly 9 values
        """
        StatsVisualizer._check_matplotlib()

        decile_labels = StatsVisualizer._DECILE_LABELS
        # Validate before creating the figure so a bad argument does not leave an open figure behind
        if len(deciles) != len(decile_labels):
            raise ValueError(f"Expected {len(decile_labels)} decile values (d1..d9), got {len(deciles)}")

        fig, ax = plt.subplots(figsize=figsize)

        ax.plot(decile_labels, deciles, marker="o", linewidth=2, markersize=8, color=color)
        ax.fill_between(decile_labels, deciles, alpha=0.3, color=color)

        # Highlight median (d5 = 50th percentile)
        ax.axvline(x=5, color="red", linestyle="--", alpha=0.5, label="Median (d5)")

        ax.set_xlabel("Decile", fontsize=12)
        ax.set_ylabel(ylabel, fontsize=12)
        ax.set_title(title, fontsize=14, fontweight="bold")
        ax.set_xticks(decile_labels)
        # Decile d corresponds to percentile 10 * d
        ax.set_xticklabels([f"d{d} (p{d * 10})" for d in decile_labels])
        ax.grid(True, alpha=0.3, linestyle="--")
        ax.legend()

        fig.tight_layout()
        return fig

    @staticmethod
    def plot_collection_overview(
        stats: CollectionStats,
        metrics: list[Literal["pages", "tables", "pictures", "texts"]] | None = None,
        figsize: tuple[int, int] = (16, 10),
        log_scale: bool = False,
    ) -> "matplotlib.figure.Figure":
        """Create a comprehensive overview plot with multiple histograms.

        The histograms are arranged in a grid with two columns; grid cells
        without a metric are hidden.

        Args:
            stats: CollectionStats object
            metrics: List of metrics to plot. If None, plots all available metrics.
            figsize: Figure size as (width, height)
            log_scale: If True, use logarithmic scale for y-axis (frequency counts)

        Returns:
            matplotlib Figure object with subplots

        Raises:
            ImportError: If matplotlib is not installed
            ValueError: If metrics is an empty list
            KeyError: If metrics contains an unknown metric name
        """
        StatsVisualizer._check_matplotlib()

        if metrics is None:
            metrics = list(StatsVisualizer._DEFAULT_METRICS)
        if not metrics:
            # A grid with zero rows cannot be created; fail with a clear message instead
            raise ValueError("metrics must contain at least one metric to plot")

        metric_config = {
            "pages": {
                "histogram": stats.histogram_pages,
                "title": "Pages per Document",
                "color": "steelblue",
            },
            "tables": {
                "histogram": stats.histogram_tables,
                "title": "Tables per Document",
                "color": "forestgreen",
            },
            "pictures": {
                "histogram": stats.histogram_pictures,
                "title": "Pictures per Document",
                "color": "coral",
            },
            "texts": {
                "histogram": stats.histogram_texts,
                "title": "Text Items per Document",
                "color": "mediumpurple",
            },
        }
        # Resolve every metric up front so an unknown name fails before a figure is created
        configs = [metric_config[metric] for metric in metrics]

        n_metrics = len(configs)
        n_cols = 2
        n_rows = (n_metrics + n_cols - 1) // n_cols  # ceiling division

        # squeeze=False always yields a 2-D array of axes, also for a single row
        fig, axes = plt.subplots(n_rows, n_cols, figsize=figsize, squeeze=False)

        ylabel = "Frequency (log scale)" if log_scale else "Frequency"
        for idx, config in enumerate(configs):
            row, col = divmod(idx, n_cols)
            ax = axes[row, col]

            StatsVisualizer._draw_histogram_bars(ax, config["histogram"], config["color"])

            ax.set_xlabel("Count", fontsize=10)
            ax.set_ylabel(ylabel, fontsize=10)
            ax.set_title(config["title"], fontsize=12, fontweight="bold")
            ax.grid(axis="y", alpha=0.3, linestyle="--")

            if log_scale:
                ax.set_yscale("log")

        # Hide unused subplots
        for idx in range(n_metrics, n_rows * n_cols):
            row, col = divmod(idx, n_cols)
            axes[row, col].axis("off")

        fig.suptitle(
            f"Collection Statistics Overview ({stats.num_documents} documents)",
            fontsize=16,
            fontweight="bold",
        )
        fig.tight_layout()
        return fig

    @staticmethod
    def plot_deciles_comparison(
        stats: CollectionStats,
        metrics: list[Literal["pages", "tables", "pictures", "texts"]] | None = None,
        figsize: tuple[int, int] = (12, 6),
    ) -> "matplotlib.figure.Figure":
        """Create a comparison plot of deciles for multiple metrics.

        Args:
            stats: CollectionStats object
            metrics: List of metrics to plot. If None, plots all available metrics.
            figsize: Figure size as (width, height)

        Returns:
            matplotlib Figure object

        Raises:
            ImportError: If matplotlib is not installed
            KeyError: If metrics contains an unknown metric name
        """
        StatsVisualizer._check_matplotlib()

        if metrics is None:
            metrics = list(StatsVisualizer._DEFAULT_METRICS)

        metric_config = {
            "pages": {"deciles": stats.deciles_pages, "label": "Pages", "color": "steelblue"},
            "tables": {"deciles": stats.deciles_tables, "label": "Tables", "color": "forestgreen"},
            "pictures": {"deciles": stats.deciles_pictures, "label": "Pictures", "color": "coral"},
            "texts": {"deciles": stats.deciles_texts, "label": "Text Items", "color": "mediumpurple"},
        }
        # Resolve every metric up front so an unknown name fails before a figure is created
        configs = [metric_config[metric] for metric in metrics]

        decile_labels = StatsVisualizer._DECILE_LABELS
        fig, ax = plt.subplots(figsize=figsize)

        # One line per metric, all sharing the same decile axis
        for config in configs:
            ax.plot(
                decile_labels,
                config["deciles"],
                marker="o",
                linewidth=2,
                markersize=6,
                label=config["label"],
                color=config["color"],
            )

        ax.axvline(x=5, color="red", linestyle="--", alpha=0.3, label="Median (d5)")

        ax.set_xlabel("Decile", fontsize=12)
        ax.set_ylabel("Count", fontsize=12)
        ax.set_title("Decile Comparison Across Metrics", fontsize=14, fontweight="bold")
        ax.set_xticks(decile_labels)
        ax.set_xticklabels([f"d{d}" for d in decile_labels])
        ax.grid(True, alpha=0.3, linestyle="--")
        ax.legend(loc="best")

        fig.tight_layout()
        return fig

    @staticmethod
    def save_figure(fig: "matplotlib.figure.Figure", filepath: str | Path, dpi: int = 300) -> None:
        """Save a matplotlib figure to file.

        Args:
            fig: matplotlib Figure object
            filepath: Output file path (supports .png, .pdf, .svg, etc.)
            dpi: Resolution in dots per inch

        Raises:
            ImportError: If matplotlib is not installed
        """
        StatsVisualizer._check_matplotlib()
        fig.savefig(filepath, dpi=dpi, bbox_inches="tight")

    @staticmethod
    def show_figure(fig: "matplotlib.figure.Figure") -> None:
        """Display a matplotlib figure.

        NOTE(review): ``fig`` is not used to select what is shown; the call
        delegates to ``plt.show()``, which per the pyplot documentation
        displays all open figures.

        Args:
            fig: matplotlib Figure object

        Raises:
            ImportError: If matplotlib is not installed
        """
        StatsVisualizer._check_matplotlib()
        plt.show()
|
||||
|
||||
@@ -0,0 +1,257 @@
|
||||
"""Example: Visualizing Collection Statistics with Charts.
|
||||
|
||||
This example demonstrates how to use the StatsVisualizer to create
|
||||
various charts from CollectionStats data.
|
||||
|
||||
Requirements:
|
||||
pip install docling-core[examples] # Includes matplotlib
|
||||
"""
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from stats_visualizer import StatsVisualizer
|
||||
|
||||
from docling_core.transforms.profiler import CollectionStats, DocumentProfiler
|
||||
from docling_core.types.doc import DoclingDocument
|
||||
|
||||
|
||||
def load_documents_and_profile(doc_dir: Path) -> CollectionStats | None:
    """Load documents from directory and profile them.

    Every ``*.json`` file directly inside ``doc_dir`` is loaded as a
    DoclingDocument. Files that fail to load are skipped, and the number of
    skipped files is printed so that load problems do not go unnoticed.

    Args:
        doc_dir: Directory containing JSON documents

    Returns:
        CollectionStats object, or None if the directory does not exist or
        no document could be loaded
    """
    if not doc_dir.exists():
        print(f"Directory not found: {doc_dir}")
        return None

    docs = []
    skipped = 0
    for json_file in doc_dir.glob("*.json"):
        try:
            docs.append(DoclingDocument.load_from_json(json_file))
        except Exception:
            # Best effort: keep going when a single file cannot be loaded, but count it
            skipped += 1

    if skipped:
        print(f"Skipped {skipped} JSON file(s) that could not be loaded as DoclingDocument")

    if not docs:
        print("No documents found")
        return None

    # Profile collection
    stats = DocumentProfiler.profile_collection(docs)
    print(f"Loaded and profiled {stats.num_documents} documents")
    return stats
|
||||
|
||||
|
||||
def visualize_single_histogram(stats: CollectionStats):
    """Example 1: Plot the pages histogram, once with a linear and once with a log y-axis."""
    banner = "=" * 80
    print("\n" + banner)
    print("Example 1: Single Histogram Plot")
    print(banner)

    # Settings common to both renderings of the pages histogram
    shared_options = {
        "histogram": stats.histogram_pages,
        "xlabel": "Number of Pages",
        "ylabel": "Number of Documents",
        "color": "steelblue",
    }

    # Linear scale rendering
    linear_fig = StatsVisualizer.plot_histogram(
        title="Distribution of Pages per Document",
        **shared_options,
    )
    linear_path = Path("./pages_histogram.png")
    StatsVisualizer.save_figure(linear_fig, linear_path)
    print(f"Saved histogram to: {linear_path}")

    # Logarithmic scale rendering
    log_fig = StatsVisualizer.plot_histogram(
        title="Distribution of Pages per Document (Log Scale)",
        log_scale=True,
        **shared_options,
    )
    log_path = Path("./pages_histogram_log.png")
    StatsVisualizer.save_figure(log_fig, log_path)
    print(f"Saved histogram (log scale) to: {log_path}")
|
||||
|
||||
|
||||
def visualize_deciles(stats: CollectionStats):
    """Example 2: Plot the decile distribution of tables per document."""
    banner = "=" * 80
    print("\n" + banner)
    print("Example 2: Decile Distribution Plot")
    print(banner)

    # Line chart of the table deciles
    table_deciles = stats.deciles_tables
    decile_fig = StatsVisualizer.plot_deciles(
        deciles=table_deciles,
        title="Decile Distribution of Tables per Document",
        ylabel="Number of Tables",
        color="forestgreen",
    )

    # Write the chart to the current directory
    target = Path("./tables_deciles.png")
    StatsVisualizer.save_figure(decile_fig, target)
    print(f"Saved decile plot to: {target}")
|
||||
|
||||
|
||||
def visualize_collection_overview(stats: CollectionStats):
    """Example 3: Plot an overview of all four metrics, in linear and in log scale."""
    banner = "=" * 80
    print("\n" + banner)
    print("Example 3: Collection Overview (Multiple Histograms)")
    print(banner)

    # Settings common to both overviews
    overview_options = {
        "stats": stats,
        "metrics": ["pages", "tables", "pictures", "texts"],
        "figsize": (16, 10),
    }

    # Linear scale overview
    linear_fig = StatsVisualizer.plot_collection_overview(**overview_options)
    linear_path = Path("./collection_overview.png")
    StatsVisualizer.save_figure(linear_fig, linear_path)
    print(f"Saved collection overview to: {linear_path}")

    # Logarithmic scale overview
    log_fig = StatsVisualizer.plot_collection_overview(log_scale=True, **overview_options)
    log_path = Path("./collection_overview_log.png")
    StatsVisualizer.save_figure(log_fig, log_path)
    print(f"Saved collection overview (log scale) to: {log_path}")
|
||||
|
||||
|
||||
def visualize_deciles_comparison(stats: CollectionStats):
    """Example 4: Plot the deciles of all four metrics in a single chart."""
    banner = "=" * 80
    print("\n" + banner)
    print("Example 4: Decile Comparison Across Metrics")
    print(banner)

    # One line per metric on a shared decile axis
    compared_metrics = ["pages", "tables", "pictures", "texts"]
    comparison_fig = StatsVisualizer.plot_deciles_comparison(
        stats=stats,
        metrics=compared_metrics,
        figsize=(12, 6),
    )

    # Write the chart to the current directory
    target = Path("./deciles_comparison.png")
    StatsVisualizer.save_figure(comparison_fig, target)
    print(f"Saved decile comparison to: {target}")
|
||||
|
||||
|
||||
def create_custom_visualization(stats: CollectionStats):
    """Example 5: Build three customized charts for selected metrics."""
    banner = "=" * 80
    print("\n" + banner)
    print("Example 5: Custom Visualization")
    print(banner)

    # Pictures histogram; log scale because low values have high frequencies
    pictures_options = {
        "histogram": stats.histogram_pictures,
        "title": "Picture Distribution (Log Scale)",
        "xlabel": "Pictures per Document",
        "ylabel": "Frequency",
        "color": "coral",
        "figsize": (10, 6),
        "log_scale": True,
    }
    pictures_fig = StatsVisualizer.plot_histogram(**pictures_options)
    StatsVisualizer.save_figure(pictures_fig, "./pictures_histogram_log.png")
    print("Saved pictures histogram (log scale)")

    # Decile chart restricted to text items
    texts_options = {
        "deciles": stats.deciles_texts,
        "title": "Text Items Decile Distribution",
        "ylabel": "Number of Text Items",
        "color": "mediumpurple",
        "figsize": (10, 6),
    }
    texts_fig = StatsVisualizer.plot_deciles(**texts_options)
    StatsVisualizer.save_figure(texts_fig, "./texts_deciles.png")
    print("Saved texts decile plot")

    # Overview limited to pages and tables, in log scale
    overview_fig = StatsVisualizer.plot_collection_overview(
        stats=stats,
        metrics=["pages", "tables"],
        figsize=(12, 6),
        log_scale=True,
    )
    StatsVisualizer.save_figure(overview_fig, "./pages_tables_overview_log.png")
    print("Saved pages and tables overview (log scale)")
|
||||
|
||||
|
||||
def display_statistics_summary(stats: CollectionStats):
    """Example 6: Print range, median and mean for each metric of the collection."""
    banner = "=" * 80
    print("\n" + banner)
    print("Example 6: Statistics Summary")
    print(banner)

    print(f"\nCollection Summary ({stats.num_documents} documents):")

    # (section heading, attribute suffix) for every metric, in print order
    sections = (
        ("Pages", "pages"),
        ("Tables", "tables"),
        ("Pictures", "pictures"),
        ("Text Items", "texts"),
    )
    for heading, suffix in sections:
        lowest = getattr(stats, f"min_{suffix}")
        highest = getattr(stats, f"max_{suffix}")
        deciles = getattr(stats, f"deciles_{suffix}")
        mean = getattr(stats, f"mean_{suffix}")

        print(f"\n{heading}:")
        print(f" Range: {lowest} - {highest}")
        # Index 4 holds d5, i.e. the median
        print(f" Median (d5): {deciles[4]:.1f}")
        print(f" Mean: {mean:.2f}")
        if suffix == "pages":
            # Only the pages section additionally lists d1, d5 and d9
            print(f" Deciles: d1={deciles[0]:.1f}, d5={deciles[4]:.1f}, d9={deciles[8]:.1f}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: profile the sample documents once, then run the
    # visualization examples. Every failure path ends with exit status 1.
    try:
        # Load documents once and profile them
        doc_dir = Path("./test/data/doc")
        stats = load_documents_and_profile(doc_dir)

        if stats is None:
            print("Failed to load documents. Exiting.")
            # SystemExit instead of exit(): the exit() helper is injected by the
            # site module and is not guaranteed to exist in every interpreter setup.
            raise SystemExit(1)

        # Run all examples with the same stats object
        visualize_single_histogram(stats)
        visualize_deciles(stats)
        visualize_collection_overview(stats)
        # NOTE(review): Example 4 is disabled in the original script; reason not documented.
        # visualize_deciles_comparison(stats)
        create_custom_visualization(stats)
        display_statistics_summary(stats)

        print("\n" + "=" * 80)
        print("All visualizations created successfully!")
        print("Check the current directory for generated PNG files.")
        print("=" * 80)

    except ImportError as e:
        print(f"\nError: {e}")
        print("\nTo run this example, install matplotlib:")
        print(" pip install docling-core[examples]")
        # Signal the failure to the calling shell
        raise SystemExit(1)
    except Exception as e:
        print(f"\nError: {e}")
        import traceback

        traceback.print_exc()
        # Signal the failure to the calling shell
        raise SystemExit(1)
|
||||
|
||||
@@ -0,0 +1,401 @@
|
||||
"""Tests for document profiler."""
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from docling_core.transforms.profiler import DocumentProfiler
|
||||
from docling_core.types.doc import BoundingBox, DoclingDocument, ProvenanceItem
|
||||
from docling_core.types.doc.document import DocumentOrigin, PageItem, Size, TableData
|
||||
from docling_core.types.doc.labels import DocItemLabel
|
||||
|
||||
|
||||
def test_profile_empty_document():
    """Profiling a document without content reports zero for every counter."""
    empty_doc = DoclingDocument(name="Empty Document")

    profile = DocumentProfiler.profile_document(empty_doc)

    assert profile.name == "Empty Document"
    # Every integer counter is expected to be zero
    zero_counters = (
        "num_pages",
        "num_tables",
        "num_pictures",
        "num_texts",
        "num_key_value_items",
        "num_form_items",
        "total_items",
    )
    for counter in zero_counters:
        assert getattr(profile, counter) == 0, counter
    assert profile.avg_items_per_page == 0.0
    assert profile.origin_mimetype is None
|
||||
|
||||
|
||||
def test_profile_simple_document():
    """Profile a two-page document holding three texts, one table and one picture."""
    pdf_origin = DocumentOrigin(
        mimetype="application/pdf",
        binary_hash=12345,
        filename="test.pdf",
    )
    document = DoclingDocument(name="Simple Document", origin=pdf_origin)

    # Two pages
    for page_no in (1, 2):
        document.pages[page_no] = PageItem(page_no=page_no, size=Size(width=612, height=792))

    # Three text items, one of them a section header
    text_items = (
        (DocItemLabel.TEXT, "Text 1"),
        (DocItemLabel.TEXT, "Text 2"),
        (DocItemLabel.SECTION_HEADER, "Section"),
    )
    for label, content in text_items:
        document.add_text(label=label, text=content, orig=content)

    # One table and one picture
    document.add_table(data=TableData(num_rows=2, num_cols=2))
    document.add_picture()

    profile = DocumentProfiler.profile_document(document)

    assert profile.name == "Simple Document"
    assert profile.num_pages == 2
    assert profile.num_tables == 1
    assert profile.num_pictures == 1
    assert profile.num_texts == 3
    assert profile.num_section_headers == 1
    # 3 texts + 1 table + 1 picture, spread over 2 pages
    assert profile.total_items == 5
    assert profile.avg_items_per_page == 2.5
    assert profile.origin_mimetype == "application/pdf"
|
||||
|
||||
|
||||
def test_profile_document_with_pictures_for_ocr():
    """Check the OCR-candidate picture count for the default and two custom coverage thresholds."""
    document = DoclingDocument(name="Document with Pictures for OCR")

    # A single 1000 x 1000 page, i.e. a page area of 1,000,000
    document.pages[1] = PageItem(page_no=1, size=Size(width=1000, height=1000))

    # Square pictures anchored at the origin; side**2 / 1,000,000 is the covered share of the page:
    #   316.2   -> ~9.998%   (large, just under 10%)
    #   141.4   -> ~1.999%   (small, just under 2%)
    #   223.607 -> ~5.00001% (medium, just over the default 5% threshold)
    for side in (316.2, 141.4, 223.607):
        document.add_picture(
            prov=ProvenanceItem(
                page_no=1,
                bbox=BoundingBox(l=0, t=0, r=side, b=side),
                charspan=(0, 0),
            )
        )

    default_profile = DocumentProfiler.profile_document(document)

    assert default_profile.num_pictures == 3
    # Default threshold: the large and the medium picture are expected to count
    assert default_profile.num_pictures_for_ocr == 2

    # 10% threshold: even the large picture (~9.998%) stays below it
    strict_profile = DocumentProfiler.profile_document(document, bitmap_coverage_threshold=0.10)
    assert strict_profile.num_pictures_for_ocr == 0

    # 2% threshold: large and medium count, the small picture (~1.999%) does not
    loose_profile = DocumentProfiler.profile_document(document, bitmap_coverage_threshold=0.02)
    assert loose_profile.num_pictures_for_ocr == 2
|
||||
|
||||
|
||||
def test_profile_collection_empty():
    """Profiling an empty list yields zeroed totals, zero deciles and an empty histogram."""
    summary = DocumentProfiler.profile_collection([])

    # Totals
    assert summary.num_documents == 0
    assert summary.total_pages == 0
    assert summary.total_tables == 0
    assert summary.total_pictures == 0

    # Averages
    assert summary.avg_items_per_document == 0.0
    assert summary.avg_items_per_page == 0.0

    # Nine deciles, all zero
    no_deciles = [0.0] * 9
    assert summary.deciles_pages == no_deciles
    assert summary.deciles_tables == no_deciles

    # Histogram without bins
    pages_histogram = summary.histogram_pages
    assert pages_histogram.bins == []
    assert pages_histogram.frequencies == []
    assert pages_histogram.bin_width == 0.0
|
||||
|
||||
|
||||
def test_profile_collection_single_document():
    """Profile a collection made of one document, passed directly instead of inside a list."""
    only_doc = DoclingDocument(name="Single Doc")
    only_doc.pages[1] = PageItem(page_no=1, size=Size(width=612, height=792))
    only_doc.add_text(label=DocItemLabel.PARAGRAPH, text="Text", orig="Text")
    only_doc.add_table(data=TableData(num_rows=1, num_cols=1))
    only_doc.add_picture()

    summary = DocumentProfiler.profile_collection(only_doc)

    # Totals: one of everything
    assert summary.num_documents == 1
    assert summary.total_pages == 1
    assert summary.total_tables == 1
    assert summary.total_pictures == 1
    assert summary.total_texts == 1

    # Page statistics collapse to a single value
    assert summary.min_pages == 1
    assert summary.max_pages == 1
    assert summary.deciles_pages[4] == 1.0  # index 4 is d5, the median
    assert summary.mean_pages == 1.0
    assert summary.std_pages == 0.0

    # The pages histogram must be populated
    pages_histogram = summary.histogram_pages
    assert len(pages_histogram.bins) > 0
    assert len(pages_histogram.frequencies) > 0
|
||||
|
||||
|
||||
def test_profile_collection_multiple_documents():
    """Profile three documents of different sizes and check the aggregated statistics."""

    def build_doc(name, mimetype, binary_hash, filename, n_pages, n_tables, n_pictures, texts):
        """Create a document with the given numbers of pages, tables, pictures and the given texts."""
        document = DoclingDocument(
            name=name,
            origin=DocumentOrigin(mimetype=mimetype, binary_hash=binary_hash, filename=filename),
        )
        for page_no in range(1, n_pages + 1):
            document.pages[page_no] = PageItem(page_no=page_no, size=Size(width=612, height=792))
        for _ in range(n_tables):
            document.add_table(data=TableData(num_rows=1, num_cols=1))
        for _ in range(n_pictures):
            document.add_picture()
        for content in texts:
            document.add_text(label=DocItemLabel.TEXT, text=content, orig=content)
        return document

    collection = [
        # Document 1: 2 pages, 1 table, 2 pictures, 2 texts
        build_doc("Doc1", "application/pdf", 1, "doc1.pdf", 2, 1, 2, ["Text 1", "Text 2"]),
        # Document 2: 5 pages, 3 tables, 1 picture, 10 texts
        build_doc("Doc2", "application/pdf", 2, "doc2.pdf", 5, 3, 1, [f"Text {i}" for i in range(10)]),
        # Document 3: 1 page, 0 tables, 5 pictures, 2 texts
        build_doc("Doc3", "text/html", 3, "doc3.html", 1, 0, 5, ["T1", "T2"]),
    ]

    summary = DocumentProfiler.profile_collection(collection, include_individual_stats=True)

    # Basic counts
    assert summary.num_documents == 3
    assert summary.total_pages == 8  # 2 + 5 + 1
    assert summary.total_tables == 4  # 1 + 3 + 0
    assert summary.total_pictures == 8  # 2 + 1 + 5
    assert summary.total_texts == 14  # 2 + 10 + 2

    # Per metric: (minimum, maximum, median = d5 at index 4, mean)
    expected = {
        "pages": (1, 5, 2.0, 8 / 3),
        "tables": (0, 3, 1.0, 4 / 3),
        "pictures": (1, 5, 2.0, 8 / 3),
        "texts": (2, 10, 2.0, 14 / 3),
    }
    for metric, (lowest, highest, median, mean) in expected.items():
        assert getattr(summary, f"min_{metric}") == lowest, metric
        assert getattr(summary, f"max_{metric}") == highest, metric
        assert getattr(summary, f"deciles_{metric}")[4] == median, metric
        assert getattr(summary, f"mean_{metric}") == pytest.approx(mean), metric
        # A histogram must exist for every metric
        assert len(getattr(summary, f"histogram_{metric}").bins) > 0, metric

    # Additional page statistics
    assert summary.std_pages > 0
    page_deciles = summary.deciles_pages
    # Deciles are ordered [d1, d2, ..., d9]
    assert page_deciles[0] <= page_deciles[4] <= page_deciles[8]
    assert len(summary.histogram_pages.frequencies) > 0

    # Individual statistics were requested for all three documents
    assert len(summary.document_stats) == 3

    # MIME type distribution
    assert summary.mimetype_distribution["application/pdf"] == 2
    assert summary.mimetype_distribution["text/html"] == 1

    # Computed fields
    assert summary.total_items == 26  # 14 texts + 4 tables + 8 pictures
    assert summary.avg_items_per_document == pytest.approx(26 / 3)
    assert summary.avg_items_per_page == pytest.approx(26 / 8)
|
||||
|
||||
|
||||
def test_profile_collection_with_iterator():
    """Profile documents that are produced lazily by a generator."""

    def build_doc(index):
        """Create a one-page document holding a single text item."""
        document = DoclingDocument(name=f"Doc{index}")
        document.pages[1] = PageItem(page_no=1, size=Size(width=612, height=792))
        document.add_text(label=DocItemLabel.TEXT, text=f"Text {index}", orig=f"Text {index}")
        return document

    # Generator expression: documents are only created while being consumed
    lazy_docs = (build_doc(index) for index in range(3))

    summary = DocumentProfiler.profile_collection(lazy_docs)

    assert summary.num_documents == 3
    assert summary.total_pages == 3
    assert summary.total_texts == 3
|
||||
|
||||
|
||||
def test_profile_collection_without_individual_stats():
    """No per-document statistics are returned when include_individual_stats is False."""
    empty_docs = []
    for index in range(3):
        empty_docs.append(DoclingDocument(name=f"Doc{index}"))

    summary = DocumentProfiler.profile_collection(empty_docs, include_individual_stats=False)

    assert len(summary.document_stats) == 0
|
||||
|
||||
|
||||
def test_statistics_serialization():
    """Document and collection statistics round-trip through JSON."""
    document = DoclingDocument(name="Test Doc")
    document.pages[1] = PageItem(page_no=1, size=Size(width=612, height=792))
    document.add_text(label=DocItemLabel.TEXT, text="Text", orig="Text")

    # Statistics of the single document
    document_payload = json.loads(DocumentProfiler.profile_document(document).model_dump_json())
    assert document_payload["name"] == "Test Doc"
    assert document_payload["num_pages"] == 1
    assert document_payload["total_items"] == 1

    # Statistics of a collection holding only that document
    collection_payload = json.loads(DocumentProfiler.profile_collection([document]).model_dump_json())
    assert collection_payload["num_documents"] == 1
    assert collection_payload["total_pages"] == 1
|
||||
|
||||
|
||||
def test_profile_real_document():
    """Profile a converted document loaded from the test data directory.

    The JSON path is relative to the current working directory; the test is
    skipped when the file cannot be found there.
    """
    source_path = Path("./test/data/doc/2408.09869v3_enriched.json")
    if not source_path.exists():
        pytest.skip("Test file not found")

    document = DoclingDocument.load_from_json(source_path)
    profile = DocumentProfiler.profile_document(document)

    # The profile must mirror the sizes of the document's own containers.
    assert profile.name == document.name
    count_pairs = [
        (profile.num_pages, len(document.pages)),
        (profile.num_tables, len(document.tables)),
        (profile.num_pictures, len(document.pictures)),
        (profile.num_texts, len(document.texts)),
    ]
    for actual, expected in count_pairs:
        assert actual == expected
    assert profile.total_items > 0
|
||||
|
||||
|
||||
def test_label_specific_counts():
    """Check the per-label counters against a document with known labels.

    The document holds one section header, three list items, one code item,
    one formula and one plain text item, i.e. seven text items in total.
    """
    document = DoclingDocument(name="Label Test")

    # (label, content) pairs, added in this exact order.
    labelled_contents = [
        (DocItemLabel.SECTION_HEADER, "Section"),
        (DocItemLabel.LIST_ITEM, "Item 1"),
        (DocItemLabel.LIST_ITEM, "Item 2"),
        (DocItemLabel.LIST_ITEM, "Item 3"),
        (DocItemLabel.CODE, "code"),
        (DocItemLabel.FORMULA, "x=y"),
        (DocItemLabel.TEXT, "Text"),
    ]
    for label, content in labelled_contents:
        document.add_text(label=label, text=content, orig=content)

    profile = DocumentProfiler.profile_document(document)

    assert profile.num_section_headers == 1
    assert profile.num_list_items == 3
    assert profile.num_code_items == 1
    assert profile.num_formulas == 1
    assert profile.num_texts == 7
|
||||
|
||||
|
||||
def test_profile_sample_document(sample_doc):
    """Profile the ``sample_doc`` fixture and cross-check the reported counts.

    Args:
        sample_doc: Document fixture; this test expects it to be named
            "Untitled 1" and to contain no pages.
    """
    profile = DocumentProfiler.profile_document(sample_doc)

    # Basic properties: the fixture is expected to have no pages.
    assert profile.name == "Untitled 1"
    assert profile.num_pages == 0

    # Each container size must be mirrored by its counter.
    container_counts = [
        (profile.num_tables, sample_doc.tables),
        (profile.num_pictures, sample_doc.pictures),
        (profile.num_texts, sample_doc.texts),
        (profile.num_key_value_items, sample_doc.key_value_items),
        (profile.num_form_items, sample_doc.form_items),
    ]
    for reported, container in container_counts:
        assert reported == len(container)

    # The fixture is expected to hold at least one item of each of these labels.
    assert profile.num_section_headers > 0
    assert profile.num_list_items > 0
    assert profile.num_code_items > 0
    assert profile.num_formulas > 0

    # The total is the sum of the five item categories.
    item_sum = (
        profile.num_texts
        + profile.num_tables
        + profile.num_pictures
        + profile.num_key_value_items
        + profile.num_form_items
    )
    assert profile.total_items > 0
    assert profile.total_items == item_sum

    # With zero pages the per-page average is expected to be 0.
    assert profile.avg_items_per_page == 0.0
|
||||
|
||||
|
||||
def test_calculate_deciles_empty():
    """Check that ``_calculate_deciles`` yields nine zeros for empty input."""
    expected_deciles = [0.0 for _ in range(9)]

    assert DocumentProfiler._calculate_deciles([]) == expected_deciles
|
||||
|
||||
|
||||
def test_calculate_histogram_empty():
    """Check that ``_calculate_histogram`` returns an empty histogram for no data.

    The result must have no bins, no frequencies and a bin width of 0.0.
    """
    histogram = DocumentProfiler._calculate_histogram([])

    observed = (histogram.bins, histogram.frequencies, histogram.bin_width)
    assert observed == ([], [], 0.0)
|
||||
Reference in New Issue
Block a user