feat: profile a document or collection (#511)

* feat: profile a document or collection Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * chore(profiler): add deciles and histograms Add deciles and histograms to the Docling collection statistics. Add an example script to plot histograms. Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * chore(profiler): add option to plot log frequencies in histogram Add the option to plot the histogram frequencies in logarithmic scale. Extend README with documentation on the document profiler. Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * test(profiler): cover missing lines in doc_profiler with tests Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> --------- Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
2026-05-17 13:10:44 +00:00 · 2026-03-13 13:36:38 +01:00
parent b435090fdf
commit af50f1cb07
7 changed files with 1680 additions and 0 deletions
@@ -72,6 +72,17 @@ different use cases.
 - [Hybrid chunking example](https://docling-project.github.io/docling/examples/hybrid_chunking/)
 - [Advanced chunking and serialization](https://docling-project.github.io/docling/examples/advanced_chunking_and_serialization/)

+### Profiling
+
+The Profiling API enables extraction of comprehensive statistics from DoclingDocument objects,
+both for individual documents and collections. It provides metrics on document structure
+(pages, tables, pictures, text items) along with statistical distributions (deciles, histograms)
+and visualization capabilities for analyzing document collections at scale.
+
+👉 More details:
+- [Document profiling example](./examples/document_profiling.py)
+- [Collection statistics visualization](./examples/visualize_collection_stats.py)
+
 ## Contributing

 Please read [Contributing to Docling Core](./CONTRIBUTING.md) for details.
@@ -0,0 +1,17 @@
+"""Document profiling and statistics module."""
+
+from docling_core.transforms.profiler.doc_profiler import (
+    CollectionStats,
+    DecilesT,
+    DocumentProfiler,
+    DocumentStats,
+    Histogram,
+)
+
+# Public API of this package: the names imported above from ``doc_profiler``.
+__all__ = [
+    "CollectionStats",
+    "DecilesT",
+    "DocumentProfiler",
+    "DocumentStats",
+    "Histogram",
+]
@@ -0,0 +1,425 @@
+"""Document profiler for extracting statistics from DoclingDocument objects."""
+
+import statistics
+from collections.abc import Iterable
+from typing import Annotated
+
+import numpy as np
+from annotated_types import Len
+from pydantic import BaseModel, Field, computed_field
+from typing_extensions import TypeAliasType
+
+from docling_core.types.doc import DoclingDocument
+from docling_core.types.doc.labels import DocItemLabel
+
+# The ``Len`` metadata sets both the minimum and the maximum length to 9,
+# i.e. the list is declared to hold exactly nine values.
+DecilesT = TypeAliasType("DecilesT", Annotated[list[float], Len(max_length=9, min_length=9)])
+"""Type alias for deciles: list of 9 floats representing 1st through 9th deciles (10th, 20th, ..., 90th percentiles)."""
+
+
+class Histogram(BaseModel):
+    """Histogram representation with bins and frequencies.
+
+    All fields have defaults, so ``Histogram()`` yields an empty histogram
+    (no bins, no frequencies, zero bin width).
+    """
+
+    # Bin edges. When filled by ``DocumentProfiler._calculate_histogram`` these are
+    # the edges returned by ``numpy.histogram``.
+    bins: Annotated[list[float], Field(description="Histogram bin edges")] = []
+    # Number of observations counted in each bin.
+    frequencies: Annotated[list[int], Field(description="Frequency count for each bin")] = []
+    # Width of a bin; 0.0 for the empty histogram.
+    bin_width: Annotated[float, Field(description="Width of each bin")] = 0.0
+
+
+class DocumentStats(BaseModel):
+    """Statistics for a single DoclingDocument.
+
+    Only ``name`` is required; every count defaults to 0 and ``origin_mimetype``
+    defaults to ``None``. ``total_items`` and ``avg_items_per_page`` are declared
+    as computed fields and are derived from the stored counts.
+    """
+
+    # Top-level item counts.
+    name: Annotated[str, Field(description="Document name")]
+    num_pages: Annotated[int, Field(description="Number of pages in the document")] = 0
+    num_tables: Annotated[int, Field(description="Number of tables in the document")] = 0
+    num_pictures: Annotated[int, Field(description="Number of pictures in the document")] = 0
+    num_texts: Annotated[int, Field(description="Number of text items in the document")] = 0
+    num_key_value_items: Annotated[int, Field(description="Number of key-value items in the document")] = 0
+    num_form_items: Annotated[int, Field(description="Number of form items in the document")] = 0
+
+    # Label-specific counts. ``DocumentProfiler.profile_document`` derives them from the
+    # labels of the document's text items; they are not added again in ``total_items``.
+    num_section_headers: Annotated[int, Field(description="Number of section headers")] = 0
+    num_list_items: Annotated[int, Field(description="Number of list items")] = 0
+    num_code_items: Annotated[int, Field(description="Number of code items")] = 0
+    num_formulas: Annotated[int, Field(description="Number of formula items")] = 0
+
+    # Document characteristics
+    origin_mimetype: Annotated[str | None, Field(description="Origin MIME type if available")] = None
+    num_pictures_for_ocr: Annotated[
+        int,
+        Field(description="Number of pictures that would trigger OCR based on area coverage threshold"),
+    ] = 0
+
+    @computed_field  # type: ignore[prop-decorator]
+    @property
+    def total_items(self) -> int:
+        """Total number of items in the document.
+
+        Sum of text, table, picture, key-value and form item counts.
+        """
+        return self.num_texts + self.num_tables + self.num_pictures + self.num_key_value_items + self.num_form_items
+
+    @computed_field  # type: ignore[prop-decorator]
+    @property
+    def avg_items_per_page(self) -> float:
+        """Average number of items per page (0.0 when the document has no pages)."""
+        # Guard against division by zero for page-less documents.
+        if self.num_pages == 0:
+            return 0.0
+        return self.total_items / self.num_pages
+
+
+class CollectionStats(BaseModel):
+    """Statistics for a collection of DoclingDocument objects.
+
+    Pages, tables, pictures, text items and pictures-for-OCR each carry a full
+    per-document summary (total, min, max, deciles, histogram, mean, standard
+    deviation). The remaining item kinds are reported as totals only. Every field
+    has a default, so ``CollectionStats()`` describes an empty collection.
+    """
+
+    num_documents: Annotated[int, Field(description="Total number of documents in the collection")] = 0
+
+    # Pages per document
+    total_pages: Annotated[int, Field(description="Total number of pages across all documents")] = 0
+    min_pages: Annotated[int, Field(description="Minimum number of pages in a document")] = 0
+    max_pages: Annotated[int, Field(description="Maximum number of pages in a document")] = 0
+    deciles_pages: Annotated[DecilesT, Field(description="Deciles of pages per document")] = [0.0] * 9
+    histogram_pages: Annotated[Histogram, Field(description="Histogram of pages per document")] = Histogram()
+    mean_pages: Annotated[float, Field(description="Mean number of pages per document")] = 0.0
+    std_pages: Annotated[float, Field(description="Standard deviation of pages per document")] = 0.0
+
+    # Tables per document
+    total_tables: Annotated[int, Field(description="Total number of tables across all documents")] = 0
+    min_tables: Annotated[int, Field(description="Minimum number of tables in a document")] = 0
+    max_tables: Annotated[int, Field(description="Maximum number of tables in a document")] = 0
+    deciles_tables: Annotated[DecilesT, Field(description="Deciles of tables per document")] = [0.0] * 9
+    histogram_tables: Annotated[Histogram, Field(description="Histogram of tables per document")] = Histogram()
+    mean_tables: Annotated[float, Field(description="Mean number of tables per document")] = 0.0
+    std_tables: Annotated[float, Field(description="Standard deviation of tables per document")] = 0.0
+
+    # Pictures per document
+    total_pictures: Annotated[int, Field(description="Total number of pictures across all documents")] = 0
+    min_pictures: Annotated[int, Field(description="Minimum number of pictures in a document")] = 0
+    max_pictures: Annotated[int, Field(description="Maximum number of pictures in a document")] = 0
+    deciles_pictures: Annotated[DecilesT, Field(description="Deciles of pictures per document")] = [0.0] * 9
+    histogram_pictures: Annotated[Histogram, Field(description="Histogram of pictures per document")] = Histogram()
+    mean_pictures: Annotated[float, Field(description="Mean number of pictures per document")] = 0.0
+    std_pictures: Annotated[float, Field(description="Standard deviation of pictures per document")] = 0.0
+
+    # Text items per document
+    total_texts: Annotated[int, Field(description="Total number of text items across all documents")] = 0
+    min_texts: Annotated[int, Field(description="Minimum number of text items in a document")] = 0
+    max_texts: Annotated[int, Field(description="Maximum number of text items in a document")] = 0
+    deciles_texts: Annotated[DecilesT, Field(description="Deciles of text items per document")] = [0.0] * 9
+    histogram_texts: Annotated[Histogram, Field(description="Histogram of text items per document")] = Histogram()
+    mean_texts: Annotated[float, Field(description="Mean number of text items per document")] = 0.0
+    std_texts: Annotated[float, Field(description="Standard deviation of text items per document")] = 0.0
+
+    # Item kinds that are reported as collection-wide totals only
+    total_key_value_items: Annotated[int, Field(description="Total number of key-value items")] = 0
+    total_form_items: Annotated[int, Field(description="Total number of form items")] = 0
+    total_section_headers: Annotated[int, Field(description="Total number of section headers")] = 0
+    total_list_items: Annotated[int, Field(description="Total number of list items")] = 0
+    total_code_items: Annotated[int, Field(description="Total number of code items")] = 0
+    total_formulas: Annotated[int, Field(description="Total number of formula items")] = 0
+
+    # Document characteristics
+    # Pictures per document that meet the OCR area-coverage threshold
+    total_pictures_for_ocr: Annotated[
+        int, Field(description="Total number of pictures requiring OCR across all documents")
+    ] = 0
+    min_pictures_for_ocr: Annotated[
+        int, Field(description="Minimum number of pictures requiring OCR in a document")
+    ] = 0
+    max_pictures_for_ocr: Annotated[
+        int, Field(description="Maximum number of pictures requiring OCR in a document")
+    ] = 0
+    deciles_pictures_for_ocr: Annotated[
+        DecilesT, Field(description="Deciles of pictures requiring OCR per document")
+    ] = [0.0] * 9
+    histogram_pictures_for_ocr: Annotated[
+        Histogram, Field(description="Histogram of pictures requiring OCR per document")
+    ] = Histogram()
+    mean_pictures_for_ocr: Annotated[float, Field(description="Mean number of pictures requiring OCR per document")] = (
+        0.0
+    )
+    std_pictures_for_ocr: Annotated[
+        float, Field(description="Standard deviation of pictures requiring OCR per document")
+    ] = 0.0
+
+    # MIME type distribution: maps a MIME type to the number of documents that report it.
+    mimetype_distribution: Annotated[
+        dict[str, int], Field(description="Distribution of MIME types in the collection")
+    ] = {}
+
+    # Per-document statistics (optional, for detailed analysis); empty unless the
+    # producer was asked to keep them.
+    document_stats: Annotated[list[DocumentStats], Field(description="Individual statistics for each document")] = []
+
+    @computed_field  # type: ignore[prop-decorator]
+    @property
+    def total_items(self) -> int:
+        """Total number of items across all documents.
+
+        Sum of the text, table, picture, key-value and form item totals.
+        """
+        return (
+            self.total_texts
+            + self.total_tables
+            + self.total_pictures
+            + self.total_key_value_items
+            + self.total_form_items
+        )
+
+    @computed_field  # type: ignore[prop-decorator]
+    @property
+    def avg_items_per_document(self) -> float:
+        """Average number of items per document (0.0 for an empty collection)."""
+        # Guard against division by zero for an empty collection.
+        if self.num_documents == 0:
+            return 0.0
+        return self.total_items / self.num_documents
+
+    @computed_field  # type: ignore[prop-decorator]
+    @property
+    def avg_items_per_page(self) -> float:
+        """Average number of items per page across all documents (0.0 when there are no pages)."""
+        # Guard against division by zero when no document has pages.
+        if self.total_pages == 0:
+            return 0.0
+        return self.total_items / self.total_pages
+
+
+class DocumentProfiler:
+    """Profiler for extracting statistics from DoclingDocument objects."""
+
+    @staticmethod
+    def _calculate_deciles(data: list[int]) -> list[float]:
+        """Calculate deciles (1st through 9th) for a list of values.
+
+        Args:
+            data: List of integer values
+
+        Returns:
+            List of 9 floats representing [d1, d2, d3, d4, d5, d6, d7, d8, d9]
+            (10th, 20th, 30th, 40th, 50th, 60th, 70th, 80th, 90th percentiles)
+        """
+        if not data:
+            return [0.0] * 9
+
+        decile_values = np.percentile(data, [10, 20, 30, 40, 50, 60, 70, 80, 90])
+        return [float(val) for val in decile_values]
+
+    @staticmethod
+    def _calculate_histogram(data: list[int], num_bins: int = 10) -> Histogram:
+        """Calculate histogram for a list of values.
+
+        Args:
+            data: List of integer values
+            num_bins: Number of bins for the histogram (default: 10)
+
+        Returns:
+            Histogram object with bins and frequencies
+        """
+        if not data:
+            return Histogram()
+
+        # Use numpy to calculate histogram
+        frequencies, bin_edges = np.histogram(data, bins=num_bins)
+
+        # Calculate bin width
+        bin_width = float(bin_edges[1] - bin_edges[0]) if len(bin_edges) > 1 else 0.0
+
+        return Histogram(
+            bins=[float(edge) for edge in bin_edges],
+            frequencies=[int(freq) for freq in frequencies],
+            bin_width=bin_width,
+        )
+
+    @staticmethod
+    def profile_document(doc: DoclingDocument, bitmap_coverage_threshold: float = 0.05) -> DocumentStats:
+        """Extract statistics from a single DoclingDocument.
+
+        Args:
+            doc: The DoclingDocument to profile
+            bitmap_coverage_threshold: Threshold for picture area coverage (0-1) to trigger OCR.
+                Pictures with area coverage above this threshold are counted as requiring OCR.
+                Default is 0.05 (5% of page area).
+
+        Returns:
+            DocumentStatistics containing the extracted metrics
+        """
+        # Count items by label
+        label_counts = {
+            DocItemLabel.SECTION_HEADER: 0,
+            DocItemLabel.LIST_ITEM: 0,
+            DocItemLabel.CODE: 0,
+            DocItemLabel.FORMULA: 0,
+        }
+
+        for text_item in doc.texts:
+            if text_item.label in label_counts:
+                label_counts[text_item.label] += 1
+
+        # Calculate percentage of pictures that would trigger OCR based on area coverage
+        num_pictures_for_ocr = 0
+        for picture in doc.pictures:
+            # Get picture's bounding box area from provenance
+            if picture.prov and len(picture.prov) > 0:
+                prov = picture.prov[0]  # Use first provenance item
+                bbox = prov.bbox
+                picture_area = bbox.width * bbox.height
+
+                # Get page size
+                page_no = prov.page_no
+                if page_no in doc.pages:
+                    page = doc.pages[page_no]
+                    page_area = page.size.width * page.size.height
+
+                    # Calculate coverage ratio
+                    if page_area > 0:
+                        coverage_ratio = picture_area / page_area
+
+                        # Check if coverage exceeds threshold
+                        if coverage_ratio >= bitmap_coverage_threshold:
+                            num_pictures_for_ocr += 1
+
+        return DocumentStats(
+            name=doc.name,
+            num_pages=len(doc.pages),
+            num_tables=len(doc.tables),
+            num_pictures=len(doc.pictures),
+            num_texts=len(doc.texts),
+            num_key_value_items=len(doc.key_value_items),
+            num_form_items=len(doc.form_items),
+            num_section_headers=label_counts[DocItemLabel.SECTION_HEADER],
+            num_list_items=label_counts[DocItemLabel.LIST_ITEM],
+            num_code_items=label_counts[DocItemLabel.CODE],
+            num_formulas=label_counts[DocItemLabel.FORMULA],
+            origin_mimetype=doc.origin.mimetype if doc.origin else None,
+            num_pictures_for_ocr=num_pictures_for_ocr,
+        )
+
+    @staticmethod
+    def profile_collection(
+        documents: Iterable[DoclingDocument] | DoclingDocument,
+        include_individual_stats: bool = False,
+        bitmap_coverage_threshold: float = 0.05,
+        num_bins: int = 10,
+    ) -> CollectionStats:
+        """Extract statistics from a collection of DoclingDocument objects.
+
+        Args:
+            documents: An iterable of DoclingDocument objects, or a single document
+            include_individual_stats: Whether to include individual document statistics
+                in the result (useful for detailed analysis but increases memory usage)
+            bitmap_coverage_threshold: Threshold for picture area coverage (0-1) to
+                trigger OCR. Pictures with area coverage above this threshold are counted
+                as requiring OCR. Default is 0.05 (5% of page area).
+            num_bins: Number of bins for histograms. Default is 10.
+
+        Returns:
+            CollectionStatistics containing the aggregated metrics
+        """
+        # Handle single document case
+        if isinstance(documents, DoclingDocument):
+            documents = [documents]
+
+        # Collect statistics
+        doc_stats_list: list[DocumentStats] = []
+        pages_list: list[int] = []
+        tables_list: list[int] = []
+        pictures_list: list[int] = []
+        texts_list: list[int] = []
+        pictures_for_ocr_list: list[int] = []
+
+        total_pages = 0
+        total_tables = 0
+        total_pictures = 0
+        total_texts = 0
+        total_key_value_items = 0
+        total_form_items = 0
+        total_section_headers = 0
+        total_list_items = 0
+        total_code_items = 0
+        total_formulas = 0
+        total_pictures_for_ocr = 0
+
+        mimetype_distribution: dict[str, int] = {}
+
+        # Process each document
+        for doc in documents:
+            doc_stats = DocumentProfiler.profile_document(doc, bitmap_coverage_threshold=bitmap_coverage_threshold)
+
+            if include_individual_stats:
+                doc_stats_list.append(doc_stats)
+
+            # Collect values for statistics
+            pages_list.append(doc_stats.num_pages)
+            tables_list.append(doc_stats.num_tables)
+            pictures_list.append(doc_stats.num_pictures)
+            texts_list.append(doc_stats.num_texts)
+            pictures_for_ocr_list.append(doc_stats.num_pictures_for_ocr)
+
+            # Accumulate totals
+            total_pages += doc_stats.num_pages
+            total_tables += doc_stats.num_tables
+            total_pictures += doc_stats.num_pictures
+            total_texts += doc_stats.num_texts
+            total_key_value_items += doc_stats.num_key_value_items
+            total_form_items += doc_stats.num_form_items
+            total_section_headers += doc_stats.num_section_headers
+            total_list_items += doc_stats.num_list_items
+            total_code_items += doc_stats.num_code_items
+            total_formulas += doc_stats.num_formulas
+            total_pictures_for_ocr += doc_stats.num_pictures_for_ocr
+
+            # Track MIME types
+            if doc_stats.origin_mimetype:
+                mimetype_distribution[doc_stats.origin_mimetype] = (
+                    mimetype_distribution.get(doc_stats.origin_mimetype, 0) + 1
+                )
+
+        num_documents = len(pages_list)
+
+        # Handle edge case of empty collection
+        if num_documents == 0:
+            return CollectionStats()
+
+        # Calculate statistics
+        return CollectionStats(
+            num_documents=num_documents,
+            # Page statistics
+            total_pages=total_pages,
+            min_pages=min(pages_list),
+            max_pages=max(pages_list),
+            deciles_pages=DocumentProfiler._calculate_deciles(pages_list),
+            histogram_pages=DocumentProfiler._calculate_histogram(pages_list, num_bins=num_bins),
+            mean_pages=statistics.mean(pages_list),
+            std_pages=statistics.stdev(pages_list) if num_documents > 1 else 0.0,
+            # Table statistics
+            total_tables=total_tables,
+            min_tables=min(tables_list),
+            max_tables=max(tables_list),
+            deciles_tables=DocumentProfiler._calculate_deciles(tables_list),
+            histogram_tables=DocumentProfiler._calculate_histogram(tables_list, num_bins=num_bins),
+            mean_tables=statistics.mean(tables_list),
+            std_tables=statistics.stdev(tables_list) if num_documents > 1 else 0.0,
+            # Picture statistics
+            total_pictures=total_pictures,
+            min_pictures=min(pictures_list),
+            max_pictures=max(pictures_list),
+            deciles_pictures=DocumentProfiler._calculate_deciles(pictures_list),
+            histogram_pictures=DocumentProfiler._calculate_histogram(pictures_list, num_bins=num_bins),
+            mean_pictures=statistics.mean(pictures_list),
+            std_pictures=statistics.stdev(pictures_list) if num_documents > 1 else 0.0,
+            # Text statistics
+            total_texts=total_texts,
+            min_texts=min(texts_list),
+            max_texts=max(texts_list),
+            deciles_texts=DocumentProfiler._calculate_deciles(texts_list),
+            histogram_texts=DocumentProfiler._calculate_histogram(texts_list, num_bins=num_bins),
+            mean_texts=statistics.mean(texts_list),
+            std_texts=statistics.stdev(texts_list) if num_documents > 1 else 0.0,
+            # Additional totals
+            total_key_value_items=total_key_value_items,
+            total_form_items=total_form_items,
+            total_section_headers=total_section_headers,
+            total_list_items=total_list_items,
+            total_code_items=total_code_items,
+            total_formulas=total_formulas,
+            # Document characteristics
+            # Pictures for OCR statistics
+            total_pictures_for_ocr=total_pictures_for_ocr,
+            min_pictures_for_ocr=min(pictures_for_ocr_list),
+            max_pictures_for_ocr=max(pictures_for_ocr_list),
+            deciles_pictures_for_ocr=DocumentProfiler._calculate_deciles(pictures_for_ocr_list),
+            histogram_pictures_for_ocr=DocumentProfiler._calculate_histogram(pictures_for_ocr_list, num_bins=num_bins),
+            mean_pictures_for_ocr=statistics.mean(pictures_for_ocr_list),
+            std_pictures_for_ocr=(statistics.stdev(pictures_for_ocr_list) if num_documents > 1 else 0.0),
+            mimetype_distribution=mimetype_distribution,
+            document_stats=doc_stats_list if include_individual_stats else [],
+        )
@@ -0,0 +1,250 @@
+"""Example usage of the document profiler for extracting statistics."""
+
+import time
+from pathlib import Path
+
+from docling_core.transforms.profiler import DocumentProfiler
+from docling_core.types.doc import DoclingDocument
+
+
+def profile_single_document():
+    """Example: Profile a single document.
+
+    Loads ``./examples/2408.09869v3.json`` (relative to the current working
+    directory), profiles it and prints the resulting statistics. Prints a
+    message and returns early when the file does not exist.
+    """
+    print("=" * 80)
+    print("Example 1: Profiling a Single Document")
+    print("=" * 80)
+
+    # Load a document
+    doc_path = Path("./examples/2408.09869v3.json")
+    if not doc_path.exists():
+        print(f"Document not found: {doc_path}")
+        return
+
+    doc = DoclingDocument.load_from_json(doc_path)
+
+    # Profile the document (default OCR coverage threshold)
+    stats = DocumentProfiler.profile_document(doc)
+
+    # Print statistics
+    print(f"\nDocument: {stats.name}")
+    print(f"Pages: {stats.num_pages}")
+    print(f"Tables: {stats.num_tables}")
+    print(f"Pictures: {stats.num_pictures}")
+    print(f"Text items: {stats.num_texts}")
+    print(f"  - Section headers: {stats.num_section_headers}")
+    print(f"  - List items: {stats.num_list_items}")
+    print(f"  - Code blocks: {stats.num_code_items}")
+    print(f"  - Formulas: {stats.num_formulas}")
+    print(f"\nTotal items: {stats.total_items}")
+    print(f"Average items per page: {stats.avg_items_per_page:.2f}")
+    print(f"\nOrigin MIME type: {stats.origin_mimetype}")
+    print(f"Pictures requiring OCR: {stats.num_pictures_for_ocr}")
+
+    # Export to JSON; only the first 500 characters are printed to keep the output short
+    json_output = stats.model_dump_json(indent=2)
+    print(f"\nJSON export (first 500 chars):\n{json_output[:500]}...")
+
+
+def profile_document_collection():
+    """Example: Profile a collection of documents.
+
+    Loads every ``*.json`` file under ``./test/data/doc`` (relative to the current
+    working directory), skipping files that fail to load, profiles the loaded
+    documents as one collection and prints the aggregated statistics. Returns
+    early when the directory is missing or no document could be loaded.
+    """
+    print("\n" + "=" * 80)
+    print("Example 2: Profiling a Document Collection")
+    print("=" * 80)
+
+    # Load multiple documents
+    doc_dir = Path("./test/data/doc")
+    if not doc_dir.exists():
+        print(f"Directory not found: {doc_dir}")
+        return
+
+    # Load all JSON documents; a file that cannot be loaded is reported and skipped
+    docs = []
+    for json_file in doc_dir.glob("*.json"):
+        try:
+            doc = DoclingDocument.load_from_json(json_file)
+            docs.append(doc)
+        except Exception as e:
+            print(f"Skipping {json_file.name}: {e}")
+
+    if not docs:
+        print("No documents found")
+        return
+
+    print(f"\nLoaded {len(docs)} documents")
+
+    # Profile the collection, keeping the per-document stats for the listing at the end
+    stats = DocumentProfiler.profile_collection(docs, include_individual_stats=True)
+
+    # Print collection statistics. In each deciles list, index 0 is d1, index 4 is d5
+    # (the median) and index 8 is d9.
+    print("\nCollection Statistics:")
+    print(f"Number of documents: {stats.num_documents}")
+    print("\nPages:")
+    print(f"  Total: {stats.total_pages}")
+    print(f"  Min: {stats.min_pages}, Max: {stats.max_pages}")
+    print(f"  Median (d5): {stats.deciles_pages[4]:.1f}, Mean: {stats.mean_pages:.2f}")
+    print(f"  Deciles: d1={stats.deciles_pages[0]:.1f}, d5={stats.deciles_pages[4]:.1f}, d9={stats.deciles_pages[8]:.1f}")
+    print(f"  Std Dev: {stats.std_pages:.2f}")
+    print(f"  Histogram bins: {len(stats.histogram_pages.bins)}, bin width: {stats.histogram_pages.bin_width:.2f}")
+
+    print("\nTables:")
+    print(f"  Total: {stats.total_tables}")
+    print(f"  Min: {stats.min_tables}, Max: {stats.max_tables}")
+    print(f"  Median (d5): {stats.deciles_tables[4]:.1f}, Mean: {stats.mean_tables:.2f}")
+    print(f"  Deciles: d1={stats.deciles_tables[0]:.1f}, d5={stats.deciles_tables[4]:.1f}, d9={stats.deciles_tables[8]:.1f}")
+    print(f"  Std Dev: {stats.std_tables:.2f}")
+
+    print("\nPictures:")
+    print(f"  Total: {stats.total_pictures}")
+    print(f"  Min: {stats.min_pictures}, Max: {stats.max_pictures}")
+    print(f"  Median (d5): {stats.deciles_pictures[4]:.1f}, Mean: {stats.mean_pictures:.2f}")
+    print(f"  Deciles: d1={stats.deciles_pictures[0]:.1f}, d5={stats.deciles_pictures[4]:.1f}, d9={stats.deciles_pictures[8]:.1f}")
+    print(f"  Std Dev: {stats.std_pictures:.2f}")
+
+    print("\nText Items:")
+    print(f"  Total: {stats.total_texts}")
+    print(f"  Min: {stats.min_texts}, Max: {stats.max_texts}")
+    print(f"  Median (d5): {stats.deciles_texts[4]:.1f}, Mean: {stats.mean_texts:.2f}")
+    print(f"  Deciles: d1={stats.deciles_texts[0]:.1f}, d5={stats.deciles_texts[4]:.1f}, d9={stats.deciles_texts[8]:.1f}")
+    print(f"  Std Dev: {stats.std_texts:.2f}")
+
+    print("\nPictures Requiring OCR:")
+    print(f"  Total: {stats.total_pictures_for_ocr}")
+    print(f"  Min: {stats.min_pictures_for_ocr}, Max: {stats.max_pictures_for_ocr}")
+    print(f"  Median (d5): {stats.deciles_pictures_for_ocr[4]:.1f}, Mean: {stats.mean_pictures_for_ocr:.2f}")
+    print(f"  Deciles: d1={stats.deciles_pictures_for_ocr[0]:.1f}, d5={stats.deciles_pictures_for_ocr[4]:.1f}, d9={stats.deciles_pictures_for_ocr[8]:.1f}")
+    print(f"  Std Dev: {stats.std_pictures_for_ocr:.2f}")
+
+    # MIME types are listed in alphabetical order
+    if stats.mimetype_distribution:
+        print("\nMIME Type Distribution:")
+        for mimetype, count in sorted(stats.mimetype_distribution.items()):
+            print(f"  {mimetype}: {count}")
+
+    print("\nComputed Metrics:")
+    print(f"  Total items: {stats.total_items}")
+    print(f"  Avg items per document: {stats.avg_items_per_document:.2f}")
+    print(f"  Avg items per page: {stats.avg_items_per_page:.2f}")
+
+    # Show individual document stats
+    if stats.document_stats:
+        print("\nIndividual Document Statistics:")
+        for i, doc_stat in enumerate(stats.document_stats[:3], 1):  # Show first 3
+            print(f"\n  Document {i}: {doc_stat.name}")
+            print(f"    Pages: {doc_stat.num_pages}, Tables: {doc_stat.num_tables}, "
+                  f"Pictures: {doc_stat.num_pictures}, Texts: {doc_stat.num_texts}")
+
+
+def profile_with_generator():
+    """Example: Profile documents using a generator (memory efficient)."""
+    print("\n" + "=" * 80)
+    print("Example 3: Profiling with Generator (Memory Efficient)")
+    print("=" * 80)
+
+    doc_dir = Path("./test/data/doc")
+    if not doc_dir.exists():
+        print(f"Directory not found: {doc_dir}")
+        return
+
+    def document_generator():
+        """Generator that yields documents one at a time."""
+        for json_file in doc_dir.glob("*.json"):
+            try:
+                doc = DoclingDocument.load_from_json(json_file)
+                yield doc
+            except Exception:
+                pass  # Skip invalid documents
+
+    # Profile using generator - documents are not all loaded into memory
+    start_time = time.time()
+    stats = DocumentProfiler.profile_collection(
+        document_generator(),
+        include_individual_stats=False  # Don't store individual stats to save memory
+    )
+    elapsed_time = time.time() - start_time
+
+    print(f"\nProcessed {stats.num_documents} documents in {elapsed_time:.2f} seconds")
+    print(f"Total pages: {stats.total_pages}")
+    print(f"Total tables: {stats.total_tables}")
+    print(f"Total pictures: {stats.total_pictures}")
+    print(f"Mean pages per document: {stats.mean_pages:.2f}")
+
+
+def export_statistics_report():
+    """Example: Export statistics to a JSON report."""
+    print("\n" + "=" * 80)
+    print("Example 4: Exporting Statistics Report")
+    print("=" * 80)
+
+    doc_dir = Path("./test/data/doc")
+    if not doc_dir.exists():
+        print(f"Directory not found: {doc_dir}")
+        return
+
+    # Load documents
+    docs = []
+    for json_file in doc_dir.glob("*.json"):
+        try:
+            docs.append(DoclingDocument.load_from_json(json_file))
+        except Exception:
+            pass
+
+    if not docs:
+        print("No documents found")
+        return
+
+    # Profile collection
+    stats = DocumentProfiler.profile_collection(docs, include_individual_stats=True)
+
+    # Export to JSON file
+    output_file = Path("./document_statistics_report.json")
+    with open(output_file, "w") as f:
+        f.write(stats.model_dump_json(indent=2))
+
+    print(f"\nStatistics report exported to: {output_file}")
+    print(f"File size: {output_file.stat().st_size} bytes")
+
+    # Also export as Python dict for further processing
+    stats_dict = stats.model_dump()
+    print(f"\nStatistics as dict (keys): {list(stats_dict.keys())[:10]}...")
+
+
+def analyze_document_characteristics():
+    """Example: Analyze specific document characteristics."""
+    print("\n" + "=" * 80)
+    print("Example 5: Analyzing Document Characteristics")
+    print("=" * 80)
+
+    doc_dir = Path("./test/data/doc")
+    if not doc_dir.exists():
+        print(f"Directory not found: {doc_dir}")
+        return
+
+    # Profile each document individually
+    ocr_candidate_docs = []
+
+    for json_file in doc_dir.glob("*.json"):
+        try:
+            doc = DoclingDocument.load_from_json(json_file)
+            stats = DocumentProfiler.profile_document(doc)
+
+            if stats.num_pictures_for_ocr > 0:
+                ocr_candidate_docs.append((stats.name, stats.num_pictures_for_ocr))
+        except Exception:
+            pass
+
+    print(f"\nDocuments with OCR requirements: {len(ocr_candidate_docs)}")
+    if ocr_candidate_docs:
+        for name, count in sorted(ocr_candidate_docs, key=lambda x: x[1], reverse=True)[:5]:
+            print(f"  - {name}: {count} pictures require OCR")
+
+
+if __name__ == "__main__":
+    # Run all examples
+    profile_single_document()
+    profile_document_collection()
+    profile_with_generator()
+    export_statistics_report()
+    analyze_document_characteristics()
+
+    print("\n" + "=" * 80)
+    print("Examples completed!")
+    print("=" * 80)
+
@@ -0,0 +1,319 @@
+"""Visualization utilities for collection statistics.
+
+This module provides utilities for creating charts from CollectionStats data.
+Requires matplotlib to be installed (available with 'examples' extra).
+
+Install with: pip install docling-core[examples]
+"""
+
+from pathlib import Path
+from typing import Literal
+
+try:
+    import matplotlib.figure
+    import matplotlib.pyplot as plt
+    MATPLOTLIB_AVAILABLE = True
+except ImportError:
+    MATPLOTLIB_AVAILABLE = False
+
+from docling_core.transforms.profiler.doc_profiler import CollectionStats, Histogram
+
+
+class StatsVisualizer:
+    """Visualizer for creating charts from CollectionStats data."""
+
+    @staticmethod
+    def _check_matplotlib() -> None:
+        """Check if matplotlib is available."""
+        if not MATPLOTLIB_AVAILABLE:
+            raise ImportError(
+                "matplotlib is required for visualization. "
+                "Install it with: pip install docling-core[examples]"
+            )
+
+    @staticmethod
+    def plot_histogram(
+        histogram: Histogram,
+        title: str = "Distribution",
+        xlabel: str = "Value",
+        ylabel: str = "Frequency",
+        color: str = "steelblue",
+        figsize: tuple[int, int] = (10, 6),
+        log_scale: bool = False,
+    ) -> "matplotlib.figure.Figure":
+        """Plot a histogram from Histogram data.
+
+        Args:
+            histogram: Histogram object containing bins and frequencies
+            title: Plot title
+            xlabel: X-axis label
+            ylabel: Y-axis label
+            color: Bar color
+            figsize: Figure size as (width, height)
+            log_scale: If True, use logarithmic scale for y-axis (frequency counts)
+
+        Returns:
+            matplotlib Figure object
+
+        Raises:
+            ImportError: If matplotlib is not installed
+        """
+        StatsVisualizer._check_matplotlib()
+
+        fig, ax = plt.subplots(figsize=figsize)
+
+        # Calculate bin centers for plotting
+        bins = histogram.bins
+        frequencies = histogram.frequencies
+
+        if len(bins) > 0 and len(frequencies) > 0:
+            # bins has n+1 edges, frequencies has n values
+            bin_centers = [(bins[i] + bins[i + 1]) / 2 for i in range(len(frequencies))]
+            bin_width = histogram.bin_width
+
+            ax.bar(bin_centers, frequencies, width=bin_width * 0.9, color=color, edgecolor="black", alpha=0.7)
+
+        ax.set_xlabel(xlabel, fontsize=12)
+        ax.set_ylabel(ylabel, fontsize=12)
+        ax.set_title(title, fontsize=14, fontweight="bold")
+        ax.grid(axis="y", alpha=0.3, linestyle="--")
+
+        if log_scale:
+            ax.set_yscale('log')
+            ax.set_ylabel(f"{ylabel} (log scale)", fontsize=12)
+
+        plt.tight_layout()
+        return fig
+
+    @staticmethod
+    def plot_deciles(
+        deciles: list[float],
+        title: str = "Decile Distribution",
+        ylabel: str = "Value",
+        color: str = "coral",
+        figsize: tuple[int, int] = (10, 6),
+    ) -> "matplotlib.figure.Figure":
+        """Plot deciles as a line chart.
+
+        Args:
+            deciles: List of 9 decile values [d1, d2, ..., d9] (10th, 20th, ..., 90th percentiles)
+            title: Plot title
+            ylabel: Y-axis label
+            color: Line color
+            figsize: Figure size as (width, height)
+
+        Returns:
+            matplotlib Figure object
+
+        Raises:
+            ImportError: If matplotlib is not installed
+        """
+        StatsVisualizer._check_matplotlib()
+
+        fig, ax = plt.subplots(figsize=figsize)
+
+        decile_labels = [1, 2, 3, 4, 5, 6, 7, 8, 9]
+        percentile_labels = [10, 20, 30, 40, 50, 60, 70, 80, 90]
+
+        ax.plot(decile_labels, deciles, marker="o", linewidth=2, markersize=8, color=color)
+        ax.fill_between(decile_labels, deciles, alpha=0.3, color=color)
+
+        # Highlight median (d5 = 50th percentile)
+        ax.axvline(x=5, color="red", linestyle="--", alpha=0.5, label="Median (d5)")
+
+        ax.set_xlabel("Decile", fontsize=12)
+        ax.set_ylabel(ylabel, fontsize=12)
+        ax.set_title(title, fontsize=14, fontweight="bold")
+        ax.set_xticks(decile_labels)
+        ax.set_xticklabels([f"d{d} (p{p})" for d, p in zip(decile_labels, percentile_labels)])
+        ax.grid(True, alpha=0.3, linestyle="--")
+        ax.legend()
+
+        plt.tight_layout()
+        return fig
+
+    @staticmethod
+    def plot_collection_overview(
+        stats: CollectionStats,
+        metrics: list[Literal["pages", "tables", "pictures", "texts"]] | None = None,
+        figsize: tuple[int, int] = (16, 10),
+        log_scale: bool = False,
+    ) -> "matplotlib.figure.Figure":
+        """Create a comprehensive overview plot with multiple histograms.
+
+        Args:
+            stats: CollectionStats object
+            metrics: List of metrics to plot. If None, plots all available metrics.
+            figsize: Figure size as (width, height)
+            log_scale: If True, use logarithmic scale for y-axis (frequency counts)
+
+        Returns:
+            matplotlib Figure object with subplots
+
+        Raises:
+            ImportError: If matplotlib is not installed
+        """
+        StatsVisualizer._check_matplotlib()
+
+        if metrics is None:
+            metrics = ["pages", "tables", "pictures", "texts"]
+
+        n_metrics = len(metrics)
+        n_cols = 2
+        n_rows = (n_metrics + 1) // 2
+
+        fig, axes = plt.subplots(n_rows, n_cols, figsize=figsize)
+        if n_rows == 1:
+            axes = axes.reshape(1, -1)
+
+        metric_config = {
+            "pages": {
+                "histogram": stats.histogram_pages,
+                "title": "Pages per Document",
+                "color": "steelblue",
+            },
+            "tables": {
+                "histogram": stats.histogram_tables,
+                "title": "Tables per Document",
+                "color": "forestgreen",
+            },
+            "pictures": {
+                "histogram": stats.histogram_pictures,
+                "title": "Pictures per Document",
+                "color": "coral",
+            },
+            "texts": {
+                "histogram": stats.histogram_texts,
+                "title": "Text Items per Document",
+                "color": "mediumpurple",
+            },
+        }
+
+        for idx, metric in enumerate(metrics):
+            row = idx // n_cols
+            col = idx % n_cols
+            ax = axes[row, col]
+
+            config = metric_config[metric]
+            histogram = config["histogram"]
+            bins = histogram.bins
+            frequencies = histogram.frequencies
+
+            if len(bins) > 0 and len(frequencies) > 0:
+                bin_centers = [(bins[i] + bins[i + 1]) / 2 for i in range(len(frequencies))]
+                bin_width = histogram.bin_width
+
+                ax.bar(
+                    bin_centers,
+                    frequencies,
+                    width=bin_width * 0.9,
+                    color=config["color"],
+                    edgecolor="black",
+                    alpha=0.7,
+                )
+
+            ax.set_xlabel("Count", fontsize=10)
+            ylabel = "Frequency (log scale)" if log_scale else "Frequency"
+            ax.set_ylabel(ylabel, fontsize=10)
+            ax.set_title(config["title"], fontsize=12, fontweight="bold")
+            ax.grid(axis="y", alpha=0.3, linestyle="--")
+
+            if log_scale:
+                ax.set_yscale("log")
+
+        # Hide unused subplots
+        for idx in range(n_metrics, n_rows * n_cols):
+            row = idx // n_cols
+            col = idx % n_cols
+            axes[row, col].axis("off")
+
+        fig.suptitle(
+            f"Collection Statistics Overview ({stats.num_documents} documents)",
+            fontsize=16,
+            fontweight="bold",
+        )
+        plt.tight_layout()
+        return fig
+
+    @staticmethod
+    def plot_deciles_comparison(
+        stats: CollectionStats,
+        metrics: list[Literal["pages", "tables", "pictures", "texts"]] | None = None,
+        figsize: tuple[int, int] = (12, 6),
+    ) -> "matplotlib.figure.Figure":
+        """Create a comparison plot of deciles for multiple metrics.
+
+        Args:
+            stats: CollectionStats object
+            metrics: List of metrics to plot. If None, plots all available metrics.
+            figsize: Figure size as (width, height)
+
+        Returns:
+            matplotlib Figure object
+
+        Raises:
+            ImportError: If matplotlib is not installed
+        """
+        StatsVisualizer._check_matplotlib()
+
+        if metrics is None:
+            metrics = ["pages", "tables", "pictures", "texts"]
+
+        fig, ax = plt.subplots(figsize=figsize)
+
+        decile_labels = [1, 2, 3, 4, 5, 6, 7, 8, 9]
+
+        metric_config = {
+            "pages": {"deciles": stats.deciles_pages, "label": "Pages", "color": "steelblue"},
+            "tables": {"deciles": stats.deciles_tables, "label": "Tables", "color": "forestgreen"},
+            "pictures": {"deciles": stats.deciles_pictures, "label": "Pictures", "color": "coral"},
+            "texts": {"deciles": stats.deciles_texts, "label": "Text Items", "color": "mediumpurple"},
+        }
+
+        for metric in metrics:
+            config = metric_config[metric]
+            ax.plot(
+                decile_labels,
+                config["deciles"],
+                marker="o",
+                linewidth=2,
+                markersize=6,
+                label=config["label"],
+                color=config["color"],
+            )
+
+        ax.axvline(x=5, color="red", linestyle="--", alpha=0.3, label="Median (d5)")
+
+        ax.set_xlabel("Decile", fontsize=12)
+        ax.set_ylabel("Count", fontsize=12)
+        ax.set_title("Decile Comparison Across Metrics", fontsize=14, fontweight="bold")
+        ax.set_xticks(decile_labels)
+        ax.set_xticklabels([f"d{d}" for d in decile_labels])
+        ax.grid(True, alpha=0.3, linestyle="--")
+        ax.legend(loc="best")
+
+        plt.tight_layout()
+        return fig
+
+    @staticmethod
+    def save_figure(fig: "matplotlib.figure.Figure", filepath: str | Path, dpi: int = 300) -> None:
+        """Save a matplotlib figure to file.
+
+        Args:
+            fig: matplotlib Figure object
+            filepath: Output file path (supports .png, .pdf, .svg, etc.)
+            dpi: Resolution in dots per inch
+        """
+        StatsVisualizer._check_matplotlib()
+        fig.savefig(filepath, dpi=dpi, bbox_inches="tight")
+
+    @staticmethod
+    def show_figure(fig: "matplotlib.figure.Figure") -> None:
+        """Display a matplotlib figure.
+
+        Args:
+            fig: matplotlib Figure object
+        """
+        StatsVisualizer._check_matplotlib()
+        plt.show()
+
@@ -0,0 +1,257 @@
+"""Example: Visualizing Collection Statistics with Charts.
+
+This example demonstrates how to use the StatsVisualizer to create
+various charts from CollectionStats data.
+
+Requirements:
+    pip install docling-core[examples]  # Includes matplotlib
+"""
+
+from pathlib import Path
+
+from stats_visualizer import StatsVisualizer
+
+from docling_core.transforms.profiler import CollectionStats, DocumentProfiler
+from docling_core.types.doc import DoclingDocument
+
+
+def load_documents_and_profile(doc_dir: Path) -> CollectionStats | None:
+    """Load documents from directory and profile them.
+
+    Args:
+        doc_dir: Directory containing JSON documents
+
+    Returns:
+        CollectionStats object or None if no documents found
+    """
+    if not doc_dir.exists():
+        print(f"Directory not found: {doc_dir}")
+        return None
+
+    docs = []
+    for json_file in doc_dir.glob("*.json"):
+        try:
+            docs.append(DoclingDocument.load_from_json(json_file))
+        except Exception:
+            pass
+
+    if not docs:
+        print("No documents found")
+        return None
+
+    # Profile collection
+    stats = DocumentProfiler.profile_collection(docs)
+    print(f"Loaded and profiled {stats.num_documents} documents")
+    return stats
+
+
+def visualize_single_histogram(stats: CollectionStats):
+    """Example 1: Plot a single histogram."""
+    print("\n" + "=" * 80)
+    print("Example 1: Single Histogram Plot")
+    print("=" * 80)
+
+    # Create histogram plot for pages (linear scale)
+    fig = StatsVisualizer.plot_histogram(
+        histogram=stats.histogram_pages,
+        title="Distribution of Pages per Document",
+        xlabel="Number of Pages",
+        ylabel="Number of Documents",
+        color="steelblue",
+    )
+
+    # Save the figure
+    output_file = Path("./pages_histogram.png")
+    StatsVisualizer.save_figure(fig, output_file)
+    print(f"Saved histogram to: {output_file}")
+
+    # Create histogram plot for pages (logarithmic scale)
+    fig_log = StatsVisualizer.plot_histogram(
+        histogram=stats.histogram_pages,
+        title="Distribution of Pages per Document (Log Scale)",
+        xlabel="Number of Pages",
+        ylabel="Number of Documents",
+        color="steelblue",
+        log_scale=True,
+    )
+
+    # Save the figure
+    output_file_log = Path("./pages_histogram_log.png")
+    StatsVisualizer.save_figure(fig_log, output_file_log)
+    print(f"Saved histogram (log scale) to: {output_file_log}")
+
+
+def visualize_deciles(stats: CollectionStats):
+    """Example 2: Plot deciles."""
+    print("\n" + "=" * 80)
+    print("Example 2: Decile Distribution Plot")
+    print("=" * 80)
+
+    # Create decile plot for tables
+    fig = StatsVisualizer.plot_deciles(
+        deciles=stats.deciles_tables,
+        title="Decile Distribution of Tables per Document",
+        ylabel="Number of Tables",
+        color="forestgreen",
+    )
+
+    # Save the figure
+    output_file = Path("./tables_deciles.png")
+    StatsVisualizer.save_figure(fig, output_file)
+    print(f"Saved decile plot to: {output_file}")
+
+
+def visualize_collection_overview(stats: CollectionStats):
+    """Example 3: Create comprehensive overview with multiple metrics."""
+    print("\n" + "=" * 80)
+    print("Example 3: Collection Overview (Multiple Histograms)")
+    print("=" * 80)
+
+    # Create overview plot with all metrics (linear scale)
+    fig = StatsVisualizer.plot_collection_overview(
+        stats=stats,
+        metrics=["pages", "tables", "pictures", "texts"],
+        figsize=(16, 10),
+    )
+
+    # Save the figure
+    output_file = Path("./collection_overview.png")
+    StatsVisualizer.save_figure(fig, output_file)
+    print(f"Saved collection overview to: {output_file}")
+
+    # Create overview plot with all metrics (logarithmic scale)
+    fig_log = StatsVisualizer.plot_collection_overview(
+        stats=stats,
+        metrics=["pages", "tables", "pictures", "texts"],
+        figsize=(16, 10),
+        log_scale=True,
+    )
+
+    # Save the figure
+    output_file_log = Path("./collection_overview_log.png")
+    StatsVisualizer.save_figure(fig_log, output_file_log)
+    print(f"Saved collection overview (log scale) to: {output_file_log}")
+
+
+def visualize_deciles_comparison(stats: CollectionStats):
+    """Example 4: Compare deciles across multiple metrics."""
+    print("\n" + "=" * 80)
+    print("Example 4: Decile Comparison Across Metrics")
+    print("=" * 80)
+
+    # Create comparison plot
+    fig = StatsVisualizer.plot_deciles_comparison(
+        stats=stats,
+        metrics=["pages", "tables", "pictures", "texts"],
+        figsize=(12, 6),
+    )
+
+    # Save the figure
+    output_file = Path("./deciles_comparison.png")
+    StatsVisualizer.save_figure(fig, output_file)
+    print(f"Saved decile comparison to: {output_file}")
+
+
+def create_custom_visualization(stats: CollectionStats):
+    """Example 5: Create custom visualization for specific metrics."""
+    print("\n" + "=" * 80)
+    print("Example 5: Custom Visualization")
+    print("=" * 80)
+
+    # Create histogram for pictures only (with log scale for high frequency on low values)
+    fig1 = StatsVisualizer.plot_histogram(
+        histogram=stats.histogram_pictures,
+        title="Picture Distribution (Log Scale)",
+        xlabel="Pictures per Document",
+        ylabel="Frequency",
+        color="coral",
+        figsize=(10, 6),
+        log_scale=True,
+    )
+    StatsVisualizer.save_figure(fig1, "./pictures_histogram_log.png")
+    print("Saved pictures histogram (log scale)")
+
+    # Create decile plot for texts only
+    fig2 = StatsVisualizer.plot_deciles(
+        deciles=stats.deciles_texts,
+        title="Text Items Decile Distribution",
+        ylabel="Number of Text Items",
+        color="mediumpurple",
+        figsize=(10, 6),
+    )
+    StatsVisualizer.save_figure(fig2, "./texts_deciles.png")
+    print("Saved texts decile plot")
+
+    # Create overview with selected metrics (log scale)
+    fig3 = StatsVisualizer.plot_collection_overview(
+        stats=stats,
+        metrics=["pages", "tables"],  # Only pages and tables
+        figsize=(12, 6),
+        log_scale=True,
+    )
+    StatsVisualizer.save_figure(fig3, "./pages_tables_overview_log.png")
+    print("Saved pages and tables overview (log scale)")
+
+
+def display_statistics_summary(stats: CollectionStats):
+    """Example 6: Display statistics summary with key insights."""
+    print("\n" + "=" * 80)
+    print("Example 6: Statistics Summary")
+    print("=" * 80)
+
+    print(f"\nCollection Summary ({stats.num_documents} documents):")
+    print("\nPages:")
+    print(f"  Range: {stats.min_pages} - {stats.max_pages}")
+    print(f"  Median (d5): {stats.deciles_pages[4]:.1f}")
+    print(f"  Mean: {stats.mean_pages:.2f}")
+    print(f"  Deciles: d1={stats.deciles_pages[0]:.1f}, "
+          f"d5={stats.deciles_pages[4]:.1f}, d9={stats.deciles_pages[8]:.1f}")
+
+    print("\nTables:")
+    print(f"  Range: {stats.min_tables} - {stats.max_tables}")
+    print(f"  Median (d5): {stats.deciles_tables[4]:.1f}")
+    print(f"  Mean: {stats.mean_tables:.2f}")
+
+    print("\nPictures:")
+    print(f"  Range: {stats.min_pictures} - {stats.max_pictures}")
+    print(f"  Median (d5): {stats.deciles_pictures[4]:.1f}")
+    print(f"  Mean: {stats.mean_pictures:.2f}")
+
+    print("\nText Items:")
+    print(f"  Range: {stats.min_texts} - {stats.max_texts}")
+    print(f"  Median (d5): {stats.deciles_texts[4]:.1f}")
+    print(f"  Mean: {stats.mean_texts:.2f}")
+
+
+if __name__ == "__main__":
+    try:
+        # Load documents once and profile them
+        doc_dir = Path("./test/data/doc")
+        stats = load_documents_and_profile(doc_dir)
+
+        if stats is None:
+            print("Failed to load documents. Exiting.")
+            exit(1)
+
+        # Run all examples with the same stats object
+        visualize_single_histogram(stats)
+        visualize_deciles(stats)
+        visualize_collection_overview(stats)
+        # visualize_deciles_comparison(stats)
+        create_custom_visualization(stats)
+        display_statistics_summary(stats)
+
+        print("\n" + "=" * 80)
+        print("All visualizations created successfully!")
+        print("Check the current directory for generated PNG files.")
+        print("=" * 80)
+
+    except ImportError as e:
+        print(f"\nError: {e}")
+        print("\nTo run this example, install matplotlib:")
+        print("  pip install docling-core[examples]")
+    except Exception as e:
+        print(f"\nError: {e}")
+        import traceback
+        traceback.print_exc()
+
@@ -0,0 +1,401 @@
+"""Tests for document profiler."""
+
+import json
+from pathlib import Path
+
+import pytest
+
+from docling_core.transforms.profiler import DocumentProfiler
+from docling_core.types.doc import BoundingBox, DoclingDocument, ProvenanceItem
+from docling_core.types.doc.document import DocumentOrigin, PageItem, Size, TableData
+from docling_core.types.doc.labels import DocItemLabel
+
+
+def test_profile_empty_document():
+    """Test profiling an empty document."""
+    doc = DoclingDocument(name="Empty Document")
+
+    stats = DocumentProfiler.profile_document(doc)
+
+    assert stats.name == "Empty Document"
+    assert stats.num_pages == 0
+    assert stats.num_tables == 0
+    assert stats.num_pictures == 0
+    assert stats.num_texts == 0
+    assert stats.num_key_value_items == 0
+    assert stats.num_form_items == 0
+    assert stats.total_items == 0
+    assert stats.avg_items_per_page == 0.0
+    assert stats.origin_mimetype is None
+
+
+def test_profile_simple_document():
+    """Test profiling a simple document with basic content."""
+    doc = DoclingDocument(
+        name="Simple Document",
+        origin=DocumentOrigin(
+            mimetype="application/pdf",
+            binary_hash=12345,
+            filename="test.pdf",
+        ),
+    )
+
+    # Add some pages
+    doc.pages[1] = PageItem(page_no=1, size=Size(width=612, height=792))
+    doc.pages[2] = PageItem(page_no=2, size=Size(width=612, height=792))
+
+    # Add some text items
+    doc.add_text(label=DocItemLabel.TEXT, text="Text 1", orig="Text 1")
+    doc.add_text(label=DocItemLabel.TEXT, text="Text 2", orig="Text 2")
+    doc.add_text(label=DocItemLabel.SECTION_HEADER, text="Section", orig="Section")
+
+    # Add a table
+    doc.add_table(data=TableData(num_rows=2, num_cols=2))
+
+    # Add a picture
+    doc.add_picture()
+
+    stats = DocumentProfiler.profile_document(doc)
+
+    assert stats.name == "Simple Document"
+    assert stats.num_pages == 2
+    assert stats.num_tables == 1
+    assert stats.num_pictures == 1
+    assert stats.num_texts == 3
+    assert stats.num_section_headers == 1
+    assert stats.total_items == 5
+    assert stats.avg_items_per_page == 2.5
+    assert stats.origin_mimetype == "application/pdf"
+
+
+def test_profile_document_with_pictures_for_ocr():
+    """Test profiling pictures that would trigger OCR based on area coverage."""
+    doc = DoclingDocument(name="Document with Pictures for OCR")
+
+    # Add a page
+    doc.pages[1] = PageItem(page_no=1, size=Size(width=1000, height=1000))
+
+    # Add a large picture (10% of page area, above default 5% threshold)
+    doc.add_picture(
+        prov=ProvenanceItem(
+            page_no=1,
+            bbox=BoundingBox(l=0, t=0, r=316.2, b=316.2),  # ~10% of page area
+            charspan=(0, 0),
+        )
+    )
+
+    # Add a small picture (2% of page area, below default 5% threshold)
+    doc.add_picture(
+        prov=ProvenanceItem(
+            page_no=1,
+            bbox=BoundingBox(l=0, t=0, r=141.4, b=141.4),  # ~2% of page area
+            charspan=(0, 0),
+        )
+    )
+
+    # Add a medium picture (exactly 5% of page area, at threshold)
+    doc.add_picture(
+        prov=ProvenanceItem(
+            page_no=1,
+            bbox=BoundingBox(l=0, t=0, r=223.607, b=223.607),  # exactly 5% of page area
+            charspan=(0, 0),
+        )
+    )
+
+    stats = DocumentProfiler.profile_document(doc)
+
+    assert stats.num_pictures == 3
+    # 2 out of 3 pictures meet the threshold (large and medium)
+    assert stats.num_pictures_for_ocr == 2
+
+    # Test with custom threshold of 10%
+    stats_custom = DocumentProfiler.profile_document(doc, bitmap_coverage_threshold=0.10)
+    # Only large picture (9.99%) is below 10%, so 0 pictures
+    assert stats_custom.num_pictures_for_ocr == 0
+
+    # Test with custom threshold of 2%
+    stats_custom2 = DocumentProfiler.profile_document(doc, bitmap_coverage_threshold=0.02)
+    # 2 pictures are above 2% threshold (large and medium, small is 1.99%)
+    assert stats_custom2.num_pictures_for_ocr == 2
+
+
+def test_profile_collection_empty():
+    """Test profiling an empty collection."""
+    stats = DocumentProfiler.profile_collection([])
+
+    assert stats.num_documents == 0
+    assert stats.total_pages == 0
+    assert stats.total_tables == 0
+    assert stats.total_pictures == 0
+    assert stats.avg_items_per_document == 0.0
+    assert stats.avg_items_per_page == 0.0
+    assert stats.deciles_pages == [0.0] * 9
+    assert stats.deciles_tables == [0.0] * 9
+    assert stats.histogram_pages.bins == []
+    assert stats.histogram_pages.frequencies == []
+    assert stats.histogram_pages.bin_width == 0.0
+
+
+def test_profile_collection_single_document():
+    """Test profiling a collection with a single document."""
+    doc = DoclingDocument(name="Single Doc")
+    doc.pages[1] = PageItem(page_no=1, size=Size(width=612, height=792))
+    doc.add_text(label=DocItemLabel.PARAGRAPH, text="Text", orig="Text")
+    doc.add_table(data=TableData(num_rows=1, num_cols=1))
+    doc.add_picture()
+
+    stats = DocumentProfiler.profile_collection(doc)
+
+    assert stats.num_documents == 1
+    assert stats.total_pages == 1
+    assert stats.total_tables == 1
+    assert stats.total_pictures == 1
+    assert stats.total_texts == 1
+    assert stats.min_pages == 1
+    assert stats.max_pages == 1
+    assert stats.deciles_pages[4] == 1.0  # median is d5 (5th decile, index 4)
+    assert stats.mean_pages == 1.0
+    assert stats.std_pages == 0.0
+    # Check histogram exists
+    assert len(stats.histogram_pages.bins) > 0
+    assert len(stats.histogram_pages.frequencies) > 0
+
+
+def test_profile_collection_multiple_documents():
+    """Test profiling a collection with multiple documents."""
+    docs = []
+
+    # Document 1: 2 pages, 1 table, 2 pictures, 2 texts
+    doc1 = DoclingDocument(
+        name="Doc1",
+        origin=DocumentOrigin(mimetype="application/pdf", binary_hash=1, filename="doc1.pdf"),
+    )
+    doc1.pages[1] = PageItem(page_no=1, size=Size(width=612, height=792))
+    doc1.pages[2] = PageItem(page_no=2, size=Size(width=612, height=792))
+    doc1.add_table(data=TableData(num_rows=1, num_cols=1))
+    doc1.add_picture()
+    doc1.add_picture()
+    doc1.add_text(label=DocItemLabel.TEXT, text="Text 1", orig="Text 1")
+    doc1.add_text(label=DocItemLabel.TEXT, text="Text 2", orig="Text 2")
+    docs.append(doc1)
+
+    # Document 2: 5 pages, 3 tables, 1 picture, 10 texts
+    doc2 = DoclingDocument(
+        name="Doc2",
+        origin=DocumentOrigin(mimetype="application/pdf", binary_hash=2, filename="doc2.pdf"),
+    )
+    for i in range(1, 6):
+        doc2.pages[i] = PageItem(page_no=i, size=Size(width=612, height=792))
+    for _ in range(3):
+        doc2.add_table(data=TableData(num_rows=1, num_cols=1))
+    doc2.add_picture()
+    for i in range(10):
+        doc2.add_text(label=DocItemLabel.TEXT, text=f"Text {i}", orig=f"Text {i}")
+    docs.append(doc2)
+
+    # Document 3: 1 page, 0 tables, 5 pictures, 2 texts
+    doc3 = DoclingDocument(
+        name="Doc3",
+        origin=DocumentOrigin(mimetype="text/html", binary_hash=3, filename="doc3.html"),
+    )
+    doc3.pages[1] = PageItem(page_no=1, size=Size(width=612, height=792))
+    for _ in range(5):
+        doc3.add_picture()
+    doc3.add_text(label=DocItemLabel.TEXT, text="T1", orig="T1")
+    doc3.add_text(label=DocItemLabel.TEXT, text="T2", orig="T2")
+    docs.append(doc3)
+
+    stats = DocumentProfiler.profile_collection(docs, include_individual_stats=True)
+
+    # Basic counts
+    assert stats.num_documents == 3
+    assert stats.total_pages == 8  # 2 + 5 + 1
+    assert stats.total_tables == 4  # 1 + 3 + 0
+    assert stats.total_pictures == 8  # 2 + 1 + 5
+    assert stats.total_texts == 14  # 2 + 10 + 2
+
+    # Page statistics
+    assert stats.min_pages == 1
+    assert stats.max_pages == 5
+    assert stats.deciles_pages[4] == 2.0  # median is d5 (5th decile, index 4)
+    assert stats.mean_pages == pytest.approx(8 / 3)
+    assert stats.std_pages > 0
+    # Check deciles are in order: [d1, d2, d3, d4, d5, d6, d7, d8, d9]
+    assert stats.deciles_pages[0] <= stats.deciles_pages[4] <= stats.deciles_pages[8]
+    # Check histogram exists
+    assert len(stats.histogram_pages.bins) > 0
+    assert len(stats.histogram_pages.frequencies) > 0
+
+    # Table statistics
+    assert stats.min_tables == 0
+    assert stats.max_tables == 3
+    assert stats.deciles_tables[4] == 1.0  # median is d5 (5th decile, index 4)
+    assert stats.mean_tables == pytest.approx(4 / 3)
+    # Check histogram exists
+    assert len(stats.histogram_tables.bins) > 0
+
+    # Picture statistics
+    assert stats.min_pictures == 1
+    assert stats.max_pictures == 5
+    assert stats.deciles_pictures[4] == 2.0  # median is d5 (5th decile, index 4)
+    assert stats.mean_pictures == pytest.approx(8 / 3)
+    # Check histogram exists
+    assert len(stats.histogram_pictures.bins) > 0
+
+    # Text statistics
+    assert stats.min_texts == 2
+    assert stats.max_texts == 10
+    assert stats.deciles_texts[4] == 2.0  # median is d5 (5th decile, index 4)
+    assert stats.mean_texts == pytest.approx(14 / 3)
+    # Check histogram exists
+    assert len(stats.histogram_texts.bins) > 0
+
+    # Document characteristics
+    assert len(stats.document_stats) == 3
+
+    # MIME type distribution
+    assert stats.mimetype_distribution["application/pdf"] == 2
+    assert stats.mimetype_distribution["text/html"] == 1
+
+    # Computed fields
+    assert stats.total_items == 26  # 14 texts + 4 tables + 8 pictures
+    assert stats.avg_items_per_document == pytest.approx(26 / 3)
+    assert stats.avg_items_per_page == pytest.approx(26 / 8)
+
+
+def test_profile_collection_with_iterator():
+    """Test profiling a collection using an iterator (generator)."""
+
+    def doc_generator():
+        for i in range(3):
+            doc = DoclingDocument(name=f"Doc{i}")
+            doc.pages[1] = PageItem(page_no=1, size=Size(width=612, height=792))
+            doc.add_text(label=DocItemLabel.TEXT, text=f"Text {i}", orig=f"Text {i}")
+            yield doc
+
+    stats = DocumentProfiler.profile_collection(doc_generator())
+
+    assert stats.num_documents == 3
+    assert stats.total_pages == 3
+    assert stats.total_texts == 3
+
+
+def test_profile_collection_without_individual_stats():
+    """Test that individual stats are not included by default."""
+    docs = [DoclingDocument(name=f"Doc{i}") for i in range(3)]
+
+    stats = DocumentProfiler.profile_collection(docs, include_individual_stats=False)
+
+    assert len(stats.document_stats) == 0
+
+
+def test_statistics_serialization():
+    """Test that statistics can be serialized to JSON."""
+    doc = DoclingDocument(name="Test Doc")
+    doc.pages[1] = PageItem(page_no=1, size=Size(width=612, height=792))
+    doc.add_text(label=DocItemLabel.TEXT, text="Text", orig="Text")
+
+    doc_stats = DocumentProfiler.profile_document(doc)
+
+    # Test DocumentStatistics serialization
+    json_str = doc_stats.model_dump_json()
+    data = json.loads(json_str)
+    assert data["name"] == "Test Doc"
+    assert data["num_pages"] == 1
+    assert data["total_items"] == 1
+
+    # Test CollectionStatistics serialization
+    coll_stats = DocumentProfiler.profile_collection([doc])
+    json_str = coll_stats.model_dump_json()
+    data = json.loads(json_str)
+    assert data["num_documents"] == 1
+    assert data["total_pages"] == 1
+
+
+def test_profile_real_document():
+    """Test profiling a real document from test data."""
+    test_file = Path("./test/data/doc/2408.09869v3_enriched.json")
+    if not test_file.exists():
+        pytest.skip("Test file not found")
+
+    doc = DoclingDocument.load_from_json(test_file)
+    stats = DocumentProfiler.profile_document(doc)
+
+    # Basic sanity checks
+    assert stats.name == doc.name
+    assert stats.num_pages == len(doc.pages)
+    assert stats.num_tables == len(doc.tables)
+    assert stats.num_pictures == len(doc.pictures)
+    assert stats.num_texts == len(doc.texts)
+    assert stats.total_items > 0
+
+
+def test_label_specific_counts():
+    """Test that label-specific counts are accurate."""
+    doc = DoclingDocument(name="Label Test")
+
+    # Add various types of text items
+    doc.add_text(label=DocItemLabel.SECTION_HEADER, text="Section", orig="Section")
+    doc.add_text(label=DocItemLabel.LIST_ITEM, text="Item 1", orig="Item 1")
+    doc.add_text(label=DocItemLabel.LIST_ITEM, text="Item 2", orig="Item 2")
+    doc.add_text(label=DocItemLabel.LIST_ITEM, text="Item 3", orig="Item 3")
+    doc.add_text(label=DocItemLabel.CODE, text="code", orig="code")
+    doc.add_text(label=DocItemLabel.FORMULA, text="x=y", orig="x=y")
+    doc.add_text(label=DocItemLabel.TEXT, text="Text", orig="Text")
+
+    stats = DocumentProfiler.profile_document(doc)
+
+    assert stats.num_section_headers == 1
+    assert stats.num_list_items == 3
+    assert stats.num_code_items == 1
+    assert stats.num_formulas == 1
+    assert stats.num_texts == 7
+
+
+def test_profile_sample_document(sample_doc):
+    """Test profiling the sample document from conftest.py fixture."""
+    stats = DocumentProfiler.profile_document(sample_doc)
+
+    # Verify basic document properties
+    assert stats.name == "Untitled 1"
+    assert stats.num_pages == 0  # sample_doc doesn't add pages explicitly
+
+    # Verify item counts based on the sample_doc construction
+    assert stats.num_tables == len(sample_doc.tables)
+    assert stats.num_pictures == len(sample_doc.pictures)
+    assert stats.num_texts == len(sample_doc.texts)
+    assert stats.num_key_value_items == len(sample_doc.key_value_items)
+    assert stats.num_form_items == len(sample_doc.form_items)
+
+    # Verify label-specific counts
+    assert stats.num_section_headers > 0  # sample_doc has section headers
+    assert stats.num_list_items > 0  # sample_doc has many list items
+    assert stats.num_code_items > 0  # sample_doc has code items
+    assert stats.num_formulas > 0  # sample_doc has formulas
+
+    # Verify computed fields
+    assert stats.total_items > 0
+    assert stats.total_items == (
+        stats.num_texts
+        + stats.num_tables
+        + stats.num_pictures
+        + stats.num_key_value_items
+        + stats.num_form_items
+    )
+
+    # sample_doc has no pages, so avg_items_per_page should be 0
+    assert stats.avg_items_per_page == 0.0
+
+
+def test_calculate_deciles_empty():
+    """Test _calculate_deciles with empty data (line 191)."""
+    result = DocumentProfiler._calculate_deciles([])
+    assert result == [0.0] * 9
+
+
+def test_calculate_histogram_empty():
+    """Test _calculate_histogram with empty data (line 208)."""
+    result = DocumentProfiler._calculate_histogram([])
+    assert result.bins == []
+    assert result.frequencies == []
+    assert result.bin_width == 0.0