diff --git a/README.md b/README.md index 0fae9942..acd121a4 100644 --- a/README.md +++ b/README.md @@ -72,6 +72,17 @@ different use cases. - [Hybrid chunking example](https://docling-project.github.io/docling/examples/hybrid_chunking/) - [Advanced chunking and serialization](https://docling-project.github.io/docling/examples/advanced_chunking_and_serialization/) +### Profiling + +The Profiling API enables extraction of comprehensive statistics from DoclingDocument objects, +both for individual documents and collections. It provides metrics on document structure +(pages, tables, pictures, text items) along with statistical distributions (deciles, histograms) +and visualization capabilities for analyzing document collections at scale. + +👉 More details: +- [Document profiling example](./examples/document_profiling.py) +- [Collection statistics visualization](./examples/visualize_collection_stats.py) + ## Contributing Please read [Contributing to Docling Core](./CONTRIBUTING.md) for details. 
class DocumentStats(BaseModel):
    """Statistics for a single DoclingDocument."""

    # NOTE: the class docstring and the property docstrings below are emitted by
    # pydantic as schema descriptions, so their wording is part of the model's output.

    name: Annotated[str, Field(description="Document name")]

    # Counts of the top-level item collections of the document.
    num_pages: Annotated[int, Field(description="Number of pages in the document")] = 0
    num_tables: Annotated[int, Field(description="Number of tables in the document")] = 0
    num_pictures: Annotated[int, Field(description="Number of pictures in the document")] = 0
    num_texts: Annotated[int, Field(description="Number of text items in the document")] = 0
    num_key_value_items: Annotated[int, Field(description="Number of key-value items in the document")] = 0
    num_form_items: Annotated[int, Field(description="Number of form items in the document")] = 0

    # Text items broken down by label (each of these is also included in num_texts).
    num_section_headers: Annotated[int, Field(description="Number of section headers")] = 0
    num_list_items: Annotated[int, Field(description="Number of list items")] = 0
    num_code_items: Annotated[int, Field(description="Number of code items")] = 0
    num_formulas: Annotated[int, Field(description="Number of formula items")] = 0

    # Origin and OCR-related characteristics.
    origin_mimetype: Annotated[str | None, Field(description="Origin MIME type if available")] = None
    num_pictures_for_ocr: Annotated[
        int,
        Field(description="Number of pictures that would trigger OCR based on area coverage threshold"),
    ] = 0

    @computed_field  # type: ignore[prop-decorator]
    @property
    def total_items(self) -> int:
        """Total number of items in the document."""
        # Label-specific counters are deliberately left out: they are subsets of num_texts.
        item_counts = (
            self.num_texts,
            self.num_tables,
            self.num_pictures,
            self.num_key_value_items,
            self.num_form_items,
        )
        return sum(item_counts)

    @computed_field  # type: ignore[prop-decorator]
    @property
    def avg_items_per_page(self) -> float:
        """Average number of items per page."""
        # A document without pages reports 0.0 rather than dividing by zero.
        return self.total_items / self.num_pages if self.num_pages else 0.0
of pages in a document")] = 0 + max_pages: Annotated[int, Field(description="Maximum number of pages in a document")] = 0 + deciles_pages: Annotated[DecilesT, Field(description="Deciles of pages per document")] = [0.0] * 9 + histogram_pages: Annotated[Histogram, Field(description="Histogram of pages per document")] = Histogram() + mean_pages: Annotated[float, Field(description="Mean number of pages per document")] = 0.0 + std_pages: Annotated[float, Field(description="Standard deviation of pages per document")] = 0.0 + + # Table statistics + total_tables: Annotated[int, Field(description="Total number of tables across all documents")] = 0 + min_tables: Annotated[int, Field(description="Minimum number of tables in a document")] = 0 + max_tables: Annotated[int, Field(description="Maximum number of tables in a document")] = 0 + deciles_tables: Annotated[DecilesT, Field(description="Deciles of tables per document")] = [0.0] * 9 + histogram_tables: Annotated[Histogram, Field(description="Histogram of tables per document")] = Histogram() + mean_tables: Annotated[float, Field(description="Mean number of tables per document")] = 0.0 + std_tables: Annotated[float, Field(description="Standard deviation of tables per document")] = 0.0 + + # Picture statistics + total_pictures: Annotated[int, Field(description="Total number of pictures across all documents")] = 0 + min_pictures: Annotated[int, Field(description="Minimum number of pictures in a document")] = 0 + max_pictures: Annotated[int, Field(description="Maximum number of pictures in a document")] = 0 + deciles_pictures: Annotated[DecilesT, Field(description="Deciles of pictures per document")] = [0.0] * 9 + histogram_pictures: Annotated[Histogram, Field(description="Histogram of pictures per document")] = Histogram() + mean_pictures: Annotated[float, Field(description="Mean number of pictures per document")] = 0.0 + std_pictures: Annotated[float, Field(description="Standard deviation of pictures per document")] = 0.0 + + 
# Text statistics + total_texts: Annotated[int, Field(description="Total number of text items across all documents")] = 0 + min_texts: Annotated[int, Field(description="Minimum number of text items in a document")] = 0 + max_texts: Annotated[int, Field(description="Maximum number of text items in a document")] = 0 + deciles_texts: Annotated[DecilesT, Field(description="Deciles of text items per document")] = [0.0] * 9 + histogram_texts: Annotated[Histogram, Field(description="Histogram of text items per document")] = Histogram() + mean_texts: Annotated[float, Field(description="Mean number of text items per document")] = 0.0 + std_texts: Annotated[float, Field(description="Standard deviation of text items per document")] = 0.0 + + # Additional item statistics + total_key_value_items: Annotated[int, Field(description="Total number of key-value items")] = 0 + total_form_items: Annotated[int, Field(description="Total number of form items")] = 0 + total_section_headers: Annotated[int, Field(description="Total number of section headers")] = 0 + total_list_items: Annotated[int, Field(description="Total number of list items")] = 0 + total_code_items: Annotated[int, Field(description="Total number of code items")] = 0 + total_formulas: Annotated[int, Field(description="Total number of formula items")] = 0 + + # Document characteristics + # Pictures for OCR statistics + total_pictures_for_ocr: Annotated[ + int, Field(description="Total number of pictures requiring OCR across all documents") + ] = 0 + min_pictures_for_ocr: Annotated[ + int, Field(description="Minimum number of pictures requiring OCR in a document") + ] = 0 + max_pictures_for_ocr: Annotated[ + int, Field(description="Maximum number of pictures requiring OCR in a document") + ] = 0 + deciles_pictures_for_ocr: Annotated[ + DecilesT, Field(description="Deciles of pictures requiring OCR per document") + ] = [0.0] * 9 + histogram_pictures_for_ocr: Annotated[ + Histogram, Field(description="Histogram of pictures 
class DocumentProfiler:
    """Profiler for extracting statistics from DoclingDocument objects.

    The class is a stateless namespace: every method is a ``@staticmethod``.
    """

    @staticmethod
    def _calculate_deciles(data: list[int]) -> list[float]:
        """Calculate deciles (1st through 9th) for a list of values.

        Args:
            data: List of integer values

        Returns:
            List of 9 floats representing [d1, d2, d3, d4, d5, d6, d7, d8, d9]
            (10th, 20th, 30th, 40th, 50th, 60th, 70th, 80th, 90th percentiles).
            An empty input yields nine zeros.
        """
        if not data:
            return [0.0] * 9

        decile_values = np.percentile(data, [10, 20, 30, 40, 50, 60, 70, 80, 90])
        # Convert numpy scalars to plain floats so the result serializes cleanly.
        return [float(val) for val in decile_values]

    @staticmethod
    def _calculate_histogram(data: list[int], num_bins: int = 10) -> Histogram:
        """Calculate histogram for a list of values.

        Args:
            data: List of integer values
            num_bins: Number of bins for the histogram (default: 10)

        Returns:
            Histogram object holding the bin edges, the frequency of each bin and
            the width of the first bin. An empty input yields an empty Histogram.
        """
        if not data:
            return Histogram()

        frequencies, bin_edges = np.histogram(data, bins=num_bins)

        # The width is read off the first pair of edges.
        bin_width = float(bin_edges[1] - bin_edges[0]) if len(bin_edges) > 1 else 0.0

        return Histogram(
            bins=[float(edge) for edge in bin_edges],
            frequencies=[int(freq) for freq in frequencies],
            bin_width=bin_width,
        )

    @staticmethod
    def _sample_stdev(data: list[int]) -> float:
        """Return the sample standard deviation of data, or 0.0 for fewer than two values."""
        # statistics.stdev raises StatisticsError when given fewer than two data points.
        return statistics.stdev(data) if len(data) > 1 else 0.0

    @staticmethod
    def profile_document(doc: DoclingDocument, bitmap_coverage_threshold: float = 0.05) -> DocumentStats:
        """Extract statistics from a single DoclingDocument.

        Args:
            doc: The DoclingDocument to profile
            bitmap_coverage_threshold: Threshold for picture area coverage (0-1) to trigger OCR.
                Pictures whose area covers at least this fraction of their page are counted
                as requiring OCR. Default is 0.05 (5% of page area).

        Returns:
            DocumentStats containing the extracted metrics
        """
        # Count text items for the labels that get their own counter.
        label_counts = {
            DocItemLabel.SECTION_HEADER: 0,
            DocItemLabel.LIST_ITEM: 0,
            DocItemLabel.CODE: 0,
            DocItemLabel.FORMULA: 0,
        }
        for text_item in doc.texts:
            if text_item.label in label_counts:
                label_counts[text_item.label] += 1

        # Count the pictures that would trigger OCR based on page-area coverage.
        num_pictures_for_ocr = 0
        for picture in doc.pictures:
            if not picture.prov:
                continue  # no provenance, hence no bounding box to measure

            prov = picture.prov[0]  # only the first provenance item is considered
            picture_area = prov.bbox.width * prov.bbox.height

            page = doc.pages.get(prov.page_no)
            if page is None:
                continue  # provenance points at a page the document does not contain

            page_area = page.size.width * page.size.height
            # Pages with a non-positive area are skipped to avoid dividing by zero.
            if page_area > 0 and picture_area / page_area >= bitmap_coverage_threshold:
                num_pictures_for_ocr += 1

        return DocumentStats(
            name=doc.name,
            num_pages=len(doc.pages),
            num_tables=len(doc.tables),
            num_pictures=len(doc.pictures),
            num_texts=len(doc.texts),
            num_key_value_items=len(doc.key_value_items),
            num_form_items=len(doc.form_items),
            num_section_headers=label_counts[DocItemLabel.SECTION_HEADER],
            num_list_items=label_counts[DocItemLabel.LIST_ITEM],
            num_code_items=label_counts[DocItemLabel.CODE],
            num_formulas=label_counts[DocItemLabel.FORMULA],
            origin_mimetype=doc.origin.mimetype if doc.origin else None,
            num_pictures_for_ocr=num_pictures_for_ocr,
        )

    @staticmethod
    def profile_collection(
        documents: Iterable[DoclingDocument] | DoclingDocument,
        include_individual_stats: bool = False,
        bitmap_coverage_threshold: float = 0.05,
        num_bins: int = 10,
    ) -> CollectionStats:
        """Extract statistics from a collection of DoclingDocument objects.

        The input is iterated exactly once, so generators are supported.

        Args:
            documents: An iterable of DoclingDocument objects, or a single document
            include_individual_stats: Whether to include individual document statistics
                in the result (useful for detailed analysis but increases memory usage)
            bitmap_coverage_threshold: Threshold for picture area coverage (0-1) to
                trigger OCR. Pictures whose area covers at least this fraction of their
                page are counted as requiring OCR. Default is 0.05 (5% of page area).
            num_bins: Number of bins for histograms. Default is 10.

        Returns:
            CollectionStats containing the aggregated metrics. An empty collection
            yields a CollectionStats with all default values.
        """
        # Handle single document case
        if isinstance(documents, DoclingDocument):
            documents = [documents]

        # Per-document values of the metrics that get a full distribution.
        doc_stats_list: list[DocumentStats] = []
        pages_list: list[int] = []
        tables_list: list[int] = []
        pictures_list: list[int] = []
        texts_list: list[int] = []
        pictures_for_ocr_list: list[int] = []

        # Metrics that are only reported as collection-wide totals.
        total_key_value_items = 0
        total_form_items = 0
        total_section_headers = 0
        total_list_items = 0
        total_code_items = 0
        total_formulas = 0

        mimetype_distribution: dict[str, int] = {}

        for doc in documents:
            doc_stats = DocumentProfiler.profile_document(doc, bitmap_coverage_threshold=bitmap_coverage_threshold)

            if include_individual_stats:
                doc_stats_list.append(doc_stats)

            pages_list.append(doc_stats.num_pages)
            tables_list.append(doc_stats.num_tables)
            pictures_list.append(doc_stats.num_pictures)
            texts_list.append(doc_stats.num_texts)
            pictures_for_ocr_list.append(doc_stats.num_pictures_for_ocr)

            total_key_value_items += doc_stats.num_key_value_items
            total_form_items += doc_stats.num_form_items
            total_section_headers += doc_stats.num_section_headers
            total_list_items += doc_stats.num_list_items
            total_code_items += doc_stats.num_code_items
            total_formulas += doc_stats.num_formulas

            # Documents without a (non-empty) MIME type are left out of the distribution.
            if doc_stats.origin_mimetype:
                mimetype_distribution[doc_stats.origin_mimetype] = (
                    mimetype_distribution.get(doc_stats.origin_mimetype, 0) + 1
                )

        num_documents = len(pages_list)

        # min()/max()/mean() below would fail on empty lists, so bail out early.
        if num_documents == 0:
            return CollectionStats()

        deciles = DocumentProfiler._calculate_deciles
        histogram = DocumentProfiler._calculate_histogram
        stdev = DocumentProfiler._sample_stdev

        return CollectionStats(
            num_documents=num_documents,
            # Page statistics
            total_pages=sum(pages_list),
            min_pages=min(pages_list),
            max_pages=max(pages_list),
            deciles_pages=deciles(pages_list),
            histogram_pages=histogram(pages_list, num_bins=num_bins),
            mean_pages=statistics.mean(pages_list),
            std_pages=stdev(pages_list),
            # Table statistics
            total_tables=sum(tables_list),
            min_tables=min(tables_list),
            max_tables=max(tables_list),
            deciles_tables=deciles(tables_list),
            histogram_tables=histogram(tables_list, num_bins=num_bins),
            mean_tables=statistics.mean(tables_list),
            std_tables=stdev(tables_list),
            # Picture statistics
            total_pictures=sum(pictures_list),
            min_pictures=min(pictures_list),
            max_pictures=max(pictures_list),
            deciles_pictures=deciles(pictures_list),
            histogram_pictures=histogram(pictures_list, num_bins=num_bins),
            mean_pictures=statistics.mean(pictures_list),
            std_pictures=stdev(pictures_list),
            # Text statistics
            total_texts=sum(texts_list),
            min_texts=min(texts_list),
            max_texts=max(texts_list),
            deciles_texts=deciles(texts_list),
            histogram_texts=histogram(texts_list, num_bins=num_bins),
            mean_texts=statistics.mean(texts_list),
            std_texts=stdev(texts_list),
            # Additional totals
            total_key_value_items=total_key_value_items,
            total_form_items=total_form_items,
            total_section_headers=total_section_headers,
            total_list_items=total_list_items,
            total_code_items=total_code_items,
            total_formulas=total_formulas,
            # Pictures for OCR statistics
            total_pictures_for_ocr=sum(pictures_for_ocr_list),
            min_pictures_for_ocr=min(pictures_for_ocr_list),
            max_pictures_for_ocr=max(pictures_for_ocr_list),
            deciles_pictures_for_ocr=deciles(pictures_for_ocr_list),
            histogram_pictures_for_ocr=histogram(pictures_for_ocr_list, num_bins=num_bins),
            mean_pictures_for_ocr=statistics.mean(pictures_for_ocr_list),
            std_pictures_for_ocr=stdev(pictures_for_ocr_list),
            mimetype_distribution=mimetype_distribution,
            # Only populated when include_individual_stats is set; otherwise stays empty.
            document_stats=doc_stats_list,
        )
DocumentProfiler.profile_document(doc) + + # Print statistics + print(f"\nDocument: {stats.name}") + print(f"Pages: {stats.num_pages}") + print(f"Tables: {stats.num_tables}") + print(f"Pictures: {stats.num_pictures}") + print(f"Text items: {stats.num_texts}") + print(f" - Section headers: {stats.num_section_headers}") + print(f" - List items: {stats.num_list_items}") + print(f" - Code blocks: {stats.num_code_items}") + print(f" - Formulas: {stats.num_formulas}") + print(f"\nTotal items: {stats.total_items}") + print(f"Average items per page: {stats.avg_items_per_page:.2f}") + print(f"\nOrigin MIME type: {stats.origin_mimetype}") + print(f"Pictures requiring OCR: {stats.num_pictures_for_ocr}") + + # Export to JSON + json_output = stats.model_dump_json(indent=2) + print(f"\nJSON export (first 500 chars):\n{json_output[:500]}...") + + +def profile_document_collection(): + """Example: Profile a collection of documents.""" + print("\n" + "=" * 80) + print("Example 2: Profiling a Document Collection") + print("=" * 80) + + # Load multiple documents + doc_dir = Path("./test/data/doc") + if not doc_dir.exists(): + print(f"Directory not found: {doc_dir}") + return + + # Load all JSON documents + docs = [] + for json_file in doc_dir.glob("*.json"): + try: + doc = DoclingDocument.load_from_json(json_file) + docs.append(doc) + except Exception as e: + print(f"Skipping {json_file.name}: {e}") + + if not docs: + print("No documents found") + return + + print(f"\nLoaded {len(docs)} documents") + + # Profile the collection + stats = DocumentProfiler.profile_collection(docs, include_individual_stats=True) + + # Print collection statistics + print("\nCollection Statistics:") + print(f"Number of documents: {stats.num_documents}") + print("\nPages:") + print(f" Total: {stats.total_pages}") + print(f" Min: {stats.min_pages}, Max: {stats.max_pages}") + print(f" Median (d5): {stats.deciles_pages[4]:.1f}, Mean: {stats.mean_pages:.2f}") + print(f" Deciles: d1={stats.deciles_pages[0]:.1f}, 
d5={stats.deciles_pages[4]:.1f}, d9={stats.deciles_pages[8]:.1f}") + print(f" Std Dev: {stats.std_pages:.2f}") + print(f" Histogram bins: {len(stats.histogram_pages.bins)}, bin width: {stats.histogram_pages.bin_width:.2f}") + + print("\nTables:") + print(f" Total: {stats.total_tables}") + print(f" Min: {stats.min_tables}, Max: {stats.max_tables}") + print(f" Median (d5): {stats.deciles_tables[4]:.1f}, Mean: {stats.mean_tables:.2f}") + print(f" Deciles: d1={stats.deciles_tables[0]:.1f}, d5={stats.deciles_tables[4]:.1f}, d9={stats.deciles_tables[8]:.1f}") + print(f" Std Dev: {stats.std_tables:.2f}") + + print("\nPictures:") + print(f" Total: {stats.total_pictures}") + print(f" Min: {stats.min_pictures}, Max: {stats.max_pictures}") + print(f" Median (d5): {stats.deciles_pictures[4]:.1f}, Mean: {stats.mean_pictures:.2f}") + print(f" Deciles: d1={stats.deciles_pictures[0]:.1f}, d5={stats.deciles_pictures[4]:.1f}, d9={stats.deciles_pictures[8]:.1f}") + print(f" Std Dev: {stats.std_pictures:.2f}") + + print("\nText Items:") + print(f" Total: {stats.total_texts}") + print(f" Min: {stats.min_texts}, Max: {stats.max_texts}") + print(f" Median (d5): {stats.deciles_texts[4]:.1f}, Mean: {stats.mean_texts:.2f}") + print(f" Deciles: d1={stats.deciles_texts[0]:.1f}, d5={stats.deciles_texts[4]:.1f}, d9={stats.deciles_texts[8]:.1f}") + print(f" Std Dev: {stats.std_texts:.2f}") + + print("\nPictures Requiring OCR:") + print(f" Total: {stats.total_pictures_for_ocr}") + print(f" Min: {stats.min_pictures_for_ocr}, Max: {stats.max_pictures_for_ocr}") + print(f" Median (d5): {stats.deciles_pictures_for_ocr[4]:.1f}, Mean: {stats.mean_pictures_for_ocr:.2f}") + print(f" Deciles: d1={stats.deciles_pictures_for_ocr[0]:.1f}, d5={stats.deciles_pictures_for_ocr[4]:.1f}, d9={stats.deciles_pictures_for_ocr[8]:.1f}") + print(f" Std Dev: {stats.std_pictures_for_ocr:.2f}") + + if stats.mimetype_distribution: + print("\nMIME Type Distribution:") + for mimetype, count in 
def profile_with_generator():
    """Example: Profile documents using a generator (memory efficient)."""
    print("\n" + "=" * 80)
    print("Example 3: Profiling with Generator (Memory Efficient)")
    print("=" * 80)

    doc_dir = Path("./test/data/doc")
    if not doc_dir.exists():
        print(f"Directory not found: {doc_dir}")
        return

    def document_generator():
        """Yield documents one at a time, reporting files that fail to load."""
        for json_file in doc_dir.glob("*.json"):
            # Keep the try body to the load call so errors raised elsewhere are not masked.
            try:
                doc = DoclingDocument.load_from_json(json_file)
            except Exception as e:
                # Best-effort example: skip invalid documents, but say so instead of hiding it.
                print(f"Skipping {json_file.name}: {e}")
                continue
            yield doc

    # Profile using generator - documents are not all loaded into memory
    # perf_counter is monotonic, so the elapsed time is not affected by system clock changes.
    start_time = time.perf_counter()
    stats = DocumentProfiler.profile_collection(
        document_generator(),
        include_individual_stats=False,  # Don't store individual stats to save memory
    )
    elapsed_time = time.perf_counter() - start_time

    print(f"\nProcessed {stats.num_documents} documents in {elapsed_time:.2f} seconds")
    print(f"Total pages: {stats.total_pages}")
    print(f"Total tables: {stats.total_tables}")
    print(f"Total pictures: {stats.total_pictures}")
    print(f"Mean pages per document: {stats.mean_pages:.2f}")
def analyze_document_characteristics():
    """Example: Analyze specific document characteristics.

    Profiles every JSON document in the test data directory individually and
    lists the five documents with the most pictures that would trigger OCR.
    """
    print("\n" + "=" * 80)
    print("Example 5: Analyzing Document Characteristics")
    print("=" * 80)

    doc_dir = Path("./test/data/doc")
    if not doc_dir.exists():
        print(f"Directory not found: {doc_dir}")
        return

    # (document name, number of OCR-triggering pictures) for each affected document
    ocr_candidate_docs = []

    for json_file in doc_dir.glob("*.json"):
        try:
            doc = DoclingDocument.load_from_json(json_file)
            stats = DocumentProfiler.profile_document(doc)
        except Exception as e:
            # Best-effort example: skip documents that cannot be processed, but report them.
            print(f"Skipping {json_file.name}: {e}")
            continue

        if stats.num_pictures_for_ocr > 0:
            ocr_candidate_docs.append((stats.name, stats.num_pictures_for_ocr))

    print(f"\nDocuments with OCR requirements: {len(ocr_candidate_docs)}")
    if ocr_candidate_docs:
        # Show the five documents with the highest counts first.
        for name, count in sorted(ocr_candidate_docs, key=lambda x: x[1], reverse=True)[:5]:
            print(f"  - {name}: {count} pictures require OCR")
analyze_document_characteristics() + + print("\n" + "=" * 80) + print("Examples completed!") + print("=" * 80) + diff --git a/examples/stats_visualizer.py b/examples/stats_visualizer.py new file mode 100644 index 00000000..7fb7f002 --- /dev/null +++ b/examples/stats_visualizer.py @@ -0,0 +1,319 @@ +"""Visualization utilities for collection statistics. + +This module provides utilities for creating charts from CollectionStats data. +Requires matplotlib to be installed (available with 'examples' extra). + +Install with: pip install docling-core[examples] +""" + +from pathlib import Path +from typing import Literal + +try: + import matplotlib.figure + import matplotlib.pyplot as plt + MATPLOTLIB_AVAILABLE = True +except ImportError: + MATPLOTLIB_AVAILABLE = False + +from docling_core.transforms.profiler.doc_profiler import CollectionStats, Histogram + + +class StatsVisualizer: + """Visualizer for creating charts from CollectionStats data.""" + + @staticmethod + def _check_matplotlib() -> None: + """Check if matplotlib is available.""" + if not MATPLOTLIB_AVAILABLE: + raise ImportError( + "matplotlib is required for visualization. " + "Install it with: pip install docling-core[examples]" + ) + + @staticmethod + def plot_histogram( + histogram: Histogram, + title: str = "Distribution", + xlabel: str = "Value", + ylabel: str = "Frequency", + color: str = "steelblue", + figsize: tuple[int, int] = (10, 6), + log_scale: bool = False, + ) -> "matplotlib.figure.Figure": + """Plot a histogram from Histogram data. 
+ + Args: + histogram: Histogram object containing bins and frequencies + title: Plot title + xlabel: X-axis label + ylabel: Y-axis label + color: Bar color + figsize: Figure size as (width, height) + log_scale: If True, use logarithmic scale for y-axis (frequency counts) + + Returns: + matplotlib Figure object + + Raises: + ImportError: If matplotlib is not installed + """ + StatsVisualizer._check_matplotlib() + + fig, ax = plt.subplots(figsize=figsize) + + # Calculate bin centers for plotting + bins = histogram.bins + frequencies = histogram.frequencies + + if len(bins) > 0 and len(frequencies) > 0: + # bins has n+1 edges, frequencies has n values + bin_centers = [(bins[i] + bins[i + 1]) / 2 for i in range(len(frequencies))] + bin_width = histogram.bin_width + + ax.bar(bin_centers, frequencies, width=bin_width * 0.9, color=color, edgecolor="black", alpha=0.7) + + ax.set_xlabel(xlabel, fontsize=12) + ax.set_ylabel(ylabel, fontsize=12) + ax.set_title(title, fontsize=14, fontweight="bold") + ax.grid(axis="y", alpha=0.3, linestyle="--") + + if log_scale: + ax.set_yscale('log') + ax.set_ylabel(f"{ylabel} (log scale)", fontsize=12) + + plt.tight_layout() + return fig + + @staticmethod + def plot_deciles( + deciles: list[float], + title: str = "Decile Distribution", + ylabel: str = "Value", + color: str = "coral", + figsize: tuple[int, int] = (10, 6), + ) -> "matplotlib.figure.Figure": + """Plot deciles as a line chart. 
@staticmethod
def plot_collection_overview(
    stats: CollectionStats,
    metrics: list[Literal["pages", "tables", "pictures", "texts"]] | None = None,
    figsize: tuple[int, int] = (16, 10),
    log_scale: bool = False,
) -> "matplotlib.figure.Figure":
    """Create an overview figure with one histogram panel per metric.

    Panels are placed row by row on a two-column grid; when the number of
    metrics is odd, the unused trailing panel is hidden.

    Args:
        stats: CollectionStats object providing the ``histogram_*`` fields
        metrics: List of metrics to plot. If None, plots all available metrics.
        figsize: Figure size as (width, height)
        log_scale: If True, use logarithmic scale for y-axis (frequency counts)

    Returns:
        matplotlib Figure object with subplots

    Raises:
        ImportError: If matplotlib is not installed
        ValueError: If ``metrics`` is an empty list
        KeyError: If ``metrics`` contains an unknown metric name
    """
    # metric name -> (CollectionStats attribute, panel title, bar color)
    panel_specs = {
        "pages": ("histogram_pages", "Pages per Document", "steelblue"),
        "tables": ("histogram_tables", "Tables per Document", "forestgreen"),
        "pictures": ("histogram_pictures", "Pictures per Document", "coral"),
        "texts": ("histogram_texts", "Text Items per Document", "mediumpurple"),
    }

    if metrics is None:
        metrics = ["pages", "tables", "pictures", "texts"]

    # Validate up front: an empty list would otherwise request a zero-row grid
    # from matplotlib, and an unknown name would only fail after a pyplot
    # figure had already been allocated and then never returned.
    if not metrics:
        raise ValueError("metrics must contain at least one metric name")
    unknown = [name for name in metrics if name not in panel_specs]
    if unknown:
        raise KeyError(f"Unknown metric(s) {unknown}; expected one of {list(panel_specs)}")

    StatsVisualizer._check_matplotlib()

    n_cols = 2
    n_rows = (len(metrics) + n_cols - 1) // n_cols  # ceiling division

    # squeeze=False always yields a 2-D array of Axes, so a single-row layout
    # needs no special-casing.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=figsize, squeeze=False)
    panels = list(axes.flat)  # row-major: same order the metrics are placed in

    ylabel = "Frequency (log scale)" if log_scale else "Frequency"
    for ax, metric in zip(panels, metrics):
        attr_name, title, color = panel_specs[metric]
        histogram = getattr(stats, attr_name)
        bins = histogram.bins
        frequencies = histogram.frequencies

        if len(bins) > 0 and len(frequencies) > 0:
            # bins holds the edges (one more than frequencies); bars sit on bin centers
            bin_centers = [(bins[i] + bins[i + 1]) / 2 for i in range(len(frequencies))]
            ax.bar(
                bin_centers,
                frequencies,
                width=histogram.bin_width * 0.9,
                color=color,
                edgecolor="black",
                alpha=0.7,
            )

        ax.set_xlabel("Count", fontsize=10)
        ax.set_ylabel(ylabel, fontsize=10)
        ax.set_title(title, fontsize=12, fontweight="bold")
        ax.grid(axis="y", alpha=0.3, linestyle="--")

        if log_scale:
            ax.set_yscale("log")

    # Hide unused subplots
    for ax in panels[len(metrics):]:
        ax.axis("off")

    fig.suptitle(
        f"Collection Statistics Overview ({stats.num_documents} documents)",
        fontsize=16,
        fontweight="bold",
    )
    plt.tight_layout()
    return fig
"matplotlib.figure.Figure", filepath: str | Path, dpi: int = 300) -> None: + """Save a matplotlib figure to file. + + Args: + fig: matplotlib Figure object + filepath: Output file path (supports .png, .pdf, .svg, etc.) + dpi: Resolution in dots per inch + """ + StatsVisualizer._check_matplotlib() + fig.savefig(filepath, dpi=dpi, bbox_inches="tight") + + @staticmethod + def show_figure(fig: "matplotlib.figure.Figure") -> None: + """Display a matplotlib figure. + + Args: + fig: matplotlib Figure object + """ + StatsVisualizer._check_matplotlib() + plt.show() + diff --git a/examples/visualize_collection_stats.py b/examples/visualize_collection_stats.py new file mode 100644 index 00000000..79683d50 --- /dev/null +++ b/examples/visualize_collection_stats.py @@ -0,0 +1,257 @@ +"""Example: Visualizing Collection Statistics with Charts. + +This example demonstrates how to use the StatsVisualizer to create +various charts from CollectionStats data. + +Requirements: + pip install docling-core[examples] # Includes matplotlib +""" + +from pathlib import Path + +from stats_visualizer import StatsVisualizer + +from docling_core.transforms.profiler import CollectionStats, DocumentProfiler +from docling_core.types.doc import DoclingDocument + + +def load_documents_and_profile(doc_dir: Path) -> CollectionStats | None: + """Load documents from directory and profile them. 
+ + Args: + doc_dir: Directory containing JSON documents + + Returns: + CollectionStats object or None if no documents found + """ + if not doc_dir.exists(): + print(f"Directory not found: {doc_dir}") + return None + + docs = [] + for json_file in doc_dir.glob("*.json"): + try: + docs.append(DoclingDocument.load_from_json(json_file)) + except Exception: + pass + + if not docs: + print("No documents found") + return None + + # Profile collection + stats = DocumentProfiler.profile_collection(docs) + print(f"Loaded and profiled {stats.num_documents} documents") + return stats + + +def visualize_single_histogram(stats: CollectionStats): + """Example 1: Plot a single histogram.""" + print("\n" + "=" * 80) + print("Example 1: Single Histogram Plot") + print("=" * 80) + + # Create histogram plot for pages (linear scale) + fig = StatsVisualizer.plot_histogram( + histogram=stats.histogram_pages, + title="Distribution of Pages per Document", + xlabel="Number of Pages", + ylabel="Number of Documents", + color="steelblue", + ) + + # Save the figure + output_file = Path("./pages_histogram.png") + StatsVisualizer.save_figure(fig, output_file) + print(f"Saved histogram to: {output_file}") + + # Create histogram plot for pages (logarithmic scale) + fig_log = StatsVisualizer.plot_histogram( + histogram=stats.histogram_pages, + title="Distribution of Pages per Document (Log Scale)", + xlabel="Number of Pages", + ylabel="Number of Documents", + color="steelblue", + log_scale=True, + ) + + # Save the figure + output_file_log = Path("./pages_histogram_log.png") + StatsVisualizer.save_figure(fig_log, output_file_log) + print(f"Saved histogram (log scale) to: {output_file_log}") + + +def visualize_deciles(stats: CollectionStats): + """Example 2: Plot deciles.""" + print("\n" + "=" * 80) + print("Example 2: Decile Distribution Plot") + print("=" * 80) + + # Create decile plot for tables + fig = StatsVisualizer.plot_deciles( + deciles=stats.deciles_tables, + title="Decile Distribution of 
def create_custom_visualization(stats: CollectionStats):
    """Example 5: Render hand-picked charts for individual metrics.

    Produces three figures (a log-scale pictures histogram, a text-item decile
    plot and a log-scale pages/tables overview) and saves each one as a PNG
    via ``StatsVisualizer.save_figure``.
    """
    rule = "=" * 80
    print("\n" + rule)
    print("Example 5: Custom Visualization")
    print(rule)

    # Pictures only; log scale because of the high frequency on low values
    histogram_options = dict(
        histogram=stats.histogram_pictures,
        title="Picture Distribution (Log Scale)",
        xlabel="Pictures per Document",
        ylabel="Frequency",
        color="coral",
        figsize=(10, 6),
        log_scale=True,
    )
    pictures_fig = StatsVisualizer.plot_histogram(**histogram_options)
    StatsVisualizer.save_figure(pictures_fig, "./pictures_histogram_log.png")
    print("Saved pictures histogram (log scale)")

    # Text items only, as a decile curve
    decile_options = dict(
        deciles=stats.deciles_texts,
        title="Text Items Decile Distribution",
        ylabel="Number of Text Items",
        color="mediumpurple",
        figsize=(10, 6),
    )
    texts_fig = StatsVisualizer.plot_deciles(**decile_options)
    StatsVisualizer.save_figure(texts_fig, "./texts_deciles.png")
    print("Saved texts decile plot")

    # Overview restricted to pages and tables, again on a log scale
    selected_metrics = ["pages", "tables"]
    overview_fig = StatsVisualizer.plot_collection_overview(
        stats=stats,
        metrics=selected_metrics,
        figsize=(12, 6),
        log_scale=True,
    )
    StatsVisualizer.save_figure(overview_fig, "./pages_tables_overview_log.png")
    print("Saved pages and tables overview (log scale)")
if __name__ == "__main__":
    # Script entry point: profile the bundled test documents once, then run
    # every example against the same CollectionStats object. Any failure ends
    # the script with a non-zero exit status so callers can detect it.
    try:
        # Load documents once and profile them
        doc_dir = Path("./test/data/doc")
        stats = load_documents_and_profile(doc_dir)

        if stats is None:
            print("Failed to load documents. Exiting.")
            # `exit` is a convenience injected by the `site` module and is not
            # guaranteed to exist (e.g. under `python -S`); SystemExit is.
            raise SystemExit(1)

        # Run all examples with the same stats object
        visualize_single_histogram(stats)
        visualize_deciles(stats)
        visualize_collection_overview(stats)
        # NOTE(review): Example 4 was disabled in the original script; confirm
        # whether it should run before re-enabling it.
        # visualize_deciles_comparison(stats)
        create_custom_visualization(stats)
        display_statistics_summary(stats)

        print("\n" + "=" * 80)
        print("All visualizations created successfully!")
        print("Check the current directory for generated PNG files.")
        print("=" * 80)

    except ImportError as e:
        print(f"\nError: {e}")
        print("\nTo run this example, install matplotlib:")
        print(" pip install docling-core[examples]")
        # Report the failure through the exit status as well as on stdout.
        raise SystemExit(1) from e
    except Exception as e:
        print(f"\nError: {e}")
        import traceback

        traceback.print_exc()
        raise SystemExit(1) from e
def test_profile_simple_document():
    """Profile a small PDF-origin document and check every reported count."""
    origin = DocumentOrigin(mimetype="application/pdf", binary_hash=12345, filename="test.pdf")
    doc = DoclingDocument(name="Simple Document", origin=origin)

    # Two pages of 612 x 792
    for page_no in (1, 2):
        doc.pages[page_no] = PageItem(page_no=page_no, size=Size(width=612, height=792))

    # Three text items: two plain texts and one section header
    text_items = (
        (DocItemLabel.TEXT, "Text 1"),
        (DocItemLabel.TEXT, "Text 2"),
        (DocItemLabel.SECTION_HEADER, "Section"),
    )
    for label, content in text_items:
        doc.add_text(label=label, text=content, orig=content)

    # One table and one picture
    doc.add_table(data=TableData(num_rows=2, num_cols=2))
    doc.add_picture()

    stats = DocumentProfiler.profile_document(doc)

    expected = {
        "name": "Simple Document",
        "num_pages": 2,
        "num_tables": 1,
        "num_pictures": 1,
        "num_texts": 3,
        "num_section_headers": 1,
        "total_items": 5,
        "avg_items_per_page": 2.5,
        "origin_mimetype": "application/pdf",
    }
    for field, value in expected.items():
        assert getattr(stats, field) == value, field
def test_profile_collection_empty():
    """An empty collection yields zeroed totals, zero deciles and an empty histogram."""
    stats = DocumentProfiler.profile_collection([])

    for counter in ("num_documents", "total_pages", "total_tables", "total_pictures"):
        assert getattr(stats, counter) == 0, counter
    for average in ("avg_items_per_document", "avg_items_per_page"):
        assert getattr(stats, average) == 0.0, average

    zero_deciles = [0.0] * 9
    assert stats.deciles_pages == zero_deciles
    assert stats.deciles_tables == zero_deciles

    pages_histogram = stats.histogram_pages
    assert pages_histogram.bins == []
    assert pages_histogram.frequencies == []
    assert pages_histogram.bin_width == 0.0
def test_profile_collection_multiple_documents():
    """Profile three differently shaped documents and check the aggregated statistics."""

    def build(*, name, mimetype, binary_hash, filename, n_pages, n_tables, n_pictures, texts):
        # Assemble one document: pages first, then tables, pictures and texts.
        doc = DoclingDocument(
            name=name,
            origin=DocumentOrigin(mimetype=mimetype, binary_hash=binary_hash, filename=filename),
        )
        for page_no in range(1, n_pages + 1):
            doc.pages[page_no] = PageItem(page_no=page_no, size=Size(width=612, height=792))
        for _ in range(n_tables):
            doc.add_table(data=TableData(num_rows=1, num_cols=1))
        for _ in range(n_pictures):
            doc.add_picture()
        for content in texts:
            doc.add_text(label=DocItemLabel.TEXT, text=content, orig=content)
        return doc

    docs = [
        # Document 1: 2 pages, 1 table, 2 pictures, 2 texts
        build(
            name="Doc1",
            mimetype="application/pdf",
            binary_hash=1,
            filename="doc1.pdf",
            n_pages=2,
            n_tables=1,
            n_pictures=2,
            texts=["Text 1", "Text 2"],
        ),
        # Document 2: 5 pages, 3 tables, 1 picture, 10 texts
        build(
            name="Doc2",
            mimetype="application/pdf",
            binary_hash=2,
            filename="doc2.pdf",
            n_pages=5,
            n_tables=3,
            n_pictures=1,
            texts=[f"Text {i}" for i in range(10)],
        ),
        # Document 3: 1 page, 0 tables, 5 pictures, 2 texts
        build(
            name="Doc3",
            mimetype="text/html",
            binary_hash=3,
            filename="doc3.html",
            n_pages=1,
            n_tables=0,
            n_pictures=5,
            texts=["T1", "T2"],
        ),
    ]

    stats = DocumentProfiler.profile_collection(docs, include_individual_stats=True)

    # Basic counts
    assert stats.num_documents == 3
    assert stats.total_pages == 8  # 2 + 5 + 1
    assert stats.total_tables == 4  # 1 + 3 + 0
    assert stats.total_pictures == 8  # 2 + 1 + 5
    assert stats.total_texts == 14  # 2 + 10 + 2

    # Per-metric statistics: (min, max, median, mean); the median is d5 (index 4)
    per_metric = {
        "pages": (1, 5, 2.0, 8 / 3),
        "tables": (0, 3, 1.0, 4 / 3),
        "pictures": (1, 5, 2.0, 8 / 3),
        "texts": (2, 10, 2.0, 14 / 3),
    }
    for metric, (low, high, median, mean) in per_metric.items():
        assert getattr(stats, f"min_{metric}") == low, metric
        assert getattr(stats, f"max_{metric}") == high, metric
        assert getattr(stats, f"deciles_{metric}")[4] == median, metric
        assert getattr(stats, f"mean_{metric}") == pytest.approx(mean), metric
        # Check histogram exists
        assert len(getattr(stats, f"histogram_{metric}").bins) > 0, metric

    # Extra checks on the page distribution
    page_deciles = stats.deciles_pages
    assert stats.std_pages > 0
    # Deciles are ordered [d1, ..., d9]
    assert page_deciles[0] <= page_deciles[4] <= page_deciles[8]
    assert len(stats.histogram_pages.frequencies) > 0

    # Document characteristics
    assert len(stats.document_stats) == 3

    # MIME type distribution
    mimetypes = stats.mimetype_distribution
    assert mimetypes["application/pdf"] == 2
    assert mimetypes["text/html"] == 1

    # Computed fields
    assert stats.total_items == 26  # 14 texts + 4 tables + 8 pictures
    assert stats.avg_items_per_document == pytest.approx(26 / 3)
    assert stats.avg_items_per_page == pytest.approx(26 / 8)
def test_profile_real_document():
    """Profile a real document from the test data and sanity-check its counts.

    The fixture path is resolved relative to this test module rather than the
    current working directory, so the test does not silently skip when pytest
    is started from somewhere other than the repository root.
    """
    test_file = Path(__file__).resolve().parent / "data" / "doc" / "2408.09869v3_enriched.json"
    if not test_file.exists():
        pytest.skip(f"Test file not found: {test_file}")

    doc = DoclingDocument.load_from_json(test_file)
    stats = DocumentProfiler.profile_document(doc)

    # Basic sanity checks: every count mirrors the document's own containers
    assert stats.name == doc.name
    assert stats.num_pages == len(doc.pages)
    assert stats.num_tables == len(doc.tables)
    assert stats.num_pictures == len(doc.pictures)
    assert stats.num_texts == len(doc.texts)
    assert stats.total_items > 0
def test_calculate_deciles_empty():
    """_calculate_deciles returns nine zeros when given no data."""
    assert DocumentProfiler._calculate_deciles([]) == [0.0] * 9