"""Example usage of the document profiler for extracting statistics.""" import time from pathlib import Path from docling_core.transforms.profiler import DocumentProfiler from docling_core.types.doc import DoclingDocument def profile_single_document(): """Example: Profile a single document.""" print("=" * 80) print("Example 1: Profiling a Single Document") print("=" * 80) # Load a document doc_path = Path("./examples/2408.09869v3.json") if not doc_path.exists(): print(f"Document not found: {doc_path}") return doc = DoclingDocument.load_from_json(doc_path) # Profile the document stats = DocumentProfiler.profile_document(doc) # Print statistics print(f"\nDocument: {stats.name}") print(f"Pages: {stats.num_pages}") print(f"Tables: {stats.num_tables}") print(f"Pictures: {stats.num_pictures}") print(f"Text items: {stats.num_texts}") print(f" - Section headers: {stats.num_section_headers}") print(f" - List items: {stats.num_list_items}") print(f" - Code blocks: {stats.num_code_items}") print(f" - Formulas: {stats.num_formulas}") print(f"\nTotal items: {stats.total_items}") print(f"Average items per page: {stats.avg_items_per_page:.2f}") print(f"\nOrigin MIME type: {stats.origin_mimetype}") print(f"Pictures requiring OCR: {stats.num_pictures_for_ocr}") # Export to JSON json_output = stats.model_dump_json(indent=2) print(f"\nJSON export (first 500 chars):\n{json_output[:500]}...") def profile_document_collection(): """Example: Profile a collection of documents.""" print("\n" + "=" * 80) print("Example 2: Profiling a Document Collection") print("=" * 80) # Load multiple documents doc_dir = Path("./test/data/doc") if not doc_dir.exists(): print(f"Directory not found: {doc_dir}") return # Load all JSON documents docs = [] for json_file in doc_dir.glob("*.json"): try: doc = DoclingDocument.load_from_json(json_file) docs.append(doc) except Exception as e: print(f"Skipping {json_file.name}: {e}") if not docs: print("No documents found") return print(f"\nLoaded {len(docs)} documents") # Profile the collection stats = DocumentProfiler.profile_collection(docs, include_individual_stats=True) # Print collection statistics print("\nCollection Statistics:") print(f"Number of documents: {stats.num_documents}") print("\nPages:") print(f" Total: {stats.total_pages}") print(f" Min: {stats.min_pages}, Max: {stats.max_pages}") print(f" Median (d5): {stats.deciles_pages[4]:.1f}, Mean: {stats.mean_pages:.2f}") print(f" Deciles: d1={stats.deciles_pages[0]:.1f}, d5={stats.deciles_pages[4]:.1f}, d9={stats.deciles_pages[8]:.1f}") print(f" Std Dev: {stats.std_pages:.2f}") print(f" Histogram bins: {len(stats.histogram_pages.bins)}, bin width: {stats.histogram_pages.bin_width:.2f}") print("\nTables:") print(f" Total: {stats.total_tables}") print(f" Min: {stats.min_tables}, Max: {stats.max_tables}") print(f" Median (d5): {stats.deciles_tables[4]:.1f}, Mean: {stats.mean_tables:.2f}") print(f" Deciles: d1={stats.deciles_tables[0]:.1f}, d5={stats.deciles_tables[4]:.1f}, d9={stats.deciles_tables[8]:.1f}") print(f" Std Dev: {stats.std_tables:.2f}") print("\nPictures:") print(f" Total: {stats.total_pictures}") print(f" Min: {stats.min_pictures}, Max: {stats.max_pictures}") print(f" Median (d5): {stats.deciles_pictures[4]:.1f}, Mean: {stats.mean_pictures:.2f}") print(f" Deciles: d1={stats.deciles_pictures[0]:.1f}, d5={stats.deciles_pictures[4]:.1f}, d9={stats.deciles_pictures[8]:.1f}") print(f" Std Dev: {stats.std_pictures:.2f}") print("\nText Items:") print(f" Total: {stats.total_texts}") print(f" Min: {stats.min_texts}, Max: {stats.max_texts}") print(f" Median (d5): {stats.deciles_texts[4]:.1f}, Mean: {stats.mean_texts:.2f}") print(f" Deciles: d1={stats.deciles_texts[0]:.1f}, d5={stats.deciles_texts[4]:.1f}, d9={stats.deciles_texts[8]:.1f}") print(f" Std Dev: {stats.std_texts:.2f}") print("\nPictures Requiring OCR:") print(f" Total: {stats.total_pictures_for_ocr}") print(f" Min: {stats.min_pictures_for_ocr}, Max: {stats.max_pictures_for_ocr}") print(f" Median (d5): {stats.deciles_pictures_for_ocr[4]:.1f}, Mean: {stats.mean_pictures_for_ocr:.2f}") print(f" Deciles: d1={stats.deciles_pictures_for_ocr[0]:.1f}, d5={stats.deciles_pictures_for_ocr[4]:.1f}, d9={stats.deciles_pictures_for_ocr[8]:.1f}") print(f" Std Dev: {stats.std_pictures_for_ocr:.2f}") if stats.mimetype_distribution: print("\nMIME Type Distribution:") for mimetype, count in sorted(stats.mimetype_distribution.items()): print(f" {mimetype}: {count}") print("\nComputed Metrics:") print(f" Total items: {stats.total_items}") print(f" Avg items per document: {stats.avg_items_per_document:.2f}") print(f" Avg items per page: {stats.avg_items_per_page:.2f}") # Show individual document stats if stats.document_stats: print("\nIndividual Document Statistics:") for i, doc_stat in enumerate(stats.document_stats[:3], 1): # Show first 3 print(f"\n Document {i}: {doc_stat.name}") print(f" Pages: {doc_stat.num_pages}, Tables: {doc_stat.num_tables}, " f"Pictures: {doc_stat.num_pictures}, Texts: {doc_stat.num_texts}") def profile_with_generator(): """Example: Profile documents using a generator (memory efficient).""" print("\n" + "=" * 80) print("Example 3: Profiling with Generator (Memory Efficient)") print("=" * 80) doc_dir = Path("./test/data/doc") if not doc_dir.exists(): print(f"Directory not found: {doc_dir}") return def document_generator(): """Generator that yields documents one at a time.""" for json_file in doc_dir.glob("*.json"): try: doc = DoclingDocument.load_from_json(json_file) yield doc except Exception: pass # Skip invalid documents # Profile using generator - documents are not all loaded into memory start_time = time.time() stats = DocumentProfiler.profile_collection( document_generator(), include_individual_stats=False # Don't store individual stats to save memory ) elapsed_time = time.time() - start_time print(f"\nProcessed {stats.num_documents} documents in {elapsed_time:.2f} seconds") print(f"Total pages: {stats.total_pages}") print(f"Total tables: {stats.total_tables}") print(f"Total pictures: {stats.total_pictures}") print(f"Mean pages per document: {stats.mean_pages:.2f}") def export_statistics_report(): """Example: Export statistics to a JSON report.""" print("\n" + "=" * 80) print("Example 4: Exporting Statistics Report") print("=" * 80) doc_dir = Path("./test/data/doc") if not doc_dir.exists(): print(f"Directory not found: {doc_dir}") return # Load documents docs = [] for json_file in doc_dir.glob("*.json"): try: docs.append(DoclingDocument.load_from_json(json_file)) except Exception: pass if not docs: print("No documents found") return # Profile collection stats = DocumentProfiler.profile_collection(docs, include_individual_stats=True) # Export to JSON file output_file = Path("./document_statistics_report.json") with open(output_file, "w") as f: f.write(stats.model_dump_json(indent=2)) print(f"\nStatistics report exported to: {output_file}") print(f"File size: {output_file.stat().st_size} bytes") # Also export as Python dict for further processing stats_dict = stats.model_dump() print(f"\nStatistics as dict (keys): {list(stats_dict.keys())[:10]}...") def analyze_document_characteristics(): """Example: Analyze specific document characteristics.""" print("\n" + "=" * 80) print("Example 5: Analyzing Document Characteristics") print("=" * 80) doc_dir = Path("./test/data/doc") if not doc_dir.exists(): print(f"Directory not found: {doc_dir}") return # Profile each document individually ocr_candidate_docs = [] for json_file in doc_dir.glob("*.json"): try: doc = DoclingDocument.load_from_json(json_file) stats = DocumentProfiler.profile_document(doc) if stats.num_pictures_for_ocr > 0: ocr_candidate_docs.append((stats.name, stats.num_pictures_for_ocr)) except Exception: pass print(f"\nDocuments with OCR requirements: {len(ocr_candidate_docs)}") if ocr_candidate_docs: for name, count in sorted(ocr_candidate_docs, key=lambda x: x[1], reverse=True)[:5]: print(f" - {name}: {count} pictures require OCR") if __name__ == "__main__": # Run all examples profile_single_document() profile_document_collection() profile_with_generator() export_statistics_report() analyze_document_characteristics() print("\n" + "=" * 80) print("Examples completed!") print("=" * 80)