mirror of
https://github.com/docling-project/docling.git
synced 2026-05-17 13:10:38 +00:00
251c8b217a
* fix(ocr): support language selection for RapidOCR engine Allows specifying 'english' or 'chinese' via the --ocr-lang flag and automatically downloads the correct models. Signed-off-by: DevAbdullah90 <abdullahkashif12b3@gmail.com> * fix(ocr): fix linting and add unit tests for RapidOCR language selection Signed-off-by: DevAbdullah90 <abdullahkashif12b3@gmail.com> * fix(ocr): align RapidOCR english assets with 3.8 mobile models Signed-off-by: Georg Heiler <georg.kf.heiler@gmail.com> * fix(ocr): restore RapidOCR default model compatibility Signed-off-by: Georg Heiler <georg.kf.heiler@gmail.com> * fix(examples): disable OCR in code formula comparison Signed-off-by: Georg Heiler <georg.kf.heiler@gmail.com> --------- Signed-off-by: DevAbdullah90 <abdullahkashif12b3@gmail.com> Signed-off-by: Georg Heiler <georg.kf.heiler@gmail.com> Co-authored-by: DevAbdullah90 <abdullahkashif12b3@gmail.com>
118 lines
3.9 KiB
Python
Vendored
118 lines
3.9 KiB
Python
Vendored
"""Example: Comparing CodeFormula models for code and formula extraction.
|
|
|
|
This example demonstrates how to use both the CodeFormulaV2 model
|
|
and the Granite Docling model for extracting code blocks and mathematical
|
|
formulas from PDF documents, allowing you to compare their outputs.
|
|
"""
|
|
|
|
from pathlib import Path
|
|
|
|
from docling_core.types.doc import CodeItem, FormulaItem
|
|
|
|
from docling.datamodel.base_models import InputFormat
|
|
from docling.datamodel.pipeline_options import (
|
|
CodeFormulaVlmOptions,
|
|
PdfPipelineOptions,
|
|
)
|
|
from docling.document_converter import DocumentConverter, PdfFormatOption
|
|
|
|
|
|
def extract_with_preset(preset_name: str, input_doc: Path):
|
|
"""Extract code and formulas using a specific preset.
|
|
|
|
Args:
|
|
preset_name: Name of the preset to use ('codeformulav2' or 'granite_docling')
|
|
input_doc: Path to the input PDF document
|
|
|
|
Returns:
|
|
The converted document
|
|
"""
|
|
print(f"\n{'=' * 60}")
|
|
print(f"Processing with preset: {preset_name}")
|
|
print(f"{'=' * 60}\n")
|
|
|
|
# Create options with the specified preset
|
|
code_formula_options = CodeFormulaVlmOptions.from_preset(preset_name)
|
|
|
|
# Display preset information
|
|
print(f"Model: {code_formula_options.model_spec.name}")
|
|
print(f"Repo ID: {code_formula_options.model_spec.default_repo_id}")
|
|
print(f"Scale: {code_formula_options.scale}")
|
|
print(f"Max tokens: {code_formula_options.model_spec.max_new_tokens}")
|
|
print()
|
|
|
|
# Configure the PDF pipeline to use code/formula enrichment
|
|
pipeline_options = PdfPipelineOptions(
|
|
# The sample PDF already contains embedded text, so OCR only adds an
|
|
# unrelated backend dependency for this code/formula comparison example.
|
|
do_ocr=False,
|
|
do_code_enrichment=True,
|
|
do_formula_enrichment=True,
|
|
code_formula_options=code_formula_options,
|
|
)
|
|
|
|
# Create converter with the configured options
|
|
converter = DocumentConverter(
|
|
format_options={
|
|
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
|
|
}
|
|
)
|
|
|
|
# Convert the document
|
|
result = converter.convert(input_doc)
|
|
doc = result.document
|
|
|
|
# Print extracted code blocks
|
|
code_blocks = [
|
|
item for item, _ in doc.iterate_items() if isinstance(item, CodeItem)
|
|
]
|
|
print(f"Code blocks found: {len(code_blocks)}")
|
|
for i, item in enumerate(code_blocks, 1):
|
|
print(f"\n Code block {i}:")
|
|
print(f" Language: {item.code_language}")
|
|
print(f" Text: {item.text[:100]}{'...' if len(item.text) > 100 else ''}")
|
|
|
|
# Print extracted formulas
|
|
formulas = [
|
|
item for item, _ in doc.iterate_items() if isinstance(item, FormulaItem)
|
|
]
|
|
print(f"\nFormulas found: {len(formulas)}")
|
|
for i, item in enumerate(formulas, 1):
|
|
print(f"\n Formula {i}:")
|
|
print(f" Text: {item.text[:100]}{'...' if len(item.text) > 100 else ''}")
|
|
|
|
return doc
|
|
|
|
|
|
def main():
|
|
"""Main function to compare both presets."""
|
|
input_doc = Path("tests/data/pdf/code_and_formula.pdf")
|
|
|
|
if not input_doc.exists():
|
|
print(f"Error: Input file not found: {input_doc}")
|
|
print("Please provide a valid PDF file with code and formulas.")
|
|
return
|
|
|
|
print("Comparing CodeFormula presets for code and formula extraction")
|
|
print(f"Input document: {input_doc}")
|
|
|
|
# Extract with CodeFormulaV2 model
|
|
extract_with_preset("codeformulav2", input_doc)
|
|
|
|
# Extract with Granite Docling model
|
|
extract_with_preset("granite_docling", input_doc)
|
|
|
|
print(f"\n{'=' * 60}")
|
|
print("Comparison complete!")
|
|
print(f"{'=' * 60}")
|
|
print("\nBoth presets have been tested. You can compare the outputs above.")
|
|
print("\nKey differences:")
|
|
print("- CodeFormulaV2: Uses specialized CodeFormulaV2 model")
|
|
print(
|
|
"- Granite Docling: Uses IBM Granite-Docling-258M with extended context (8192 tokens)"
|
|
)
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|