mirror of
https://github.com/docling-project/docling.git
synced 2026-05-17 13:10:38 +00:00
d4c87133f3
* model runtime refactoring Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * add test Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * fix code formula preset Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * batch prediction Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * use presets and new vlm options in CLI Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * use new model settings by default Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * running Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * update examples Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * fixes for running examples Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * keep old stage Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * update model Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * use granite 3.3 and set options Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * revisit init logic and propagate the proper options to the runtimes Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * update all stages with original setup Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * per stage registry Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * use chat template Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * remove duplicated predict() and factor out some utils Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * working picture description examples Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * add granite docling as code formula model Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * rename code formula presets Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * fix running minimal_vlm example Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * add all models to presets and run compare_vlm Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * remove unused repo_id Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * update vlm api model example Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * fix legacy examples Signed-off-by: Michele Dolfi 
<dol@zurich.ibm.com> * add another legacy example Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * fix test Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * avoid automatic fallback to mlx and fix end_of_utterance in codeformula Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * move vlm_convert_model Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * use new vlm runtime class Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * flasg for CI Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * rename runtimes to explicit vlm_runtimes Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * renaming from runtime to inference engine and model families Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * fixes Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * fix test Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * add docs with stages Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * update docs catalog page Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * rename runtime to inference engine Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> --------- Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
216 lines
7.7 KiB
Python
Vendored
216 lines
7.7 KiB
Python
Vendored
# %% [markdown]
|
|
# Describe pictures using VLM models via API runtimes
|
|
#
|
|
# What this example does
|
|
# - Demonstrates using presets with API runtimes (LM Studio, watsonx.ai)
|
|
# - Shows that API is just a runtime choice, not a different options class
|
|
# - Explains pre-configured API types and custom API configuration
|
|
#
|
|
# Prerequisites
|
|
# - Install Docling and `python-dotenv` if loading env vars from a `.env` file.
|
|
# - For LM Studio: ensure LM Studio is running with a VLM model loaded
|
|
# - For watsonx.ai: set `WX_API_KEY` and `WX_PROJECT_ID` in the environment.
|
|
#
|
|
# How to run
|
|
# - From the repo root: `python docs/examples/pictures_description_api.py`.
|
|
# - The watsonx.ai example runs automatically if credentials are available (it is skipped when `CI` is set)
|
|
#
|
|
# Notes
|
|
# - The NEW runtime system unifies API and local inference
|
|
# - For the legacy approach, see `pictures_description_api_legacy.py`
|
|
|
|
# %%
|
|
|
|
import logging
|
|
import os
|
|
from pathlib import Path
|
|
|
|
import requests
|
|
from docling_core.types.doc import PictureItem
|
|
from dotenv import load_dotenv
|
|
|
|
from docling.datamodel.base_models import InputFormat
|
|
from docling.datamodel.pipeline_options import (
|
|
PdfPipelineOptions,
|
|
PictureDescriptionVlmEngineOptions,
|
|
)
|
|
from docling.datamodel.vlm_engine_options import (
|
|
ApiVlmEngineOptions,
|
|
VlmEngineType,
|
|
)
|
|
from docling.document_converter import DocumentConverter, PdfFormatOption
|
|
|
|
|
|
def run_lm_studio_example(input_doc_path: Path):
    """Example 1: describe pictures with the Granite Vision preset via LM Studio.

    Builds picture-description options from the ``"granite_vision"`` preset
    using the LM Studio API engine type, converts ``input_doc_path`` with
    picture description enabled, and prints the reference, caption and
    metadata of every picture item in the converted document.

    Args:
        input_doc_path: Path of the PDF document to convert.
    """
    banner = "=" * 70
    print(banner)
    print("Example 1: Granite Vision with LM Studio (pre-configured API type)")
    print(banner)

    # Start LM Studio with a granite-vision model loaded before running this.
    # The preset is pre-configured for the LM Studio API type:
    #   - url is pre-configured (http://localhost:1234/v1/chat/completions)
    #   - model name is pre-configured from the preset
    # NOTE(review): the keyword is `runtime_type` although the options class
    # is named "...EngineOptions" — confirm it matches the model's field name.
    lm_studio_engine = ApiVlmEngineOptions(
        runtime_type=VlmEngineType.API_LMSTUDIO,
        timeout=90,
    )
    picture_desc_options = PictureDescriptionVlmEngineOptions.from_preset(
        "granite_vision",
        engine_options=lm_studio_engine,
    )

    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_picture_description = True
    pipeline_options.picture_description_options = picture_desc_options
    pipeline_options.enable_remote_services = True  # Required for API runtimes

    # Informational output only: one print per entry, in this exact order.
    info_lines = (
        "\nOther API types are also pre-configured:",
        "- VlmEngineType.API_OLLAMA: http://localhost:11434/v1/chat/completions",
        "- VlmEngineType.API_OPENAI: https://api.openai.com/v1/chat/completions",
        "- VlmEngineType.API: Generic API endpoint (you specify the URL)",
        "\nEach preset has pre-configured model names for these API types.",
        "For example, granite_vision preset knows:",
        '- Ollama model name: "ibm/granite3.3-vision:2b"',
        '- LM Studio model name: "granite-vision-3.3-2b"',
        "- OpenAI model name: would use the HuggingFace repo_id\n",
    )
    for line in info_lines:
        print(line)

    pdf_format_option = PdfFormatOption(pipeline_options=pipeline_options)
    doc_converter = DocumentConverter(
        format_options={InputFormat.PDF: pdf_format_option},
    )
    result = doc_converter.convert(input_doc_path)

    for element, _level in result.document.iterate_items():
        # Only picture items are reported; everything else is skipped.
        if not isinstance(element, PictureItem):
            continue
        print(
            f"Picture {element.self_ref}\n"
            f"Caption: {element.caption_text(doc=result.document)}\n"
            f"Meta: {element.meta}\n"
        )
|
|
|
|
|
|
def run_watsonx_example(input_doc_path: Path):
    """Example 2: describe pictures with the Granite Vision preset via watsonx.ai.

    The example returns early, without converting anything, when the ``CI``
    environment variable is set or when ``WX_API_KEY`` / ``WX_PROJECT_ID``
    are not available (after loading a ``.env`` file). Otherwise it exchanges
    the API key for an IAM access token, configures the ``"granite_vision"``
    preset with the generic API engine type pointing at the watsonx.ai chat
    endpoint, converts ``input_doc_path`` and prints the reference, caption
    and metadata of every picture item in the converted document.

    Args:
        input_doc_path: Path of the PDF document to convert.

    Raises:
        requests.RequestException: If the IAM token request fails, times out
            or returns an error status.
    """
    print("\n" + "=" * 70)
    print("Example 2: Granite Vision with watsonx.ai (custom API configuration)")
    print("=" * 70)

    # Check if running in CI environment
    if os.environ.get("CI"):
        print("Skipping watsonx.ai example in CI environment")
        return

    # Load environment variables (also picks up a local .env file)
    load_dotenv()
    api_key = os.environ.get("WX_API_KEY")
    project_id = os.environ.get("WX_PROJECT_ID")

    # Check if credentials are available
    if not api_key or not project_id:
        print("WARNING: watsonx.ai credentials not found.")
        print(
            "Set WX_API_KEY and WX_PROJECT_ID environment variables to run this example."
        )
        print("Skipping watsonx.ai example.\n")
        return

    def _get_iam_access_token(api_key: str, timeout: float = 30.0) -> str:
        """Exchange an IBM Cloud API key for an IAM access token.

        Args:
            api_key: The IBM Cloud API key.
            timeout: Seconds to wait for the IAM endpoint before giving up.

        Returns:
            The ``access_token`` field of the JSON response.
        """
        res = requests.post(
            url="https://iam.cloud.ibm.com/identity/token",
            headers={"Content-Type": "application/x-www-form-urlencoded"},
            # Pass a mapping so requests URL-encodes the values; a hand-built
            # string would break for keys containing '&', '=', '+' or '%'.
            data={
                "grant_type": "urn:ibm:params:oauth:grant-type:apikey",
                "apikey": api_key,
            },
            # Without a timeout the call can block forever on a stalled
            # connection.
            timeout=timeout,
        )
        res.raise_for_status()
        return res.json()["access_token"]

    # For watsonx.ai, we need to provide custom URL, headers, and params
    # NOTE(review): the keyword is `runtime_type` although the options class
    # is named "...EngineOptions" — confirm it matches the model's field name.
    picture_desc_options = PictureDescriptionVlmEngineOptions.from_preset(
        "granite_vision",
        engine_options=ApiVlmEngineOptions(
            runtime_type=VlmEngineType.API,  # Generic API type
            url="https://us-south.ml.cloud.ibm.com/ml/v1/text/chat?version=2023-05-29",
            headers={
                "Authorization": "Bearer " + _get_iam_access_token(api_key=api_key),
            },
            params={
                # Note: Granite Vision models are no longer available on
                # watsonx.ai (they are only offered as models on demand).
                # "model_id": "ibm/granite-vision-3-3-2b",
                "model_id": "meta-llama/llama-3-2-11b-vision-instruct",
                "project_id": project_id,
                "parameters": {"max_new_tokens": 400},
            },
            timeout=60,
        ),
    )

    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_picture_description = True
    pipeline_options.picture_description_options = picture_desc_options
    pipeline_options.enable_remote_services = True  # Required for API runtimes

    doc_converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_options=pipeline_options,
            )
        }
    )
    result = doc_converter.convert(input_doc_path)

    # Report every picture item found in the converted document.
    for element, _level in result.document.iterate_items():
        if isinstance(element, PictureItem):
            print(
                f"Picture {element.self_ref}\n"
                f"Caption: {element.caption_text(doc=result.document)}\n"
                f"Meta: {element.meta}\n"
            )
|
|
|
|
|
|
def main():
    """Run both API examples against the sample PDF under ``tests/data``."""
    logging.basicConfig(level=logging.INFO)

    # The sample document is resolved relative to this file's directory.
    input_doc_path = Path(__file__).parent / "../../tests/data" / "pdf/2206.01062.pdf"

    # LM Studio runs first; the watsonx.ai example skips itself when running
    # in CI or when credentials are not found.
    for run_example in (run_lm_studio_example, run_watsonx_example):
        run_example(input_doc_path)


if __name__ == "__main__":
    main()
|
|
|
|
|
|
# %% [markdown]
|
|
# ## Key Concepts
|
|
#
|
|
# ### Pre-configured API Types
|
|
# The new runtime system has pre-configured API types:
|
|
# - **API_OLLAMA**: Ollama server (port 11434)
|
|
# - **API_LMSTUDIO**: LM Studio server (port 1234)
|
|
# - **API_OPENAI**: OpenAI API
|
|
# - **API**: Generic API endpoint (you provide URL)
|
|
#
|
|
# Each preset knows the appropriate model names for these API types.
|
|
#
|
|
# ### Custom API Configuration
|
|
# For services like watsonx.ai that need custom configuration:
|
|
# - Use `VlmEngineType.API` (generic)
|
|
# - Provide custom `url`, `headers`, and `params`
|
|
# - The preset still provides the base model configuration
|
|
#
|
|
# ### Same Preset, Different Runtime
|
|
# You can use the same preset (e.g., "granite_vision") with:
|
|
# - Local Transformers runtime (see `picture_description_inline.py`)
|
|
# - Local MLX runtime (macOS)
|
|
# - LM Studio API runtime (this example)
|
|
# - watsonx.ai API runtime (this example)
|
|
# - Any other API endpoint
|
|
#
|
|
# This makes it easy to develop locally and deploy to production!
|