# Mirror of https://github.com/docling-project/docling.git
# Synced 2026-05-17 13:10:38 +00:00
# 437 lines, 11 KiB, TOML
# Build backend declaration: the package is built with hatchling.
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"
# Core distribution metadata for the docling-slim package.
[project]
name = "docling-slim"
version = "2.93.0" # DO NOT EDIT, updated automatically
description = "Modular version of the Docling package: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
license = "MIT"
keywords = [
  "docling",
  "convert",
  "document",
  "pdf",
  "docx",
  "html",
  "markdown",
  "layout model",
  "segmentation",
]
classifiers = [
  "Operating System :: MacOS :: MacOS X",
  "Operating System :: POSIX :: Linux",
  "Operating System :: Microsoft :: Windows",
  "Development Status :: 5 - Production/Stable",
  "Intended Audience :: Developers",
  "Intended Audience :: Science/Research",
  "Topic :: Scientific/Engineering :: Artificial Intelligence",
  "Programming Language :: Python :: 3",
  "Programming Language :: Python :: 3.10",
  "Programming Language :: Python :: 3.11",
  "Programming Language :: Python :: 3.12",
  "Programming Language :: Python :: 3.13",
  "Programming Language :: Python :: 3.14",
]
readme = "packages/docling-slim/README.md"
authors = [
  { name = "Christoph Auer", email = "cau@zurich.ibm.com" },
  { name = "Michele Dolfi", email = "dol@zurich.ibm.com" },
  { name = "Maxim Lysak", email = "mly@zurich.ibm.com" },
  { name = "Nikos Livathinos", email = "nli@zurich.ibm.com" },
  { name = "Ahmed Nassar", email = "ahn@zurich.ibm.com" },
  { name = "Panos Vagenas", email = "pva@zurich.ibm.com" },
  { name = "Peter Staar", email = "taa@zurich.ibm.com" },
]
# The lowest supported interpreter matches the oldest "Programming Language"
# classifier listed above (3.10).
requires-python = '>=3.10,<4.0'
# MINIMAL BASE (8 packages) - ~50MB
# Requirements that are always installed; everything heavier is opt-in through
# the extras declared under [project.optional-dependencies].
dependencies = [
  'pydantic>=2.0.0,<3.0.0',
  'docling-core>=2.73.0,<3.0.0',
  'pydantic-settings>=2.3.0,<3.0.0',
  'filetype>=1.2.0,<2.0.0',
  'requests>=2.32.2,<3.0.0',
  'certifi>=2024.7.4',
  'pluggy>=1.0.0,<2.0.0',
  'tqdm>=4.65.0,<5.0.0',
]
# Project links published with the package metadata.
[project.urls]
homepage = "https://github.com/docling-project/docling"
repository = "https://github.com/docling-project/docling"
issues = "https://github.com/docling-project/docling/issues"
changelog = "https://github.com/docling-project/docling/blob/main/CHANGELOG.md"
# Entry point registered in the "docling" group; it points at the module
# docling.models.plugins.defaults.
[project.entry-points.docling]
docling_defaults = "docling.models.plugins.defaults"
# CLI scripts (require cli extra: pip install docling-slim[cli])
# Each entry maps a console command to a "module:object" target.
[project.scripts]
docling = "docling.cli.main:app"
docling-tools = "docling.cli.tools:app"
[project.optional-dependencies]
# ============================================================================
# CORE COMPONENTS
# ============================================================================
convert-core = [
  'numpy>=1.24.0,<3.0.0',
  'pillow>=10.0.0,<13.0.0',
  'rtree>=1.3.0,<2.0.0',
  'scipy>=1.6.0,<2.0.0',
]

# Builds on convert-core through a self-referencing extra.
extract-core = [
  'docling-slim[convert-core]',
  'polyfactory>=2.22.2',
]
# ============================================================================
# FORMAT SUPPORT
# ============================================================================

# --- PDF Formats ---
format-pdf-pypdfium2 = [
  'pypdfium2>=4.30.0,!=4.30.1,<6.0.0',
]

format-pdf-docling = [
  'pypdfium2>=4.30.0,!=4.30.1,<6.0.0',
  'docling-parse>=5.3.2,<6.0.0',
]

# Aggregates both PDF extras above.
format-pdf = [
  'docling-slim[format-pdf-pypdfium2,format-pdf-docling]',
]

# --- Office Formats (office = docx + pptx + xlsx) ---
format-docx = [
  'python-docx>=1.1.2,<2.0.0',
]

format-pptx = [
  'python-pptx>=1.0.2,<2.0.0',
]

format-xlsx = [
  'openpyxl>=3.1.5,<4.0.0',
]

format-office = [
  'docling-slim[format-docx,format-pptx,format-xlsx]',
]

# --- Web Formats (web = html + markdown) ---
format-html = [
  'beautifulsoup4>=4.12.3,<5.0.0',
  'lxml>=4.0.0,<7.0.0',
]

format-markdown = [
  'marko>=2.1.2,<3.0.0',
]

format-web = [
  'docling-slim[format-html,format-markdown]',
]

# --- Other Formats ---
format-latex = [
  'pylatexenc>=2.10,<3.0',
]

format-xml-xbrl = [
  'arelle-release>=2.38.17,<3.0.0',
]

format-html-render = [
  'playwright>=1.58.0',
]

# mlx-whisper is restricted by its marker to macOS on arm64; the other two
# requirements apply on every platform.
format-audio = [
  'mlx-whisper>=0.4.3 ; python_version >= "3.10" and sys_platform == "darwin" and platform_machine == "arm64"',
  'openai-whisper>=20250625',
  'numba>=0.63.0',
]
# ============================================================================
# OCR ENGINES (feat-ocr-*)
# ============================================================================
feat-ocr-rapidocr = [
  'rapidocr>=3.8,<4.0.0',
]

# Same rapidocr pin as above, plus onnxruntime where the marker allows it
# (Python < 3.14).
feat-ocr-rapidocr-onnx = [
  'rapidocr>=3.8,<4.0.0',
  'onnxruntime>=1.7.0,<2.0.0 ; python_version < "3.14"',
]

feat-ocr-easyocr = [
  'easyocr>=1.7,<2.0',
  # easyocr declares scikit-image with no lower bound; without this pin,
  # resolvers on Python 3.10 backtrack to 0.16.2 (2019), which has no
  # Py3.10 wheels and fails to build from source.
  'scikit-image>=0.19',
]

feat-ocr-tesserocr = [
  'tesserocr>=2.7.1,<3.0.0',
  'pandas>=2.1.4,<4.0.0',
]

# The marker limits this requirement to macOS.
feat-ocr-mac = [
  'ocrmac>=1.0.0,<2.0.0 ; sys_platform == "darwin"',
]
# ============================================================================
# MODELS
# ============================================================================
models-local = [
  'torch>=2.2.2,<3.0.0',
  'torchvision>=0,<1',
  'docling-ibm-models>=3.13.0,<4',
  'accelerate>=1.0.0,<2',
  'huggingface_hub>=0.23,<2',
  'defusedxml>=0.7.1,<0.8.0',
]

models-remote = [
  'tritonclient[grpc]>=2.65.0,<3.0.0',
]

# The markers select plain onnxruntime on macOS and onnxruntime-gpu on Linux
# and Windows; nothing is installed on Python >= 3.14.
models-onnxruntime = [
  'onnxruntime<1.24 ; python_version < "3.14" and sys_platform == "darwin"',
  'onnxruntime-gpu<1.24 ; python_version < "3.14" and (sys_platform == "linux" or sys_platform == "win32")',
]

# Vision Language Models for inline processing
models-vlm-inline = [
  # The specifier excludes the transformers 5.0 through 5.3 release series.
  'transformers>=4.42.0,<6.0.0,!=5.0.*,!=5.1.*,!=5.2.*,!=5.3.*',
  'accelerate>=1.2.1,<2.0.0',
  'mlx-vlm>=0.4.3,<1.0.0 ; python_version >= "3.10" and sys_platform == "darwin" and platform_machine == "arm64"',
  'qwen-vl-utils>=0.0.11',
  'peft>=0.18.1',
]
# ============================================================================
# FEATURES
# ============================================================================
# Document chunking for RAG applications
feat-chunking = [
  'docling-core[chunking]>=2.73.0,<3.0.0',
]

service-client = [
  'httpx>=0.28,<1.0.0',
  'websockets>=14.0,<17.0',
]

# Needed by the console commands declared in [project.scripts].
cli = [
  'typer>=0.12.5,<0.22.0',
  'rich>=13.0.0',
]
# ============================================================================
# CONVENIENCE BUNDLES
# ============================================================================
# Both bundles are self-referencing extras that only combine extras defined
# elsewhere in this table.
standard = [
  'docling-slim[format-pdf,models-local,feat-ocr-rapidocr,format-office,format-web,format-latex,feat-chunking,extract-core,service-client,cli]',
]

# "all" is "standard" plus the extras standard leaves out, except
# feat-ocr-rapidocr-onnx, which neither bundle lists.
all = [
  'docling-slim[standard,models-vlm-inline,format-audio,format-html-render,format-xml-xbrl,models-remote,models-onnxruntime,feat-ocr-easyocr,feat-ocr-tesserocr,feat-ocr-mac]',
]
# Development-only dependency groups; these are not part of the published
# package requirements.
[dependency-groups]
typecheck = [
  "ty==0.0.33",
  "types-setuptools~=70.3",
  "pandas-stubs~=2.1",
  "types-openpyxl~=3.1",
  "types-requests~=2.31",
  "boto3-stubs~=1.37",
  "types-urllib3~=1.26",
  "types-tqdm~=4.67",
  "types-defusedxml>=0.7.0.20250822,<0.8.0",
]
# Reuses the typecheck group and adds the linter plus a few runtime packages.
pr-fast-checks = [
  { include-group = "typecheck" },
  "docling-core[chunking] (>=2.73.0,<3.0.0)",
  "pydantic (>=2.0.0,<3.0.0)",
  "pydantic-settings (>=2.3.0,<3.0.0)",
  "ruff==0.15.12",
]
dev = [
  { include-group = "typecheck" },
  "dprint-py==0.53.1.0",
  "prek>=0.3.10",
  "tach>=0.34.0,<1",
  "coverage~=7.6",
  "pytest~=8.3",
  "pytest-cov>=6.1.1",
  "pytest-dependency~=0.6",
  "pytest-durations~=1.6.1",
  "pytest-xdist~=3.3",
  "ipykernel~=6.29",
  "ipywidgets~=8.1",
  "nbqa~=1.9",
  "python-semantic-release~=7.32",
]
docs = [
  "mkdocs-material~=9.5",
  "mkdocs-jupyter>=0.25,<0.26",
  "mkdocs-click~=0.8",
  "mkdocs-redirects~=1.2",
  "mkdocstrings[python]~=0.27",
  "griffe-pydantic~=1.1",
]

examples = [
  "datasets~=2.21",
  "python-dotenv~=1.0",
  "langchain-huggingface>=0.0.3",
  "langchain-milvus~=0.1",
  "langchain-text-splitters>=0.2",
  "modelscope>=1.29.0",
  'gliner>=0.2.21 ; python_version < "3.14"',
]

# pandas is capped below 3.0 on Python < 3.11 and below 4.0 on newer
# interpreters, via the two marker-split entries.
constraints = [
  'numba>=0.63.0',
  'langchain-core>=0.3.81',
  'pandas>=2.1.4,<3.0.0 ; python_version < "3.11"',
  'pandas>=2.1.4,<4.0.0 ; python_version >= "3.11"',
]
# uv project settings. The parent table is declared before its sub-tables.
[tool.uv]
package = true
# The literal "all" selects every dependency group by default.
default-groups = "all"

[tool.uv.workspace]
members = ["packages/docling"]

[tool.uv.sources]
docling = { workspace = true }
# NOTE(review): this path points outside the repository root, so it only
# resolves when a sibling docling-parse checkout exists. Confirm this is
# intended for committed configuration.
docling-parse = { path = "../docling-parse", editable = true }
# Hatch build targets: the wheel ships the "docling" package directory, and
# the sdist is restricted to an explicit list of paths.
[tool.hatch.build.targets.wheel]
packages = ["docling"]

[tool.hatch.build.targets.sdist]
only-include = ["docling", "pyproject.toml", "README.md", "LICENSE"]
# Custom pytest markers; each entry has the form "name: description".
[tool.pytest.ini_options]
markers = [
  "ml_ocr: OCR tests that download or execute OCR model code.",
  "ml_pdf_model: PDF conversion tests that download or execute document AI model code.",
  "ml_vlm: VLM and picture-description tests that download or execute vision-language model code.",
  "ml_asr: ASR tests that download or execute speech model code.",
  "cross_platform: Lightweight smoke tests run on Windows and macOS workflow-dispatch lanes.",
]
# Ruff base settings. The py310 target matches the lower bound declared in
# requires-python.
[tool.ruff]
target-version = "py310"
line-length = 88
respect-gitignore = true

[tool.ruff.format]
skip-magic-trailing-comma = false
# Lint rule selection: `select` enables rule families or single rules, and
# `ignore` switches individual entries back off.
[tool.ruff.lint]
select = [
  "C", # flake8-comprehensions
  "C9", # mccabe
  "E", # pycodestyle errors (default)
  "F", # pyflakes (default)
  "I", # isort
  "PD", # pandas-vet
  "PIE", # pie
  "Q", # flake8-quotes
  "RUF", # Enable all ruff-specific checks
  "S307", # eval
  "W", # pycodestyle warnings
  "ASYNC", # async
  "UP", # pyupgrade
]

ignore = [
  "C408", # Unnecessary `dict()` call (rewrite as a literal)
  "E501", # Line too long, handled by ruff formatter
  "D107", # Missing docstring in `__init__`
  "F401", # Imported but unused; consider using `importlib.util.find_spec` to test for availability
  "F811", # Redefinition of the same function
  "PL", # Pylint
  "RUF012", # Mutable Class Attributes
  "RUF059", # Unused unpacked variables
  "UP006", # List vs list, etc
  "UP007", # Option and Union
  "UP035", # `typing.Set` is deprecated, use `set` instead
  "UP045", # `Optional[T]` vs `T | None`
]
# Per-plugin lint settings.
# NOTE(review): no `N` (pep8-naming) rule appears in the `select` list of
# [tool.ruff.lint], so this setting may currently have no effect; confirm.
[tool.ruff.lint.pep8-naming]
classmethod-decorators = [
  "pydantic.validator",
]

[tool.ruff.lint.per-file-ignores]
"__init__.py" = ["E402", "F401"]
"tests/*.py" = ["ASYNC"]

[tool.ruff.lint.mccabe]
max-complexity = 30

[tool.ruff.lint.isort]
combine-as-imports = true
# ty type-checker configuration.
[tool.ty.rules]
all = "warn"

[tool.ty.environment]
python-version = "3.10"

[tool.ty.src]
include = ["docling", ".github/scripts"]

# Module patterns whose imports may stay unresolved without being reported.
[tool.ty.analysis]
allowed-unresolved-imports = [
  "arelle.**",
  "docling_ibm_models.**",
  "docling_parse.**",
  "easyocr.**",
  "filetype.**",
  "huggingface_hub.**",
  "lxml.**",
  "mlx_vlm.**",
  "networkx.**",
  "ocrmac.**",
  "onnxruntime.**",
  "pylatexenc.**",
  "pypdfium2.**",
  "qwen_vl_utils.**",
  "scipy.**",
  "tesserocr.**",
  "torchvision.**",
  "transformers.**",
  "vllm.**",
  "websockets.**",
]

[tool.ty.terminal]
output-format = "concise"
[tool.semantic_release]
# for default values check:
# https://github.com/python-semantic-release/python-semantic-release/blob/v7.32.2/semantic_release/defaults.cfg

version_source = "tag_only"
branch = "main"

# configure types which should trigger minor and patch version bumps respectively
# (note that they must be a subset of the configured allowed types):
parser_angular_allowed_types = "build,chore,ci,docs,feat,fix,perf,style,refactor,test"
parser_angular_minor_types = "feat"
parser_angular_patch_types = "fix,perf"