Files
docling/pyproject.toml
T
Michele Dolfi ed32c5e993 feat: Introduce modular docling-slim package (#3285)
* plans folder structure

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* initial plan

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* updated plan

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* restructure repo for docling and docling-slim

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* transpose package structures

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* add all-packages

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* updated lock and deps

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* align deps

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* more lock like main

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* more locked pinning

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* rename extras

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* add simple README for docling-slim

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* fix scikit-image issue

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* add readme placeholder

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* add all extras in package test

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* cli in docling-slim

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* apply formatting

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* fix testing package

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* override grpcio in no-header test

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* update lock

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* update package description

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* updated extras

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* fix publish scripts

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* update package test

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

---------

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
2026-04-24 15:14:57 +02:00

409 lines
10 KiB
TOML

# Build backend declaration: the distribution is built with hatchling.
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"
# Core metadata for the modular `docling-slim` distribution.
[project]
name = "docling-slim"
version = "2.91.0" # DO NOT EDIT, updated automatically
description = "Modular version of the Docling package: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
license = "MIT"
readme = "packages/docling-slim/README.md"
requires-python = ">=3.10,<4.0"
keywords = [
    "docling",
    "convert",
    "document",
    "pdf",
    "docx",
    "html",
    "markdown",
    "layout model",
    "segmentation",
]
classifiers = [
    "Operating System :: MacOS :: MacOS X",
    "Operating System :: POSIX :: Linux",
    "Operating System :: Microsoft :: Windows",
    "Development Status :: 5 - Production/Stable",
    "Intended Audience :: Developers",
    "Intended Audience :: Science/Research",
    "Topic :: Scientific/Engineering :: Artificial Intelligence",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
    "Programming Language :: Python :: 3.13",
    "Programming Language :: Python :: 3.14",
]
authors = [
    { name = "Christoph Auer", email = "cau@zurich.ibm.com" },
    { name = "Michele Dolfi", email = "dol@zurich.ibm.com" },
    { name = "Maxim Lysak", email = "mly@zurich.ibm.com" },
    { name = "Nikos Livathinos", email = "nli@zurich.ibm.com" },
    { name = "Ahmed Nassar", email = "ahn@zurich.ibm.com" },
    { name = "Panos Vagenas", email = "pva@zurich.ibm.com" },
    { name = "Peter Staar", email = "taa@zurich.ibm.com" },
]
# MINIMAL BASE (8 packages) - ~50MB
# Everything beyond these eight requirements is opt-in through the extras
# declared under [project.optional-dependencies].
dependencies = [
    "pydantic>=2.0.0,<3.0.0",
    "docling-core>=2.73.0,<3.0.0",
    "pydantic-settings>=2.3.0,<3.0.0",
    "filetype>=1.2.0,<2.0.0",
    "requests>=2.32.2,<3.0.0",
    "certifi>=2024.7.4",
    "pluggy>=1.0.0,<2.0.0",
    "tqdm>=4.65.0,<5.0.0",
]
# Project links; all of them point at the docling-project/docling repository.
[project.urls]
homepage = "https://github.com/docling-project/docling"
repository = "https://github.com/docling-project/docling"
issues = "https://github.com/docling-project/docling/issues"
changelog = "https://github.com/docling-project/docling/blob/main/CHANGELOG.md"
# Registers the module `docling.models.plugins.defaults` in the `docling`
# entry-point group under the name `docling_defaults`.
[project.entry-points.docling]
docling_defaults = "docling.models.plugins.defaults"
# CLI scripts (require cli extra: pip install docling-slim[cli])
# Both commands are always registered, but the base install does not pull in
# the `cli` extra's requirements (typer, rich); install that extra before
# running them.
[project.scripts]
docling = "docling.cli.main:app"
docling-tools = "docling.cli.tools:app"
[project.optional-dependencies]
# ============================================================================
# CORE COMPONENTS
# ============================================================================
convert-core = [
    "numpy>=1.24.0,<3.0.0",
    "pillow>=10.0.0,<13.0.0",
    "rtree>=1.3.0,<2.0.0",
    "scipy>=1.6.0,<2.0.0",
]
# Pulls in everything from convert-core through a self-referencing extra.
extract-core = [
    "docling-slim[convert-core]",
    "polyfactory>=2.22.2",
]
# ============================================================================
# FORMAT SUPPORT
# ============================================================================
# --- PDF Formats ---
format-pdf-pypdfium2 = [
    "pypdfium2>=4.30.0,!=4.30.1,<6.0.0",
]
format-pdf-docling = [
    "pypdfium2>=4.30.0,!=4.30.1,<6.0.0",
    "docling-parse>=5.3.2,<6.0.0",
]
# Aggregate: both PDF extras above.
format-pdf = [
    "docling-slim[format-pdf-pypdfium2,format-pdf-docling]",
]
# --- Office Formats (office = docx + pptx + xlsx) ---
format-docx = [
    "python-docx>=1.1.2,<2.0.0",
]
format-pptx = [
    "python-pptx>=1.0.2,<2.0.0",
]
format-xlsx = [
    "openpyxl>=3.1.5,<4.0.0",
]
format-office = [
    "docling-slim[format-docx,format-pptx,format-xlsx]",
]
# --- Web Formats (web = html + markdown) ---
format-html = [
    "beautifulsoup4>=4.12.3,<5.0.0",
    "lxml>=4.0.0,<7.0.0",
]
format-markdown = [
    "marko>=2.1.2,<3.0.0",
]
format-web = [
    "docling-slim[format-html,format-markdown]",
]
# --- Other Formats ---
format-latex = [
    "pylatexenc>=2.10,<3.0",
]
format-xml-xbrl = [
    "arelle-release>=2.38.17,<3.0.0",
]
format-html-render = [
    "playwright>=1.58.0",
]
# Requirements with environment markers stay in literal (single-quoted)
# strings so the double quotes inside the marker need no escaping.
format-audio = [
    'mlx-whisper>=0.4.3 ; python_version >= "3.10" and sys_platform == "darwin" and platform_machine == "arm64"',
    "openai-whisper>=20250625",
    "numba>=0.63.0",
]
# ============================================================================
# OCR ENGINES (feat-ocr-*)
# ============================================================================
feat-ocr-rapidocr = [
    "rapidocr>=3.8,<4.0.0",
]
# Same rapidocr pin as above plus onnxruntime (only requested on Python < 3.14).
feat-ocr-rapidocr-onnx = [
    "rapidocr>=3.8,<4.0.0",
    'onnxruntime>=1.7.0,<2.0.0 ; python_version < "3.14"',
]
feat-ocr-easyocr = [
    "easyocr>=1.7,<2.0",
    # easyocr declares scikit-image with no lower bound; without this pin,
    # resolvers on Python 3.10 backtrack to 0.16.2 (2019), which has no
    # Py3.10 wheels and fails to build from source.
    "scikit-image>=0.19",
]
feat-ocr-tesserocr = [
    "tesserocr>=2.7.1,<3.0.0",
    "pandas>=2.1.4,<4.0.0",
]
# Only resolves to a requirement on macOS (sys_platform == "darwin").
feat-ocr-mac = [
    'ocrmac>=1.0.0,<2.0.0 ; sys_platform == "darwin"',
]
# ============================================================================
# MODELS
# ============================================================================
models-local = [
    "torch>=2.2.2,<3.0.0",
    "torchvision>=0,<1",
    "docling-ibm-models>=3.13.0,<4",
    "accelerate>=1.0.0,<2",
    "huggingface_hub>=0.23,<2",
    "defusedxml>=0.7.1,<0.8.0",
]
models-remote = [
    "tritonclient[grpc]>=2.65.0,<3.0.0",
]
# Platform split: plain onnxruntime on macOS, onnxruntime-gpu on Linux and
# Windows; neither is requested on Python 3.14+.
models-onnxruntime = [
    'onnxruntime<1.24 ; python_version < "3.14" and sys_platform == "darwin"',
    'onnxruntime-gpu<1.24 ; python_version < "3.14" and (sys_platform == "linux" or sys_platform == "win32")',
]
# Vision Language Models for inline processing
models-vlm-inline = [
    "transformers>=4.42.0,<6.0.0,!=5.0.*,!=5.1.*,!=5.2.*,!=5.3.*",
    "accelerate>=1.2.1,<2.0.0",
    'mlx-vlm>=0.4.3,<1.0.0 ; python_version >= "3.10" and sys_platform == "darwin" and platform_machine == "arm64"',
    "qwen-vl-utils>=0.0.11",
    "peft>=0.18.1",
]
# ============================================================================
# FEATURES
# ============================================================================
# Document chunking for RAG applications
feat-chunking = [
    "docling-core[chunking]>=2.73.0,<3.0.0",
]
service-client = [
    "httpx>=0.28,<1.0.0",
    "websockets>=14.0,<17.0",
]
# Requirements for the `docling` and `docling-tools` console scripts.
cli = [
    "typer>=0.12.5,<0.22.0",
    "rich>=13.0.0",
]
# ============================================================================
# CONVENIENCE BUNDLES
# ============================================================================
# Both bundles are self-referencing extras: they add no requirements of their
# own and only combine the extras defined in this table.
standard = [
    "docling-slim[format-pdf,models-local,feat-ocr-rapidocr,format-office,format-web,format-latex,feat-chunking,extract-core,service-client,cli]",
]
# `all` is `standard` plus the remaining optional extras.
all = [
    "docling-slim[standard,models-vlm-inline,format-audio,format-html-render,format-xml-xbrl,models-remote,models-onnxruntime,feat-ocr-easyocr,feat-ocr-tesserocr,feat-ocr-mac]",
]
# Development-only dependency groups; unlike the extras, these are not part of
# the published package metadata.
[dependency-groups]
dev = [
    "pre-commit~=3.7",
    "mypy~=1.10",
    "types-setuptools~=70.3",
    "pandas-stubs~=2.1",
    "types-openpyxl~=3.1",
    "types-requests~=2.31",
    "boto3-stubs~=1.37",
    "types-urllib3~=1.26",
    "types-tqdm~=4.67",
    "coverage~=7.6",
    "pytest~=8.3",
    "pytest-cov>=6.1.1",
    "pytest-dependency~=0.6",
    "pytest-durations~=1.6.1",
    "pytest-xdist~=3.3",
    "ipykernel~=6.29",
    "ipywidgets~=8.1",
    "nbqa~=1.9",
    "python-semantic-release~=7.32",
    "types-defusedxml>=0.7.0.20250822,<0.8.0",
]
docs = [
    "mkdocs-material~=9.5",
    "mkdocs-jupyter>=0.25,<0.26",
    "mkdocs-click~=0.8",
    "mkdocs-redirects~=1.2",
    "mkdocstrings[python]~=0.27",
    "griffe-pydantic~=1.1",
]
examples = [
    "datasets~=2.21",
    "python-dotenv~=1.0",
    "langchain-huggingface>=0.0.3",
    "langchain-milvus~=0.1",
    "langchain-text-splitters>=0.2",
    "modelscope>=1.29.0",
    'gliner>=0.2.21 ; python_version < "3.14"',
]
# NOTE(review): this group appears to exist only to steer the resolver
# (lower bounds and per-Python pandas caps), not to add tooling - confirm.
constraints = [
    "numba>=0.63.0",
    "langchain-core>=0.3.81",
    'pandas>=2.1.4,<3.0.0 ; python_version < "3.11"',
    'pandas>=2.1.4,<4.0.0 ; python_version >= "3.11"',
]
# uv settings. The parent table comes first, then its sub-tables.
[tool.uv]
# Treat this project as a package, i.e. build and install it into the environment.
package = true
# Enable every group from [dependency-groups] by default.
default-groups = "all"

# The workspace has one additional member next to this root project.
[tool.uv.workspace]
members = ["packages/docling"]

# Resolve `docling` from the workspace member instead of a package index.
[tool.uv.sources]
docling = { workspace = true }
# The wheel ships the top-level `docling` import package.
[tool.hatch.build.targets.wheel]
packages = ["docling"]
# The sdist is limited to the package sources plus these top-level files.
[tool.hatch.build.targets.sdist]
only-include = ["docling", "pyproject.toml", "README.md", "LICENSE"]
# Ruff global settings.
[tool.ruff]
# py310 matches the lower bound of `requires-python` in [project].
target-version = "py310"
line-length = 88
respect-gitignore = true
# Ruff formatter settings.
[tool.ruff.format]
# false = a trailing comma keeps a collection expanded one item per line.
skip-magic-trailing-comma = false
# Ruff linter: enabled rule families, then the rules that are switched off.
[tool.ruff.lint]
select = [
    "C",     # flake8-comprehensions
    "C9",    # mccabe
    "E",     # pycodestyle errors (default)
    "F",     # pyflakes (default)
    "I",     # isort
    "PD",    # pandas-vet
    "PIE",   # flake8-pie
    "Q",     # flake8-quotes
    "RUF",   # all ruff-specific checks
    "S307",  # use of `eval`
    "W",     # pycodestyle warnings
    "ASYNC", # flake8-async
    "UP",    # pyupgrade
]
ignore = [
    "C408",   # Unnecessary `dict()` call (rewrite as a literal)
    "E501",   # Line too long, handled by the ruff formatter
    "D107",   # Missing docstring in `__init__`
    "F401",   # Imported but unused
    "F811",   # Redefinition of the same function
    "PL",     # Pylint
    "RUF012", # Mutable class attributes
    "UP006",  # `List` vs `list`, etc.
    "UP007",  # `Optional` and `Union`
    "UP035",  # `typing.Set` is deprecated, use `set` instead
]
# Methods decorated with `pydantic.validator` count as classmethods for the
# pep8-naming checks.
# NOTE(review): no `N` rules are listed in `select` under [tool.ruff.lint], so
# this setting may currently have no effect - confirm.
[tool.ruff.lint.pep8-naming]
classmethod-decorators = ["pydantic.validator"]
# Per-path exceptions on top of the global `ignore` list.
[tool.ruff.lint.per-file-ignores]
# Package initialisers: E402 and F401 are switched off.
"__init__.py" = ["E402", "F401"]
# Files matching tests/*.py: the ASYNC rules are switched off.
"tests/*.py" = ["ASYNC"]
# Complexity threshold for the mccabe check (selected via "C9").
[tool.ruff.lint.mccabe]
max-complexity = 30
# Import sorting: merge `import x as y` members into the same `from` statement.
[tool.ruff.lint.isort]
combine-as-imports = true
# mypy global settings.
[tool.mypy]
pretty = true
no_implicit_optional = true
# Loads the pydantic plugin for mypy.
plugins = "pydantic.mypy"
# Type-check against the lowest Python version allowed by `requires-python`.
python_version = "3.10"
# mypy does not report missing imports or stubs for the modules listed here.
[[tool.mypy.overrides]]
module = [
    "docling_parse.*",
    "pypdfium2.*",
    "networkx.*",
    "scipy.*",
    "filetype.*",
    "tesserocr.*",
    "docling_ibm_models.*",
    "easyocr.*",
    "ocrmac.*",
    "onnxruntime.*",
    "mlx_vlm.*",
    "lxml.*",
    "huggingface_hub.*",
    "transformers.*",
    "pylatexenc.*",
    "vllm.*",
    "qwen_vl_utils.*",
    "arelle.*",
    "websockets.*",
    "torchvision.*",
    "torchvision.transforms.*",
]
ignore_missing_imports = true
# python-semantic-release settings (the dev group pins the 7.x series).
[tool.semantic_release]
# For the default value of every option, see:
# https://github.com/python-semantic-release/python-semantic-release/blob/v7.32.2/semantic_release/defaults.cfg
version_source = "tag_only"
branch = "main"
# Commit types that trigger minor and patch version bumps respectively.
# Both must be subsets of the allowed types configured on the next line.
parser_angular_allowed_types = "build,chore,ci,docs,feat,fix,perf,style,refactor,test"
parser_angular_minor_types = "feat"
parser_angular_patch_types = "fix,perf"