feat: Introduce modular docling-slim package (#3285)

* plans folder structure

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* initial plan

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* updated plan

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* restructure repo for docling and docling-slim

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* transpose package structures

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* add all-packages

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* updated  lock and deps

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* align deps

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* more lock like main

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* more locked pinning

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* rename extras

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* add simple README for docling-slim

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* fix scikit-image issue

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* add readme placeholder

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* add all extras in package test

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* cli in docling-slim

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* apply formatting

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* fix testing package

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* override grpcio in no-header test

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* update lock

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* update package description

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* updated extras

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* fix publish scripts

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* update package test

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

---------

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
Michele Dolfi
2026-04-24 15:14:57 +02:00
committed by GitHub
parent a6a37ca895
commit ed32c5e993
17 changed files with 3490 additions and 1543 deletions
+27
View File
@@ -0,0 +1,27 @@
#!/bin/bash
set -e # trigger failure on error - do not remove!
set -x # display command on output

# Build each package into its own dist subdirectory so the PyPI publish
# action can upload them independently (otherwise a single `dist/` causes
# the second publish step to re-upload files and fail on `skip-existing: false`).

DOCLING_README="packages/docling/README.md"
DOCLING_README_BACKUP="${DOCLING_README}.placeholder"

# Move the placeholder README back into place if it is currently swapped out.
# Idempotent: does nothing when no backup exists, so it is safe to call both
# explicitly and from the EXIT trap.
restore_placeholder_readme() {
    if [ -f "${DOCLING_README_BACKUP}" ]; then
        mv "${DOCLING_README_BACKUP}" "${DOCLING_README}"
    fi
}

# Because of `set -e`, a failing `uv build` aborts the script immediately.
# The trap guarantees the working tree is put back even in that case.
trap restore_placeholder_readme EXIT

# Recover from an earlier interrupted run before swapping again; otherwise the
# `mv` below would overwrite the real placeholder with a copied root README.
restore_placeholder_readme

# Build docling-slim package (from repo root — source co-located)
echo "Building docling-slim package..."
uv build --out-dir dist/docling-slim

# Build docling package (meta-package, dependency-only wheel)
echo "Building docling package..."
# Backup placeholder README and copy root README for build
mv "${DOCLING_README}" "${DOCLING_README_BACKUP}"
cp README.md "${DOCLING_README}"
(cd packages/docling && uv build --out-dir ../../dist/docling)
# Restore placeholder README
restore_placeholder_readme

echo "Build complete."
echo "docling-slim artifacts:"
ls -lh dist/docling-slim/
echo "docling artifacts:"
ls -lh dist/docling/
+15 -3
View File
@@ -9,9 +9,21 @@ if [ -z "${TARGET_VERSION}" ]; then
fi
CHGLOG_FILE="${CHGLOG_FILE:-CHANGELOG.md}"
# update package version
# update package versions:
# - root pyproject.toml = docling-slim
# - packages/docling/pyproject.toml = docling (meta-package)
uvx --from=toml-cli toml set --toml-path=pyproject.toml project.version "${TARGET_VERSION}"
UV_FROZEN=0 uv lock --upgrade-package docling
uvx --from=toml-cli toml set --toml-path=packages/docling/pyproject.toml project.version "${TARGET_VERSION}"
# update docling-slim dependency version in docling package
uvx --from=toml-cli toml set --toml-path=packages/docling/pyproject.toml "project.dependencies[0]" "docling-slim[standard]==${TARGET_VERSION}"
# update all re-exported extras in docling package
# Each entry is "<docling extra>:<docling-slim extra>" and must mirror
# [project.optional-dependencies] in packages/docling/pyproject.toml.
# The slim extra has to be written out explicitly: a requirement such as
# "docling-slim[*]" is not a wildcard, it names a (non-existent) extra "*".
for mapping in \
    easyocr:feat-ocr-easyocr \
    tesserocr:feat-ocr-tesserocr \
    ocrmac:feat-ocr-mac \
    vlm:models-vlm-inline \
    rapidocr:feat-ocr-rapidocr-onnx \
    chunking:feat-chunking \
    format-audio:format-audio \
    format-html-render:format-html-render \
    models-remote:models-remote \
    models-onnxruntime:models-onnxruntime \
    format-xml-xbrl:format-xml-xbrl
do
    extra="${mapping%%:*}"      # text before the first ':'
    slim_extra="${mapping#*:}"  # text after the first ':'
    uvx --from=toml-cli toml set --toml-path=packages/docling/pyproject.toml "project.optional-dependencies.${extra}[0]" "docling-slim[${slim_extra}]==${TARGET_VERSION}"
done
UV_FROZEN=0 uv lock --upgrade-package docling --upgrade-package docling-slim
# collect release notes
REL_NOTES=$(mktemp)
@@ -31,7 +43,7 @@ mv "${TMP_CHGLOG}" "${CHGLOG_FILE}"
# push changes
git config --global user.name 'github-actions[bot]'
git config --global user.email 'github-actions[bot]@users.noreply.github.com'
git add pyproject.toml uv.lock "${CHGLOG_FILE}"
git add pyproject.toml packages/docling/pyproject.toml uv.lock "${CHGLOG_FILE}"
COMMIT_MSG="chore: bump version to ${TARGET_VERSION} [skip ci]"
git commit -m "${COMMIT_MSG}"
git push origin main
+34 -31
View File
@@ -50,7 +50,7 @@ jobs:
pre-commit|${{ env.PY }}|
- name: Install Python Dependencies
run: uv sync --frozen --all-extras
run: uv sync --frozen --all-extras --all-packages
- name: Check style
run: |
@@ -92,7 +92,7 @@ jobs:
python-version: ${{ matrix.python-version }}
- name: Install Python Dependencies
run: uv sync --frozen --all-extras
run: uv sync --frozen --all-extras --all-packages
- name: Cache Models
uses: actions/cache@v5
@@ -159,7 +159,7 @@ jobs:
python-version: ${{ matrix.python-version }}
- name: Install Python Dependencies
run: uv sync --frozen --all-extras
run: uv sync --frozen --all-extras --all-packages
- name: Cache Models
uses: actions/cache@v5
@@ -231,7 +231,7 @@ jobs:
python-version: ${{ matrix.python-version }}
- name: Install Python Dependencies
run: uv sync --frozen --all-extras
run: uv sync --frozen --all-extras --all-packages
- name: Cache Models
uses: actions/cache@v5
@@ -301,21 +301,21 @@ jobs:
echo "=========================================="
echo "Testing Python $py_version"
echo "=========================================="
# Create virtual environment with uv
uv venv /tmp/venv-${py_version} --python=${py_version}
source /tmp/venv-${py_version}/bin/activate
# Install package with pip (no lock file)
uv pip install --torch-backend=cpu -e .[easyocr,tesserocr,vlm,rapidocr,asr]
# Install docling-slim with pip (no lock file)
uv pip install --torch-backend=cpu -e .[all]
# Run basic import test
python -c "import docling; from docling.document_converter import DocumentConverter; print('Import successful for Python ${py_version}')"
python -c "import docling; print('Import successful for Python ${py_version}')"
# Cleanup
deactivate
rm -rf /tmp/venv-${py_version}
echo "Python $py_version: PASSED"
echo ""
done
@@ -334,16 +334,16 @@ jobs:
echo "=========================================="
echo "Testing Python $py_version (no dev headers)"
echo "=========================================="
# Create virtual environment with uv
uv venv /tmp/venv-nodev-${py_version} --python=${py_version}
source /tmp/venv-nodev-${py_version}/bin/activate
# Find and remove Python.h from the Python installation
echo "Removing Python development headers from Python installation..."
python_include_dir=$(python -c "import sysconfig; print(sysconfig.get_path('include'))")
echo "Python include directory: $python_include_dir"
if [ -f "$python_include_dir/Python.h" ]; then
echo "Found Python.h, removing it and other headers..."
# Use sudo if the directory is system-owned
@@ -356,7 +356,7 @@ jobs:
else
echo "Warning: Python.h not found at expected location"
fi
# Verify that compilation fails without dev headers
# Try to install numpy from source (sdist) - this should fail
echo "Verifying compilation fails without dev headers..."
@@ -364,7 +364,7 @@ jobs:
uv pip install --no-binary=:all: numpy==1.26.4 > /tmp/numpy-install-${py_version}.log 2>&1
numpy_exit_code=$?
set -e # Re-enable exit on error
if [ $numpy_exit_code -eq 0 ]; then
echo "ERROR: numpy installation from source succeeded, but it should have failed without dev headers!"
cat /tmp/numpy-install-${py_version}.log
@@ -378,19 +378,18 @@ jobs:
echo "Warning: Error message doesn't explicitly mention missing headers, but compilation failed as expected"
fi
fi
# Install package with pip (no lock file, no compilation)
# Install without extras that require compilation (tesserocr requires dev headers)
# Note: Not using --only-binary since some packages are sdist-only but don't require compilation
uv pip install --torch-backend=cpu -e .[easyocr,vlm,rapidocr,asr]
# Install docling-slim with pip (no lock file, no compilation)
echo "grpcio>=1.71.0" > override-grpcio.txt
uv pip install --torch-backend=cpu -e ".[all]" --overrides override-grpcio.txt
# Run basic import test
python -c "import docling; from docling.document_converter import DocumentConverter; print('Import successful for Python ${py_version} without dev headers')"
python -c "import docling; print('Import successful for Python ${py_version} without dev headers')"
# Cleanup
deactivate
rm -rf /tmp/venv-nodev-${py_version}
echo "Python $py_version (no dev headers): PASSED"
echo ""
done
@@ -412,11 +411,15 @@ jobs:
- name: Install dependencies
run: uv sync --all-extras
- name: Build package
run: uv build
- name: Build packages
run: bash .github/scripts/build-packages.sh
- name: Check content of wheel
run: unzip -l dist/*.whl
- name: Check content of wheels
run: |
for whl in dist/*/*.whl; do
echo "=== $whl ==="
unzip -l "$whl"
done
- name: Store the distribution packages
uses: actions/upload-artifact@v6
@@ -447,7 +450,7 @@ jobs:
- name: Install package
run: |
uv pip install dist/*.whl
uv pip install --find-links dist/docling-slim/ dist/docling/docling-*.whl
- name: Run docling
run: uv run docling --help
+3 -1
View File
@@ -19,8 +19,10 @@ jobs:
with:
python-version: ${{ matrix.python-version }}
enable-cache: true
- name: Install Python Dependencies
run: uv sync --frozen --all-extras --all-packages
- name: Build docs
run: uv run mkdocs build --verbose --clean
run: uv run --no-sync mkdocs build --verbose --clean
- name: Build and push docs
if: inputs.deploy
run: uv run --no-sync mkdocs gh-deploy --force
+54 -10
View File
@@ -1,4 +1,4 @@
name: "Build and publish package"
name: "Build and publish packages"
on:
release:
@@ -11,16 +11,11 @@ permissions:
contents: read
jobs:
build-and-publish:
build:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ['3.12']
environment:
name: pypi
url: https://pypi.org/p/docling
permissions:
id-token: write # IMPORTANT: mandatory for trusted publishing
steps:
- uses: actions/checkout@v6
- name: Install uv and set the python version
@@ -30,9 +25,58 @@ jobs:
enable-cache: true
- name: Install dependencies
run: uv sync --all-extras
- name: Build package
run: uv build
- name: Publish distribution 📦 to PyPI
- name: Build packages
run: bash .github/scripts/build-packages.sh
- name: Upload build artifacts
uses: actions/upload-artifact@v4
with:
name: dist
path: dist/
publish-docling-slim:
needs: build
runs-on: ubuntu-latest
environment:
name: pypi
url: https://pypi.org/p/docling-slim
permissions:
id-token: write # IMPORTANT: mandatory for trusted publishing
steps:
- name: Download build artifacts
uses: actions/download-artifact@v4
with:
name: dist
path: dist/
- name: Publish docling-slim to PyPI
uses: pypa/gh-action-pypi-publish@release/v1
with:
attestations: true
packages-dir: dist/docling-slim/
skip-existing: false
publish-docling:
# docling is a meta-package that depends on docling-slim, so publish it
# after docling-slim is available on PyPI.
needs: publish-docling-slim
runs-on: ubuntu-latest
environment:
name: pypi
url: https://pypi.org/p/docling
permissions:
id-token: write # IMPORTANT: mandatory for trusted publishing
steps:
# Fetches the `dist` artifact into dist/; only dist/docling/ is published by this job.
- name: Download build artifacts
uses: actions/download-artifact@v4
with:
name: dist
path: dist/
# NOTE(review): this is a fixed 60-second delay, not a readiness check —
# the step never queries PyPI, so it does not guarantee that docling-slim
# is actually installable when the next step runs.
- name: Wait for docling-slim to be available on PyPI
run: |
echo "Waiting 60 seconds for docling-slim to propagate on PyPI..."
sleep 60
- name: Publish docling to PyPI
uses: pypa/gh-action-pypi-publish@release/v1
with:
attestations: true
packages-dir: dist/docling/
skip-existing: false
View File
File diff suppressed because it is too large Load Diff
View File
View File
+20 -2
View File
@@ -1,6 +1,7 @@
import datetime
import logging
import re
import sys
import tempfile
import time
import warnings
@@ -8,8 +9,25 @@ from collections.abc import Iterable
from pathlib import Path
from typing import Annotated, Type
import rich.table
import typer
# Check for CLI dependencies.
# typer and rich are imported inside a guard so that a missing package
# produces installation guidance on stderr instead of a bare traceback.
try:
    import rich.table
    import typer
except ImportError as e:
    # ImportError.name is the module that could not be imported. Parsing the
    # message text instead is fragile: for "cannot import name 'X' from 'mod'"
    # the first quoted token is the attribute X, not the missing module.
    # The attribute may be None, hence the generic fallback.
    missing_package = e.name or "typer or rich"
    print(
        f"Error: Missing required CLI dependency '{missing_package}'", file=sys.stderr
    )
    print("\nThe docling CLI requires additional dependencies.", file=sys.stderr)
    print("Please install them using one of the following options:\n", file=sys.stderr)
    print(" 1. Install the full docling package (recommended):", file=sys.stderr)
    print(" pip install docling\n", file=sys.stderr)
    print(" 2. Install docling-slim with CLI support:", file=sys.stderr)
    print(" pip install docling-slim[cli]\n", file=sys.stderr)
    print(" 3. Install just the missing dependencies:", file=sys.stderr)
    print(" pip install typer rich\n", file=sys.stderr)
    # Non-zero exit status so shells and CI see the failure.
    sys.exit(1)
from docling_core.transforms.serializer.html import (
HTMLDocSerializer,
HTMLOutputStyle,
+20 -3
View File
@@ -1,12 +1,29 @@
import logging
import sys
import warnings
from enum import Enum
from pathlib import Path
from typing import Annotated, Optional
import typer
from rich.console import Console
from rich.logging import RichHandler
# Check for CLI dependencies.
# typer and rich are imported inside a guard so that a missing package
# produces installation guidance on stderr instead of a bare traceback.
try:
    import typer
    from rich.console import Console
    from rich.logging import RichHandler
except ImportError as e:
    # ImportError.name is the module that could not be imported. Parsing the
    # message text instead is fragile: a failing `from rich.console import
    # Console` reads "cannot import name 'Console' from 'rich.console'", whose
    # first quoted token is the attribute, not the package.
    # The attribute may be None, hence the generic fallback.
    missing_package = e.name or "typer or rich"
    print(
        f"Error: Missing required CLI dependency '{missing_package}'", file=sys.stderr
    )
    print("\nThe docling-tools CLI requires additional dependencies.", file=sys.stderr)
    print("Please install them using one of the following options:\n", file=sys.stderr)
    print(" 1. Install the full docling package (recommended):", file=sys.stderr)
    print(" pip install docling\n", file=sys.stderr)
    print(" 2. Install docling-slim with CLI support:", file=sys.stderr)
    print(" pip install docling-slim[cli]\n", file=sys.stderr)
    print(" 3. Install just the missing dependencies:", file=sys.stderr)
    print(" pip install typer rich\n", file=sys.stderr)
    # Non-zero exit status so shells and CI see the failure.
    sys.exit(1)
from docling.datamodel.settings import settings
from docling.models.utils.hf_model_download import download_hf_model
+19 -1
View File
@@ -1,4 +1,22 @@
import typer
import sys
# Check for CLI dependencies.
# typer is imported inside a guard so that a missing package produces
# installation guidance on stderr instead of a bare traceback.
try:
    import typer
except ImportError as e:
    # ImportError.name is the module that could not be imported; it is more
    # reliable than splitting the exception message on quote characters.
    # The attribute may be None, hence the generic fallback.
    missing_package = e.name or "typer"
    print(
        f"Error: Missing required CLI dependency '{missing_package}'", file=sys.stderr
    )
    print("\nThe docling-tools CLI requires additional dependencies.", file=sys.stderr)
    print("Please install them using one of the following options:\n", file=sys.stderr)
    print(" 1. Install the full docling package (recommended):", file=sys.stderr)
    print(" pip install docling\n", file=sys.stderr)
    print(" 2. Install docling-slim with CLI support:", file=sys.stderr)
    print(" pip install docling-slim[cli]\n", file=sys.stderr)
    print(" 3. Install just the missing dependencies:", file=sys.stderr)
    print(" pip install typer rich\n", file=sys.stderr)
    # Non-zero exit status so shells and CI see the failure.
    sys.exit(1)
from docling.cli.models import app as models_app
+126
View File
@@ -0,0 +1,126 @@
# Docling Slim
**Lightweight SDK for parsing documents with minimal dependencies and opt-in extras**
Docling Slim is a minimal-dependency version of Docling that allows you to install only the components you need. It provides the core document processing functionality with ~50MB of base dependencies, and you can add specific features through optional extras.
## When to Use Docling Slim
- **Use `docling`** (recommended): If you want the full-featured experience with all standard capabilities
- **Use `docling-slim`**: If you need fine-grained control over dependencies or want to minimize installation size
## For Most Users: Use the Main Docling Package
We recommend most users install the full-featured `docling` package instead:
```bash
pip install docling
```
The `docling` package includes all standard features, the CLI tools, and is the easiest way to get started. Visit the [main Docling documentation](https://docling-project.github.io/docling/) for complete guides and examples.
## Installation
### With Specific Features
```bash
# Quote the requirement: shells such as zsh treat unquoted [...] as a glob pattern.
# PDF support with local models
pip install "docling-slim[format-pdf,models-local]"
# Office formats only
pip install "docling-slim[format-office]"
# PDF + CLI
pip install "docling-slim[format-pdf,cli]"
# Docling service client for using the Docling Serve API
pip install "docling-slim[service-client]"
```
## Available Extras
### Convenience Bundles
| Extra | Description | Use Case |
|-------|-------------|----------|
| `standard` | All standard features (same as `docling` package) | Full-featured usage |
| `all` | All available extras | Complete installation |
### CLI
| Extra | Description | Use Case |
|-------|-------------|----------|
| `cli` | Command-line interface (typer, rich) | CLI tools (docling, docling-tools) |
### Core Components
| Extra | Description | Use Case |
|-------|-------------|----------|
| `convert-core` | Core conversion components (numpy, pillow, scipy) | Basic document conversion |
| `extract-core` | Structured information extraction | Data extraction from documents |
### Format Support
#### PDF Formats
| Extra | Description | Use Case |
|-------|-------------|----------|
| `format-pdf` | PDF parsing (pypdfium2 + docling-parse) | PDF documents |
| `format-pdf-pypdfium2` | PDF rendering only | Lightweight PDF support |
| `format-pdf-docling` | Advanced PDF parsing | Complex PDF layouts |
#### Office Formats (office = docx + pptx + xlsx)
| Extra | Description | Use Case |
|-------|-------------|----------|
| `format-office` | All Office formats | Microsoft Office documents |
| `format-docx` | Microsoft Word documents | .docx files |
| `format-pptx` | Microsoft PowerPoint | .pptx files |
| `format-xlsx` | Microsoft Excel | .xlsx files |
#### Web Formats (web = html + markdown)
| Extra | Description | Use Case |
|-------|-------------|----------|
| `format-web` | HTML and Markdown | Web content |
| `format-html` | HTML parsing | Web pages and HTML files |
| `format-markdown` | Markdown parsing | .md files |
#### Other Formats
| Extra | Description | Use Case |
|-------|-------------|----------|
| `format-latex` | LaTeX documents | .tex files |
| `format-xml-xbrl` | XBRL financial reports | Financial documents |
| `format-html-render` | HTML rendering with Playwright | Dynamic web content |
| `format-audio` | Audio transcription (Whisper) | .wav, .mp3 files |
### OCR Engines
| Extra | Description | Use Case |
|-------|-------------|----------|
| `feat-ocr-rapidocr` | RapidOCR (lightweight) | Fast OCR |
| `feat-ocr-rapidocr-onnx` | RapidOCR with ONNX runtime | Optimized OCR |
| `feat-ocr-easyocr` | EasyOCR | Multi-language OCR |
| `feat-ocr-tesserocr` | Tesseract OCR | High-accuracy OCR |
| `feat-ocr-mac` | macOS native OCR | macOS only |
### Models
| Extra | Description | Use Case |
|-------|-------------|----------|
| `models-local` | Local PyTorch models | GPU/CPU inference |
| `models-remote` | Remote model serving (Triton) | Production deployments |
| `models-onnxruntime` | ONNX Runtime acceleration | Optimized inference |
| `models-vlm-inline` | Vision Language Models | Image understanding, inline processing |
### Other features
| Extra | Description | Use Case |
|-------|-------------|----------|
| `feat-chunking` | Document chunking | RAG applications |
| `service-client` | Docling service client | Remote processing |
## License
MIT License - See [LICENSE](https://github.com/docling-project/docling/blob/main/LICENSE)
+7
View File
@@ -0,0 +1,7 @@
# Docling
This is a placeholder README for the `docling` meta-package.
For the full README, see the [root README.md](../../README.md) in the repository.
The actual README content is copied from the root during the build process.
+92
View File
@@ -0,0 +1,92 @@
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[project]
name = "docling"
version = "2.91.0" # DO NOT EDIT, updated automatically
description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
license = "MIT"
readme = "README.md"
requires-python = ">=3.10,<4.0"
keywords = [
    "docling",
    "convert",
    "document",
    "pdf",
    "docx",
    "html",
    "markdown",
    "layout model",
    "segmentation",
    "table structure",
    "table former",
]
classifiers = [
    "Operating System :: MacOS :: MacOS X",
    "Operating System :: POSIX :: Linux",
    "Operating System :: Microsoft :: Windows",
    "Development Status :: 5 - Production/Stable",
    "Intended Audience :: Developers",
    "Intended Audience :: Science/Research",
    "Topic :: Scientific/Engineering :: Artificial Intelligence",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
    "Programming Language :: Python :: 3.13",
    "Programming Language :: Python :: 3.14",
]
authors = [
    { name = "Christoph Auer", email = "cau@zurich.ibm.com" },
    { name = "Michele Dolfi", email = "dol@zurich.ibm.com" },
    { name = "Maxim Lysak", email = "mly@zurich.ibm.com" },
    { name = "Nikos Livathinos", email = "nli@zurich.ibm.com" },
    { name = "Ahmed Nassar", email = "ahn@zurich.ibm.com" },
    { name = "Panos Vagenas", email = "pva@zurich.ibm.com" },
    { name = "Peter Staar", email = "taa@zurich.ibm.com" },
]
# This is a meta-package. Its only runtime requirement is docling-slim with
# the `standard` extra, which also brings in the CLI dependencies.
# The importable `docling` module and the CLI entry points are declared by
# docling-slim, not here.
dependencies = [
    "docling-slim[standard]==2.91.0",
]

[project.urls]
homepage = "https://github.com/docling-project/docling"
repository = "https://github.com/docling-project/docling"
issues = "https://github.com/docling-project/docling/issues"
changelog = "https://github.com/docling-project/docling/blob/main/CHANGELOG.md"

# Each extra forwards to one docling-slim extra, pinned to the same version as
# the base dependency above.
[project.optional-dependencies]
easyocr = ["docling-slim[feat-ocr-easyocr]==2.91.0"]
tesserocr = ["docling-slim[feat-ocr-tesserocr]==2.91.0"]
ocrmac = ["docling-slim[feat-ocr-mac]==2.91.0"]
vlm = ["docling-slim[models-vlm-inline]==2.91.0"]
rapidocr = ["docling-slim[feat-ocr-rapidocr-onnx]==2.91.0"]
chunking = ["docling-slim[feat-chunking]==2.91.0"]
format-audio = ["docling-slim[format-audio]==2.91.0"]
format-html-render = ["docling-slim[format-html-render]==2.91.0"]
models-remote = ["docling-slim[models-remote]==2.91.0"]
models-onnxruntime = ["docling-slim[models-onnxruntime]==2.91.0"]
format-xml-xbrl = ["docling-slim[format-xml-xbrl]==2.91.0"]

# The wheel carries dependencies only and ships no Python modules. All source
# is packaged in the docling-slim wheel (built from the repo root), so the two
# wheels can no longer both install a `docling/` module and collide.
[tool.hatch.build.targets.wheel]
bypass-selection = true

[tool.hatch.build.targets.sdist]
only-include = ["pyproject.toml", "README.md"]

[tool.uv]
package = true

# Local development: resolve docling-slim from the uv workspace.
[tool.uv.sources]
docling-slim = { workspace = true }
+223 -118
View File
@@ -1,7 +1,11 @@
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"
[project]
name = "docling"
name = "docling-slim"
version = "2.91.0" # DO NOT EDIT, updated automatically
description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
description = "Modular version of the Docling package: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
license = "MIT"
keywords = [
"docling",
@@ -13,8 +17,6 @@ keywords = [
"markdown",
"layout model",
"segmentation",
"table structure",
"table former",
]
classifiers = [
"Operating System :: MacOS :: MacOS X",
@@ -31,7 +33,7 @@ classifiers = [
"Programming Language :: Python :: 3.13",
"Programming Language :: Python :: 3.14",
]
readme = "README.md"
readme = "packages/docling-slim/README.md"
authors = [
{ name = "Christoph Auer", email = "cau@zurich.ibm.com" },
{ name = "Michele Dolfi", email = "dol@zurich.ibm.com" },
@@ -42,40 +44,17 @@ authors = [
{ name = "Peter Staar", email = "taa@zurich.ibm.com" },
]
requires-python = '>=3.10,<4.0'
# MINIMAL BASE (8 packages) - ~50MB
dependencies = [
'pydantic (>=2.0.0,<3.0.0)',
'docling-core[chunking] (>=2.73.0,<3.0.0)',
'docling-parse (>=5.3.2,<6.0.0)',
'docling-ibm-models>=3.13.0,<4',
'torch (>=2.2.2,<3.0.0)',
'torchvision (>=0,<1)',
'filetype (>=1.2.0,<2.0.0)',
'pypdfium2 (>=4.30.0,!=4.30.1,<6.0.0)',
'pydantic-settings (>=2.3.0,<3.0.0)',
'huggingface_hub (>=0.23,<2)',
'httpx (>=0.28,<1.0.0)',
'requests (>=2.32.2,<3.0.0)',
'ocrmac (>=1.0.0,<2.0.0) ; sys_platform == "darwin"',
'rapidocr (>=3.8,<4.0.0)',
'certifi (>=2024.7.4)',
'rtree (>=1.3.0,<2.0.0)',
'typer (>=0.12.5,<0.22.0)',
'python-docx (>=1.1.2,<2.0.0)',
'python-pptx (>=1.0.2,<2.0.0)',
'beautifulsoup4 (>=4.12.3,<5.0.0)',
'pandas (>=2.1.4,<4.0.0)',
'marko (>=2.1.2,<3.0.0)',
'openpyxl (>=3.1.5,<4.0.0)',
'lxml (>=4.0.0,<7.0.0)',
'pillow (>=10.0.0,<13.0.0)',
'tqdm (>=4.65.0,<5.0.0)',
'pluggy (>=1.0.0,<2.0.0)',
'pylatexenc (>=2.10,<3.0)',
'scipy (>=1.6.0,<2.0.0)',
"accelerate>=1.0.0,<2",
"polyfactory>=2.22.2",
"defusedxml (>=0.7.1, <0.8.0)",
"websockets (>=14.0,<17.0)",
'pydantic>=2.0.0,<3.0.0',
'docling-core>=2.73.0,<3.0.0',
'pydantic-settings>=2.3.0,<3.0.0',
'filetype>=1.2.0,<2.0.0',
'requests>=2.32.2,<3.0.0',
'certifi>=2024.7.4',
'pluggy>=1.0.0,<2.0.0',
'tqdm>=4.65.0,<5.0.0',
]
[project.urls]
@@ -87,66 +66,207 @@ changelog = "https://github.com/docling-project/docling/blob/main/CHANGELOG.md"
[project.entry-points.docling]
"docling_defaults" = "docling.models.plugins.defaults"
# CLI scripts (require cli extra: pip install docling-slim[cli])
[project.scripts]
docling = "docling.cli.main:app"
docling-tools = "docling.cli.tools:app"
[project.optional-dependencies]
easyocr = ['easyocr (>=1.7,<2.0)']
tesserocr = ['tesserocr (>=2.7.1,<3.0.0)']
ocrmac = ['ocrmac (>=1.0.0,<2.0.0) ; sys_platform == "darwin"']
htmlrender = ["playwright>=1.58.0"]
vlm = [
'transformers (>=4.42.0,<6.0.0,!=5.0.*,!=5.1.*,!=5.2.*,!=5.3.*)',
'accelerate (>=1.2.1,<2.0.0)',
'mlx-vlm (>=0.4.3,<1.0.0) ; python_version >= "3.10" and sys_platform == "darwin" and platform_machine == "arm64"',
# 'vllm (>=0.10.0,<1.0.0) ; python_version >= "3.10" and python_version < "3.14" and sys_platform == "linux" and platform_machine == "x86_64"',
"qwen-vl-utils>=0.0.11",
"peft>=0.18.1",
# ============================================================================
# CORE COMPONENTS
# ============================================================================
convert-core = [
'numpy>=1.24.0,<3.0.0',
'pillow>=10.0.0,<13.0.0',
'rtree>=1.3.0,<2.0.0',
'scipy>=1.6.0,<2.0.0',
]
rapidocr = [
'rapidocr (>=3.8,<4.0.0)',
'onnxruntime (>=1.7.0,<2.0.0) ; python_version < "3.14"',
extract-core = [
'docling-slim[convert-core]',
'polyfactory>=2.22.2',
]
onnxruntime = [
'onnxruntime (<1.24) ; python_version < "3.14" and sys_platform == "darwin"',
'onnxruntime-gpu (<1.24) ; python_version < "3.14" and (sys_platform == "linux" or sys_platform == "win32")',
# ============================================================================
# FORMAT SUPPORT
# ============================================================================
# --- PDF Formats ---
format-pdf-pypdfium2 = [
'pypdfium2>=4.30.0,!=4.30.1,<6.0.0',
]
asr = [
'mlx-whisper>=0.4.3 ; python_version >= "3.10" and sys_platform == "darwin" and platform_machine == "arm64"',
'openai-whisper>=20250625',
'numba>=0.63.0', # Ensure Python 3.11+ compatibility (llvmlite constraint)
format-pdf-docling = [
'pypdfium2>=4.30.0,!=4.30.1,<6.0.0',
'docling-parse>=5.3.2,<6.0.0',
]
xbrl = [
"arelle-release (>=2.38.17,<3.0.0)",
format-pdf = [
'docling-slim[format-pdf-pypdfium2,format-pdf-docling]',
]
remote-serving = [
'tritonclient[grpc] (>=2.65.0,<3.0.0)',
# --- Office Formats (office = docx + pptx + xlsx) ---
format-docx = [
'python-docx>=1.1.2,<2.0.0',
]
format-pptx = [
'python-pptx>=1.0.2,<2.0.0',
]
format-xlsx = [
'openpyxl>=3.1.5,<4.0.0',
]
format-office = [
'docling-slim[format-docx,format-pptx,format-xlsx]',
]
# --- Web Formats (web = html + markdown) ---
format-html = [
'beautifulsoup4>=4.12.3,<5.0.0',
'lxml>=4.0.0,<7.0.0',
]
format-markdown = [
'marko>=2.1.2,<3.0.0',
]
format-web = [
'docling-slim[format-html,format-markdown]',
]
# --- Other Formats ---
format-latex = [
'pylatexenc>=2.10,<3.0',
]
format-xml-xbrl = [
'arelle-release>=2.38.17,<3.0.0',
]
format-html-render = [
'playwright>=1.58.0',
]
format-audio = [
'mlx-whisper>=0.4.3 ; python_version >= "3.10" and sys_platform == "darwin" and platform_machine == "arm64"',
'openai-whisper>=20250625',
'numba>=0.63.0',
]
# ============================================================================
# OCR ENGINES (feat-ocr-*)
# ============================================================================
feat-ocr-rapidocr = [
'rapidocr>=3.8,<4.0.0',
]
feat-ocr-rapidocr-onnx = [
'rapidocr>=3.8,<4.0.0',
'onnxruntime>=1.7.0,<2.0.0 ; python_version < "3.14"',
]
feat-ocr-easyocr = [
'easyocr>=1.7,<2.0',
# easyocr declares scikit-image with no lower bound; without this pin,
# resolvers on Python 3.10 backtrack to 0.16.2 (2019), which has no
# Py3.10 wheels and fails to build from source.
'scikit-image>=0.19',
]
feat-ocr-tesserocr = [
'tesserocr>=2.7.1,<3.0.0',
'pandas>=2.1.4,<4.0.0',
]
feat-ocr-mac = [
'ocrmac>=1.0.0,<2.0.0 ; sys_platform == "darwin"',
]
# ============================================================================
# MODELS
# ============================================================================
models-local = [
'torch>=2.2.2,<3.0.0',
'torchvision>=0,<1',
'docling-ibm-models>=3.13.0,<4',
'accelerate>=1.0.0,<2',
'huggingface_hub>=0.23,<2',
'defusedxml>=0.7.1,<0.8.0',
]
models-remote = [
'tritonclient[grpc]>=2.65.0,<3.0.0',
]
models-onnxruntime = [
'onnxruntime<1.24 ; python_version < "3.14" and sys_platform == "darwin"',
'onnxruntime-gpu<1.24 ; python_version < "3.14" and (sys_platform == "linux" or sys_platform == "win32")',
]
# Vision Language Models for inline processing
models-vlm-inline = [
'transformers>=4.42.0,<6.0.0,!=5.0.*,!=5.1.*,!=5.2.*,!=5.3.*',
'accelerate>=1.2.1,<2.0.0',
'mlx-vlm>=0.4.3,<1.0.0 ; python_version >= "3.10" and sys_platform == "darwin" and platform_machine == "arm64"',
'qwen-vl-utils>=0.0.11',
'peft>=0.18.1',
]
# ============================================================================
# FEATURES
# ============================================================================
# Document chunking for RAG applications
feat-chunking = [
'docling-core[chunking]>=2.73.0,<3.0.0',
]
service-client = [
'httpx>=0.28,<1.0.0',
'websockets>=14.0,<17.0',
]
cli = [
'typer>=0.12.5,<0.22.0',
'rich>=13.0.0',
]
# ============================================================================
# CONVENIENCE BUNDLES
# ============================================================================
standard = [
'docling-slim[format-pdf,models-local,feat-ocr-rapidocr,format-office,format-web,format-latex,feat-chunking,extract-core,service-client,cli]',
]
all = [
'docling-slim[standard,models-vlm-inline,format-audio,format-html-render,format-xml-xbrl,models-remote,models-onnxruntime,feat-ocr-easyocr,feat-ocr-tesserocr,feat-ocr-mac]',
]
[dependency-groups]
dev = [
"pre-commit~=3.7",
"mypy~=1.10",
"types-setuptools~=70.3",
"pandas-stubs~=2.1",
"types-openpyxl~=3.1",
"types-requests~=2.31",
"boto3-stubs~=1.37",
"types-urllib3~=1.26",
"types-tqdm~=4.67",
"coverage~=7.6",
"pytest~=8.3",
"pytest-cov>=6.1.1",
"pytest-dependency~=0.6",
"pytest-durations~=1.6.1",
"pytest-xdist~=3.3",
"ipykernel~=6.29",
"ipywidgets~=8.1",
"nbqa~=1.9",
"python-semantic-release~=7.32",
"types-defusedxml (>=0.7.0.20250822, <0.8.0)",
"pre-commit~=3.7",
"mypy~=1.10",
"types-setuptools~=70.3",
"pandas-stubs~=2.1",
"types-openpyxl~=3.1",
"types-requests~=2.31",
"boto3-stubs~=1.37",
"types-urllib3~=1.26",
"types-tqdm~=4.67",
"coverage~=7.6",
"pytest~=8.3",
"pytest-cov>=6.1.1",
"pytest-dependency~=0.6",
"pytest-durations~=1.6.1",
"pytest-xdist~=3.3",
"ipykernel~=6.29",
"ipywidgets~=8.1",
"nbqa~=1.9",
"python-semantic-release~=7.32",
"types-defusedxml>=0.7.0.20250822,<0.8.0",
]
docs = [
"mkdocs-material~=9.5",
"mkdocs-jupyter>=0.25,<0.26",
@@ -155,6 +275,7 @@ docs = [
"mkdocstrings[python]~=0.27",
"griffe-pydantic~=1.1",
]
examples = [
"datasets~=2.21",
"python-dotenv~=1.0",
@@ -162,53 +283,52 @@ examples = [
"langchain-milvus~=0.1",
"langchain-text-splitters>=0.2",
"modelscope>=1.29.0",
'gliner>=0.2.21 ; python_version < "3.14"', # gliner depends on onnxruntime which is not available on py3.14
]
constraints = [
'numba >=0.63.0',
'langchain-core >=0.3.81',
'pandas (>=2.1.4,<3.0.0); python_version < "3.11"',
'pandas (>=2.1.4,<4.0.0); python_version >= "3.11"',
'gliner>=0.2.21 ; python_version < "3.14"',
]
constraints = [
'numba>=0.63.0',
'langchain-core>=0.3.81',
'pandas>=2.1.4,<3.0.0 ; python_version < "3.11"',
'pandas>=2.1.4,<4.0.0 ; python_version >= "3.11"',
]
[tool.uv.workspace]
members = ["packages/docling"]
[tool.uv.sources]
docling = { workspace = true }
[tool.uv]
package = true
default-groups = "all"
[tool.setuptools.packages.find]
include = ["docling*"]
[tool.hatch.build.targets.wheel]
packages = ["docling"]
[tool.hatch.build.targets.sdist]
only-include = ["docling", "pyproject.toml", "README.md", "LICENSE"]
[tool.ruff]
target-version = "py310"
line-length = 88
respect-gitignore = true
# extend-exclude = [
# "tests",
# ]
[tool.ruff.format]
skip-magic-trailing-comma = false
[tool.ruff.lint]
select = [
# "B", # flake8-bugbear
"C", # flake8-comprehensions
"C9", # mccabe
# "D", # flake8-docstrings
"E", # pycodestyle errors (default)
"F", # pyflakes (default)
"I", # isort
"PD", # pandas-vet
"PIE", # pie
# "PTH", # pathlib
"Q", # flake8-quotes
# "RET", # return
"RUF", # Enable all ruff-specific checks
# "SIM", # simplify
"S307", # eval
# "T20", # (disallow print statements) keep debugging statements out of the codebase
"W", # pycodestyle warnings
"ASYNC", # async
"UP", # pyupgrade
@@ -227,38 +347,23 @@ ignore = [
"UP035", # `typing.Set` is deprecated, use `set` instead"
]
#extend-select = []
[tool.ruff.lint.pep8-naming]
classmethod-decorators = [
# Allow Pydantic's `@validator` decorator to trigger class method treatment.
"pydantic.validator",
]
[tool.ruff.lint.per-file-ignores]
"__init__.py" = ["E402", "F401"]
"tests/*.py" = ["ASYNC"] # Disable ASYNC check for tests
"tests/*.py" = ["ASYNC"]
[tool.ruff.lint.mccabe]
max-complexity = 30
# [tool.ruff.lint.isort.sections]
# "docling" = ["docling_core", "docling_ibm_models", "docling_parse"]
[tool.ruff.lint.isort]
combine-as-imports = true
# section-order = [
# "future",
# "standard-library",
# "third-party",
# "docling",
# "first-party",
# "local-folder",
# ]
[tool.mypy]
pretty = true
# strict = true
no_implicit_optional = true
plugins = "pydantic.mypy"
python_version = "3.10"
Generated
+1630 -1374
View File
File diff suppressed because it is too large Load Diff