[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[project]
name = "docling-slim"
version = "2.93.0"  # DO NOT EDIT, updated automatically
description = "Modular version of the Docling package: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
license = "MIT"
keywords = [
    "docling",
    "convert",
    "document",
    "pdf",
    "docx",
    "html",
    "markdown",
    "layout model",
    "segmentation",
]
classifiers = [
    "Operating System :: MacOS :: MacOS X",
    "Operating System :: POSIX :: Linux",
    "Operating System :: Microsoft :: Windows",
    "Development Status :: 5 - Production/Stable",
    "Intended Audience :: Developers",
    "Intended Audience :: Science/Research",
    "Topic :: Scientific/Engineering :: Artificial Intelligence",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
    "Programming Language :: Python :: 3.13",
    "Programming Language :: Python :: 3.14",
]
readme = "packages/docling-slim/README.md"
authors = [
    { name = "Christoph Auer", email = "cau@zurich.ibm.com" },
    { name = "Michele Dolfi", email = "dol@zurich.ibm.com" },
    { name = "Maxim Lysak", email = "mly@zurich.ibm.com" },
    { name = "Nikos Livathinos", email = "nli@zurich.ibm.com" },
    { name = "Ahmed Nassar", email = "ahn@zurich.ibm.com" },
    { name = "Panos Vagenas", email = "pva@zurich.ibm.com" },
    { name = "Peter Staar", email = "taa@zurich.ibm.com" },
]
requires-python = '>=3.10,<4.0'
# MINIMAL BASE (8 packages) - ~50MB
# Everything beyond these eight requirements is opt-in through the extras
# declared under [project.optional-dependencies].
dependencies = [
    'pydantic>=2.0.0,<3.0.0',
    'docling-core>=2.73.0,<3.0.0',
    'pydantic-settings>=2.3.0,<3.0.0',
    'filetype>=1.2.0,<2.0.0',
    'requests>=2.32.2,<3.0.0',
    'certifi>=2024.7.4',
    'pluggy>=1.0.0,<2.0.0',
    'tqdm>=4.65.0,<5.0.0',
]

[project.urls]
homepage = "https://github.com/docling-project/docling"
repository = "https://github.com/docling-project/docling"
issues = "https://github.com/docling-project/docling/issues"
changelog = "https://github.com/docling-project/docling/blob/main/CHANGELOG.md"

[project.entry-points.docling]
docling_defaults = "docling.models.plugins.defaults"

# CLI scripts (require cli extra: pip install docling-slim[cli])
[project.scripts]
docling = "docling.cli.main:app"
docling-tools = "docling.cli.tools:app"

# Extras are layered: fine-grained extras list third-party requirements, and
# aggregate extras pull those in through self-references of the form
# 'docling-slim[extra-a,extra-b]'.
[project.optional-dependencies]
# ============================================================================
# CORE COMPONENTS
# ============================================================================
convert-core = [
    'numpy>=1.24.0,<3.0.0',
    'pillow>=10.0.0,<13.0.0',
    'rtree>=1.3.0,<2.0.0',
    'scipy>=1.6.0,<2.0.0',
]
extract-core = [
    'docling-slim[convert-core]',
    'polyfactory>=2.22.2',
]

# ============================================================================
# FORMAT SUPPORT
# ============================================================================
# --- PDF Formats ---
format-pdf-pypdfium2 = [
    'pypdfium2>=4.30.0,!=4.30.1,<6.0.0',
]
format-pdf-docling = [
    'pypdfium2>=4.30.0,!=4.30.1,<6.0.0',
    'docling-parse>=5.3.2,<6.0.0',
]
format-pdf = [
    'docling-slim[format-pdf-pypdfium2,format-pdf-docling]',
]

# --- Office Formats (office = docx + pptx + xlsx) ---
format-docx = [
    'python-docx>=1.1.2,<2.0.0',
]
format-pptx = [
    'python-pptx>=1.0.2,<2.0.0',
]
format-xlsx = [
    'openpyxl>=3.1.5,<4.0.0',
]
format-office = [
    'docling-slim[format-docx,format-pptx,format-xlsx]',
]

# --- Web Formats (web = html + markdown) ---
format-html = [
    'beautifulsoup4>=4.12.3,<5.0.0',
    'lxml>=4.0.0,<7.0.0',
]
format-markdown = [
    'marko>=2.1.2,<3.0.0',
]
format-web = [
    'docling-slim[format-html,format-markdown]',
]

# --- Other Formats ---
format-latex = [
    'pylatexenc>=2.10,<3.0',
]
format-xml-xbrl = [
    'arelle-release>=2.38.17,<3.0.0',
]
format-html-render = [
    'playwright>=1.58.0',
]
format-audio = [
    # The marker restricts mlx-whisper to macOS on arm64.
    'mlx-whisper>=0.4.3 ; python_version >= "3.10" and sys_platform == "darwin" and platform_machine == "arm64"',
    'openai-whisper>=20250625',
    'numba>=0.63.0',
]

# ============================================================================
# OCR ENGINES (feat-ocr-*)
# ============================================================================
feat-ocr-rapidocr = [
    'rapidocr>=3.8,<4.0.0',
]
feat-ocr-rapidocr-onnx = [
    'rapidocr>=3.8,<4.0.0',
    'onnxruntime>=1.7.0,<2.0.0 ; python_version < "3.14"',
]
feat-ocr-easyocr = [
    'easyocr>=1.7,<2.0',
    # easyocr declares scikit-image with no lower bound; without this pin,
    # resolvers on Python 3.10 backtrack to 0.16.2 (2019), which has no
    # Py3.10 wheels and fails to build from source.
    'scikit-image>=0.19',
]
feat-ocr-tesserocr = [
    'tesserocr>=2.7.1,<3.0.0',
    'pandas>=2.1.4,<4.0.0',
]
feat-ocr-mac = [
    'ocrmac>=1.0.0,<2.0.0 ; sys_platform == "darwin"',
]

# ============================================================================
# MODELS
# ============================================================================
models-local = [
    'torch>=2.2.2,<3.0.0',
    'torchvision>=0,<1',
    'docling-ibm-models>=3.13.0,<4',
    'accelerate>=1.0.0,<2',
    'huggingface_hub>=0.23,<2',
    'defusedxml>=0.7.1,<0.8.0',
]
models-remote = [
    'tritonclient[grpc]>=2.65.0,<3.0.0',
]
# The markers select the plain onnxruntime build on macOS and the GPU build on
# Linux and Windows; neither is requested on Python 3.14 or newer.
models-onnxruntime = [
    'onnxruntime<1.24 ; python_version < "3.14" and sys_platform == "darwin"',
    'onnxruntime-gpu<1.24 ; python_version < "3.14" and (sys_platform == "linux" or sys_platform == "win32")',
]
# Vision Language Models for inline processing
models-vlm-inline = [
    'transformers>=4.42.0,<6.0.0,!=5.0.*,!=5.1.*,!=5.2.*,!=5.3.*',
    'accelerate>=1.2.1,<2.0.0',
    'mlx-vlm>=0.4.3,<1.0.0 ; python_version >= "3.10" and sys_platform == "darwin" and platform_machine == "arm64"',
    'qwen-vl-utils>=0.0.11',
    'peft>=0.18.1',
]

# ============================================================================
# FEATURES
# ============================================================================
# Document chunking for RAG applications
feat-chunking = [
    'docling-core[chunking]>=2.73.0,<3.0.0',
]
service-client = [
    'httpx>=0.28,<1.0.0',
    'websockets>=14.0,<17.0',
]
cli = [
    'typer>=0.12.5,<0.22.0',
    'rich>=13.0.0',
]

# ============================================================================
# CONVENIENCE BUNDLES
# ============================================================================
standard = [
    'docling-slim[format-pdf,models-local,feat-ocr-rapidocr,format-office,format-web,format-latex,feat-chunking,extract-core,service-client,cli]',
]
# `all` is `standard` plus the remaining format, model and OCR extras.
all = [
    'docling-slim[standard,models-vlm-inline,format-audio,format-html-render,format-xml-xbrl,models-remote,models-onnxruntime,feat-ocr-easyocr,feat-ocr-tesserocr,feat-ocr-mac]',
]

[dependency-groups]
typecheck = [
    "ty==0.0.33",
    "types-setuptools~=70.3",
    "pandas-stubs~=2.1",
    "types-openpyxl~=3.1",
    "types-requests~=2.31",
    "boto3-stubs~=1.37",
    "types-urllib3~=1.26",
    "types-tqdm~=4.67",
    "types-defusedxml>=0.7.0.20250822,<0.8.0",
]
pr-fast-checks = [
    { include-group = "typecheck" },
    "docling-core[chunking] (>=2.73.0,<3.0.0)",
    "pydantic (>=2.0.0,<3.0.0)",
    "pydantic-settings (>=2.3.0,<3.0.0)",
    "ruff==0.15.12",
]
dev = [
    { include-group = "typecheck" },
    "dprint-py==0.53.1.0",
    "prek>=0.3.10",
    "tach>=0.34.0,<1",
    "coverage~=7.6",
    "pytest~=8.3",
    "pytest-cov>=6.1.1",
    "pytest-dependency~=0.6",
    "pytest-durations~=1.6.1",
    "pytest-xdist~=3.3",
    "ipykernel~=6.29",
    "ipywidgets~=8.1",
    "nbqa~=1.9",
    "python-semantic-release~=7.32",
]
docs = [
    "mkdocs-material~=9.5",
    "mkdocs-jupyter>=0.25,<0.26",
    "mkdocs-click~=0.8",
    "mkdocs-redirects~=1.2",
    "mkdocstrings[python]~=0.27",
    "griffe-pydantic~=1.1",
]
examples = [
    "datasets~=2.21",
    "python-dotenv~=1.0",
    "langchain-huggingface>=0.0.3",
    "langchain-milvus~=0.1",
    "langchain-text-splitters>=0.2",
    "modelscope>=1.29.0",
    'gliner>=0.2.21 ; python_version < "3.14"',
]
constraints = [
    'numba>=0.63.0',
    'langchain-core>=0.3.81',
    # The pandas upper bound is <3.0.0 on Python 3.10 and <4.0.0 from 3.11 on.
    'pandas>=2.1.4,<3.0.0 ; python_version < "3.11"',
    'pandas>=2.1.4,<4.0.0 ; python_version >= "3.11"',
]

[tool.uv]
package = true
default-groups = "all"

[tool.uv.workspace]
members = ["packages/docling"]

[tool.uv.sources]
docling = { workspace = true }

[tool.hatch.build.targets.wheel]
packages = ["docling"]

[tool.hatch.build.targets.sdist]
only-include = ["docling", "pyproject.toml", "README.md", "LICENSE"]

[tool.pytest.ini_options]
markers = [
    "ml_ocr: OCR tests that download or execute OCR model code.",
    "ml_pdf_model: PDF conversion tests that download or execute document AI model code.",
    "ml_vlm: VLM and picture-description tests that download or execute vision-language model code.",
    "ml_asr: ASR tests that download or execute speech model code.",
    "cross_platform: Lightweight smoke tests run on Windows and macOS workflow-dispatch lanes.",
]

[tool.ruff]
target-version = "py310"
line-length = 88
respect-gitignore = true

[tool.ruff.format]
skip-magic-trailing-comma = false

[tool.ruff.lint]
select = [
    "C",      # flake8-comprehensions
    "C9",     # mccabe
    "E",      # pycodestyle errors (default)
    "F",      # pyflakes (default)
    "I",      # isort
    "PD",     # pandas-vet
    "PIE",    # flake8-pie
    "Q",      # flake8-quotes
    "RUF",    # all ruff-specific checks
    "S307",   # use of eval
    "W",      # pycodestyle warnings
    "ASYNC",  # flake8-async
    "UP",     # pyupgrade
]
ignore = [
    "C408",    # Unnecessary `dict()` call (rewrite as a literal)
    "E501",    # Line too long, handled by ruff formatter
    "D107",    # Missing docstring in `__init__`
    "F401",    # Imported but unused
    "F811",    # Redefinition of the same function
    "PL",      # Pylint
    "RUF012",  # Mutable class attributes
    "RUF059",  # Unused unpacked variables
    "UP006",   # `List` vs `list`, etc.
    "UP007",   # `Optional` and `Union`
    "UP035",   # `typing.Set` is deprecated, use `set` instead
    "UP045",   # `Optional[T]` vs `T | None`
]

[tool.ruff.lint.pep8-naming]
classmethod-decorators = [
    "pydantic.validator",
]

[tool.ruff.lint.per-file-ignores]
"__init__.py" = ["E402", "F401"]
"tests/*.py" = ["ASYNC"]

[tool.ruff.lint.mccabe]
max-complexity = 30

[tool.ruff.lint.isort]
combine-as-imports = true

[tool.ty.rules]
all = "warn"

[tool.ty.environment]
python-version = "3.10"
[tool.ty.src]
include = ["docling", ".github/scripts"]

[tool.ty.analysis]
# Import patterns that ty is allowed to leave unresolved.
allowed-unresolved-imports = [
    "arelle.**",
    "docling_ibm_models.**",
    "docling_parse.**",
    "easyocr.**",
    "filetype.**",
    "huggingface_hub.**",
    "lxml.**",
    "mlx_vlm.**",
    "networkx.**",
    "ocrmac.**",
    "onnxruntime.**",
    "pylatexenc.**",
    "pypdfium2.**",
    "qwen_vl_utils.**",
    "scipy.**",
    "tesserocr.**",
    "torchvision.**",
    "transformers.**",
    "vllm.**",
    "websockets.**",
]

[tool.ty.terminal]
output-format = "concise"

[tool.semantic_release]
# Default values for the settings below are listed in:
# https://github.com/python-semantic-release/python-semantic-release/blob/v7.32.2/semantic_release/defaults.cfg
version_source = "tag_only"
branch = "main"

# Commit types that trigger minor and patch version bumps, respectively.
# Both lists must be subsets of the configured allowed types.
parser_angular_allowed_types = "build,chore,ci,docs,feat,fix,perf,style,refactor,test"
parser_angular_minor_types = "feat"
parser_angular_patch_types = "fix,perf"