add pyproject and ci

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
Michele Dolfi
2025-02-26 11:31:31 +01:00
parent c861429823
commit 9b905728e1
16 changed files with 3924 additions and 34 deletions
+12
View File
@@ -0,0 +1,12 @@
<!-- Thank you for contributing to Docling! -->
<!-- STEPS TO FOLLOW:
1. Add a description of the changes (frequently the same as the commit description)
2. Enter the issue number next to "Resolves #" below (if there is no tracking issue resolved, **remove that section**)
3. Make sure the PR title follows the **Commit Message Formatting**: https://www.conventionalcommits.org/en/v1.0.0/#summary.
-->
<!-- Uncomment this section with the issue number if an issue is being resolved
**Issue resolved by this Pull Request:**
Resolves #
--->
+23
View File
@@ -0,0 +1,23 @@
# Security and Disclosure Information Policy for the Docling Project
The Docling team and community take security bugs seriously. We appreciate your efforts to responsibly disclose your findings, and will make every effort to acknowledge your contributions.
## Reporting a Vulnerability
If you think you've identified a security issue in a Docling project repository, please DO NOT report the issue publicly via the GitHub issue tracker, etc.
Instead, send an email with as many details as possible to [deepsearch-core@zurich.ibm.com](mailto:deepsearch-core@zurich.ibm.com). This is a private mailing list for the maintainers team.
Please do not create a public issue.
## Security Vulnerability Response
Each report is acknowledged and analyzed by the core maintainers within 3 working days.
Any vulnerability information shared with core maintainers stays within the Docling project and will not be disseminated to other projects unless it is necessary to get the issue fixed.
After the initial reply to your report, the security team will keep you informed of the progress towards a fix and full announcement, and may ask for additional information or guidance.
## Security Alerts
We will send announcements of security vulnerabilities and steps to remediate on the [Docling announcements](https://github.com/DS4SD/docling-jobkit/discussions/categories/announcements).
+9
View File
@@ -0,0 +1,9 @@
# Merge protection: pull requests whose base branch is `main` must have a
# title that follows the Conventional Commits format.
merge_protections:
- name: Enforce conventional commit
description: Make sure that we follow https://www.conventionalcommits.org/en/v1.0.0/
if:
- base = main
success_conditions:
# The title has to start with one of the listed types, optionally followed by
# a parenthesised scope and a `!`, and then a colon.
- "title ~=
^(fix|feat|docs|style|refactor|perf|test|build|ci|chore|revert)(?:\\(.+\
\\))?(!)?:"
+39
View File
@@ -0,0 +1,39 @@
#!/bin/bash
# Release helper.
#
# Sets the version in pyproject.toml, prepends the unreleased notes to the
# changelog, commits and pushes both files to main, and finally creates the
# GitHub release (which also creates the Git tag).
#
# Environment:
#   TARGET_VERSION  version to release, without the leading "v" (required)
#   CHGLOG_FILE     path of the changelog file (default: CHANGELOG.md)

set -e  # trigger failure on error - do not remove!
set -o pipefail  # a failure anywhere in a pipeline must abort the script as well
set -x  # display command on output

if [ -z "${TARGET_VERSION}" ]; then
    >&2 echo "No TARGET_VERSION specified"
    exit 1
fi
CHGLOG_FILE="${CHGLOG_FILE:-CHANGELOG.md}"

# update package version
uvx --from=toml-cli toml set --toml-path=pyproject.toml project.version "${TARGET_VERSION}"

# collect release notes
REL_NOTES=$(mktemp)
# The notes are read again by `gh release create` at the very end, so the
# temporary file is only removed when the script exits.
trap 'rm -f "${REL_NOTES}"' EXIT
uv run --no-sync semantic-release changelog --unreleased >> "${REL_NOTES}"

# update changelog
TMP_CHGLOG=$(mktemp)
TARGET_TAG_NAME="v${TARGET_VERSION}"
RELEASE_URL="$(gh repo view --json url -q ".url")/releases/tag/${TARGET_TAG_NAME}"
# Separate assignment so that a failing `date` is caught by `set -e`;
# `+%F` prints the same YYYY-MM-DD value as GNU `-Idate`.
RELEASE_DATE="$(date +%F)"
# Values are passed as printf arguments, never as part of the format string,
# so a `%` or backslash in the tag or URL cannot corrupt the heading.
printf '## [%s](%s) - %s\n\n' "${TARGET_TAG_NAME}" "${RELEASE_URL}" "${RELEASE_DATE}" >> "${TMP_CHGLOG}"
cat "${REL_NOTES}" >> "${TMP_CHGLOG}"
if [ -f "${CHGLOG_FILE}" ]; then
    # keep the previous entries below the new one, separated by a blank line
    printf '\n' | cat - "${CHGLOG_FILE}" >> "${TMP_CHGLOG}"
fi
mv "${TMP_CHGLOG}" "${CHGLOG_FILE}"

# push changes
git config --global user.name 'github-actions[bot]'
git config --global user.email 'github-actions[bot]@users.noreply.github.com'
git add pyproject.toml "${CHGLOG_FILE}"
COMMIT_MSG="chore: bump version to ${TARGET_VERSION} [skip ci]"
git commit -m "${COMMIT_MSG}"
git push origin main

# create GitHub release (incl. Git tag)
gh release create "${TARGET_TAG_NAME}" -F "${REL_NOTES}"
+23
View File
@@ -0,0 +1,23 @@
# Lints the workflow definitions whenever a file under .github/ changes on
# main or in a pull request targeting main.
name: Lint GitHub Actions workflows
on:
push:
branches: ["main"]
paths:
- '.github/**'
pull_request:
branches: ["main"]
paths:
- '.github/**'
jobs:
actionlint:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
# Runs the download script taken from the main branch of the actionlint repository.
- name: Download actionlint
id: get_actionlint
run: bash <(curl https://raw.githubusercontent.com/rhysd/actionlint/main/scripts/download-actionlint.bash)
shell: bash
# The current directory is put first on PATH for the make target; presumably
# that is where the download step leaves the actionlint binary - TODO confirm.
- name: Check workflow files
run: PATH=".:$PATH" make action-lint
shell: bash
+59
View File
@@ -0,0 +1,59 @@
# Manually triggered release pipeline: run the shared checks, work out the next
# version, and run the release script when there is a version to release.
name: "Run CD"
on:
workflow_dispatch:
jobs:
# Same reusable checks that the CI workflow calls.
code-checks:
uses: ./.github/workflows/job-checks.yml
# Computes the version of the potential release and exposes it as the job
# output TARGET_TAG_V, which the release job reads.
pre-release-check:
runs-on: ubuntu-latest
outputs:
TARGET_TAG_V: ${{ steps.version_check.outputs.TRGT_VERSION }}
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0 # for fetching tags, required for semantic-release
- name: Install uv and set the python version
uses: astral-sh/setup-uv@v5
with:
enable-cache: true
- name: Install dependencies
run: uv sync --only-dev
- name: Check version of potential release
id: version_check
run: |
TRGT_VERSION=$(uv run --no-sync semantic-release print-version)
echo "TRGT_VERSION=${TRGT_VERSION}" >> "$GITHUB_OUTPUT"
echo "${TRGT_VERSION}"
- name: Check notes of potential release
run: uv run --no-sync semantic-release changelog --unreleased
# Skipped when pre-release-check produced an empty version.
release:
needs: [code-checks, pre-release-check]
if: needs.pre-release-check.outputs.TARGET_TAG_V != ''
environment: auto-release
runs-on: ubuntu-latest
concurrency: release
steps:
# The GitHub App token is used for the checkout and is passed to the release
# script as GH_TOKEN.
- uses: actions/create-github-app-token@v1
id: app-token
with:
app-id: ${{ vars.CI_APP_ID }}
private-key: ${{ secrets.CI_PRIVATE_KEY }}
- uses: actions/checkout@v4
with:
token: ${{ steps.app-token.outputs.token }}
fetch-depth: 0 # for fetching tags, required for semantic-release
- name: Install uv and set the python version
uses: astral-sh/setup-uv@v5
with:
enable-cache: true
- name: Install dependencies
run: uv sync --only-dev
- name: Run release script
env:
GH_TOKEN: ${{ steps.app-token.outputs.token }}
TARGET_VERSION: ${{ needs.pre-release-check.outputs.TARGET_TAG_V }}
CHGLOG_FILE: CHANGELOG.md
run: ./.github/scripts/release.sh
shell: bash
+16
View File
@@ -0,0 +1,16 @@
# Runs the shared code checks on every push to main and on every pull request
# targeting main.
name: "Run CI"

on:
  push:
    branches: ["main"]
  pull_request:
    branches: ["main"]

jobs:
  code-checks:
    uses: ./.github/workflows/job-checks.yml
    # Least privilege: the called workflow only checks out the repository and
    # runs pre-commit, so read access to the repository contents is enough.
    # The previous packages/attestations/id-token write grants were unused.
    permissions:
      contents: read
+22
View File
@@ -0,0 +1,22 @@
# Reusable workflow (workflow_call only); other workflows invoke it through
# `uses: ./.github/workflows/job-checks.yml`.
name: Run checks
on:
workflow_call:
jobs:
py-lint:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ['3.12']
steps:
- uses: actions/checkout@v4
- name: Install uv and set the python version
uses: astral-sh/setup-uv@v5
with:
python-version: ${{ matrix.python-version }}
enable-cache: true
# Installs every extra except cu124.
- name: Install dependencies
run: uv sync --all-extras --no-extra cu124
# Runs all configured pre-commit hooks against the whole repository.
- name: Run styling check
run: uv run --no-sync pre-commit run --all-files
+32
View File
@@ -0,0 +1,32 @@
# Builds the distribution and uploads it to PyPI whenever a GitHub release is
# published.
name: "Build and publish package"
on:
release:
types: [published]
permissions:
contents: read
jobs:
build-and-publish:
runs-on: ubuntu-latest
environment:
name: pypi
url: https://pypi.org/p/docling-jobkit # PyPI project page of this package
permissions:
id-token: write # IMPORTANT: mandatory for trusted publishing
steps:
- uses: actions/checkout@v4
- name: Install uv and set the python version
uses: astral-sh/setup-uv@v5
with:
enable-cache: true
# Installs every extra except cu124.
- name: Install dependencies
run: uv sync --all-extras --no-extra cu124
- name: Build
run: uv build
- name: Publish distribution 📦 to PyPI
uses: pypa/gh-action-pypi-publish@release/v1
with:
# currently not working with reusable workflows
# NOTE(review): this workflow is triggered by `release`, not `workflow_call` -
# confirm whether attestations still need to be disabled here.
attestations: false
-1
View File
@@ -284,7 +284,6 @@ tags
[Ll]ib
[Ll]ib64
[Ll]ocal
[Ss]cripts
pyvenv.cfg
pip-selfcheck.json
+24
View File
@@ -0,0 +1,24 @@
# pre-commit configuration. `fail_fast` stops the run at the first failing hook.
fail_fast: true
repos:
  - repo: https://github.com/astral-sh/ruff-pre-commit
    rev: v0.9.6
    hooks:
      # Run the Ruff linter.
      - id: ruff
        args: [--exit-non-zero-on-fix, --config=pyproject.toml]
      # Run the Ruff formatter.
      # - id: ruff-format
      #   args: [--config=pyproject.toml]
  - repo: local
    hooks:
      - id: system
        name: MyPy
        # Type-check the package of this repository; pyproject.toml packages
        # `docling_jobkit` (the previous `docling_serve` target belongs to a
        # different project).
        entry: uv run --no-sync mypy docling_jobkit
        pass_filenames: false
        language: system
        files: '\.py$'
  - repo: https://github.com/astral-sh/uv-pre-commit
    # uv version.
    rev: 0.6.1
    hooks:
      # Keeps uv.lock in sync with pyproject.toml.
      - id: uv-lock
+1
View File
@@ -0,0 +1 @@
3.12
+28 -33
View File
@@ -2,10 +2,10 @@
Our project welcomes external contributions. If you have an itch, please feel
free to scratch it.
To contribute code or documentation, please submit a [pull request](https://github.com/DS4SD/docling-pipelines/pulls).
To contribute code or documentation, please submit a [pull request](https://github.com/DS4SD/docling-jobkit/pulls).
A good way to familiarize yourself with the codebase and contribution process is
to look for and tackle low-hanging fruit in the [issue tracker](https://github.com/DS4SD/docling-pipelines/issues).
to look for and tackle low-hanging fruit in the [issue tracker](https://github.com/DS4SD/docling-jobkit/issues).
Before embarking on a more ambitious contribution, please quickly [get in touch](#communication) with us.
For general questions or support requests, please refer to the [discussion section](https://github.com/DS4SD/docling/discussions)
@@ -17,14 +17,14 @@ cannot be accepted at all!**
### Proposing New Features
If you would like to implement a new feature, please [raise an issue](https://github.com/DS4SD/docling-pipelines/issues)
If you would like to implement a new feature, please [raise an issue](https://github.com/DS4SD/docling-jobkit/issues)
before sending a pull request so the feature can be discussed. This is to avoid
you spending valuable time working on a feature that the project developers
are not interested in accepting into the codebase.
### Fixing Bugs
If you would like to fix a bug, please [raise an issue](https://github.com/DS4SD/docling/docling-pipelines) before sending a
If you would like to fix a bug, please [raise an issue](https://github.com/DS4SD/docling-jobkit/issues) before sending a
pull request so it can be tracked.
### Merge Approval
@@ -85,44 +85,39 @@ Please feel free to connect with us using the [discussion section](https://githu
## Developing
### Usage of Poetry
### Usage of `uv`
We use Poetry to manage dependencies.
We use `uv` to manage dependencies.
#### Installation
To install Poetry, follow the documentation here: https://python-poetry.org/docs/master/#installing-with-the-official-installer
To install `uv`, follow the documentation here: https://docs.astral.sh/uv/getting-started/installation/
1. Install Poetry globally on your machine:
1. Install `uv` globally on your machine:
```bash
curl -sSL https://install.python-poetry.org | python3 -
curl -LsSf https://astral.sh/uv/install.sh | sh
```
The installation script will print the installation bin folder `POETRY_BIN` which you need in the next steps.
2. Make sure Poetry is in your `$PATH`:
- for `zsh`:
```sh
echo 'export PATH="POETRY_BIN:$PATH"' >> ~/.zshrc
```
- for `bash`:
```sh
echo 'export PATH="POETRY_BIN:$PATH"' >> ~/.bashrc
```
3. The official guidelines linked above include useful details on configuring autocomplete for most shell environments, e.g., Bash and Zsh.
#### Create a Virtual Environment and Install Dependencies
To activate the Virtual Environment, run:
To create the Virtual Environment, run:
```bash
poetry shell
uv venv
```
This will spawn a shell with the Virtual Environment activated. If the Virtual Environment doesn't exist, Poetry will create one for you. Then, to install dependencies, run:
The virtual environment can be "activated" to make its packages available:
```bash
poetry install
source .venv/bin/activate
```
Then, to install dependencies, run:
```bash
uv sync
```
**(Advanced) Use a Specific Python Version**
@@ -130,25 +125,25 @@ poetry install
If you need to work with a specific (older) version of Python, run:
```bash
poetry env use $(which python3.8)
uv venv --python 3.11
```
This creates a Virtual Environment with Python 3.8. For other versions, replace `$(which python3.8)` with the path to the interpreter (e.g., `/usr/bin/python3.8`) or use `$(which pythonX.Y)`.
More detailed options are described in the [uv documentation](https://docs.astral.sh/uv/pip/environments).
#### Add a New Dependency
```bash
poetry add NAME
uv add NAME
```
## Coding Style Guidelines
## Coding style guidelines
We use the following tools to enforce code style:
- iSort, to sort imports
- Black, to format code
- ruff, to sort imports and format code
We run a series of checks on the codebase on every commit using `pre-commit`. To install the hooks, run:
We run a series of checks on the code base on every commit, using `pre-commit`. To install the hooks, run:
```bash
pre-commit install
@@ -156,8 +151,8 @@ pre-commit install
To run the checks on-demand, run:
```bash
```shell
pre-commit run --all-files
```
Note: Checks like `Black` and `isort` will "fail" if they modify files. This is because `pre-commit` doesn't like to see files modified by its hooks. In these cases, `git add` the modified files and `git commit` again.
Note: Formatting checks like `ruff` will "fail" if they modify files. This is because `pre-commit` doesn't like to see files modified by its hooks. In these cases, `git add` the modified files and `git commit` again.
View File
+212
View File
@@ -0,0 +1,212 @@
# Core package metadata.
[project]
name = "docling-jobkit"
version = "0.1.0" # DO NOT EDIT, updated automatically
description = "Running a distributed job processing documents with Docling."
readme = "README.md"
license = { text = "MIT" }
authors = [
    { name = "Michele Dolfi", email = "dol@zurich.ibm.com" },
    { name = "Viktor Kuropiatnyk", email = "vku@zurich.ibm.com" },
    { name = "Tiago Santana", email = "Tiago.Santana@ibm.com" },
    { name = "Cesar Berrospi Ramis", email = "ceb@zurich.ibm.com" },
    { name = "Panos Vagenas", email = "pva@zurich.ibm.com" },
    { name = "Christoph Auer", email = "cau@zurich.ibm.com" },
    { name = "Peter Staar", email = "taa@zurich.ibm.com" },
]
maintainers = [
    { name = "Michele Dolfi", email = "dol@zurich.ibm.com" },
    { name = "Cesar Berrospi Ramis", email = "ceb@zurich.ibm.com" },
    { name = "Panos Vagenas", email = "pva@zurich.ibm.com" },
    { name = "Christoph Auer", email = "cau@zurich.ibm.com" },
    { name = "Peter Staar", email = "taa@zurich.ibm.com" },
]
classifiers = [
    "License :: OSI Approved :: MIT License",
    "Operating System :: OS Independent",
    # "Development Status :: 5 - Production/Stable",
    "Intended Audience :: Developers",
    "Typing :: Typed",
    "Programming Language :: Python :: 3",
]
requires-python = ">=3.10"
# Runtime requirements, one PEP 508 string per line.
dependencies = [
    "ray~=2.30",
    "docling~=2.23",
    "typer~=0.12",
]
# Optional extras: two OCR engines and two torch flavours.
[project.optional-dependencies]
tesserocr = ["tesserocr~=2.7"]
rapidocr = [
    "rapidocr-onnxruntime~=1.4; python_version<'3.13'",
    "onnxruntime~=1.7",
]
# `cpu` and `cu124` list identical requirements; they differ only in the
# package index that `[tool.uv.sources]` assigns to each extra.
cpu = [
    "torch>=2.6.0",
    "torchvision>=0.21.0",
]
cu124 = [
    "torch>=2.6.0",
    "torchvision>=0.21.0",
]
# Development-only tooling (the group installed by `uv sync --only-dev`).
[dependency-groups]
dev = [
    "mypy~=1.11",
    "pre-commit~=3.8",
    "pytest~=8.3",
    "pytest-asyncio~=0.24",
    "pytest-check~=2.4",
    "python-semantic-release~=7.32",
    "ruff>=0.9.6",
]
# uv settings: treat the project as a package and declare the `cpu` and
# `cu124` extras as conflicting with each other.
[tool.uv]
package = true
conflicts = [[{ extra = "cpu" }, { extra = "cu124" }]]

# torch and torchvision come from the index that matches the selected extra.
[tool.uv.sources]
torch = [
    { index = "pytorch-cpu", extra = "cpu" },
    { index = "pytorch-cu124", extra = "cu124" },
]
torchvision = [
    { index = "pytorch-cpu", extra = "cpu" },
    { index = "pytorch-cu124", extra = "cu124" },
]

# Named indexes referenced by the sources table above.
[[tool.uv.index]]
name = "pytorch-cpu"
url = "https://download.pytorch.org/whl/cpu"
explicit = true

[[tool.uv.index]]
name = "pytorch-cu124"
url = "https://download.pytorch.org/whl/cu124"
explicit = true
# Package discovery: only `docling_jobkit` is included in the distribution.
[tool.setuptools.packages.find]
include = ["docling_jobkit"]
# Console entry point. The target must live in a packaged module: only
# `docling_jobkit` is included in the distribution, so the former
# `docling_pipeline.app:main` target could not be imported after installation.
# NOTE(review): confirm that `docling_jobkit/app.py` exposes `main`.
[project.scripts]
docling-jobkit = "docling_jobkit.app:main"
# Project links; the Documentation entry is currently disabled.
[project.urls]
Homepage = "https://github.com/DS4SD/docling-jobkit"
# Documentation = "https://ds4sd.github.io/docling"
Repository = "https://github.com/DS4SD/docling-jobkit"
Issues = "https://github.com/DS4SD/docling-jobkit/issues"
Changelog = "https://github.com/DS4SD/docling-jobkit/blob/main/CHANGELOG.md"
# Ruff configuration (the pre-commit hook passes `--config=pyproject.toml`).
[tool.ruff]
target-version = "py310"
line-length = 88
respect-gitignore = true
# extend-exclude = [
#     "tests",
# ]

[tool.ruff.format]
skip-magic-trailing-comma = false

[tool.ruff.lint]
select = [
    # "B",    # flake8-bugbear
    "C",      # flake8-comprehensions
    "C9",     # mccabe
    # "D",    # flake8-docstrings
    "E",      # pycodestyle errors (default)
    "F",      # pyflakes (default)
    "I",      # isort
    "PD",     # pandas-vet
    "PIE",    # pie
    # "PTH",  # pathlib
    "Q",      # flake8-quotes
    # "RET",  # return
    "RUF",    # Enable all ruff-specific checks
    # "SIM",  # simplify
    "S307",   # eval
    # "T20",  # (disallow print statements) keep debugging statements out of the codebase
    "W",      # pycodestyle warnings
    "ASYNC",  # async
]
ignore = [
    "E501",    # Line too long, handled by ruff formatter
    "D107",    # "Missing docstring in __init__",
    "F811",    # "redefinition of the same function"
    "PL",      # Pylint
    "RUF012",  # Mutable Class Attributes
]
# extend-select = []

[tool.ruff.lint.per-file-ignores]
"__init__.py" = ["E402", "F401"]
"tests/*.py" = ["ASYNC"] # Disable ASYNC check for tests

[tool.ruff.lint.mccabe]
max-complexity = 15

# Import sorting: a dedicated "docling" section sits between third-party and
# first-party imports.
[tool.ruff.lint.isort]
combine-as-imports = true
section-order = [
    "future",
    "standard-library",
    "third-party",
    "docling",
    "first-party",
    "local-folder",
]

# Modules that make up the custom "docling" section.
[tool.ruff.lint.isort.sections]
docling = ["docling", "docling_core"]
# mypy configuration; strict mode is currently disabled.
[tool.mypy]
pretty = true
# strict = true
no_implicit_optional = true
plugins = "pydantic.mypy"
python_version = "3.10"

# Missing-import errors are ignored for the OCR modules listed below.
[[tool.mypy.overrides]]
module = [
    "easyocr.*",
    "tesserocr.*",
    "rapidocr_onnxruntime.*",
]
ignore_missing_imports = true
# pytest configuration, including the pytest-asyncio settings.
[tool.pytest.ini_options]
asyncio_mode = "auto"
asyncio_default_fixture_loop_scope = "function"
minversion = "8.2"
testpaths = ["tests"]
addopts = "-rA --color=yes --tb=short --maxfail=5"
markers = ["asyncio"]
# python-semantic-release settings (the dev dependency group pins the 7.x series).
[tool.semantic_release]
# for default values check:
# https://github.com/python-semantic-release/python-semantic-release/blob/v7.32.2/semantic_release/defaults.cfg
# NOTE(review): "tag_only" presumably means the version is derived from Git
# tags alone - confirm against the v7 documentation.
version_source = "tag_only"
branch = "main"
# configure types which should trigger minor and patch version bumps respectively
# (note that they must be a subset of the configured allowed types):
parser_angular_allowed_types = "build,chore,ci,docs,feat,fix,perf,style,refactor,test"
parser_angular_minor_types = "feat"
parser_angular_patch_types = "fix,perf"
Generated
+3424
View File
File diff suppressed because it is too large Load Diff