From 4856cdf677771669cfb0419f5a1ebf31663a55d5 Mon Sep 17 00:00:00 2001 From: Michele Dolfi Date: Mon, 5 Aug 2024 15:49:13 +0200 Subject: [PATCH] add OSS release files Signed-off-by: Michele Dolfi --- CODE_OF_CONDUCT.md | 129 +++++++++++++++++++++++++++++++ CONTRIBUTING.md | 184 +++++++++++++++++++++++++++++++++++++++++++++ LICENSE | 21 ++++++ MAINTAINERS.md | 8 ++ README.md | 89 +++++++++++++++++----- pyproject.toml | 22 +++++- 6 files changed, 433 insertions(+), 20 deletions(-) create mode 100644 CODE_OF_CONDUCT.md create mode 100644 CONTRIBUTING.md create mode 100644 LICENSE create mode 100644 MAINTAINERS.md diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 0000000..c7d5ea1 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,129 @@ +# Contributor Covenant Code of Conduct + +## Our Pledge + +We as members, contributors, and leaders pledge to make participation in our +community a harassment-free experience for everyone, regardless of age, body +size, visible or invisible disability, ethnicity, sex characteristics, gender +identity and expression, level of experience, education, socio-economic status, +nationality, personal appearance, race, religion, or sexual identity +and orientation. + +We pledge to act and interact in ways that contribute to an open, welcoming, +diverse, inclusive, and healthy community. 
+ +## Our Standards + +Examples of behavior that contributes to a positive environment for our +community include: + +* Demonstrating empathy and kindness toward other people +* Being respectful of differing opinions, viewpoints, and experiences +* Giving and gracefully accepting constructive feedback +* Accepting responsibility and apologizing to those affected by our mistakes, + and learning from the experience +* Focusing on what is best not just for us as individuals, but for the + overall community + +Examples of unacceptable behavior include: + +* The use of sexualized language or imagery, and sexual attention or + advances of any kind +* Trolling, insulting or derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or email + address, without their explicit permission +* Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Enforcement Responsibilities + +Community leaders are responsible for clarifying and enforcing our standards of +acceptable behavior and will take appropriate and fair corrective action in +response to any behavior that they deem inappropriate, threatening, offensive, +or harmful. + +Community leaders have the right and responsibility to remove, edit, or reject +comments, commits, code, wiki edits, issues, and other contributions that are +not aligned to this Code of Conduct, and will communicate reasons for moderation +decisions when appropriate. + +## Scope + +This Code of Conduct applies within all community spaces, and also applies when +an individual is officially representing the community in public spaces. +Examples of representing our community include using an official e-mail address, +posting via an official social media account, or acting as an appointed +representative at an online or offline event. 
+ +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported to the community leaders responsible for enforcement using +[deepsearch-core@zurich.ibm.com](mailto:deepsearch-core@zurich.ibm.com). + +All complaints will be reviewed and investigated promptly and fairly. + +All community leaders are obligated to respect the privacy and security of the +reporter of any incident. + +## Enforcement Guidelines + +Community leaders will follow these Community Impact Guidelines in determining +the consequences for any action they deem in violation of this Code of Conduct: + +### 1. Correction + +**Community Impact**: Use of inappropriate language or other behavior deemed +unprofessional or unwelcome in the community. + +**Consequence**: A private, written warning from community leaders, providing +clarity around the nature of the violation and an explanation of why the +behavior was inappropriate. A public apology may be requested. + +### 2. Warning + +**Community Impact**: A violation through a single incident or series +of actions. + +**Consequence**: A warning with consequences for continued behavior. No +interaction with the people involved, including unsolicited interaction with +those enforcing the Code of Conduct, for a specified period of time. This +includes avoiding interactions in community spaces as well as external channels +like social media. Violating these terms may lead to a temporary or +permanent ban. + +### 3. Temporary Ban + +**Community Impact**: A serious violation of community standards, including +sustained inappropriate behavior. + +**Consequence**: A temporary ban from any sort of interaction or public +communication with the community for a specified period of time. No public or +private interaction with the people involved, including unsolicited interaction +with those enforcing the Code of Conduct, is allowed during this period. +Violating these terms may lead to a permanent ban. + +### 4. 
Permanent Ban + +**Community Impact**: Demonstrating a pattern of violation of community +standards, including sustained inappropriate behavior, harassment of an +individual, or aggression toward or disparagement of classes of individuals. + +**Consequence**: A permanent ban from any sort of public interaction within +the community. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org), +version 2.0, available at +[https://www.contributor-covenant.org/version/2/0/code_of_conduct.html](https://www.contributor-covenant.org/version/2/0/code_of_conduct.html). + +Community Impact Guidelines were inspired by [Mozilla's code of conduct +enforcement ladder](https://github.com/mozilla/diversity). + +Homepage: [https://www.contributor-covenant.org](https://www.contributor-covenant.org) + +For answers to common questions about this code of conduct, see the FAQ at +[https://www.contributor-covenant.org/faq](https://www.contributor-covenant.org/faq). Translations are available at +[https://www.contributor-covenant.org/translations](https://www.contributor-covenant.org/translations). \ No newline at end of file diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..7b40242 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,184 @@ +## Contributing In General +Our project welcomes external contributions. If you have an itch, please feel +free to scratch it. + +To contribute code or documentation, please submit a [pull request](https://github.com/DS4SD/docling/pulls). + +A good way to familiarize yourself with the codebase and contribution process is +to look for and tackle low-hanging fruit in the [issue tracker](https://github.com/DS4SD/docling/issues). +Before embarking on a more ambitious contribution, please quickly [get in touch](#communication) with us. + +For general questions or support requests, please refer to the [discussion section](https://github.com/DS4SD/docling/discussions). 
+ +**Note: We appreciate your effort, and want to avoid a situation where a contribution +requires extensive rework (by you or by us), sits in backlog for a long time, or +cannot be accepted at all!** + +### Proposing new features + +If you would like to implement a new feature, please [raise an issue](https://github.com/DS4SD/docling/issues) +before sending a pull request so the feature can be discussed. This is to avoid +you wasting your valuable time working on a feature that the project developers +are not interested in accepting into the code base. + +### Fixing bugs + +If you would like to fix a bug, please [raise an issue](https://github.com/DS4SD/docling/issues) before sending a +pull request so it can be tracked. + +### Merge approval + +The project maintainers use LGTM (Looks Good To Me) in comments on the code +review to indicate acceptance. A change requires LGTMs from two of the +maintainers of each component affected. + +For a list of the maintainers, see the [MAINTAINERS.md](MAINTAINERS.md) page. + + +## Legal + +Each source file must include a license header for the MIT +License. Using the SPDX format is the simplest approach, +e.g.: + +``` +/* +Copyright IBM Inc. All rights reserved. + +SPDX-License-Identifier: MIT +*/ +``` + +We have tried to make it as easy as possible to make contributions. This +applies to how we handle the legal aspects of contribution. We use the +same approach - the [Developer's Certificate of Origin 1.1 (DCO)](https://github.com/hyperledger/fabric/blob/master/docs/source/DCO1.1.txt) - that the Linux® Kernel [community](https://elinux.org/Developer_Certificate_Of_Origin) +uses to manage code contributions. + +We simply ask that when submitting a patch for review, the developer +must include a sign-off statement in the commit message. 
+ +Here is an example Signed-off-by line, which indicates that the +submitter accepts the DCO: + +``` +Signed-off-by: John Doe <john.doe@example.com> +``` + +You can include this automatically when you commit a change to your +local git repository using the following command: + +``` +git commit -s +``` + + +## Communication + +Please feel free to connect with us using the [discussion section](https://github.com/DS4SD/docling/discussions). + + + +## Developing + +### Usage of Poetry + +We use Poetry to manage dependencies. + + +#### Install + +To install, see the documentation here: https://python-poetry.org/docs/master/#installing-with-the-official-installer + +1. Install Poetry globally on your machine + ```bash + curl -sSL https://install.python-poetry.org | python3 - + ``` + The installation script will print the installation bin folder `POETRY_BIN` which you need in the next steps. + +2. Make sure Poetry is in your `$PATH` + - for `zsh` + ```sh + echo 'export PATH="POETRY_BIN:$PATH"' >> ~/.zshrc + ``` + - for `bash` + ```sh + echo 'export PATH="POETRY_BIN:$PATH"' >> ~/.bashrc + ``` + +3. The official guidelines linked above include useful details on the configuration of autocomplete for most shell environments, e.g. Bash and Zsh. + + +#### Create a Virtual Environment and Install Dependencies + +To activate the Virtual Environment, run: + +```bash +poetry shell +``` + +This spawns a shell with the Virtual Environment activated. If the Virtual Environment doesn't exist, Poetry will create one for you. Then, to install dependencies, run: + +```bash +poetry install +``` + +**(Advanced) Use a Specific Python Version** + +If for whatever reason you need to work in a specific (older) version of Python, run: + +```bash +poetry env use $(which python3.8) +``` + +This creates a Virtual Environment with Python 3.8. For other versions, replace `$(which python3.8)` by the path to the interpreter (e.g., `/usr/bin/python3.8`) or use `$(which pythonX.Y)`. 
+ + +#### Add a new dependency + +```bash +poetry add NAME +``` + +## Coding style guidelines + +We use the following tools to enforce code style: + +- iSort, to sort imports +- Black, to format code + + +We run a series of checks on the code base on every commit, using `pre-commit`. To install the hooks, run: + +```bash +pre-commit install +``` + +To run the checks on-demand, run: + +``` +pre-commit run --all-files +``` + +Note: Checks like `Black` and `isort` will "fail" if they modify files. This is because `pre-commit` doesn't like to see files modified by their Hooks. In these cases, `git add` the modified files and `git commit` again. + + + +## Documentation + +We use [MkDocs](https://www.mkdocs.org/) to write documentation. + +To run the documentation server, do: + +```bash +mkdocs serve +``` + +The server will be available on [http://localhost:8000](http://localhost:8000). + +### Pushing Documentation to GitHub pages + +Run the following: + +```bash +mkdocs gh-deploy +``` \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..8aa2645 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) [year] [fullname] + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/MAINTAINERS.md b/MAINTAINERS.md new file mode 100644 index 0000000..acb4fcb --- /dev/null +++ b/MAINTAINERS.md @@ -0,0 +1,8 @@ +# MAINTAINERS + +- Christoph Auer - [@cau-git](https://github.com/cau-git) +- Michele Dolfi - [@dolfim-ibm](https://github.com/dolfim-ibm) +- Maxim Lysak - [@maxmnemonic](https://github.com/maxmnemonic) +- Peter Staar - [@PeterStaar-IBM](https://github.com/PeterStaar-IBM) + +Maintainers can be contacted at [deepsearch-core@zurich.ibm.com](mailto:deepsearch-core@zurich.ibm.com). \ No newline at end of file diff --git a/README.md b/README.md index 226ad37..89abdfa 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,38 @@ -# docling-parse +# Docling Parse -## Install +[![PyPI version](https://img.shields.io/pypi/v/docling-parse)](https://pypi.org/project/docling-parse/) +![Python](https://img.shields.io/badge/python-3.9%20%7C%203.10%20%7C%203.11%20%7C%203.12-blue) +[![Poetry](https://img.shields.io/endpoint?url=https://python-poetry.org/badge/v0.json)](https://python-poetry.org/) +[![Pybind11](https://img.shields.io/badge/build-pybind11-blue)](https://github.com/pybind/pybind11/) +[![License MIT](https://img.shields.io/github/license/DS4SD/docling-parse)](https://opensource.org/licenses/MIT) + +Simple package to extract text with coordinates from programmatic PDFs. +This package is part of the [Docling](https://github.com/DS4SD/docling) conversion. 
+ + +## Quick start + +Install the package from PyPI + +```sh +pip install docling-parse +``` + +Convert a PDF + +```python +from docling_parse import pdf_parser + +parser = pdf_parser() +doc = parser.find_cells("mydoc.pdf") + +for i, page in enumerate(doc["pages"]): + for j, cell in enumerate(page["cells"]): + print(i, "\t", j, "\t", cell["content"]["rnormalized"]) +``` + + +## Development ### CXX @@ -10,6 +42,15 @@ To build the parse, simply run the following command in the root folder, rm -rf build; cmake -B ./build; cd build; make ``` +You can run the parser from your build folder with + +```sh +./parse.exe +``` + +If you don't have an input file, then a template input file will be printed on the terminal. + + ### Python To build the package, simply run (make sure [poetry](https://python-poetry.org/) is [installed](https://python-poetry.org/docs/#installing-with-the-official-installer)), @@ -18,24 +59,34 @@ To build the package, simply run (make sure [poetry](https://python-poetry.org/) poetry build ``` -## Run - -### CXX - -In the build-folder, run - -``` -./parse.exe -``` - -If you dont have an input file, then a template input file will be printed on the terminal. - -### Python - -## Test - To test the package, run, ``` poetry run pytest ./tests/test_parse.py -``` \ No newline at end of file +``` + + +## Contributing + +Please read [Contributing to Docling Parse](https://github.com/DS4SD/docling-parse/blob/main/CONTRIBUTING.md) for details. + + +## References + +If you use Docling in your projects, please consider citing the following: + +```bib +@software{Docling, +author = {Deep Search Team}, +month = {7}, +title = {{Docling}}, +url = {https://github.com/DS4SD/docling}, +version = {main}, +year = {2024} +} +``` + +## License + +The Docling Parse codebase is under the MIT license. +For individual model usage, please refer to the model licenses found in the original packages. 
diff --git a/pyproject.toml b/pyproject.toml index 29daa8a..e858468 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,8 +1,28 @@ [tool.poetry] name = "docling-parse" version = "0.1.0" -description = "Simple package to extract text with coordinates from programmatic PDF's" +description = "Simple package to extract text with coordinates from programmatic PDFs" authors = ["Peter Staar "] +maintainers = [ + "Peter Staar ", + "Christoph Auer ", + "Michele Dolfi ", + "Panos Vagenas ", + "Maxim Lysak ", +] +repository = "https://github.com/DS4SD/docling-parse" +homepage = "https://github.com/DS4SD/docling-parse" +keywords= ["docling", "pdf", "parser"] +classifiers = [ + "License :: OSI Approved :: MIT License", + "Operating System :: MacOS :: MacOS X", + "Operating System :: POSIX :: Linux", + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Developers", + "Intended Audience :: Science/Research", + "Programming Language :: C++", + "Programming Language :: Python :: 3" +] license = "MIT" readme = "README.md" packages = [{include = "docling_parse"}]