mirror of
https://github.com/docling-project/docling-parse.git
synced 2026-05-17 13:10:49 +00:00
add OSS release files
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
@@ -0,0 +1,129 @@
|
|||||||
|
# Contributor Covenant Code of Conduct
|
||||||
|
|
||||||
|
## Our Pledge
|
||||||
|
|
||||||
|
We as members, contributors, and leaders pledge to make participation in our
|
||||||
|
community a harassment-free experience for everyone, regardless of age, body
|
||||||
|
size, visible or invisible disability, ethnicity, sex characteristics, gender
|
||||||
|
identity and expression, level of experience, education, socio-economic status,
|
||||||
|
nationality, personal appearance, race, religion, or sexual identity
|
||||||
|
and orientation.
|
||||||
|
|
||||||
|
We pledge to act and interact in ways that contribute to an open, welcoming,
|
||||||
|
diverse, inclusive, and healthy community.
|
||||||
|
|
||||||
|
## Our Standards
|
||||||
|
|
||||||
|
Examples of behavior that contributes to a positive environment for our
|
||||||
|
community include:
|
||||||
|
|
||||||
|
* Demonstrating empathy and kindness toward other people
|
||||||
|
* Being respectful of differing opinions, viewpoints, and experiences
|
||||||
|
* Giving and gracefully accepting constructive feedback
|
||||||
|
* Accepting responsibility and apologizing to those affected by our mistakes,
|
||||||
|
and learning from the experience
|
||||||
|
* Focusing on what is best not just for us as individuals, but for the
|
||||||
|
overall community
|
||||||
|
|
||||||
|
Examples of unacceptable behavior include:
|
||||||
|
|
||||||
|
* The use of sexualized language or imagery, and sexual attention or
|
||||||
|
advances of any kind
|
||||||
|
* Trolling, insulting or derogatory comments, and personal or political attacks
|
||||||
|
* Public or private harassment
|
||||||
|
* Publishing others' private information, such as a physical or email
|
||||||
|
address, without their explicit permission
|
||||||
|
* Other conduct which could reasonably be considered inappropriate in a
|
||||||
|
professional setting
|
||||||
|
|
||||||
|
## Enforcement Responsibilities
|
||||||
|
|
||||||
|
Community leaders are responsible for clarifying and enforcing our standards of
|
||||||
|
acceptable behavior and will take appropriate and fair corrective action in
|
||||||
|
response to any behavior that they deem inappropriate, threatening, offensive,
|
||||||
|
or harmful.
|
||||||
|
|
||||||
|
Community leaders have the right and responsibility to remove, edit, or reject
|
||||||
|
comments, commits, code, wiki edits, issues, and other contributions that are
|
||||||
|
not aligned to this Code of Conduct, and will communicate reasons for moderation
|
||||||
|
decisions when appropriate.
|
||||||
|
|
||||||
|
## Scope
|
||||||
|
|
||||||
|
This Code of Conduct applies within all community spaces, and also applies when
|
||||||
|
an individual is officially representing the community in public spaces.
|
||||||
|
Examples of representing our community include using an official e-mail address,
|
||||||
|
posting via an official social media account, or acting as an appointed
|
||||||
|
representative at an online or offline event.
|
||||||
|
|
||||||
|
## Enforcement
|
||||||
|
|
||||||
|
Instances of abusive, harassing, or otherwise unacceptable behavior may be
|
||||||
|
reported to the community leaders responsible for enforcement using
|
||||||
|
[deepsearch-core@zurich.ibm.com](mailto:deepsearch-core@zurich.ibm.com).
|
||||||
|
|
||||||
|
All complaints will be reviewed and investigated promptly and fairly.
|
||||||
|
|
||||||
|
All community leaders are obligated to respect the privacy and security of the
|
||||||
|
reporter of any incident.
|
||||||
|
|
||||||
|
## Enforcement Guidelines
|
||||||
|
|
||||||
|
Community leaders will follow these Community Impact Guidelines in determining
|
||||||
|
the consequences for any action they deem in violation of this Code of Conduct:
|
||||||
|
|
||||||
|
### 1. Correction
|
||||||
|
|
||||||
|
**Community Impact**: Use of inappropriate language or other behavior deemed
|
||||||
|
unprofessional or unwelcome in the community.
|
||||||
|
|
||||||
|
**Consequence**: A private, written warning from community leaders, providing
|
||||||
|
clarity around the nature of the violation and an explanation of why the
|
||||||
|
behavior was inappropriate. A public apology may be requested.
|
||||||
|
|
||||||
|
### 2. Warning
|
||||||
|
|
||||||
|
**Community Impact**: A violation through a single incident or series
|
||||||
|
of actions.
|
||||||
|
|
||||||
|
**Consequence**: A warning with consequences for continued behavior. No
|
||||||
|
interaction with the people involved, including unsolicited interaction with
|
||||||
|
those enforcing the Code of Conduct, for a specified period of time. This
|
||||||
|
includes avoiding interactions in community spaces as well as external channels
|
||||||
|
like social media. Violating these terms may lead to a temporary or
|
||||||
|
permanent ban.
|
||||||
|
|
||||||
|
### 3. Temporary Ban
|
||||||
|
|
||||||
|
**Community Impact**: A serious violation of community standards, including
|
||||||
|
sustained inappropriate behavior.
|
||||||
|
|
||||||
|
**Consequence**: A temporary ban from any sort of interaction or public
|
||||||
|
communication with the community for a specified period of time. No public or
|
||||||
|
private interaction with the people involved, including unsolicited interaction
|
||||||
|
with those enforcing the Code of Conduct, is allowed during this period.
|
||||||
|
Violating these terms may lead to a permanent ban.
|
||||||
|
|
||||||
|
### 4. Permanent Ban
|
||||||
|
|
||||||
|
**Community Impact**: Demonstrating a pattern of violation of community
|
||||||
|
standards, including sustained inappropriate behavior, harassment of an
|
||||||
|
individual, or aggression toward or disparagement of classes of individuals.
|
||||||
|
|
||||||
|
**Consequence**: A permanent ban from any sort of public interaction within
|
||||||
|
the community.
|
||||||
|
|
||||||
|
## Attribution
|
||||||
|
|
||||||
|
This Code of Conduct is adapted from the [Contributor Covenant][homepage],
|
||||||
|
version 2.0, available at
|
||||||
|
[https://www.contributor-covenant.org/version/2/0/code_of_conduct.html](https://www.contributor-covenant.org/version/2/0/code_of_conduct.html).
|
||||||
|
|
||||||
|
Community Impact Guidelines were inspired by [Mozilla's code of conduct
|
||||||
|
enforcement ladder](https://github.com/mozilla/diversity).
|
||||||
|
|
||||||
|
Homepage: [https://www.contributor-covenant.org](https://www.contributor-covenant.org)
|
||||||
|
|
||||||
|
For answers to common questions about this code of conduct, see the FAQ at
|
||||||
|
[https://www.contributor-covenant.org/faq](https://www.contributor-covenant.org/faq). Translations are available at
|
||||||
|
[https://www.contributor-covenant.org/translations](https://www.contributor-covenant.org/translations).
|
||||||
+184
@@ -0,0 +1,184 @@
|
|||||||
|
## Contributing In General
|
||||||
|
Our project welcomes external contributions. If you have an itch, please feel
|
||||||
|
free to scratch it.
|
||||||
|
|
||||||
|
To contribute code or documentation, please submit a [pull request](https://github.com/DS4SD/docling/pulls).
|
||||||
|
|
||||||
|
A good way to familiarize yourself with the codebase and contribution process is
|
||||||
|
to look for and tackle low-hanging fruit in the [issue tracker](https://github.com/DS4SD/docling/issues).
|
||||||
|
Before embarking on a more ambitious contribution, please quickly [get in touch](#communication) with us.
|
||||||
|
|
||||||
|
For general questions or support requests, please refer to the [discussion section](https://github.com/DS4SD/docling/discussions).
|
||||||
|
|
||||||
|
**Note: We appreciate your effort, and want to avoid a situation where a contribution
|
||||||
|
requires extensive rework (by you or by us), sits in backlog for a long time, or
|
||||||
|
cannot be accepted at all!**
|
||||||
|
|
||||||
|
### Proposing new features
|
||||||
|
|
||||||
|
If you would like to implement a new feature, please [raise an issue](https://github.com/DS4SD/docling/issues)
|
||||||
|
before sending a pull request so the feature can be discussed. This is to avoid
|
||||||
|
you wasting your valuable time working on a feature that the project developers
|
||||||
|
are not interested in accepting into the code base.
|
||||||
|
|
||||||
|
### Fixing bugs
|
||||||
|
|
||||||
|
If you would like to fix a bug, please [raise an issue](https://github.com/DS4SD/docling/issues) before sending a
|
||||||
|
pull request so it can be tracked.
|
||||||
|
|
||||||
|
### Merge approval
|
||||||
|
|
||||||
|
The project maintainers use LGTM (Looks Good To Me) in comments on the code
|
||||||
|
review to indicate acceptance. A change requires LGTMs from two of the
|
||||||
|
maintainers of each component affected.
|
||||||
|
|
||||||
|
For a list of the maintainers, see the [MAINTAINERS.md](MAINTAINERS.md) page.
|
||||||
|
|
||||||
|
|
||||||
|
## Legal
|
||||||
|
|
||||||
|
Each source file must include a license header for the MIT
|
||||||
|
License. Using the SPDX format is the simplest approach.
|
||||||
|
e.g.
|
||||||
|
|
||||||
|
```
|
||||||
|
/*
|
||||||
|
Copyright IBM Inc. All rights reserved.
|
||||||
|
|
||||||
|
SPDX-License-Identifier: MIT
|
||||||
|
*/
|
||||||
|
```
|
||||||
|
|
||||||
|
We have tried to make it as easy as possible to make contributions. This
|
||||||
|
applies to how we handle the legal aspects of contribution. We use the
|
||||||
|
same approach - the [Developer's Certificate of Origin 1.1 (DCO)](https://github.com/hyperledger/fabric/blob/master/docs/source/DCO1.1.txt) - that the Linux® Kernel [community](https://elinux.org/Developer_Certificate_Of_Origin)
|
||||||
|
uses to manage code contributions.
|
||||||
|
|
||||||
|
We simply ask that when submitting a patch for review, the developer
|
||||||
|
must include a sign-off statement in the commit message.
|
||||||
|
|
||||||
|
Here is an example Signed-off-by line, which indicates that the
|
||||||
|
submitter accepts the DCO:
|
||||||
|
|
||||||
|
```
|
||||||
|
Signed-off-by: John Doe <john.doe@example.com>
|
||||||
|
```
|
||||||
|
|
||||||
|
You can include this automatically when you commit a change to your
|
||||||
|
local git repository using the following command:
|
||||||
|
|
||||||
|
```
|
||||||
|
git commit -s
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
## Communication
|
||||||
|
|
||||||
|
Please feel free to connect with us using the [discussion section](https://github.com/DS4SD/docling/discussions).
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
## Developing
|
||||||
|
|
||||||
|
### Usage of Poetry
|
||||||
|
|
||||||
|
We use Poetry to manage dependencies.
|
||||||
|
|
||||||
|
|
||||||
|
#### Install
|
||||||
|
|
||||||
|
To install, see the documentation here: https://python-poetry.org/docs/master/#installing-with-the-official-installer
|
||||||
|
|
||||||
|
1. Install Poetry globally on your machine
|
||||||
|
```bash
|
||||||
|
curl -sSL https://install.python-poetry.org | python3 -
|
||||||
|
```
|
||||||
|
The installation script will print the installation bin folder `POETRY_BIN` which you need in the next steps.
|
||||||
|
|
||||||
|
2. Make sure Poetry is in your `$PATH`
|
||||||
|
- for `zsh`
|
||||||
|
```sh
|
||||||
|
echo 'export PATH="POETRY_BIN:$PATH"' >> ~/.zshrc
|
||||||
|
```
|
||||||
|
- for `bash`
|
||||||
|
```sh
|
||||||
|
echo 'export PATH="POETRY_BIN:$PATH"' >> ~/.bashrc
|
||||||
|
```
|
||||||
|
|
||||||
|
3. The official guidelines linked above include useful details on the configuration of autocomplete for most shell environments, e.g. Bash and Zsh.
|
||||||
|
|
||||||
|
|
||||||
|
#### Create a Virtual Environment and Install Dependencies
|
||||||
|
|
||||||
|
To activate the Virtual Environment, run:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
poetry shell
|
||||||
|
```
|
||||||
|
|
||||||
|
This spawns a shell with the Virtual Environment activated. If the Virtual Environment doesn't exist, Poetry will create one for you. Then, to install dependencies, run:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
poetry install
|
||||||
|
```
|
||||||
|
|
||||||
|
**(Advanced) Use a Specific Python Version**
|
||||||
|
|
||||||
|
If for whatever reason you need to work in a specific (older) version of Python, run:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
poetry env use $(which python3.8)
|
||||||
|
```
|
||||||
|
|
||||||
|
This creates a Virtual Environment with Python 3.8. For other versions, replace `$(which python3.8)` by the path to the interpreter (e.g., `/usr/bin/python3.8`) or use `$(which pythonX.Y)`.
|
||||||
|
|
||||||
|
|
||||||
|
#### Add a new dependency
|
||||||
|
|
||||||
|
```bash
|
||||||
|
poetry add NAME
|
||||||
|
```
|
||||||
|
|
||||||
|
## Coding style guidelines
|
||||||
|
|
||||||
|
We use the following tools to enforce code style:
|
||||||
|
|
||||||
|
- isort, to sort imports
|
||||||
|
- Black, to format code
|
||||||
|
|
||||||
|
|
||||||
|
We run a series of checks on the code base on every commit, using `pre-commit`. To install the hooks, run:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pre-commit install
|
||||||
|
```
|
||||||
|
|
||||||
|
To run the checks on-demand, run:
|
||||||
|
|
||||||
|
```
|
||||||
|
pre-commit run --all-files
|
||||||
|
```
|
||||||
|
|
||||||
|
Note: Checks like `Black` and `isort` will "fail" if they modify files. This is because `pre-commit` doesn't like to see files modified by its hooks. In these cases, `git add` the modified files and `git commit` again.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
## Documentation
|
||||||
|
|
||||||
|
We use [MkDocs](https://www.mkdocs.org/) to write documentation.
|
||||||
|
|
||||||
|
To run the documentation server, do:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
mkdocs serve
|
||||||
|
```
|
||||||
|
|
||||||
|
The server will be available on [http://localhost:8000](http://localhost:8000).
|
||||||
|
|
||||||
|
### Pushing Documentation to GitHub pages
|
||||||
|
|
||||||
|
Run the following:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
mkdocs gh-deploy
|
||||||
|
```
|
||||||
@@ -0,0 +1,21 @@
|
|||||||
|
MIT License
|
||||||
|
|
||||||
|
Copyright (c) [year] [fullname]
|
||||||
|
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
of this software and associated documentation files (the "Software"), to deal
|
||||||
|
in the Software without restriction, including without limitation the rights
|
||||||
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
copies of the Software, and to permit persons to whom the Software is
|
||||||
|
furnished to do so, subject to the following conditions:
|
||||||
|
|
||||||
|
The above copyright notice and this permission notice shall be included in all
|
||||||
|
copies or substantial portions of the Software.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
|
SOFTWARE.
|
||||||
@@ -0,0 +1,8 @@
|
|||||||
|
# MAINTAINERS
|
||||||
|
|
||||||
|
- Christoph Auer - [@cau-git](https://github.com/cau-git)
|
||||||
|
- Michele Dolfi - [@dolfim-ibm](https://github.com/dolfim-ibm)
|
||||||
|
- Maxim Lysak - [@maxmnemonic](https://github.com/maxmnemonic)
|
||||||
|
- Peter Staar - [@PeterStaar-IBM](https://github.com/PeterStaar-IBM)
|
||||||
|
|
||||||
|
Maintainers can be contacted at [deepsearch-core@zurich.ibm.com](mailto:deepsearch-core@zurich.ibm.com).
|
||||||
@@ -1,6 +1,38 @@
|
|||||||
# docling-parse
|
# Docling Parse
|
||||||
|
|
||||||
## Install
|
[](https://pypi.org/project/docling-parse/)
|
||||||
|

|
||||||
|
[](https://python-poetry.org/)
|
||||||
|
[](https://github.com/pybind/pybind11/)
|
||||||
|
[](https://opensource.org/licenses/MIT)
|
||||||
|
|
||||||
|
Simple package to extract text with coordinates from programmatic PDFs.
|
||||||
|
This package is part of the [Docling](https://github.com/DS4SD/docling) conversion.
|
||||||
|
|
||||||
|
|
||||||
|
## Quick start
|
||||||
|
|
||||||
|
Install the package from PyPI
|
||||||
|
|
||||||
|
```sh
|
||||||
|
pip install docling-parse
|
||||||
|
```
|
||||||
|
|
||||||
|
Convert a PDF
|
||||||
|
|
||||||
|
```python
|
||||||
|
from docling_parse import pdf_parser
|
||||||
|
|
||||||
|
parser = pdf_parser()
|
||||||
|
doc = parser.find_cells("mydoc.pdf")
|
||||||
|
|
||||||
|
for i, page in enumerate(doc["pages"]):
|
||||||
|
    for j, cell in enumerate(page["cells"]):
|
||||||
|
        print(i, "\t", j, "\t", cell["content"]["rnormalized"])
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
## Development
|
||||||
|
|
||||||
### CXX
|
### CXX
|
||||||
|
|
||||||
@@ -10,6 +42,15 @@ To build the parse, simply run the following command in the root folder,
|
|||||||
rm -rf build; cmake -B ./build; cd build; make
|
rm -rf build; cmake -B ./build; cd build; make
|
||||||
```
|
```
|
||||||
|
|
||||||
|
You can run the parser from your build folder with
|
||||||
|
|
||||||
|
```sh
|
||||||
|
./parse.exe <input-file> <optional-logging:true>
|
||||||
|
```
|
||||||
|
|
||||||
|
If you don't have an input file, then a template input file will be printed on the terminal.
|
||||||
|
|
||||||
|
|
||||||
### Python
|
### Python
|
||||||
|
|
||||||
To build the package, simply run (make sure [poetry](https://python-poetry.org/) is [installed](https://python-poetry.org/docs/#installing-with-the-official-installer)),
|
To build the package, simply run (make sure [poetry](https://python-poetry.org/) is [installed](https://python-poetry.org/docs/#installing-with-the-official-installer)),
|
||||||
@@ -18,24 +59,34 @@ To build the package, simply run (make sure [poetry](https://python-poetry.org/)
|
|||||||
poetry build
|
poetry build
|
||||||
```
|
```
|
||||||
|
|
||||||
## Run
|
|
||||||
|
|
||||||
### CXX
|
|
||||||
|
|
||||||
In the build-folder, run
|
|
||||||
|
|
||||||
```
|
|
||||||
./parse.exe <input-file> <optional-logging:true>
|
|
||||||
```
|
|
||||||
|
|
||||||
If you dont have an input file, then a template input file will be printed on the terminal.
|
|
||||||
|
|
||||||
### Python
|
|
||||||
|
|
||||||
## Test
|
|
||||||
|
|
||||||
To test the package, run,
|
To test the package, run,
|
||||||
|
|
||||||
```
|
```
|
||||||
poetry run pytest ./tests/test_parse.py
|
poetry run pytest ./tests/test_parse.py
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
||||||
|
## Contributing
|
||||||
|
|
||||||
|
Please read [Contributing to Docling Parse](https://github.com/DS4SD/docling-parse/blob/main/CONTRIBUTING.md) for details.
|
||||||
|
|
||||||
|
|
||||||
|
## References
|
||||||
|
|
||||||
|
If you use Docling in your projects, please consider citing the following:
|
||||||
|
|
||||||
|
```bib
|
||||||
|
@software{Docling,
|
||||||
|
author = {Deep Search Team},
|
||||||
|
month = {7},
|
||||||
|
title = {{Docling}},
|
||||||
|
url = {https://github.com/DS4SD/docling},
|
||||||
|
version = {main},
|
||||||
|
year = {2024}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## License
|
||||||
|
|
||||||
|
The Docling Parse codebase is under MIT license.
|
||||||
|
For individual model usage, please refer to the model licenses found in the original packages.
|
||||||
|
|||||||
+21
-1
@@ -1,8 +1,28 @@
|
|||||||
[tool.poetry]
|
[tool.poetry]
|
||||||
name = "docling-parse"
|
name = "docling-parse"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
description = "Simple package to extract text with coordinates from programmatic PDF's"
|
description = "Simple package to extract text with coordinates from programmatic PDFs"
|
||||||
authors = ["Peter Staar <taa@zurich.ibm.com>"]
|
authors = ["Peter Staar <taa@zurich.ibm.com>"]
|
||||||
|
maintainers = [
|
||||||
|
"Peter Staar <taa@zurich.ibm.com>",
|
||||||
|
"Christoph Auer <cau@zurich.ibm.com>",
|
||||||
|
"Michele Dolfi <dol@zurich.ibm.com>",
|
||||||
|
"Panos Vagenas <pva@zurich.ibm.com>",
|
||||||
|
"Maxim Lysak <mly@zurich.ibm.com>",
|
||||||
|
]
|
||||||
|
repository = "https://github.com/DS4SD/docling-parse"
|
||||||
|
homepage = "https://github.com/DS4SD/docling-parse"
|
||||||
|
keywords= ["docling", "pdf", "parser"]
|
||||||
|
classifiers = [
|
||||||
|
"License :: OSI Approved :: MIT License",
|
||||||
|
"Operating System :: MacOS :: MacOS X",
|
||||||
|
"Operating System :: POSIX :: Linux",
|
||||||
|
"Development Status :: 5 - Production/Stable",
|
||||||
|
"Intended Audience :: Developers",
|
||||||
|
"Intended Audience :: Science/Research",
|
||||||
|
"Programming Language :: C++",
|
||||||
|
"Programming Language :: Python :: 3"
|
||||||
|
]
|
||||||
license = "MIT"
|
license = "MIT"
|
||||||
readme = "README.md"
|
readme = "README.md"
|
||||||
packages = [{include = "docling_parse"}]
|
packages = [{include = "docling_parse"}]
|
||||||
|
|||||||
Reference in New Issue
Block a user