mirror of
https://github.com/docling-project/docling-parse.git
synced 2026-05-17 13:10:49 +00:00
add OSS release files
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
@@ -0,0 +1,129 @@
|
||||
# Contributor Covenant Code of Conduct
|
||||
|
||||
## Our Pledge
|
||||
|
||||
We as members, contributors, and leaders pledge to make participation in our
|
||||
community a harassment-free experience for everyone, regardless of age, body
|
||||
size, visible or invisible disability, ethnicity, sex characteristics, gender
|
||||
identity and expression, level of experience, education, socio-economic status,
|
||||
nationality, personal appearance, race, religion, or sexual identity
|
||||
and orientation.
|
||||
|
||||
We pledge to act and interact in ways that contribute to an open, welcoming,
|
||||
diverse, inclusive, and healthy community.
|
||||
|
||||
## Our Standards
|
||||
|
||||
Examples of behavior that contributes to a positive environment for our
|
||||
community include:
|
||||
|
||||
* Demonstrating empathy and kindness toward other people
|
||||
* Being respectful of differing opinions, viewpoints, and experiences
|
||||
* Giving and gracefully accepting constructive feedback
|
||||
* Accepting responsibility and apologizing to those affected by our mistakes,
|
||||
and learning from the experience
|
||||
* Focusing on what is best not just for us as individuals, but for the
|
||||
overall community
|
||||
|
||||
Examples of unacceptable behavior include:
|
||||
|
||||
* The use of sexualized language or imagery, and sexual attention or
|
||||
advances of any kind
|
||||
* Trolling, insulting or derogatory comments, and personal or political attacks
|
||||
* Public or private harassment
|
||||
* Publishing others' private information, such as a physical or email
|
||||
address, without their explicit permission
|
||||
* Other conduct which could reasonably be considered inappropriate in a
|
||||
professional setting
|
||||
|
||||
## Enforcement Responsibilities
|
||||
|
||||
Community leaders are responsible for clarifying and enforcing our standards of
|
||||
acceptable behavior and will take appropriate and fair corrective action in
|
||||
response to any behavior that they deem inappropriate, threatening, offensive,
|
||||
or harmful.
|
||||
|
||||
Community leaders have the right and responsibility to remove, edit, or reject
|
||||
comments, commits, code, wiki edits, issues, and other contributions that are
|
||||
not aligned to this Code of Conduct, and will communicate reasons for moderation
|
||||
decisions when appropriate.
|
||||
|
||||
## Scope
|
||||
|
||||
This Code of Conduct applies within all community spaces, and also applies when
|
||||
an individual is officially representing the community in public spaces.
|
||||
Examples of representing our community include using an official e-mail address,
|
||||
posting via an official social media account, or acting as an appointed
|
||||
representative at an online or offline event.
|
||||
|
||||
## Enforcement
|
||||
|
||||
Instances of abusive, harassing, or otherwise unacceptable behavior may be
|
||||
reported to the community leaders responsible for enforcement using
|
||||
[deepsearch-core@zurich.ibm.com](mailto:deepsearch-core@zurich.ibm.com).
|
||||
|
||||
All complaints will be reviewed and investigated promptly and fairly.
|
||||
|
||||
All community leaders are obligated to respect the privacy and security of the
|
||||
reporter of any incident.
|
||||
|
||||
## Enforcement Guidelines
|
||||
|
||||
Community leaders will follow these Community Impact Guidelines in determining
|
||||
the consequences for any action they deem in violation of this Code of Conduct:
|
||||
|
||||
### 1. Correction
|
||||
|
||||
**Community Impact**: Use of inappropriate language or other behavior deemed
|
||||
unprofessional or unwelcome in the community.
|
||||
|
||||
**Consequence**: A private, written warning from community leaders, providing
|
||||
clarity around the nature of the violation and an explanation of why the
|
||||
behavior was inappropriate. A public apology may be requested.
|
||||
|
||||
### 2. Warning
|
||||
|
||||
**Community Impact**: A violation through a single incident or series
|
||||
of actions.
|
||||
|
||||
**Consequence**: A warning with consequences for continued behavior. No
|
||||
interaction with the people involved, including unsolicited interaction with
|
||||
those enforcing the Code of Conduct, for a specified period of time. This
|
||||
includes avoiding interactions in community spaces as well as external channels
|
||||
like social media. Violating these terms may lead to a temporary or
|
||||
permanent ban.
|
||||
|
||||
### 3. Temporary Ban
|
||||
|
||||
**Community Impact**: A serious violation of community standards, including
|
||||
sustained inappropriate behavior.
|
||||
|
||||
**Consequence**: A temporary ban from any sort of interaction or public
|
||||
communication with the community for a specified period of time. No public or
|
||||
private interaction with the people involved, including unsolicited interaction
|
||||
with those enforcing the Code of Conduct, is allowed during this period.
|
||||
Violating these terms may lead to a permanent ban.
|
||||
|
||||
### 4. Permanent Ban
|
||||
|
||||
**Community Impact**: Demonstrating a pattern of violation of community
|
||||
standards, including sustained inappropriate behavior, harassment of an
|
||||
individual, or aggression toward or disparagement of classes of individuals.
|
||||
|
||||
**Consequence**: A permanent ban from any sort of public interaction within
|
||||
the community.
|
||||
|
||||
## Attribution
|
||||
|
||||
This Code of Conduct is adapted from the [Contributor Covenant][homepage],
|
||||
version 2.0, available at
|
||||
[https://www.contributor-covenant.org/version/2/0/code_of_conduct.html](https://www.contributor-covenant.org/version/2/0/code_of_conduct.html).
|
||||
|
||||
Community Impact Guidelines were inspired by [Mozilla's code of conduct
|
||||
enforcement ladder](https://github.com/mozilla/diversity).
|
||||
|
||||
Homepage: [https://www.contributor-covenant.org](https://www.contributor-covenant.org)
|
||||
|
||||
For answers to common questions about this code of conduct, see the FAQ at
|
||||
[https://www.contributor-covenant.org/faq](https://www.contributor-covenant.org/faq). Translations are available at
|
||||
[https://www.contributor-covenant.org/translations](https://www.contributor-covenant.org/translations).
|
||||
+184
@@ -0,0 +1,184 @@
|
||||
## Contributing In General
|
||||
Our project welcomes external contributions. If you have an itch, please feel
|
||||
free to scratch it.
|
||||
|
||||
To contribute code or documentation, please submit a [pull request](https://github.com/DS4SD/docling/pulls).
|
||||
|
||||
A good way to familiarize yourself with the codebase and contribution process is
|
||||
to look for and tackle low-hanging fruit in the [issue tracker](https://github.com/DS4SD/docling/issues).
|
||||
Before embarking on a more ambitious contribution, please quickly [get in touch](#communication) with us.
|
||||
|
||||
For general questions or support requests, please refer to the [discussion section](https://github.com/DS4SD/docling/discussions).
|
||||
|
||||
**Note: We appreciate your effort, and want to avoid a situation where a contribution
|
||||
requires extensive rework (by you or by us), sits in backlog for a long time, or
|
||||
cannot be accepted at all!**
|
||||
|
||||
### Proposing new features
|
||||
|
||||
If you would like to implement a new feature, please [raise an issue](https://github.com/DS4SD/docling/issues)
|
||||
before sending a pull request so the feature can be discussed. This is to avoid
|
||||
you wasting your valuable time working on a feature that the project developers
|
||||
are not interested in accepting into the code base.
|
||||
|
||||
### Fixing bugs
|
||||
|
||||
If you would like to fix a bug, please [raise an issue](https://github.com/DS4SD/docling/issues) before sending a
|
||||
pull request so it can be tracked.
|
||||
|
||||
### Merge approval
|
||||
|
||||
The project maintainers use LGTM (Looks Good To Me) in comments on the code
|
||||
review to indicate acceptance. A change requires LGTMs from two of the
|
||||
maintainers of each component affected.
|
||||
|
||||
For a list of the maintainers, see the [MAINTAINERS.md](MAINTAINERS.md) page.
|
||||
|
||||
|
||||
## Legal
|
||||
|
||||
Each source file must include a license header for the MIT
|
||||
License. Using the SPDX format is the simplest approach.
|
||||
e.g.
|
||||
|
||||
```
|
||||
/*
|
||||
Copyright IBM Inc. All rights reserved.
|
||||
|
||||
SPDX-License-Identifier: MIT
|
||||
*/
|
||||
```
|
||||
|
||||
We have tried to make it as easy as possible to make contributions. This
|
||||
applies to how we handle the legal aspects of contribution. We use the
|
||||
same approach - the [Developer's Certificate of Origin 1.1 (DCO)](https://github.com/hyperledger/fabric/blob/master/docs/source/DCO1.1.txt) - that the Linux® Kernel [community](https://elinux.org/Developer_Certificate_Of_Origin)
|
||||
uses to manage code contributions.
|
||||
|
||||
We simply ask that when submitting a patch for review, the developer
|
||||
must include a sign-off statement in the commit message.
|
||||
|
||||
Here is an example Signed-off-by line, which indicates that the
|
||||
submitter accepts the DCO:
|
||||
|
||||
```
|
||||
Signed-off-by: John Doe <john.doe@example.com>
|
||||
```
|
||||
|
||||
You can include this automatically when you commit a change to your
|
||||
local git repository using the following command:
|
||||
|
||||
```
|
||||
git commit -s
|
||||
```
|
||||
|
||||
|
||||
## Communication
|
||||
|
||||
Please feel free to connect with us using the [discussion section](https://github.com/DS4SD/docling/discussions).
|
||||
|
||||
|
||||
|
||||
## Developing
|
||||
|
||||
### Usage of Poetry
|
||||
|
||||
We use Poetry to manage dependencies.
|
||||
|
||||
|
||||
#### Install
|
||||
|
||||
To install, see the documentation here: https://python-poetry.org/docs/master/#installing-with-the-official-installer
|
||||
|
||||
1. Install Poetry globally on your machine
|
||||
```bash
|
||||
curl -sSL https://install.python-poetry.org | python3 -
|
||||
```
|
||||
The installation script will print the installation bin folder `POETRY_BIN` which you need in the next steps.
|
||||
|
||||
2. Make sure Poetry is in your `$PATH`
|
||||
- for `zsh`
|
||||
```sh
|
||||
echo 'export PATH="POETRY_BIN:$PATH"' >> ~/.zshrc
|
||||
```
|
||||
- for `bash`
|
||||
```sh
|
||||
echo 'export PATH="POETRY_BIN:$PATH"' >> ~/.bashrc
|
||||
```
|
||||
|
||||
3. The official guidelines linked above include useful details on the configuration of autocomplete for most shell environments, e.g. Bash and Zsh.
|
||||
|
||||
|
||||
#### Create a Virtual Environment and Install Dependencies
|
||||
|
||||
To activate the Virtual Environment, run:
|
||||
|
||||
```bash
|
||||
poetry shell
|
||||
```
|
||||
|
||||
This spawns a shell with the Virtual Environment activated. If the Virtual Environment doesn't exist, Poetry will create one for you. Then, to install dependencies, run:
|
||||
|
||||
```bash
|
||||
poetry install
|
||||
```
|
||||
|
||||
**(Advanced) Use a Specific Python Version**
|
||||
|
||||
If for whatever reason you need to work in a specific (older) version of Python, run:
|
||||
|
||||
```bash
|
||||
poetry env use $(which python3.8)
|
||||
```
|
||||
|
||||
This creates a Virtual Environment with Python 3.8. For other versions, replace `$(which python3.8)` with the path to the interpreter (e.g., `/usr/bin/python3.8`) or use `$(which pythonX.Y)`.
|
||||
|
||||
|
||||
#### Add a new dependency
|
||||
|
||||
```bash
|
||||
poetry add NAME
|
||||
```
|
||||
|
||||
## Coding style guidelines
|
||||
|
||||
We use the following tools to enforce code style:
|
||||
|
||||
- isort, to sort imports
|
||||
- Black, to format code
|
||||
|
||||
|
||||
We run a series of checks on the code base on every commit, using `pre-commit`. To install the hooks, run:
|
||||
|
||||
```bash
|
||||
pre-commit install
|
||||
```
|
||||
|
||||
To run the checks on-demand, run:
|
||||
|
||||
```
|
||||
pre-commit run --all-files
|
||||
```
|
||||
|
||||
Note: Checks like `Black` and `isort` will "fail" if they modify files. This is because `pre-commit` doesn't like to see files modified by its hooks. In these cases, `git add` the modified files and `git commit` again.
|
||||
|
||||
|
||||
|
||||
## Documentation
|
||||
|
||||
We use [MkDocs](https://www.mkdocs.org/) to write documentation.
|
||||
|
||||
To run the documentation server, do:
|
||||
|
||||
```bash
|
||||
mkdocs serve
|
||||
```
|
||||
|
||||
The server will be available on [http://localhost:8000](http://localhost:8000).
|
||||
|
||||
### Pushing Documentation to GitHub pages
|
||||
|
||||
Run the following:
|
||||
|
||||
```bash
|
||||
mkdocs gh-deploy
|
||||
```
|
||||
@@ -0,0 +1,21 @@
|
||||
MIT License
|
||||
|
||||
Copyright (c) [year] [fullname]
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
@@ -0,0 +1,8 @@
|
||||
# MAINTAINERS
|
||||
|
||||
- Christoph Auer - [@cau-git](https://github.com/cau-git)
|
||||
- Michele Dolfi - [@dolfim-ibm](https://github.com/dolfim-ibm)
|
||||
- Maxim Lysak - [@maxmnemonic](https://github.com/maxmnemonic)
|
||||
- Peter Staar - [@PeterStaar-IBM](https://github.com/PeterStaar-IBM)
|
||||
|
||||
Maintainers can be contacted at [deepsearch-core@zurich.ibm.com](mailto:deepsearch-core@zurich.ibm.com).
|
||||
@@ -1,6 +1,38 @@
|
||||
# docling-parse
|
||||
# Docling Parse
|
||||
|
||||
## Install
|
||||
[](https://pypi.org/project/docling-parse/)
|
||||

|
||||
[](https://python-poetry.org/)
|
||||
[](https://github.com/pybind/pybind11/)
|
||||
[](https://opensource.org/licenses/MIT)
|
||||
|
||||
Simple package to extract text with coordinates from programmatic PDFs.
|
||||
This package is part of the [Docling](https://github.com/DS4SD/docling) conversion.
|
||||
|
||||
|
||||
## Quick start
|
||||
|
||||
Install the package from PyPI
|
||||
|
||||
```sh
|
||||
pip install docling-parse
|
||||
```
|
||||
|
||||
Convert a PDF
|
||||
|
||||
```python
|
||||
from docling_parse import pdf_parser
|
||||
|
||||
parser = pdf_parser()
|
||||
doc = parser.find_cells("mydoc.pdf")
|
||||
|
||||
for i, page in enumerate(doc["pages"]):
|
||||
for j, cell in enumerate(page["cells"]):
|
||||
print(i, "\t", j, "\t", cell["content"]["rnormalized"])
|
||||
```
|
||||
|
||||
|
||||
## Development
|
||||
|
||||
### CXX
|
||||
|
||||
@@ -10,6 +42,15 @@ To build the parse, simply run the following command in the root folder,
|
||||
rm -rf build; cmake -B ./build; cd build; make
|
||||
```
|
||||
|
||||
You can run the parser from your build folder with
|
||||
|
||||
```sh
|
||||
./parse.exe <input-file> <optional-logging:true>
|
||||
```
|
||||
|
||||
If you don't have an input file, then a template input file will be printed on the terminal.
|
||||
|
||||
|
||||
### Python
|
||||
|
||||
To build the package, simply run (make sure [poetry](https://python-poetry.org/) is [installed](https://python-poetry.org/docs/#installing-with-the-official-installer)),
|
||||
@@ -18,24 +59,34 @@ To build the package, simply run (make sure [poetry](https://python-poetry.org/)
|
||||
poetry build
|
||||
```
|
||||
|
||||
## Run
|
||||
|
||||
### CXX
|
||||
|
||||
In the build-folder, run
|
||||
|
||||
```
|
||||
./parse.exe <input-file> <optional-logging:true>
|
||||
```
|
||||
|
||||
If you don't have an input file, then a template input file will be printed on the terminal.
|
||||
|
||||
### Python
|
||||
|
||||
## Test
|
||||
|
||||
To test the package, run,
|
||||
|
||||
```
|
||||
poetry run pytest ./tests/test_parse.py
|
||||
```
|
||||
```
|
||||
|
||||
|
||||
## Contributing
|
||||
|
||||
Please read [Contributing to Docling Parse](https://github.com/DS4SD/docling-parse/blob/main/CONTRIBUTING.md) for details.
|
||||
|
||||
|
||||
## References
|
||||
|
||||
If you use Docling in your projects, please consider citing the following:
|
||||
|
||||
```bib
|
||||
@software{Docling,
|
||||
author = {Deep Search Team},
|
||||
month = {7},
|
||||
title = {{Docling}},
|
||||
url = {https://github.com/DS4SD/docling},
|
||||
version = {main},
|
||||
year = {2024}
|
||||
}
|
||||
```
|
||||
|
||||
## License
|
||||
|
||||
The Docling Parse codebase is under MIT license.
|
||||
For individual model usage, please refer to the model licenses found in the original packages.
|
||||
|
||||
+21
-1
@@ -1,8 +1,28 @@
|
||||
[tool.poetry]
|
||||
name = "docling-parse"
|
||||
version = "0.1.0"
|
||||
description = "Simple package to extract text with coordinates from programmatic PDF's"
|
||||
description = "Simple package to extract text with coordinates from programmatic PDFs"
|
||||
authors = ["Peter Staar <taa@zurich.ibm.com>"]
|
||||
maintainers = [
|
||||
"Peter Staar <taa@zurich.ibm.com>",
|
||||
"Christoph Auer <cau@zurich.ibm.com>",
|
||||
"Michele Dolfi <dol@zurich.ibm.com>",
|
||||
"Panos Vagenas <pva@zurich.ibm.com>",
|
||||
"Maxim Lysak <mly@zurich.ibm.com>",
|
||||
]
|
||||
repository = "https://github.com/DS4SD/docling-parse"
|
||||
homepage = "https://github.com/DS4SD/docling-parse"
|
||||
keywords= ["docling", "pdf", "parser"]
|
||||
classifiers = [
|
||||
"License :: OSI Approved :: MIT License",
|
||||
"Operating System :: MacOS :: MacOS X",
|
||||
"Operating System :: POSIX :: Linux",
|
||||
"Development Status :: 5 - Production/Stable",
|
||||
"Intended Audience :: Developers",
|
||||
"Intended Audience :: Science/Research",
|
||||
"Programming Language :: C++",
|
||||
"Programming Language :: Python :: 3"
|
||||
]
|
||||
license = "MIT"
|
||||
readme = "README.md"
|
||||
packages = [{include = "docling_parse"}]
|
||||
|
||||
Reference in New Issue
Block a user