mirror of
https://github.com/docling-project/docling-parse.git
synced 2026-05-17 13:10:49 +00:00
25672da1e8
Signed-off-by: Peter Staar <taa@zurich.ibm.com>
717 lines
29 KiB
C++
717 lines
29 KiB
C++
//-*-C++-*-
|
|
|
|
#include <optional>
|
|
#include <pybind11/pybind11.h>
|
|
#include <pybind11/stl.h>
|
|
#include <pybind11/buffer_info.h>
|
|
|
|
#include <pybind/utils/pybind11_json.h>
|
|
|
|
#include <pybind/docling_parser.h>
|
|
#include <pybind/docling_sanitizer.h>
|
|
|
|
// Include parse headers for typed bindings
|
|
#include <parse.h>
|
|
|
|
PYBIND11_MODULE(pdf_parsers, m) {
|
|
|
|
// ============= Typed Resource Bindings (for zero-copy access) =============
|
|
|
|
// PdfCell - individual text cell with bounding box and text content
|
|
pybind11::class_<pdflib::pdf_resource<pdflib::PAGE_CELL>>(m, "PdfCell")
|
|
.def_readonly("x0", &pdflib::pdf_resource<pdflib::PAGE_CELL>::x0)
|
|
.def_readonly("y0", &pdflib::pdf_resource<pdflib::PAGE_CELL>::y0)
|
|
.def_readonly("x1", &pdflib::pdf_resource<pdflib::PAGE_CELL>::x1)
|
|
.def_readonly("y1", &pdflib::pdf_resource<pdflib::PAGE_CELL>::y1)
|
|
.def_readonly("r_x0", &pdflib::pdf_resource<pdflib::PAGE_CELL>::r_x0)
|
|
.def_readonly("r_y0", &pdflib::pdf_resource<pdflib::PAGE_CELL>::r_y0)
|
|
.def_readonly("r_x1", &pdflib::pdf_resource<pdflib::PAGE_CELL>::r_x1)
|
|
.def_readonly("r_y1", &pdflib::pdf_resource<pdflib::PAGE_CELL>::r_y1)
|
|
.def_readonly("r_x2", &pdflib::pdf_resource<pdflib::PAGE_CELL>::r_x2)
|
|
.def_readonly("r_y2", &pdflib::pdf_resource<pdflib::PAGE_CELL>::r_y2)
|
|
.def_readonly("r_x3", &pdflib::pdf_resource<pdflib::PAGE_CELL>::r_x3)
|
|
.def_readonly("r_y3", &pdflib::pdf_resource<pdflib::PAGE_CELL>::r_y3)
|
|
.def_readonly("text", &pdflib::pdf_resource<pdflib::PAGE_CELL>::text)
|
|
.def_readonly("rendering_mode", &pdflib::pdf_resource<pdflib::PAGE_CELL>::rendering_mode)
|
|
.def_readonly("space_width", &pdflib::pdf_resource<pdflib::PAGE_CELL>::space_width)
|
|
.def_readonly("enc_name", &pdflib::pdf_resource<pdflib::PAGE_CELL>::enc_name)
|
|
.def_readonly("font_enc", &pdflib::pdf_resource<pdflib::PAGE_CELL>::font_enc)
|
|
.def_readonly("font_key", &pdflib::pdf_resource<pdflib::PAGE_CELL>::font_key)
|
|
.def_readonly("font_name", &pdflib::pdf_resource<pdflib::PAGE_CELL>::font_name)
|
|
.def_readonly("widget", &pdflib::pdf_resource<pdflib::PAGE_CELL>::widget)
|
|
.def_readonly("left_to_right", &pdflib::pdf_resource<pdflib::PAGE_CELL>::left_to_right);
|
|
|
|
// PdfLine - graphic line with coordinates
|
|
pybind11::class_<pdflib::pdf_resource<pdflib::PAGE_LINE>>(m, "PdfLine")
|
|
.def("get_x", &pdflib::pdf_resource<pdflib::PAGE_LINE>::get_x,
|
|
pybind11::return_value_policy::reference_internal,
|
|
"Get x coordinates of line points")
|
|
.def("get_y", &pdflib::pdf_resource<pdflib::PAGE_LINE>::get_y,
|
|
pybind11::return_value_policy::reference_internal,
|
|
"Get y coordinates of line points")
|
|
.def("get_i", &pdflib::pdf_resource<pdflib::PAGE_LINE>::get_i,
|
|
pybind11::return_value_policy::reference_internal,
|
|
"Get segment indices")
|
|
.def("__len__", &pdflib::pdf_resource<pdflib::PAGE_LINE>::size);
|
|
|
|
// PdfImage - bitmap resource with bounding box and image data
|
|
pybind11::class_<pdflib::pdf_resource<pdflib::PAGE_IMAGE>>(m, "PdfImage")
|
|
.def_readonly("x0", &pdflib::pdf_resource<pdflib::PAGE_IMAGE>::x0)
|
|
.def_readonly("y0", &pdflib::pdf_resource<pdflib::PAGE_IMAGE>::y0)
|
|
.def_readonly("x1", &pdflib::pdf_resource<pdflib::PAGE_IMAGE>::x1)
|
|
.def_readonly("y1", &pdflib::pdf_resource<pdflib::PAGE_IMAGE>::y1)
|
|
.def_readonly("image_width", &pdflib::pdf_resource<pdflib::PAGE_IMAGE>::image_width)
|
|
.def_readonly("image_height", &pdflib::pdf_resource<pdflib::PAGE_IMAGE>::image_height)
|
|
.def("get_image_format", &pdflib::pdf_resource<pdflib::PAGE_IMAGE>::get_image_format,
|
|
"Get image format hint: 'jpeg', 'jp2', 'jbig2', or 'raw'")
|
|
.def("get_pil_mode", &pdflib::pdf_resource<pdflib::PAGE_IMAGE>::get_pil_mode,
|
|
"Get PIL-compatible mode string: 'L', 'RGB', 'CMYK', or '1'")
|
|
.def("get_image_as_bytes",
|
|
[](pdflib::pdf_resource<pdflib::PAGE_IMAGE> const& self) {
|
|
auto data = self.get_image_as_bytes();
|
|
return pybind11::bytes(reinterpret_cast<char const*>(data.data()),
|
|
data.size());
|
|
},
|
|
"Get image data as bytes (corrected JPEG, raw JP2, or decoded pixels)");
|
|
|
|
// PdfPageDimension - page geometry and bounding boxes
|
|
pybind11::class_<pdflib::pdf_resource<pdflib::PAGE_DIMENSION>>(m, "PdfPageDimension")
|
|
.def("get_angle", &pdflib::pdf_resource<pdflib::PAGE_DIMENSION>::get_angle,
|
|
"Get page rotation angle in degrees")
|
|
.def("get_crop_bbox", &pdflib::pdf_resource<pdflib::PAGE_DIMENSION>::get_crop_bbox,
|
|
"Get crop box as [x0, y0, x1, y1]")
|
|
.def("get_media_bbox", &pdflib::pdf_resource<pdflib::PAGE_DIMENSION>::get_media_bbox,
|
|
"Get media box as [x0, y0, x1, y1]");
|
|
|
|
// ============= Container Type Bindings =============
|
|
|
|
// PdfCells - iterable container of PdfCell objects
|
|
pybind11::class_<pdflib::pdf_resource<pdflib::PAGE_CELLS>>(m, "PdfCells")
|
|
.def("__len__", &pdflib::pdf_resource<pdflib::PAGE_CELLS>::size)
|
|
.def("__getitem__", [](pdflib::pdf_resource<pdflib::PAGE_CELLS>& self, size_t i)
|
|
-> pdflib::pdf_resource<pdflib::PAGE_CELL>& {
|
|
if (i >= self.size()) {
|
|
throw pybind11::index_error("index out of range");
|
|
}
|
|
return self[i];
|
|
}, pybind11::return_value_policy::reference_internal)
|
|
.def("__iter__", [](pdflib::pdf_resource<pdflib::PAGE_CELLS>& self) {
|
|
return pybind11::make_iterator(self.begin(), self.end());
|
|
}, pybind11::keep_alive<0, 1>());
|
|
|
|
// PdfLines - iterable container of PdfLine objects
|
|
pybind11::class_<pdflib::pdf_resource<pdflib::PAGE_LINES>>(m, "PdfLines")
|
|
.def("__len__", &pdflib::pdf_resource<pdflib::PAGE_LINES>::size)
|
|
.def("__getitem__", [](pdflib::pdf_resource<pdflib::PAGE_LINES>& self, size_t i)
|
|
-> pdflib::pdf_resource<pdflib::PAGE_LINE>& {
|
|
if (i >= self.size()) {
|
|
throw pybind11::index_error("index out of range");
|
|
}
|
|
return self[i];
|
|
}, pybind11::return_value_policy::reference_internal)
|
|
.def("__iter__", [](pdflib::pdf_resource<pdflib::PAGE_LINES>& self) {
|
|
return pybind11::make_iterator(self.begin(), self.end());
|
|
}, pybind11::keep_alive<0, 1>());
|
|
|
|
// PdfImages - iterable container of PdfImage objects
|
|
pybind11::class_<pdflib::pdf_resource<pdflib::PAGE_IMAGES>>(m, "PdfImages")
|
|
.def("__len__", &pdflib::pdf_resource<pdflib::PAGE_IMAGES>::size)
|
|
.def("__getitem__", [](pdflib::pdf_resource<pdflib::PAGE_IMAGES>& self, size_t i)
|
|
-> pdflib::pdf_resource<pdflib::PAGE_IMAGE>& {
|
|
if (i >= self.size()) {
|
|
throw pybind11::index_error("index out of range");
|
|
}
|
|
return self[i];
|
|
}, pybind11::return_value_policy::reference_internal)
|
|
.def("__iter__", [](pdflib::pdf_resource<pdflib::PAGE_IMAGES>& self) {
|
|
return pybind11::make_iterator(self.begin(), self.end());
|
|
}, pybind11::keep_alive<0, 1>());
|
|
|
|
// ============= Page Decoder Binding =============
|
|
|
|
// PdfPageDecoder - provides typed access to decoded page data
|
|
pybind11::class_<pdflib::pdf_decoder<pdflib::PAGE>,
|
|
std::shared_ptr<pdflib::pdf_decoder<pdflib::PAGE>>>(m, "PdfPageDecoder")
|
|
.def("get_page_number", &pdflib::pdf_decoder<pdflib::PAGE>::get_page_number,
|
|
"Get the page number (0-indexed)")
|
|
.def("get_page_dimension", &pdflib::pdf_decoder<pdflib::PAGE>::get_page_dimension,
|
|
pybind11::return_value_policy::reference_internal,
|
|
"Get page dimension/geometry")
|
|
.def("get_char_cells", &pdflib::pdf_decoder<pdflib::PAGE>::get_char_cells,
|
|
pybind11::return_value_policy::reference_internal,
|
|
"Get individual character cells")
|
|
.def("get_word_cells", &pdflib::pdf_decoder<pdflib::PAGE>::get_word_cells,
|
|
pybind11::return_value_policy::reference_internal,
|
|
"Get word cells (aggregated from char cells)")
|
|
.def("get_line_cells", &pdflib::pdf_decoder<pdflib::PAGE>::get_line_cells,
|
|
pybind11::return_value_policy::reference_internal,
|
|
"Get line cells (aggregated from char cells)")
|
|
.def("get_page_lines", &pdflib::pdf_decoder<pdflib::PAGE>::get_page_lines,
|
|
pybind11::return_value_policy::reference_internal,
|
|
"Get graphic lines on the page")
|
|
.def("get_page_images", &pdflib::pdf_decoder<pdflib::PAGE>::get_page_images,
|
|
pybind11::return_value_policy::reference_internal,
|
|
"Get bitmap/image resources on the page")
|
|
.def("has_word_cells", &pdflib::pdf_decoder<pdflib::PAGE>::has_word_cells,
|
|
"Check if word cells have been created")
|
|
.def("has_line_cells", &pdflib::pdf_decoder<pdflib::PAGE>::has_line_cells,
|
|
"Check if line cells have been created")
|
|
.def("get_timings", [](pdflib::pdf_decoder<pdflib::PAGE>& self) {
|
|
// Return as Dict[str, float] (sums) for backward compatibility
|
|
return self.get_timings().to_sum_map();
|
|
},
|
|
"Get timing information for page decoding as Dict[str, float]")
|
|
.def("get_timings_raw", [](pdflib::pdf_decoder<pdflib::PAGE>& self) {
|
|
// Return as Dict[str, List[float]] for detailed timing data
|
|
return self.get_timings().get_raw_data();
|
|
},
|
|
"Get detailed timing information as Dict[str, List[float]]")
|
|
.def("get_static_timings", [](pdflib::pdf_decoder<pdflib::PAGE>& self) {
|
|
return self.get_timings().get_static_timings();
|
|
},
|
|
"Get only static (constant) timing keys as Dict[str, float]")
|
|
.def("get_dynamic_timings", [](pdflib::pdf_decoder<pdflib::PAGE>& self) {
|
|
return self.get_timings().get_dynamic_timings();
|
|
},
|
|
"Get only dynamic timing keys as Dict[str, float]");
|
|
|
|
// ============= Timing Keys Constants =============
|
|
|
|
m.attr("TIMING_KEY_DECODE_PAGE") = pdflib::pdf_timings::KEY_DECODE_PAGE;
|
|
m.attr("TIMING_KEY_DECODE_DIMENSIONS") = pdflib::pdf_timings::KEY_DECODE_DIMENSIONS;
|
|
m.attr("TIMING_KEY_DECODE_RESOURCES") = pdflib::pdf_timings::KEY_DECODE_RESOURCES;
|
|
m.attr("TIMING_KEY_DECODE_GRPHS") = pdflib::pdf_timings::KEY_DECODE_GRPHS;
|
|
m.attr("TIMING_KEY_DECODE_FONTS") = pdflib::pdf_timings::KEY_DECODE_FONTS;
|
|
m.attr("TIMING_KEY_DECODE_XOBJECTS") = pdflib::pdf_timings::KEY_DECODE_XOBJECTS;
|
|
m.attr("TIMING_KEY_DECODE_CONTENTS") = pdflib::pdf_timings::KEY_DECODE_CONTENTS;
|
|
m.attr("TIMING_KEY_DECODE_ANNOTS") = pdflib::pdf_timings::KEY_DECODE_ANNOTS;
|
|
m.attr("TIMING_KEY_SANITISE_CONTENTS") = pdflib::pdf_timings::KEY_SANITISE_CONTENTS;
|
|
m.attr("TIMING_KEY_CREATE_WORD_CELLS") = pdflib::pdf_timings::KEY_CREATE_WORD_CELLS;
|
|
m.attr("TIMING_KEY_CREATE_LINE_CELLS") = pdflib::pdf_timings::KEY_CREATE_LINE_CELLS;
|
|
m.attr("TIMING_KEY_DECODE_FONTS_TOTAL") = pdflib::pdf_timings::KEY_DECODE_FONTS_TOTAL;
|
|
|
|
// Additional decode_page step keys
|
|
m.attr("TIMING_KEY_TO_JSON_PAGE") = pdflib::pdf_timings::KEY_TO_JSON_PAGE;
|
|
m.attr("TIMING_KEY_EXTRACT_ANNOTS_JSON") = pdflib::pdf_timings::KEY_EXTRACT_ANNOTS_JSON;
|
|
m.attr("TIMING_KEY_ROTATE_CONTENTS") = pdflib::pdf_timings::KEY_ROTATE_CONTENTS;
|
|
m.attr("TIMING_KEY_SANITIZE_ORIENTATION") = pdflib::pdf_timings::KEY_SANITIZE_ORIENTATION;
|
|
m.attr("TIMING_KEY_SANITIZE_CELLS") = pdflib::pdf_timings::KEY_SANITIZE_CELLS;
|
|
|
|
m.attr("TIMING_KEY_PROCESS_DOCUMENT_FROM_FILE") = pdflib::pdf_timings::KEY_PROCESS_DOCUMENT_FROM_FILE;
|
|
m.attr("TIMING_KEY_PROCESS_DOCUMENT_FROM_BYTESIO") = pdflib::pdf_timings::KEY_PROCESS_DOCUMENT_FROM_BYTESIO;
|
|
m.attr("TIMING_KEY_DECODE_DOCUMENT") = pdflib::pdf_timings::KEY_DECODE_DOCUMENT;
|
|
|
|
m.attr("TIMING_PREFIX_DECODE_FONT") = pdflib::pdf_timings::PREFIX_DECODE_FONT;
|
|
m.attr("TIMING_PREFIX_DECODING_PAGE") = pdflib::pdf_timings::PREFIX_DECODING_PAGE;
|
|
m.attr("TIMING_PREFIX_DECODE_PAGE") = pdflib::pdf_timings::PREFIX_DECODE_PAGE;
|
|
|
|
m.def("get_static_timing_keys", &pdflib::pdf_timings::get_static_keys,
|
|
"Get all static timing keys as Set[str]");
|
|
m.def("is_static_timing_key", &pdflib::pdf_timings::is_static_key,
|
|
pybind11::arg("key"),
|
|
"Check if a timing key is static (constant)");
|
|
m.def("get_decode_page_timing_keys", &pdflib::pdf_timings::get_decode_page_keys,
|
|
"Get timing keys used in decode_page method (in order, excluding global timer) as List[str]");
|
|
|
|
// ============= PDF Parser =============
|
|
|
|
// next generation parser, 10x faster with more finegrained output
|
|
pybind11::class_<docling::docling_parser>(m, "pdf_parser")
|
|
.def(pybind11::init())
|
|
|
|
.def(pybind11::init<const std::string&>(),
|
|
pybind11::arg("level"),
|
|
R"(
|
|
Construct pdf_parser with logging level.
|
|
|
|
Parameters:
|
|
level (str): Logging level as a string.
|
|
One of ['fatal', 'error', 'warning', 'info'])")
|
|
|
|
.def("set_loglevel",
|
|
[](docling::docling_parser &self, int level) -> void {
|
|
self.set_loglevel(level);
|
|
},
|
|
pybind11::arg("level"),
|
|
R"(
|
|
Set the log level using an integer.
|
|
|
|
Parameters:
|
|
level (int): Logging level as an integer.
|
|
One of [`fatal`=0, `error`=1, `warning`=2, `info`=3])")
|
|
|
|
.def("set_loglevel_with_label",
|
|
[](docling::docling_parser &self, const std::string &level) -> void {
|
|
self.set_loglevel_with_label(level);
|
|
},
|
|
pybind11::arg("level"),
|
|
R"(
|
|
Set the log level using a string label.
|
|
|
|
Parameters:
|
|
level (str): Logging level as a string.
|
|
One of ['fatal', 'error', 'warning', 'info']
|
|
)")
|
|
|
|
.def("is_loaded",
|
|
[](docling::docling_parser &self, const std::string &key) -> bool {
|
|
return self.is_loaded(key);
|
|
},
|
|
pybind11::arg("key"),
|
|
R"(
|
|
Check if a document with the given key is loaded.
|
|
|
|
Parameters:
|
|
key (str): The unique key of the document to check.
|
|
|
|
Returns:
|
|
bool: True if the document is loaded, False otherwise.)")
|
|
|
|
.def("list_loaded_keys",
|
|
[](docling::docling_parser &self) -> std::vector<std::string> {
|
|
return self.list_loaded_keys();
|
|
},
|
|
R"(
|
|
List the keys of the loaded documents.
|
|
|
|
Returns:
|
|
List[str]: A list of keys for the currently loaded documents.)")
|
|
|
|
.def("load_document",
|
|
[](
|
|
docling::docling_parser &self,
|
|
const std::string &key,
|
|
const std::string &filename,
|
|
std::optional<std::string>& password
|
|
) -> bool {
|
|
return self.load_document(key, filename, password);
|
|
},
|
|
pybind11::arg("key"),
|
|
pybind11::arg("filename"),
|
|
pybind11::arg("password") = pybind11::none(),
|
|
R"(
|
|
Load a document by key and filename.
|
|
|
|
Parameters:
|
|
key (str): The unique key to identify the document.
|
|
filename (str): The path to the document file to load.
|
|
password (str, optional): Optional password for password-protected files
|
|
|
|
Returns:
|
|
bool: True if the document was successfully loaded, False otherwise.)")
|
|
|
|
.def("load_document_from_bytesio",
|
|
[](docling::docling_parser &self, const std::string &key, pybind11::object bytes_io) -> bool {
|
|
return self.load_document_from_bytesio(key, bytes_io);
|
|
},
|
|
pybind11::arg("key"),
|
|
pybind11::arg("bytes_io"),
|
|
R"(
|
|
Load a document by key from a BytesIO-like object.
|
|
|
|
Parameters:
|
|
key (str): The unique key to identify the document.
|
|
bytes_io (Any): A BytesIO-like object containing the document data.
|
|
|
|
Returns:
|
|
bool: True if the document was successfully loaded, False otherwise.)")
|
|
|
|
.def("unload_document",
|
|
[](docling::docling_parser &self, const std::string &key) -> bool {
|
|
return self.unload_document(key);
|
|
},
|
|
R"(
|
|
Unload a document by its unique key.
|
|
|
|
Parameters:
|
|
key (str): The unique key of the document to unload.
|
|
|
|
Returns:
|
|
bool: True if the document was successfully unloaded, False otherwise.)")
|
|
|
|
.def("unload_document_pages",
|
|
[](docling::docling_parser &self, const std::string &key) -> bool {
|
|
return self.unload_document_pages(key);
|
|
},
|
|
pybind11::arg("key"),
|
|
R"(
|
|
Unload the only the cached pages of the document by its unique key.
|
|
|
|
Parameters:
|
|
key (str): The unique key of the document to unload.
|
|
|
|
Returns:
|
|
bool: True if the document was successfully unloaded, False otherwise.)")
|
|
|
|
.def("unload_document_page",
|
|
[](docling::docling_parser &self, const std::string &key, int page) -> bool {
|
|
return self.unload_document_page(key, page);
|
|
},
|
|
pybind11::arg("key"),
|
|
pybind11::arg("page"),
|
|
R"(
|
|
Unload a single page of the document by its unique key and page_number.
|
|
|
|
Parameters:
|
|
key (str): The unique key of the document to unload.
|
|
page (int): The page number of the document to unload.
|
|
|
|
Returns:
|
|
bool: True if the document was successfully unloaded, False otherwise.)")
|
|
|
|
.def("number_of_pages",
|
|
[](docling::docling_parser &self, const std::string &key) -> int {
|
|
return self.number_of_pages(key);
|
|
},
|
|
pybind11::arg("key"),
|
|
R"(
|
|
Get the number of pages in the document identified by its unique key.
|
|
|
|
Parameters:
|
|
key (str): The unique key of the document.
|
|
|
|
Returns:
|
|
int: The number of pages in the document.)")
|
|
|
|
.def("get_annotations",
|
|
[](docling::docling_parser &self, const std::string &key) -> nlohmann::json {
|
|
return self.get_annotations(key);
|
|
},
|
|
pybind11::arg("key"),
|
|
R"(
|
|
Retrieve annotations for the document identified by its unique key and return them as JSON.
|
|
|
|
Parameters:
|
|
key (str): The unique key of the document.
|
|
|
|
Returns:
|
|
dict: A JSON object containing the annotations for the document.)")
|
|
|
|
.def("get_table_of_contents",
|
|
[](docling::docling_parser &self, const std::string &key) -> nlohmann::json {
|
|
return self.get_table_of_contents(key);
|
|
},
|
|
pybind11::arg("key"),
|
|
R"(
|
|
Retrieve the table of contents for the document identified by its unique key and return it as JSON.
|
|
|
|
Parameters:
|
|
key (str): The unique key of the document.
|
|
|
|
Returns:
|
|
dict: A JSON object representing the table of contents of the document.)")
|
|
|
|
.def("get_meta_xml",
|
|
[](docling::docling_parser &self, const std::string &key) -> nlohmann::json {
|
|
return self.get_meta_xml(key);
|
|
},
|
|
pybind11::arg("key"),
|
|
R"(
|
|
Retrieve the meta data in string or None.
|
|
|
|
Parameters:
|
|
key (str): The unique key of the document.
|
|
|
|
Returns:
|
|
dict: A None or string of the metadata in xml of the document.)")
|
|
|
|
.def("parse_pdf_from_key",
|
|
[](docling::docling_parser &self,
|
|
const std::string &key,
|
|
const std::string &page_boundary,
|
|
bool do_sanitization
|
|
) -> nlohmann::json {
|
|
return self.parse_pdf_from_key(key, page_boundary, do_sanitization);
|
|
},
|
|
pybind11::arg("key"),
|
|
pybind11::arg("page_boundary") = "crop_box", // media_box
|
|
pybind11::arg("do_sanitization") = true, // media_box
|
|
R"(
|
|
Parse the PDF document identified by its unique key and return a JSON representation.
|
|
|
|
Parameters:
|
|
key (str): The unique key of the document.
|
|
page_boundary (str): The page boundary specification for parsing. One of [`crop_box`, `media_box`].
|
|
do_sanitization: Sanitize the chars into lines [default=true].
|
|
|
|
Returns:
|
|
dict: A JSON representation of the parsed PDF document.)")
|
|
|
|
.def("parse_pdf_from_key_on_page",
|
|
[](docling::docling_parser &self,
|
|
const std::string &key,
|
|
int page,
|
|
const std::string &page_boundary,
|
|
bool do_sanitization,
|
|
bool keep_char_cells,
|
|
bool keep_lines,
|
|
bool keep_bitmaps,
|
|
bool create_word_cells,
|
|
bool create_line_cells) -> nlohmann::json {
|
|
return self.parse_pdf_from_key_on_page(key,
|
|
page,
|
|
page_boundary,
|
|
do_sanitization,
|
|
keep_char_cells,
|
|
keep_lines,
|
|
keep_bitmaps,
|
|
create_word_cells,
|
|
create_line_cells);
|
|
},
|
|
pybind11::arg("key"),
|
|
pybind11::arg("page"),
|
|
pybind11::arg("page_boundary") = "crop_box", // media_box
|
|
pybind11::arg("do_sanitization") = true,
|
|
pybind11::arg("keep_char_cells") = true,
|
|
pybind11::arg("keep_lines") = true,
|
|
pybind11::arg("keep_bitmaps") = true,
|
|
pybind11::arg("create_word_cells") = true,
|
|
pybind11::arg("create_line_cells") = true,
|
|
R"(
|
|
Parse a specific page of the PDF document identified by its unique key and return a JSON representation.
|
|
|
|
Parameters:
|
|
key (str): The unique key of the document.
|
|
page (int): The page number to parse.
|
|
page_boundary (str): The page boundary specification for parsing [choices: crop_box, media_box].
|
|
do_sanitization: Sanitize the chars into lines [default=true].
|
|
keep_char_cells: keep all the individual char's
|
|
keep_lines: keep all the lines
|
|
keep_bitmaps: keep all the bitmap resources
|
|
create_word_cells: create words from the char-cells
|
|
create_line_cells: create lines from the char-cells
|
|
|
|
Returns:
|
|
dict: A JSON representation of the parsed page.)")
|
|
|
|
.def("get_page_decoder",
|
|
[](docling::docling_parser &self,
|
|
const std::string &key,
|
|
int page,
|
|
const std::string &page_boundary,
|
|
bool do_sanitization,
|
|
bool create_word_cells,
|
|
bool create_line_cells) -> std::shared_ptr<pdflib::pdf_decoder<pdflib::PAGE>> {
|
|
return self.get_page_decoder(key,
|
|
page,
|
|
page_boundary,
|
|
do_sanitization,
|
|
create_word_cells,
|
|
create_line_cells);
|
|
},
|
|
pybind11::arg("key"),
|
|
pybind11::arg("page"),
|
|
pybind11::arg("page_boundary") = "crop_box",
|
|
pybind11::arg("do_sanitization") = true,
|
|
pybind11::arg("create_word_cells") = true,
|
|
pybind11::arg("create_line_cells") = true,
|
|
R"(
|
|
Get a typed page decoder for direct access to page data without JSON serialization.
|
|
|
|
This method provides efficient access to parsed page data through typed Python objects
|
|
instead of dictionaries. Use this for better performance when processing large documents.
|
|
|
|
Parameters:
|
|
key (str): The unique key of the document.
|
|
page (int): The page number to parse (0-indexed).
|
|
page_boundary (str): The page boundary specification [choices: crop_box, media_box].
|
|
do_sanitization (bool): Sanitize the chars into lines [default=true].
|
|
create_word_cells (bool): Create word cells from char cells [default=true].
|
|
create_line_cells (bool): Create line cells from char cells [default=true].
|
|
|
|
Returns:
|
|
PdfPageDecoder: A typed page decoder object with direct access to:
|
|
- get_char_cells(): Individual character cells
|
|
- get_word_cells(): Word cells (aggregated)
|
|
- get_line_cells(): Line cells (aggregated)
|
|
- get_page_lines(): Graphic lines
|
|
- get_page_images(): Bitmap resources
|
|
- get_page_dimension(): Page geometry)")
|
|
|
|
.def("sanitize_cells",
|
|
[](docling::docling_parser &self,
|
|
nlohmann::json &original_cells,
|
|
nlohmann::json &page_dim,
|
|
nlohmann::json &page_lines,
|
|
double horizontal_cell_tolerance,
|
|
bool enforce_same_font,
|
|
double space_width_factor_for_merge,
|
|
double space_width_factor_for_merge_with_space) -> nlohmann::json {
|
|
return self.sanitize_cells(original_cells,
|
|
page_dim,
|
|
page_lines,
|
|
horizontal_cell_tolerance,
|
|
enforce_same_font,
|
|
space_width_factor_for_merge,
|
|
space_width_factor_for_merge_with_space
|
|
);
|
|
},
|
|
pybind11::arg("original_cells"),
|
|
pybind11::arg("page_dimension"),
|
|
pybind11::arg("page_lines"),
|
|
pybind11::arg("horizontal_cell_tolerance")=1.0,
|
|
pybind11::arg("enforce_same_font")=true,
|
|
pybind11::arg("space_width_factor_for_merge")=1.5,
|
|
pybind11::arg("space_width_factor_for_merge_with_space")=0.33,
|
|
R"(
|
|
Sanitize table cells with specified parameters and return the processed JSON.
|
|
|
|
Parameters:
|
|
original_cells (dict): The original table cells as a JSON object.
|
|
page_dim (dict): Page dimensions as a JSON object.
|
|
page_lines (dict): Page lines as a JSON object.
|
|
horizontal_cell_tolerance (float): Vertical adjustment parameter to judge if two cells need to be merged (yes if abs(cell_i.r_y1-cell_i.r_y0)<horizontal_cell_tolerance), default = 1.0.
|
|
enforce_same_font (bool): Whether to enforce the same font across cells. Default = True.
|
|
space_width_factor_for_merge (float): Factor for merging cells based on space width. Default = 1.5.
|
|
space_width_factor_for_merge_with_space (float): Factor for merging cells with space width. Default = 0.33.
|
|
|
|
Returns:
|
|
dict: A JSON object representing the sanitized table cells.)")
|
|
|
|
.def("sanitize_cells_in_bbox",
|
|
[](docling::docling_parser &self,
|
|
nlohmann::json &page,
|
|
const std::array<double, 4> &bbox,
|
|
double cell_overlap,
|
|
double horizontal_cell_tolerance,
|
|
bool enforce_same_font,
|
|
double space_width_factor_for_merge = 1.5,
|
|
double space_width_factor_for_merge_with_space = 0.33) -> nlohmann::json {
|
|
return self.sanitize_cells_in_bbox(page,
|
|
bbox,
|
|
cell_overlap,
|
|
horizontal_cell_tolerance,
|
|
enforce_same_font,
|
|
space_width_factor_for_merge,
|
|
space_width_factor_for_merge_with_space
|
|
);
|
|
},
|
|
pybind11::arg("page"),
|
|
pybind11::arg("bbox"),
|
|
pybind11::arg("cell_overlap")=0.99,
|
|
pybind11::arg("horizontal_cell_tolerance")=1.0,
|
|
pybind11::arg("enforce_same_font")=true,
|
|
pybind11::arg("space_width_factor_for_merge")=1.5,
|
|
pybind11::arg("space_width_factor_for_merge_with_space")=0.33,
|
|
R"(
|
|
Sanitize table cells in a given bounding box with specified parameters and return the processed JSON.
|
|
|
|
Parameters:
|
|
page (dict): The JSON object representing the page.
|
|
bbox (Tuple[float, float, float, float]): Bounding box specified as [x_min, y_min, x_max, y_max].
|
|
cell_overlap (float: 0.0-1.0): Overlap of cell (%) with bounding-box.
|
|
horizontal_cell_tolerance (float): Vertical adjustment parameter to judge if two cells need to be merged (yes if abs(cell_i.r_y1-cell_i.r_y0)<horizontal_cell_tolerance), default = 1.0.
|
|
enforce_same_font (bool): Whether to enforce the same font across cells. Default is True
|
|
space_width_factor_for_merge (float): Factor for merging cells based on space width. Default is 1.5.
|
|
space_width_factor_for_merge_with_space (float): Factor for merging cells with space width. Default is 0.33.
|
|
|
|
Returns:
|
|
dict: A JSON object representing the sanitized table cells in the bounding box.)");
|
|
|
|
|
|
// purely for backward compatibility
|
|
pybind11::class_<docling::docling_sanitizer>(m, "pdf_sanitizer")
|
|
.def(pybind11::init())
|
|
|
|
.def(pybind11::init<const std::string&>(),
|
|
pybind11::arg("level"),
|
|
R"(
|
|
Construct docling_sanitizer with logging level.
|
|
|
|
Parameters:
|
|
level (str): Logging level as a string.
|
|
One of ['fatal', 'error', 'warning', 'info'])")
|
|
|
|
.def("set_loglevel",
|
|
[](docling::docling_sanitizer &self, int level) -> void {
|
|
self.set_loglevel(level);
|
|
},
|
|
pybind11::arg("level"),
|
|
R"(
|
|
Set the log level using an integer.
|
|
|
|
Parameters:
|
|
level (int): Logging level as an integer.
|
|
One of [`fatal`=0, `error`=1, `warning`=2, `info`=3])")
|
|
|
|
.def("set_loglevel_with_label",
|
|
[](docling::docling_sanitizer &self, const std::string &level) -> void {
|
|
self.set_loglevel_with_label(level);
|
|
},
|
|
pybind11::arg("level"),
|
|
R"(
|
|
Set the log level using a string label.
|
|
|
|
Parameters:
|
|
level (str): Logging level as a string.
|
|
One of ['fatal', 'error', 'warning', 'info']
|
|
)")
|
|
|
|
.def("set_char_cells",
|
|
[](docling::docling_sanitizer &self,
|
|
nlohmann::json& data) -> nlohmann::json {
|
|
return self.set_char_cells(data);
|
|
},
|
|
pybind11::arg("data"),
|
|
R"(
|
|
Set char cells
|
|
|
|
Parameters:
|
|
data: A JSON object (for with data and header) or a list or records
|
|
|
|
Returns:
|
|
dict: A JSON object representing the sanitized word cells in the bounding box.)")
|
|
|
|
.def("create_word_cells",
|
|
[](docling::docling_sanitizer &self,
|
|
double horizontal_cell_tolerance,
|
|
bool enforce_same_font,
|
|
double space_width_factor_for_merge/* = 1.0*/) -> nlohmann::json {
|
|
return self.create_word_cells(horizontal_cell_tolerance,
|
|
enforce_same_font,
|
|
space_width_factor_for_merge);
|
|
},
|
|
pybind11::arg("horizontal_cell_tolerance"), // =1.0,
|
|
pybind11::arg("enforce_same_font"), //=true,
|
|
pybind11::arg("space_width_factor_for_merge"), //=0.33,
|
|
R"(
|
|
Create word cells
|
|
|
|
Parameters:
|
|
horizontal_cell_tolerance (float): Vertical adjustment parameter to judge if two cells need to be merged (yes if abs(cell_i.r_y1-cell_i.r_y0)<horizontal_cell_tolerance), default = 1.0.
|
|
enforce_same_font (bool): Whether to enforce the same font across cells. Default is True
|
|
space_width_factor_for_merge (float): Factor for merging cells based on space width. Default is 0.33.
|
|
|
|
Returns:
|
|
dict: A JSON object representing the sanitized word cells in the bounding box.)")
|
|
|
|
.def("create_line_cells",
|
|
[](docling::docling_sanitizer &self,
|
|
double horizontal_cell_tolerance,
|
|
bool enforce_same_font,
|
|
double space_width_factor_for_merge /*= 1.0*/,
|
|
double space_width_factor_for_merge_with_space /*= 0.33*/) -> nlohmann::json {
|
|
return self.create_line_cells(horizontal_cell_tolerance,
|
|
enforce_same_font,
|
|
space_width_factor_for_merge,
|
|
space_width_factor_for_merge_with_space);
|
|
},
|
|
pybind11::arg("horizontal_cell_tolerance"), //=1.0,
|
|
pybind11::arg("enforce_same_font"), //=true,
|
|
pybind11::arg("space_width_factor_for_merge"), //=1.0,
|
|
pybind11::arg("space_width_factor_for_merge_with_space"), //=0.33,
|
|
R"(
|
|
Create line cells
|
|
|
|
Parameters:
|
|
horizontal_cell_tolerance (float): Vertical adjustment parameter to judge if two cells need to be merged (yes if abs(cell_i.r_y1-cell_i.r_y0)<horizontal_cell_tolerance), default = 1.0.
|
|
enforce_same_font (bool): Whether to enforce the same font across cells. Default is True
|
|
space_width_factor_for_merge (float): Factor for merging cells based on space width. Default is 1.0.
|
|
space_width_factor_for_merge_with_space (float): Factor for merging cells with space width. Default is 0.33.
|
|
|
|
Returns:
|
|
dict: A JSON object representing the sanitized line cells in the bounding box.)");
|
|
|
|
|
|
}
|