//-*-C++-*- #include #include #include #include #include #include #include // Include parse headers for typed bindings #include PYBIND11_MODULE(pdf_parsers, m) { // ============= Decode Page Config ============= pybind11::class_(m, "DecodePageConfig", R"( Configuration parameters for page decoding. Attributes: page_boundary (str): The page boundary specification [choices: crop_box, media_box]. do_sanitization (bool): Sanitize the chars into lines [default=true]. keep_char_cells (bool): Keep all the individual char cells [default=true]. keep_shapes (bool): Keep all the graphic shapes [default=true]. keep_bitmaps (bool): Keep all the bitmap resources [default=true]. max_num_lines (int): Maximum number of lines to keep (-1 means no cap) [default=-1]. max_num_bitmaps (int): Maximum number of bitmaps to keep (-1 means no cap) [default=-1]. keep_glyphs (bool): If true, keep GLYPH<...> fallback strings in output; if false, replace them with a space [default=false]. keep_qpdf_warnings (bool): If true, QPDF warnings are emitted; if false, they are suppressed [default=false]. )") .def(pybind11::init<>()) .def_readwrite("page_boundary", &pdflib::decode_config::page_boundary) .def_readwrite("do_sanitization", &pdflib::decode_config::do_sanitization) .def_readwrite("keep_char_cells", &pdflib::decode_config::keep_char_cells) .def_readwrite("keep_shapes", &pdflib::decode_config::keep_shapes) .def_readwrite("keep_bitmaps", &pdflib::decode_config::keep_bitmaps) .def_readwrite("max_num_lines", &pdflib::decode_config::max_num_lines) .def_readwrite("max_num_bitmaps", &pdflib::decode_config::max_num_bitmaps) .def_readwrite("create_word_cells", &pdflib::decode_config::create_word_cells) .def_readwrite("create_line_cells", &pdflib::decode_config::create_line_cells) .def_readwrite("enforce_same_font", &pdflib::decode_config::enforce_same_font) .def_readwrite("horizontal_cell_tolerance", &pdflib::decode_config::horizontal_cell_tolerance) .def_readwrite("word_space_width_factor_for_merge", &pdflib::decode_config::word_space_width_factor_for_merge) .def_readwrite("line_space_width_factor_for_merge", &pdflib::decode_config::line_space_width_factor_for_merge) .def_readwrite("line_space_width_factor_for_merge_with_space", &pdflib::decode_config::line_space_width_factor_for_merge_with_space) .def_readwrite("do_thread_safe", &pdflib::decode_config::do_thread_safe) .def_readwrite("keep_glyphs", &pdflib::decode_config::keep_glyphs) .def_readwrite("keep_qpdf_warnings", &pdflib::decode_config::keep_qpdf_warnings); // ============= Typed Resource Bindings (for zero-copy access) ============= // PdfCell - individual text cell with bounding box and text content pybind11::class_>(m, "PdfCell") .def_readonly("x0", &pdflib::page_item::x0) .def_readonly("y0", &pdflib::page_item::y0) .def_readonly("x1", &pdflib::page_item::x1) .def_readonly("y1", &pdflib::page_item::y1) .def_readonly("r_x0", &pdflib::page_item::r_x0) .def_readonly("r_y0", &pdflib::page_item::r_y0) .def_readonly("r_x1", &pdflib::page_item::r_x1) .def_readonly("r_y1", &pdflib::page_item::r_y1) .def_readonly("r_x2", &pdflib::page_item::r_x2) .def_readonly("r_y2", &pdflib::page_item::r_y2) .def_readonly("r_x3", &pdflib::page_item::r_x3) .def_readonly("r_y3", &pdflib::page_item::r_y3) .def_readonly("text", &pdflib::page_item::text) .def_readonly("rendering_mode", &pdflib::page_item::rendering_mode) .def_readonly("space_width", &pdflib::page_item::space_width) .def_readonly("enc_name", &pdflib::page_item::enc_name) .def_readonly("font_enc", &pdflib::page_item::font_enc) .def_readonly("font_key", &pdflib::page_item::font_key) .def_readonly("font_name", &pdflib::page_item::font_name) .def_readonly("widget", &pdflib::page_item::widget) .def_readonly("left_to_right", &pdflib::page_item::left_to_right); // PdfShape - graphic shape with coordinates pybind11::class_>(m, "PdfShape") .def("get_x", &pdflib::page_item::get_x, pybind11::return_value_policy::reference_internal, "Get x coordinates of shape points") .def("get_y", &pdflib::page_item::get_y, pybind11::return_value_policy::reference_internal, "Get y coordinates of shape points") .def("get_i", &pdflib::page_item::get_i, pybind11::return_value_policy::reference_internal, "Get segment indices") .def("__len__", &pdflib::page_item::size) .def("get_has_graphics_state", &pdflib::page_item::get_has_graphics_state, "Check if graphics state has been set") .def("get_line_width", &pdflib::page_item::get_line_width, "Get line width") .def("get_miter_limit", &pdflib::page_item::get_miter_limit, "Get miter limit") .def("get_line_cap", &pdflib::page_item::get_line_cap, "Get line cap style") .def("get_line_join", &pdflib::page_item::get_line_join, "Get line join style") .def("get_dash_phase", &pdflib::page_item::get_dash_phase, "Get dash phase") .def("get_dash_array", &pdflib::page_item::get_dash_array, pybind11::return_value_policy::reference_internal, "Get dash array") .def("get_flatness", &pdflib::page_item::get_flatness, "Get flatness tolerance") .def("get_rgb_stroking_ops", &pdflib::page_item::get_rgb_stroking_ops, pybind11::return_value_policy::reference_internal, "Get RGB stroking color") .def("get_rgb_filling_ops", &pdflib::page_item::get_rgb_filling_ops, pybind11::return_value_policy::reference_internal, "Get RGB filling color"); // PdfImage - bitmap resource with bounding box and image data pybind11::class_>(m, "PdfImage") .def_readonly("x0", &pdflib::page_item::x0) .def_readonly("y0", &pdflib::page_item::y0) .def_readonly("x1", &pdflib::page_item::x1) .def_readonly("y1", &pdflib::page_item::y1) .def_readonly("image_width", &pdflib::page_item::image_width) .def_readonly("image_height", &pdflib::page_item::image_height) .def("get_image_format", &pdflib::page_item::get_image_format, "Get image format hint: 'jpeg', 'jp2', 'jbig2', or 'raw'") .def("get_pil_mode", &pdflib::page_item::get_pil_mode, "Get PIL-compatible mode string: 'L', 'RGB', 'CMYK', or '1'") .def("get_image_as_bytes", [](pdflib::page_item const& self) { auto data = self.get_image_as_bytes(); return pybind11::bytes(reinterpret_cast(data.data()), data.size()); }, "Get image data as bytes (corrected JPEG, raw JP2, or decoded pixels)"); // PdfPageDimension - page geometry and bounding boxes pybind11::class_>(m, "PdfPageDimension") .def("get_angle", &pdflib::page_item::get_angle, "Get page rotation angle in degrees") .def("get_crop_bbox", &pdflib::page_item::get_crop_bbox, "Get crop box as [x0, y0, x1, y1]") .def("get_media_bbox", &pdflib::page_item::get_media_bbox, "Get media box as [x0, y0, x1, y1]"); // PdfWidget - form field widget with bounding box and field info pybind11::class_>(m, "PdfWidget") .def_readonly("x0", &pdflib::page_item::x0) .def_readonly("y0", &pdflib::page_item::y0) .def_readonly("x1", &pdflib::page_item::x1) .def_readonly("y1", &pdflib::page_item::y1) .def_readonly("text", &pdflib::page_item::text) .def_readonly("description", &pdflib::page_item::description) .def_readonly("field_name", &pdflib::page_item::field_name) .def_readonly("field_type", &pdflib::page_item::field_type); // PdfHyperlink - hyperlink annotation with bounding box and URI pybind11::class_>(m, "PdfHyperlink") .def_readonly("x0", &pdflib::page_item::x0) .def_readonly("y0", &pdflib::page_item::y0) .def_readonly("x1", &pdflib::page_item::x1) .def_readonly("y1", &pdflib::page_item::y1) .def_readonly("uri", &pdflib::page_item::uri); // ============= Container Type Bindings ============= // PdfCells - iterable container of PdfCell objects pybind11::class_>(m, "PdfCells") .def("__len__", &pdflib::page_item::size) .def("__getitem__", [](pdflib::page_item& self, size_t i) -> pdflib::page_item& { if (i >= self.size()) { throw pybind11::index_error("index out of range"); } return self[i]; }, pybind11::return_value_policy::reference_internal) .def("__iter__", [](pdflib::page_item& self) { return pybind11::make_iterator(self.begin(), self.end()); }, pybind11::keep_alive<0, 1>()); // PdfShapes - iterable container of PdfShape objects pybind11::class_>(m, "PdfShapes") .def("__len__", &pdflib::page_item::size) .def("__getitem__", [](pdflib::page_item& self, size_t i) -> pdflib::page_item& { if (i >= self.size()) { throw pybind11::index_error("index out of range"); } return self[i]; }, pybind11::return_value_policy::reference_internal) .def("__iter__", [](pdflib::page_item& self) { return pybind11::make_iterator(self.begin(), self.end()); }, pybind11::keep_alive<0, 1>()); // PdfImages - iterable container of PdfImage objects pybind11::class_>(m, "PdfImages") .def("__len__", &pdflib::page_item::size) .def("__getitem__", [](pdflib::page_item& self, size_t i) -> pdflib::page_item& { if (i >= self.size()) { throw pybind11::index_error("index out of range"); } return self[i]; }, pybind11::return_value_policy::reference_internal) .def("__iter__", [](pdflib::page_item& self) { return pybind11::make_iterator(self.begin(), self.end()); }, pybind11::keep_alive<0, 1>()); // PdfWidgets - iterable container of PdfWidget objects pybind11::class_>(m, "PdfWidgets") .def("__len__", &pdflib::page_item::size) .def("__getitem__", [](pdflib::page_item& self, size_t i) -> pdflib::page_item& { if (i >= self.size()) { throw pybind11::index_error("index out of range"); } return self[i]; }, pybind11::return_value_policy::reference_internal) .def("__iter__", [](pdflib::page_item& self) { return pybind11::make_iterator(self.begin(), self.end()); }, pybind11::keep_alive<0, 1>()); // PdfHyperlinks - iterable container of PdfHyperlink objects pybind11::class_>(m, "PdfHyperlinks") .def("__len__", &pdflib::page_item::size) .def("__getitem__", [](pdflib::page_item& self, size_t i) -> pdflib::page_item& { if (i >= self.size()) { throw pybind11::index_error("index out of range"); } return self[i]; }, pybind11::return_value_policy::reference_internal) .def("__iter__", [](pdflib::page_item& self) { return pybind11::make_iterator(self.begin(), self.end()); }, pybind11::keep_alive<0, 1>()); // ============= Page Decoder Binding ============= // PdfPageDecoder - provides typed access to decoded page data pybind11::class_, std::shared_ptr>>(m, "PdfPageDecoder") .def("get_page_number", &pdflib::pdf_decoder::get_page_number, "Get the page number (0-indexed)") .def("get_page_dimension", &pdflib::pdf_decoder::get_page_dimension, pybind11::return_value_policy::reference_internal, "Get page dimension/geometry") .def("get_char_cells", &pdflib::pdf_decoder::get_char_cells, pybind11::return_value_policy::reference_internal, "Get individual character cells") .def("get_word_cells", &pdflib::pdf_decoder::get_word_cells, pybind11::return_value_policy::reference_internal, "Get word cells (aggregated from char cells)") .def("get_line_cells", &pdflib::pdf_decoder::get_line_cells, pybind11::return_value_policy::reference_internal, "Get line cells (aggregated from char cells)") .def("get_page_shapes", &pdflib::pdf_decoder::get_page_shapes, pybind11::return_value_policy::reference_internal, "Get graphic shapes on the page") .def("get_page_images", &pdflib::pdf_decoder::get_page_images, pybind11::return_value_policy::reference_internal, "Get bitmap/image resources on the page") .def("get_page_widgets", &pdflib::pdf_decoder::get_page_widgets, pybind11::return_value_policy::reference_internal, "Get form field widgets on the page") .def("get_page_hyperlinks", &pdflib::pdf_decoder::get_page_hyperlinks, pybind11::return_value_policy::reference_internal, "Get hyperlink annotations on the page") .def("has_word_cells", &pdflib::pdf_decoder::has_word_cells, "Check if word cells have been created") .def("has_line_cells", &pdflib::pdf_decoder::has_line_cells, "Check if line cells have been created") .def("get_timings", [](pdflib::pdf_decoder& self) { // Return as Dict[str, float] (sums) for backward compatibility return self.get_timings().to_sum_map(); }, "Get timing information for page decoding as Dict[str, float]") .def("get_timings_raw", [](pdflib::pdf_decoder& self) { // Return as Dict[str, List[float]] for detailed timing data return self.get_timings().get_raw_data(); }, "Get detailed timing information as Dict[str, List[float]]") .def("get_static_timings", [](pdflib::pdf_decoder& self) { return self.get_timings().get_static_timings(); }, "Get only static (constant) timing keys as Dict[str, float]") .def("get_dynamic_timings", [](pdflib::pdf_decoder& self) { return self.get_timings().get_dynamic_timings(); }, "Get only dynamic timing keys as Dict[str, float]") .def("create_word_cells", &pdflib::pdf_decoder::create_word_cells, pybind11::arg("config"), "Recompute word cells from char cells with the given config") .def("create_line_cells", &pdflib::pdf_decoder::create_line_cells, pybind11::arg("config"), "Recompute line cells from char cells with the given config"); // ============= Timing Keys Constants ============= m.attr("TIMING_KEY_DECODE_PAGE") = pdflib::pdf_timings::KEY_DECODE_PAGE; m.attr("TIMING_KEY_DECODE_DIMENSIONS") = pdflib::pdf_timings::KEY_DECODE_DIMENSIONS; m.attr("TIMING_KEY_DECODE_RESOURCES") = pdflib::pdf_timings::KEY_DECODE_RESOURCES; m.attr("TIMING_KEY_DECODE_GRPHS") = pdflib::pdf_timings::KEY_DECODE_GRPHS; m.attr("TIMING_KEY_DECODE_FONTS") = pdflib::pdf_timings::KEY_DECODE_FONTS; m.attr("TIMING_KEY_DECODE_XOBJECTS") = pdflib::pdf_timings::KEY_DECODE_XOBJECTS; m.attr("TIMING_KEY_DECODE_CONTENTS") = pdflib::pdf_timings::KEY_DECODE_CONTENTS; m.attr("TIMING_KEY_DECODE_ANNOTS") = pdflib::pdf_timings::KEY_DECODE_ANNOTS; m.attr("TIMING_KEY_SANITISE_CONTENTS") = pdflib::pdf_timings::KEY_SANITISE_CONTENTS; m.attr("TIMING_KEY_CREATE_WORD_CELLS") = pdflib::pdf_timings::KEY_CREATE_WORD_CELLS; m.attr("TIMING_KEY_CREATE_LINE_CELLS") = pdflib::pdf_timings::KEY_CREATE_LINE_CELLS; m.attr("TIMING_KEY_DECODE_FONTS_TOTAL") = pdflib::pdf_timings::KEY_DECODE_FONTS_TOTAL; m.attr("TIMING_KEY_DECODE_XOBJECTS_TOTAL") = pdflib::pdf_timings::KEY_DECODE_XOBJECTS_TOTAL; m.attr("TIMING_KEY_DECODE_GRPHS_TOTAL") = pdflib::pdf_timings::KEY_DECODE_GRPHS_TOTAL; // Additional decode_page step keys m.attr("TIMING_KEY_TO_JSON_PAGE") = pdflib::pdf_timings::KEY_TO_JSON_PAGE; m.attr("TIMING_KEY_EXTRACT_ANNOTS_JSON") = pdflib::pdf_timings::KEY_EXTRACT_ANNOTS_JSON; m.attr("TIMING_KEY_ROTATE_CONTENTS") = pdflib::pdf_timings::KEY_ROTATE_CONTENTS; m.attr("TIMING_KEY_SANITIZE_ORIENTATION") = pdflib::pdf_timings::KEY_SANITIZE_ORIENTATION; m.attr("TIMING_KEY_SANITIZE_CELLS") = pdflib::pdf_timings::KEY_SANITIZE_CELLS; m.attr("TIMING_KEY_PROCESS_DOCUMENT_FROM_FILE") = pdflib::pdf_timings::KEY_PROCESS_DOCUMENT_FROM_FILE; m.attr("TIMING_KEY_PROCESS_DOCUMENT_FROM_BYTESIO") = pdflib::pdf_timings::KEY_PROCESS_DOCUMENT_FROM_BYTESIO; m.attr("TIMING_KEY_QPDF_PROCESS") = pdflib::pdf_timings::KEY_QPDF_PROCESS; m.attr("TIMING_KEY_EXTRACT_DOC_ANNOTATIONS") = pdflib::pdf_timings::KEY_EXTRACT_DOC_ANNOTATIONS; m.attr("TIMING_KEY_DECODE_DOCUMENT") = pdflib::pdf_timings::KEY_DECODE_DOCUMENT; m.attr("TIMING_PREFIX_DECODE_FONT") = pdflib::pdf_timings::PREFIX_DECODE_FONT; m.attr("TIMING_PREFIX_DECODE_XOBJECT") = pdflib::pdf_timings::PREFIX_DECODE_XOBJECT; m.attr("TIMING_PREFIX_DECODE_GRPH") = pdflib::pdf_timings::PREFIX_DECODE_GRPH; m.attr("TIMING_PREFIX_DECODING_PAGE") = pdflib::pdf_timings::PREFIX_DECODING_PAGE; m.attr("TIMING_PREFIX_DECODE_PAGE") = pdflib::pdf_timings::PREFIX_DECODE_PAGE; m.def("get_static_timing_keys", &pdflib::pdf_timings::get_static_keys, "Get all static timing keys as Set[str]"); m.def("is_static_timing_key", &pdflib::pdf_timings::is_static_key, pybind11::arg("key"), "Check if a timing key is static (constant)"); m.def("get_decode_page_timing_keys", &pdflib::pdf_timings::get_decode_page_keys, "Get timing keys used in decode_page method (in order, excluding global timer) as List[str]"); // ============= PDF Parser ============= // next generation parser, 10x faster with more finegrained output pybind11::class_(m, "pdf_parser") .def(pybind11::init()) .def(pybind11::init(), pybind11::arg("level"), R"( Construct pdf_parser with logging level. Parameters: level (str): Logging level as a string. One of ['fatal', 'error', 'warning', 'info'])") .def("set_loglevel_with_label", [](docling::docling_parser &self, const std::string &level) -> void { self.set_loglevel_with_label(level); }, pybind11::arg("level"), R"( Set the log level using a string label. Parameters: level (str): Logging level as a string. One of ['fatal', 'error', 'warning', 'info'] )") .def("is_loaded", [](docling::docling_parser &self, const std::string &key) -> bool { return self.is_loaded(key); }, pybind11::arg("key"), R"( Check if a document with the given key is loaded. Parameters: key (str): The unique key of the document to check. Returns: bool: True if the document is loaded, False otherwise.)") .def("list_loaded_keys", [](docling::docling_parser &self) -> std::vector { return self.list_loaded_keys(); }, R"( List the keys of the loaded documents. Returns: List[str]: A list of keys for the currently loaded documents.)") .def("load_document", [](docling::docling_parser &self, const std::string &key, const std::string &filename, std::optional& password ) -> bool { return self.load_document(key, filename, password); }, pybind11::arg("key"), pybind11::arg("filename"), pybind11::arg("password") = pybind11::none(), R"( Load a document by key and filename. Parameters: key (str): The unique key to identify the document. filename (str): The path to the document file to load. password (str, optional): Optional password for password-protected files Returns: bool: True if the document was successfully loaded, False otherwise.)") .def("load_document_from_bytesio", [](docling::docling_parser &self, const std::string &key, pybind11::object bytes_io, std::optional& password ) -> bool { return self.load_document_from_bytesio(key, bytes_io, password); }, pybind11::arg("key"), pybind11::arg("bytes_io"), pybind11::arg("password") = pybind11::none(), R"( Load a document by key from a BytesIO-like object. Parameters: key (str): The unique key to identify the document. bytes_io (Any): A BytesIO-like object containing the document data. password (str, optional): Optional password for password-protected files Returns: bool: True if the document was successfully loaded, False otherwise.)") .def("unload_document", [](docling::docling_parser &self, const std::string &key) -> bool { return self.unload_document(key); }, R"( Unload a document by its unique key. Parameters: key (str): The unique key of the document to unload. Returns: bool: True if the document was successfully unloaded, False otherwise.)") .def("unload_document_page", [](docling::docling_parser &self, const std::string &key, int page) -> bool { return self.unload_document_page(key, page); }, pybind11::arg("key"), pybind11::arg("page"), R"( Unload a single page of the document by its unique key and page_number. Parameters: key (str): The unique key of the document to unload. page (int): The page number of the document to unload. Returns: bool: True if the document was successfully unloaded, False otherwise.)") .def("number_of_pages", [](docling::docling_parser &self, const std::string &key) -> int { return self.number_of_pages(key); }, pybind11::arg("key"), R"( Get the number of pages in the document identified by its unique key. Parameters: key (str): The unique key of the document. Returns: int: The number of pages in the document.)") .def("get_annotations", [](docling::docling_parser &self, const std::string &key) -> nlohmann::json { return self.get_annotations(key); }, pybind11::arg("key"), R"( Retrieve annotations for the document identified by its unique key and return them as JSON. Parameters: key (str): The unique key of the document. Returns: dict: A JSON object containing the annotations for the document.)") .def("get_table_of_contents", [](docling::docling_parser &self, const std::string &key) -> nlohmann::json { return self.get_table_of_contents(key); }, pybind11::arg("key"), R"( Retrieve the table of contents for the document identified by its unique key and return it as JSON. Parameters: key (str): The unique key of the document. Returns: dict: A JSON object representing the table of contents of the document.)") .def("get_meta_xml", [](docling::docling_parser &self, const std::string &key) -> nlohmann::json { return self.get_meta_xml(key); }, pybind11::arg("key"), R"( Retrieve the meta data in string or None. Parameters: key (str): The unique key of the document. Returns: dict: A None or string of the metadata in xml of the document.)") .def("get_page_decoder", [](docling::docling_parser &self, const std::string &key, int page, const pdflib::decode_config &config) -> std::shared_ptr> { return self.get_page_decoder(key, page, config); }, pybind11::arg("key"), pybind11::arg("page"), pybind11::arg("config"), R"( Get a typed page decoder using a DecodePageConfig object. Parameters: key (str): The unique key of the document. page (int): The page number to parse (0-indexed). config (DecodePageConfig): Configuration object for page decoding. Returns: PdfPageDecoder: A typed page decoder object.)"); // ============= Threaded PDF Parser ============= // PageDecodeResult - result of a threaded page decode task pybind11::class_(m, "PageDecodeResult", R"( Result of a threaded page decoding task. Attributes: doc_key (str): The document key this page belongs to. page_number (int): The page number (0-indexed). success (bool): Whether the decoding succeeded. )") .def_readonly("doc_key", &docling::page_decode_result::doc_key) .def_readonly("page_number", &docling::page_decode_result::page_number) .def_readonly("success", &docling::page_decode_result::success) .def("get", [](docling::page_decode_result& self) -> std::pair>, std::unordered_map> { if(!self.success) { throw std::runtime_error("Cannot get result from failed task: " + self.error_message); } auto timings_map = self.page_decoder->get_timings().to_sum_map(); return std::make_pair(self.page_decoder, timings_map); }, R"( Get the page decoder and timing information. Returns: Tuple[PdfPageDecoder, Dict[str, float]]: The page decoder and timing data. Raises: RuntimeError: If the task was not successful.)") .def("error", [](docling::page_decode_result& self) -> std::string { return self.error_message; }, R"( Get the error message if the task failed. Returns: str: The error message.)"); // threaded_pdf_parser - parallel PDF parser with bounded result queue pybind11::class_(m, "threaded_pdf_parser", R"( Threaded PDF parser that processes pages in parallel. Loads multiple documents and decodes their pages using a thread pool. Results are available via a bounded queue to control memory usage. )") .def(pybind11::init(), pybind11::arg("loglevel") = "fatal", pybind11::arg("num_threads") = 4, pybind11::arg("max_concurrent_results") = 32, pybind11::arg("config") = pdflib::decode_config(), R"( Construct a threaded PDF parser. Parameters: loglevel (str): Logging level ('fatal', 'error', 'warning', 'info'). num_threads (int): Number of worker threads. max_concurrent_results (int): Maximum results buffered before workers pause. config (DecodePageConfig): Configuration for page decoding.)") .def("load_document", [](docling::docling_threaded_parser& self, const std::string& key, const std::string& filename, std::optional& password) -> bool { return self.load_document(key, filename, password); }, pybind11::arg("key"), pybind11::arg("filename"), pybind11::arg("password") = pybind11::none(), R"( Load a document by key and filename. Parameters: key (str): The unique key to identify the document. filename (str): The path to the document file to load. password (str, optional): Optional password for password-protected files. Returns: bool: True if the document was successfully loaded.)") .def("load_document_from_bytesio", [](docling::docling_threaded_parser& self, const std::string& key, pybind11::object bytes_io, std::optional& password) -> bool { return self.load_document_from_bytesio(key, bytes_io, password); }, pybind11::arg("key"), pybind11::arg("bytes_io"), pybind11::arg("password") = pybind11::none(), R"( Load a document from a BytesIO-like object. Parameters: key (str): The unique key to identify the document. bytes_io (Any): A BytesIO-like object containing the document data. password (str, optional): Optional password for password-protected files. Returns: bool: True if the document was successfully loaded.)") .def("has_tasks", [](docling::docling_threaded_parser& self) -> bool { return self.has_tasks(); }, R"( Check if there are remaining tasks to consume. On first call, builds the task queue from all loaded documents and starts worker threads. Returns: bool: True if there are remaining results to consume.)") .def("get_task", [](docling::docling_threaded_parser& self) -> docling::page_decode_result { pybind11::gil_scoped_release release; return self.get_task(); }, R"( Get the next completed page decode result. Blocks until a result is available. Releases the GIL while waiting. Returns: PageDecodeResult: The result of a page decoding task.)"); }