//-*-C++-*-

// NOTE(review): this copy of the file appears to have gone through a lossy
// extraction: line breaks were collapsed and every `<...>` span that starts
// with an identifier was removed. The #include targets below and the explicit
// template arguments further down (static_cast, reinterpret_cast, std::vector,
// pybind11::class_, ...) are therefore missing. The code tokens are left
// exactly as found; restore them from version control before building.
#include
#include
#include
#include
#include
#include
#include
#include

// Include parse headers for typed bindings
#include

namespace {

  // Returns the lowercase label used in the exported dicts for a pixel format:
  // "gray", "rgb" or "cmyk"; every other value maps to "unknown".
  const char* pixel_format_name(pdflib::pixel_format fmt)
  {
    switch(fmt) {
    case pdflib::PIXEL_FORMAT_GRAY: return "gray";
    case pdflib::PIXEL_FORMAT_RGB: return "rgb";
    case pdflib::PIXEL_FORMAT_CMYK: return "cmyk";
    default: return "unknown";
    }
  }

  // Returns the value stored under the "type" key of an exported instruction:
  // "size", "text", "widget", "bitmap" or "shape"; anything else is "unknown".
  const char* instruction_name(pdflib::RENDER_INSTRUCTION_NAME name)
  {
    switch(name) {
    case pdflib::SIZE_INSTRUCTION: return "size";
    case pdflib::TEXT_RENDER_INSTRUCTION: return "text";
    case pdflib::TEXT_WIDGET_RENDER_INSTRUCTION: return "widget";
    case pdflib::BITMAP_RENDER_INSTRUCTION: return "bitmap";
    case pdflib::SHAPE_RENDER_INSTRUCTION: return "shape";
    default: return "unknown";
    }
  }

  // Packs four (x, y) corner points into a dict with the keys
  // r_x0, r_y0, ..., r_x3, r_y3 (inserted in that order).
  pybind11::dict make_quad_dict(double x0, double y0,
                                double x1, double y1,
                                double x2, double y2,
                                double x3, double y3)
  {
    pybind11::dict quad;
    quad["r_x0"] = x0;
    quad["r_y0"] = y0;
    quad["r_x1"] = x1;
    quad["r_y1"] = y1;
    quad["r_x2"] = x2;
    quad["r_y2"] = y2;
    quad["r_x3"] = x3;
    quad["r_y3"] = y3;
    return quad;
  }

  // Visitor that turns render instructions into Python dicts.
  //
  // It is handed to `iterate_over_instructions(visitor)` by the
  // "export_render_instructions_json" binding; presumably that call invokes one
  // of the methods below per instruction - confirm against pdflib.
  // Text, widget, bitmap and shape instructions are appended to `instructions`
  // in visit order; the size instruction is stored under
  // root["size_instruction"].
  struct render_instruction_export_visitor
  {
    pybind11::dict root;          // the dict returned to Python
    pybind11::list instructions;  // per-instruction rows, exposed as root["instructions"]

    render_instruction_export_visitor()
    {
      root["instructions"] = instructions;
    }

    // Records the media and crop boxes; a later call overwrites an earlier one.
    void set_size(pdflib::size_instruction& instr)
    {
      pybind11::dict size;
      size["type"] = instruction_name(pdflib::SIZE_INSTRUCTION);
      size["media_bbox"] = instr.media_bbox;
      size["crop_bbox"] = instr.crop_bbox;
      root["size_instruction"] = size;
    }

    // Appends a "text" row: text, font identification, font metrics, baseline
    // origin and the quad of the text instruction.
    void render_text(pdflib::text_instruction& instr)
    {
      pybind11::dict row;
      row["type"] = instruction_name(pdflib::TEXT_RENDER_INSTRUCTION);
      row["text"] = instr.get_text();
      row["font_enc"] = instr.get_font_enc();
      row["font_key"] = instr.get_font_key();
      row["font_name"] = instr.get_font_name();
      row["encoding_name"] = instr.get_encoding_name();
      row["base_font"] = instr.get_base_font();
      row["font_size"] = instr.get_font_size();
      row["font_ascent_norm"] = instr.get_font_ascent_norm();
      row["font_descent_norm"] = instr.get_font_descent_norm();
      row["base_x0"] = instr.get_base_x0();
      row["base_y0"] = instr.get_base_y0();
      row["quad"] = make_quad_dict(
          instr.get_r_x0(), instr.get_r_y0(),
          instr.get_r_x1(), instr.get_r_y1(),
          instr.get_r_x2(), instr.get_r_y2(),
          instr.get_r_x3(), instr.get_r_y3());
      instructions.append(row);
    }

    // Appends a "widget" row: text, bbox as (x0, y0, x1, y1) and the quad.
    void render_widget(pdflib::text_widget_instruction& instr)
    {
      pybind11::dict row;
      row["type"] = instruction_name(pdflib::TEXT_WIDGET_RENDER_INSTRUCTION);
      row["text"] = instr.get_text();
      // NOTE(review): `std::array{...}` may have lost explicit template
      // arguments in extraction; as written it relies on class template
      // argument deduction.
      row["bbox"] = std::array{
          instr.get_x0(), instr.get_y0(),
          instr.get_x1(), instr.get_y1()};
      row["quad"] = make_quad_dict(
          instr.get_r_x0(), instr.get_r_y0(),
          instr.get_r_x1(), instr.get_r_y1(),
          instr.get_r_x2(), instr.get_r_y2(),
          instr.get_r_x3(), instr.get_r_y3());
      instructions.append(row);
    }

    // Appends a "bitmap" row with metadata only (no pixel data).
    // NOTE(review): the seven assignments from "xobject_key" to "quad" are
    // repeated verbatim in bitmap_artifact_export_visitor::render_bitmap;
    // a shared helper would keep the two exports in sync.
    void render_bitmap(pdflib::bitmap_instruction& instr)
    {
      pybind11::dict row;
      row["type"] = instruction_name(pdflib::BITMAP_RENDER_INSTRUCTION);
      row["xobject_key"] = instr.get_key();
      row["shape"] = instr.get_shape();
      row["pixel_format"] = pixel_format_name(instr.get_pixel_format());
      row["image_mask"] = instr.is_image_mask();
      row["has_soft_mask"] = instr.has_alpha_data();
      row["rgb_filling"] = instr.get_rgb_filling();
      row["quad"] = make_quad_dict(
          instr.get_r_x0(), instr.get_r_y0(),
          instr.get_r_x1(), instr.get_r_y1(),
          instr.get_r_x2(), instr.get_r_y2(),
          instr.get_r_x3(), instr.get_r_y3());
      instructions.append(row);
    }

    // Appends a "shape" row: point coordinates, closing/shape type, line width
    // and stroking/filling colours.
    void render_shape(pdflib::shape_instruction& instr)
    {
      pybind11::dict row;
      row["type"] = instruction_name(pdflib::SHAPE_RENDER_INSTRUCTION);
      row["x"] = instr.get_x();
      row["y"] = instr.get_y();
      // NOTE(review): the target type of both static_casts was stripped.
      row["closing_type"] = static_cast(instr.get_closing_type());
      row["shape_type"] = static_cast(instr.get_shape_type());
      row["line_width"] = instr.get_line_width();
      row["rgb_stroking"] = instr.get_rgb_stroking();
      row["rgb_filling"] = instr.get_rgb_filling();
      instructions.append(row);
    }
  };

  // Visitor that exports every bitmap instruction as a dict holding metadata,
  // the raw payload ("raw_data") and an encoded copy ("encoded_data") together
  // with a matching file "extension". All other instruction kinds are ignored.
  struct bitmap_artifact_export_visitor
  {
    pybind11::list artifacts;  // one dict per bitmap, in visit order
    int bitmap_index = 0;      // incremented before use, so exported indices start at 1

    void set_size(pdflib::size_instruction&) {}
    void render_text(pdflib::text_instruction&) {}
    void render_widget(pdflib::text_widget_instruction&) {}
    void render_shape(pdflib::shape_instruction&) {}

    // Builds the artifact dict for one bitmap:
    //  - gray data goes through pdflib::ccitt::encode_debug_png  -> ".png"
    //  - RGB / CMYK data goes through the pdflib JPEG writer      -> ".jpg"
    //  - if nothing was encoded, the raw payload is copied        -> ".bin"
    void render_bitmap(pdflib::bitmap_instruction& instr)
    {
      ++bitmap_index;

      pybind11::dict row;
      row["index"] = bitmap_index;
      row["xobject_key"] = instr.get_key();
      row["shape"] = instr.get_shape();
      row["pixel_format"] = pixel_format_name(instr.get_pixel_format());
      row["image_mask"] = instr.is_image_mask();
      row["has_soft_mask"] = instr.has_alpha_data();
      row["rgb_filling"] = instr.get_rgb_filling();
      row["quad"] = make_quad_dict(
          instr.get_r_x0(), instr.get_r_y0(),
          instr.get_r_x1(), instr.get_r_y1(),
          instr.get_r_x2(), instr.get_r_y2(),
          instr.get_r_x3(), instr.get_r_y3());

      // NOTE(review): element type of `encoded` was stripped in extraction.
      std::vector encoded;
      std::string extension = ".bin";

      auto const& data = instr.get_data();
      auto const& alpha_data = instr.get_alpha_data();

      // The raw payload is always exported; empty bytes when there is none.
      if(data)
        {
          row["raw_data"] = pybind11::bytes(
              reinterpret_cast(data->data()), data->size());
        }
      else
        {
          row["raw_data"] = pybind11::bytes();
        }

      // NOTE(review): `*data` / `data->` are used below under has_data(), not
      // under the `if(data)` check above - presumably has_data() implies a
      // non-null pointer; confirm.
      if(instr.has_data())
        {
          // NOTE(review): shape[0] / shape[1] are read without checking that
          // the shape has at least two entries or that they are positive.
          auto const& shape = instr.get_shape();
          const int height = shape[0];
          const int width = shape[1];

          if(instr.get_pixel_format() == pdflib::PIXEL_FORMAT_GRAY)
            {
              encoded = pdflib::ccitt::encode_debug_png(*data, width, height);
              extension = ".png";
            }
          else if(instr.get_pixel_format() == pdflib::PIXEL_FORMAT_RGB or
                  instr.get_pixel_format() == pdflib::PIXEL_FORMAT_CMYK)
            {
              // NOTE(review): element type of `composited` was stripped; it
              // must match the pointee of `data`, since `export_data` is
              // pointed at either of them.
              std::vector composited;
              auto const* export_data = data.get();

              // RGB with an alpha plane covering at least width*height
              // samples: blend every channel against white (255) before
              // encoding. CMYK data is passed through unchanged.
              // NOTE(review): only the alpha plane's size is validated here;
              // data->at(...) below is reached without a matching check on
              // the size of `data`.
              if(instr.get_pixel_format() == pdflib::PIXEL_FORMAT_RGB and
                 instr.has_alpha_data() and
                 alpha_data->size() >= static_cast(width) * height)
                {
                  composited.resize(static_cast(width) * height * 3);

                  // NOTE(review): `width * height` is evaluated in `int` here,
                  // whereas the size computations above go through a cast
                  // first; for large images the int product can overflow.
                  for(int i = 0; i < width * height; ++i)
                    {
                      const uint8_t alpha = alpha_data->at(i);
                      for(int c = 0; c < 3; ++c)
                        {
                          const uint8_t src = data->at(static_cast(i) * 3 + c);
                          // out = (src*alpha + 255*(255-alpha)) / 255
                          composited[static_cast(i) * 3 + c] =
                            static_cast((static_cast(src) * alpha + 255u * (255u - alpha)) / 255u);
                        }
                    }
                  export_data = &composited;
                }

              pdflib::jpeg::jpeg_parameters params;
              params.width = width;
              params.height = height;
              params.bits_per_component = 8;
              params.color_space = (instr.get_pixel_format() == pdflib::PIXEL_FORMAT_RGB)
                ? pdflib::jpeg::ColorSpace::RGB
                : pdflib::jpeg::ColorSpace::CMYK;

              // NOTE(review): the reinterpret_cast target type was stripped
              // and cannot be inferred without the JPEG writer's signature.
              encoded = pdflib::jpeg::write_jpeg_from_raw_pixels_to_memory(
                  reinterpret_cast(export_data->data()), export_data->size(), params);
              extension = ".jpg";
            }

          // No encoder ran (other pixel format) or it returned nothing:
          // fall back to a copy of the raw payload.
          if(encoded.empty())
            {
              encoded.assign(data->begin(), data->end());
              extension = ".bin";
            }
        }

      row["extension"] = extension;
      row["encoded_data"] = pybind11::bytes(
          reinterpret_cast(encoded.data()), encoded.size());

      artifacts.append(row);
    }
  };

}

// Python extension module `pdf_parsers`.
PYBIND11_MODULE(pdf_parsers, m) {

  // ============= Decode Page Config =============

  // NOTE(review): the template argument of class_ was stripped; the member
  // pointers registered on this class all refer to pdflib::decode_config.
  pybind11::class_(m, "DecodePageConfig", R"( Configuration parameters for page decoding. Attributes: page_boundary (str): The page boundary specification [choices: crop_box, media_box]. do_sanitization (bool): Sanitize the chars into lines [default=true]. keep_char_cells (bool): Keep all the individual char cells [default=true]. keep_shapes (bool): Keep all the graphic shapes [default=true]. keep_bitmaps (bool): Keep all the bitmap resources [default=true]. max_num_lines (int): Maximum number of lines to keep (-1 means no cap) [default=-1]. max_num_bitmaps (int): Maximum number of bitmaps to keep (-1 means no cap) [default=-1]. keep_glyphs (bool): If true, keep GLYPH<...> fallback strings in output; if false, replace them with a space [default=false]. keep_qpdf_warnings (bool): If true, QPDF warnings are emitted; if false, they are suppressed [default=false]. 
)")
    .def(pybind11::init<>())
    .def_readwrite("page_boundary", &pdflib::decode_config::page_boundary)
    .def_readwrite("do_sanitization", &pdflib::decode_config::do_sanitization)
    .def_readwrite("keep_char_cells", &pdflib::decode_config::keep_char_cells)
    .def_readwrite("keep_shapes", &pdflib::decode_config::keep_shapes)
    .def_readwrite("keep_bitmaps", &pdflib::decode_config::keep_bitmaps)
    .def_readwrite("max_num_lines", &pdflib::decode_config::max_num_lines)
    .def_readwrite("max_num_bitmaps", &pdflib::decode_config::max_num_bitmaps)
    .def_readwrite("create_word_cells", &pdflib::decode_config::create_word_cells)
    .def_readwrite("create_line_cells", &pdflib::decode_config::create_line_cells)
    .def_readwrite("enforce_same_font", &pdflib::decode_config::enforce_same_font)
    .def_readwrite("horizontal_cell_tolerance", &pdflib::decode_config::horizontal_cell_tolerance)
    .def_readwrite("word_space_width_factor_for_merge", &pdflib::decode_config::word_space_width_factor_for_merge)
    .def_readwrite("line_space_width_factor_for_merge", &pdflib::decode_config::line_space_width_factor_for_merge)
    .def_readwrite("line_space_width_factor_for_merge_with_space", &pdflib::decode_config::line_space_width_factor_for_merge_with_space)
    .def_readwrite("do_thread_safe", &pdflib::decode_config::do_thread_safe)
    .def_readwrite("keep_glyphs", &pdflib::decode_config::keep_glyphs)
    .def_readwrite("keep_qpdf_warnings", &pdflib::decode_config::keep_qpdf_warnings);
  // NOTE(review): the DecodePageConfig docstring does not describe
  // create_word_cells, create_line_cells, enforce_same_font,
  // horizontal_cell_tolerance, the three *_width_factor_for_merge* fields or
  // do_thread_safe, although all of them are bound above.

  // NOTE(review): this copy of the file lost every `<...>` span that starts
  // with an identifier. In the bindings below, `pybind11::class_>(...)`,
  // `pdflib::page_item::`, `pdflib::pdf_decoder::`, `std::optional&`,
  // `std::vector`, `std::shared_ptr>` and `pybind11::init()` are all missing
  // their template arguments. Tokens are kept exactly as found.

  // ============= Typed Resource Bindings (for zero-copy access) =============

  // PdfCell - individual text cell with bounding box and text content
  // Read-only fields: axis-aligned box (x0..y1), quad corners (r_x0..r_y3),
  // text and font/encoding information.
  pybind11::class_>(m, "PdfCell")
    .def_readonly("x0", &pdflib::page_item::x0)
    .def_readonly("y0", &pdflib::page_item::y0)
    .def_readonly("x1", &pdflib::page_item::x1)
    .def_readonly("y1", &pdflib::page_item::y1)
    .def_readonly("r_x0", &pdflib::page_item::r_x0)
    .def_readonly("r_y0", &pdflib::page_item::r_y0)
    .def_readonly("r_x1", &pdflib::page_item::r_x1)
    .def_readonly("r_y1", &pdflib::page_item::r_y1)
    .def_readonly("r_x2", &pdflib::page_item::r_x2)
    .def_readonly("r_y2", &pdflib::page_item::r_y2)
    .def_readonly("r_x3", &pdflib::page_item::r_x3)
    .def_readonly("r_y3", &pdflib::page_item::r_y3)
    .def_readonly("text", &pdflib::page_item::text)
    .def_readonly("rendering_mode", &pdflib::page_item::rendering_mode)
    .def_readonly("space_width", &pdflib::page_item::space_width)
    .def_readonly("enc_name", &pdflib::page_item::enc_name)
    .def_readonly("font_enc", &pdflib::page_item::font_enc)
    .def_readonly("font_key", &pdflib::page_item::font_key)
    .def_readonly("font_name", &pdflib::page_item::font_name)
    .def_readonly("widget", &pdflib::page_item::widget)
    .def_readonly("left_to_right", &pdflib::page_item::left_to_right);

  // PdfShape - graphic shape with coordinates
  // Getters bound with reference_internal tie the returned object's lifetime
  // to the PdfShape it was obtained from.
  pybind11::class_>(m, "PdfShape")
    .def("get_x", &pdflib::page_item::get_x,
         pybind11::return_value_policy::reference_internal,
         "Get x coordinates of shape points")
    .def("get_y", &pdflib::page_item::get_y,
         pybind11::return_value_policy::reference_internal,
         "Get y coordinates of shape points")
    .def("get_i", &pdflib::page_item::get_i,
         pybind11::return_value_policy::reference_internal,
         "Get segment indices")
    .def("__len__", &pdflib::page_item::size)
    .def("get_has_graphics_state", &pdflib::page_item::get_has_graphics_state,
         "Check if graphics state has been set")
    .def("get_line_width", &pdflib::page_item::get_line_width,
         "Get line width")
    .def("get_miter_limit", &pdflib::page_item::get_miter_limit,
         "Get miter limit")
    .def("get_line_cap", &pdflib::page_item::get_line_cap,
         "Get line cap style")
    .def("get_line_join", &pdflib::page_item::get_line_join,
         "Get line join style")
    .def("get_dash_phase", &pdflib::page_item::get_dash_phase,
         "Get dash phase")
    .def("get_dash_array", &pdflib::page_item::get_dash_array,
         pybind11::return_value_policy::reference_internal,
         "Get dash array")
    .def("get_flatness", &pdflib::page_item::get_flatness,
         "Get flatness tolerance")
    .def("get_rgb_stroking_ops", &pdflib::page_item::get_rgb_stroking_ops,
         pybind11::return_value_policy::reference_internal,
         "Get RGB stroking color")
    .def("get_rgb_filling_ops", &pdflib::page_item::get_rgb_filling_ops,
         pybind11::return_value_policy::reference_internal,
         "Get RGB filling color");

  // PdfImage - bitmap resource with bounding box and image data
  pybind11::class_>(m, "PdfImage")
    .def_readonly("x0", &pdflib::page_item::x0)
    .def_readonly("y0", &pdflib::page_item::y0)
    .def_readonly("x1", &pdflib::page_item::x1)
    .def_readonly("y1", &pdflib::page_item::y1)
    .def_readonly("image_width", &pdflib::page_item::image_width)
    .def_readonly("image_height", &pdflib::page_item::image_height)
    .def("get_image_format", &pdflib::page_item::get_image_format,
         "Get image format hint: 'jpeg', 'jp2', 'jbig2', or 'raw'")
    .def("get_pil_mode", &pdflib::page_item::get_pil_mode,
         "Get PIL-compatible mode string: 'L', 'RGB', 'CMYK', or '1'")
    // Copies the buffer returned by get_image_as_bytes() into a Python bytes
    // object (so this accessor is not zero-copy).
    .def("get_image_as_bytes", [](pdflib::page_item const& self) {
        auto data = self.get_image_as_bytes();
        return pybind11::bytes(reinterpret_cast(data.data()), data.size());
      },
      "Get image data as bytes (corrected JPEG, raw JP2, or decoded pixels)");

  // PdfPageDimension - page geometry and bounding boxes
  pybind11::class_>(m, "PdfPageDimension")
    .def("get_angle", &pdflib::page_item::get_angle,
         "Get page rotation angle in degrees")
    .def("get_crop_bbox", &pdflib::page_item::get_crop_bbox,
         "Get crop box as [x0, y0, x1, y1]")
    .def("get_media_bbox", &pdflib::page_item::get_media_bbox,
         "Get media box as [x0, y0, x1, y1]");

  // PdfWidget - form field widget with bounding box and field info
  pybind11::class_>(m, "PdfWidget")
    .def_readonly("x0", &pdflib::page_item::x0)
    .def_readonly("y0", &pdflib::page_item::y0)
    .def_readonly("x1", &pdflib::page_item::x1)
    .def_readonly("y1", &pdflib::page_item::y1)
    .def_readonly("text", &pdflib::page_item::text)
    .def_readonly("description", &pdflib::page_item::description)
    .def_readonly("field_name", &pdflib::page_item::field_name)
    .def_readonly("field_type", &pdflib::page_item::field_type);

  // PdfHyperlink - hyperlink annotation with bounding box and URI
  pybind11::class_>(m, "PdfHyperlink")
    .def_readonly("x0", &pdflib::page_item::x0)
    .def_readonly("y0", &pdflib::page_item::y0)
    .def_readonly("x1", &pdflib::page_item::x1)
    .def_readonly("y1", &pdflib::page_item::y1)
    .def_readonly("uri", &pdflib::page_item::uri);

  // ============= Container Type Bindings =============
  //
  // All five containers share the same protocol:
  //   __len__      element count
  //   __getitem__  bounds-checked access; raises IndexError when i >= size().
  //                NOTE(review): the index parameter is size_t, so negative
  //                Python indices are not supported by these bindings.
  //   __iter__     iterator over begin()/end(); keep_alive<0, 1> keeps the
  //                container alive for as long as the iterator exists.

  // PdfCells - iterable container of PdfCell objects
  pybind11::class_>(m, "PdfCells")
    .def("__len__", &pdflib::page_item::size)
    .def("__getitem__", [](pdflib::page_item& self, size_t i) -> pdflib::page_item& {
        if (i >= self.size()) {
          throw pybind11::index_error("index out of range");
        }
        return self[i];
      }, pybind11::return_value_policy::reference_internal)
    .def("__iter__", [](pdflib::page_item& self) {
        return pybind11::make_iterator(self.begin(), self.end());
      }, pybind11::keep_alive<0, 1>());

  // PdfShapes - iterable container of PdfShape objects
  pybind11::class_>(m, "PdfShapes")
    .def("__len__", &pdflib::page_item::size)
    .def("__getitem__", [](pdflib::page_item& self, size_t i) -> pdflib::page_item& {
        if (i >= self.size()) {
          throw pybind11::index_error("index out of range");
        }
        return self[i];
      }, pybind11::return_value_policy::reference_internal)
    .def("__iter__", [](pdflib::page_item& self) {
        return pybind11::make_iterator(self.begin(), self.end());
      }, pybind11::keep_alive<0, 1>());

  // PdfImages - iterable container of PdfImage objects
  pybind11::class_>(m, "PdfImages")
    .def("__len__", &pdflib::page_item::size)
    .def("__getitem__", [](pdflib::page_item& self, size_t i) -> pdflib::page_item& {
        if (i >= self.size()) {
          throw pybind11::index_error("index out of range");
        }
        return self[i];
      }, pybind11::return_value_policy::reference_internal)
    .def("__iter__", [](pdflib::page_item& self) {
        return pybind11::make_iterator(self.begin(), self.end());
      }, pybind11::keep_alive<0, 1>());

  // PdfWidgets - iterable container of PdfWidget objects
  pybind11::class_>(m, "PdfWidgets")
    .def("__len__", &pdflib::page_item::size)
    .def("__getitem__", [](pdflib::page_item& self, size_t i) -> pdflib::page_item& {
        if (i >= self.size()) {
          throw pybind11::index_error("index out of range");
        }
        return self[i];
      }, pybind11::return_value_policy::reference_internal)
    .def("__iter__", [](pdflib::page_item& self) {
        return pybind11::make_iterator(self.begin(), self.end());
      }, pybind11::keep_alive<0, 1>());

  // PdfHyperlinks - iterable container of PdfHyperlink objects
  pybind11::class_>(m, "PdfHyperlinks")
    .def("__len__", &pdflib::page_item::size)
    .def("__getitem__", [](pdflib::page_item& self, size_t i) -> pdflib::page_item& {
        if (i >= self.size()) {
          throw pybind11::index_error("index out of range");
        }
        return self[i];
      }, pybind11::return_value_policy::reference_internal)
    .def("__iter__", [](pdflib::page_item& self) {
        return pybind11::make_iterator(self.begin(), self.end());
      }, pybind11::keep_alive<0, 1>());

  // ============= Page Decoder Binding =============

  // PdfPageDecoder - provides typed access to decoded page data
  // Accessors returning page content use reference_internal, so the returned
  // Python objects keep the decoder alive.
  pybind11::class_, std::shared_ptr>>(m, "PdfPageDecoder")
    .def("get_page_number", &pdflib::pdf_decoder::get_page_number,
         "Get the page number (0-indexed)")
    .def("get_page_dimension", &pdflib::pdf_decoder::get_page_dimension,
         pybind11::return_value_policy::reference_internal,
         "Get page dimension/geometry")
    .def("get_char_cells", &pdflib::pdf_decoder::get_char_cells,
         pybind11::return_value_policy::reference_internal,
         "Get individual character cells")
    .def("get_word_cells", &pdflib::pdf_decoder::get_word_cells,
         pybind11::return_value_policy::reference_internal,
         "Get word cells (aggregated from char cells)")
    .def("get_line_cells", &pdflib::pdf_decoder::get_line_cells,
         pybind11::return_value_policy::reference_internal,
         "Get line cells (aggregated from char cells)")
    .def("get_page_shapes", &pdflib::pdf_decoder::get_page_shapes,
         pybind11::return_value_policy::reference_internal,
         "Get graphic shapes on the page")
    .def("get_page_images", &pdflib::pdf_decoder::get_page_images,
         pybind11::return_value_policy::reference_internal,
         "Get bitmap/image resources on the page")
    .def("get_page_widgets", &pdflib::pdf_decoder::get_page_widgets,
         pybind11::return_value_policy::reference_internal,
         "Get form field widgets on the page")
    .def("get_page_hyperlinks", &pdflib::pdf_decoder::get_page_hyperlinks,
         pybind11::return_value_policy::reference_internal,
         "Get hyperlink annotations on the page")
    .def("has_word_cells", &pdflib::pdf_decoder::has_word_cells,
         "Check if word cells have been created")
    .def("has_line_cells", &pdflib::pdf_decoder::has_line_cells,
         "Check if line cells have been created")
    .def("get_timings", [](pdflib::pdf_decoder& self) {
        // Return as Dict[str, float] (sums) for backward compatibility
        return self.get_timings().to_sum_map();
      },
      "Get timing information for page decoding as Dict[str, float]")
    .def("get_timings_raw", [](pdflib::pdf_decoder& self) {
        // Return as Dict[str, List[float]] for detailed timing data
        return self.get_timings().get_raw_data();
      },
      "Get detailed timing information as Dict[str, List[float]]")
    .def("get_static_timings", [](pdflib::pdf_decoder& self) {
        return self.get_timings().get_static_timings();
      },
      "Get only static (constant) timing keys as Dict[str, float]")
    .def("get_dynamic_timings", [](pdflib::pdf_decoder& self) {
        return self.get_timings().get_dynamic_timings();
      },
      "Get only dynamic timing keys as Dict[str, float]")
    .def("create_word_cells", &pdflib::pdf_decoder::create_word_cells,
         pybind11::arg("config"),
         "Recompute word cells from char cells with the given config")
    .def("create_line_cells", &pdflib::pdf_decoder::create_line_cells,
         pybind11::arg("config"),
         "Recompute line cells from char cells with the given config")
    // Runs render_instruction_export_visitor over the decoder's instructions
    // and returns the dict it accumulated.
    .def("export_render_instructions_json", [](pdflib::pdf_decoder& self) -> pybind11::dict {
        render_instruction_export_visitor visitor;
        self.get_instructions().iterate_over_instructions(visitor);
        return visitor.root;
      },
      "Export render instructions in deterministic decode order")
    // Runs bitmap_artifact_export_visitor over the decoder's instructions and
    // returns its list of per-bitmap dicts.
    .def("export_bitmap_artifacts", [](pdflib::pdf_decoder& self) -> pybind11::list {
        bitmap_artifact_export_visitor visitor;
        self.get_instructions().iterate_over_instructions(visitor);
        return visitor.artifacts;
      },
      "Export bitmap artifacts as inspectable image bytes plus raw payload bytes");

  // ============= Timing Keys Constants =============
  // Module-level attributes mirroring the key/prefix constants of
  // pdflib::pdf_timings.
  m.attr("TIMING_KEY_DECODE_PAGE") = pdflib::pdf_timings::KEY_DECODE_PAGE;
  m.attr("TIMING_KEY_DECODE_DIMENSIONS") = pdflib::pdf_timings::KEY_DECODE_DIMENSIONS;
  m.attr("TIMING_KEY_DECODE_RESOURCES") = pdflib::pdf_timings::KEY_DECODE_RESOURCES;
  m.attr("TIMING_KEY_DECODE_GRPHS") = pdflib::pdf_timings::KEY_DECODE_GRPHS;
  m.attr("TIMING_KEY_DECODE_FONTS") = pdflib::pdf_timings::KEY_DECODE_FONTS;
  m.attr("TIMING_KEY_DECODE_XOBJECTS") = pdflib::pdf_timings::KEY_DECODE_XOBJECTS;
  m.attr("TIMING_KEY_DECODE_CONTENTS") = pdflib::pdf_timings::KEY_DECODE_CONTENTS;
  m.attr("TIMING_KEY_DECODE_ANNOTS") = pdflib::pdf_timings::KEY_DECODE_ANNOTS;
  m.attr("TIMING_KEY_SANITISE_CONTENTS") = pdflib::pdf_timings::KEY_SANITISE_CONTENTS;
  m.attr("TIMING_KEY_CREATE_WORD_CELLS") = pdflib::pdf_timings::KEY_CREATE_WORD_CELLS;
  m.attr("TIMING_KEY_CREATE_LINE_CELLS") = pdflib::pdf_timings::KEY_CREATE_LINE_CELLS;
  m.attr("TIMING_KEY_DECODE_FONTS_TOTAL") = pdflib::pdf_timings::KEY_DECODE_FONTS_TOTAL;
  m.attr("TIMING_KEY_DECODE_XOBJECTS_TOTAL") = pdflib::pdf_timings::KEY_DECODE_XOBJECTS_TOTAL;
  m.attr("TIMING_KEY_DECODE_GRPHS_TOTAL") = pdflib::pdf_timings::KEY_DECODE_GRPHS_TOTAL;

  // Additional decode_page step keys
  m.attr("TIMING_KEY_TO_JSON_PAGE") = pdflib::pdf_timings::KEY_TO_JSON_PAGE;
  m.attr("TIMING_KEY_EXTRACT_ANNOTS_JSON") = pdflib::pdf_timings::KEY_EXTRACT_ANNOTS_JSON;
  m.attr("TIMING_KEY_ROTATE_CONTENTS") = pdflib::pdf_timings::KEY_ROTATE_CONTENTS;
  m.attr("TIMING_KEY_SANITIZE_ORIENTATION") = pdflib::pdf_timings::KEY_SANITIZE_ORIENTATION;
  m.attr("TIMING_KEY_SANITIZE_CELLS") = pdflib::pdf_timings::KEY_SANITIZE_CELLS;
  m.attr("TIMING_KEY_PROCESS_DOCUMENT_FROM_FILE") = pdflib::pdf_timings::KEY_PROCESS_DOCUMENT_FROM_FILE;
  m.attr("TIMING_KEY_PROCESS_DOCUMENT_FROM_BYTESIO") = pdflib::pdf_timings::KEY_PROCESS_DOCUMENT_FROM_BYTESIO;
  m.attr("TIMING_KEY_QPDF_PROCESS") = pdflib::pdf_timings::KEY_QPDF_PROCESS;
  m.attr("TIMING_KEY_EXTRACT_DOC_ANNOTATIONS") = pdflib::pdf_timings::KEY_EXTRACT_DOC_ANNOTATIONS;
  m.attr("TIMING_KEY_DECODE_DOCUMENT") = pdflib::pdf_timings::KEY_DECODE_DOCUMENT;
  m.attr("TIMING_PREFIX_DECODE_FONT") = pdflib::pdf_timings::PREFIX_DECODE_FONT;
  m.attr("TIMING_PREFIX_DECODE_XOBJECT") = pdflib::pdf_timings::PREFIX_DECODE_XOBJECT;
  m.attr("TIMING_PREFIX_DECODE_GRPH") = pdflib::pdf_timings::PREFIX_DECODE_GRPH;
  m.attr("TIMING_PREFIX_DECODING_PAGE") = pdflib::pdf_timings::PREFIX_DECODING_PAGE;
  m.attr("TIMING_PREFIX_DECODE_PAGE") = pdflib::pdf_timings::PREFIX_DECODE_PAGE;

  m.def("get_static_timing_keys", &pdflib::pdf_timings::get_static_keys,
        "Get all static timing keys as Set[str]");
  m.def("is_static_timing_key", &pdflib::pdf_timings::is_static_key,
        pybind11::arg("key"),
        "Check if a timing key is static (constant)");
  m.def("get_decode_page_timing_keys", &pdflib::pdf_timings::get_decode_page_keys,
        "Get timing keys used in decode_page method (in order, excluding global timer) as List[str]");

  // ============= PDF Parser =============

  // next generation parser, 10x faster with more fine-grained output
  // Each method below is a thin lambda forwarding to the docling_parser member
  // of the same name.
  pybind11::class_(m, "pdf_parser")
    .def(pybind11::init())
    .def(pybind11::init(),
         pybind11::arg("level"),
         R"( Construct pdf_parser with logging level. Parameters: level (str): Logging level as a string. One of ['fatal', 'error', 'warning', 'info'])")
    .def("set_loglevel_with_label",
         [](docling::docling_parser &self, const std::string &level) -> void {
           self.set_loglevel_with_label(level);
         },
         pybind11::arg("level"),
         R"( Set the log level using a string label. Parameters: level (str): Logging level as a string. 
One of ['fatal', 'error', 'warning', 'info'] )")
    .def("is_loaded",
         [](docling::docling_parser &self, const std::string &key) -> bool {
           return self.is_loaded(key);
         },
         pybind11::arg("key"),
         R"( Check if a document with the given key is loaded. Parameters: key (str): The unique key of the document to check. Returns: bool: True if the document is loaded, False otherwise.)")
    .def("list_loaded_keys",
         [](docling::docling_parser &self) -> std::vector {
           return self.list_loaded_keys();
         },
         R"( List the keys of the loaded documents. Returns: List[str]: A list of keys for the currently loaded documents.)")
    // `password` defaults to None on the Python side.
    .def("load_document",
         [](docling::docling_parser &self,
            const std::string &key,
            const std::string &filename,
            std::optional& password ) -> bool {
           return self.load_document(key, filename, password);
         },
         pybind11::arg("key"),
         pybind11::arg("filename"),
         pybind11::arg("password") = pybind11::none(),
         R"( Load a document by key and filename. Parameters: key (str): The unique key to identify the document. filename (str): The path to the document file to load. password (str, optional): Optional password for password-protected files Returns: bool: True if the document was successfully loaded, False otherwise.)")
    .def("load_document_from_bytesio",
         [](docling::docling_parser &self,
            const std::string &key,
            pybind11::object bytes_io,
            std::optional& password ) -> bool {
           return self.load_document_from_bytesio(key, bytes_io, password);
         },
         pybind11::arg("key"),
         pybind11::arg("bytes_io"),
         pybind11::arg("password") = pybind11::none(),
         R"( Load a document by key from a BytesIO-like object. Parameters: key (str): The unique key to identify the document. bytes_io (Any): A BytesIO-like object containing the document data. 
password (str, optional): Optional password for password-protected files Returns: bool: True if the document was successfully loaded, False otherwise.)")
    // NOTE(review): unlike the sibling bindings, this one passes no
    // pybind11::arg("key"), so `key` cannot be given by keyword from Python.
    .def("unload_document",
         [](docling::docling_parser &self, const std::string &key) -> bool {
           return self.unload_document(key);
         },
         R"( Unload a document by its unique key. Parameters: key (str): The unique key of the document to unload. Returns: bool: True if the document was successfully unloaded, False otherwise.)")
    .def("unload_document_page",
         [](docling::docling_parser &self, const std::string &key, int page) -> bool {
           return self.unload_document_page(key, page);
         },
         pybind11::arg("key"),
         pybind11::arg("page"),
         R"( Unload a single page of the document by its unique key and page_number. Parameters: key (str): The unique key of the document to unload. page (int): The page number of the document to unload. Returns: bool: True if the document was successfully unloaded, False otherwise.)")
    .def("number_of_pages",
         [](docling::docling_parser &self, const std::string &key) -> int {
           return self.number_of_pages(key);
         },
         pybind11::arg("key"),
         R"( Get the number of pages in the document identified by its unique key. Parameters: key (str): The unique key of the document. Returns: int: The number of pages in the document.)")
    .def("get_annotations",
         [](docling::docling_parser &self, const std::string &key) -> nlohmann::json {
           return self.get_annotations(key);
         },
         pybind11::arg("key"),
         R"( Retrieve annotations for the document identified by its unique key and return them as JSON. Parameters: key (str): The unique key of the document. Returns: dict: A JSON object containing the annotations for the document.)")
    .def("get_table_of_contents",
         [](docling::docling_parser &self, const std::string &key) -> nlohmann::json {
           return self.get_table_of_contents(key);
         },
         pybind11::arg("key"),
         R"( Retrieve the table of contents for the document identified by its unique key and return it as JSON. 
Parameters: key (str): The unique key of the document. Returns: dict: A JSON object representing the table of contents of the document.)")
    // NOTE(review): the docstring below says "Returns: dict" while describing
    // the value as a string or None; the wording should be reconciled.
    .def("get_meta_xml",
         [](docling::docling_parser &self, const std::string &key) -> nlohmann::json {
           return self.get_meta_xml(key);
         },
         pybind11::arg("key"),
         R"( Retrieve the meta data in string or None. Parameters: key (str): The unique key of the document. Returns: dict: A None or string of the metadata in xml of the document.)")
    .def("get_page_decoder",
         [](docling::docling_parser &self,
            const std::string &key,
            int page,
            const pdflib::decode_config &config) -> std::shared_ptr> {
           return self.get_page_decoder(key, page, config);
         },
         pybind11::arg("key"),
         pybind11::arg("page"),
         pybind11::arg("config"),
         R"( Get a typed page decoder using a DecodePageConfig object. Parameters: key (str): The unique key of the document. page (int): The page number to parse (0-indexed). config (DecodePageConfig): Configuration object for page decoding. Returns: PdfPageDecoder: A typed page decoder object.)");

  // ============= Threaded PDF Parser =============

  // PageDecodeResult - result of a threaded page decode task
  pybind11::class_(m, "PageDecodeResult", R"( Result of a threaded page decoding task. Attributes: doc_key (str): The document key this page belongs to. page_number (int): The page number (0-indexed). success (bool): Whether the decoding succeeded. )")
    .def_readonly("doc_key", &docling::page_decode_result::doc_key)
    .def_readonly("page_number", &docling::page_decode_result::page_number)
    .def_readonly("success", &docling::page_decode_result::success)
    // Returns (page_decoder, summed timings); throws std::runtime_error
    // carrying error_message when `success` is false.
    .def("get",
         [](docling::page_decode_result& self) -> std::pair>, std::unordered_map> {
           if(!self.success)
             {
               throw std::runtime_error("Cannot get result from failed task: " + self.error_message);
             }
           auto timings_map = self.page_decoder->get_timings().to_sum_map();
           return std::make_pair(self.page_decoder, timings_map);
         },
         R"( Get the page decoder and timing information. 
Returns: Tuple[PdfPageDecoder, Dict[str, float]]: The page decoder and timing data. Raises: RuntimeError: If the task was not successful.)")
    .def("error",
         [](docling::page_decode_result& self) -> std::string {
           return self.error_message;
         },
         R"( Get the error message if the task failed. Returns: str: The error message.)");

  // threaded_pdf_parser - parallel PDF parser with bounded result queue
  pybind11::class_(m, "threaded_pdf_parser", R"( Threaded PDF parser that processes pages in parallel. Loads multiple documents and decodes their pages using a thread pool. Results are available via a bounded queue to control memory usage. )")
    .def(pybind11::init(),
         pybind11::arg("loglevel") = "fatal",
         pybind11::arg("num_threads") = 4,
         pybind11::arg("max_concurrent_results") = 32,
         pybind11::arg("config") = pdflib::decode_config(),
         R"( Construct a threaded PDF parser. Parameters: loglevel (str): Logging level ('fatal', 'error', 'warning', 'info'). num_threads (int): Number of worker threads. max_concurrent_results (int): Maximum results buffered before workers pause. config (DecodePageConfig): Configuration for page decoding.)")
    .def("load_document",
         [](docling::docling_threaded_parser& self,
            const std::string& key,
            const std::string& filename,
            std::optional& password) -> bool {
           return self.load_document(key, filename, password);
         },
         pybind11::arg("key"),
         pybind11::arg("filename"),
         pybind11::arg("password") = pybind11::none(),
         R"( Load a document by key and filename. Parameters: key (str): The unique key to identify the document. filename (str): The path to the document file to load. password (str, optional): Optional password for password-protected files. 
Returns: bool: True if the document was successfully loaded.)")
    .def("load_document_from_bytesio",
         [](docling::docling_threaded_parser& self,
            const std::string& key,
            pybind11::object bytes_io,
            std::optional& password) -> bool {
           return self.load_document_from_bytesio(key, bytes_io, password);
         },
         pybind11::arg("key"),
         pybind11::arg("bytes_io"),
         pybind11::arg("password") = pybind11::none(),
         R"( Load a document from a BytesIO-like object. Parameters: key (str): The unique key to identify the document. bytes_io (Any): A BytesIO-like object containing the document data. password (str, optional): Optional password for password-protected files. Returns: bool: True if the document was successfully loaded.)")
    .def("has_tasks",
         [](docling::docling_threaded_parser& self) -> bool {
           return self.has_tasks();
         },
         R"( Check if there are remaining tasks to consume. On first call, builds the task queue from all loaded documents and starts worker threads. Returns: bool: True if there are remaining results to consume.)")
    // The GIL is released for the duration of the call to get_task().
    .def("get_task",
         [](docling::docling_threaded_parser& self) -> docling::page_decode_result {
           pybind11::gil_scoped_release release;
           return self.get_task();
         },
         R"( Get the next completed page decode result. Blocks until a result is available. Releases the GIL while waiting. Returns: PageDecodeResult: The result of a page decoding task.)");

  // ============= Threaded PDF Renderer =============

  // RenderConfig - configuration for the renderer
  pybind11::class_(m, "RenderConfig", R"( Configuration parameters for page rendering. Attributes: render_text (bool): Render glyph outlines for text cells [default=true]. draw_text_bbox (bool): Draw bounding quad for each text cell [default=false]. resolve_fonts (bool): Resolve PDF font names to system fonts [default=true]. font_similarity_cutoff (float): Minimum Jaccard similarity for fuzzy font matching; candidates below this threshold fall back to the default font [default=0.25]. 
canvas_width (int): Target canvas width in pixels; -1 means use PDF page size [default=-1]. canvas_height (int): Target canvas height in pixels; -1 means use PDF page size [default=-1]. )")
    .def(pybind11::init<>())
    .def_readwrite("render_text", &pdflib::render_config::render_text)
    .def_readwrite("draw_text_bbox", &pdflib::render_config::draw_text_bbox)
    .def_readwrite("resolve_fonts", &pdflib::render_config::resolve_fonts)
    .def_readwrite("font_similarity_cutoff", &pdflib::render_config::font_similarity_cutoff)
    .def_readwrite("canvas_width", &pdflib::render_config::canvas_width)
    .def_readwrite("canvas_height", &pdflib::render_config::canvas_height);

  // PageRenderResult - result of a threaded page render task
  // NOTE(review): the docstring lists an `image_data` attribute, but only
  // `image_shape` and `get_image()` are bound here. Whether the
  // PageDecodeResult base is declared cannot be told from this copy because
  // the class_ template arguments were stripped.
  pybind11::class_(m, "PageRenderResult", R"( Result of a threaded page rendering task. Inherits all attributes of PageDecodeResult and adds rendered image data. Attributes: image_data: Raw RGBA bytes of the rendered page (height x width x 4, row-major). image_shape: Shape of the image as [height, width, channels]. )")
    .def_readonly("image_shape", &docling::page_render_result::image_shape)
    // Copies image_data into a Python bytes object; a null or empty buffer
    // yields empty bytes.
    .def("get_image",
         [](docling::page_render_result& self) -> pybind11::bytes {
           if(not self.image_data or self.image_data->empty())
             {
               return pybind11::bytes();
             }
           return pybind11::bytes(
               reinterpret_cast(self.image_data->data()),
               self.image_data->size());
         },
         R"( Return the raw RGBA pixel data as Python bytes. Use together with image_shape to reconstruct a PIL image: result = renderer.get_task() h, w, _ = result.image_shape img = Image.frombuffer("RGBA", (w, h), result.get_image(), "raw", "RGBA", 0, 1) Returns: bytes: Raw RGBA pixel data, or empty bytes on failure.)");

  // threaded_pdf_renderer - parallel PDF renderer with bounded result queue
  pybind11::class_(m, "threaded_pdf_renderer", R"( Threaded PDF renderer that decodes and renders pages in parallel. Loads multiple documents and renders their pages using a thread pool. 
Each result contains both the decoded page data and the rendered RGBA image. Results are available via a bounded queue to control memory usage. )")
    .def(pybind11::init(),
         pybind11::arg("loglevel") = "fatal",
         pybind11::arg("num_threads") = 4,
         pybind11::arg("max_concurrent_results") = 32,
         pybind11::arg("decode_config") = pdflib::decode_config(),
         pybind11::arg("render_config") = pdflib::render_config(),
         R"( Construct a threaded PDF renderer. Parameters: loglevel (str): Logging level ('fatal', 'error', 'warning', 'info'). num_threads (int): Number of worker threads. max_concurrent_results (int): Maximum results buffered before workers pause. decode_config (DecodePageConfig): Configuration for page decoding. render_config (RenderConfig): Configuration for page rendering.)")
    // NOTE(review): load_document, load_document_from_bytesio and has_tasks
    // carry no docstring here, unlike their threaded_pdf_parser counterparts.
    .def("load_document",
         [](docling::docling_threaded_renderer& self,
            const std::string& key,
            const std::string& filename,
            std::optional& password) -> bool {
           return self.load_document(key, filename, password);
         },
         pybind11::arg("key"),
         pybind11::arg("filename"),
         pybind11::arg("password") = pybind11::none())
    .def("load_document_from_bytesio",
         [](docling::docling_threaded_renderer& self,
            const std::string& key,
            pybind11::object bytes_io,
            std::optional& password) -> bool {
           return self.load_document_from_bytesio(key, bytes_io, password);
         },
         pybind11::arg("key"),
         pybind11::arg("bytes_io"),
         pybind11::arg("password") = pybind11::none())
    .def("has_tasks",
         [](docling::docling_threaded_renderer& self) -> bool {
           return self.has_tasks();
         })
    // The GIL is released for the duration of the call to get_task().
    .def("get_task",
         [](docling::docling_threaded_renderer& self) -> docling::page_render_result {
           pybind11::gil_scoped_release release;
           return self.get_task();
         },
         R"( Get the next completed page render result. Blocks until a result is available. Releases the GIL while waiting. Returns: PageRenderResult: The result of a page rendering task.)");
}