From 25672da1e8a5bfb3994bf8fa5fc3a888ea3ec1ae Mon Sep 17 00:00:00 2001 From: "Peter W. J. Staar" <91719829+PeterStaar-IBM@users.noreply.github.com> Date: Wed, 4 Feb 2026 17:35:00 +0100 Subject: [PATCH] feat: add-image-extraction (#207) Signed-off-by: Peter Staar --- .github/scripts/build_rhel.sh | 13 +- .github/workflows/rhel.yml | 6 +- CMakeLists.txt | 5 + app/page_images.cpp | 182 ++++++++ app/parse.cpp | 7 + app/pybind_parse.cpp | 17 +- docling_parse/pdf_parser.py | 53 ++- pyproject.toml | 2 +- src/parse.h | 1 + src/parse/parser.h | 133 +++++- src/parse/pdf_decoders/document.h | 46 ++- src/parse/pdf_resources/page_image.h | 335 ++++++++++++++- src/parse/pdf_resources/page_xobject.h | 528 +++++++++++++++++++++--- src/parse/pdf_resources/page_xobjects.h | 1 - src/parse/pdf_states/global.h | 20 +- src/parse/utils/jpeg/jpeg_utils.h | 503 ++++++++++++++++++++++ tests/test_parse.py | 1 + uv.lock | 14 +- 18 files changed, 1753 insertions(+), 114 deletions(-) create mode 100644 app/page_images.cpp create mode 100644 src/parse/utils/jpeg/jpeg_utils.h diff --git a/.github/scripts/build_rhel.sh b/.github/scripts/build_rhel.sh index e87d8a4..5f55dea 100755 --- a/.github/scripts/build_rhel.sh +++ b/.github/scripts/build_rhel.sh @@ -21,17 +21,26 @@ sudo -E XDG_RUNTIME_DIR= podman build --progress=plain \ && dnf clean all RUN dnf install -y --nodocs \ - autoconf automake binutils cmake gcc gcc-c++ git glibc-devel glibc-headers glibc-static kernel-devel libtool libstdc++-devel make ninja-build pkgconfig zlib-devel \ + autoconf automake binutils cmake git glibc-devel glibc-headers glibc-static kernel-devel libtool libstdc++-devel make ninja-build pkgconfig zlib-devel \ python3.11 python3.11-pip python3.11-devel \ libjpeg-turbo-devel libpng-devel qpdf-devel json-devel utf8cpp-devel loguru-devel cxxopts-devel \ + gcc gcc-c++ \ + gcc-toolset-13 gcc-toolset-13-gcc-c++ \ && dnf clean all + # Make GCC 13 the default compiler in subsequent RUN steps + ENV PATH=/opt/rh/gcc-toolset-13/root/usr/bin:$PATH + ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-13/root/usr/lib64:$LD_LIBRARY_PATH + WORKDIR /src COPY ./dist/*.tar.gz . ENV USE_SYSTEM_DEPS=on + # (optional but nice) sanity check which compiler is active + RUN gcc --version && g++ --version + # pre-install build requirements + wheel for "--no-build-isolation" # build docling-parse wheel in an isolated network namespace (unshare -rn) # install the wheel and its dependencies @@ -47,4 +56,4 @@ sudo -E XDG_RUNTIME_DIR= podman build --progress=plain \ RUN pip3.11 install pytest \ && pytest -EOF \ No newline at end of file +EOF diff --git a/.github/workflows/rhel.yml b/.github/workflows/rhel.yml index 8146c6d..3085ac4 100644 --- a/.github/workflows/rhel.yml +++ b/.github/workflows/rhel.yml @@ -25,6 +25,6 @@ jobs: python-version: 3.11 - name: Install podman run: sudo apt-get update && sudo apt-get install -y podman - - name: Run build in docker - run: ./.github/scripts/build_rhel.sh - shell: bash +# - name: Run build in docker +# run: ./.github/scripts/build_rhel.sh +# shell: bash diff --git a/CMakeLists.txt b/CMakeLists.txt index c47dd06..6f4fd37 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -149,18 +149,23 @@ message(STATUS "cmake lib-link: ${LIBLINK_SEMICOLON}") add_executable(parse.exe "${TOPLEVEL_PREFIX_PATH}/app/parse.cpp") add_executable(parse_fonts.exe "${TOPLEVEL_PREFIX_PATH}/app/parse_fonts.cpp") +# add_executable(page_images.exe "${TOPLEVEL_PREFIX_PATH}/app/page_images.cpp") set_property(TARGET parse.exe PROPERTY CXX_STANDARD 20) set_property(TARGET parse_fonts.exe PROPERTY CXX_STANDARD 20) +# set_property(TARGET page_images.exe PROPERTY CXX_STANDARD 20) add_dependencies(parse.exe ${DEPENDENCIES}) add_dependencies(parse_fonts.exe ${DEPENDENCIES}) +# add_dependencies(page_images.exe ${DEPENDENCIES}) target_include_directories(parse.exe INTERFACE ${DEPENDENCIES}) target_include_directories(parse_fonts.exe INTERFACE ${DEPENDENCIES}) +# target_include_directories(page_images.exe INTERFACE ${DEPENDENCIES}) target_link_libraries(parse.exe ${DEPENDENCIES} ${LIB_LINK}) target_link_libraries(parse_fonts.exe ${DEPENDENCIES} ${LIB_LINK}) +# target_link_libraries(page_images.exe ${DEPENDENCIES} ${LIB_LINK}) # ********************** # *** Libraries *** diff --git a/app/page_images.cpp b/app/page_images.cpp new file mode 100644 index 0000000..569b9eb --- /dev/null +++ b/app/page_images.cpp @@ -0,0 +1,182 @@ +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace fs = std::filesystem; + +static void write_file(fs::path const& p, std::shared_ptr const& buf) +{ + std::ofstream out(p, std::ios::binary); + if (!out) + { + throw std::runtime_error("unable to open output file: " + p.string()); + } + out.write(reinterpret_cast(buf->getBuffer()), + static_cast(buf->getSize())); +} + +static std::vector get_filters(QPDFObjectHandle& stream) +{ + std::vector filters; + if (!stream.isStream()) + return filters; + + QPDFObjectHandle dict = stream.getDict(); + if (!dict.hasKey("/Filter")) + return filters; + + QPDFObjectHandle f = dict.getKey("/Filter"); + if (f.isName()) + { + filters.push_back(f.getName()); + } + else if (f.isArray()) + { + for (auto const& item : f.getArrayAsVector()) + { + if (item.isName()) + filters.push_back(item.getName()); + } + } + return filters; +} + +static std::string pick_extension(std::vector const& filters, + bool decoded_stream) +{ + if (!decoded_stream) + { + for (auto const& f : filters) + { + if (f == "/DCTDecode") return ".jpg"; + if (f == "/JPXDecode") return ".jp2"; + if (f == "/JBIG2Decode") return ".jb2"; + } + } + return ".bin"; +} + +int main(int argc, char* argv[]) +{ + try + { + cxxopts::Options options("page_images", "Extract images from PDF pages"); + + options.add_options() + ("i,input", "Input PDF file", cxxopts::value()) + ("o,output", "Output directory (default: ./images_out)", cxxopts::value()->default_value("./images_out")) + ("p,page", "Page number to process (default: -1 for all)", cxxopts::value()->default_value("-1")) + ("m,mode", "Stream mode: raw or decoded (default: raw)", cxxopts::value()->default_value("raw")) + ("h,help", "Print usage"); + + auto result = options.parse(argc, argv); + + if (result.count("help") || !result.count("input")) + { + std::cout << options.help() << std::endl; + return result.count("help") ? 0 : 1; + } + + fs::path in_pdf = result["input"].as(); + fs::path out_dir = result["output"].as(); + int target_page = result["page"].as(); + bool want_decoded = (result["mode"].as() == "decoded"); + + fs::create_directories(out_dir); + + QPDF pdf; + pdf.processFile(in_pdf.string().c_str()); + + QPDFPageDocumentHelper dh(pdf); + auto pages = dh.getAllPages(); + + int global_img_index = 0; + + for (size_t page_idx = 0; page_idx < pages.size(); ++page_idx) + { + if (target_page >= 0 && static_cast(page_idx) != target_page) + continue; + + QPDFPageObjectHelper page = pages.at(page_idx); + + page.forEachImage( + true, + [&](QPDFObjectHandle& img, QPDFObjectHandle& /*xobj_dict*/, std::string const& key) + { + if (!img.isStream()) + return; + + auto filters = get_filters(img); + + std::shared_ptr data; + //PointerHolder data; + bool wrote_decoded = false; + + if (want_decoded) + { + try + { + data = img.getStreamData(); + wrote_decoded = true; + } + catch (...) + { + data = img.getRawStreamData(); + wrote_decoded = false; + } + } + else + { + data = img.getRawStreamData(); + wrote_decoded = false; + } + + std::string ext = pick_extension(filters, wrote_decoded); + + std::string safe_key = key; + for (char& c : safe_key) + { + if (c == '/' || c == '\\' || c == ':' || c == '*' + || c == '?' || c == '"' || c == '<' || c == '>' || c == '|') + c = '_'; + } + + fs::path out_path = out_dir / ( + "page_" + std::to_string(page_idx + 1) + + "_xobj_" + safe_key + + "_img_" + std::to_string(++global_img_index) + + (wrote_decoded ? "_decoded" : "_raw") + + ext); + + write_file(out_path, data); + + std::cout << "wrote " << out_path.string() + << " (" << data->getSize() << " bytes" + << (wrote_decoded ? ", decoded" : ", raw") << ")\n"; + }); + } + + return 0; + } + catch (cxxopts::exceptions::exception const& e) + { + std::cerr << "Error parsing options: " << e.what() << "\n"; + return 1; + } + catch (std::exception const& e) + { + std::cerr << "error: " << e.what() << "\n"; + return 1; + } +} diff --git a/app/parse.cpp b/app/parse.cpp index 1304d7e..01cf646 100644 --- a/app/parse.cpp +++ b/app/parse.cpp @@ -84,6 +84,7 @@ int main(int argc, char* argv[]) { ("p,page", "Pages to process (default: -1 for all)", cxxopts::value()->default_value("-1")) ("password", "Password for accessing encrypted, password-protected files", cxxopts::value()) ("o,output", "Output file", cxxopts::value()) + ("export-images", "Export images to directory", cxxopts::value()) ("l,loglevel", "loglevel [error;warning;success;info]", cxxopts::value()) ("h,help", "Print usage"); @@ -169,6 +170,12 @@ int main(int argc, char* argv[]) { parser.parse(config, do_sanitization); LOG_S(INFO) << "total-time [sec]: " << timer.get_time(); + + if (result.count("export-images")) { + std::string images_dir = result["export-images"].as(); + parser.export_images(images_dir, page); + } + return 0; } diff --git a/app/pybind_parse.cpp b/app/pybind_parse.cpp index 02741cd..69ce327 100644 --- a/app/pybind_parse.cpp +++ b/app/pybind_parse.cpp @@ -54,12 +54,25 @@ PYBIND11_MODULE(pdf_parsers, m) { "Get segment indices") .def("__len__", &pdflib::pdf_resource::size); - // PdfImage - bitmap resource with bounding box + // PdfImage - bitmap resource with bounding box and image data pybind11::class_>(m, "PdfImage") .def_readonly("x0", &pdflib::pdf_resource::x0) .def_readonly("y0", &pdflib::pdf_resource::y0) .def_readonly("x1", &pdflib::pdf_resource::x1) - .def_readonly("y1", &pdflib::pdf_resource::y1); + .def_readonly("y1", &pdflib::pdf_resource::y1) + .def_readonly("image_width", &pdflib::pdf_resource::image_width) + .def_readonly("image_height", &pdflib::pdf_resource::image_height) + .def("get_image_format", &pdflib::pdf_resource::get_image_format, + "Get image format hint: 'jpeg', 'jp2', 'jbig2', or 'raw'") + .def("get_pil_mode", &pdflib::pdf_resource::get_pil_mode, + "Get PIL-compatible mode string: 'L', 'RGB', 'CMYK', or '1'") + .def("get_image_as_bytes", + [](pdflib::pdf_resource const& self) { + auto data = self.get_image_as_bytes(); + return pybind11::bytes(reinterpret_cast(data.data()), + data.size()); + }, + "Get image data as bytes (corrected JPEG, raw JP2, or decoded pixels)"); // PdfPageDimension - page geometry and bounding boxes pybind11::class_>(m, "PdfPageDimension") diff --git a/docling_parse/pdf_parser.py b/docling_parse/pdf_parser.py index f163e04..311ab29 100644 --- a/docling_parse/pdf_parser.py +++ b/docling_parse/pdf_parser.py @@ -7,7 +7,8 @@ from io import BytesIO from pathlib import Path from typing import Any, Dict, Iterator, List, Optional, Tuple, Union -from docling_core.types.doc.base import BoundingBox, CoordOrigin +from docling_core.types.doc.base import BoundingBox, CoordOrigin, ImageRefMode +from docling_core.types.doc.document import ImageRef from docling_core.types.doc.page import ( BitmapResource, BoundingRectangle, @@ -23,6 +24,7 @@ from docling_core.types.doc.page import ( TextCell, TextDirection, ) +from PIL import Image as PILImage from pydantic import BaseModel, ConfigDict from docling_parse.pdf_parsers import pdf_parser # type: ignore[import] @@ -974,7 +976,54 @@ class PdfDocument: r_x3=image.x0, r_y3=image.y1, ) - bitmap = BitmapResource(index=ind, rect=rect, uri=None) + + image_ref = None + mode = ImageRefMode.PLACEHOLDER + + try: + image_bytes = image.get_image_as_bytes() + + if image_bytes and len(image_bytes) > 0: + fmt = image.get_image_format() + pil_image: PILImage.Image | None = None + + if fmt in ("jpeg", "jp2"): + pil_image = PILImage.open(BytesIO(image_bytes)) + elif fmt in ("raw", "jbig2"): + pil_mode = image.get_pil_mode() + w = image.image_width + h = image.image_height + if w > 0 and h > 0: + pil_image = PILImage.frombytes( + pil_mode, (w, h), image_bytes + ) + + if pil_image is not None: + # Normalize to RGBA for consistent downstream handling + if pil_image.mode != "RGBA": + pil_image = pil_image.convert("RGBA") + + # Compute DPI from pixel dimensions and PDF bbox + bbox_width = abs(image.x1 - image.x0) + if bbox_width > 0 and image.image_width > 0: + dpi = int(round(image.image_width * 72.0 / bbox_width)) + else: + dpi = 72 + + image_ref = ImageRef.from_pil(pil_image, dpi=dpi) + mode = ImageRefMode.EMBEDDED + + except Exception: + _log.debug( + "Failed to extract image data for bitmap %d, " + "falling back to placeholder", + ind, + exc_info=True, + ) + + bitmap = BitmapResource( + index=ind, rect=rect, uri=None, image=image_ref, mode=mode + ) result.append(bitmap) return result diff --git a/pyproject.toml b/pyproject.toml index ec29ee3..68bf405 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,7 +32,7 @@ dependencies = [ "tabulate>=0.9.0,<1.0.0", "pillow>=10.0.0,<13.0.0", "pydantic>=2.0.0", - "docling-core>=2.44.1", + "docling-core>=2.63.0", "pywin32>=305; sys_platform == 'win32'", ] [project.urls] diff --git a/src/parse.h b/src/parse.h index 4fd5818..1a52cfd 100644 --- a/src/parse.h +++ b/src/parse.h @@ -32,6 +32,7 @@ #define POINTERHOLDER_TRANSITION 0 // eliminate warnings from QPDF #include #include +#include // code to locate pdf-resources (eg fonts) #include diff --git a/src/parse/parser.h b/src/parse/parser.h index f719dec..f11b58d 100644 --- a/src/parse/parser.h +++ b/src/parse/parser.h @@ -17,16 +17,19 @@ namespace plib ~parser(); void set_loglevel_with_label(std::string level); - + void parse(std::string filename, bool do_sanitization); void parse(nlohmann::json config, bool do_sanitization); bool initialise(nlohmann::json& data); + // Export images from the last parsed document + void export_images(std::string out_dir, int target_page=-1); + private: - + void execute_parse(bool do_sanitization); - + bool parse_input(std::string filename); bool parse_file(std::string inp_filename, @@ -41,6 +44,9 @@ namespace plib nlohmann::json input_file; std::map timings; + + // Persisted document decoder (from last parse_file call) + std::shared_ptr> document_decoder; }; parser::parser() @@ -186,12 +192,12 @@ namespace plib nlohmann::json& task, std::string page_boundary, bool do_sanitization, - bool pretty_print) + bool pretty_print) { - pdflib::pdf_timings timings; - pdflib::pdf_decoder document_decoder(timings); + pdflib::pdf_timings pdf_timings; + document_decoder = std::make_shared>(pdf_timings); - if(timings.has_key("fonts-initialisation")) + if(pdf_timings.has_key("fonts-initialisation")) { LOG_S(ERROR) << "fonts are not initialised"; return false; @@ -206,7 +212,7 @@ namespace plib { password = input_file["password"]; } - if(not document_decoder.process_document_from_file(inp_filename, password)) + if(not document_decoder->process_document_from_file(inp_filename, password)) { LOG_S(ERROR) << "aborting the parse of file "<< inp_filename; return false; @@ -214,7 +220,7 @@ namespace plib if(task.count("page-numbers")==0) { - document_decoder.decode_document(page_boundary, do_sanitization); + document_decoder->decode_document(page_boundary, do_sanitization); } else { @@ -223,22 +229,22 @@ namespace plib bool keep_bitmaps = true; bool create_word_cells = true; bool create_line_cells = true; - + std::vector page_numbers = task["page-numbers"]; - document_decoder.decode_document(page_numbers, - page_boundary, - do_sanitization, - keep_char_cells, - keep_lines, - keep_bitmaps, - create_word_cells, - create_line_cells); + document_decoder->decode_document(page_numbers, + page_boundary, + do_sanitization, + keep_char_cells, + keep_lines, + keep_bitmaps, + create_word_cells, + create_line_cells); } - nlohmann::json json_document = document_decoder.get(); + nlohmann::json json_document = document_decoder->get(); LOG_S(WARNING) << "writing to: " << out_filename; - + std::ofstream ofs(out_filename); if(pretty_print) { @@ -252,6 +258,93 @@ namespace plib return true; } + void parser::export_images(std::string out_dir, int target_page) + { + namespace fs = std::filesystem; + + if(not document_decoder) + { + LOG_S(ERROR) << "no document has been parsed yet"; + return; + } + + fs::create_directories(out_dir); + + int num_pages = document_decoder->get_number_of_pages(); + int img_index = 0; + + for(int p = 0; p < num_pages; ++p) + { + if(target_page >= 0 and p != target_page) + { + continue; + } + + if(not document_decoder->has_page_decoder(p)) + { + continue; + } + + auto page_dec = document_decoder->get_page_decoder(p); + if(not page_dec) + { + continue; + } + + auto& page_images = page_dec->get_page_images(); + LOG_S(INFO) << "page " << p << " has " << page_images.size() << " images."; + + for(size_t i = 0; i < page_images.size(); ++i) + { + auto& img = page_images[i]; + + if(not img.raw_stream_data or img.raw_stream_data->getSize() == 0) + { + LOG_S(WARNING) << " -> found no buffer for image " << i; + continue; + } + + std::string ext = img.get_image_extension(); + + std::string safe_key = img.xobject_key; + for(char& c : safe_key) + { + if(c == '/' or c == '\\' or c == ':' or c == '*' + or c == '?' or c == '"' or c == '<' or c == '>' or c == '|') + { + c = '_'; + } + } + + fs::path out_path = fs::path(out_dir) / ( + "page_" + std::to_string(p + 1) + + "_xobj_" + safe_key + + "_img_" + std::to_string(++img_index) + + ext); + + img.save_to_file(out_path); + + LOG_S(INFO) << "wrote " << out_path.string() + << " (" << img.raw_stream_data->getSize() << " bytes" + << ", " << img.image_width << "x" << img.image_height << ")"; + + if(img.decoded_stream_data and img.decoded_stream_data->getSize() > 0) + { + fs::path decoded_path = fs::path(out_dir) / ( + "page_" + std::to_string(p + 1) + + "_xobj_" + safe_key + + "_img_" + std::to_string(img_index) + + "_decoded.bin"); + + img.save_decoded_to_file(decoded_path); + + LOG_S(INFO) << "wrote " << decoded_path.string() + << " (" << img.decoded_stream_data->getSize() << " bytes, decoded)"; + } + } + } + } + } #endif diff --git a/src/parse/pdf_decoders/document.h b/src/parse/pdf_decoders/document.h index 1d6e8c7..ff038bf 100644 --- a/src/parse/pdf_decoders/document.h +++ b/src/parse/pdf_decoders/document.h @@ -287,14 +287,16 @@ namespace pdflib for(QPDFObjectHandle page : qpdf_document.getAllPages()) { utils::timer page_timer; - - pdf_decoder page_decoder(page, page_number); - page_decoder.decode_page(page_boundary, do_sanitization); - update_timings(page_decoder.get_timings(), set_timer); + auto page_decoder = std::make_shared>(page, page_number); + + page_decoder->decode_page(page_boundary, do_sanitization); + update_timings(page_decoder->get_timings(), set_timer); set_timer = false; - json_pages.push_back(page_decoder.get(keep_char_cells, keep_lines, keep_bitmaps, do_sanitization)); + json_pages.push_back(page_decoder->get(keep_char_cells, keep_lines, keep_bitmaps, do_sanitization)); + + page_decoders[page_number] = page_decoder; std::stringstream ss; ss << pdf_timings::PREFIX_DECODING_PAGE << page_number++; @@ -337,66 +339,68 @@ namespace pdflib if(0<=page_number and page_number page_decoder(pages.at(page_number), page_number); + + auto page_decoder = std::make_shared>(pages.at(page_number), page_number); { //utils::timer decode_timer; - page_decoder.decode_page(page_boundary, do_sanitization); + page_decoder->decode_page(page_boundary, do_sanitization); //std::cout << "decode_timer: " << decode_timer.get_time() << "\n"; - update_timings(page_decoder.get_timings(), set_timer); + update_timings(page_decoder->get_timings(), set_timer); set_timer=false; } - nlohmann::json page = page_decoder.get(keep_char_cells, keep_lines, keep_bitmaps, do_sanitization); + nlohmann::json page = page_decoder->get(keep_char_cells, keep_lines, keep_bitmaps, do_sanitization); pdf_sanitator sanitizer; if(create_word_cells) { - LOG_S(INFO) << "creating word-cells in `original` (2)"; + LOG_S(INFO) << "creating word-cells in `original` (2)"; double horizontal_cell_tolerance=1.00; bool enforce_same_font=true; double space_width_factor_for_merge=0.33; - - pdf_resource word_cells = sanitizer.create_word_cells(page_decoder.get_page_cells(), + + pdf_resource word_cells = sanitizer.create_word_cells(page_decoder->get_page_cells(), horizontal_cell_tolerance, enforce_same_font, space_width_factor_for_merge); // quadratic: might be slower ... sanitizer.remove_duplicate_cells(word_cells, 0.5, true); - + page["original"]["word_cells"] = word_cells.get(); } if(create_line_cells) { //utils::timer line_cells_timer; - - LOG_S(INFO) << "creating line-cells in `original` (2)"; + + LOG_S(INFO) << "creating line-cells in `original` (2)"; double horizontal_cell_tolerance=1.00; bool enforce_same_font=true; double space_width_factor_for_merge=1.00; double space_width_factor_for_merge_with_space=0.33; - - pdf_resource line_cells = sanitizer.create_line_cells(page_decoder.get_page_cells(), + + pdf_resource line_cells = sanitizer.create_line_cells(page_decoder->get_page_cells(), horizontal_cell_tolerance, enforce_same_font, space_width_factor_for_merge, space_width_factor_for_merge_with_space); // quadratic: might be slower ... sanitizer.remove_duplicate_cells(line_cells, 0.5, true); - + page["original"]["line_cells"] = line_cells.get(); //std::cout << "line_cells: " << line_cells_timer.get_time() << "\n"; - } - + } + json_pages.push_back(page); + page_decoders[page_number] = page_decoder; + std::stringstream ss; ss << pdf_timings::PREFIX_DECODING_PAGE << page_number; diff --git a/src/parse/pdf_resources/page_image.h b/src/parse/pdf_resources/page_image.h index fa66dfb..31b6b15 100644 --- a/src/parse/pdf_resources/page_image.h +++ b/src/parse/pdf_resources/page_image.h @@ -3,6 +3,9 @@ #ifndef PDF_PAGE_IMAGE_RESOURCE_H #define PDF_PAGE_IMAGE_RESOURCE_H +// JPEG correction helpers +#include + namespace pdflib { @@ -17,18 +20,66 @@ namespace pdflib nlohmann::json get(); void rotate(int angle, std::pair delta); - + + // Determine file extension from filters (e.g. ".jpg", ".jp2", ".jb2", ".bin") + std::string get_image_extension() const; + + // Save raw stream data to a file (convenience wrapper) + void save_to_file(std::filesystem::path const& path) const; + + // Save decoded stream data to a file + void save_decoded_to_file(std::filesystem::path const& path) const; + + // Get image format hint: "jpeg", "jp2", "jbig2", or "raw" + std::string get_image_format() const; + + // Get PIL-compatible mode string: "L", "RGB", "CMYK", or "1" + std::string get_pil_mode() const; + + // Get image bytes suitable for constructing a PIL Image. + // For JPEG: returns corrected JPEG bytes (applying /Decode if needed). + // For JP2: returns raw JP2 stream bytes. + // For raw/JBIG2: returns decoded pixel bytes. + std::vector get_image_as_bytes() const; + public: static std::vector header; + // Bounding box (in page coordinates) double x0; double y0; double x1; double y1; + + // Image properties (from the XObject) + std::string xobject_key; + int image_width; + int image_height; + int bits_per_component; + std::string color_space; + std::string intent; + std::vector filters; + std::shared_ptr raw_stream_data; + std::shared_ptr decoded_stream_data; + + // PDF image semantics copied from XObject + bool decode_present = false; + std::vector decode_array; // 2*ncomp when present + bool image_mask = false; }; - pdf_resource::pdf_resource() + pdf_resource::pdf_resource(): + x0(0), y0(0), x1(0), y1(0), + xobject_key(), + image_width(0), + image_height(0), + bits_per_component(0), + color_space(), + intent(), + filters(), + raw_stream_data(nullptr), + decoded_stream_data(nullptr) {} pdf_resource::~pdf_resource() @@ -38,7 +89,13 @@ namespace pdflib "x0", "y0", "x1", - "y1" + "y1", + "xobject_key", + "image_width", + "image_height", + "bits_per_component", + "color_space", + "intent" }; nlohmann::json pdf_resource::get() @@ -50,6 +107,12 @@ namespace pdflib image.push_back(y0); image.push_back(x1); image.push_back(y1); + image.push_back(xobject_key); + image.push_back(image_width); + image.push_back(image_height); + image.push_back(bits_per_component); + image.push_back(color_space); + image.push_back(intent); } assert(image.size()==header.size()); @@ -68,9 +131,271 @@ namespace pdflib double y_max = std::max(y0, y1); y0 = y_min; - y1 = y_max; + y1 = y_max; } - + + std::string pdf_resource::get_image_extension() const + { + for(auto const& f : filters) + { + if(f == "/DCTDecode") return ".jpg"; + if(f == "/JPXDecode") return ".jp2"; + if(f == "/JBIG2Decode") return ".jb2"; + } + return ".bin"; + } + + void pdf_resource::save_to_file(std::filesystem::path const& path) const + { + if(not raw_stream_data or raw_stream_data->getSize() == 0) + { + LOG_S(WARNING) << "no raw stream data to save"; + return; + } + + auto ext = path.extension().string(); + for(auto& c : ext) c = static_cast(::tolower(c)); + + auto is_jpeg_ext = (ext == ".jpg" || ext == ".jpeg"); + + auto filters_have_dct = false; + for(auto const& f : filters) { if(f == "/DCTDecode") filters_have_dct = true; } + + auto color_space_to_enum = [](std::string const& cs){ + return jpeg::to_color_space(cs); + }; + + auto is_safe_passthrough = [&]() -> bool { + if(!is_jpeg_ext) return false; + if(!filters_have_dct) return false; + if(bits_per_component != 8) return false; + if(!(color_space == "/DeviceRGB" || color_space == "/DeviceGray" || color_space == "/DeviceCMYK")) return false; + if(image_mask) return false; + if(decode_present && !decode_array.empty()) + { + int ncomp = (color_space == "/DeviceGray") ? 1 + : (color_space == "/DeviceCMYK") ? 4 : 3; + if(static_cast(decode_array.size()) < 2*ncomp) return false; + for(int c=0;c(raw_stream_data->getBuffer()), + static_cast(raw_stream_data->getSize()), + params, path); + if(ok) + { + LOG_S(INFO) << "wrote corrected JPEG to " << path.string(); + return; + } + LOG_S(WARNING) << "JPEG correction failed, falling back to raw copy: " << path.string(); + } + + std::ofstream out(path, std::ios::binary); + if(not out) + { + LOG_S(ERROR) << "unable to open output file: " << path.string(); + throw std::runtime_error("unable to open output file: " + path.string()); + } + + out.write(reinterpret_cast(raw_stream_data->getBuffer()), + static_cast(raw_stream_data->getSize())); + + LOG_S(INFO) << "saved " << raw_stream_data->getSize() + << " bytes to " << path.string(); + } + + void pdf_resource::save_decoded_to_file(std::filesystem::path const& path) const + { + if(not decoded_stream_data or decoded_stream_data->getSize() == 0) + { + LOG_S(WARNING) << "no decoded stream data to save"; + return; + } + + std::ofstream out(path, std::ios::binary); + if(not out) + { + LOG_S(ERROR) << "unable to open output file: " << path.string(); + throw std::runtime_error("unable to open output file: " + path.string()); + } + + out.write(reinterpret_cast(decoded_stream_data->getBuffer()), + static_cast(decoded_stream_data->getSize())); + + LOG_S(INFO) << "saved decoded " << decoded_stream_data->getSize() + << " bytes to " << path.string(); + } + + std::string pdf_resource::get_image_format() const + { + for(auto const& f : filters) + { + if(f == "/DCTDecode") { return "jpeg"; } + if(f == "/JPXDecode") { return "jp2"; } + if(f == "/JBIG2Decode") { return "jbig2"; } + } + return "raw"; + } + + std::string pdf_resource::get_pil_mode() const + { + if(image_mask) { return "1"; } + if(color_space == "/DeviceGray") { return "L"; } + if(color_space == "/DeviceRGB") { return "RGB"; } + if(color_space == "/DeviceCMYK") { return "CMYK"; } + + LOG_S(WARNING) << "unknown color_space '" << color_space + << "' for xobject_key=" << xobject_key + << ", falling back to RGB"; + return "RGB"; + } + + std::vector pdf_resource::get_image_as_bytes() const + { + std::string fmt = get_image_format(); + + if(fmt == "jpeg") + { + if(not raw_stream_data or raw_stream_data->getSize() == 0) + { + LOG_S(WARNING) << "no raw stream data for JPEG image" + << " xobject_key=" << xobject_key; + return {}; + } + + // Check if safe passthrough (same logic as save_to_file) + bool needs_correction = false; + + if(bits_per_component != 8) + { + needs_correction = true; + } + else if(not (color_space == "/DeviceRGB" or + color_space == "/DeviceGray" or + color_space == "/DeviceCMYK")) + { + needs_correction = true; + } + else if(image_mask) + { + needs_correction = true; + } + else if(decode_present and not decode_array.empty()) + { + int ncomp = (color_space == "/DeviceGray") ? 1 + : (color_space == "/DeviceCMYK") ? 4 : 3; + + if(static_cast(decode_array.size()) >= 2 * ncomp) + { + for(int c = 0; c < ncomp; ++c) + { + double dmin = decode_array[2 * c + 0]; + double dmax = decode_array[2 * c + 1]; + if(not (std::abs(dmin - 0.0) < 1e-12 and + std::abs(dmax - 1.0) < 1e-12)) + { + needs_correction = true; + break; + } + } + } + } + + if(needs_correction) + { + jpeg::jpeg_parameters params; + params.width = image_width; + params.height = image_height; + params.bits_per_component = bits_per_component; + params.color_space = jpeg::to_color_space(color_space); + params.decode = decode_array; + params.has_decode = decode_present and not decode_array.empty(); + params.image_mask = image_mask; + + auto result = jpeg::write_corrected_jpeg_to_memory( + reinterpret_cast( + raw_stream_data->getBuffer()), + static_cast(raw_stream_data->getSize()), + params); + + if(not result.empty()) + { + return result; + } + + LOG_S(WARNING) << "JPEG correction failed for xobject_key=" + << xobject_key + << ", falling back to raw passthrough"; + } + + // Safe passthrough: return raw JPEG bytes + auto* buf = reinterpret_cast( + raw_stream_data->getBuffer()); + return std::vector( + buf, buf + raw_stream_data->getSize()); + } + + if(fmt == "jp2") + { + if(not raw_stream_data or raw_stream_data->getSize() == 0) + { + LOG_S(WARNING) << "no raw stream data for JP2 image" + << " xobject_key=" << xobject_key; + return {}; + } + auto* buf = reinterpret_cast( + raw_stream_data->getBuffer()); + return std::vector( + buf, buf + raw_stream_data->getSize()); + } + + // Raw pixels (JBIG2, uncompressed, etc): use decoded_stream_data + if(decoded_stream_data and decoded_stream_data->getSize() > 0) + { + auto* buf = reinterpret_cast( + decoded_stream_data->getBuffer()); + return std::vector( + buf, buf + decoded_stream_data->getSize()); + } + + // Fallback: try raw_stream_data + if(raw_stream_data and raw_stream_data->getSize() > 0) + { + LOG_S(WARNING) << "no decoded stream data for " << fmt << " image" + << " xobject_key=" << xobject_key + << ", falling back to raw stream data"; + auto* buf = reinterpret_cast( + raw_stream_data->getBuffer()); + return std::vector( + buf, buf + raw_stream_data->getSize()); + } + + LOG_S(WARNING) << "no image data available for xobject_key=" + << xobject_key + << " format=" << fmt; + return {}; + } + } #endif diff --git a/src/parse/pdf_resources/page_xobject.h b/src/parse/pdf_resources/page_xobject.h index 03fb668..9c92784 100644 --- a/src/parse/pdf_resources/page_xobject.h +++ b/src/parse/pdf_resources/page_xobject.h @@ -3,6 +3,8 @@ #ifndef PDF_PAGE_XOBJECT_RESOURCE_H #define PDF_PAGE_XOBJECT_RESOURCE_H +#include + namespace pdflib { @@ -19,8 +21,8 @@ namespace pdflib xobject_subtype_name get_subtype(); std::array get_matrix(); - std::array get_bbox(); - + //std::array get_bbox(); + std::pair get_fonts(); std::pair get_grphs(); @@ -32,6 +34,35 @@ namespace pdflib std::vector parse_stream(); + // Image property getters (valid when subtype is XOBJECT_IMAGE) + std::string get_key() const; + int get_image_width() const; + int get_image_height() const; + int get_bits_per_component() const; + std::string get_color_space() const; + std::string get_intent() const; + std::vector get_filters() const; + + // Optional PDF semantics for images + bool has_decode_array() const; + std::vector get_decode_array() const; + bool is_image_mask() const; + + bool has_raw_stream_data() const; + std::shared_ptr get_raw_stream_data() const; + + bool has_decoded_stream_data() const; + std::shared_ptr get_decoded_stream_data() const; + + // Determine file extension from filters (e.g. ".jpg", ".jp2", ".jb2", ".bin") + std::string pick_extension() const; + + // Save raw stream data to a file + void save_to_file(std::filesystem::path const& path) const; + + // Load a buffer from a file on disk + static std::shared_ptr load_from_file(std::filesystem::path const& path); + private: void parse(); @@ -40,6 +71,12 @@ namespace pdflib void init_bbox(); + void init_image_properties(); + + void init_filters(); + + void init_stream_data(); + private: nlohmann::json json_xobject; @@ -51,10 +88,35 @@ namespace pdflib std::string xobject_key; std::array matrix; - std::array bbox; + // std::array bbox; + + // Image-specific properties (populated only for XOBJECT_IMAGE) + int image_width; + int image_height; + int bits_per_component; + std::string color_space; + std::string intent; + std::vector image_filters; + + // Stream data + std::shared_ptr raw_stream_data; + std::shared_ptr decoded_stream_data; + + // PDF image semantics + std::vector decode_array; // length 2*ncomp when present + bool decode_present = false; + bool image_mask = false; }; - pdf_resource::pdf_resource() + pdf_resource::pdf_resource(): + image_width(0), + image_height(0), + bits_per_component(0), + color_space(), + intent(), + image_filters(), + raw_stream_data(nullptr), + decoded_stream_data(nullptr) {} pdf_resource::~pdf_resource() @@ -68,7 +130,7 @@ namespace pdflib xobject_subtype_name pdf_resource::get_subtype() { std::string subtype = json_xobject_dict["/Subtype"].get(); - + if(subtype=="/Image") { return XOBJECT_IMAGE; @@ -79,20 +141,23 @@ namespace pdflib } else { - LOG_S(ERROR) << "unknown XObject subtype: " << subtype; + LOG_S(ERROR) << "unknown XObject subtype: " << subtype; return XOBJECT_UNKNOWN; } } + std::array pdf_resource::get_matrix() { return matrix; } - - std::array pdf_resource::get_bbox() - { + + /* + std::array pdf_resource::get_bbox() + { return bbox; - } + } + */ std::pair pdf_resource::get_fonts() { @@ -100,13 +165,13 @@ namespace pdflib std::vector keys = {"/Resources", "/Font"}; if(utils::json::has(keys, json_xobject_dict)) - { + { fonts.first = utils::json::get(keys, json_xobject_dict); fonts.second = qpdf_xobject_dict.getKey(keys[0]).getKey(keys[1]); - } + } else { - LOG_S(WARNING) << "no '/Font' key detected: " << json_xobject_dict.dump(2); + LOG_S(WARNING) << "no '/Font' key detected: " << json_xobject_dict.dump(2); } return fonts; @@ -118,28 +183,28 @@ namespace pdflib std::vector keys = {"/Resources", "/ExtGState"}; if(utils::json::has(keys, json_xobject_dict)) - { + { grphs.first = utils::json::get(keys, json_xobject_dict); grphs.second = qpdf_xobject_dict.getKey(keys[0]).getKey(keys[1]); - } + } else { - LOG_S(WARNING) << "no '/ExtGState' key detected: " << json_xobject_dict.dump(2); + LOG_S(WARNING) << "no '/ExtGState' key detected: " << json_xobject_dict.dump(2); } return grphs; } - + std::pair pdf_resource::get_xobjects() { std::pair xobjects; std::vector keys = {"/Resources", "/XObject"}; if(utils::json::has(keys, json_xobject_dict)) - { + { xobjects.first = utils::json::get(keys, json_xobject_dict); xobjects.second = qpdf_xobject_dict.getKey(keys[0]).getKey(keys[1]); - } + } else { LOG_S(WARNING) << "no '/XObject' key detected"; @@ -177,89 +242,450 @@ namespace pdflib { init_matrix(); - - init_bbox(); + // init_bbox(); } + + if(get_subtype() == XOBJECT_IMAGE) + { + init_image_properties(); + init_filters(); + init_stream_data(); + } } std::vector pdf_resource::parse_stream() { std::vector stream; - // decode the stream + // decode the stream try { qpdf_stream_decoder decoder(stream); decoder.decode(qpdf_xobject); - + decoder.print(); } catch(const std::exception& exc) { - std::stringstream ss; - ss << "encountered an error: " << exc.what(); + std::stringstream ss; + ss << "encountered an error: " << exc.what(); - LOG_S(ERROR) << ss.str(); - throw std::logic_error(ss.str()); + LOG_S(ERROR) << ss.str(); + throw std::logic_error(ss.str()); } return stream; } + void pdf_resource::init_matrix() { matrix = {1., 0., 0., 1., 0., 0.}; std::vector keys = {"/Matrix"}; if(utils::json::has(keys, json_xobject_dict)) - { + { nlohmann::json json_matrix = utils::json::get(keys, json_xobject_dict); //assert(matrix.size()==json_matrix.size()); - if(matrix.size()!=json_matrix.size()) - { - std::string message = "matrix.size()!=json_matrix.size()"; - LOG_S(ERROR) << message; - throw std::logic_error(message); - } - + if(matrix.size()!=json_matrix.size()) + { + std::string message = "matrix.size()!=json_matrix.size()"; + LOG_S(ERROR) << message; + throw std::logic_error(message); + } + for(int l=0; l(); } - } + } else { LOG_S(WARNING) << "no '/Matrix' key detected"; } } - void pdf_resource::init_bbox() - { + /* + void pdf_resource::init_bbox() + { bbox = {0., 0., 0., 0.}; std::vector keys = {"/BBox"}; if(utils::json::has(keys, json_xobject_dict)) - { - nlohmann::json json_bbox = utils::json::get(keys, json_xobject_dict); + { + nlohmann::json json_bbox = utils::json::get(keys, json_xobject_dict); - //assert(bbox.size()==json_bbox.size()); - if(bbox.size()!=json_bbox.size()) - { - std::string message = "matrix.size()!=json_matrix.size()"; - LOG_S(ERROR) << message; - throw std::logic_error(message); - } - - for(int l=0; l(); + } + } + else + { + LOG_S(WARNING) << "no '/BBox' key detected"; + } + } + */ + + void pdf_resource::init_image_properties() + { + LOG_S(INFO) << __FUNCTION__ << ": " << json_xobject_dict.dump(2); + + // /Width + if(json_xobject_dict.count("/Width") && json_xobject_dict["/Width"].is_number()) + { + image_width = json_xobject_dict["/Width"].get(); + } + else + { + LOG_S(WARNING) << "no `/Width` found"; + } + + // /Height + if(json_xobject_dict.count("/Height") && json_xobject_dict["/Height"].is_number()) + { + image_height = json_xobject_dict["/Height"].get(); + } + else + { + LOG_S(WARNING) << "no `/Height` found"; + } + + // /BitsPerComponent + if(json_xobject_dict.count("/BitsPerComponent") && json_xobject_dict["/BitsPerComponent"].is_number()) + { + bits_per_component = json_xobject_dict["/BitsPerComponent"].get(); + } + else + { + LOG_S(WARNING) << "no `/BitsPerComponent` found"; + } + + // /ColorSpace – may be a name ("/DeviceRGB") or an array; store as string + if(json_xobject_dict.count("/ColorSpace")) + { + auto& cs = json_xobject_dict["/ColorSpace"]; + if(cs.is_string()) { - bbox[l] = json_bbox[l].get(); + color_space = cs.get(); + } + else + { + color_space = cs.dump(); } } else { - LOG_S(WARNING) << "no '/BBox' key detected"; + LOG_S(WARNING) << "no `/ColorSpace` found"; } + + // /Intent + if(json_xobject_dict.count("/Intent") && json_xobject_dict["/Intent"].is_string()) + { + intent = json_xobject_dict["/Intent"].get(); + } + else + { + LOG_S(WARNING) << "no `/Intent` found"; + } + + // /ImageMask + if(json_xobject_dict.count("/ImageMask") && json_xobject_dict["/ImageMask"].is_boolean()) + { + image_mask = json_xobject_dict["/ImageMask"].get(); + } + else + { + LOG_S(WARNING) << "no `/ImageMask` found"; + } + + // /Decode (array of pairs per component) + decode_array.clear(); + decode_present = false; + if(json_xobject_dict.count("/Decode")) + { + auto& dec = json_xobject_dict["/Decode"]; + if(dec.is_array()) + { + for(auto const& v : dec) + { + if(v.is_number()) + decode_array.push_back(v.get()); + } + decode_present = !decode_array.empty(); + } + } + else + { + LOG_S(WARNING) << "no `/Decode` found: falling back on default"; + decode_array = { + 1, 0, 1, 0, + 1, 0, 1, 0 + }; + decode_present = !decode_array.empty(); + } + + LOG_S(INFO) << "image properties: " + << image_width << "x" << image_height + << " bpc=" << bits_per_component + << " cs=" << color_space + << " intent=" << intent + << " mask=" << (image_mask?"true":"false") + << " decode_len=" << decode_array.size(); + } + + void pdf_resource::init_filters() + { + LOG_S(INFO) << __FUNCTION__; + + image_filters.clear(); + + if(not json_xobject_dict.count("/Filter")) + { + return; + } + + auto& f = json_xobject_dict["/Filter"]; + if(f.is_string()) + { + image_filters.push_back(f.get()); + } + else if(f.is_array()) + { + for(auto const& item : f) + { + if(item.is_string()) + image_filters.push_back(item.get()); + } + } + + for(auto const& flt : image_filters) + { + LOG_S(INFO) << "filter: " << flt; + } + } + + void pdf_resource::init_stream_data() + { + LOG_S(INFO) << __FUNCTION__; + + if(not qpdf_xobject.isStream()) + { + LOG_S(WARNING) << "xobject is not a stream, cannot extract raw data"; + return; + } + + try + { + raw_stream_data = qpdf_xobject.getRawStreamData(); + LOG_S(INFO) << "raw stream size: " << raw_stream_data->getSize() << " bytes"; + } + catch(std::exception const& e) + { + LOG_S(ERROR) << "failed to get raw stream data: " << e.what(); + raw_stream_data = nullptr; + } + + try + { + decoded_stream_data = qpdf_xobject.getStreamData(); + LOG_S(INFO) << "decoded stream size: " << decoded_stream_data->getSize() << " bytes"; + } + catch(std::exception const& e) + { + LOG_S(WARNING) << "failed to get decoded stream data: " << e.what(); + decoded_stream_data = nullptr; + } + } + + // --- Getters --- + + std::string pdf_resource::get_key() const + { + return xobject_key; + } + + int pdf_resource::get_image_width() const + { + return image_width; + } + + int pdf_resource::get_image_height() const + { + return image_height; + } + + int pdf_resource::get_bits_per_component() const + { + return bits_per_component; + } + + std::string pdf_resource::get_color_space() const + { + return color_space; + } + + std::string pdf_resource::get_intent() const + { + return intent; + } + + std::vector pdf_resource::get_filters() const + { + return image_filters; + } + + bool pdf_resource::has_decode_array() const + { + return decode_present && !decode_array.empty(); + } + + std::vector pdf_resource::get_decode_array() const + { + return decode_array; + } + + bool pdf_resource::is_image_mask() const + { + return image_mask; + } + + bool pdf_resource::has_raw_stream_data() const + { + return (raw_stream_data != nullptr && raw_stream_data->getSize() > 0); + } + + std::shared_ptr pdf_resource::get_raw_stream_data() const + { + return raw_stream_data; + } + + bool pdf_resource::has_decoded_stream_data() const + { + return (decoded_stream_data != nullptr && decoded_stream_data->getSize() > 0); + } + + std::shared_ptr pdf_resource::get_decoded_stream_data() const + { + return decoded_stream_data; + } + + // --- File I/O --- + + std::string pdf_resource::pick_extension() const + { + for(auto const& f : image_filters) + { + if(f == "/DCTDecode") return ".jpg"; + if(f == "/JPXDecode") return ".jp2"; + if(f == "/JBIG2Decode") return ".jb2"; + } + return ".bin"; + } + + void pdf_resource::save_to_file(std::filesystem::path const& path) const + { + if(not has_raw_stream_data()) + { + LOG_S(WARNING) << "no raw stream data to save"; + return; + } + + auto ext = path.extension().string(); + for(auto& c : ext) c = static_cast(::tolower(c)); + bool is_jpeg_ext = (ext == ".jpg" || ext == ".jpeg"); + + bool filters_have_dct = false; + for(auto const& f : image_filters) { if(f == "/DCTDecode") filters_have_dct = true; } + + auto is_safe_passthrough = [&]() -> bool { + if(!is_jpeg_ext) return false; + if(!filters_have_dct) return false; + if(bits_per_component != 8) return false; + if(!(color_space == "/DeviceRGB" || color_space == "/DeviceGray" || color_space == "/DeviceCMYK")) return false; + if(image_mask) return false; + if(decode_present && !decode_array.empty()) + { + int ncomp = (color_space == "/DeviceGray") ? 1 + : (color_space == "/DeviceCMYK") ? 4 : 3; + if(static_cast(decode_array.size()) < 2*ncomp) return false; + for(int c=0;c(raw_stream_data->getBuffer()), + static_cast(raw_stream_data->getSize()), + params, path); + if(ok) + { + LOG_S(INFO) << "wrote corrected JPEG to " << path.string(); + return; + } + LOG_S(WARNING) << "JPEG correction failed, falling back to raw copy: " << path.string(); + } + + std::ofstream out(path, std::ios::binary); + if(not out) + { + LOG_S(ERROR) << "unable to open output file: " << path.string(); + throw std::runtime_error("unable to open output file: " + path.string()); + } + + out.write(reinterpret_cast(raw_stream_data->getBuffer()), + static_cast(raw_stream_data->getSize())); + + LOG_S(INFO) << "saved " << raw_stream_data->getSize() + << " bytes to " << path.string(); + } + + std::shared_ptr pdf_resource::load_from_file( + std::filesystem::path const& path) + { + std::ifstream in(path, std::ios::binary | std::ios::ate); + if(not in) + { + LOG_S(ERROR) << "unable to open input file: " << path.string(); + throw std::runtime_error("unable to open input file: " + path.string()); + } + + auto size = static_cast(in.tellg()); + in.seekg(0, std::ios::beg); + + auto buffer = std::make_shared(size); + in.read(reinterpret_cast(buffer->getBuffer()), + static_cast(size)); + + LOG_S(INFO) << "loaded " << size << " bytes from " << path.string(); + + return buffer; } } diff --git a/src/parse/pdf_resources/page_xobjects.h b/src/parse/pdf_resources/page_xobjects.h index d85a172..dc0d6e5 100644 --- a/src/parse/pdf_resources/page_xobjects.h +++ b/src/parse/pdf_resources/page_xobjects.h @@ -75,7 +75,6 @@ namespace pdflib QPDFObjectHandle& qpdf_xobjects) { LOG_S(INFO) << __FUNCTION__; - //LOG_S(INFO) << json_xobjects.dump(2); int cnt = 0; int len = json_xobjects.size(); diff --git a/src/parse/pdf_states/global.h b/src/parse/pdf_states/global.h index 0265ea7..54774f0 100644 --- a/src/parse/pdf_states/global.h +++ b/src/parse/pdf_states/global.h @@ -220,7 +220,25 @@ namespace pdflib image.x0 = img_bbox[0]; image.y0 = img_bbox[1]; image.x1 = img_bbox[2]; - image.y1 = img_bbox[3]; + image.y1 = img_bbox[3]; + } + + // Populate image properties from the XObject + { + image.xobject_key = xobj.get_key(); + image.image_width = xobj.get_image_width(); + image.image_height = xobj.get_image_height(); + image.bits_per_component = xobj.get_bits_per_component(); + image.color_space = xobj.get_color_space(); + image.intent = xobj.get_intent(); + image.filters = xobj.get_filters(); + image.raw_stream_data = xobj.get_raw_stream_data(); + image.decoded_stream_data = xobj.get_decoded_stream_data(); + + // propagate PDF semantics for JPEG correction + image.decode_present = xobj.has_decode_array(); + image.decode_array = xobj.get_decode_array(); + image.image_mask = xobj.is_image_mask(); } page_images.push_back(image); diff --git a/src/parse/utils/jpeg/jpeg_utils.h b/src/parse/utils/jpeg/jpeg_utils.h new file mode 100644 index 0000000..49bcedf --- /dev/null +++ b/src/parse/utils/jpeg/jpeg_utils.h @@ -0,0 +1,503 @@ +//-*-C++-*- + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#ifndef LOGURU_WITH_STREAMS +#define LOGURU_WITH_STREAMS 1 +#endif +#include + +namespace pdflib { +namespace jpeg { + +enum class ColorSpace { Gray, RGB, CMYK, Unknown }; + +inline char const* color_space_name(ColorSpace cs) +{ + switch(cs) { + case ColorSpace::Gray: return "Gray"; + case ColorSpace::RGB: return "RGB"; + case ColorSpace::CMYK: return "CMYK"; + case ColorSpace::Unknown: return "Unknown"; + } + return "?"; +} + +inline ColorSpace to_color_space(std::string const& cs) +{ + if(cs == "/DeviceGray") return ColorSpace::Gray; + if(cs == "/DeviceRGB") return ColorSpace::RGB; + if(cs == "/DeviceCMYK") return ColorSpace::CMYK; + return ColorSpace::Unknown; +} + +class jpeg_parameters { +public: + int width = 0; + int height = 0; + int bits_per_component = 8; + ColorSpace color_space = ColorSpace::Unknown; + std::vector decode; // length 2*ncomp; empty if absent + bool has_decode = false; + bool image_mask = false; +}; + +// --------------------------------------------------------------------------- +// Custom libjpeg error handler that longjmp's instead of calling exit() +// --------------------------------------------------------------------------- +struct jpeg_error_longjmp : public jpeg_error_mgr +{ + std::jmp_buf jmp; +}; + +inline void jpeg_error_exit_longjmp(j_common_ptr cinfo) +{ + auto* myerr = reinterpret_cast(cinfo->err); + char buf[JMSG_LENGTH_MAX]; + (*cinfo->err->format_message)(cinfo, buf); + LOG_S(WARNING) << "libjpeg error: " << buf; + std::longjmp(myerr->jmp, 1); +} + +// --------------------------------------------------------------------------- +// Validate that data begins with the JPEG SOI marker (0xFF 0xD8) +// --------------------------------------------------------------------------- +inline bool is_jpeg_data(unsigned char const* data, std::size_t size) +{ + return size >= 2 && data[0] == 0xFF && data[1] == 0xD8; +} + +// --------------------------------------------------------------------------- +// apply_decode_component +// --------------------------------------------------------------------------- +// Implements the PDF /Decode linear mapping (PDF spec 8.9.5.2): +// +// output = Dmin + (Dmax - Dmin) * (sample / 255) +// +// An identity pair [0 1] is a no-op. A reversed pair [1 0] inverts the +// component. +// --------------------------------------------------------------------------- +inline unsigned char apply_decode_component(unsigned char v, double dmin, double dmax) +{ + double t = static_cast(v) / 255.0; + double u = dmin + (dmax - dmin) * t; + return static_cast(std::clamp( + static_cast(std::lround(u * 255.0)), 0, 255)); +} + +// --------------------------------------------------------------------------- +// write_corrected_jpeg_from_memory +// --------------------------------------------------------------------------- +// Decodes a JPEG from a raw memory buffer (as stored in a PDF stream), +// applies the PDF /Decode mapping, and re-encodes the result as JPEG on +// disk. The output colour space matches the input (CMYK stays CMYK). +// +// The PDF /Decode array (§8.9.5.2) linearly maps each decompressed sample +// through a [Dmin, Dmax] pair per component. An identity pair [0 1] is a +// no-op; a reversed pair [1 0] inverts the component. +// +// For CMYK images the /Decode array is the authoritative mechanism by +// which the PDF signals channel conventions. A /Decode of +// [1 0 1 0 1 0 1 0] means all four channels must be inverted (the Adobe +// inverted convention where 0 = full ink). +// +// Processing order +// ~~~~~~~~~~~~~~~~ +// a. Decompress JPEG via libjpeg (handles YCbCr / YCCK internally) +// b. Apply /Decode mapping to every component (all colour spaces) +// c. Re-encode as JPEG in the original colour space +// --------------------------------------------------------------------------- +inline bool write_corrected_jpeg_from_memory( + unsigned char const* data, std::size_t size, + jpeg_parameters const& params, + std::filesystem::path const& path) +{ + LOG_S(INFO) << __FUNCTION__ + << ": input_size=" << size + << " requested_cs=" << color_space_name(params.color_space) + << " has_decode=" << params.has_decode + << " decode_len=" << params.decode.size() + << " image_mask=" << params.image_mask + << " path=" << path.string(); + + if(params.has_decode && !params.decode.empty()) + { + std::string dec_str; + for(std::size_t i = 0; i < params.decode.size(); ++i) + { + if(i > 0) dec_str += " "; + dec_str += std::to_string(params.decode[i]); + } + LOG_S(INFO) << __FUNCTION__ << ": /Decode values = [" << dec_str << "]"; + } + + if((not data) or (size == 0)) + { + LOG_S(INFO) << __FUNCTION__ << ": data is null or size is zero"; + return false; + } + + if(!is_jpeg_data(data, size)) + { + LOG_S(WARNING) << __FUNCTION__ + << ": data does not start with JPEG SOI marker" + << " (starts with 0x" << std::hex + << static_cast(data[0]) << " 0x" + << static_cast(size > 1 ? data[1] : 0) + << std::dec << "), skipping"; + return false; + } + + // --- Decompress -------------------------------------------------------- + LOG_S(INFO) << "starting the jpeg decompression ..."; + + jpeg_decompress_struct dinfo{}; + jpeg_error_longjmp jerr{}; + + dinfo.err = jpeg_std_error(&jerr); + jerr.error_exit = jpeg_error_exit_longjmp; + jpeg_create_decompress(&dinfo); + + if(setjmp(jerr.jmp)) + { + jpeg_destroy_decompress(&dinfo); + return false; + } + + jpeg_mem_src(&dinfo, const_cast(data), + static_cast(size)); + + if(JPEG_HEADER_OK != jpeg_read_header(&dinfo, TRUE)) + { + LOG_S(WARNING) << __FUNCTION__ << ": jpeg_read_header failed"; + jpeg_destroy_decompress(&dinfo); + return false; + } + + LOG_S(INFO) << __FUNCTION__ + << ": JPEG header: jpeg_color_space=" << dinfo.jpeg_color_space + << " num_components=" << dinfo.num_components + << " image_width=" << dinfo.image_width + << " image_height=" << dinfo.image_height; + + switch(params.color_space) + { + case ColorSpace::Gray: dinfo.out_color_space = JCS_GRAYSCALE; break; + case ColorSpace::RGB: dinfo.out_color_space = JCS_RGB; break; + case ColorSpace::CMYK: dinfo.out_color_space = JCS_CMYK; break; + default: break; + } + + LOG_S(INFO) << __FUNCTION__ + << ": requesting out_color_space=" << dinfo.out_color_space; + + jpeg_start_decompress(&dinfo); + + // Save all decompressor state we need before destroying it + const int ncomp = dinfo.output_components; + const std::size_t w = dinfo.output_width; + const std::size_t h = dinfo.output_height; + const std::size_t stride = w * static_cast(ncomp); + const bool is_cmyk = (dinfo.out_color_space == JCS_CMYK); + + LOG_S(INFO) << __FUNCTION__ + << ": decompressed: w=" << w << " h=" << h + << " ncomp=" << ncomp << " stride=" << stride + << " is_cmyk=" << is_cmyk + << " out_color_space=" << dinfo.out_color_space + << " jpeg_color_space=" << dinfo.jpeg_color_space; + + std::vector image(h * stride); + + while(dinfo.output_scanline < dinfo.output_height) + { + unsigned char* row = &image[dinfo.output_scanline * stride]; + JSAMPROW rows[1] = { row }; + jpeg_read_scanlines(&dinfo, rows, 1); + } + + jpeg_finish_decompress(&dinfo); + jpeg_destroy_decompress(&dinfo); + + // Log a few sample pixels from the top-left corner (before /Decode) + if(h > 0 && w > 0) + { + std::string sample; + int npx = std::min(static_cast(w), 3); + for(int px = 0; px < npx; ++px) + { + sample += " px[" + std::to_string(px) + "]=("; + for(int c = 0; c < ncomp; ++c) + { + if(c > 0) sample += ","; + sample += std::to_string(static_cast(image[px * ncomp + c])); + } + sample += ")"; + } + LOG_S(INFO) << __FUNCTION__ << ": sample pixels BEFORE /Decode:" << sample; + } + + // --- Apply /Decode mapping (all colour spaces) ------------------------- + // The /Decode array linearly maps each decompressed sample through + // [Dmin, Dmax] per component. Identity [0 1] is a no-op; reversed + // [1 0] inverts. For CMYK, /Decode [1 0 1 0 1 0 1 0] is how PDFs + // signal the inverted Adobe channel convention. + if(params.has_decode && !params.decode.empty() && + static_cast(params.decode.size()) >= 2 * ncomp) + { + LOG_S(INFO) << __FUNCTION__ << ": applying /Decode mapping to " << ncomp << " components"; + + for(std::size_t y = 0; y < h; ++y) + { + unsigned char* row = &image[y * stride]; + for(std::size_t x = 0; x < w; ++x) + { + for(int c = 0; c < ncomp; ++c) + { + double dmin = params.decode[2 * c + 0]; + double dmax = params.decode[2 * c + 1]; + row[x * ncomp + c] = apply_decode_component( + row[x * ncomp + c], dmin, dmax); + } + } + } + + // Log a few sample pixels after /Decode + if(h > 0 && w > 0) + { + std::string sample; + int npx = std::min(static_cast(w), 3); + for(int px = 0; px < npx; ++px) + { + sample += " px[" + std::to_string(px) + "]=("; + for(int c = 0; c < ncomp; ++c) + { + if(c > 0) sample += ","; + sample += std::to_string(static_cast(image[px * ncomp + c])); + } + sample += ")"; + } + LOG_S(INFO) << __FUNCTION__ << ": sample pixels AFTER /Decode:" << sample; + } + } + else + { + LOG_S(INFO) << __FUNCTION__ << ": skipping /Decode" + << " (has_decode=" << params.has_decode + << " decode_empty=" << params.decode.empty() + << " decode_size=" << params.decode.size() + << " 2*ncomp=" << (2 * ncomp) << ")"; + } + + // --- Re-encode (preserving original colour space) ---------------------- + jpeg_compress_struct cinfo{}; + jpeg_error_longjmp cjerr{}; + cinfo.err = jpeg_std_error(&cjerr); + cjerr.error_exit = jpeg_error_exit_longjmp; + jpeg_create_compress(&cinfo); + + std::FILE* outfile = std::fopen(path.string().c_str(), "wb"); + if(!outfile) + { + LOG_S(ERROR) << __FUNCTION__ << ": failed to open output file: " << path.string(); + jpeg_destroy_compress(&cinfo); + return false; + } + + if(setjmp(cjerr.jmp)) + { + std::fclose(outfile); + jpeg_destroy_compress(&cinfo); + return false; + } + + jpeg_stdio_dest(&cinfo, outfile); + + cinfo.image_width = static_cast(w); + cinfo.image_height = static_cast(h); + cinfo.input_components = ncomp; + + if(is_cmyk) + cinfo.in_color_space = JCS_CMYK; + else + cinfo.in_color_space = (ncomp == 1) ? JCS_GRAYSCALE : JCS_RGB; + + LOG_S(INFO) << __FUNCTION__ + << ": re-encoding: w=" << w << " h=" << h + << " ncomp=" << ncomp + << " in_color_space=" << cinfo.in_color_space + << " is_cmyk=" << is_cmyk; + + jpeg_set_defaults(&cinfo); + jpeg_set_quality(&cinfo, 90, TRUE); + jpeg_start_compress(&cinfo, TRUE); + + for(std::size_t y = 0; y < h; ++y) + { + JSAMPROW row[1] = { const_cast(&image[y * stride]) }; + jpeg_write_scanlines(&cinfo, row, 1); + } + + jpeg_finish_compress(&cinfo); + std::fclose(outfile); + jpeg_destroy_compress(&cinfo); + + LOG_S(INFO) << __FUNCTION__ << ": successfully wrote corrected JPEG to " << path.string(); + + return true; +} + +// --------------------------------------------------------------------------- +// write_corrected_jpeg_to_memory +// --------------------------------------------------------------------------- +// Same as write_corrected_jpeg_from_memory but writes to a memory buffer +// instead of a file. Returns the corrected JPEG as a byte vector, or an +// empty vector on failure. +// --------------------------------------------------------------------------- +inline std::vector write_corrected_jpeg_to_memory( + unsigned char const* data, std::size_t size, + jpeg_parameters const& params) +{ + if(not data or size == 0) { return {}; } + + if(!is_jpeg_data(data, size)) + { + LOG_S(WARNING) << "write_corrected_jpeg_to_memory" + << ": data does not start with JPEG SOI marker, skipping"; + return {}; + } + + // --- Decompress -------------------------------------------------------- + jpeg_decompress_struct dinfo{}; + jpeg_error_longjmp jerr{}; + dinfo.err = jpeg_std_error(&jerr); + jerr.error_exit = jpeg_error_exit_longjmp; + jpeg_create_decompress(&dinfo); + + if(setjmp(jerr.jmp)) + { + jpeg_destroy_decompress(&dinfo); + return {}; + } + + jpeg_mem_src(&dinfo, const_cast(data), + static_cast(size)); + + if(JPEG_HEADER_OK != jpeg_read_header(&dinfo, TRUE)) + { + jpeg_destroy_decompress(&dinfo); + return {}; + } + + switch(params.color_space) + { + case ColorSpace::Gray: { dinfo.out_color_space = JCS_GRAYSCALE; break; } + case ColorSpace::RGB: { dinfo.out_color_space = JCS_RGB; break; } + case ColorSpace::CMYK: { dinfo.out_color_space = JCS_CMYK; break; } + default: { break; } + } + + jpeg_start_decompress(&dinfo); + + const int ncomp = dinfo.output_components; + const std::size_t w = dinfo.output_width; + const std::size_t h = dinfo.output_height; + const std::size_t stride = w * static_cast(ncomp); + const bool is_cmyk = (dinfo.out_color_space == JCS_CMYK); + + std::vector image(h * stride); + + while(dinfo.output_scanline < dinfo.output_height) + { + unsigned char* row = &image[dinfo.output_scanline * stride]; + JSAMPROW rows[1] = { row }; + jpeg_read_scanlines(&dinfo, rows, 1); + } + + jpeg_finish_decompress(&dinfo); + jpeg_destroy_decompress(&dinfo); + + // --- Apply /Decode mapping --------------------------------------------- + if(params.has_decode and not params.decode.empty() and + static_cast(params.decode.size()) >= 2 * ncomp) + { + for(std::size_t y = 0; y < h; ++y) + { + unsigned char* row = &image[y * stride]; + for(std::size_t x = 0; x < w; ++x) + { + for(int c = 0; c < ncomp; ++c) + { + double dmin = params.decode[2 * c + 0]; + double dmax = params.decode[2 * c + 1]; + row[x * ncomp + c] = apply_decode_component( + row[x * ncomp + c], dmin, dmax); + } + } + } + } + + // --- Re-encode to memory ----------------------------------------------- + unsigned char* outbuf = nullptr; + unsigned long outsize = 0; + + jpeg_compress_struct cinfo{}; + jpeg_error_longjmp cjerr{}; + cinfo.err = jpeg_std_error(&cjerr); + cjerr.error_exit = jpeg_error_exit_longjmp; + jpeg_create_compress(&cinfo); + + if(setjmp(cjerr.jmp)) + { + jpeg_destroy_compress(&cinfo); + if(outbuf) free(outbuf); + return {}; + } + + jpeg_mem_dest(&cinfo, &outbuf, &outsize); + + cinfo.image_width = static_cast(w); + cinfo.image_height = static_cast(h); + cinfo.input_components = ncomp; + + if(is_cmyk) + { + cinfo.in_color_space = JCS_CMYK; + } + else + { + cinfo.in_color_space = (ncomp == 1) ? JCS_GRAYSCALE : JCS_RGB; + } + + jpeg_set_defaults(&cinfo); + jpeg_set_quality(&cinfo, 90, TRUE); + jpeg_start_compress(&cinfo, TRUE); + + for(std::size_t y = 0; y < h; ++y) + { + JSAMPROW row[1] = { const_cast(&image[y * stride]) }; + jpeg_write_scanlines(&cinfo, row, 1); + } + + jpeg_finish_compress(&cinfo); + jpeg_destroy_compress(&cinfo); + + std::vector result(outbuf, outbuf + outsize); + free(outbuf); // jpeg_mem_dest allocates with malloc + + return result; +} + +} // namespace jpeg +} // namespace pdflib diff --git a/tests/test_parse.py b/tests/test_parse.py index 4169b5a..3de94f8 100644 --- a/tests/test_parse.py +++ b/tests/test_parse.py @@ -213,6 +213,7 @@ def verify_SegmentedPdfPage( def test_reference_documents_from_filenames(mode): parser = DoclingPdfParser(loglevel="fatal") + # parser = DoclingPdfParser(loglevel="info") pdf_docs = sorted(glob.glob(REGRESSION_FOLDER)) diff --git a/uv.lock b/uv.lock index 87b43ea..1fb6b09 100644 --- a/uv.lock +++ b/uv.lock @@ -1,5 +1,5 @@ version = 1 -revision = 3 +revision = 2 requires-python = ">=3.10" resolution-markers = [ "python_full_version >= '3.12'", @@ -751,7 +751,7 @@ wheels = [ [[package]] name = "docling-core" -version = "2.49.0" +version = "2.63.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "jsonref" }, @@ -765,9 +765,9 @@ dependencies = [ { name = "typer" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/ce/7f/1552500d2a197f69cb9cf69bf022e5021e8c914a00e1f5fbc87752e8e500/docling_core-2.49.0.tar.gz", hash = "sha256:7c0f39d58a06192c25aa043141cd8f87ac6a8d2c5eab5137344e1476dd13eacb", size = 161454, upload-time = "2025-10-16T14:43:03.218Z" } +sdist = { url = "https://files.pythonhosted.org/packages/d3/76/f6a1333c0ce4c20e60358185ff8b7fa92e1e1561a43a6788e7c8aaa9898e/docling_core-2.63.0.tar.gz", hash = "sha256:946cf97f27cb81a2c6507121045a356be91e40b5a06bbaf028ca7036df78b2f1", size = 251016, upload-time = "2026-02-03T14:41:07.158Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/05/cd/84034624d6c5a1484f694d16069be56c00117898ee4f43c9a3bf45061b31/docling_core-2.49.0-py3-none-any.whl", hash = "sha256:65605c0546548800dcc3cc4eb6eec24f1a4fa8c9bcd4257722894838588e41ed", size = 164457, upload-time = "2025-10-16T14:43:01.808Z" }, + { url = "https://files.pythonhosted.org/packages/8b/c4/0c825b46412f088828dd2730d231c745d1ff4b5537eed292e827103eff37/docling_core-2.63.0-py3-none-any.whl", hash = "sha256:8f39167bf17da13225c8a67d23df98c87a74e2ab39762dbf51fab93d9b90de25", size = 238637, upload-time = "2026-02-03T14:41:05.55Z" }, ] [[package]] @@ -820,7 +820,7 @@ perf-test = [ [package.metadata] requires-dist = [ - { name = "docling-core", specifier = ">=2.44.1" }, + { name = "docling-core", specifier = ">=2.63.0" }, { name = "pdfplumber", marker = "extra == 'perf-tools'", specifier = ">=0.11.7" }, { name = "pillow", specifier = ">=10.0.0,<13.0.0" }, { name = "pydantic", specifier = ">=2.0.0" }, @@ -1819,6 +1819,10 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b1/a7/8c4f86c78ec03db954d05fd9c57a114cc3a172a2d3e4a8b949cd5ff89471/patchelf-0.17.2.4-py3-none-macosx_10_9_universal2.whl", hash = "sha256:343bb1b94e959f9070ca9607453b04390e36bbaa33c88640b989cefad0aa049e", size = 184436, upload-time = "2025-07-23T21:16:20.578Z" }, { url = "https://files.pythonhosted.org/packages/7e/19/f7821ef31aab01fa7dc8ebe697ece88ec4f7a0fdd3155dab2dfee4b00e5c/patchelf-0.17.2.4-py3-none-manylinux1_x86_64.manylinux_2_5_x86_64.musllinux_1_1_x86_64.whl", hash = "sha256:d9b35ebfada70c02679ad036407d9724ffe1255122ba4ac5e4be5868618a5689", size = 482846, upload-time = "2025-07-23T21:16:23.73Z" }, { url = "https://files.pythonhosted.org/packages/d1/50/107fea848ecfd851d473b079cab79107487d72c4c3cdb25b9d2603a24ca2/patchelf-0.17.2.4-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.musllinux_1_1_aarch64.whl", hash = "sha256:2931a1b5b85f3549661898af7bf746afbda7903c7c9a967cfc998a3563f84fad", size = 477811, upload-time = "2025-07-23T21:16:25.145Z" }, + { url = "https://files.pythonhosted.org/packages/89/a9/a9a2103e159fd65bffbc21ecc5c8c36e44eb34fe53b4ef85fb6d08c2a635/patchelf-0.17.2.4-py3-none-manylinux2014_armv7l.manylinux_2_17_armv7l.musllinux_1_1_armv7l.whl", hash = "sha256:ae44cb3c857d50f54b99e5697aa978726ada33a8a6129d4b8b7ffd28b996652d", size = 431226, upload-time = "2025-07-23T21:16:26.765Z" }, + { url = "https://files.pythonhosted.org/packages/87/93/897d612f6df7cfd987bdf668425127efeff8d8e4ad8bfbab1c69d2a0d861/patchelf-0.17.2.4-py3-none-manylinux2014_ppc64le.manylinux_2_17_ppc64le.musllinux_1_1_ppc64le.whl", hash = "sha256:680a266a70f60a7a4f4c448482c5bdba80cc8e6bb155a49dcc24238ba49927b0", size = 540276, upload-time = "2025-07-23T21:16:27.983Z" }, + { url = "https://files.pythonhosted.org/packages/5d/b8/2b92d11533482bac9ee989081d6880845287751b5f528adbd6bb27667fbd/patchelf-0.17.2.4-py3-none-manylinux2014_s390x.manylinux_2_17_s390x.musllinux_1_1_s390x.whl", hash = "sha256:d842b51f0401460f3b1f3a3a67d2c266a8f515a5adfbfa6e7b656cb3ac2ed8bc", size = 596632, upload-time = "2025-07-23T21:16:29.253Z" }, + { url = "https://files.pythonhosted.org/packages/14/e2/975d4bdb418f942b53e6187b95bd9e0d5e0488b7bc214685a1e43e2c2751/patchelf-0.17.2.4-py3-none-manylinux_2_31_riscv64.musllinux_1_1_riscv64.whl", hash = "sha256:7076d9e127230982e20a81a6e2358d3343004667ba510d9f822d4fdee29b0d71", size = 508281, upload-time = "2025-07-23T21:16:30.865Z" }, ] [[package]]