feat: add jpeg2000 pixel data (#259)

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
Peter W. J. Staar
2026-04-22 08:47:15 +02:00
committed by GitHub
parent b5804c1654
commit 8546560474
18 changed files with 1465 additions and 56 deletions
+1 -1
View File
@@ -103,7 +103,7 @@ jobs:
CIBW_REPAIR_WHEEL_COMMAND_MACOS: "" # do not run delocate-wheel before the re-tag
CIBW_ENVIRONMENT: "MACOSX_DEPLOYMENT_TARGET=${{ matrix.os.min_macos_version }}.0"
ARCHFLAGS: -arch x86_64
BUILD_THREADS: "4"
BUILD_THREADS: "1"
PYTORCH_MPS_HIGH_WATERMARK_RATIO: "0.0"
run: |
PY_CACHE_TAG=$(uv run python -c 'import sys;print(sys.implementation.cache_tag)')
+3 -1
View File
@@ -108,13 +108,15 @@ include(cmake/extlib_loguru.cmake)
include(cmake/extlib_json.cmake)
include(cmake/extlib_utf8.git.cmake)
include(cmake/extlib_jpeg.cmake)
include(cmake/extlib_openjpeg.cmake)
include(cmake/extlib_lcms2.cmake)
# include(cmake/extlib_qpdf_v11.cmake)
include(cmake/extlib_qpdf_v12.cmake)
include(cmake/extlib_blend2d.cmake)
include(cmake/extlib_pdfium_jbig2.cmake)
# aggregate the targets created by the dependencies
set(DEPENDENCIES qpdf jpeg utf8 json loguru cxxopts blend2d pdfium_jbig2)
set(DEPENDENCIES qpdf jpeg openjpeg lcms2 utf8 json loguru cxxopts blend2d pdfium_jbig2)
# ************************
# *** libraries ***
+24
View File
@@ -4,6 +4,16 @@
#include "render.h"
#include "parse/utils/bitmap/bitmap_exporter.h"
namespace
{
  // Maps a rendered image path to its sibling PDF path: the directory and
  // stem are kept, only the extension is swapped for ".pdf".
  std::filesystem::path page_pdf_output_path(std::filesystem::path const& image_path)
  {
    return std::filesystem::path(image_path).replace_extension(".pdf");
  }
}
struct ImageIssue
{
std::string pdf_path;
@@ -52,6 +62,7 @@ static int analyse_pdf(const std::string& pdf_path,
int& total_pages,
const std::string& render_dir,
bool export_bitmaps,
bool export_page_pdf,
const std::string& bitmap_dir,
int target_page)
{
@@ -184,6 +195,10 @@ static int analyse_pdf(const std::string& pdf_path,
pdflib::renderer<pdflib::BLEND2D> rnd(render_cfg);
page_dec->get_instructions().iterate_over_instructions(rnd);
rnd.save(out_path);
if(export_page_pdf)
{
page_dec->save_pdf_page(page_pdf_output_path(out_path));
}
LOG_S(INFO) << "saved rendered page: " << out_path;
}
catch (std::exception const& exc)
@@ -257,6 +272,8 @@ int main(int argc, char* argv[])
cxxopts::value<int>()->default_value("-1"))
("export-bitmaps", "Export decoded bitmap payloads encountered on each page",
cxxopts::value<bool>()->implicit_value("true"))
("export-page-pdf", "Export each rendered page as a sibling PDF",
cxxopts::value<bool>()->implicit_value("true"))
("l,loglevel", "Log level [error, warning, info]", cxxopts::value<std::string>())
("h,help", "Print usage");
@@ -300,6 +317,12 @@ int main(int argc, char* argv[])
export_bitmaps = result["export-bitmaps"].as<bool>();
}
bool export_page_pdf = false;
if(result.count("export-page-pdf"))
{
export_page_pdf = result["export-page-pdf"].as<bool>();
}
std::string bitmap_dir;
if(export_bitmaps)
{
@@ -335,6 +358,7 @@ int main(int argc, char* argv[])
total_pages,
render_dir,
export_bitmaps,
export_page_pdf,
bitmap_dir,
target_page);
}
+57 -2
View File
@@ -4,6 +4,18 @@
#include "render.h"
#include "parse/utils/bitmap/bitmap_exporter.h"
namespace
{
  // Builds the output location of a one-page PDF export:
  //   <export_dir>/<stem of pdf_path>_p<page>.pdf
  std::filesystem::path page_pdf_output_path(std::filesystem::path const& export_dir,
                                             std::filesystem::path const& pdf_path,
                                             int page)
  {
    std::string file_name = pdf_path.stem().string();
    file_name += "_p";
    file_name += std::to_string(page);
    file_name += ".pdf";

    return export_dir / file_name;
  }
}
void set_loglevel(std::string level)
{
if(level=="info")
@@ -35,6 +47,8 @@ bool decode_and_render(pdflib::pdf_decoder<pdflib::DOCUMENT>& doc,
Renderer& rnd,
bool export_bitmaps,
std::filesystem::path const& bitmap_dir,
bool export_page_pdf_files,
std::filesystem::path const& page_pdf_dir,
std::string const& pdf_path)
{
if (page == -1)
@@ -52,6 +66,12 @@ bool decode_and_render(pdflib::pdf_decoder<pdflib::DOCUMENT>& doc,
bitmap_dir, pdf_path, p);
instructions.iterate_over_instructions(exporter);
}
if(export_page_pdf_files)
{
page_decoder->save_pdf_page(page_pdf_output_path(page_pdf_dir,
pdf_path,
p));
}
instructions.iterate_over_instructions(rnd);
}
}
@@ -68,6 +88,12 @@ bool decode_and_render(pdflib::pdf_decoder<pdflib::DOCUMENT>& doc,
bitmap_dir, pdf_path, page);
instructions.iterate_over_instructions(exporter);
}
if(export_page_pdf_files)
{
page_decoder->save_pdf_page(page_pdf_output_path(page_pdf_dir,
pdf_path,
page));
}
instructions.iterate_over_instructions(rnd);
}
else
@@ -89,7 +115,9 @@ int render_pdf_file(const std::string& pdf_path,
const RenderCfg& render_cfg,
bool save_output,
bool export_bitmaps,
const std::string& bitmap_dir)
const std::string& bitmap_dir,
bool export_page_pdf_files,
const std::string& page_pdf_dir)
{
pdflib::pdf_timings timings;
pdflib::pdf_decoder<pdflib::DOCUMENT> doc(timings);
@@ -129,6 +157,12 @@ int render_pdf_file(const std::string& pdf_path,
p);
instructions.iterate_over_instructions(exporter);
}
if(export_page_pdf_files)
{
page_decoder->save_pdf_page(page_pdf_output_path(std::filesystem::path(page_pdf_dir),
pdf_path,
p));
}
instructions.iterate_over_instructions(rnd);
if (save_output)
@@ -202,6 +236,8 @@ int main(int argc, char* argv[])
("keep-qpdf-warnings", "Emit QPDF warnings (default: false)", cxxopts::value<bool>()->implicit_value("true"))
("populate-json", "Populate JSON objects during decode (default: false)", cxxopts::value<bool>()->implicit_value("true"))
("export-bitmaps", "Export decoded bitmap payloads encountered on each page (default: false)",
cxxopts::value<bool>()->default_value("false"))
("export-page-pdf", "Export each selected page as a one-page PDF (default: false)",
cxxopts::value<bool>()->default_value("false"));
// Parse command line arguments
@@ -267,6 +303,7 @@ int main(int argc, char* argv[])
if (result.count("keep-qpdf-warnings")) { page_config.keep_qpdf_warnings = result["keep-qpdf-warnings"].as<bool>(); }
if (result.count("populate-json")) { page_config.populate_json_objects = result["populate-json"].as<bool>(); }
bool export_bitmaps = result["export-bitmaps"].as<bool>();
bool export_page_pdf_files = result["export-page-pdf"].as<bool>();
// --- render_config ---
pdflib::render_config cfg;
@@ -285,6 +322,7 @@ int main(int argc, char* argv[])
std::string ifile = result["input"].as<std::string>();
std::string ofile = ifile + ".rendered.json";
std::string bitmap_dir = "./bitmaps_out";
std::string page_pdf_dir = "./pages_out";
if(export_bitmaps and result.count("output"))
{
std::filesystem::path output_path = result["output"].as<std::string>();
@@ -293,6 +331,14 @@ int main(int argc, char* argv[])
bitmap_dir = (output_path / "bitmaps").string();
}
}
if(export_page_pdf_files and result.count("output"))
{
std::filesystem::path output_path = result["output"].as<std::string>();
if(output_path.extension().empty())
{
page_pdf_dir = (output_path / "pages").string();
}
}
int page = result["page"].as<int>();
LOG_F(INFO, "Page to process: %d", page);
@@ -328,6 +374,8 @@ int main(int argc, char* argv[])
if (not decode_and_render(doc, page, page_config, rnd,
export_bitmaps,
std::filesystem::path(bitmap_dir),
export_page_pdf_files,
std::filesystem::path(page_pdf_dir),
ifile)) { return 1; }
rnd.show();
}
@@ -337,6 +385,8 @@ int main(int argc, char* argv[])
if (not decode_and_render(doc, page, page_config, rnd,
export_bitmaps,
std::filesystem::path(bitmap_dir),
export_page_pdf_files,
std::filesystem::path(page_pdf_dir),
ifile)) { return 1; }
}
@@ -354,6 +404,9 @@ int main(int argc, char* argv[])
const std::string bitmap_dir = save
? (std::filesystem::path(out_dir) / "bitmaps").string()
: "./bitmaps_out";
const std::string page_pdf_dir = save
? (std::filesystem::path(out_dir) / "pages").string()
: "./pages_out";
if (not std::filesystem::is_directory(dir_path))
{
@@ -383,7 +436,9 @@ int main(int argc, char* argv[])
cfg,
save,
export_bitmaps,
bitmap_dir);
bitmap_dir,
export_page_pdf_files,
page_pdf_dir);
if (pages == 0)
{
++failed_files;
+69
View File
@@ -0,0 +1,69 @@
# Defines the `lcms2` target (Little CMS 2).
#  - USE_SYSTEM_DEPS on : wrap an installed lcms2 (pkg-config first, then a
#    plain header/library search).
#  - USE_SYSTEM_DEPS off: fetch and build a static lcms2 via ExternalProject
#    and expose it as an imported static library.
message(STATUS "entering in extlib_lcms2.cmake")
set(ext_name "lcms2")
if(USE_SYSTEM_DEPS)
# pkg-config is optional here: when it is missing or does not know lcms2 we
# fall through to the find_path/find_library branch below.
find_package(PkgConfig)
if(PkgConfig_FOUND)
pkg_check_modules(liblcms2 IMPORTED_TARGET lcms2)
endif()
if(TARGET PkgConfig::liblcms2)
add_library(${ext_name} ALIAS PkgConfig::liblcms2)
else()
# Manual fallback: locate lcms2.h and the library; abort the configure step
# when either one is missing.
find_path(LCMS2_INCLUDE_DIR lcms2.h)
find_library(LCMS2_LIBRARY NAMES lcms2 liblcms2)
if(NOT LCMS2_INCLUDE_DIR OR NOT LCMS2_LIBRARY)
message(FATAL_ERROR "lcms2 not found. Install Little CMS 2 or disable USE_SYSTEM_DEPS.")
endif()
add_library(${ext_name} UNKNOWN IMPORTED)
set_target_properties(${ext_name} PROPERTIES
IMPORTED_LOCATION "${LCMS2_LIBRARY}"
INTERFACE_INCLUDE_DIRECTORIES "${LCMS2_INCLUDE_DIR}"
)
endif()
else()
include(ExternalProject)
include(CMakeParseArguments)
# Create the install directories at configure time.
# NOTE(review): presumably needed so the imported target's include directory
# already exists before the external build has run - confirm.
file(MAKE_DIRECTORY ${EXTERNALS_PREFIX_PATH}/include)
file(MAKE_DIRECTORY ${EXTERNALS_PREFIX_PATH}/lib)
# Pinned upstream repository/tag and the static archive the build installs.
set(LCMS2_URL https://github.com/mm2/Little-CMS.git)
set(LCMS2_TAG lcms2.17)
set(LCMS2_IMPORTED_LIB ${EXTERNALS_PREFIX_PATH}/lib/liblcms2.a)
# In-source ./configure + make build: static only, position independent,
# installed into EXTERNALS_PREFIX_PATH. ENV_ARCHFLAGS is appended to CFLAGS.
ExternalProject_Add(extlib_lcms2
PREFIX extlib_lcms2
UPDATE_COMMAND ""
GIT_REPOSITORY ${LCMS2_URL}
GIT_TAG ${LCMS2_TAG}
BUILD_ALWAYS OFF
INSTALL_DIR ${EXTERNALS_PREFIX_PATH}
BUILD_IN_SOURCE ON
CONFIGURE_COMMAND ./configure
--prefix=${EXTERNALS_PREFIX_PATH}
--disable-shared
--enable-static
CFLAGS=-fPIC\ ${ENV_ARCHFLAGS}
BUILD_COMMAND make
INSTALL_COMMAND make install
LOG_DOWNLOAD ON
)
# Imported target pointing at the installed archive; the dependency on the
# external project makes consumers wait for the build.
add_library(${ext_name} STATIC IMPORTED)
add_dependencies(${ext_name} extlib_lcms2)
set_target_properties(${ext_name} PROPERTIES
IMPORTED_LOCATION ${LCMS2_IMPORTED_LIB}
INTERFACE_INCLUDE_DIRECTORIES ${EXTERNALS_PREFIX_PATH}/include
)
endif()
+61
View File
@@ -0,0 +1,61 @@
# Defines the `openjpeg` target (OpenJPEG / libopenjp2).
#  - USE_SYSTEM_DEPS on : alias the pkg-config target for libopenjp2
#    (REQUIRED, so a missing package fails the configure step).
#  - USE_SYSTEM_DEPS off: fetch and build a static libopenjp2 via
#    ExternalProject and expose it as an imported static library.
message(STATUS "entering in extlib_openjpeg.cmake")
set(ext_name "openjpeg")
if(USE_SYSTEM_DEPS)
find_package(PkgConfig)
pkg_check_modules(libopenjp2 REQUIRED IMPORTED_TARGET libopenjp2)
add_library(${ext_name} ALIAS PkgConfig::libopenjp2)
else()
include(ExternalProject)
include(CMakeParseArguments)
# Create the install directories (including the versioned header directory)
# at configure time.
# NOTE(review): presumably needed so the imported target's include
# directories already exist before the external build has run - confirm.
file(MAKE_DIRECTORY ${EXTERNALS_PREFIX_PATH}/include)
file(MAKE_DIRECTORY ${EXTERNALS_PREFIX_PATH}/include/openjpeg-2.5)
file(MAKE_DIRECTORY ${EXTERNALS_PREFIX_PATH}/lib)
# Pinned upstream repository/tag and the static archive the build installs.
set(OPENJPEG_URL https://github.com/uclouvain/openjpeg.git)
set(OPENJPEG_TAG v2.5.3)
set(OPENJPEG_IMPORTED_LIB ${EXTERNALS_PREFIX_PATH}/lib/libopenjp2.a)
# CMake-based build of the library only (codec tools, JPIP, JPWL, bundled
# third-party code, tests and shared libraries are all switched off),
# installed into EXTERNALS_PREFIX_PATH.
# NOTE(review): each trailing `\\` below is a separate unquoted argument
# rather than a line continuation - confirm this matches the other
# extlib_*.cmake modules and is intended.
ExternalProject_Add(extlib_openjpeg
PREFIX extlib_openjpeg
UPDATE_COMMAND ""
GIT_REPOSITORY ${OPENJPEG_URL}
GIT_TAG ${OPENJPEG_TAG}
BUILD_ALWAYS OFF
INSTALL_DIR ${EXTERNALS_PREFIX_PATH}
CMAKE_ARGS \\
-DCMAKE_POSITION_INDEPENDENT_CODE=ON \\
-DCMAKE_OSX_ARCHITECTURES=${CMAKE_OSX_ARCHITECTURES} \\
-DCMAKE_C_FLAGS=${ENV_ARCHFLAGS} \\
-DBUILD_CODEC=OFF \\
-DBUILD_JPIP=OFF \\
-DBUILD_JPWL=OFF \\
-DBUILD_THIRDPARTY=OFF \\
-DBUILD_TESTING=OFF \\
-DBUILD_SHARED_LIBS=OFF \\
-DCMAKE_INSTALL_LIBDIR=${EXTERNALS_PREFIX_PATH}/lib \\
-DCMAKE_INSTALL_PREFIX=${EXTERNALS_PREFIX_PATH}
BUILD_IN_SOURCE ON
LOG_DOWNLOAD ON
)
# Imported target pointing at the installed archive. Consumers get both the
# versioned and the plain include directory, plus the OPJ_STATIC define.
add_library(${ext_name} STATIC IMPORTED)
add_dependencies(${ext_name} extlib_openjpeg)
set_target_properties(${ext_name} PROPERTIES
IMPORTED_LOCATION ${OPENJPEG_IMPORTED_LIB}
INTERFACE_INCLUDE_DIRECTORIES "${EXTERNALS_PREFIX_PATH}/include/openjpeg-2.5;${EXTERNALS_PREFIX_PATH}/include"
INTERFACE_COMPILE_DEFINITIONS "OPJ_STATIC"
)
endif()
+10
View File
@@ -75,11 +75,15 @@ namespace pdflib
std::vector<double> decode_array; // 2*ncomp when present
bool image_mask = false;
int icc_components = 0; // number of color components from /ICCBased /N entry; 0 if not ICCBased
int device_n_components = 0; // number of components from /DeviceN names array; 0 if not DeviceN
std::vector<std::string> device_n_names;
// /Indexed color space support
int indexed_hival = -1;
std::string indexed_base_cs;
std::shared_ptr<std::vector<uint8_t>> indexed_palette;
std::vector<std::string> indexed_base_device_n_names;
bool indexed_base_device_n_single_black = false;
// /CCITTFaxDecode parameters (from /DecodeParms)
int ccitt_k = 0; // /K default per PDF spec: 0=Group3-1D, <0=Group4, >0=Group3-mixed
@@ -353,6 +357,12 @@ namespace pdflib
if(icc_components == 3) { return "RGB"; }
if(icc_components == 4) { return "CMYK"; }
}
if(color_space.find("/DeviceN") != std::string::npos and device_n_components > 0)
{
if(device_n_components == 1) { return "L"; }
if(device_n_components == 3) { return "RGB"; }
if(device_n_components == 4) { return "CMYK"; }
}
LOG_S(WARNING) << "unknown color_space '" << color_space
<< "' for xobject_key=" << xobject_key
@@ -20,6 +20,12 @@ namespace pdflib
PIXEL_FORMAT_CMYK, // 4 channels (/DeviceCMYK)
};
// Tag describing which convention the CMYK samples of a bitmap follow.
// NOTE(review): the exact meaning of each value is defined by the code that
// produces and consumes it - confirm there before relying on the names alone.
enum cmyk_convention {
CMYK_CONVENTION_UNKNOWN,        // no convention has been determined
CMYK_CONVENTION_ADOBE_INVERTED, // presumably Adobe-style inverted samples - TODO confirm
CMYK_CONVENTION_PROCESS,        // presumably plain process-ink values - TODO confirm
};
enum RENDER_INSTRUCTION_NAME {
SIZE_INSTRUCTION, // set the size of the canvas on which we render
TEXT_RENDER_INSTRUCTION, // render text on the canvas
@@ -188,6 +194,7 @@ namespace pdflib
bitmap_instruction(std::string xobject_key,
std::shared_ptr<std::vector<uint8_t> > data,
std::shared_ptr<std::vector<uint8_t> > alpha_data,
cmyk_convention cmyk_conv,
std::array<int, 3> shape,
pixel_format fmt,
bool image_mask,
@@ -199,6 +206,7 @@ namespace pdflib
xobject_key(xobject_key),
data(std::move(data)),
alpha_data(std::move(alpha_data)),
cmyk_conv(cmyk_conv),
shape(shape),
fmt(fmt),
image_mask(image_mask),
@@ -212,6 +220,7 @@ namespace pdflib
const std::shared_ptr<std::vector<uint8_t> >& get_data() const { return data; }
const std::shared_ptr<std::vector<uint8_t> >& get_alpha_data() const { return alpha_data; }
cmyk_convention get_cmyk_convention() const { return cmyk_conv; }
const std::array<int, 3>& get_shape() const { return shape; }
pixel_format get_pixel_format() const { return fmt; }
bool is_image_mask() const { return image_mask; }
@@ -235,6 +244,7 @@ namespace pdflib
const std::shared_ptr<std::vector<uint8_t> > data;
const std::shared_ptr<std::vector<uint8_t> > alpha_data;
const cmyk_convention cmyk_conv;
const std::array<int, 3> shape;
const pixel_format fmt;
const bool image_mask;
+20
View File
@@ -5,7 +5,9 @@
#include <optional>
#include <qpdf/QPDF.hh>
#include <qpdf/QPDFPageDocumentHelper.hh>
#include <qpdf/QPDFPageObjectHelper.hh>
#include <qpdf/QPDFWriter.hh>
#include <nlohmann/json.hpp>
@@ -63,6 +65,9 @@ namespace pdflib
// Get render instructions collected during decode
pdf_render_instructions& get_instructions() { return instructions; }
// Export this page as a standalone one-page PDF.
void save_pdf_page(std::filesystem::path const& out_path) const;
private:
void update_qpdf_logger();
@@ -241,6 +246,21 @@ namespace pdflib
return page_number;
}
// Exports this page as a standalone one-page PDF written to `out_path`.
//
// The parent directory of `out_path` is created when it does not exist yet.
// Filesystem and QPDF failures propagate to the caller as exceptions.
void pdf_decoder<PAGE>::save_pdf_page(std::filesystem::path const& out_path) const
{
  // A bare file name ("page.pdf") has an empty parent path, and
  // create_directories() may reject an empty path with an error, so only
  // create the directory when there actually is one.
  std::filesystem::path const parent_dir = out_path.parent_path();
  if(not parent_dir.empty())
    {
      std::filesystem::create_directories(parent_dir);
    }

  QPDF out_pdf;
  out_pdf.emptyPDF();

  // Add the source page to the fresh document; `false` appends it at the end.
  QPDFPageDocumentHelper out_pages(out_pdf);
  QPDFPageObjectHelper source_page(qpdf_page);
  out_pages.addPage(source_page, false);

  // Keep the file name alive until write() has finished instead of handing
  // the writer a pointer into a temporary string.
  // NOTE(review): QPDFWriter may retain the pointer for diagnostics - confirm.
  std::string const out_file = out_path.string();
  QPDFWriter writer(out_pdf, out_file.c_str());
  writer.write();
}
nlohmann::json pdf_decoder<PAGE>::get(const decode_config& config)
{
bool keep_char_cells = config.keep_char_cells;
+84 -1
View File
@@ -674,7 +674,7 @@ namespace pdflib
break;
/**************************************************
*** text-objects
*** group-objects
**************************************************/
case pdf_operator::BT:
@@ -696,6 +696,54 @@ namespace pdflib
}
break;
case pdf_operator::BX:
{
LOG_S(INFO) << "executing " << to_string(name);
}
break;
case pdf_operator::EX:
{
LOG_S(INFO) << "executing " << to_string(name);
}
break;
case pdf_operator::BMC:
{
LOG_S(INFO) << "executing " << to_string(name);
}
break;
case pdf_operator::BDC:
{
LOG_S(INFO) << "executing " << to_string(name);
}
break;
case pdf_operator::EMC:
{
LOG_S(INFO) << "executing " << to_string(name);
}
break;
case pdf_operator::BI:
{
LOG_S(INFO) << "executing " << to_string(name);
}
break;
case pdf_operator::ID:
{
LOG_S(INFO) << "executing " << to_string(name);
}
break;
case pdf_operator::EI:
{
LOG_S(INFO) << "executing " << to_string(name);
}
break;
/**************************************************
*** text-state
**************************************************/
@@ -976,6 +1024,41 @@ namespace pdflib
}
break;
case pdf_operator::MP:
{
LOG_S(INFO) << "executing " << to_string(name);
}
break;
case pdf_operator::DP:
{
LOG_S(INFO) << "executing " << to_string(name);
}
break;
case pdf_operator::sh:
{
LOG_S(INFO) << "executing " << to_string(name);
//current_graphic_state().sh(parameters);
}
break;
/**************************************************
*** Type 3 font metrics
**************************************************/
case pdf_operator::d0:
{
LOG_S(INFO) << "executing " << to_string(name);
}
break;
case pdf_operator::d1:
{
LOG_S(INFO) << "executing " << to_string(name);
}
break;
/**************************************************
*** other
**************************************************/
+87 -2
View File
@@ -19,6 +19,22 @@ namespace pdflib
PATH_PAINTING
};
/*
Table: PDF content stream operators
| Category | Operators |
| -------------------- | ------------------------------------------ |
| Graphics state | w J j M d ri i gs q Q cm |
| Path | m l c v y h re S s f F f* B B* b b* n W W* |
| Text | BT ET Tc Tw Tz TL Tf Tr Ts Td TD Tm T* Tj TJ ' " |
| Color | CS cs SC SCN sc scn G g RG rg K k |
| XObject / shading | Do sh |
| Marked content | MP DP BMC BDC EMC |
| Compatibility | BX EX |
| Inline image markers | BI ID EI |
| Type 3 font metrics | d0 d1 |
*/
// Table 51 Operator Categories [ p 119 ]
enum operator_name {
@@ -34,9 +50,11 @@ namespace pdflib
// color-scheme
CS, cs, SC, SCN, sc, scn, G, g, RG, rg, K, k,
// text objects
// group objects
BT, ET,
BX, EX,
BI, ID, EI,
// Text state
Tc, Tw, Tz, TL, Tf, Tr, Ts,
@@ -58,6 +76,12 @@ namespace pdflib
// Marked content
MP, DP, BMC, BDC, EMC,
// shading
sh,
// Type 3 font operators
d0, d1,
// dummy
null
};
@@ -96,6 +120,46 @@ namespace pdflib
return COLOR_SCHEME;
}
case q:
case Q:
case cm:
case Do:
case BT:
case ET:
case BX:
case EX:
case BMC:
case BDC:
case EMC:
case BI:
case ID:
case EI:
case Tc:
case Tw:
case Tz:
case TL:
case Tf:
case Tr:
case Ts:
case Td:
case TD:
case Tm:
case TStar:
case Tj:
case TJ:
case accent:
case double_accent:
case W:
case WStar:
case MP:
case DP:
case sh:
case d0:
case d1:
{
return rest;
}
// lines
case m:
case l:
@@ -169,6 +233,11 @@ namespace pdflib
// text objects
else if(name=="BT") { return BT; }
else if(name=="ET") { return ET; }
else if(name=="BX") { return BX; }
else if(name=="EX") { return EX; }
else if(name=="BI") { return BI; }
else if(name=="ID") { return ID; }
else if(name=="EI") { return EI; }
// Text state
else if(name=="Tc") { return Tc; }
@@ -222,6 +291,9 @@ namespace pdflib
else if(name=="BMC") { return BMC; }
else if(name=="BDC") { return BDC; }
else if(name=="EMC") { return EMC; }
else if(name=="sh") { return sh; }
else if(name=="d0") { return d0; }
else if(name=="d1") { return d1; }
else
{
@@ -273,6 +345,11 @@ namespace pdflib
// text
case BT: return "BT";
case ET: return "ET";
case BX: return "BX";
case EX: return "EX";
case BI: return "BI";
case ID: return "ID";
case EI: return "EI";
case Tc: return "Tc";
@@ -320,6 +397,14 @@ namespace pdflib
case W: return "W";
case WStar: return "W*";
case MP: return "MP";
case DP: return "DP";
case BMC: return "BMC";
case BDC: return "BDC";
case EMC: return "EMC";
case sh: return "sh";
case d0: return "d0";
case d1: return "d1";
default:
{
@@ -3,12 +3,137 @@
#ifndef PDF_PAGE_XOBJECT_IMAGE_RESOURCE_H
#define PDF_PAGE_XOBJECT_IMAGE_RESOURCE_H
#include <cstdint>
#include <cstring>
#include <parse/utils/color/icc_utils.h>
#include <parse/utils/jpeg/jpeg_utils.h>
#include <parse/qpdf/qpdf_compat.h>
namespace pdflib
{
namespace detail
{
// Translates a 4-byte ICC color space signature into a component count.
//
// `sig` must point at (at least) four readable bytes. Recognized values are
// "GRAY" (1), "RGB " (3), "CMYK" (4) and the generic "nCLR" family, where the
// leading character is '2'..'9' (2..9 components) or 'A'..'F' (10..15).
// Every other signature yields 0.
inline int icc_signature_to_components(char const* sig)
{
  struct named_space
  {
    char const* tag;
    int components;
  };
  static constexpr named_space named_spaces[] = {
    {"GRAY", 1},
    {"RGB ", 3},
    {"CMYK", 4},
  };

  for(auto const& space : named_spaces)
    {
      if(std::memcmp(sig, space.tag, 4) == 0)
        {
          return space.components;
        }
    }

  bool const is_generic_nclr = (sig[1] == 'C' and sig[2] == 'L' and sig[3] == 'R');
  if(not is_generic_nclr)
    {
      return 0;
    }

  char const lead = sig[0];
  if(lead >= '2' and lead <= '9')
    {
      return lead - '0';
    }
  if(lead >= 'A' and lead <= 'F')
    {
      return 10 + (lead - 'A');
    }
  return 0;
}
// Derives the number of color components of an ICC profile by inspecting the
// data color space signature stored in the profile bytes.
//
// `icc_stream` is the stream object holding the profile; `context` prefixes
// every log message. Returns the component count, or 0 when the object is
// not a stream, the profile is too short, the signature is not recognized,
// or reading the stream throws.
inline int infer_icc_components_from_profile(QPDFObjectHandle icc_stream,
                                             std::string const& context)
{
  // The 4-byte signature occupies bytes [16, 20) of the profile.
  constexpr std::size_t signature_offset = 16;
  constexpr std::size_t signature_length = 4;

  if(not icc_stream.isStream())
    {
      LOG_S(WARNING) << context << ": ICC object is not a stream";
      return 0;
    }

  int components = 0;
  try
    {
      auto buffer = to_shared_ptr(icc_stream.getStreamData());

      bool const inspectable =
        buffer and buffer->getSize() >= signature_offset + signature_length;
      if(not inspectable)
        {
          LOG_S(WARNING) << context << ": ICC profile too small to inspect";
          return 0;
        }

      auto const* raw = reinterpret_cast<char const*>(buffer->getBuffer());
      components = icc_signature_to_components(raw + signature_offset);

      if(components > 0)
        {
          LOG_S(INFO) << context << ": inferred ICC components from profile header: N=" << components;
        }
      else
        {
          LOG_S(WARNING) << context << ": unsupported ICC data color space signature";
        }
    }
  catch(std::exception const& e)
    {
      LOG_S(WARNING) << context << ": failed to inspect ICC profile stream: " << e.what();
      return 0;
    }

  return components;
}
// Returns the CMYK channel index of a process colorant name
// ("/Cyan" -> 0, "/Magenta" -> 1, "/Yellow" -> 2, "/Black" -> 3),
// or -1 when `name` is not one of those four.
inline int cmyk_process_component_index(std::string const& name)
{
  static constexpr char const* process_colorants[] = {
    "/Cyan", "/Magenta", "/Yellow", "/Black"
  };

  int channel = 0;
  for(char const* colorant : process_colorants)
    {
      if(name == colorant)
        {
          return channel;
        }
      ++channel;
    }
  return -1;
}
// True when `names` is non-empty and every entry is one of the process
// colorant names known to cmyk_process_component_index(); false otherwise
// (including for an empty list).
inline bool device_n_names_are_process_cmyk_subset(
    std::vector<std::string> const& names)
{
  bool only_process_colorants = not names.empty();

  for(std::size_t pos = 0; only_process_colorants and pos < names.size(); ++pos)
    {
      only_process_colorants = (cmyk_process_component_index(names[pos]) >= 0);
    }

  return only_process_colorants;
}
// Re-packs a palette whose entries hold one byte per name in `names` into a
// palette with four bytes per entry laid out as C, M, Y, K.
//
// Each source byte is copied to the channel reported by
// cmyk_process_component_index() for its name; channels without a matching
// name stay 0, and names that are not process colorants are ignored.
// Returns nullptr when `palette` is null, `names` is empty, or the palette
// size is not a multiple of the number of names.
inline std::shared_ptr<std::vector<uint8_t>> expand_device_n_palette_to_cmyk(
    std::shared_ptr<std::vector<uint8_t>> const& palette,
    std::vector<std::string> const& names)
{
  constexpr std::size_t cmyk_channels = 4u;

  if(not palette)
    {
      return nullptr;
    }

  std::size_t const names_per_entry = names.size();
  if(names_per_entry == 0 or palette->size() % names_per_entry != 0)
    {
      return nullptr;
    }

  // Destination channel of every source component (-1: not a process colorant).
  std::vector<int> target_channel;
  target_channel.reserve(names_per_entry);
  for(auto const& colorant : names)
    {
      target_channel.push_back(cmyk_process_component_index(colorant));
    }

  std::size_t const n_entries = palette->size() / names_per_entry;
  auto cmyk_palette =
    std::make_shared<std::vector<uint8_t>>(n_entries * cmyk_channels, uint8_t{0});

  std::size_t src_pos = 0;
  for(std::size_t dst_base = 0; dst_base < cmyk_palette->size(); dst_base += cmyk_channels)
    {
      for(int const channel : target_channel)
        {
          if(channel >= 0)
            {
              (*cmyk_palette)[dst_base + static_cast<std::size_t>(channel)] =
                (*palette)[src_pos];
            }
          ++src_pos;
        }
    }

  return cmyk_palette;
}
}
template<>
class pdf_resource<PAGE_XOBJECT_IMAGE>
{
@@ -31,9 +156,13 @@ namespace pdflib
int get_bits_per_component() const;
std::string get_color_space() const;
int get_icc_components() const;
int get_device_n_components() const;
std::vector<std::string> get_device_n_names() const;
int get_indexed_hival() const;
std::string get_indexed_base_cs() const;
std::shared_ptr<std::vector<uint8_t>> get_indexed_palette() const;
std::vector<std::string> get_indexed_base_device_n_names() const;
bool get_indexed_base_device_n_single_black() const;
std::string get_intent() const;
std::vector<std::string> get_filters() const;
@@ -91,9 +220,15 @@ namespace pdflib
int bits_per_component;
std::string color_space;
int icc_components = 0; // number of color components from /ICCBased /N entry; 0 if not ICCBased
int device_n_components = 0; // number of components from /DeviceN names array; 0 if not DeviceN
std::vector<std::string> device_n_names; // names from /DeviceN colorant array
int indexed_hival = -1; // hival from /Indexed color space; -1 if not Indexed
std::string indexed_base_cs; // base color space name for /Indexed (e.g. "/DeviceRGB")
std::shared_ptr<std::vector<uint8_t>> indexed_palette; // raw palette bytes: (hival+1)*ncomps bytes
std::shared_ptr<std::vector<uint8_t>> indexed_base_icc_profile;
int indexed_base_icc_components = 0;
std::vector<std::string> indexed_base_device_n_names;
bool indexed_base_device_n_single_black = false;
std::string intent;
std::vector<std::string> image_filters;
@@ -246,6 +381,8 @@ namespace pdflib
else
{
LOG_S(WARNING) << "ICCBased stream missing /N entry";
icc_components = detail::infer_icc_components_from_profile(
icc_stream, "ICCBased");
}
}
else
@@ -253,6 +390,28 @@ namespace pdflib
LOG_S(WARNING) << "ICCBased: second array element is not a stream";
}
}
else if(name_obj.isName() and name_obj.getName() == "/DeviceN")
{
device_n_names.clear();
auto names_obj = qpdf_cs.getArrayItem(1);
if(names_obj.isArray())
{
device_n_components = names_obj.getArrayNItems();
for(int i = 0; i < names_obj.getArrayNItems(); ++i)
{
auto name = names_obj.getArrayItem(i);
if(name.isName())
{
device_n_names.push_back(name.getName());
}
}
LOG_S(INFO) << "DeviceN color space: N=" << device_n_components;
}
else
{
LOG_S(WARNING) << "DeviceN color space: names array missing";
}
}
else if(name_obj.isName() and name_obj.getName() == "/Indexed"
and qpdf_cs.getArrayNItems() >= 3)
{
@@ -260,6 +419,10 @@ namespace pdflib
// base color space
auto base_obj = qpdf_cs.getArrayItem(1);
indexed_base_device_n_single_black = false;
indexed_base_icc_profile.reset();
indexed_base_icc_components = 0;
indexed_base_device_n_names.clear();
if(base_obj.isName())
{
indexed_base_cs = base_obj.getName();
@@ -272,10 +435,21 @@ namespace pdflib
auto icc_stream = base_obj.getArrayItem(1);
if(icc_stream.isStream())
{
auto profile_buf = to_shared_ptr(icc_stream.getStreamData());
if(profile_buf and profile_buf->getSize() > 0)
{
auto const* ptr = reinterpret_cast<const uint8_t*>(
profile_buf->getBuffer());
indexed_base_icc_profile =
std::make_shared<std::vector<uint8_t>>(
ptr, ptr + profile_buf->getSize());
}
auto icc_dict = icc_stream.getDict();
if(icc_dict.hasKey("/N") and icc_dict.getKey("/N").isInteger())
{
const int n = icc_dict.getKey("/N").getIntValue();
indexed_base_icc_components = n;
if(n == 1) { indexed_base_cs = "/DeviceGray"; }
else if(n == 3) { indexed_base_cs = "/DeviceRGB"; }
else if(n == 4) { indexed_base_cs = "/DeviceCMYK"; }
@@ -290,6 +464,12 @@ namespace pdflib
else
{
LOG_S(WARNING) << "Indexed ICCBased base missing /N entry";
const int n = detail::infer_icc_components_from_profile(
icc_stream, "Indexed ICCBased base");
indexed_base_icc_components = n;
if(n == 1) { indexed_base_cs = "/DeviceGray"; }
else if(n == 3) { indexed_base_cs = "/DeviceRGB"; }
else if(n == 4) { indexed_base_cs = "/DeviceCMYK"; }
}
}
else
@@ -297,6 +477,55 @@ namespace pdflib
LOG_S(WARNING) << "Indexed ICCBased base: second array element is not a stream";
}
}
else if(base_name.isName() and base_name.getName() == "/DeviceN")
{
auto names_obj = base_obj.getArrayItem(1);
if(names_obj.isArray())
{
std::vector<std::string> nested_names;
for(int i = 0; i < names_obj.getArrayNItems(); ++i)
{
auto name = names_obj.getArrayItem(i);
if(name.isName())
{
nested_names.push_back(name.getName());
}
}
indexed_base_device_n_names = nested_names;
const int nested_n = static_cast<int>(nested_names.size());
const bool single_black =
nested_n == 1
and nested_names[0] == "/Black";
const bool process_cmyk_subset =
detail::device_n_names_are_process_cmyk_subset(nested_names);
indexed_base_device_n_single_black = single_black;
if(single_black) { indexed_base_cs = "/DeviceGray"; }
else if(process_cmyk_subset)
{
indexed_base_cs = "/DeviceCMYK";
LOG_S(INFO) << "Indexed DeviceN base uses process CMYK subset; "
<< "will expand palette to CMYK";
}
else if(nested_n == 3)
{
indexed_base_cs = "/DeviceRGB";
}
else if(nested_n == 4)
{
indexed_base_cs = "/DeviceCMYK";
}
else
{
indexed_base_cs = "/DeviceN";
LOG_S(WARNING) << "Indexed DeviceN base has unsupported component layout N="
<< nested_n;
}
LOG_S(INFO) << "Indexed DeviceN base: N=" << nested_n
<< " -> " << indexed_base_cs;
}
}
else if(base_name.isName())
{
indexed_base_cs = base_name.getName();
@@ -350,6 +579,52 @@ namespace pdflib
{
LOG_S(WARNING) << "Indexed color space: unrecognized lookup table type";
}
if(indexed_base_cs == "/DeviceCMYK"
and not indexed_base_device_n_names.empty()
and detail::device_n_names_are_process_cmyk_subset(
indexed_base_device_n_names))
{
auto expanded =
detail::expand_device_n_palette_to_cmyk(indexed_palette,
indexed_base_device_n_names);
if(expanded)
{
indexed_palette = std::move(expanded);
LOG_S(INFO) << "Indexed DeviceN palette expanded to CMYK: "
<< indexed_palette->size() << " bytes";
}
else
{
LOG_S(WARNING) << "Indexed DeviceN palette expansion to CMYK failed";
}
}
if(indexed_base_icc_profile
and not indexed_base_icc_profile->empty()
and indexed_base_icc_components > 0
and indexed_palette
and not indexed_palette->empty())
{
auto rgb_palette = icc::transform_palette_to_rgb(
*indexed_palette,
indexed_base_icc_components,
*indexed_base_icc_profile);
if(not rgb_palette.empty())
{
indexed_palette = std::make_shared<std::vector<uint8_t>>(
std::move(rgb_palette));
indexed_base_cs = "/DeviceRGB";
indexed_base_device_n_names.clear();
indexed_base_device_n_single_black = false;
LOG_S(INFO) << "Indexed ICCBased palette converted to RGB: "
<< indexed_palette->size() << " bytes";
}
else
{
LOG_S(WARNING) << "Indexed ICCBased palette RGB conversion failed";
}
}
}
}
}
@@ -443,6 +718,22 @@ namespace pdflib
}
decode_present = not decode_array.empty();
}
else if(device_n_components > 0)
{
const bool single_black =
device_n_components == 1
and device_n_names.size() == 1
and device_n_names[0] == "/Black";
LOG_S(INFO) << "no `/Decode` found: using default for DeviceN N="
<< device_n_components
<< " single_black=" << (single_black ? "true" : "false");
for(int i = 0; i < device_n_components; ++i)
{
decode_array.push_back(single_black ? 1.0 : 0.0);
decode_array.push_back(single_black ? 0.0 : 1.0);
}
decode_present = not decode_array.empty();
}
else if(indexed_hival >= 0)
{
// Indexed: default decode is [0, hival] (one component — the palette index)
@@ -754,6 +1045,16 @@ namespace pdflib
return icc_components;
}
// Number of components taken from the /DeviceN names array; 0 when the
// color space is not DeviceN.
int pdf_resource<PAGE_XOBJECT_IMAGE>::get_device_n_components() const
{
return device_n_components;
}
// Returns a copy of the colorant names collected from the /DeviceN names array.
std::vector<std::string> pdf_resource<PAGE_XOBJECT_IMAGE>::get_device_n_names() const
{
return device_n_names;
}
int pdf_resource<PAGE_XOBJECT_IMAGE>::get_indexed_hival() const
{
return indexed_hival;
@@ -769,6 +1070,16 @@ namespace pdflib
return indexed_palette;
}
// Returns a copy of the colorant names recorded for the base color space of
// an /Indexed color space; holds whatever the color-space parsing last stored.
std::vector<std::string> pdf_resource<PAGE_XOBJECT_IMAGE>::get_indexed_base_device_n_names() const
{
return indexed_base_device_n_names;
}
// Returns the flag recorded while parsing an /Indexed color space that marks
// a DeviceN base consisting of the single colorant "/Black".
bool pdf_resource<PAGE_XOBJECT_IMAGE>::get_indexed_base_device_n_single_black() const
{
return indexed_base_device_n_single_black;
}
std::string pdf_resource<PAGE_XOBJECT_IMAGE>::get_intent() const
{
return intent;
+385 -44
View File
@@ -3,7 +3,11 @@
#ifndef PDF_BITMAP_STATE_H
#define PDF_BITMAP_STATE_H
#include <algorithm>
#include <cmath>
#include <parse/utils/ccitt/ccitt_utils.h>
#include <parse/utils/jpx/jpx_utils.h>
#include <third_party/pdfium_jbig2.h>
namespace pdflib
@@ -148,12 +152,17 @@ namespace pdflib
image.ccitt_k = xobj.get_ccitt_k();
image.ccitt_black_is_1 = xobj.get_ccitt_black_is_1();
image.icc_components = xobj.get_icc_components();
image.device_n_components = xobj.get_device_n_components();
image.device_n_names = xobj.get_device_n_names();
image.jbig2_globals_data = xobj.get_jbig2_globals_data();
// propagate /Indexed color space data
image.indexed_hival = xobj.get_indexed_hival();
image.indexed_base_cs = xobj.get_indexed_base_cs();
image.indexed_palette = xobj.get_indexed_palette();
image.indexed_base_device_n_names = xobj.get_indexed_base_device_n_names();
image.indexed_base_device_n_single_black =
xobj.get_indexed_base_device_n_single_black();
// propagate graphics state
image.has_graphics_state = true;
@@ -171,8 +180,229 @@ namespace pdflib
std::shared_ptr<std::vector<uint8_t>> pixel_data;
std::array<int, 3> pixel_shape = {0, 0, 0};
pixel_format fmt = PIXEL_FORMAT_UNKNOWN;
cmyk_convention cmyk_conv = CMYK_CONVENTION_UNKNOWN;
int channels = 0;
// True when the first eight entries of `decode_array` are the inverted
// four-component pattern [1 0 1 0 1 0 1 0] (within 1e-12 per entry).
// Arrays with fewer than eight entries never match; extra entries are ignored.
auto has_default_adobe_cmyk_decode = [&](std::vector<double> const& decode_array) -> bool
{
  constexpr std::size_t checked_entries = 8;
  constexpr double tolerance = 1e-12;

  if(decode_array.size() < checked_entries)
    {
      return false;
    }

  for(std::size_t pos = 0; pos < checked_entries; ++pos)
    {
      // even positions hold the pair minimum (1.0), odd ones the maximum (0.0)
      double const expected = (pos % 2 == 0) ? 1.0 : 0.0;
      if(std::abs(decode_array[pos] - expected) > tolerance)
        {
          return false;
        }
    }
  return true;
};
// Applies the image's /Decode pairs in place to the 8-bit samples in `dst`.
//
// The component of sample i is taken as i % ncomps; component c uses
// decode_array[2*c] and decode_array[2*c + 1]. Samples whose component has
// no pair in the decode array are left untouched. Does nothing when `dst` is
// null, `ncomps` is not positive, or the image carries no usable decode array.
// NOTE(review): the per-sample mapping is done by jpeg::apply_decode_component,
// which is not visible here - confirm its exact semantics there.
auto apply_decode_to_u8_samples = [&](std::shared_ptr<std::vector<uint8_t>>& dst,
int ncomps) -> void
{
if(not dst or ncomps <= 0 or not image.decode_present or image.decode_array.size() < 2)
{
return;
}
// number of (min, max) pairs available in the decode array
const int pair_count = static_cast<int>(image.decode_array.size() / 2);
for(size_t i = 0; i < dst->size(); ++i)
{
// component index of this sample
const int comp = static_cast<int>(i % static_cast<size_t>(ncomps));
if(comp < pair_count)
{
(*dst)[i] = jpeg::apply_decode_component(
(*dst)[i],
image.decode_array[2 * comp + 0],
image.decode_array[2 * comp + 1]);
}
}
};
auto expand_indexed_samples = [&](int ncomps,
const uint8_t* indices,
size_t n_indices,
int w,
int h) -> bool
{
if(ncomps <= 0 or not image.indexed_palette or image.indexed_palette->empty() or not indices)
{
return false;
}
const auto& palette = *image.indexed_palette;
auto expanded = std::make_shared<std::vector<uint8_t>>();
expanded->reserve(static_cast<size_t>(w) * h * ncomps);
for(size_t i = 0; i < n_indices; ++i)
{
int idx = static_cast<int>(indices[i]);
if(image.indexed_hival >= 0 and idx > image.indexed_hival)
{
idx = image.indexed_hival;
}
const size_t palette_offset = static_cast<size_t>(idx) * ncomps;
if(palette_offset + ncomps <= palette.size())
{
for(int c = 0; c < ncomps; ++c)
{
expanded->push_back(palette[palette_offset + c]);
}
}
else
{
for(int c = 0; c < ncomps; ++c)
{
expanded->push_back(0);
}
}
}
pixel_data = std::move(expanded);
pixel_shape = {h, w, ncomps};
channels = ncomps;
if(fmt == PIXEL_FORMAT_CMYK
and detail::device_n_names_are_process_cmyk_subset(image.indexed_base_device_n_names))
{
cmyk_conv = CMYK_CONVENTION_PROCESS;
}
if(image.indexed_base_device_n_single_black and ncomps == 1)
{
for(auto& sample : *pixel_data)
{
sample = static_cast<uint8_t>(255 - sample);
}
LOG_S(INFO) << "bitmap: inverted Indexed single-Black DeviceN palette "
<< "for xobject_key=" << image.xobject_key;
}
return true;
};
// Expands packed sub-byte image samples (1..7 bits per component) from `src`
// into one byte per component and stores the result in the captured
// `pixel_data`, `pixel_shape` ({h, w, ncomps}) and `channels` variables.
//
// Parameters:
//   src                - decoded stream buffer holding the packed samples
//   w, h               - image width and height in pixels (must be > 0)
//   ncomps             - components per pixel (must be > 0)
//   bits_per_component - bits per packed sample; only 1..7 is accepted
//   decode_array       - (min, max) pairs per component; components without
//                        a pair use the identity mapping
//
// Returns true on success. Returns false, without touching the captured
// state, when an argument is invalid or `src` holds fewer bytes than
// h * ceil(w * ncomps * bits_per_component / 8).
auto unpack_subbyte_samples_to_u8 =
[&](std::shared_ptr<Buffer> const& src,
int w,
int h,
int ncomps,
int bits_per_component,
std::vector<double> const& decode_array) -> bool
{
// QPDF's getStreamData() decodes the filter chain, but it does not
// expand sub-8-bit image samples into one byte per component. For a
// `/FlateDecode` image with `/BitsPerComponent 1`, the decoded stream
// therefore still contains packed bits (with producer-dependent row
// padding), while the renderer expects a dense 8-bit-per-component
// buffer. This helper performs that expansion and applies the image's
// `/Decode` mapping while unpacking.
if(not src or src->getSize() == 0 or w <= 0 or h <= 0 or ncomps <= 0)
{
return false;
}
// Samples of 8 bits or more are not handled by this helper.
if(bits_per_component <= 0 or bits_per_component >= 8)
{
return false;
}
// Number of bytes needed to hold one row of packed samples, rounded up
// to a whole byte.
const std::size_t row_bits =
static_cast<std::size_t>(w) * static_cast<std::size_t>(ncomps)
* static_cast<std::size_t>(bits_per_component);
const std::size_t min_row_bytes = (row_bits + 7u) / 8u;
if(min_row_bytes == 0)
{
return false;
}
const std::size_t src_size = src->getSize();
const std::size_t min_total = min_row_bytes * static_cast<std::size_t>(h);
if(src_size < min_total)
{
LOG_S(WARNING) << "bitmap: packed decoded_stream_data too small ("
<< src_size << " < " << min_total
<< ") for sub-byte image xobject_key=" << image.xobject_key
<< " width=" << w
<< " height=" << h
<< " channels=" << ncomps
<< " bpc=" << bits_per_component;
return false;
}
// Per the PDF spec, rows are exactly min_row_bytes wide. QPDF's
// getStreamData() may return more bytes than width*height*bpc/8 (e.g.
// trailing data), but those extra bytes are not per-row padding and must
// not be used to inflate the row stride.
const std::size_t row_stride = min_row_bytes;
// Largest raw value a packed sample can take, used to normalise to [0, 1].
const std::uint32_t sample_max = (1u << bits_per_component) - 1u;
// Maps one raw sample to an 8-bit value: normalise to [0, 1], apply the
// component's (dmin, dmax) pair when present, clamp to [0, 1] and scale
// to 0..255 with rounding.
auto decode_sample =
[&](int component_index, std::uint32_t raw_sample) -> std::uint8_t
{
const int pair_count = static_cast<int>(decode_array.size() / 2);
if(component_index < pair_count)
{
const double dmin = decode_array[2 * component_index + 0];
const double dmax = decode_array[2 * component_index + 1];
const double norm =
static_cast<double>(raw_sample) / static_cast<double>(sample_max);
const double decoded = dmin + norm * (dmax - dmin);
const double clamped = std::clamp(decoded, 0.0, 1.0);
return static_cast<std::uint8_t>(std::lround(clamped * 255.0));
}
// Absent /Decode entry: fall back to PDF's identity mapping.
const double norm =
static_cast<double>(raw_sample) / static_cast<double>(sample_max);
return static_cast<std::uint8_t>(std::lround(norm * 255.0));
};
const auto* bytes = reinterpret_cast<const std::uint8_t*>(src->getBuffer());
auto expanded = std::make_shared<std::vector<uint8_t>>();
expanded->reserve(static_cast<std::size_t>(w) * h * ncomps);
for(int row = 0; row < h; ++row)
{
const auto* row_ptr = bytes + static_cast<std::size_t>(row) * row_stride;
// The bit cursor restarts at zero for every row, so each row begins on
// a byte boundary and any padding bits at the end of a row are skipped.
std::size_t bit_offset = 0;
for(int col = 0; col < w; ++col)
{
for(int comp = 0; comp < ncomps; ++comp)
{
std::uint32_t raw_sample = 0u;
// Assemble the sample one bit at a time, most significant bit first:
// bit 7 of each byte is consumed before bit 0.
for(int bit = 0; bit < bits_per_component; ++bit)
{
const std::size_t absolute_bit = bit_offset + static_cast<std::size_t>(bit);
const std::size_t byte_index = absolute_bit / 8u;
const int bit_in_byte = 7 - static_cast<int>(absolute_bit % 8u);
const std::uint8_t byte = row_ptr[byte_index];
raw_sample = (raw_sample << 1u)
| static_cast<std::uint32_t>((byte >> bit_in_byte) & 1u);
}
expanded->push_back(decode_sample(comp, raw_sample));
bit_offset += static_cast<std::size_t>(bits_per_component);
}
}
}
// Publish the expanded buffer through the captured output variables.
pixel_data = std::move(expanded);
pixel_shape = {h, w, ncomps};
channels = ncomps;
LOG_S(INFO) << "bitmap: unpacked sub-byte decoded_stream_data"
<< " for xobject_key=" << image.xobject_key
<< " width=" << w
<< " height=" << h
<< " channels=" << ncomps
<< " bpc=" << bits_per_component
<< " row_stride=" << row_stride
<< " output_size=" << pixel_data->size();
return true;
};
if(image.image_mask)
{
fmt = PIXEL_FORMAT_GRAY; channels = 1;
@@ -215,6 +445,30 @@ namespace pdflib
<< " for xobject_key=" << image.xobject_key;
}
}
else if(image.color_space.find("/DeviceN") != std::string::npos
and image.device_n_components > 0)
{
LOG_S(INFO) << "bitmap: DeviceN color space with N=" << image.device_n_components
<< " for xobject_key=" << image.xobject_key;
if(image.device_n_components == 1)
{
fmt = PIXEL_FORMAT_GRAY; channels = 1;
}
else if(image.device_n_components == 3)
{
fmt = PIXEL_FORMAT_RGB; channels = 3;
}
else if(image.device_n_components == 4)
{
fmt = PIXEL_FORMAT_CMYK; channels = 4;
}
else
{
LOG_S(WARNING) << "bitmap: DeviceN with unsupported N="
<< image.device_n_components
<< " for xobject_key=" << image.xobject_key;
}
}
else if(image.indexed_palette and not image.indexed_palette->empty())
{
// /Indexed: expand palette indices into base color space pixels.
@@ -239,51 +493,22 @@ namespace pdflib
<< "' for xobject_key=" << image.xobject_key;
}
channels = ncomps;
if(ncomps > 0 and image.decoded_stream_data and image.decoded_stream_data->getSize() > 0)
{
const int w = image.image_width;
const int h = image.image_height;
const auto& palette = *image.indexed_palette;
const auto* indices = reinterpret_cast<const uint8_t*>(
image.decoded_stream_data->getBuffer());
const size_t n_indices = image.decoded_stream_data->getSize();
auto expanded = std::make_shared<std::vector<uint8_t>>();
expanded->reserve(static_cast<size_t>(w) * h * ncomps);
for(size_t i = 0; i < n_indices; ++i)
if(expand_indexed_samples(ncomps, indices, n_indices, w, h))
{
int idx = static_cast<int>(indices[i]);
// clamp to hival
if(image.indexed_hival >= 0 and idx > image.indexed_hival)
{
idx = image.indexed_hival;
}
const size_t palette_offset = static_cast<size_t>(idx) * ncomps;
if(palette_offset + ncomps <= palette.size())
{
for(int c = 0; c < ncomps; ++c)
{
expanded->push_back(palette[palette_offset + c]);
}
}
else
{
// out-of-range index: fill with zeros
for(int c = 0; c < ncomps; ++c)
{
expanded->push_back(0);
}
}
LOG_S(INFO) << "bitmap: expanded Indexed palette for xobject_key="
<< image.xobject_key
<< " (" << n_indices << " indices -> "
<< pixel_data->size() << " bytes, ncomps=" << ncomps << ")";
}
pixel_data = std::move(expanded);
pixel_shape = {h, w, ncomps};
channels = ncomps; // mark as handled
LOG_S(INFO) << "bitmap: expanded Indexed palette for xobject_key="
<< image.xobject_key
<< " (" << n_indices << " indices → "
<< pixel_data->size() << " bytes, ncomps=" << ncomps << ")";
}
else if(ncomps > 0)
{
@@ -313,25 +538,44 @@ namespace pdflib
"/DCTDecode") != image.filters.end();
const bool has_flate = std::find(image.filters.begin(), image.filters.end(),
"/FlateDecode") != image.filters.end();
const bool has_jpx = std::find(image.filters.begin(), image.filters.end(),
"/JPXDecode") != image.filters.end();
if (image.decoded_stream_data and image.decoded_stream_data->getSize() > 0)
{
const int w = image.image_width;
const int h = image.image_height;
const size_t expected = static_cast<size_t>(w) * h * channels;
const auto src = image.decoded_stream_data;
if (src->getSize() >= expected)
if(image.bits_per_component > 0 and image.bits_per_component < 8)
{
const auto* raw = reinterpret_cast<const uint8_t*>(src->getBuffer());
pixel_data = std::make_shared<std::vector<uint8_t>>(raw, raw + expected);
pixel_shape = {h, w, channels};
if(not unpack_subbyte_samples_to_u8(src,
w,
h,
channels,
image.bits_per_component,
image.decode_array))
{
LOG_S(WARNING) << "bitmap: failed to unpack sub-byte decoded_stream_data "
<< "for xobject_key=" << image.xobject_key;
}
}
else
{
LOG_S(WARNING) << "bitmap: decoded_stream_data too small ("
<< src->getSize() << " < " << expected
<< ") for xobject_key=" << image.xobject_key;
const size_t expected = static_cast<size_t>(w) * h * channels;
if (src->getSize() >= expected)
{
const auto* raw = reinterpret_cast<const uint8_t*>(src->getBuffer());
pixel_data = std::make_shared<std::vector<uint8_t>>(raw, raw + expected);
apply_decode_to_u8_samples(pixel_data, channels);
pixel_shape = {h, w, channels};
}
else
{
LOG_S(WARNING) << "bitmap: decoded_stream_data too small ("
<< src->getSize() << " < " << expected
<< ") for xobject_key=" << image.xobject_key;
}
}
}
else if (has_dct and image.raw_stream_data and image.raw_stream_data->getSize() > 0)
@@ -387,6 +631,10 @@ namespace pdflib
else if(decoded_channels == 4)
{
fmt = PIXEL_FORMAT_CMYK;
if(image.decode_present and has_default_adobe_cmyk_decode(image.decode_array))
{
cmyk_conv = CMYK_CONVENTION_ADOBE_INVERTED;
}
}
else
{
@@ -404,6 +652,98 @@ namespace pdflib
<< "for xobject_key=" << image.xobject_key;
}
}
else if (has_jpx and image.raw_stream_data and image.raw_stream_data->getSize() > 0)
{
LOG_S(INFO) << "bitmap: decoded_stream_data unavailable for /JPXDecode image, "
<< "decoding JPEG2000 via OpenJPEG "
<< "for xobject_key=" << image.xobject_key;
auto decoded = jpx::decode_jpx_to_raw_pixels(
reinterpret_cast<uint8_t const*>(image.raw_stream_data->getBuffer()),
static_cast<std::size_t>(image.raw_stream_data->getSize()));
if(not decoded.empty())
{
if(image.indexed_palette and not image.indexed_palette->empty())
{
LOG_S(INFO) << "bitmap: Indexed JPX fallback metadata "
<< "xobject_key=" << image.xobject_key
<< " base_cs=" << image.indexed_base_cs
<< " expected_components=" << channels
<< " palette_bytes=" << image.indexed_palette->size()
<< " decoded_components=" << decoded.components;
if(decoded.components == 1
and expand_indexed_samples(channels,
decoded.pixels.data(),
decoded.pixels.size(),
decoded.width,
decoded.height))
{
LOG_S(INFO) << "bitmap: OpenJPEG decode succeeded for Indexed image "
<< "xobject_key=" << image.xobject_key
<< " actual_shape=" << decoded.height << "x"
<< decoded.width << "x" << decoded.components
<< " expanded_shape=" << pixel_shape[0] << "x"
<< pixel_shape[1] << "x" << pixel_shape[2];
}
else if(decoded.components == channels)
{
pixel_data = std::make_shared<std::vector<uint8_t>>(std::move(decoded.pixels));
pixel_shape = {decoded.height, decoded.width, decoded.components};
LOG_S(INFO) << "bitmap: OpenJPEG returned already-expanded pixels "
<< "for Indexed image xobject_key=" << image.xobject_key
<< " actual_cs=" << jpeg::color_space_name(decoded.color_space)
<< " actual_shape=" << decoded.height << "x"
<< decoded.width << "x" << decoded.components;
}
else
{
LOG_S(WARNING) << "bitmap: OpenJPEG decode for Indexed image returned "
<< decoded.components
<< " components, expected 1, for xobject_key="
<< image.xobject_key;
}
}
else
{
pixel_data = std::make_shared<std::vector<uint8_t>>(std::move(decoded.pixels));
apply_decode_to_u8_samples(pixel_data, decoded.components);
pixel_shape = {decoded.height, decoded.width, decoded.components};
channels = decoded.components;
if(decoded.components == 1)
{
fmt = PIXEL_FORMAT_GRAY;
}
else if(decoded.components == 3)
{
fmt = PIXEL_FORMAT_RGB;
}
else if(decoded.components == 4)
{
fmt = PIXEL_FORMAT_CMYK;
cmyk_conv = CMYK_CONVENTION_PROCESS;
}
else
{
fmt = PIXEL_FORMAT_UNKNOWN;
}
LOG_S(INFO) << "bitmap: OpenJPEG decode succeeded "
<< "for xobject_key=" << image.xobject_key
<< " actual_cs=" << jpeg::color_space_name(decoded.color_space)
<< " actual_shape=" << decoded.height << "x"
<< decoded.width << "x" << decoded.components;
}
}
else
{
LOG_S(WARNING) << "bitmap: OpenJPEG decode failed "
<< "for xobject_key=" << image.xobject_key;
}
}
else if (std::find(image.filters.begin(), image.filters.end(),
"/JBIG2Decode") != image.filters.end()
and ((image.raw_stream_data and image.raw_stream_data->getSize() > 0)
@@ -582,6 +922,7 @@ namespace pdflib
bitmap_instruction binstr(image.xobject_key,
std::move(pixel_data),
image.soft_mask_data,
cmyk_conv,
pixel_shape,
fmt,
image.image_mask,
+84
View File
@@ -0,0 +1,84 @@
//-*-C++-*-
#pragma once
#include <cstdint>
#include <vector>
#include <lcms2.h>
#ifndef LOGURU_WITH_STREAMS
#define LOGURU_WITH_STREAMS 1
#endif
#include <loguru.hpp>
namespace pdflib::icc
{
/// Converts a packed 8-bit palette to packed 8-bit RGB triplets with lcms2,
/// using the supplied ICC profile as source and an lcms2-generated sRGB
/// profile as destination (relative colorimetric intent).
///
/// @param palette        palette bytes, `components` bytes per entry
/// @param components     bytes per palette entry; 1, 3 and 4 are supported
/// @param profile_bytes  raw ICC profile bytes
/// @return three bytes per palette entry, or an empty vector when an input is
///         empty, the palette size is not a multiple of `components`, the
///         component count is unsupported, or lcms2 fails to open the
///         profile or build the transform.
inline std::vector<uint8_t> transform_palette_to_rgb(
    std::vector<uint8_t> const& palette,
    int components,
    std::vector<uint8_t> const& profile_bytes)
{
  const bool nothing_to_convert =
    profile_bytes.empty() or palette.empty() or components <= 0;
  if(nothing_to_convert)
    {
      return {};
    }

  const auto entry_stride = static_cast<std::size_t>(components);
  if((palette.size() % entry_stride) != 0u)
    {
      LOG_S(WARNING) << "icc: palette size is not divisible by component count";
      return {};
    }

  // Pick the lcms2 pixel layout that matches the palette entry width.
  cmsUInt32Number input_type = 0;
  if(components == 1)
    {
      input_type = TYPE_GRAY_8;
    }
  else if(components == 3)
    {
      input_type = TYPE_RGB_8;
    }
  else if(components == 4)
    {
      input_type = TYPE_CMYK_8;
    }
  else
    {
      LOG_S(WARNING) << "icc: unsupported palette component count " << components;
      return {};
    }

  cmsHPROFILE input_profile = cmsOpenProfileFromMem(
    profile_bytes.data(), static_cast<cmsUInt32Number>(profile_bytes.size()));
  if(not input_profile)
    {
      LOG_S(WARNING) << "icc: failed to open embedded ICC profile";
      return {};
    }

  cmsHPROFILE output_profile = cmsCreate_sRGBProfile();

  // Releases whichever profiles are currently open: destination first,
  // then source.
  const auto close_profiles = [&]() -> void
    {
      if(output_profile)
        {
          cmsCloseProfile(output_profile);
        }
      cmsCloseProfile(input_profile);
    };

  if(not output_profile)
    {
      close_profiles();
      LOG_S(WARNING) << "icc: failed to create sRGB profile";
      return {};
    }

  cmsHTRANSFORM transform = cmsCreateTransform(input_profile,
                                               input_type,
                                               output_profile,
                                               TYPE_RGB_8,
                                               INTENT_RELATIVE_COLORIMETRIC,
                                               0);
  if(not transform)
    {
      close_profiles();
      LOG_S(WARNING) << "icc: failed to create ICC transform";
      return {};
    }

  const cmsUInt32Number entry_count =
    static_cast<cmsUInt32Number>(palette.size() / entry_stride);

  std::vector<uint8_t> rgb(static_cast<std::size_t>(entry_count) * 3u, 0u);
  cmsDoTransform(transform, palette.data(), rgb.data(), entry_count);

  cmsDeleteTransform(transform);
  close_profiles();

  return rgb;
}
}
+244
View File
@@ -0,0 +1,244 @@
//-*-C++-*-
#pragma once
#include <algorithm>
#include <cstdint>
#include <cstring>
#include <vector>
#include <openjpeg.h>
#include <parse/utils/jpeg/jpeg_utils.h>
#ifndef LOGURU_WITH_STREAMS
#define LOGURU_WITH_STREAMS 1
#endif
#include <loguru.hpp>
namespace pdflib {
namespace jpx {
// Result of decode_jpx_to_raw_pixels(). A default-constructed instance (no
// pixels) represents a failed decode; check empty() before using the fields.
class decoded_jpx_result {
public:
// Interleaved 8-bit samples: `components` bytes per pixel, pixels stored in
// the order they are produced by the decoder (width * height entries).
std::vector<uint8_t> pixels;
// Image width and height, taken from the first decoded component.
int width = 0;
int height = 0;
// Number of components per pixel.
int components = 0;
// Color space guessed from the component count (1 = Gray, 3 = RGB,
// 4 = CMYK, anything else = Unknown).
jpeg::ColorSpace color_space = jpeg::ColorSpace::Unknown;
// True when no pixel data is held.
bool empty() const { return pixels.empty(); }
};
namespace detail {
// Cursor over a caller-owned, read-only byte buffer. It is handed to the
// OpenJPEG stream callbacks below as their `user_data` argument; the struct
// does not own `data`.
struct memory_stream_context {
// Start of the buffer.
uint8_t const* data = nullptr;
// Total number of bytes in the buffer.
OPJ_SIZE_T size = 0;
// Current read position, in bytes from `data`.
OPJ_SIZE_T offset = 0;
};
// OpenJPEG read callback for an in-memory stream.
// Copies up to `bytes` bytes from the current position of the
// memory_stream_context in `user_data` into `buffer` and advances the
// position by the amount copied. Returns the number of bytes copied, or
// (OPJ_SIZE_T)-1 when either pointer is null or the position is already at
// or past the end of the buffer.
inline OPJ_SIZE_T stream_read(void* buffer, OPJ_SIZE_T bytes, void* user_data)
{
  auto* const stream = static_cast<memory_stream_context*>(user_data);

  const bool usable = (stream != nullptr) && (buffer != nullptr);
  if(!usable || stream->offset >= stream->size) {
    return static_cast<OPJ_SIZE_T>(-1);
  }

  // Never copy more than what is left in the buffer.
  const OPJ_SIZE_T available = stream->size - stream->offset;
  const OPJ_SIZE_T count = (available < bytes) ? available : bytes;

  std::memcpy(buffer, stream->data + stream->offset, count);
  stream->offset += count;

  return count;
}
// OpenJPEG skip callback for an in-memory stream.
// Advances the position of the memory_stream_context in `user_data` by
// `bytes`, limited to the bytes remaining in the buffer, and returns the
// distance actually skipped. Returns (OPJ_OFF_T)-1 when `user_data` is null
// or `bytes` is negative.
inline OPJ_OFF_T stream_skip(OPJ_OFF_T bytes, void* user_data)
{
  auto* const stream = static_cast<memory_stream_context*>(user_data);
  if(stream == nullptr || bytes < 0) {
    return static_cast<OPJ_OFF_T>(-1);
  }

  const OPJ_OFF_T available = static_cast<OPJ_OFF_T>(stream->size - stream->offset);
  const OPJ_OFF_T skipped = (available < bytes) ? available : bytes;

  stream->offset += static_cast<OPJ_SIZE_T>(skipped);
  return skipped;
}
// OpenJPEG seek callback for an in-memory stream.
// Moves the position of the memory_stream_context in `user_data` to the
// absolute offset `bytes`. Returns OPJ_TRUE on success; returns OPJ_FALSE and
// leaves the position unchanged when `user_data` is null, `bytes` is
// negative, or `bytes` lies beyond the end of the buffer (seeking exactly to
// the end is allowed).
inline OPJ_BOOL stream_seek(OPJ_OFF_T bytes, void* user_data)
{
  auto* const stream = static_cast<memory_stream_context*>(user_data);
  if(stream == nullptr) {
    return OPJ_FALSE;
  }
  if(bytes < 0) {
    return OPJ_FALSE;
  }

  const auto target = static_cast<OPJ_SIZE_T>(bytes);
  if(target > stream->size) {
    return OPJ_FALSE;
  }

  stream->offset = target;
  return OPJ_TRUE;
}
// User-data release callback registered with the OpenJPEG stream.
// Intentionally does nothing: the context passed to the stream in
// decode_jpx_to_raw_pixels() is a local variable, so there is nothing to free.
inline void stream_free(void*) {}
// Reports whether the buffer starts with the 12-byte JP2 signature box
// (00 00 00 0C 6A 50 20 20 0D 0A 87 0A). Returns false for a null pointer or
// a buffer shorter than 12 bytes.
inline bool has_jp2_signature(uint8_t const* data, std::size_t size)
{
  static constexpr uint8_t kSignatureBox[] = {
    0x00, 0x00, 0x00, 0x0c, 0x6a, 0x50,
    0x20, 0x20, 0x0d, 0x0a, 0x87, 0x0a
  };
  constexpr std::size_t kSignatureLength = sizeof(kSignatureBox);

  if(data == nullptr) {
    return false;
  }
  if(size < kSignatureLength) {
    return false;
  }

  return std::equal(kSignatureBox, kSignatureBox + kSignatureLength, data);
}
// Message handler that discards its input. decode_jpx_to_raw_pixels()
// installs it as the OpenJPEG info, warning and error handler, so codec
// messages are dropped.
inline void ignore_callback(const char*, void*) {}
// Converts one decoded sample of the given bit depth to an 8-bit value.
//
// Parameters:
//   value     - raw sample as stored by the decoder
//   precision - bit depth of the sample; values <= 0 yield 0
//   is_signed - when true the sample is biased by 2^(precision-1) so that
//               the most negative value maps to 0
//
// The (biased) sample is clamped to [0, 2^precision - 1] and then:
//   - precision == 8: returned unchanged;
//   - precision  > 8: reduced by dropping the low bits, rounding on the
//                     first dropped bit;
//   - precision  < 8: rescaled to 0..255 with rounding.
//
// All arithmetic is done in 64 bits. With `int` arithmetic the bias
// `1 << (precision - 1)` and the subsequent addition overflow for
// precision >= 32 (and for samples outside their nominal range), and the
// shifts become undefined for larger precisions.
inline uint8_t clamp_component_to_u8(int value, int precision, bool is_signed)
{
  if(precision <= 0) {
    return 0;
  }

  // Cap the working bit depth so every 64-bit shift below stays defined;
  // an `int` sample can never need more than this.
  const int bits = std::min(precision, 62);

  std::int64_t sample = value;
  if(is_signed) {
    sample += std::int64_t{1} << (bits - 1);
  }

  const std::int64_t max_value = (std::int64_t{1} << bits) - 1;
  sample = std::clamp<std::int64_t>(sample, 0, max_value);

  if(bits == 8) {
    return static_cast<uint8_t>(sample);
  }

  if(bits > 8) {
    const int shift = bits - 8;
    // Keep the top eight bits and round up when the first dropped bit is set.
    const std::int64_t rounded = (sample >> shift) + ((sample >> (shift - 1)) & 1);
    return static_cast<uint8_t>(std::clamp<std::int64_t>(rounded, 0, 255));
  }

  // Fewer than eight bits: stretch [0, max_value] onto [0, 255].
  const std::int64_t scaled = (sample * 255 + max_value / 2) / max_value;
  return static_cast<uint8_t>(std::clamp<std::int64_t>(scaled, 0, 255));
}
} // namespace detail
/// Decodes a JPEG 2000 payload held in memory into interleaved 8-bit samples
/// using OpenJPEG.
///
/// The container format is chosen from the first bytes: data starting with
/// the JP2 signature box is decoded as JP2, anything else as a raw J2K
/// codestream. OpenJPEG's own info/warning/error messages are discarded.
///
/// @param data  pointer to the encoded bytes (not owned, only read)
/// @param size  number of bytes at `data`
/// @return the decoded image; an empty result (`empty() == true`) when the
///         input is empty, OpenJPEG fails, the image has no components, a
///         component carries no sample data, the dimensions are invalid, or
///         the components do not all share the size of the first one
///         (subsampled layouts are not supported).
///
/// Every sample is reduced to 8 bits with detail::clamp_component_to_u8.
/// `color_space` is derived from the component count only.
inline decoded_jpx_result decode_jpx_to_raw_pixels(uint8_t const* data,
                                                   std::size_t size)
{
  decoded_jpx_result result;

  if(!data || size == 0) {
    LOG_S(WARNING) << "decode_jpx_to_raw_pixels: empty input";
    return result;
  }

  // Wire an OpenJPEG input stream to the in-memory buffer. `ctx` lives on
  // this stack frame and outlives the stream, which is destroyed below.
  detail::memory_stream_context ctx{data, static_cast<OPJ_SIZE_T>(size), 0};
  opj_stream_t* stream = opj_stream_create(1u << 16, OPJ_TRUE);
  if(!stream) {
    LOG_S(WARNING) << "decode_jpx_to_raw_pixels: failed to create stream";
    return result;
  }

  opj_stream_set_user_data(stream, &ctx, detail::stream_free);
  opj_stream_set_user_data_length(stream, ctx.size);
  opj_stream_set_read_function(stream, detail::stream_read);
  opj_stream_set_skip_function(stream, detail::stream_skip);
  opj_stream_set_seek_function(stream, detail::stream_seek);

  opj_dparameters_t params{};
  opj_set_default_decoder_parameters(&params);

  const auto codec_format =
    detail::has_jp2_signature(data, size) ? OPJ_CODEC_JP2 : OPJ_CODEC_J2K;
  opj_codec_t* codec = opj_create_decompress(codec_format);
  if(!codec) {
    LOG_S(WARNING) << "decode_jpx_to_raw_pixels: failed to create codec";
    opj_stream_destroy(stream);
    return result;
  }

  opj_set_info_handler(codec, detail::ignore_callback, nullptr);
  opj_set_warning_handler(codec, detail::ignore_callback, nullptr);
  opj_set_error_handler(codec, detail::ignore_callback, nullptr);

  if(!opj_setup_decoder(codec, &params)) {
    LOG_S(WARNING) << "decode_jpx_to_raw_pixels: opj_setup_decoder failed";
    opj_destroy_codec(codec);
    opj_stream_destroy(stream);
    return result;
  }

  opj_image_t* image = nullptr;
  if(!opj_read_header(stream, codec, &image) ||
     !image ||
     !opj_decode(codec, stream, image) ||
     !opj_end_decompress(codec, stream)) {
    LOG_S(WARNING) << "decode_jpx_to_raw_pixels: OpenJPEG decode failed";
    if(image) {
      opj_image_destroy(image);
    }
    opj_destroy_codec(codec);
    opj_stream_destroy(stream);
    return result;
  }

  // The decoded samples now live in `image`; stream and codec are done.
  opj_stream_destroy(stream);
  opj_destroy_codec(codec);

  if(image->numcomps == 0 || !image->comps) {
    LOG_S(WARNING) << "decode_jpx_to_raw_pixels: no image components";
    opj_image_destroy(image);
    return result;
  }

  const int width = static_cast<int>(image->comps[0].w);
  const int height = static_cast<int>(image->comps[0].h);
  const int numcomps = static_cast<int>(image->numcomps);

  if(width <= 0 || height <= 0) {
    LOG_S(WARNING) << "decode_jpx_to_raw_pixels: invalid dimensions "
                   << width << "x" << height;
    opj_image_destroy(image);
    return result;
  }

  for(int c = 0; c < numcomps; ++c) {
    // A component without a sample buffer cannot be interleaved; reading
    // from it below would dereference a null pointer.
    if(!image->comps[c].data) {
      LOG_S(WARNING) << "decode_jpx_to_raw_pixels: component " << c
                     << " has no sample data";
      opj_image_destroy(image);
      return result;
    }

    // All components must cover the same pixel grid as the first one.
    if(static_cast<int>(image->comps[c].w) != width ||
       static_cast<int>(image->comps[c].h) != height) {
      LOG_S(WARNING) << "decode_jpx_to_raw_pixels: unsupported subsampled component layout "
                     << "component=" << c
                     << " size=" << image->comps[c].w << "x" << image->comps[c].h
                     << " expected=" << width << "x" << height;
      opj_image_destroy(image);
      return result;
    }
  }

  result.width = width;
  result.height = height;
  result.components = numcomps;
  result.color_space = (numcomps == 1) ? jpeg::ColorSpace::Gray
    : (numcomps == 3) ? jpeg::ColorSpace::RGB
    : (numcomps == 4) ? jpeg::ColorSpace::CMYK
    : jpeg::ColorSpace::Unknown;

  // Interleave the planar OpenJPEG components into one byte per sample.
  const auto pixel_count = static_cast<std::size_t>(width) * height;
  result.pixels.resize(pixel_count * static_cast<std::size_t>(numcomps));

  for(std::size_t i = 0; i < pixel_count; ++i) {
    for(int c = 0; c < numcomps; ++c) {
      const auto& comp = image->comps[c];
      result.pixels[i * static_cast<std::size_t>(numcomps) + static_cast<std::size_t>(c)] =
        detail::clamp_component_to_u8(comp.data[i], comp.prec, comp.sgnd != 0);
    }
  }

  LOG_S(INFO) << "decode_jpx_to_raw_pixels: decoded "
              << width << "x" << height
              << " components=" << numcomps
              << " color_space=" << jpeg::color_space_name(result.color_space)
              << " input_size=" << size;

  opj_image_destroy(image);
  return result;
}
} // namespace jpx
} // namespace pdflib
+12 -3
View File
@@ -1148,9 +1148,18 @@ namespace pdflib
const uint8_t y = src_data->at(idx + 2);
const uint8_t k = src_data->at(idx + 3);
r = static_cast<uint8_t>((static_cast<unsigned int>(c) * k) / 255u);
g = static_cast<uint8_t>((static_cast<unsigned int>(m) * k) / 255u);
b = static_cast<uint8_t>((static_cast<unsigned int>(y) * k) / 255u);
if(instr.get_cmyk_convention() == CMYK_CONVENTION_PROCESS)
{
r = static_cast<uint8_t>(((255u - c) * (255u - k)) / 255u);
g = static_cast<uint8_t>(((255u - m) * (255u - k)) / 255u);
b = static_cast<uint8_t>(((255u - y) * (255u - k)) / 255u);
}
else
{
r = static_cast<uint8_t>((static_cast<unsigned int>(c) * k) / 255u);
g = static_cast<uint8_t>((static_cast<unsigned int>(m) * k) / 255u);
b = static_cast<uint8_t>((static_cast<unsigned int>(y) * k) / 255u);
}
}
else if (fmt == PIXEL_FORMAT_GRAY)
{
+2 -1
View File
@@ -5,7 +5,8 @@ from pathlib import Path
from huggingface_hub import snapshot_download
HF_DATASET_REPO_ID = "docling-project/regression-dataset-for-docling-parse"
HF_DATASET_REVISION = "5d7c3d7b575397ca5b2a943171b0da4fe08c5a5b"
# HF_DATASET_REVISION = "5d7c3d7b575397ca5b2a943171b0da4fe08c5a5b"
HF_DATASET_REVISION = "9a3713bd2e7b5b55ad9dde9d85953a0f5eb5150e"
TESTS_DIR = Path(__file__).resolve().parent
TEST_DATA_DIR = TESTS_DIR / "data"
+1 -1
View File
@@ -12,7 +12,7 @@ from docling_parse.pdf_parser import (
PdfRenderDocument,
)
GENERATE = False
GENERATE = True
RENDER_INSTRUCTION_EPS = 0.005
GROUNDTRUTH_RENDERER_FOLDER = "tests/data/groundtruth_renderer"