mirror of
https://github.com/docling-project/docling-parse.git
synced 2026-05-17 13:10:49 +00:00
feat: add jpeg2000 pixel data (#259)
Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
committed by
GitHub
parent
b5804c1654
commit
8546560474
@@ -103,7 +103,7 @@ jobs:
|
||||
CIBW_REPAIR_WHEEL_COMMAND_MACOS: "" # do not run delocate-wheel before the re-tag
|
||||
CIBW_ENVIRONMENT: "MACOSX_DEPLOYMENT_TARGET=${{ matrix.os.min_macos_version }}.0"
|
||||
ARCHFLAGS: -arch x86_64
|
||||
BUILD_THREADS: "4"
|
||||
BUILD_THREADS: "1"
|
||||
PYTORCH_MPS_HIGH_WATERMARK_RATIO: "0.0"
|
||||
run: |
|
||||
PY_CACHE_TAG=$(uv run python -c 'import sys;print(sys.implementation.cache_tag)')
|
||||
|
||||
+3
-1
@@ -108,13 +108,15 @@ include(cmake/extlib_loguru.cmake)
|
||||
include(cmake/extlib_json.cmake)
|
||||
include(cmake/extlib_utf8.git.cmake)
|
||||
include(cmake/extlib_jpeg.cmake)
|
||||
include(cmake/extlib_openjpeg.cmake)
|
||||
include(cmake/extlib_lcms2.cmake)
|
||||
# include(cmake/extlib_qpdf_v11.cmake)
|
||||
include(cmake/extlib_qpdf_v12.cmake)
|
||||
include(cmake/extlib_blend2d.cmake)
|
||||
include(cmake/extlib_pdfium_jbig2.cmake)
|
||||
|
||||
# aggregate the targets created by the dependencies
|
||||
set(DEPENDENCIES qpdf jpeg utf8 json loguru cxxopts blend2d pdfium_jbig2)
|
||||
set(DEPENDENCIES qpdf jpeg openjpeg lcms2 utf8 json loguru cxxopts blend2d pdfium_jbig2)
|
||||
|
||||
# ************************
|
||||
# *** libraries ***
|
||||
|
||||
@@ -4,6 +4,16 @@
|
||||
#include "render.h"
|
||||
#include "parse/utils/bitmap/bitmap_exporter.h"
|
||||
|
||||
namespace
{
  /// Derive the path of the one-page PDF written next to a rendered page image.
  ///
  /// @param image_path Path of the rendered page image.
  /// @return A copy of `image_path` whose extension is replaced by ".pdf"
  ///         (the extension is appended when `image_path` has none).
  std::filesystem::path page_pdf_output_path(std::filesystem::path const& image_path)
  {
    std::filesystem::path pdf_path = image_path;
    pdf_path.replace_extension(".pdf");
    return pdf_path;
  }
}
|
||||
|
||||
struct ImageIssue
|
||||
{
|
||||
std::string pdf_path;
|
||||
@@ -52,6 +62,7 @@ static int analyse_pdf(const std::string& pdf_path,
|
||||
int& total_pages,
|
||||
const std::string& render_dir,
|
||||
bool export_bitmaps,
|
||||
bool export_page_pdf,
|
||||
const std::string& bitmap_dir,
|
||||
int target_page)
|
||||
{
|
||||
@@ -184,6 +195,10 @@ static int analyse_pdf(const std::string& pdf_path,
|
||||
pdflib::renderer<pdflib::BLEND2D> rnd(render_cfg);
|
||||
page_dec->get_instructions().iterate_over_instructions(rnd);
|
||||
rnd.save(out_path);
|
||||
if(export_page_pdf)
|
||||
{
|
||||
page_dec->save_pdf_page(page_pdf_output_path(out_path));
|
||||
}
|
||||
LOG_S(INFO) << "saved rendered page: " << out_path;
|
||||
}
|
||||
catch (std::exception const& exc)
|
||||
@@ -257,6 +272,8 @@ int main(int argc, char* argv[])
|
||||
cxxopts::value<int>()->default_value("-1"))
|
||||
("export-bitmaps", "Export decoded bitmap payloads encountered on each page",
|
||||
cxxopts::value<bool>()->implicit_value("true"))
|
||||
("export-page-pdf", "Export each rendered page as a sibling PDF",
|
||||
cxxopts::value<bool>()->implicit_value("true"))
|
||||
("l,loglevel", "Log level [error, warning, info]", cxxopts::value<std::string>())
|
||||
("h,help", "Print usage");
|
||||
|
||||
@@ -300,6 +317,12 @@ int main(int argc, char* argv[])
|
||||
export_bitmaps = result["export-bitmaps"].as<bool>();
|
||||
}
|
||||
|
||||
bool export_page_pdf = false;
|
||||
if(result.count("export-page-pdf"))
|
||||
{
|
||||
export_page_pdf = result["export-page-pdf"].as<bool>();
|
||||
}
|
||||
|
||||
std::string bitmap_dir;
|
||||
if(export_bitmaps)
|
||||
{
|
||||
@@ -335,6 +358,7 @@ int main(int argc, char* argv[])
|
||||
total_pages,
|
||||
render_dir,
|
||||
export_bitmaps,
|
||||
export_page_pdf,
|
||||
bitmap_dir,
|
||||
target_page);
|
||||
}
|
||||
|
||||
+57
-2
@@ -4,6 +4,18 @@
|
||||
#include "render.h"
|
||||
#include "parse/utils/bitmap/bitmap_exporter.h"
|
||||
|
||||
namespace
{
  /// Build the output path of a one-page PDF export.
  ///
  /// The file name is "<stem of pdf_path>_p<page>.pdf", placed inside `export_dir`.
  ///
  /// @param export_dir Directory that receives the exported page PDFs.
  /// @param pdf_path   Path of the source document; only its stem is used.
  /// @param page       Page number embedded in the file name, formatted as given.
  /// @return `export_dir / "<stem>_p<page>.pdf"`.
  std::filesystem::path page_pdf_output_path(std::filesystem::path const& export_dir,
                                             std::filesystem::path const& pdf_path,
                                             int page)
  {
    return export_dir / (pdf_path.stem().string()
                         + "_p" + std::to_string(page)
                         + ".pdf");
  }
}
|
||||
|
||||
void set_loglevel(std::string level)
|
||||
{
|
||||
if(level=="info")
|
||||
@@ -35,6 +47,8 @@ bool decode_and_render(pdflib::pdf_decoder<pdflib::DOCUMENT>& doc,
|
||||
Renderer& rnd,
|
||||
bool export_bitmaps,
|
||||
std::filesystem::path const& bitmap_dir,
|
||||
bool export_page_pdf_files,
|
||||
std::filesystem::path const& page_pdf_dir,
|
||||
std::string const& pdf_path)
|
||||
{
|
||||
if (page == -1)
|
||||
@@ -52,6 +66,12 @@ bool decode_and_render(pdflib::pdf_decoder<pdflib::DOCUMENT>& doc,
|
||||
bitmap_dir, pdf_path, p);
|
||||
instructions.iterate_over_instructions(exporter);
|
||||
}
|
||||
if(export_page_pdf_files)
|
||||
{
|
||||
page_decoder->save_pdf_page(page_pdf_output_path(page_pdf_dir,
|
||||
pdf_path,
|
||||
p));
|
||||
}
|
||||
instructions.iterate_over_instructions(rnd);
|
||||
}
|
||||
}
|
||||
@@ -68,6 +88,12 @@ bool decode_and_render(pdflib::pdf_decoder<pdflib::DOCUMENT>& doc,
|
||||
bitmap_dir, pdf_path, page);
|
||||
instructions.iterate_over_instructions(exporter);
|
||||
}
|
||||
if(export_page_pdf_files)
|
||||
{
|
||||
page_decoder->save_pdf_page(page_pdf_output_path(page_pdf_dir,
|
||||
pdf_path,
|
||||
page));
|
||||
}
|
||||
instructions.iterate_over_instructions(rnd);
|
||||
}
|
||||
else
|
||||
@@ -89,7 +115,9 @@ int render_pdf_file(const std::string& pdf_path,
|
||||
const RenderCfg& render_cfg,
|
||||
bool save_output,
|
||||
bool export_bitmaps,
|
||||
const std::string& bitmap_dir)
|
||||
const std::string& bitmap_dir,
|
||||
bool export_page_pdf_files,
|
||||
const std::string& page_pdf_dir)
|
||||
{
|
||||
pdflib::pdf_timings timings;
|
||||
pdflib::pdf_decoder<pdflib::DOCUMENT> doc(timings);
|
||||
@@ -129,6 +157,12 @@ int render_pdf_file(const std::string& pdf_path,
|
||||
p);
|
||||
instructions.iterate_over_instructions(exporter);
|
||||
}
|
||||
if(export_page_pdf_files)
|
||||
{
|
||||
page_decoder->save_pdf_page(page_pdf_output_path(std::filesystem::path(page_pdf_dir),
|
||||
pdf_path,
|
||||
p));
|
||||
}
|
||||
instructions.iterate_over_instructions(rnd);
|
||||
|
||||
if (save_output)
|
||||
@@ -202,6 +236,8 @@ int main(int argc, char* argv[])
|
||||
("keep-qpdf-warnings", "Emit QPDF warnings (default: false)", cxxopts::value<bool>()->implicit_value("true"))
|
||||
("populate-json", "Populate JSON objects during decode (default: false)", cxxopts::value<bool>()->implicit_value("true"))
|
||||
("export-bitmaps", "Export decoded bitmap payloads encountered on each page (default: false)",
|
||||
cxxopts::value<bool>()->default_value("false"))
|
||||
("export-page-pdf", "Export each selected page as a one-page PDF (default: false)",
|
||||
cxxopts::value<bool>()->default_value("false"));
|
||||
|
||||
// Parse command line arguments
|
||||
@@ -267,6 +303,7 @@ int main(int argc, char* argv[])
|
||||
if (result.count("keep-qpdf-warnings")) { page_config.keep_qpdf_warnings = result["keep-qpdf-warnings"].as<bool>(); }
|
||||
if (result.count("populate-json")) { page_config.populate_json_objects = result["populate-json"].as<bool>(); }
|
||||
bool export_bitmaps = result["export-bitmaps"].as<bool>();
|
||||
bool export_page_pdf_files = result["export-page-pdf"].as<bool>();
|
||||
|
||||
// --- render_config ---
|
||||
pdflib::render_config cfg;
|
||||
@@ -285,6 +322,7 @@ int main(int argc, char* argv[])
|
||||
std::string ifile = result["input"].as<std::string>();
|
||||
std::string ofile = ifile + ".rendered.json";
|
||||
std::string bitmap_dir = "./bitmaps_out";
|
||||
std::string page_pdf_dir = "./pages_out";
|
||||
if(export_bitmaps and result.count("output"))
|
||||
{
|
||||
std::filesystem::path output_path = result["output"].as<std::string>();
|
||||
@@ -293,6 +331,14 @@ int main(int argc, char* argv[])
|
||||
bitmap_dir = (output_path / "bitmaps").string();
|
||||
}
|
||||
}
|
||||
if(export_page_pdf_files and result.count("output"))
|
||||
{
|
||||
std::filesystem::path output_path = result["output"].as<std::string>();
|
||||
if(output_path.extension().empty())
|
||||
{
|
||||
page_pdf_dir = (output_path / "pages").string();
|
||||
}
|
||||
}
|
||||
|
||||
int page = result["page"].as<int>();
|
||||
LOG_F(INFO, "Page to process: %d", page);
|
||||
@@ -328,6 +374,8 @@ int main(int argc, char* argv[])
|
||||
if (not decode_and_render(doc, page, page_config, rnd,
|
||||
export_bitmaps,
|
||||
std::filesystem::path(bitmap_dir),
|
||||
export_page_pdf_files,
|
||||
std::filesystem::path(page_pdf_dir),
|
||||
ifile)) { return 1; }
|
||||
rnd.show();
|
||||
}
|
||||
@@ -337,6 +385,8 @@ int main(int argc, char* argv[])
|
||||
if (not decode_and_render(doc, page, page_config, rnd,
|
||||
export_bitmaps,
|
||||
std::filesystem::path(bitmap_dir),
|
||||
export_page_pdf_files,
|
||||
std::filesystem::path(page_pdf_dir),
|
||||
ifile)) { return 1; }
|
||||
}
|
||||
|
||||
@@ -354,6 +404,9 @@ int main(int argc, char* argv[])
|
||||
const std::string bitmap_dir = save
|
||||
? (std::filesystem::path(out_dir) / "bitmaps").string()
|
||||
: "./bitmaps_out";
|
||||
const std::string page_pdf_dir = save
|
||||
? (std::filesystem::path(out_dir) / "pages").string()
|
||||
: "./pages_out";
|
||||
|
||||
if (not std::filesystem::is_directory(dir_path))
|
||||
{
|
||||
@@ -383,7 +436,9 @@ int main(int argc, char* argv[])
|
||||
cfg,
|
||||
save,
|
||||
export_bitmaps,
|
||||
bitmap_dir);
|
||||
bitmap_dir,
|
||||
export_page_pdf_files,
|
||||
page_pdf_dir);
|
||||
if (pages == 0)
|
||||
{
|
||||
++failed_files;
|
||||
|
||||
@@ -0,0 +1,69 @@
|
||||
# Provides the `lcms2` target (Little CMS 2).
#
# - USE_SYSTEM_DEPS=ON : use the system library, via pkg-config when available,
#                        otherwise via a manual header/library search.
# - USE_SYSTEM_DEPS=OFF: fetch and build a static library into EXTERNALS_PREFIX_PATH.
message(STATUS "entering in extlib_lcms2.cmake")

set(ext_name "lcms2")

if(USE_SYSTEM_DEPS)
    # pkg-config is optional here: the else() branch below is the fallback.
    find_package(PkgConfig)
    if(PkgConfig_FOUND)
        pkg_check_modules(liblcms2 IMPORTED_TARGET lcms2)
    endif()

    if(TARGET PkgConfig::liblcms2)
        add_library(${ext_name} ALIAS PkgConfig::liblcms2)
    else()
        find_path(LCMS2_INCLUDE_DIR lcms2.h)
        find_library(LCMS2_LIBRARY NAMES lcms2 liblcms2)

        if(NOT LCMS2_INCLUDE_DIR OR NOT LCMS2_LIBRARY)
            message(FATAL_ERROR "lcms2 not found. Install Little CMS 2 or disable USE_SYSTEM_DEPS.")
        endif()

        add_library(${ext_name} UNKNOWN IMPORTED)
        set_target_properties(${ext_name} PROPERTIES
            IMPORTED_LOCATION "${LCMS2_LIBRARY}"
            INTERFACE_INCLUDE_DIRECTORIES "${LCMS2_INCLUDE_DIR}"
        )
    endif()

else()
    include(ExternalProject)
    include(CMakeParseArguments)

    # The imported target below references these directories at configure time,
    # before the external project has installed anything into them.
    file(MAKE_DIRECTORY ${EXTERNALS_PREFIX_PATH}/include)
    file(MAKE_DIRECTORY ${EXTERNALS_PREFIX_PATH}/lib)

    set(LCMS2_URL https://github.com/mm2/Little-CMS.git)
    set(LCMS2_TAG lcms2.17)

    set(LCMS2_IMPORTED_LIB ${EXTERNALS_PREFIX_PATH}/lib/liblcms2.a)

    ExternalProject_Add(extlib_lcms2
        PREFIX extlib_lcms2

        UPDATE_COMMAND ""
        GIT_REPOSITORY ${LCMS2_URL}
        GIT_TAG ${LCMS2_TAG}

        BUILD_ALWAYS OFF

        INSTALL_DIR ${EXTERNALS_PREFIX_PATH}

        BUILD_IN_SOURCE ON
        CONFIGURE_COMMAND ./configure
            --prefix=${EXTERNALS_PREFIX_PATH}
            --disable-shared
            --enable-static
            CFLAGS=-fPIC\ ${ENV_ARCHFLAGS}
        BUILD_COMMAND make
        INSTALL_COMMAND make install

        # Declare the archive this project produces so that generators which
        # need an explicit producer for every file (e.g. Ninja) can order the build.
        BUILD_BYPRODUCTS ${LCMS2_IMPORTED_LIB}

        LOG_DOWNLOAD ON
    )

    add_library(${ext_name} STATIC IMPORTED)
    add_dependencies(${ext_name} extlib_lcms2)
    set_target_properties(${ext_name} PROPERTIES
        IMPORTED_LOCATION ${LCMS2_IMPORTED_LIB}
        INTERFACE_INCLUDE_DIRECTORIES ${EXTERNALS_PREFIX_PATH}/include
    )
endif()
|
||||
@@ -0,0 +1,61 @@
|
||||
|
||||
# Provides the `openjpeg` target (OpenJPEG / libopenjp2, JPEG 2000 codec).
#
# - USE_SYSTEM_DEPS=ON : use the system library through pkg-config.
# - USE_SYSTEM_DEPS=OFF: fetch and build a static library into EXTERNALS_PREFIX_PATH.
message(STATUS "entering in extlib_openjpeg.cmake")

set(ext_name "openjpeg")

if(USE_SYSTEM_DEPS)
    # pkg_check_modules() only exists once PkgConfig has been found, so require it
    # here to get a clear error instead of an "unknown command" failure.
    find_package(PkgConfig REQUIRED)
    pkg_check_modules(libopenjp2 REQUIRED IMPORTED_TARGET libopenjp2)

    add_library(${ext_name} ALIAS PkgConfig::libopenjp2)

else()
    include(ExternalProject)
    include(CMakeParseArguments)

    # The imported target below references these directories at configure time,
    # before the external project has installed anything into them.
    file(MAKE_DIRECTORY ${EXTERNALS_PREFIX_PATH}/include)
    file(MAKE_DIRECTORY ${EXTERNALS_PREFIX_PATH}/include/openjpeg-2.5)
    file(MAKE_DIRECTORY ${EXTERNALS_PREFIX_PATH}/lib)

    set(OPENJPEG_URL https://github.com/uclouvain/openjpeg.git)
    set(OPENJPEG_TAG v2.5.3)
    set(OPENJPEG_IMPORTED_LIB ${EXTERNALS_PREFIX_PATH}/lib/libopenjp2.a)

    # NOTE(review): the trailing `\\` tokens after CMAKE_ARGS entries are kept exactly
    # as found; CMake does not need line continuations here - confirm whether they
    # can be dropped.
    ExternalProject_Add(extlib_openjpeg

        PREFIX extlib_openjpeg

        UPDATE_COMMAND ""
        GIT_REPOSITORY ${OPENJPEG_URL}
        GIT_TAG ${OPENJPEG_TAG}

        BUILD_ALWAYS OFF

        INSTALL_DIR ${EXTERNALS_PREFIX_PATH}

        CMAKE_ARGS \\
        -DCMAKE_POSITION_INDEPENDENT_CODE=ON \\
        -DCMAKE_OSX_ARCHITECTURES=${CMAKE_OSX_ARCHITECTURES} \\
        -DCMAKE_C_FLAGS=${ENV_ARCHFLAGS} \\
        -DBUILD_CODEC=OFF \\
        -DBUILD_JPIP=OFF \\
        -DBUILD_JPWL=OFF \\
        -DBUILD_THIRDPARTY=OFF \\
        -DBUILD_TESTING=OFF \\
        -DBUILD_SHARED_LIBS=OFF \\
        -DCMAKE_INSTALL_LIBDIR=${EXTERNALS_PREFIX_PATH}/lib \\
        -DCMAKE_INSTALL_PREFIX=${EXTERNALS_PREFIX_PATH}

        BUILD_IN_SOURCE ON

        # Declare the archive this project produces so that generators which
        # need an explicit producer for every file (e.g. Ninja) can order the build.
        BUILD_BYPRODUCTS ${OPENJPEG_IMPORTED_LIB}

        LOG_DOWNLOAD ON
    )

    add_library(${ext_name} STATIC IMPORTED)
    add_dependencies(${ext_name} extlib_openjpeg)
    set_target_properties(${ext_name} PROPERTIES
        IMPORTED_LOCATION ${OPENJPEG_IMPORTED_LIB}
        INTERFACE_INCLUDE_DIRECTORIES "${EXTERNALS_PREFIX_PATH}/include/openjpeg-2.5;${EXTERNALS_PREFIX_PATH}/include"
        INTERFACE_COMPILE_DEFINITIONS "OPJ_STATIC"
    )

endif()
|
||||
@@ -75,11 +75,15 @@ namespace pdflib
|
||||
std::vector<double> decode_array; // 2*ncomp when present
|
||||
bool image_mask = false;
|
||||
int icc_components = 0; // number of color components from /ICCBased /N entry; 0 if not ICCBased
|
||||
int device_n_components = 0; // number of components from /DeviceN names array; 0 if not DeviceN
|
||||
std::vector<std::string> device_n_names;
|
||||
|
||||
// /Indexed color space support
|
||||
int indexed_hival = -1;
|
||||
std::string indexed_base_cs;
|
||||
std::shared_ptr<std::vector<uint8_t>> indexed_palette;
|
||||
std::vector<std::string> indexed_base_device_n_names;
|
||||
bool indexed_base_device_n_single_black = false;
|
||||
|
||||
// /CCITTFaxDecode parameters (from /DecodeParms)
|
||||
int ccitt_k = 0; // /K default per PDF spec: 0=Group3-1D, <0=Group4, >0=Group3-mixed
|
||||
@@ -353,6 +357,12 @@ namespace pdflib
|
||||
if(icc_components == 3) { return "RGB"; }
|
||||
if(icc_components == 4) { return "CMYK"; }
|
||||
}
|
||||
if(color_space.find("/DeviceN") != std::string::npos and device_n_components > 0)
|
||||
{
|
||||
if(device_n_components == 1) { return "L"; }
|
||||
if(device_n_components == 3) { return "RGB"; }
|
||||
if(device_n_components == 4) { return "CMYK"; }
|
||||
}
|
||||
|
||||
LOG_S(WARNING) << "unknown color_space '" << color_space
|
||||
<< "' for xobject_key=" << xobject_key
|
||||
|
||||
@@ -20,6 +20,12 @@ namespace pdflib
|
||||
PIXEL_FORMAT_CMYK, // 4 channels (/DeviceCMYK)
|
||||
};
|
||||
|
||||
// Tag describing which convention the CMYK samples of a bitmap follow.
// NOTE(review): the exact meaning of each value is defined by the image
// decoders/renderers that produce and consume it - confirm there.
enum cmyk_convention {
  CMYK_CONVENTION_UNKNOWN,        // convention not determined
  CMYK_CONVENTION_ADOBE_INVERTED, // presumably Adobe-style inverted CMYK samples - TODO confirm
  CMYK_CONVENTION_PROCESS,        // presumably plain process CMYK samples - TODO confirm
};
|
||||
|
||||
enum RENDER_INSTRUCTION_NAME {
|
||||
SIZE_INSTRUCTION, // set the size of the canvas on which we render
|
||||
TEXT_RENDER_INSTRUCTION, // render text on the canvas
|
||||
@@ -188,6 +194,7 @@ namespace pdflib
|
||||
bitmap_instruction(std::string xobject_key,
|
||||
std::shared_ptr<std::vector<uint8_t> > data,
|
||||
std::shared_ptr<std::vector<uint8_t> > alpha_data,
|
||||
cmyk_convention cmyk_conv,
|
||||
std::array<int, 3> shape,
|
||||
pixel_format fmt,
|
||||
bool image_mask,
|
||||
@@ -199,6 +206,7 @@ namespace pdflib
|
||||
xobject_key(xobject_key),
|
||||
data(std::move(data)),
|
||||
alpha_data(std::move(alpha_data)),
|
||||
cmyk_conv(cmyk_conv),
|
||||
shape(shape),
|
||||
fmt(fmt),
|
||||
image_mask(image_mask),
|
||||
@@ -212,6 +220,7 @@ namespace pdflib
|
||||
|
||||
const std::shared_ptr<std::vector<uint8_t> >& get_data() const { return data; }
|
||||
const std::shared_ptr<std::vector<uint8_t> >& get_alpha_data() const { return alpha_data; }
|
||||
cmyk_convention get_cmyk_convention() const { return cmyk_conv; }
|
||||
const std::array<int, 3>& get_shape() const { return shape; }
|
||||
pixel_format get_pixel_format() const { return fmt; }
|
||||
bool is_image_mask() const { return image_mask; }
|
||||
@@ -235,6 +244,7 @@ namespace pdflib
|
||||
|
||||
const std::shared_ptr<std::vector<uint8_t> > data;
|
||||
const std::shared_ptr<std::vector<uint8_t> > alpha_data;
|
||||
const cmyk_convention cmyk_conv;
|
||||
const std::array<int, 3> shape;
|
||||
const pixel_format fmt;
|
||||
const bool image_mask;
|
||||
|
||||
@@ -5,7 +5,9 @@
|
||||
|
||||
#include <optional>
|
||||
#include <qpdf/QPDF.hh>
|
||||
#include <qpdf/QPDFPageDocumentHelper.hh>
|
||||
#include <qpdf/QPDFPageObjectHelper.hh>
|
||||
#include <qpdf/QPDFWriter.hh>
|
||||
|
||||
#include <nlohmann/json.hpp>
|
||||
|
||||
@@ -63,6 +65,9 @@ namespace pdflib
|
||||
// Get render instructions collected during decode
|
||||
pdf_render_instructions& get_instructions() { return instructions; }
|
||||
|
||||
// Export this page as a standalone one-page PDF.
|
||||
void save_pdf_page(std::filesystem::path const& out_path) const;
|
||||
|
||||
private:
|
||||
|
||||
void update_qpdf_logger();
|
||||
@@ -241,6 +246,21 @@ namespace pdflib
|
||||
return page_number;
|
||||
}
|
||||
|
||||
// Export this page as a standalone one-page PDF.
//
// Creates the parent directory of `out_path` when it has one, adds this
// decoder's page to a fresh empty QPDF document and writes that document
// to `out_path`.
//
// @param out_path Destination file of the one-page PDF.
// Errors from std::filesystem and QPDF propagate to the caller as exceptions.
void pdf_decoder<PAGE>::save_pdf_page(std::filesystem::path const& out_path) const
{
  // A bare file name has an empty parent_path(); create_directories() reports
  // an error for an empty path, so only create the directory when there is one.
  std::filesystem::path const parent_dir = out_path.parent_path();
  if(not parent_dir.empty())
    {
      std::filesystem::create_directories(parent_dir);
    }

  QPDF out_pdf;
  out_pdf.emptyPDF();

  QPDFPageDocumentHelper out_pages(out_pdf);
  QPDFPageObjectHelper source_page(qpdf_page);
  out_pages.addPage(source_page, false); // false: append instead of inserting first

  // QPDFWriter takes a raw `char const*`; keep the string alive until write()
  // has finished instead of handing over a pointer into a temporary.
  // NOTE(review): QPDFWriter appears to retain this pointer - confirm.
  std::string const out_file = out_path.string();
  QPDFWriter writer(out_pdf, out_file.c_str());
  writer.write();
}
|
||||
|
||||
nlohmann::json pdf_decoder<PAGE>::get(const decode_config& config)
|
||||
{
|
||||
bool keep_char_cells = config.keep_char_cells;
|
||||
|
||||
@@ -674,7 +674,7 @@ namespace pdflib
|
||||
break;
|
||||
|
||||
/**************************************************
|
||||
*** text-objects
|
||||
*** group-objects
|
||||
**************************************************/
|
||||
|
||||
case pdf_operator::BT:
|
||||
@@ -696,6 +696,54 @@ namespace pdflib
|
||||
}
|
||||
break;
|
||||
|
||||
case pdf_operator::BX:
|
||||
{
|
||||
LOG_S(INFO) << "executing " << to_string(name);
|
||||
}
|
||||
break;
|
||||
|
||||
case pdf_operator::EX:
|
||||
{
|
||||
LOG_S(INFO) << "executing " << to_string(name);
|
||||
}
|
||||
break;
|
||||
|
||||
case pdf_operator::BMC:
|
||||
{
|
||||
LOG_S(INFO) << "executing " << to_string(name);
|
||||
}
|
||||
break;
|
||||
|
||||
case pdf_operator::BDC:
|
||||
{
|
||||
LOG_S(INFO) << "executing " << to_string(name);
|
||||
}
|
||||
break;
|
||||
|
||||
case pdf_operator::EMC:
|
||||
{
|
||||
LOG_S(INFO) << "executing " << to_string(name);
|
||||
}
|
||||
break;
|
||||
|
||||
case pdf_operator::BI:
|
||||
{
|
||||
LOG_S(INFO) << "executing " << to_string(name);
|
||||
}
|
||||
break;
|
||||
|
||||
case pdf_operator::ID:
|
||||
{
|
||||
LOG_S(INFO) << "executing " << to_string(name);
|
||||
}
|
||||
break;
|
||||
|
||||
case pdf_operator::EI:
|
||||
{
|
||||
LOG_S(INFO) << "executing " << to_string(name);
|
||||
}
|
||||
break;
|
||||
|
||||
/**************************************************
|
||||
*** text-state
|
||||
**************************************************/
|
||||
@@ -976,6 +1024,41 @@ namespace pdflib
|
||||
}
|
||||
break;
|
||||
|
||||
case pdf_operator::MP:
|
||||
{
|
||||
LOG_S(INFO) << "executing " << to_string(name);
|
||||
}
|
||||
break;
|
||||
|
||||
case pdf_operator::DP:
|
||||
{
|
||||
LOG_S(INFO) << "executing " << to_string(name);
|
||||
}
|
||||
break;
|
||||
|
||||
case pdf_operator::sh:
|
||||
{
|
||||
LOG_S(INFO) << "executing " << to_string(name);
|
||||
//current_graphic_state().sh(parameters);
|
||||
}
|
||||
break;
|
||||
|
||||
/**************************************************
|
||||
*** Type 3 font metrics
|
||||
**************************************************/
|
||||
|
||||
case pdf_operator::d0:
|
||||
{
|
||||
LOG_S(INFO) << "executing " << to_string(name);
|
||||
}
|
||||
break;
|
||||
|
||||
case pdf_operator::d1:
|
||||
{
|
||||
LOG_S(INFO) << "executing " << to_string(name);
|
||||
}
|
||||
break;
|
||||
|
||||
/**************************************************
|
||||
*** other
|
||||
**************************************************/
|
||||
|
||||
@@ -19,6 +19,22 @@ namespace pdflib
|
||||
PATH_PAINTING
|
||||
};
|
||||
|
||||
/*
|
||||
Table: PDF content stream operators
|
||||
|
||||
| Category | Operators |
|
||||
| -------------------- | ------------------------------------------ |
|
||||
| Graphics state | w J j M d ri i gs q Q cm |
|
||||
| Path | m l c v y h re S s f F f* B B* b b* n W W* |
|
||||
| Text | BT ET Tc Tw Tz TL Tf Tr Ts Td TD Tm T* Tj TJ ' " |
|
||||
| Color | CS cs SC SCN sc scn G g RG rg K k |
|
||||
| XObject / shading | Do sh |
|
||||
| Marked content | MP DP BMC BDC EMC |
|
||||
| Compatibility | BX EX |
|
||||
| Inline image markers | BI ID EI |
|
||||
| Type 3 font metrics | d0 d1 |
|
||||
*/
|
||||
|
||||
// Table 51 – Operator Categories [ p 119 ]
|
||||
enum operator_name {
|
||||
|
||||
@@ -34,9 +50,11 @@ namespace pdflib
|
||||
// color-scheme
|
||||
CS, cs, SC, SCN, sc, scn, G, g, RG, rg, K, k,
|
||||
|
||||
// text objects
|
||||
// group objects
|
||||
BT, ET,
|
||||
|
||||
BX, EX,
|
||||
BI, ID, EI,
|
||||
|
||||
// Text state
|
||||
Tc, Tw, Tz, TL, Tf, Tr, Ts,
|
||||
|
||||
@@ -58,6 +76,12 @@ namespace pdflib
|
||||
// Marked content
|
||||
MP, DP, BMC, BDC, EMC,
|
||||
|
||||
// shading
|
||||
sh,
|
||||
|
||||
// Type 3 font operators
|
||||
d0, d1,
|
||||
|
||||
// dummy
|
||||
null
|
||||
};
|
||||
@@ -96,6 +120,46 @@ namespace pdflib
|
||||
return COLOR_SCHEME;
|
||||
}
|
||||
|
||||
case q:
|
||||
case Q:
|
||||
case cm:
|
||||
case Do:
|
||||
case BT:
|
||||
case ET:
|
||||
case BX:
|
||||
case EX:
|
||||
case BMC:
|
||||
case BDC:
|
||||
case EMC:
|
||||
case BI:
|
||||
case ID:
|
||||
case EI:
|
||||
case Tc:
|
||||
case Tw:
|
||||
case Tz:
|
||||
case TL:
|
||||
case Tf:
|
||||
case Tr:
|
||||
case Ts:
|
||||
case Td:
|
||||
case TD:
|
||||
case Tm:
|
||||
case TStar:
|
||||
case Tj:
|
||||
case TJ:
|
||||
case accent:
|
||||
case double_accent:
|
||||
case W:
|
||||
case WStar:
|
||||
case MP:
|
||||
case DP:
|
||||
case sh:
|
||||
case d0:
|
||||
case d1:
|
||||
{
|
||||
return rest;
|
||||
}
|
||||
|
||||
// lines
|
||||
case m:
|
||||
case l:
|
||||
@@ -169,6 +233,11 @@ namespace pdflib
|
||||
// text objects
|
||||
else if(name=="BT") { return BT; }
|
||||
else if(name=="ET") { return ET; }
|
||||
else if(name=="BX") { return BX; }
|
||||
else if(name=="EX") { return EX; }
|
||||
else if(name=="BI") { return BI; }
|
||||
else if(name=="ID") { return ID; }
|
||||
else if(name=="EI") { return EI; }
|
||||
|
||||
// Text state
|
||||
else if(name=="Tc") { return Tc; }
|
||||
@@ -222,6 +291,9 @@ namespace pdflib
|
||||
else if(name=="BMC") { return BMC; }
|
||||
else if(name=="BDC") { return BDC; }
|
||||
else if(name=="EMC") { return EMC; }
|
||||
else if(name=="sh") { return sh; }
|
||||
else if(name=="d0") { return d0; }
|
||||
else if(name=="d1") { return d1; }
|
||||
|
||||
else
|
||||
{
|
||||
@@ -273,6 +345,11 @@ namespace pdflib
|
||||
// text
|
||||
case BT: return "BT";
|
||||
case ET: return "ET";
|
||||
case BX: return "BX";
|
||||
case EX: return "EX";
|
||||
case BI: return "BI";
|
||||
case ID: return "ID";
|
||||
case EI: return "EI";
|
||||
|
||||
case Tc: return "Tc";
|
||||
|
||||
@@ -320,6 +397,14 @@ namespace pdflib
|
||||
|
||||
case W: return "W";
|
||||
case WStar: return "W*";
|
||||
case MP: return "MP";
|
||||
case DP: return "DP";
|
||||
case BMC: return "BMC";
|
||||
case BDC: return "BDC";
|
||||
case EMC: return "EMC";
|
||||
case sh: return "sh";
|
||||
case d0: return "d0";
|
||||
case d1: return "d1";
|
||||
|
||||
default:
|
||||
{
|
||||
|
||||
@@ -3,12 +3,137 @@
|
||||
#ifndef PDF_PAGE_XOBJECT_IMAGE_RESOURCE_H
|
||||
#define PDF_PAGE_XOBJECT_IMAGE_RESOURCE_H
|
||||
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
|
||||
#include <parse/utils/color/icc_utils.h>
|
||||
#include <parse/utils/jpeg/jpeg_utils.h>
|
||||
#include <parse/qpdf/qpdf_compat.h>
|
||||
|
||||
namespace pdflib
|
||||
{
|
||||
|
||||
namespace detail
|
||||
{
|
||||
/// Map a 4-byte ICC colour-space signature to its number of colour components.
///
/// Recognised signatures: "GRAY" (1), "RGB " (3), "CMYK" (4) and the generic
/// "nCLR" family, where n is '2'..'9' (2..9) or 'A'..'F' (10..15).
///
/// @param sig Pointer to at least 4 readable bytes (not necessarily
///            NUL-terminated); may be null.
/// @return The component count, or 0 for a null pointer or an unrecognised
///         signature.
inline int icc_signature_to_components(char const* sig)
{
  if(sig == nullptr)
    {
      return 0;
    }

  if(std::memcmp(sig, "GRAY", 4) == 0) { return 1; }
  if(std::memcmp(sig, "RGB ", 4) == 0) { return 3; }
  if(std::memcmp(sig, "CMYK", 4) == 0) { return 4; }

  // generic n-colour signatures: first byte encodes the count as a hex digit
  if(sig[1] == 'C' and sig[2] == 'L' and sig[3] == 'R')
    {
      if(sig[0] >= '2' and sig[0] <= '9') { return sig[0] - '0'; }
      if(sig[0] >= 'A' and sig[0] <= 'F') { return 10 + (sig[0] - 'A'); }
    }

  return 0;
}
|
||||
|
||||
/// Infer the number of colour components of an ICC profile from its header.
///
/// Reads the stream data of `icc_stream` and interprets the four bytes at
/// offset 16 as the data colour-space signature.
///
/// @param icc_stream Object expected to be the ICC profile stream.
/// @param context    Prefix used in log messages to identify the caller.
/// @return The component count, or 0 when the object is not a stream, the
///         profile is too small, the signature is not recognised, or reading
///         the stream throws a std::exception (which is logged, not rethrown).
inline int infer_icc_components_from_profile(QPDFObjectHandle icc_stream,
                                             std::string const& context)
{
  // the signature occupies bytes [16, 20) of the profile
  constexpr std::size_t signature_offset = 16;
  constexpr std::size_t min_profile_size = 20;

  if(not icc_stream.isStream())
    {
      LOG_S(WARNING) << context << ": ICC object is not a stream";
      return 0;
    }

  try
    {
      auto profile = to_shared_ptr(icc_stream.getStreamData());
      if(not profile or profile->getSize() < min_profile_size)
        {
          LOG_S(WARNING) << context << ": ICC profile too small to inspect";
          return 0;
        }

      auto const* bytes = reinterpret_cast<std::uint8_t const*>(profile->getBuffer());
      int const n = icc_signature_to_components(
        reinterpret_cast<char const*>(bytes + signature_offset));

      if(n > 0)
        {
          LOG_S(INFO) << context << ": inferred ICC components from profile header: N=" << n;
        }
      else
        {
          LOG_S(WARNING) << context << ": unsupported ICC data color space signature";
        }
      return n;
    }
  catch(std::exception const& e)
    {
      LOG_S(WARNING) << context << ": failed to inspect ICC profile stream: " << e.what();
      return 0;
    }
}
|
||||
|
||||
/// Map a process colorant name to its channel index in CMYK order.
///
/// @param name Colorant name including the leading slash (e.g. "/Cyan").
/// @return 0 for "/Cyan", 1 for "/Magenta", 2 for "/Yellow", 3 for "/Black",
///         and -1 for any other name.
inline int cmyk_process_component_index(std::string const& name)
{
  if(name == "/Cyan")    { return 0; }
  if(name == "/Magenta") { return 1; }
  if(name == "/Yellow")  { return 2; }
  if(name == "/Black")   { return 3; }
  return -1;
}

/// Tell whether every colorant name is one of the four CMYK process colorants.
///
/// @param names Colorant names to check.
/// @return true when `names` is non-empty and each entry is "/Cyan",
///         "/Magenta", "/Yellow" or "/Black"; false otherwise (including for
///         an empty list).
inline bool device_n_names_are_process_cmyk_subset(
  std::vector<std::string> const& names)
{
  if(names.empty())
    {
      return false;
    }

  for(auto const& name : names)
    {
      if(cmyk_process_component_index(name) < 0)
        {
          return false;
        }
    }

  return true;
}

/// Expand a palette with one byte per named colorant into 4-byte CMYK entries.
///
/// Each source entry holds `names.size()` bytes. Every byte is copied to the
/// CMYK channel selected by the corresponding name; channels without a source
/// colorant stay 0, and bytes whose name is not a process colorant are dropped.
///
/// @param palette Source palette bytes; may be null.
/// @param names   Colorant name of each source component, in palette order.
/// @return A new palette of `entry_count * 4` bytes, or nullptr when `palette`
///         is null, `names` is empty, or the palette size is not a multiple of
///         `names.size()`.
inline std::shared_ptr<std::vector<uint8_t>> expand_device_n_palette_to_cmyk(
  std::shared_ptr<std::vector<uint8_t>> const& palette,
  std::vector<std::string> const& names)
{
  if(not palette or names.empty())
    {
      return nullptr;
    }

  const std::size_t src_components = names.size();
  if((palette->size() % src_components) != 0)
    {
      return nullptr;
    }

  // The name -> channel mapping is the same for every entry: resolve it once.
  std::vector<int> dst_components(src_components);
  for(std::size_t i = 0; i < src_components; ++i)
    {
      dst_components[i] = cmyk_process_component_index(names[i]);
    }

  const std::size_t entry_count = palette->size() / src_components;
  auto expanded = std::make_shared<std::vector<uint8_t>>();
  expanded->assign(entry_count * 4u, 0u);

  for(std::size_t entry = 0; entry < entry_count; ++entry)
    {
      const std::size_t src_offset = entry * src_components;
      const std::size_t dst_offset = entry * 4u;
      for(std::size_t i = 0; i < src_components; ++i)
        {
          if(dst_components[i] >= 0)
            {
              (*expanded)[dst_offset + static_cast<std::size_t>(dst_components[i])] =
                (*palette)[src_offset + i];
            }
        }
    }

  return expanded;
}
|
||||
}
|
||||
|
||||
template<>
|
||||
class pdf_resource<PAGE_XOBJECT_IMAGE>
|
||||
{
|
||||
@@ -31,9 +156,13 @@ namespace pdflib
|
||||
int get_bits_per_component() const;
|
||||
std::string get_color_space() const;
|
||||
int get_icc_components() const;
|
||||
int get_device_n_components() const;
|
||||
std::vector<std::string> get_device_n_names() const;
|
||||
int get_indexed_hival() const;
|
||||
std::string get_indexed_base_cs() const;
|
||||
std::shared_ptr<std::vector<uint8_t>> get_indexed_palette() const;
|
||||
std::vector<std::string> get_indexed_base_device_n_names() const;
|
||||
bool get_indexed_base_device_n_single_black() const;
|
||||
std::string get_intent() const;
|
||||
std::vector<std::string> get_filters() const;
|
||||
|
||||
@@ -91,9 +220,15 @@ namespace pdflib
|
||||
int bits_per_component;
|
||||
std::string color_space;
|
||||
int icc_components = 0; // number of color components from /ICCBased /N entry; 0 if not ICCBased
|
||||
int device_n_components = 0; // number of components from /DeviceN names array; 0 if not DeviceN
|
||||
std::vector<std::string> device_n_names; // names from /DeviceN colorant array
|
||||
int indexed_hival = -1; // hival from /Indexed color space; -1 if not Indexed
|
||||
std::string indexed_base_cs; // base color space name for /Indexed (e.g. "/DeviceRGB")
|
||||
std::shared_ptr<std::vector<uint8_t>> indexed_palette; // raw palette bytes: (hival+1)*ncomps bytes
|
||||
std::shared_ptr<std::vector<uint8_t>> indexed_base_icc_profile;
|
||||
int indexed_base_icc_components = 0;
|
||||
std::vector<std::string> indexed_base_device_n_names;
|
||||
bool indexed_base_device_n_single_black = false;
|
||||
std::string intent;
|
||||
std::vector<std::string> image_filters;
|
||||
|
||||
@@ -246,6 +381,8 @@ namespace pdflib
|
||||
else
|
||||
{
|
||||
LOG_S(WARNING) << "ICCBased stream missing /N entry";
|
||||
icc_components = detail::infer_icc_components_from_profile(
|
||||
icc_stream, "ICCBased");
|
||||
}
|
||||
}
|
||||
else
|
||||
@@ -253,6 +390,28 @@ namespace pdflib
|
||||
LOG_S(WARNING) << "ICCBased: second array element is not a stream";
|
||||
}
|
||||
}
|
||||
else if(name_obj.isName() and name_obj.getName() == "/DeviceN")
|
||||
{
|
||||
device_n_names.clear();
|
||||
auto names_obj = qpdf_cs.getArrayItem(1);
|
||||
if(names_obj.isArray())
|
||||
{
|
||||
device_n_components = names_obj.getArrayNItems();
|
||||
for(int i = 0; i < names_obj.getArrayNItems(); ++i)
|
||||
{
|
||||
auto name = names_obj.getArrayItem(i);
|
||||
if(name.isName())
|
||||
{
|
||||
device_n_names.push_back(name.getName());
|
||||
}
|
||||
}
|
||||
LOG_S(INFO) << "DeviceN color space: N=" << device_n_components;
|
||||
}
|
||||
else
|
||||
{
|
||||
LOG_S(WARNING) << "DeviceN color space: names array missing";
|
||||
}
|
||||
}
|
||||
else if(name_obj.isName() and name_obj.getName() == "/Indexed"
|
||||
and qpdf_cs.getArrayNItems() >= 3)
|
||||
{
|
||||
@@ -260,6 +419,10 @@ namespace pdflib
|
||||
|
||||
// base color space
|
||||
auto base_obj = qpdf_cs.getArrayItem(1);
|
||||
indexed_base_device_n_single_black = false;
|
||||
indexed_base_icc_profile.reset();
|
||||
indexed_base_icc_components = 0;
|
||||
indexed_base_device_n_names.clear();
|
||||
if(base_obj.isName())
|
||||
{
|
||||
indexed_base_cs = base_obj.getName();
|
||||
@@ -272,10 +435,21 @@ namespace pdflib
|
||||
auto icc_stream = base_obj.getArrayItem(1);
|
||||
if(icc_stream.isStream())
|
||||
{
|
||||
auto profile_buf = to_shared_ptr(icc_stream.getStreamData());
|
||||
if(profile_buf and profile_buf->getSize() > 0)
|
||||
{
|
||||
auto const* ptr = reinterpret_cast<const uint8_t*>(
|
||||
profile_buf->getBuffer());
|
||||
indexed_base_icc_profile =
|
||||
std::make_shared<std::vector<uint8_t>>(
|
||||
ptr, ptr + profile_buf->getSize());
|
||||
}
|
||||
|
||||
auto icc_dict = icc_stream.getDict();
|
||||
if(icc_dict.hasKey("/N") and icc_dict.getKey("/N").isInteger())
|
||||
{
|
||||
const int n = icc_dict.getKey("/N").getIntValue();
|
||||
indexed_base_icc_components = n;
|
||||
if(n == 1) { indexed_base_cs = "/DeviceGray"; }
|
||||
else if(n == 3) { indexed_base_cs = "/DeviceRGB"; }
|
||||
else if(n == 4) { indexed_base_cs = "/DeviceCMYK"; }
|
||||
@@ -290,6 +464,12 @@ namespace pdflib
|
||||
else
|
||||
{
|
||||
LOG_S(WARNING) << "Indexed ICCBased base missing /N entry";
|
||||
const int n = detail::infer_icc_components_from_profile(
|
||||
icc_stream, "Indexed ICCBased base");
|
||||
indexed_base_icc_components = n;
|
||||
if(n == 1) { indexed_base_cs = "/DeviceGray"; }
|
||||
else if(n == 3) { indexed_base_cs = "/DeviceRGB"; }
|
||||
else if(n == 4) { indexed_base_cs = "/DeviceCMYK"; }
|
||||
}
|
||||
}
|
||||
else
|
||||
@@ -297,6 +477,55 @@ namespace pdflib
|
||||
LOG_S(WARNING) << "Indexed ICCBased base: second array element is not a stream";
|
||||
}
|
||||
}
|
||||
else if(base_name.isName() and base_name.getName() == "/DeviceN")
|
||||
{
|
||||
auto names_obj = base_obj.getArrayItem(1);
|
||||
if(names_obj.isArray())
|
||||
{
|
||||
std::vector<std::string> nested_names;
|
||||
for(int i = 0; i < names_obj.getArrayNItems(); ++i)
|
||||
{
|
||||
auto name = names_obj.getArrayItem(i);
|
||||
if(name.isName())
|
||||
{
|
||||
nested_names.push_back(name.getName());
|
||||
}
|
||||
}
|
||||
indexed_base_device_n_names = nested_names;
|
||||
|
||||
const int nested_n = static_cast<int>(nested_names.size());
|
||||
const bool single_black =
|
||||
nested_n == 1
|
||||
and nested_names[0] == "/Black";
|
||||
const bool process_cmyk_subset =
|
||||
detail::device_n_names_are_process_cmyk_subset(nested_names);
|
||||
indexed_base_device_n_single_black = single_black;
|
||||
|
||||
if(single_black) { indexed_base_cs = "/DeviceGray"; }
|
||||
else if(process_cmyk_subset)
|
||||
{
|
||||
indexed_base_cs = "/DeviceCMYK";
|
||||
LOG_S(INFO) << "Indexed DeviceN base uses process CMYK subset; "
|
||||
<< "will expand palette to CMYK";
|
||||
}
|
||||
else if(nested_n == 3)
|
||||
{
|
||||
indexed_base_cs = "/DeviceRGB";
|
||||
}
|
||||
else if(nested_n == 4)
|
||||
{
|
||||
indexed_base_cs = "/DeviceCMYK";
|
||||
}
|
||||
else
|
||||
{
|
||||
indexed_base_cs = "/DeviceN";
|
||||
LOG_S(WARNING) << "Indexed DeviceN base has unsupported component layout N="
|
||||
<< nested_n;
|
||||
}
|
||||
LOG_S(INFO) << "Indexed DeviceN base: N=" << nested_n
|
||||
<< " -> " << indexed_base_cs;
|
||||
}
|
||||
}
|
||||
else if(base_name.isName())
|
||||
{
|
||||
indexed_base_cs = base_name.getName();
|
||||
@@ -350,6 +579,52 @@ namespace pdflib
|
||||
{
|
||||
LOG_S(WARNING) << "Indexed color space: unrecognized lookup table type";
|
||||
}
|
||||
|
||||
if(indexed_base_cs == "/DeviceCMYK"
|
||||
and not indexed_base_device_n_names.empty()
|
||||
and detail::device_n_names_are_process_cmyk_subset(
|
||||
indexed_base_device_n_names))
|
||||
{
|
||||
auto expanded =
|
||||
detail::expand_device_n_palette_to_cmyk(indexed_palette,
|
||||
indexed_base_device_n_names);
|
||||
if(expanded)
|
||||
{
|
||||
indexed_palette = std::move(expanded);
|
||||
LOG_S(INFO) << "Indexed DeviceN palette expanded to CMYK: "
|
||||
<< indexed_palette->size() << " bytes";
|
||||
}
|
||||
else
|
||||
{
|
||||
LOG_S(WARNING) << "Indexed DeviceN palette expansion to CMYK failed";
|
||||
}
|
||||
}
|
||||
|
||||
if(indexed_base_icc_profile
|
||||
and not indexed_base_icc_profile->empty()
|
||||
and indexed_base_icc_components > 0
|
||||
and indexed_palette
|
||||
and not indexed_palette->empty())
|
||||
{
|
||||
auto rgb_palette = icc::transform_palette_to_rgb(
|
||||
*indexed_palette,
|
||||
indexed_base_icc_components,
|
||||
*indexed_base_icc_profile);
|
||||
if(not rgb_palette.empty())
|
||||
{
|
||||
indexed_palette = std::make_shared<std::vector<uint8_t>>(
|
||||
std::move(rgb_palette));
|
||||
indexed_base_cs = "/DeviceRGB";
|
||||
indexed_base_device_n_names.clear();
|
||||
indexed_base_device_n_single_black = false;
|
||||
LOG_S(INFO) << "Indexed ICCBased palette converted to RGB: "
|
||||
<< indexed_palette->size() << " bytes";
|
||||
}
|
||||
else
|
||||
{
|
||||
LOG_S(WARNING) << "Indexed ICCBased palette RGB conversion failed";
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -443,6 +718,22 @@ namespace pdflib
|
||||
}
|
||||
decode_present = not decode_array.empty();
|
||||
}
|
||||
else if(device_n_components > 0)
|
||||
{
|
||||
const bool single_black =
|
||||
device_n_components == 1
|
||||
and device_n_names.size() == 1
|
||||
and device_n_names[0] == "/Black";
|
||||
LOG_S(INFO) << "no `/Decode` found: using default for DeviceN N="
|
||||
<< device_n_components
|
||||
<< " single_black=" << (single_black ? "true" : "false");
|
||||
for(int i = 0; i < device_n_components; ++i)
|
||||
{
|
||||
decode_array.push_back(single_black ? 1.0 : 0.0);
|
||||
decode_array.push_back(single_black ? 0.0 : 1.0);
|
||||
}
|
||||
decode_present = not decode_array.empty();
|
||||
}
|
||||
else if(indexed_hival >= 0)
|
||||
{
|
||||
// Indexed: default decode is [0, hival] (one component — the palette index)
|
||||
@@ -754,6 +1045,16 @@ namespace pdflib
|
||||
return icc_components;
|
||||
}
|
||||
|
||||
// Returns device_n_components, which the /DeviceN color-space branch sets to
// the number of items in the colorant names array.
int pdf_resource<PAGE_XOBJECT_IMAGE>::get_device_n_components() const
|
||||
{
|
||||
return device_n_components;
|
||||
}
|
||||
|
||||
// Returns a copy of the colorant names collected from a /DeviceN color space.
// Only array items that are PDF names are collected, so this list can be
// shorter than get_device_n_components().
std::vector<std::string> pdf_resource<PAGE_XOBJECT_IMAGE>::get_device_n_names() const
|
||||
{
|
||||
return device_n_names;
|
||||
}
|
||||
|
||||
int pdf_resource<PAGE_XOBJECT_IMAGE>::get_indexed_hival() const
|
||||
{
|
||||
return indexed_hival;
|
||||
@@ -769,6 +1070,16 @@ namespace pdflib
|
||||
return indexed_palette;
|
||||
}
|
||||
|
||||
// Returns a copy of the colorant names of a /DeviceN base nested inside an
// /Indexed color space. The list is cleared at the start of /Indexed parsing
// and again after a successful ICC palette conversion to RGB, so it is empty
// whenever the base is not DeviceN.
std::vector<std::string> pdf_resource<PAGE_XOBJECT_IMAGE>::get_indexed_base_device_n_names() const
|
||||
{
|
||||
return indexed_base_device_n_names;
|
||||
}
|
||||
|
||||
// True when the /Indexed base is a /DeviceN space whose only colorant name is
// "/Black"; the bitmap expansion uses this flag to invert the 1-channel output.
bool pdf_resource<PAGE_XOBJECT_IMAGE>::get_indexed_base_device_n_single_black() const
|
||||
{
|
||||
return indexed_base_device_n_single_black;
|
||||
}
|
||||
|
||||
std::string pdf_resource<PAGE_XOBJECT_IMAGE>::get_intent() const
|
||||
{
|
||||
return intent;
|
||||
|
||||
+385
-44
@@ -3,7 +3,11 @@
|
||||
#ifndef PDF_BITMAP_STATE_H
|
||||
#define PDF_BITMAP_STATE_H
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
|
||||
#include <parse/utils/ccitt/ccitt_utils.h>
|
||||
#include <parse/utils/jpx/jpx_utils.h>
|
||||
#include <third_party/pdfium_jbig2.h>
|
||||
|
||||
namespace pdflib
|
||||
@@ -148,12 +152,17 @@ namespace pdflib
|
||||
image.ccitt_k = xobj.get_ccitt_k();
|
||||
image.ccitt_black_is_1 = xobj.get_ccitt_black_is_1();
|
||||
image.icc_components = xobj.get_icc_components();
|
||||
image.device_n_components = xobj.get_device_n_components();
|
||||
image.device_n_names = xobj.get_device_n_names();
|
||||
image.jbig2_globals_data = xobj.get_jbig2_globals_data();
|
||||
|
||||
// propagate /Indexed color space data
|
||||
image.indexed_hival = xobj.get_indexed_hival();
|
||||
image.indexed_base_cs = xobj.get_indexed_base_cs();
|
||||
image.indexed_palette = xobj.get_indexed_palette();
|
||||
image.indexed_base_device_n_names = xobj.get_indexed_base_device_n_names();
|
||||
image.indexed_base_device_n_single_black =
|
||||
xobj.get_indexed_base_device_n_single_black();
|
||||
|
||||
// propagate graphics state
|
||||
image.has_graphics_state = true;
|
||||
@@ -171,8 +180,229 @@ namespace pdflib
|
||||
std::shared_ptr<std::vector<uint8_t>> pixel_data;
|
||||
std::array<int, 3> pixel_shape = {0, 0, 0};
|
||||
pixel_format fmt = PIXEL_FORMAT_UNKNOWN;
|
||||
cmyk_convention cmyk_conv = CMYK_CONVENTION_UNKNOWN;
|
||||
|
||||
int channels = 0;
|
||||
auto has_default_adobe_cmyk_decode = [&](std::vector<double> const& decode_array) -> bool
{
  // Detects the inverted CMYK decode array [1 0 1 0 1 0 1 0]: the first eight
  // entries must alternate 1.0 / 0.0 (within 1e-12). Extra entries are ignored.
  constexpr std::size_t required_entries = 8;
  constexpr double tolerance = 1e-12;

  if(decode_array.size() < required_entries)
    {
      return false;
    }

  for(std::size_t k = 0; k < required_entries; ++k)
    {
      // even positions are the range minimum (1.0), odd positions the maximum (0.0)
      const double wanted = (k % 2 == 0) ? 1.0 : 0.0;
      if(std::abs(decode_array[k] - wanted) > tolerance)
        {
          return false;
        }
    }
  return true;
};
|
||||
auto apply_decode_to_u8_samples = [&](std::shared_ptr<std::vector<uint8_t>>& dst,
|
||||
int ncomps) -> void
|
||||
{
|
||||
if(not dst or ncomps <= 0 or not image.decode_present or image.decode_array.size() < 2)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
const int pair_count = static_cast<int>(image.decode_array.size() / 2);
|
||||
for(size_t i = 0; i < dst->size(); ++i)
|
||||
{
|
||||
const int comp = static_cast<int>(i % static_cast<size_t>(ncomps));
|
||||
if(comp < pair_count)
|
||||
{
|
||||
(*dst)[i] = jpeg::apply_decode_component(
|
||||
(*dst)[i],
|
||||
image.decode_array[2 * comp + 0],
|
||||
image.decode_array[2 * comp + 1]);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
auto expand_indexed_samples = [&](int ncomps,
|
||||
const uint8_t* indices,
|
||||
size_t n_indices,
|
||||
int w,
|
||||
int h) -> bool
|
||||
{
|
||||
if(ncomps <= 0 or not image.indexed_palette or image.indexed_palette->empty() or not indices)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
const auto& palette = *image.indexed_palette;
|
||||
auto expanded = std::make_shared<std::vector<uint8_t>>();
|
||||
expanded->reserve(static_cast<size_t>(w) * h * ncomps);
|
||||
|
||||
for(size_t i = 0; i < n_indices; ++i)
|
||||
{
|
||||
int idx = static_cast<int>(indices[i]);
|
||||
if(image.indexed_hival >= 0 and idx > image.indexed_hival)
|
||||
{
|
||||
idx = image.indexed_hival;
|
||||
}
|
||||
|
||||
const size_t palette_offset = static_cast<size_t>(idx) * ncomps;
|
||||
if(palette_offset + ncomps <= palette.size())
|
||||
{
|
||||
for(int c = 0; c < ncomps; ++c)
|
||||
{
|
||||
expanded->push_back(palette[palette_offset + c]);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for(int c = 0; c < ncomps; ++c)
|
||||
{
|
||||
expanded->push_back(0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pixel_data = std::move(expanded);
|
||||
pixel_shape = {h, w, ncomps};
|
||||
channels = ncomps;
|
||||
if(fmt == PIXEL_FORMAT_CMYK
|
||||
and detail::device_n_names_are_process_cmyk_subset(image.indexed_base_device_n_names))
|
||||
{
|
||||
cmyk_conv = CMYK_CONVENTION_PROCESS;
|
||||
}
|
||||
|
||||
if(image.indexed_base_device_n_single_black and ncomps == 1)
|
||||
{
|
||||
for(auto& sample : *pixel_data)
|
||||
{
|
||||
sample = static_cast<uint8_t>(255 - sample);
|
||||
}
|
||||
LOG_S(INFO) << "bitmap: inverted Indexed single-Black DeviceN palette "
|
||||
<< "for xobject_key=" << image.xobject_key;
|
||||
}
|
||||
return true;
|
||||
};
|
||||
|
||||
auto unpack_subbyte_samples_to_u8 =
|
||||
[&](std::shared_ptr<Buffer> const& src,
|
||||
int w,
|
||||
int h,
|
||||
int ncomps,
|
||||
int bits_per_component,
|
||||
std::vector<double> const& decode_array) -> bool
|
||||
{
|
||||
// QPDF's getStreamData() decodes the filter chain, but it does not
|
||||
// expand sub-8-bit image samples into one byte per component. For a
|
||||
// `/FlateDecode` image with `/BitsPerComponent 1`, the decoded stream
|
||||
// therefore still contains packed bits (with producer-dependent row
|
||||
// padding), while the renderer expects a dense 8-bit-per-component
|
||||
// buffer. This helper performs that expansion and applies the image's
|
||||
// `/Decode` mapping while unpacking.
|
||||
if(not src or src->getSize() == 0 or w <= 0 or h <= 0 or ncomps <= 0)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
if(bits_per_component <= 0 or bits_per_component >= 8)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
const std::size_t row_bits =
|
||||
static_cast<std::size_t>(w) * static_cast<std::size_t>(ncomps)
|
||||
* static_cast<std::size_t>(bits_per_component);
|
||||
const std::size_t min_row_bytes = (row_bits + 7u) / 8u;
|
||||
if(min_row_bytes == 0)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
const std::size_t src_size = src->getSize();
|
||||
const std::size_t min_total = min_row_bytes * static_cast<std::size_t>(h);
|
||||
if(src_size < min_total)
|
||||
{
|
||||
LOG_S(WARNING) << "bitmap: packed decoded_stream_data too small ("
|
||||
<< src_size << " < " << min_total
|
||||
<< ") for sub-byte image xobject_key=" << image.xobject_key
|
||||
<< " width=" << w
|
||||
<< " height=" << h
|
||||
<< " channels=" << ncomps
|
||||
<< " bpc=" << bits_per_component;
|
||||
return false;
|
||||
}
|
||||
|
||||
// Per the PDF spec, rows are exactly min_row_bytes wide. QPDF's
|
||||
// getStreamData() may return more bytes than width*height*bpc/8 (e.g.
|
||||
// trailing data), but those extra bytes are not per-row padding and must
|
||||
// not be used to inflate the row stride.
|
||||
const std::size_t row_stride = min_row_bytes;
|
||||
|
||||
const std::uint32_t sample_max = (1u << bits_per_component) - 1u;
|
||||
auto decode_sample =
|
||||
[&](int component_index, std::uint32_t raw_sample) -> std::uint8_t
|
||||
{
|
||||
const int pair_count = static_cast<int>(decode_array.size() / 2);
|
||||
if(component_index < pair_count)
|
||||
{
|
||||
const double dmin = decode_array[2 * component_index + 0];
|
||||
const double dmax = decode_array[2 * component_index + 1];
|
||||
const double norm =
|
||||
static_cast<double>(raw_sample) / static_cast<double>(sample_max);
|
||||
const double decoded = dmin + norm * (dmax - dmin);
|
||||
const double clamped = std::clamp(decoded, 0.0, 1.0);
|
||||
return static_cast<std::uint8_t>(std::lround(clamped * 255.0));
|
||||
}
|
||||
|
||||
// Absent /Decode entry: fall back to PDF's identity mapping.
|
||||
const double norm =
|
||||
static_cast<double>(raw_sample) / static_cast<double>(sample_max);
|
||||
return static_cast<std::uint8_t>(std::lround(norm * 255.0));
|
||||
};
|
||||
|
||||
const auto* bytes = reinterpret_cast<const std::uint8_t*>(src->getBuffer());
|
||||
auto expanded = std::make_shared<std::vector<uint8_t>>();
|
||||
expanded->reserve(static_cast<std::size_t>(w) * h * ncomps);
|
||||
|
||||
for(int row = 0; row < h; ++row)
|
||||
{
|
||||
const auto* row_ptr = bytes + static_cast<std::size_t>(row) * row_stride;
|
||||
std::size_t bit_offset = 0;
|
||||
for(int col = 0; col < w; ++col)
|
||||
{
|
||||
for(int comp = 0; comp < ncomps; ++comp)
|
||||
{
|
||||
std::uint32_t raw_sample = 0u;
|
||||
for(int bit = 0; bit < bits_per_component; ++bit)
|
||||
{
|
||||
const std::size_t absolute_bit = bit_offset + static_cast<std::size_t>(bit);
|
||||
const std::size_t byte_index = absolute_bit / 8u;
|
||||
const int bit_in_byte = 7 - static_cast<int>(absolute_bit % 8u);
|
||||
const std::uint8_t byte = row_ptr[byte_index];
|
||||
raw_sample = (raw_sample << 1u)
|
||||
| static_cast<std::uint32_t>((byte >> bit_in_byte) & 1u);
|
||||
}
|
||||
expanded->push_back(decode_sample(comp, raw_sample));
|
||||
bit_offset += static_cast<std::size_t>(bits_per_component);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pixel_data = std::move(expanded);
|
||||
pixel_shape = {h, w, ncomps};
|
||||
channels = ncomps;
|
||||
|
||||
LOG_S(INFO) << "bitmap: unpacked sub-byte decoded_stream_data"
|
||||
<< " for xobject_key=" << image.xobject_key
|
||||
<< " width=" << w
|
||||
<< " height=" << h
|
||||
<< " channels=" << ncomps
|
||||
<< " bpc=" << bits_per_component
|
||||
<< " row_stride=" << row_stride
|
||||
<< " output_size=" << pixel_data->size();
|
||||
return true;
|
||||
};
|
||||
|
||||
if(image.image_mask)
|
||||
{
|
||||
fmt = PIXEL_FORMAT_GRAY; channels = 1;
|
||||
@@ -215,6 +445,30 @@ namespace pdflib
|
||||
<< " for xobject_key=" << image.xobject_key;
|
||||
}
|
||||
}
|
||||
else if(image.color_space.find("/DeviceN") != std::string::npos
|
||||
and image.device_n_components > 0)
|
||||
{
|
||||
LOG_S(INFO) << "bitmap: DeviceN color space with N=" << image.device_n_components
|
||||
<< " for xobject_key=" << image.xobject_key;
|
||||
if(image.device_n_components == 1)
|
||||
{
|
||||
fmt = PIXEL_FORMAT_GRAY; channels = 1;
|
||||
}
|
||||
else if(image.device_n_components == 3)
|
||||
{
|
||||
fmt = PIXEL_FORMAT_RGB; channels = 3;
|
||||
}
|
||||
else if(image.device_n_components == 4)
|
||||
{
|
||||
fmt = PIXEL_FORMAT_CMYK; channels = 4;
|
||||
}
|
||||
else
|
||||
{
|
||||
LOG_S(WARNING) << "bitmap: DeviceN with unsupported N="
|
||||
<< image.device_n_components
|
||||
<< " for xobject_key=" << image.xobject_key;
|
||||
}
|
||||
}
|
||||
else if(image.indexed_palette and not image.indexed_palette->empty())
|
||||
{
|
||||
// /Indexed: expand palette indices into base color space pixels.
|
||||
@@ -239,51 +493,22 @@ namespace pdflib
|
||||
<< "' for xobject_key=" << image.xobject_key;
|
||||
}
|
||||
|
||||
channels = ncomps;
|
||||
|
||||
if(ncomps > 0 and image.decoded_stream_data and image.decoded_stream_data->getSize() > 0)
|
||||
{
|
||||
const int w = image.image_width;
|
||||
const int h = image.image_height;
|
||||
const auto& palette = *image.indexed_palette;
|
||||
const auto* indices = reinterpret_cast<const uint8_t*>(
|
||||
image.decoded_stream_data->getBuffer());
|
||||
const size_t n_indices = image.decoded_stream_data->getSize();
|
||||
|
||||
auto expanded = std::make_shared<std::vector<uint8_t>>();
|
||||
expanded->reserve(static_cast<size_t>(w) * h * ncomps);
|
||||
|
||||
for(size_t i = 0; i < n_indices; ++i)
|
||||
if(expand_indexed_samples(ncomps, indices, n_indices, w, h))
|
||||
{
|
||||
int idx = static_cast<int>(indices[i]);
|
||||
// clamp to hival
|
||||
if(image.indexed_hival >= 0 and idx > image.indexed_hival)
|
||||
{
|
||||
idx = image.indexed_hival;
|
||||
}
|
||||
const size_t palette_offset = static_cast<size_t>(idx) * ncomps;
|
||||
if(palette_offset + ncomps <= palette.size())
|
||||
{
|
||||
for(int c = 0; c < ncomps; ++c)
|
||||
{
|
||||
expanded->push_back(palette[palette_offset + c]);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// out-of-range index: fill with zeros
|
||||
for(int c = 0; c < ncomps; ++c)
|
||||
{
|
||||
expanded->push_back(0);
|
||||
}
|
||||
}
|
||||
LOG_S(INFO) << "bitmap: expanded Indexed palette for xobject_key="
|
||||
<< image.xobject_key
|
||||
<< " (" << n_indices << " indices -> "
|
||||
<< pixel_data->size() << " bytes, ncomps=" << ncomps << ")";
|
||||
}
|
||||
|
||||
pixel_data = std::move(expanded);
|
||||
pixel_shape = {h, w, ncomps};
|
||||
channels = ncomps; // mark as handled
|
||||
LOG_S(INFO) << "bitmap: expanded Indexed palette for xobject_key="
|
||||
<< image.xobject_key
|
||||
<< " (" << n_indices << " indices → "
|
||||
<< pixel_data->size() << " bytes, ncomps=" << ncomps << ")";
|
||||
}
|
||||
else if(ncomps > 0)
|
||||
{
|
||||
@@ -313,25 +538,44 @@ namespace pdflib
|
||||
"/DCTDecode") != image.filters.end();
|
||||
const bool has_flate = std::find(image.filters.begin(), image.filters.end(),
|
||||
"/FlateDecode") != image.filters.end();
|
||||
const bool has_jpx = std::find(image.filters.begin(), image.filters.end(),
|
||||
"/JPXDecode") != image.filters.end();
|
||||
|
||||
if (image.decoded_stream_data and image.decoded_stream_data->getSize() > 0)
|
||||
{
|
||||
const int w = image.image_width;
|
||||
const int h = image.image_height;
|
||||
const size_t expected = static_cast<size_t>(w) * h * channels;
|
||||
const auto src = image.decoded_stream_data;
|
||||
|
||||
if (src->getSize() >= expected)
|
||||
if(image.bits_per_component > 0 and image.bits_per_component < 8)
|
||||
{
|
||||
const auto* raw = reinterpret_cast<const uint8_t*>(src->getBuffer());
|
||||
pixel_data = std::make_shared<std::vector<uint8_t>>(raw, raw + expected);
|
||||
pixel_shape = {h, w, channels};
|
||||
if(not unpack_subbyte_samples_to_u8(src,
|
||||
w,
|
||||
h,
|
||||
channels,
|
||||
image.bits_per_component,
|
||||
image.decode_array))
|
||||
{
|
||||
LOG_S(WARNING) << "bitmap: failed to unpack sub-byte decoded_stream_data "
|
||||
<< "for xobject_key=" << image.xobject_key;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
LOG_S(WARNING) << "bitmap: decoded_stream_data too small ("
|
||||
<< src->getSize() << " < " << expected
|
||||
<< ") for xobject_key=" << image.xobject_key;
|
||||
const size_t expected = static_cast<size_t>(w) * h * channels;
|
||||
if (src->getSize() >= expected)
|
||||
{
|
||||
const auto* raw = reinterpret_cast<const uint8_t*>(src->getBuffer());
|
||||
pixel_data = std::make_shared<std::vector<uint8_t>>(raw, raw + expected);
|
||||
apply_decode_to_u8_samples(pixel_data, channels);
|
||||
pixel_shape = {h, w, channels};
|
||||
}
|
||||
else
|
||||
{
|
||||
LOG_S(WARNING) << "bitmap: decoded_stream_data too small ("
|
||||
<< src->getSize() << " < " << expected
|
||||
<< ") for xobject_key=" << image.xobject_key;
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (has_dct and image.raw_stream_data and image.raw_stream_data->getSize() > 0)
|
||||
@@ -387,6 +631,10 @@ namespace pdflib
|
||||
else if(decoded_channels == 4)
|
||||
{
|
||||
fmt = PIXEL_FORMAT_CMYK;
|
||||
if(image.decode_present and has_default_adobe_cmyk_decode(image.decode_array))
|
||||
{
|
||||
cmyk_conv = CMYK_CONVENTION_ADOBE_INVERTED;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -404,6 +652,98 @@ namespace pdflib
|
||||
<< "for xobject_key=" << image.xobject_key;
|
||||
}
|
||||
}
|
||||
else if (has_jpx and image.raw_stream_data and image.raw_stream_data->getSize() > 0)
|
||||
{
|
||||
LOG_S(INFO) << "bitmap: decoded_stream_data unavailable for /JPXDecode image, "
|
||||
<< "decoding JPEG2000 via OpenJPEG "
|
||||
<< "for xobject_key=" << image.xobject_key;
|
||||
|
||||
auto decoded = jpx::decode_jpx_to_raw_pixels(
|
||||
reinterpret_cast<uint8_t const*>(image.raw_stream_data->getBuffer()),
|
||||
static_cast<std::size_t>(image.raw_stream_data->getSize()));
|
||||
|
||||
if(not decoded.empty())
|
||||
{
|
||||
if(image.indexed_palette and not image.indexed_palette->empty())
|
||||
{
|
||||
LOG_S(INFO) << "bitmap: Indexed JPX fallback metadata "
|
||||
<< "xobject_key=" << image.xobject_key
|
||||
<< " base_cs=" << image.indexed_base_cs
|
||||
<< " expected_components=" << channels
|
||||
<< " palette_bytes=" << image.indexed_palette->size()
|
||||
<< " decoded_components=" << decoded.components;
|
||||
|
||||
if(decoded.components == 1
|
||||
and expand_indexed_samples(channels,
|
||||
decoded.pixels.data(),
|
||||
decoded.pixels.size(),
|
||||
decoded.width,
|
||||
decoded.height))
|
||||
{
|
||||
LOG_S(INFO) << "bitmap: OpenJPEG decode succeeded for Indexed image "
|
||||
<< "xobject_key=" << image.xobject_key
|
||||
<< " actual_shape=" << decoded.height << "x"
|
||||
<< decoded.width << "x" << decoded.components
|
||||
<< " expanded_shape=" << pixel_shape[0] << "x"
|
||||
<< pixel_shape[1] << "x" << pixel_shape[2];
|
||||
}
|
||||
else if(decoded.components == channels)
|
||||
{
|
||||
pixel_data = std::make_shared<std::vector<uint8_t>>(std::move(decoded.pixels));
|
||||
pixel_shape = {decoded.height, decoded.width, decoded.components};
|
||||
|
||||
LOG_S(INFO) << "bitmap: OpenJPEG returned already-expanded pixels "
|
||||
<< "for Indexed image xobject_key=" << image.xobject_key
|
||||
<< " actual_cs=" << jpeg::color_space_name(decoded.color_space)
|
||||
<< " actual_shape=" << decoded.height << "x"
|
||||
<< decoded.width << "x" << decoded.components;
|
||||
}
|
||||
else
|
||||
{
|
||||
LOG_S(WARNING) << "bitmap: OpenJPEG decode for Indexed image returned "
|
||||
<< decoded.components
|
||||
<< " components, expected 1, for xobject_key="
|
||||
<< image.xobject_key;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
pixel_data = std::make_shared<std::vector<uint8_t>>(std::move(decoded.pixels));
|
||||
apply_decode_to_u8_samples(pixel_data, decoded.components);
|
||||
pixel_shape = {decoded.height, decoded.width, decoded.components};
|
||||
channels = decoded.components;
|
||||
|
||||
if(decoded.components == 1)
|
||||
{
|
||||
fmt = PIXEL_FORMAT_GRAY;
|
||||
}
|
||||
else if(decoded.components == 3)
|
||||
{
|
||||
fmt = PIXEL_FORMAT_RGB;
|
||||
}
|
||||
else if(decoded.components == 4)
|
||||
{
|
||||
fmt = PIXEL_FORMAT_CMYK;
|
||||
cmyk_conv = CMYK_CONVENTION_PROCESS;
|
||||
}
|
||||
else
|
||||
{
|
||||
fmt = PIXEL_FORMAT_UNKNOWN;
|
||||
}
|
||||
|
||||
LOG_S(INFO) << "bitmap: OpenJPEG decode succeeded "
|
||||
<< "for xobject_key=" << image.xobject_key
|
||||
<< " actual_cs=" << jpeg::color_space_name(decoded.color_space)
|
||||
<< " actual_shape=" << decoded.height << "x"
|
||||
<< decoded.width << "x" << decoded.components;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
LOG_S(WARNING) << "bitmap: OpenJPEG decode failed "
|
||||
<< "for xobject_key=" << image.xobject_key;
|
||||
}
|
||||
}
|
||||
else if (std::find(image.filters.begin(), image.filters.end(),
|
||||
"/JBIG2Decode") != image.filters.end()
|
||||
and ((image.raw_stream_data and image.raw_stream_data->getSize() > 0)
|
||||
@@ -582,6 +922,7 @@ namespace pdflib
|
||||
bitmap_instruction binstr(image.xobject_key,
|
||||
std::move(pixel_data),
|
||||
image.soft_mask_data,
|
||||
cmyk_conv,
|
||||
pixel_shape,
|
||||
fmt,
|
||||
image.image_mask,
|
||||
|
||||
@@ -0,0 +1,84 @@
|
||||
//-*-C++-*-
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <vector>
|
||||
|
||||
#include <lcms2.h>
|
||||
|
||||
#ifndef LOGURU_WITH_STREAMS
|
||||
#define LOGURU_WITH_STREAMS 1
|
||||
#endif
|
||||
#include <loguru.hpp>
|
||||
|
||||
namespace pdflib::icc
|
||||
{
|
||||
inline std::vector<uint8_t> transform_palette_to_rgb(
|
||||
std::vector<uint8_t> const& palette,
|
||||
int components,
|
||||
std::vector<uint8_t> const& profile_bytes)
|
||||
{
|
||||
if(profile_bytes.empty() or palette.empty() or components <= 0)
|
||||
{
|
||||
return {};
|
||||
}
|
||||
|
||||
if((palette.size() % static_cast<std::size_t>(components)) != 0u)
|
||||
{
|
||||
LOG_S(WARNING) << "icc: palette size is not divisible by component count";
|
||||
return {};
|
||||
}
|
||||
|
||||
cmsUInt32Number input_type = 0;
|
||||
switch(components)
|
||||
{
|
||||
case 1: input_type = TYPE_GRAY_8; break;
|
||||
case 3: input_type = TYPE_RGB_8; break;
|
||||
case 4: input_type = TYPE_CMYK_8; break;
|
||||
default:
|
||||
LOG_S(WARNING) << "icc: unsupported palette component count " << components;
|
||||
return {};
|
||||
}
|
||||
|
||||
cmsHPROFILE input_profile = cmsOpenProfileFromMem(
|
||||
profile_bytes.data(), static_cast<cmsUInt32Number>(profile_bytes.size()));
|
||||
if(not input_profile)
|
||||
{
|
||||
LOG_S(WARNING) << "icc: failed to open embedded ICC profile";
|
||||
return {};
|
||||
}
|
||||
|
||||
cmsHPROFILE output_profile = cmsCreate_sRGBProfile();
|
||||
if(not output_profile)
|
||||
{
|
||||
cmsCloseProfile(input_profile);
|
||||
LOG_S(WARNING) << "icc: failed to create sRGB profile";
|
||||
return {};
|
||||
}
|
||||
|
||||
cmsHTRANSFORM transform = cmsCreateTransform(input_profile,
|
||||
input_type,
|
||||
output_profile,
|
||||
TYPE_RGB_8,
|
||||
INTENT_RELATIVE_COLORIMETRIC,
|
||||
0);
|
||||
if(not transform)
|
||||
{
|
||||
cmsCloseProfile(output_profile);
|
||||
cmsCloseProfile(input_profile);
|
||||
LOG_S(WARNING) << "icc: failed to create ICC transform";
|
||||
return {};
|
||||
}
|
||||
|
||||
const cmsUInt32Number entry_count =
|
||||
static_cast<cmsUInt32Number>(palette.size() / static_cast<std::size_t>(components));
|
||||
std::vector<uint8_t> rgb(static_cast<std::size_t>(entry_count) * 3u, 0u);
|
||||
cmsDoTransform(transform, palette.data(), rgb.data(), entry_count);
|
||||
|
||||
cmsDeleteTransform(transform);
|
||||
cmsCloseProfile(output_profile);
|
||||
cmsCloseProfile(input_profile);
|
||||
return rgb;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,244 @@
|
||||
//-*-C++-*-
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
#include <vector>
|
||||
|
||||
#include <openjpeg.h>
|
||||
|
||||
#include <parse/utils/jpeg/jpeg_utils.h>
|
||||
|
||||
#ifndef LOGURU_WITH_STREAMS
|
||||
#define LOGURU_WITH_STREAMS 1
|
||||
#endif
|
||||
#include <loguru.hpp>
|
||||
|
||||
namespace pdflib {
|
||||
namespace jpx {
|
||||
|
||||
// Result of decode_jpx_to_raw_pixels(). A default-constructed value (no pixel
// bytes) is what the decoder returns on every failure path, so callers test
// empty() to detect a failed decode.
class decoded_jpx_result {
|
||||
public:
|
||||
// decoded sample bytes; callers move this into their pixel buffer
std::vector<uint8_t> pixels;
|
||||
// image dimensions and component count; callers build their
// {height, width, components} shape from these three fields
int width = 0;
|
||||
int height = 0;
|
||||
int components = 0;
|
||||
// color space reported for the decoded data (used for logging by the caller)
jpeg::ColorSpace color_space = jpeg::ColorSpace::Unknown;
|
||||
|
||||
// true when no pixel bytes were produced
bool empty() const { return pixels.empty(); }
|
||||
};
|
||||
|
||||
namespace detail {
|
||||
|
||||
// Read cursor over a caller-owned byte buffer. A pointer to this struct is
// passed to OpenJPEG as stream user data and updated by the stream_read /
// stream_skip / stream_seek callbacks. It does not own `data`.
struct memory_stream_context {
|
||||
// start of the encoded bytes
uint8_t const* data = nullptr;
|
||||
// total number of bytes available at `data`
OPJ_SIZE_T size = 0;
|
||||
// current read position, kept within [0, size] by the callbacks
OPJ_SIZE_T offset = 0;
|
||||
};
|
||||
|
||||
// OpenJPEG read callback: copies up to `bytes` bytes from the in-memory stream
// into `buffer` and advances the cursor. Returns the number of bytes copied,
// or (OPJ_SIZE_T)-1 when the context or buffer is null or the cursor is
// already at the end of the data.
inline OPJ_SIZE_T stream_read(void* buffer, OPJ_SIZE_T bytes, void* user_data)
{
  auto* source = static_cast<memory_stream_context*>(user_data);

  const bool nothing_to_read = (source == nullptr)
    || (buffer == nullptr)
    || (source->offset >= source->size);
  if(nothing_to_read) {
    return static_cast<OPJ_SIZE_T>(-1);
  }

  // never copy more than what is left in the buffer
  const OPJ_SIZE_T available = source->size - source->offset;
  const OPJ_SIZE_T count = (bytes < available) ? bytes : available;

  std::memcpy(buffer, source->data + source->offset, count);
  source->offset += count;
  return count;
}
|
||||
|
||||
// OpenJPEG skip callback: moves the cursor forward by up to `bytes`, stopping
// at the end of the data. Returns the distance actually skipped, or -1 for a
// null context or a negative (backward) skip request.
inline OPJ_OFF_T stream_skip(OPJ_OFF_T bytes, void* user_data)
{
  auto* source = static_cast<memory_stream_context*>(user_data);
  if(source == nullptr || bytes < 0) {
    return static_cast<OPJ_OFF_T>(-1);
  }

  // clamp the advance to the bytes that remain
  const OPJ_OFF_T available = static_cast<OPJ_OFF_T>(source->size - source->offset);
  const OPJ_OFF_T advance = (available < bytes) ? available : bytes;

  source->offset += static_cast<OPJ_SIZE_T>(advance);
  return advance;
}
|
||||
|
||||
// OpenJPEG seek callback: positions the cursor at absolute offset `bytes`.
// Succeeds for any offset in [0, size] (seeking exactly to the end is
// allowed); otherwise leaves the cursor unchanged and returns OPJ_FALSE.
inline OPJ_BOOL stream_seek(OPJ_OFF_T bytes, void* user_data)
{
  auto* source = static_cast<memory_stream_context*>(user_data);

  const bool in_range = (source != nullptr)
    && (bytes >= 0)
    && (static_cast<OPJ_SIZE_T>(bytes) <= source->size);
  if(in_range) {
    source->offset = static_cast<OPJ_SIZE_T>(bytes);
    return OPJ_TRUE;
  }
  return OPJ_FALSE;
}
|
||||
|
||||
// No-op release callback handed to opj_stream_set_user_data(): the stream
// context is a local object in decode_jpx_to_raw_pixels(), so nothing is freed.
inline void stream_free(void*) {}
|
||||
|
||||
// Returns true when the buffer begins with the 12-byte JP2 signature box
// (box length 0x0000000C, box type "jP  ", payload 0D 0A 87 0A). A null
// pointer or a buffer shorter than 12 bytes yields false.
inline bool has_jp2_signature(uint8_t const* data, std::size_t size)
{
  static constexpr uint8_t signature_box[12] = {
    0x00, 0x00, 0x00, 0x0c,   // box length
    0x6a, 0x50, 0x20, 0x20,   // box type
    0x0d, 0x0a, 0x87, 0x0a    // payload
  };

  if(data == nullptr || size < sizeof(signature_box)) {
    return false;
  }
  return std::equal(signature_box, signature_box + sizeof(signature_box), data);
}
|
||||
|
||||
// Message handler that discards its input; installed as the OpenJPEG info,
// warning and error handler in decode_jpx_to_raw_pixels().
inline void ignore_callback(const char*, void*) {}
|
||||
|
||||
// Converts one decoded JPEG 2000 sample to an unsigned 8-bit value.
//
// @param value      sample as stored by the decoder
// @param precision  declared bit depth of the component; <= 0 yields 0
// @param is_signed  when true the sample is re-biased by 2^(precision-1) so
//                   that the most negative value maps to 0
// @return the sample clamped to its nominal range and rescaled to 0..255:
//         unchanged for 8 bits, rounded right-shift for deeper samples,
//         rounded linear stretch for shallower ones.
//
// All arithmetic is done in 64 bits with bounded shift counts: samples from
// malformed streams may lie far outside the nominal range, and adding the
// signed bias to such a value in `int` would overflow (undefined behavior).
inline uint8_t clamp_component_to_u8(int value, int precision, bool is_signed)
{
  if(precision <= 0) {
    return 0;
  }

  std::int64_t sample = value;
  if(is_signed) {
    // shift count capped so the bias itself cannot overflow 64 bits
    sample += std::int64_t{1} << std::min(precision - 1, 62);
  }

  // samples are carried in 32-bit ints, so the usable range tops out at 2^31-1
  const std::int64_t max_value =
    (precision >= 31) ? std::int64_t{0x7fffffff}
                      : ((std::int64_t{1} << precision) - 1);
  sample = std::clamp<std::int64_t>(sample, 0, max_value);

  if(precision == 8) {
    return static_cast<uint8_t>(sample);
  }

  if(precision > 8) {
    // drop the low bits, rounding to nearest via the highest discarded bit
    const int shift = std::min(precision - 8, 62);
    const std::int64_t rounded = (sample >> shift) + ((sample >> (shift - 1)) & 1);
    return static_cast<uint8_t>(std::min<std::int64_t>(rounded, 255));
  }

  // fewer than 8 bits: stretch 0..max_value onto 0..255 with rounding
  const std::int64_t scaled = (sample * 255 + max_value / 2) / max_value;
  return static_cast<uint8_t>(std::min<std::int64_t>(scaled, 255));
}
|
||||
|
||||
} // namespace detail
|
||||
|
||||
/// Decodes an in-memory JPEG 2000 buffer into interleaved 8-bit samples.
///
/// The codec is chosen from the buffer itself: JP2 when
/// `detail::has_jp2_signature` matches, raw J2K codestream otherwise.
/// Every component must have the same dimensions as component 0; subsampled
/// layouts are rejected. Samples are converted with
/// `detail::clamp_component_to_u8` and stored pixel by pixel, component by
/// component. The colour space is derived from the component count only
/// (1 = Gray, 3 = RGB, 4 = CMYK, anything else = Unknown).
///
/// @param data  Pointer to the encoded bytes; may be null.
/// @param size  Number of bytes available at `data`.
/// @return The populated result on success. On any failure a warning is
///         logged and the result is returned as default-constructed.
inline decoded_jpx_result decode_jpx_to_raw_pixels(uint8_t const* data,
                                                   std::size_t size)
{
  decoded_jpx_result result;

  if(!data || size == 0) {
    LOG_S(WARNING) << "decode_jpx_to_raw_pixels: empty input";
    return result;
  }

  // Declared before the handle owner below so that it is still alive when
  // the stream (which holds a pointer to it) is destroyed.
  detail::memory_stream_context ctx{data, static_cast<OPJ_SIZE_T>(size), 0};

  // Scope guard owning the OpenJPEG handles: every exit path, including an
  // exception thrown while allocating the pixel buffer, releases them.
  struct opj_handles
  {
    opj_stream_t* stream = nullptr;
    opj_codec_t* codec = nullptr;
    opj_image_t* image = nullptr;

    opj_handles() = default;
    opj_handles(opj_handles const&) = delete;
    opj_handles& operator=(opj_handles const&) = delete;

    // Releases the stream and codec; the decoded image stays valid.
    void close_decoder()
    {
      if(stream) {
        opj_stream_destroy(stream);
        stream = nullptr;
      }
      if(codec) {
        opj_destroy_codec(codec);
        codec = nullptr;
      }
    }

    ~opj_handles()
    {
      if(image) {
        opj_image_destroy(image);
      }
      close_decoder();
    }
  } handles;

  handles.stream = opj_stream_create(1u << 16, OPJ_TRUE);
  if(!handles.stream) {
    LOG_S(WARNING) << "decode_jpx_to_raw_pixels: failed to create stream";
    return result;
  }

  // Route all stream I/O through the in-memory context.
  opj_stream_set_user_data(handles.stream, &ctx, detail::stream_free);
  opj_stream_set_user_data_length(handles.stream, ctx.size);
  opj_stream_set_read_function(handles.stream, detail::stream_read);
  opj_stream_set_skip_function(handles.stream, detail::stream_skip);
  opj_stream_set_seek_function(handles.stream, detail::stream_seek);

  opj_dparameters_t params{};
  opj_set_default_decoder_parameters(&params);

  const auto codec_format =
    detail::has_jp2_signature(data, size) ? OPJ_CODEC_JP2 : OPJ_CODEC_J2K;

  handles.codec = opj_create_decompress(codec_format);
  if(!handles.codec) {
    LOG_S(WARNING) << "decode_jpx_to_raw_pixels: failed to create codec";
    return result;
  }

  // Silence the library's own diagnostics; failures are reported below.
  opj_set_info_handler(handles.codec, detail::ignore_callback, nullptr);
  opj_set_warning_handler(handles.codec, detail::ignore_callback, nullptr);
  opj_set_error_handler(handles.codec, detail::ignore_callback, nullptr);

  if(!opj_setup_decoder(handles.codec, &params)) {
    LOG_S(WARNING) << "decode_jpx_to_raw_pixels: opj_setup_decoder failed";
    return result;
  }

  if(!opj_read_header(handles.stream, handles.codec, &handles.image) ||
     !handles.image ||
     !opj_decode(handles.codec, handles.stream, handles.image) ||
     !opj_end_decompress(handles.codec, handles.stream)) {
    LOG_S(WARNING) << "decode_jpx_to_raw_pixels: OpenJPEG decode failed";
    return result;
  }

  // The decoder is no longer needed; free it before allocating the output.
  handles.close_decoder();

  opj_image_t const* image = handles.image;

  if(image->numcomps == 0 || !image->comps) {
    LOG_S(WARNING) << "decode_jpx_to_raw_pixels: no image components";
    return result;
  }

  const int width = static_cast<int>(image->comps[0].w);
  const int height = static_cast<int>(image->comps[0].h);
  const int numcomps = static_cast<int>(image->numcomps);

  if(width <= 0 || height <= 0) {
    LOG_S(WARNING) << "decode_jpx_to_raw_pixels: invalid dimensions "
                   << width << "x" << height;
    return result;
  }

  for(int c = 0; c < numcomps; ++c) {
    const auto& comp = image->comps[c];

    if(static_cast<int>(comp.w) != width ||
       static_cast<int>(comp.h) != height) {
      LOG_S(WARNING) << "decode_jpx_to_raw_pixels: unsupported subsampled component layout "
                     << "component=" << c
                     << " size=" << comp.w << "x" << comp.h
                     << " expected=" << width << "x" << height;
      return result;
    }

    // Guard against a component without a sample buffer before indexing it.
    if(!comp.data) {
      LOG_S(WARNING) << "decode_jpx_to_raw_pixels: missing sample data "
                     << "component=" << c;
      return result;
    }
  }

  const auto stride = static_cast<std::size_t>(numcomps);
  const auto pixel_count =
    static_cast<std::size_t>(width) * static_cast<std::size_t>(height);

  // Allocate before publishing any metadata so that a failed allocation
  // never leaves dimensions set on a result without pixels.
  result.pixels.resize(pixel_count * stride);

  result.width = width;
  result.height = height;
  result.components = numcomps;
  result.color_space = (numcomps == 1) ? jpeg::ColorSpace::Gray
                     : (numcomps == 3) ? jpeg::ColorSpace::RGB
                     : (numcomps == 4) ? jpeg::ColorSpace::CMYK
                                       : jpeg::ColorSpace::Unknown;

  // Interleave the planar component buffers into pixel-major order.
  for(std::size_t i = 0; i < pixel_count; ++i) {
    for(int c = 0; c < numcomps; ++c) {
      const auto& comp = image->comps[c];
      result.pixels[i * stride + static_cast<std::size_t>(c)] =
        detail::clamp_component_to_u8(comp.data[i], comp.prec, comp.sgnd != 0);
    }
  }

  LOG_S(INFO) << "decode_jpx_to_raw_pixels: decoded "
              << width << "x" << height
              << " components=" << numcomps
              << " color_space=" << jpeg::color_space_name(result.color_space)
              << " input_size=" << size;

  return result;
}
|
||||
|
||||
} // namespace jpx
|
||||
} // namespace pdflib
|
||||
@@ -1148,9 +1148,18 @@ namespace pdflib
|
||||
const uint8_t y = src_data->at(idx + 2);
|
||||
const uint8_t k = src_data->at(idx + 3);
|
||||
|
||||
r = static_cast<uint8_t>((static_cast<unsigned int>(c) * k) / 255u);
|
||||
g = static_cast<uint8_t>((static_cast<unsigned int>(m) * k) / 255u);
|
||||
b = static_cast<uint8_t>((static_cast<unsigned int>(y) * k) / 255u);
|
||||
if(instr.get_cmyk_convention() == CMYK_CONVENTION_PROCESS)
|
||||
{
|
||||
r = static_cast<uint8_t>(((255u - c) * (255u - k)) / 255u);
|
||||
g = static_cast<uint8_t>(((255u - m) * (255u - k)) / 255u);
|
||||
b = static_cast<uint8_t>(((255u - y) * (255u - k)) / 255u);
|
||||
}
|
||||
else
|
||||
{
|
||||
r = static_cast<uint8_t>((static_cast<unsigned int>(c) * k) / 255u);
|
||||
g = static_cast<uint8_t>((static_cast<unsigned int>(m) * k) / 255u);
|
||||
b = static_cast<uint8_t>((static_cast<unsigned int>(y) * k) / 255u);
|
||||
}
|
||||
}
|
||||
else if (fmt == PIXEL_FORMAT_GRAY)
|
||||
{
|
||||
|
||||
+2
-1
@@ -5,7 +5,8 @@ from pathlib import Path
|
||||
from huggingface_hub import snapshot_download
|
||||
|
||||
HF_DATASET_REPO_ID = "docling-project/regression-dataset-for-docling-parse"
|
||||
HF_DATASET_REVISION = "5d7c3d7b575397ca5b2a943171b0da4fe08c5a5b"
|
||||
# HF_DATASET_REVISION = "5d7c3d7b575397ca5b2a943171b0da4fe08c5a5b"
|
||||
HF_DATASET_REVISION = "9a3713bd2e7b5b55ad9dde9d85953a0f5eb5150e"
|
||||
TESTS_DIR = Path(__file__).resolve().parent
|
||||
TEST_DATA_DIR = TESTS_DIR / "data"
|
||||
|
||||
|
||||
@@ -12,7 +12,7 @@ from docling_parse.pdf_parser import (
|
||||
PdfRenderDocument,
|
||||
)
|
||||
|
||||
GENERATE = False
|
||||
GENERATE = True
|
||||
RENDER_INSTRUCTION_EPS = 0.005
|
||||
|
||||
GROUNDTRUTH_RENDERER_FOLDER = "tests/data/groundtruth_renderer"
|
||||
|
||||
Reference in New Issue
Block a user