mirror of
https://github.com/docling-project/docling-parse.git
synced 2026-05-17 13:10:49 +00:00
perf: default cmap speedup (#203)
Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
committed by
GitHub
parent
bb0b4ef0b1
commit
82a0aaa791
@@ -167,6 +167,11 @@ class Timings(BaseModel):
|
||||
"""Get all static timing key names."""
|
||||
return get_static_timing_keys()
|
||||
|
||||
@staticmethod
def decode_page_keys() -> List[str]:
    """Return the timing keys recorded by the decode_page method.

    The keys come back in order and do not include the global timer key.
    """
    ordered_keys = get_decode_page_timing_keys()
    return ordered_keys
|
||||
|
||||
|
||||
class PdfDocument:
|
||||
|
||||
|
||||
+4
-1
@@ -1,8 +1,10 @@
|
||||
//-*-C++-*-
|
||||
|
||||
// std libraries
|
||||
// std libraries
|
||||
#include <set>
|
||||
#include <map>
|
||||
#include <mutex>
|
||||
#include <iomanip>
|
||||
#include <vector>
|
||||
#include <assert.h>
|
||||
#include <filesystem>
|
||||
@@ -54,6 +56,7 @@
|
||||
#include <parse/pdf_resources/page_font/encodings.h>
|
||||
#include <parse/pdf_resources/page_font/base_font.h>
|
||||
#include <parse/pdf_resources/page_font/base_fonts.h>
|
||||
#include <parse/pdf_resources/page_font/cmap_value.h>
|
||||
#include <parse/pdf_resources/page_font/cmap.h>
|
||||
#include <parse/pdf_resources/page_font/char_description.h>
|
||||
#include <parse/pdf_resources/page_font/char_processor.h>
|
||||
|
||||
@@ -139,7 +139,8 @@ namespace pdflib
|
||||
bool cmap_initialized;
|
||||
bool diff_initialized;
|
||||
|
||||
std::unordered_map<uint32_t, std::string> cmap_numb_to_char;
|
||||
//std::unordered_map<uint32_t, std::string> cmap_numb_to_char;
|
||||
cmap_value cmap_numb_to_char;
|
||||
std::unordered_map<uint32_t, std::string> diff_numb_to_char;
|
||||
|
||||
std::unordered_map<uint32_t, int> unknown_numbs;
|
||||
@@ -420,7 +421,7 @@ namespace pdflib
|
||||
{
|
||||
if(cmap_numb_to_char.count(c))
|
||||
{
|
||||
return cmap_numb_to_char[c];
|
||||
return cmap_numb_to_char.at(c);
|
||||
}
|
||||
else if(32<=c)
|
||||
{
|
||||
@@ -1805,12 +1806,12 @@ namespace pdflib
|
||||
cmap_numb_to_char.count(numb)==1)
|
||||
{
|
||||
LOG_S(WARNING) << "overloading difference from cmap";
|
||||
diff_numb_to_char[numb] = cmap_numb_to_char[numb];
|
||||
diff_numb_to_char[numb] = cmap_numb_to_char.at(numb);
|
||||
}
|
||||
|
||||
// FIXME: might need to be commented out or fixed
|
||||
/*
|
||||
else if(name_to_descr.count(name)==1 and
|
||||
else if(name_to_descr.count(name)==1 and
|
||||
cmap_numb_to_char.count(numb)==0)
|
||||
{
|
||||
//assert(subtype==TYPE_3);
|
||||
@@ -1889,7 +1890,7 @@ namespace pdflib
|
||||
else if(std::regex_match(name, match, re_03) and cmap_numb_to_char.count(numb)==1) // if the name is of type /g23 of /G23 and we have a match in the cmap
|
||||
{
|
||||
LOG_S(WARNING) << "overloading difference from cmap";
|
||||
diff_numb_to_char[numb] = cmap_numb_to_char[numb];
|
||||
diff_numb_to_char[numb] = cmap_numb_to_char.at(numb);
|
||||
//diff_numb_to_char[numb] = name;
|
||||
//LOG_S(ERROR) << "weird differences["<<numb<<"] -> " << name;
|
||||
}
|
||||
@@ -2134,7 +2135,7 @@ namespace pdflib
|
||||
|
||||
std::string cmap = " --- ";
|
||||
if(cmap_numb_to_char.count(numb)==1)
|
||||
cmap = "'"+cmap_numb_to_char[numb]+"'";
|
||||
cmap = "'"+cmap_numb_to_char.at(numb)+"'";
|
||||
|
||||
std::string diff = " --- ";
|
||||
if(diff_numb_to_char.count(numb)==1)
|
||||
|
||||
@@ -14,7 +14,7 @@ namespace pdflib
|
||||
cmap_parser();
|
||||
~cmap_parser();
|
||||
|
||||
std::unordered_map<uint32_t, std::string> get() { return _map; }
|
||||
cmap_value get();
|
||||
|
||||
void print();
|
||||
|
||||
@@ -55,6 +55,27 @@ namespace pdflib
|
||||
const std::string src_end,
|
||||
const std::vector<std::string> tgt);
|
||||
|
||||
// Helper to remove trailing null bytes from a string
|
||||
static void remove_trailing_nulls(std::string& str);
|
||||
|
||||
// Helper to populate the map for a range of source codepoints.
|
||||
// Detects identity mapping when tgts.size()==1 && tgts[0]==begin (maps i -> i).
|
||||
// For non-identity, uses tgts and increments tgts.back() for each iteration.
|
||||
//static void populate_range_mapping(uint32_t begin, uint32_t end,
|
||||
// std::vector<uint32_t>& tgts,
|
||||
// const std::pair<uint32_t, uint32_t>& csr_range,
|
||||
// std::unordered_map<uint32_t, std::string>& map,
|
||||
// bool cache=true);
|
||||
void populate_range_mapping(uint32_t begin, uint32_t end,
|
||||
std::vector<uint32_t>& tgts);
|
||||
|
||||
// Legacy implementation - kept for comparison, uses mapping=="" check instead of identity detection
|
||||
static void populate_range_mapping_legacy(uint32_t begin, uint32_t end,
|
||||
const std::string& mapping,
|
||||
std::vector<uint32_t>& tgts,
|
||||
const std::pair<uint32_t, uint32_t>& csr_range,
|
||||
std::unordered_map<uint32_t, std::string>& map);
|
||||
|
||||
private:
|
||||
|
||||
uint32_t char_count;
|
||||
@@ -66,6 +87,8 @@ namespace pdflib
|
||||
std::pair<uint32_t, uint32_t> bf_range;
|
||||
|
||||
std::unordered_map<uint32_t, std::string> _map;
|
||||
|
||||
cmap_value _cmap;
|
||||
};
|
||||
|
||||
cmap_parser::cmap_parser():
|
||||
@@ -75,6 +98,11 @@ namespace pdflib
|
||||
cmap_parser::~cmap_parser()
|
||||
{}
|
||||
|
||||
// Hand the caller its own copy of the cmap lookup table held by this parser.
cmap_value cmap_parser::get()
{
  cmap_value snapshot(_cmap);
  return snapshot;
}
|
||||
|
||||
void cmap_parser::print()
|
||||
{
|
||||
for(auto itr=_map.begin(); itr!=_map.end(); itr++)
|
||||
@@ -150,6 +178,12 @@ namespace pdflib
|
||||
}
|
||||
}
|
||||
|
||||
// If identity was not set during populate_range_mapping, construct from _map
|
||||
if(not _cmap.is_identity())
|
||||
{
|
||||
_cmap = cmap_value(std::move(_map));
|
||||
}
|
||||
|
||||
timings.add_timing(key_root + pdf_timings::KEY_CMAP_PARSE_TOTAL, total_timer.get_time());
|
||||
}
|
||||
|
||||
@@ -261,6 +295,244 @@ namespace pdflib
|
||||
return result;
|
||||
}
|
||||
|
||||
// Strip every trailing '\0' byte from `str` in place; null bytes that are
// followed by a non-null byte are left untouched. A string that is empty or
// made up solely of null bytes ends up as exactly one '\0'.
void cmap_parser::remove_trailing_nulls(std::string& str)
{
  const std::string::size_type last_kept = str.find_last_not_of('\0');

  if(last_kept == std::string::npos)
    {
      // nothing but nulls (or nothing at all): collapse to a single null byte
      str.assign(1, '\0');
      return;
    }

  // drop everything after the last non-null byte
  str.erase(last_kept + 1);
}
|
||||
|
||||
// Legacy: static version with caching
|
||||
//void cmap_parser::populate_range_mapping(uint32_t begin, uint32_t end,
|
||||
// std::vector<uint32_t>& tgts,
|
||||
// const std::pair<uint32_t, uint32_t>& csr_range,
|
||||
// std::unordered_map<uint32_t, std::string>& map,
|
||||
// bool cache)
|
||||
|
||||
void cmap_parser::populate_range_mapping(uint32_t begin, uint32_t end,
|
||||
std::vector<uint32_t>& tgts)
|
||||
{
|
||||
if(begin==0 and
|
||||
end==65535 and
|
||||
csr_range.first==0 and
|
||||
csr_range.second==65535 and
|
||||
tgts.size()==1 and tgts.at(0)==0)
|
||||
{
|
||||
// Identity mapping detected: cmap_value will compute UTF-8 on the fly
|
||||
LOG_S(INFO) << "identity mapping detected, using cmap_value identity mode";
|
||||
_cmap = cmap_value(true, csr_range, {});
|
||||
return;
|
||||
}
|
||||
|
||||
// Non-identity: populate _map entry by entry
|
||||
bool is_identity = (tgts.size() == 1 && tgts[0] == begin);
|
||||
|
||||
LOG_S(INFO) << "populate_range_mapping: begin=" << begin << ", end=" << end
|
||||
<< ", tgts.size()=" << tgts.size()
|
||||
<< ", is_identity=" << is_identity;
|
||||
|
||||
for(uint32_t i = 0; i < end - begin + 1; i++)
|
||||
{
|
||||
uint32_t src_codepoint = begin + i;
|
||||
|
||||
if(not (csr_range.first <= src_codepoint and src_codepoint <= csr_range.second))
|
||||
{
|
||||
if(is_identity)
|
||||
{
|
||||
LOG_S(WARNING) << "index " << src_codepoint << " is out of bounds ["
|
||||
<< csr_range.first << ", " << csr_range.second << "]";
|
||||
}
|
||||
else
|
||||
{
|
||||
LOG_S(ERROR) << "index " << src_codepoint << " is out of bounds ["
|
||||
<< csr_range.first << ", " << csr_range.second << "]";
|
||||
}
|
||||
|
||||
if(not is_identity)
|
||||
{
|
||||
tgts.back() += 1;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
try
|
||||
{
|
||||
std::string tmp(128, 0);
|
||||
{
|
||||
auto itr = tmp.begin();
|
||||
if(is_identity)
|
||||
{
|
||||
itr = utf8::append(src_codepoint, itr);
|
||||
}
|
||||
else
|
||||
{
|
||||
for(auto tgt_uint : tgts)
|
||||
{
|
||||
itr = utf8::append(tgt_uint, itr);
|
||||
}
|
||||
}
|
||||
tmp.erase(itr, tmp.end());
|
||||
}
|
||||
|
||||
if(_map.count(src_codepoint) == 1)
|
||||
{
|
||||
LOG_S(WARNING) << "overwriting number c=" << src_codepoint;
|
||||
}
|
||||
|
||||
if(utf8::is_valid(tmp.begin(), tmp.end()))
|
||||
{
|
||||
_map[src_codepoint] = tmp;
|
||||
}
|
||||
else
|
||||
{
|
||||
LOG_S(WARNING) << "invalid utf8 string -> iteration: " << src_codepoint;
|
||||
_map[src_codepoint] = "UNICODE<" + std::to_string(src_codepoint) + ">";
|
||||
}
|
||||
}
|
||||
catch(const std::exception& exc)
|
||||
{
|
||||
LOG_S(WARNING) << "invalid utf8 string: " << exc.what() << " -> iteration: " << src_codepoint;
|
||||
_map[src_codepoint] = "UNICODE<" + std::to_string(src_codepoint) + ">";
|
||||
}
|
||||
|
||||
if(not is_identity)
|
||||
{
|
||||
tgts.back() += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// FIXME: unused code, kept only as a reference implementation.
|
||||
void cmap_parser::populate_range_mapping_legacy(uint32_t begin, uint32_t end,
|
||||
const std::string& mapping,
|
||||
std::vector<uint32_t>& tgts,
|
||||
const std::pair<uint32_t, uint32_t>& csr_range,
|
||||
std::unordered_map<uint32_t, std::string>& map)
|
||||
{
|
||||
// Legacy implementation using mapping=="" check (likely dead code path)
|
||||
// Kept for comparison with populate_range_mapping which uses identity detection
|
||||
|
||||
if(mapping == "")
|
||||
{
|
||||
for(uint32_t i = 0; i < end - begin + 1; i++)
|
||||
{
|
||||
if(csr_range.first <= begin + i and begin + i <= csr_range.second)
|
||||
{
|
||||
try
|
||||
{
|
||||
std::string tmp(128, 0);
|
||||
{
|
||||
auto itr = tmp.begin();
|
||||
itr = utf8::append(begin + i, itr);
|
||||
|
||||
tmp.erase(itr, tmp.end());
|
||||
}
|
||||
|
||||
if(map.count(begin + i) == 1)
|
||||
{
|
||||
LOG_S(WARNING) << "overwriting number c=" << begin + i;
|
||||
}
|
||||
|
||||
if(utf8::is_valid(tmp.begin(), tmp.end()))
|
||||
{
|
||||
//LOG_S(INFO) << "cmap-ind:" << (begin+i) << " -> target: " << tmp;
|
||||
map[begin + i] = tmp;
|
||||
}
|
||||
else
|
||||
{
|
||||
LOG_S(WARNING) << "invalid utf8 string -> iteration: " << (begin + i);
|
||||
map[begin + i] = "UNICODE<" + std::to_string(begin + i) + ">";
|
||||
}
|
||||
}
|
||||
catch(const std::exception& exc)
|
||||
{
|
||||
LOG_S(WARNING) << "invalid utf8 string: " << exc.what() << " -> iteration: " << (begin + i);
|
||||
|
||||
map[begin + i] = "UNICODE<" + std::to_string(begin + i) + ">";
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
LOG_S(WARNING) << "index " << begin + i << " is out of bounds ["
|
||||
<< csr_range.first << ", " << csr_range.second << "]";
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
LOG_S(ERROR) << begin << ", "
|
||||
<< end << ", "
|
||||
<< csr_range.first << ", "
|
||||
<< csr_range.second << ", "
|
||||
<< tgts.at(0) << ", " << tgts.size();
|
||||
|
||||
for(uint32_t i = 0; i < end - begin + 1; i++)
|
||||
{
|
||||
if(csr_range.first <= begin + i and begin + i <= csr_range.second)
|
||||
{
|
||||
try
|
||||
{
|
||||
std::string tmp(128, 0);
|
||||
{
|
||||
auto itr = tmp.begin();
|
||||
for(auto tgt_uint : tgts)
|
||||
{
|
||||
itr = utf8::append(tgt_uint, itr);
|
||||
}
|
||||
tmp.erase(itr, tmp.end());
|
||||
}
|
||||
|
||||
if(map.count(begin + i) == 1)
|
||||
{
|
||||
LOG_S(WARNING) << "overwriting number c=" << begin + i;
|
||||
}
|
||||
|
||||
//map[begin + i] = tmp;
|
||||
if(utf8::is_valid(tmp.begin(), tmp.end()))
|
||||
{
|
||||
// LOG_S(INFO) << "cmap-ind:" << (begin+i) << " -> target: " << tmp;
|
||||
map[begin + i] = tmp;
|
||||
}
|
||||
else
|
||||
{
|
||||
LOG_S(WARNING) << "invalid utf8 string -> iteration: " << (begin + i);
|
||||
map[begin + i] = "UNICODE<" + std::to_string(begin + i) + ">";
|
||||
}
|
||||
}
|
||||
catch(const std::exception& exc)
|
||||
{
|
||||
LOG_S(WARNING) << "invalid utf8 string: " << exc.what();
|
||||
|
||||
map[begin + i] = "UNICODE<" + std::to_string(begin + i) + ">";
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
LOG_S(ERROR) << "index " << begin + i << " is out of bounds ["
|
||||
<< csr_range.first << ", " << csr_range.second << "]";
|
||||
}
|
||||
|
||||
tgts.back() += 1;
|
||||
}
|
||||
|
||||
LOG_S(ERROR) << begin << ", "
|
||||
<< end << ", "
|
||||
<< csr_range.first << ", "
|
||||
<< csr_range.second << ", "
|
||||
<< tgts.at(0) << ", " << tgts.size() << "\t => Done!";
|
||||
}
|
||||
}
|
||||
|
||||
void cmap_parser::parse_cmap_name(std::vector<qpdf_instruction>& parameters)
|
||||
{
|
||||
LOG_S(WARNING) << __FUNCTION__ << ": skipping ...";
|
||||
@@ -296,7 +568,6 @@ namespace pdflib
|
||||
void cmap_parser::parse_endcodespacerange(std::vector<qpdf_instruction>& parameters)
|
||||
{
|
||||
LOG_S(INFO) << __FUNCTION__;
|
||||
//assert(parameters.size()==2*csr_cnt);
|
||||
|
||||
const int num_params = 2;
|
||||
if(parameters.size()<num_params)
|
||||
@@ -413,60 +684,34 @@ namespace pdflib
|
||||
//LOG_S(ERROR) << "source_beg: " << source_start << ", source_end: " << source_end << ": " << target;
|
||||
|
||||
// FIXME we probably need to fix the 2 in the to_utf8(..)
|
||||
//std::string tmp = target.getUTF8Value();
|
||||
std::string tmp = get_target(target);//to_utf8(target, 2);
|
||||
LOG_S(INFO) << "source_beg: " << source_start.size() << ", source_end: " << source_end.size()
|
||||
<< " tmp: " << tmp.size()
|
||||
<< " source_start==tmp: " << (source_start==tmp);
|
||||
//std::string tgt = target.getUTF8Value();
|
||||
std::string tgt = get_target(target);//to_utf8(target, 2);
|
||||
|
||||
/* Legacy */
|
||||
// tmp.erase(std::remove_if(tmp.begin(), tmp.end(), [] (char x) { return x==0;} ), tmp.end());
|
||||
//LOG_S(INFO) << "source_beg: " << source_start.size() << ", source_end: " << source_end.size()
|
||||
//<< " tgt: " << tgt.size()
|
||||
//<< " source_start==tgt: " << (source_start==tgt);
|
||||
|
||||
// Remove only trailing null bytes (not all nulls)
|
||||
while(!tmp.empty() && tmp.back() == '\0')
|
||||
{
|
||||
tmp.pop_back();
|
||||
}
|
||||
// If string became empty, it was all nulls - preserve as single null
|
||||
if(tmp.empty())
|
||||
{
|
||||
tmp = std::string(1, '\0');
|
||||
}
|
||||
remove_trailing_nulls(tgt);
|
||||
|
||||
//LOG_S(ERROR) << "tmp: `" << tmp.size() << "`";
|
||||
//LOG_S(ERROR) << "source_beg: " << source_start << ", source_end: " << source_end << ": " << tmp;
|
||||
//LOG_S(INFO) << "source_beg: " << source_start.size() << ", source_end: " << source_end.size()
|
||||
//<< " tgt: " << tgt.size()
|
||||
//<< " source_start==tgt: " << (source_start==tgt);
|
||||
|
||||
LOG_S(INFO) << "source_beg: " << source_start.size() << ", source_end: " << source_end.size()
|
||||
<< " tmp: " << tmp.size()
|
||||
<< " source_start==tmp: " << (source_start==tmp);
|
||||
|
||||
set_range(source_start, source_end, tmp);
|
||||
set_range(source_start, source_end, tgt);
|
||||
}
|
||||
else if(target.isArray())
|
||||
{
|
||||
std::vector<QPDFObjectHandle> tmps = target.getArrayAsVector();
|
||||
std::vector<QPDFObjectHandle> tgts = target.getArrayAsVector();
|
||||
|
||||
std::vector<std::string> target_strs;
|
||||
|
||||
for(QPDFObjectHandle tmp: tmps)
|
||||
for(QPDFObjectHandle tgt_: tgts)
|
||||
{
|
||||
// FIXME we probably need to fix the 2 in the to_utf8(..)
|
||||
//std::string tgt = tmp.getUTF8Value();
|
||||
std::string tgt = get_target(tmp);
|
||||
//std::string tgt = tgt.getUTF8Value();
|
||||
std::string tgt = get_target(tgt_);
|
||||
|
||||
/* Legacy */
|
||||
//tgt.erase(std::remove_if(tgt.begin(), tgt.end(), [] (char x) { return x==0; }), tgt.end());
|
||||
|
||||
// Remove only trailing null bytes (not all nulls)
|
||||
while(!tgt.empty() && tgt.back() == '\0')
|
||||
{
|
||||
tgt.pop_back();
|
||||
}
|
||||
// If string became empty, it was all nulls - preserve as single null
|
||||
if(tgt.empty())
|
||||
{
|
||||
tgt = std::string(1, '\0');
|
||||
}
|
||||
remove_trailing_nulls(tgt);
|
||||
|
||||
target_strs.push_back(tgt);
|
||||
}
|
||||
@@ -516,11 +761,11 @@ namespace pdflib
|
||||
const std::string tgt)
|
||||
{
|
||||
//LOG_S(INFO) << __FUNCTION__;
|
||||
|
||||
|
||||
auto itr_beg = src_begin.begin();
|
||||
uint32_t begin = utf8::next(itr_beg, src_begin.end());
|
||||
|
||||
if(itr_beg!=src_begin.end())
|
||||
if(itr_beg != src_begin.end())
|
||||
{
|
||||
LOG_S(WARNING) << "itr_beg!=src_begin.end() --> errors might occur in the cmap: "
|
||||
<< "'" << src_begin << "' -> " << begin;
|
||||
@@ -529,7 +774,7 @@ namespace pdflib
|
||||
auto itr_end = src_end.begin();
|
||||
uint32_t end = utf8::next(itr_end, src_end.end());
|
||||
|
||||
if(itr_end!=src_end.end())
|
||||
if(itr_end != src_end.end())
|
||||
{
|
||||
LOG_S(WARNING) << "itr_end!=src_end.end() --> errors might occur in the cmap: "
|
||||
<< "'" << src_end << "' -> " << end;;
|
||||
@@ -538,12 +783,13 @@ namespace pdflib
|
||||
//LOG_S(INFO) << __FUNCTION__ << "\t"
|
||||
//<< "beg: " << begin << ", "
|
||||
//<< "end: " << end << "\t tgt: `" << tgt << "` with size: " << tgt.size();
|
||||
|
||||
|
||||
// Parse target string into codepoints
|
||||
std::string mapping(tgt);
|
||||
std::vector<uint32_t> tgts;
|
||||
|
||||
|
||||
auto itr_tgt = tgt.begin();
|
||||
while(itr_tgt!=tgt.end())
|
||||
while(itr_tgt != tgt.end())
|
||||
{
|
||||
uint32_t tmp = utf8::next(itr_tgt, tgt.end());
|
||||
tgts.push_back(tmp);
|
||||
@@ -555,107 +801,12 @@ namespace pdflib
|
||||
// Pre-reserve capacity to avoid rehashing during bulk insertions
|
||||
_map.reserve(_map.size() + (end - begin + 1));
|
||||
|
||||
if(mapping=="")
|
||||
{
|
||||
for(uint32_t i = 0; i < end - begin + 1; i++)
|
||||
{
|
||||
//assert(csr_range.first<=begin+i and begin+i<=csr_range.second);
|
||||
// New implementation with cmap_value identity detection
|
||||
populate_range_mapping(begin, end, tgts);
|
||||
|
||||
if(csr_range.first<=begin+i and begin+i<=csr_range.second)
|
||||
{
|
||||
try
|
||||
{
|
||||
std::string tmp(128, 0);
|
||||
{
|
||||
auto itr = tmp.begin();
|
||||
itr = utf8::append(begin+i, itr);
|
||||
|
||||
tmp.erase(itr, tmp.end());
|
||||
}
|
||||
|
||||
if(_map.count(begin+i)==1)
|
||||
{
|
||||
LOG_S(WARNING) << "overwriting number c=" << begin+i;
|
||||
}
|
||||
|
||||
if(utf8::is_valid(tmp.begin(), tmp.end()))
|
||||
{
|
||||
//LOG_S(INFO) << "cmap-ind:" << (begin+i) << " -> target: " << tmp;
|
||||
_map[begin + i] = tmp;
|
||||
}
|
||||
else
|
||||
{
|
||||
LOG_S(WARNING) << "invalid utf8 string -> iteration: " << (begin+i);
|
||||
_map[begin + i] = "UNICODE<"+std::to_string(begin+i)+">";
|
||||
}
|
||||
}
|
||||
catch(const std::exception& exc)
|
||||
{
|
||||
LOG_S(WARNING) << "invalid utf8 string: " << exc.what() << " -> iteration: " << (begin+i);
|
||||
|
||||
_map[begin + i] = "UNICODE<"+std::to_string(begin+i)+">";
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
LOG_S(WARNING) << "index " << begin+i << " is out of bounds ["
|
||||
<< csr_range.first << ", " << csr_range.second << "]";
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for(uint32_t i = 0; i < end - begin + 1; i++)
|
||||
{
|
||||
//assert(csr_range.first<=begin+i and begin+i<=csr_range.second);
|
||||
|
||||
if(csr_range.first<=begin+i and begin+i<=csr_range.second)
|
||||
{
|
||||
try
|
||||
{
|
||||
std::string tmp(128, 0);
|
||||
{
|
||||
auto itr = tmp.begin();
|
||||
for(auto tgt_uint: tgts)
|
||||
{
|
||||
itr = utf8::append(tgt_uint, itr);
|
||||
}
|
||||
tmp.erase(itr, tmp.end());
|
||||
}
|
||||
|
||||
if(_map.count(begin+i)==1)
|
||||
{
|
||||
LOG_S(WARNING) << "overwriting number c=" << begin+i;
|
||||
}
|
||||
|
||||
//_map[begin + i] = tmp;
|
||||
if(utf8::is_valid(tmp.begin(), tmp.end()))
|
||||
{
|
||||
// LOG_S(INFO) << "cmap-ind:" << (begin+i) << " -> target: " << tmp;
|
||||
_map[begin + i] = tmp;
|
||||
}
|
||||
else
|
||||
{
|
||||
LOG_S(WARNING) << "invalid utf8 string -> iteration: " << (begin+i);
|
||||
_map[begin + i] = "UNICODE<"+std::to_string(begin+i)+">";
|
||||
}
|
||||
}
|
||||
catch(const std::exception& exc)
|
||||
{
|
||||
LOG_S(WARNING) << "invalid utf8 string: " << exc.what();
|
||||
|
||||
_map[begin + i] = "UNICODE<"+std::to_string(begin+i)+">";
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
LOG_S(ERROR) << "index " << begin+i << " is out of bounds ["
|
||||
<< csr_range.first << ", " << csr_range.second << "]";
|
||||
}
|
||||
|
||||
tgts.back() += 1;
|
||||
}
|
||||
}
|
||||
// Legacy implementations:
|
||||
//populate_range_mapping(begin, end, tgts, csr_range, _map);
|
||||
//populate_range_mapping_legacy(begin, end, mapping, tgts, csr_range, _map);
|
||||
}
|
||||
|
||||
void cmap_parser::set_range(const std::string src_begin,
|
||||
|
||||
@@ -0,0 +1,143 @@
|
||||
//-*-C++-*-
|
||||
|
||||
#ifndef PDF_PAGE_FONT_CMAP_VALUE_H
|
||||
#define PDF_PAGE_FONT_CMAP_VALUE_H
|
||||
|
||||
namespace pdflib
|
||||
{
|
||||
|
||||
// Lookup table from a character code to its UTF-8 text.
//
// Two storage modes are supported:
//  - explicit : every code is stored in `_map`.
//  - identity : every code inside `_identity_range` maps to the UTF-8
//               encoding of the code itself, computed on demand. Entries
//               in `_map` act as overrides and take priority.
//
// The interface mirrors the read-only subset of std::unordered_map used by
// the callers (`at`, `count`, `size`, `empty`, `begin`, `end`).
class cmap_value
{

public:

  // Empty table, explicit mode.
  cmap_value();

  // Explicit mode, taking ownership of `map`.
  cmap_value(std::unordered_map<uint32_t, std::string> map);

  // `is_identity` selects identity mode for the inclusive `range`;
  // `map` holds explicit entries (overrides in identity mode).
  cmap_value(bool is_identity,
             std::pair<uint32_t, uint32_t> range,
             std::unordered_map<uint32_t, std::string> map);

  bool is_identity() const;

  // Text for `key`; throws std::out_of_range when count(key)==0.
  std::string at(uint32_t key) const;

  // 1 when `key` is mapped (explicitly or through identity), else 0.
  size_t count(uint32_t key) const;

  // Number of distinct keys for which count(key)==1.
  size_t size() const;

  bool empty() const;

  // Iteration (delegates to _map; for identity mode only the explicit
  // overrides are visited, the identity range itself is not enumerated)
  std::unordered_map<uint32_t, std::string>::const_iterator begin() const;
  std::unordered_map<uint32_t, std::string>::const_iterator end() const;

private:

  // UTF-8 encoding of `codepoint`, or "UNICODE<n>" when it is not encodable.
  static std::string codepoint_to_utf8(uint32_t codepoint);

  // True when identity mode is on and `key` lies inside the identity range.
  bool in_identity_range(uint32_t key) const;

  bool _is_identity;
  std::pair<uint32_t, uint32_t> _identity_range;
  std::unordered_map<uint32_t, std::string> _map;
};

// The member functions are defined in the header, so they are marked
// `inline` to stay valid when the header is included from several
// translation units.

inline cmap_value::cmap_value():
  _is_identity(false),
  _identity_range(0u, 0u),
  _map()
{}

inline cmap_value::cmap_value(std::unordered_map<uint32_t, std::string> map):
  _is_identity(false),
  _identity_range(0u, 0u),
  _map(std::move(map))
{}

inline cmap_value::cmap_value(bool is_identity,
                              std::pair<uint32_t, uint32_t> range,
                              std::unordered_map<uint32_t, std::string> map):
  _is_identity(is_identity),
  _identity_range(range),
  _map(std::move(map))
{}

inline bool cmap_value::is_identity() const
{
  return _is_identity;
}

inline bool cmap_value::in_identity_range(uint32_t key) const
{
  return _is_identity and
    _identity_range.first <= key and key <= _identity_range.second;
}

inline std::string cmap_value::at(uint32_t key) const
{
  // Map overrides take priority over identity (single hash lookup)
  const auto itr = _map.find(key);
  if(itr != _map.end())
    {
      return itr->second;
    }

  if(in_identity_range(key))
    {
      return codepoint_to_utf8(key);
    }

  throw std::out_of_range("cmap_value::at: key " + std::to_string(key) + " not found");
}

inline size_t cmap_value::count(uint32_t key) const
{
  if(_map.find(key) != _map.end())
    {
      return 1;
    }

  return in_identity_range(key) ? 1 : 0;
}

inline size_t cmap_value::size() const
{
  if(not _is_identity)
    {
      return _map.size();
    }

  // Identity range, computed in size_t so a wide range does not wrap in
  // 32-bit arithmetic ...
  size_t total = 0;
  if(_identity_range.first <= _identity_range.second)
    {
      total = static_cast<size_t>(_identity_range.second)
        - static_cast<size_t>(_identity_range.first) + 1;
    }

  // ... plus the overrides that lie outside of it (those inside are
  // already counted by the range).
  for(const auto& entry : _map)
    {
      if(not in_identity_range(entry.first))
        {
          total += 1;
        }
    }

  return total;
}

inline bool cmap_value::empty() const
{
  if(_is_identity)
    {
      return false;
    }

  return _map.empty();
}

inline std::unordered_map<uint32_t, std::string>::const_iterator cmap_value::begin() const
{
  return _map.begin();
}

inline std::unordered_map<uint32_t, std::string>::const_iterator cmap_value::end() const
{
  return _map.end();
}

inline std::string cmap_value::codepoint_to_utf8(uint32_t codepoint)
{
  // Surrogates and values above U+10FFFF have no UTF-8 encoding. count()
  // reports such keys as present in identity mode, so at() must not throw
  // for them: return the same "UNICODE<n>" placeholder the cmap parser
  // stores for targets it cannot encode.
  const bool is_surrogate = (0xD800u <= codepoint and codepoint <= 0xDFFFu);
  if(is_surrogate or codepoint > 0x10FFFFu)
    {
      return "UNICODE<" + std::to_string(codepoint) + ">";
    }

  std::string result;

  if(codepoint < 0x80u)
    {
      // 1 byte: 0xxxxxxx
      result.push_back(static_cast<char>(codepoint));
    }
  else if(codepoint < 0x800u)
    {
      // 2 bytes: 110xxxxx 10xxxxxx
      result.push_back(static_cast<char>(0xC0u | (codepoint >> 6)));
      result.push_back(static_cast<char>(0x80u | (codepoint & 0x3Fu)));
    }
  else if(codepoint < 0x10000u)
    {
      // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
      result.push_back(static_cast<char>(0xE0u | (codepoint >> 12)));
      result.push_back(static_cast<char>(0x80u | ((codepoint >> 6) & 0x3Fu)));
      result.push_back(static_cast<char>(0x80u | (codepoint & 0x3Fu)));
    }
  else
    {
      // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
      result.push_back(static_cast<char>(0xF0u | (codepoint >> 18)));
      result.push_back(static_cast<char>(0x80u | ((codepoint >> 12) & 0x3Fu)));
      result.push_back(static_cast<char>(0x80u | ((codepoint >> 6) & 0x3Fu)));
      result.push_back(static_cast<char>(0x80u | (codepoint & 0x3Fu)));
    }

  return result;
}
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
Reference in New Issue
Block a user