feat: fixed the v2 parser to only return the pages that are requested (#47)

* fixed the v2 parser to only return the pages that are requested

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* updated the visualize script

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* fixed the default args for compilation

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* use std::make_pair to avoid warnings

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

---------

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
Peter W. J. Staar
2024-10-23 10:14:39 +02:00
committed by GitHub
parent 836571afac
commit 48451ad095
32 changed files with 568 additions and 2024476 deletions
+3 -3
View File
@@ -25,9 +25,9 @@ void set_loglevel(std::string level)
//loguru::set_verbosity(loguru::Verbosity_ERROR);
}
else
loguru::g_stderr_verbosity = loguru::Verbosity_ERROR; {
}
{
loguru::g_stderr_verbosity = loguru::Verbosity_ERROR;
}
}
nlohmann::json create_config(std::filesystem::path ifile,
+1 -1
View File
@@ -5,7 +5,7 @@ import os
from tabulate import tabulate
from docling_parse.docling_parse import pdf_parser, pdf_parser_v2
from docling_parse import pdf_parser, pdf_parser_v2
try:
from PIL import Image, ImageDraw
+1 -5
View File
@@ -43,10 +43,7 @@ namespace docling
std::string pdf_resources_dir;
//std::map<std::string, std::filesystem::path> key2doc;
std::map<std::string, decoder_ptr_type> key2doc;
//plib::parser parser;
};
docling_parser_v2::docling_parser_v2():
@@ -82,7 +79,6 @@ namespace docling
std::map<std::string, double> timings = {};
pdflib::pdf_resource<pdflib::PAGE_FONT>::initialise(data, timings);
}
void docling_parser_v2::set_loglevel(int level)
{
@@ -114,7 +110,7 @@ namespace docling
{
loguru::g_stderr_verbosity = loguru::Verbosity_INFO;
}
else if(level=="warning")
else if(level=="warning" or level=="warn")
{
loguru::g_stderr_verbosity = loguru::Verbosity_WARNING;
}
+17 -8
View File
@@ -31,7 +31,7 @@ namespace pdflib
private:
void update_timings(std::map<std::string, double>& timings_);
void update_timings(std::map<std::string, double>& timings_, bool set_timer);
private:
@@ -181,7 +181,10 @@ namespace pdflib
utils::timer timer;
nlohmann::json& json_pages = json_document["pages"];
json_pages = nlohmann::json::array({});
bool set_timer=true;
int page_number=0;
for(QPDFObjectHandle page : qpdf_document.getAllPages())
{
@@ -190,7 +193,8 @@ namespace pdflib
pdf_decoder<PAGE> page_decoder(page);
auto timings_ = page_decoder.decode_page();
update_timings(timings_);
update_timings(timings_, set_timer);
set_timer = false;
json_pages.push_back(page_decoder.get());
@@ -208,10 +212,13 @@ namespace pdflib
LOG_S(INFO) << "start decoding selected pages ...";
utils::timer timer;
// make sure that we only return the pages listed in page_numbers
nlohmann::json& json_pages = json_document["pages"];
json_pages = nlohmann::json::array({});
std::vector<QPDFObjectHandle> pages = qpdf_document.getAllPages();
bool set_timer=true; // make sure we override all timings for this page-set
for(auto page_number:page_numbers)
{
utils::timer timer;
@@ -223,7 +230,9 @@ namespace pdflib
pdf_decoder<PAGE> page_decoder(pages.at(page_number));
auto timings_ = page_decoder.decode_page();
update_timings(timings_);
update_timings(timings_, set_timer);
set_timer=false;
json_pages.push_back(page_decoder.get());
@@ -244,11 +253,11 @@ namespace pdflib
timings[__FUNCTION__] = timer.get_time();
}
void pdf_decoder<DOCUMENT>::update_timings(std::map<std::string, double>& timings_)
void pdf_decoder<DOCUMENT>::update_timings(std::map<std::string, double>& timings_, bool set_timer)
{
for(auto itr=timings_.begin(); itr!=timings_.end(); itr++)
{
if(timings.count(itr->first)==0)
if(timings.count(itr->first)==0 or set_timer)
{
timings[itr->first] = itr->second;
}
+6 -3
View File
@@ -82,19 +82,22 @@ namespace pdflib
std::pair<double, double> pdf_resource<PAGE_LINE>::front()
{
assert(x.size()>0);
return std::pair<double, double>(x.front(), y.front());
//return std::pair<double, double>(x.front(), y.front());
return std::make_pair(x.front(), y.front());
}
std::pair<double, double> pdf_resource<PAGE_LINE>::back()
{
assert(x.size()>0);
return std::pair<double, double>(x.back(), y.back());
//return std::pair<double, double>(x.back(), y.back());
return std::make_pair(x.back(), y.back());
}
std::pair<double, double> pdf_resource<PAGE_LINE>::operator[](int i)
{
assert(x.size()>0 and i<x.size());
return std::pair<double, double>(x[i], y[i]);
//return std::pair<double, double>(x[i], y[i]);
return std::make_pair(x[i], y[i]);
}
void pdf_resource<PAGE_LINE>::transform(std::array<double, 9> trafo_matrix)
@@ -16684,11 +16684,11 @@
"lines": []
},
"timings": {
"decode_contents": 0.003507,
"decode_dimensions": 0.0,
"decode_page": 0.015862,
"decode_resources": 0.008806,
"sanitise_contents": 4.1e-05
"decode_contents": 0.003358,
"decode_dimensions": 4e-06,
"decode_page": 0.015178,
"decode_resources": 0.008326,
"sanitise_contents": 3.9e-05
}
},
{
@@ -29861,11 +29861,11 @@
]
},
"timings": {
"decode_contents": 0.015415,
"decode_contents": 0.015422,
"decode_dimensions": 0.0,
"decode_page": 0.02518,
"decode_resources": 0.007428,
"sanitise_contents": 3.2e-05
"decode_page": 0.025149,
"decode_resources": 0.007392,
"sanitise_contents": 2.9e-05
}
},
{
@@ -40840,10 +40840,10 @@
]
},
"timings": {
"decode_contents": 0.002348,
"decode_contents": 0.00241,
"decode_dimensions": 0.0,
"decode_page": 0.011055,
"decode_resources": 0.006615,
"decode_page": 0.011236,
"decode_resources": 0.006414,
"sanitise_contents": 2.5e-05
}
},
@@ -54715,10 +54715,10 @@
]
},
"timings": {
"decode_contents": 0.004488,
"decode_contents": 0.004505,
"decode_dimensions": 0.0,
"decode_page": 0.012061,
"decode_resources": 0.005828,
"decode_page": 0.012243,
"decode_resources": 0.005641,
"sanitise_contents": 2.8e-05
}
},
@@ -71744,11 +71744,11 @@
]
},
"timings": {
"decode_contents": 0.002497,
"decode_contents": 0.002465,
"decode_dimensions": 0.0,
"decode_page": 0.011164,
"decode_resources": 0.006387,
"sanitise_contents": 4.9e-05
"decode_page": 0.011008,
"decode_resources": 0.006174,
"sanitise_contents": 4.1e-05
}
},
{
@@ -88941,11 +88941,11 @@
]
},
"timings": {
"decode_contents": 0.004848,
"decode_contents": 0.004823,
"decode_dimensions": 0.0,
"decode_page": 0.015907,
"decode_resources": 0.008227,
"sanitise_contents": 4e-05
"decode_page": 0.016107,
"decode_resources": 0.00802,
"sanitise_contents": 3.5e-05
}
},
{
@@ -109738,11 +109738,11 @@
]
},
"timings": {
"decode_contents": 0.010908,
"decode_contents": 0.01093,
"decode_dimensions": 0.0,
"decode_page": 0.018141,
"decode_resources": 0.005595,
"sanitise_contents": 4e-05
"decode_page": 0.018393,
"decode_resources": 0.005545,
"sanitise_contents": 3.8e-05
}
},
{
@@ -126017,11 +126017,11 @@
"lines": []
},
"timings": {
"decode_contents": 0.002365,
"decode_contents": 0.00238,
"decode_dimensions": 0.0,
"decode_page": 0.008046,
"decode_resources": 0.003733,
"sanitise_contents": 3.8e-05
"decode_page": 0.007619,
"decode_resources": 0.003613,
"sanitise_contents": 3.7e-05
}
},
{
@@ -140088,11 +140088,11 @@
"lines": []
},
"timings": {
"decode_contents": 0.002037,
"decode_contents": 0.002042,
"decode_dimensions": 0.0,
"decode_page": 0.008592,
"decode_resources": 0.004611,
"sanitise_contents": 3.2e-05
"decode_page": 0.008342,
"decode_resources": 0.004467,
"sanitise_contents": 3.5e-05
}
},
{
@@ -156805,10 +156805,10 @@
]
},
"timings": {
"decode_contents": 0.002362,
"decode_contents": 0.002402,
"decode_dimensions": 0.0,
"decode_page": 0.010877,
"decode_resources": 0.006402,
"decode_page": 0.010979,
"decode_resources": 0.006272,
"sanitise_contents": 3.8e-05
}
},
@@ -176492,10 +176492,10 @@
"lines": []
},
"timings": {
"decode_contents": 0.003171,
"decode_contents": 0.003227,
"decode_dimensions": 0.0,
"decode_page": 0.008376,
"decode_resources": 0.002849,
"decode_page": 0.007453,
"decode_resources": 0.002728,
"sanitise_contents": 4.7e-05
}
},
@@ -186267,33 +186267,33 @@
"lines": []
},
"timings": {
"decode_contents": 0.001524,
"decode_contents": 0.001489,
"decode_dimensions": 0.0,
"decode_page": 0.004688,
"decode_resources": 0.001911,
"decode_page": 0.004278,
"decode_resources": 0.001836,
"sanitise_contents": 2.2e-05
}
}
],
"timings": {
"decode_contents": 0.05547,
"decode_dimensions": 0.0,
"decode_document": 0.154582,
"decode_page": 0.149949,
"decode_resources": 0.068392,
"decoding page 0": 0.016121,
"decoding page 1": 0.025364,
"decoding page 10": 0.008664,
"decoding page 11": 0.004831,
"decoding page 2": 0.011215,
"decoding page 3": 0.01223,
"decoding page 4": 0.011415,
"decoding page 5": 0.016156,
"decoding page 6": 0.018376,
"decoding page 7": 0.008278,
"decoding page 8": 0.008797,
"decoding page 9": 0.011129,
"process_document_from_bytesio": 0.000393,
"sanitise_contents": 0.00043200000000000004
"decode_contents": 0.055453,
"decode_dimensions": 4e-06,
"decode_document": 0.152719,
"decode_page": 0.14798499999999998,
"decode_resources": 0.066428,
"decoding page 0": 0.015425,
"decoding page 1": 0.025324,
"decoding page 10": 0.007762,
"decoding page 11": 0.004436,
"decoding page 2": 0.011409,
"decoding page 3": 0.012426,
"decoding page 4": 0.011271,
"decoding page 5": 0.016368,
"decoding page 6": 0.018673,
"decoding page 7": 0.007881,
"decoding page 8": 0.008561,
"decoding page 9": 0.011247,
"process_document_from_bytesio": 0.000421,
"sanitise_contents": 0.000414
}
}
@@ -16684,11 +16684,11 @@
"lines": []
},
"timings": {
"decode_contents": 0.003505,
"decode_contents": 0.003498,
"decode_dimensions": 1e-06,
"decode_page": 0.031621,
"decode_resources": 0.02447,
"sanitise_contents": 4.5e-05
"decode_page": 0.03092,
"decode_resources": 0.023731,
"sanitise_contents": 4.1e-05
}
},
{
@@ -29861,11 +29861,11 @@
]
},
"timings": {
"decode_contents": 0.015547,
"decode_contents": 0.015755,
"decode_dimensions": 0.0,
"decode_page": 0.025567,
"decode_resources": 0.007578,
"sanitise_contents": 3.4e-05
"decode_page": 0.025675,
"decode_resources": 0.00753,
"sanitise_contents": 3e-05
}
},
{
@@ -40840,11 +40840,11 @@
]
},
"timings": {
"decode_contents": 0.002357,
"decode_contents": 0.002356,
"decode_dimensions": 0.0,
"decode_page": 0.011122,
"decode_resources": 0.006673,
"sanitise_contents": 2.5e-05
"decode_page": 0.011245,
"decode_resources": 0.006435,
"sanitise_contents": 2.6e-05
}
},
{
@@ -54715,11 +54715,11 @@
]
},
"timings": {
"decode_contents": 0.004545,
"decode_contents": 0.00453,
"decode_dimensions": 0.0,
"decode_page": 0.012175,
"decode_resources": 0.005879,
"sanitise_contents": 2.8e-05
"decode_page": 0.012265,
"decode_resources": 0.005659,
"sanitise_contents": 3.1e-05
}
},
{
@@ -71744,11 +71744,11 @@
]
},
"timings": {
"decode_contents": 0.002495,
"decode_contents": 0.002468,
"decode_dimensions": 0.0,
"decode_page": 0.011211,
"decode_resources": 0.006395,
"sanitise_contents": 5.3e-05
"decode_page": 0.011043,
"decode_resources": 0.006183,
"sanitise_contents": 6e-05
}
},
{
@@ -88941,11 +88941,11 @@
]
},
"timings": {
"decode_contents": 0.004854,
"decode_contents": 0.004845,
"decode_dimensions": 0.0,
"decode_page": 0.016029,
"decode_resources": 0.008287,
"sanitise_contents": 4e-05
"decode_page": 0.016165,
"decode_resources": 0.008012,
"sanitise_contents": 3.6e-05
}
},
{
@@ -109738,11 +109738,11 @@
]
},
"timings": {
"decode_contents": 0.011055,
"decode_contents": 0.011018,
"decode_dimensions": 0.0,
"decode_page": 0.018426,
"decode_resources": 0.005673,
"sanitise_contents": 3.8e-05
"decode_page": 0.018656,
"decode_resources": 0.005701,
"sanitise_contents": 3.9e-05
}
},
{
@@ -126017,11 +126017,11 @@
"lines": []
},
"timings": {
"decode_contents": 0.002381,
"decode_contents": 0.002392,
"decode_dimensions": 0.0,
"decode_page": 0.008162,
"decode_resources": 0.003732,
"sanitise_contents": 3.7e-05
"decode_page": 0.007637,
"decode_resources": 0.003598,
"sanitise_contents": 5e-05
}
},
{
@@ -140088,10 +140088,10 @@
"lines": []
},
"timings": {
"decode_contents": 0.002052,
"decode_contents": 0.002086,
"decode_dimensions": 0.0,
"decode_page": 0.008661,
"decode_resources": 0.004672,
"decode_page": 0.008393,
"decode_resources": 0.004465,
"sanitise_contents": 3.2e-05
}
},
@@ -156805,11 +156805,11 @@
]
},
"timings": {
"decode_contents": 0.002397,
"decode_contents": 0.002384,
"decode_dimensions": 0.0,
"decode_page": 0.01092,
"decode_resources": 0.006372,
"sanitise_contents": 3.8e-05
"decode_page": 0.010852,
"decode_resources": 0.006186,
"sanitise_contents": 4e-05
}
},
{
@@ -176494,8 +176494,8 @@
"timings": {
"decode_contents": 0.003181,
"decode_dimensions": 0.0,
"decode_page": 0.008467,
"decode_resources": 0.002853,
"decode_page": 0.007396,
"decode_resources": 0.002713,
"sanitise_contents": 4.7e-05
}
},
@@ -186267,33 +186267,33 @@
"lines": []
},
"timings": {
"decode_contents": 0.001504,
"decode_contents": 0.001509,
"decode_dimensions": 0.0,
"decode_page": 0.004717,
"decode_resources": 0.001924,
"decode_page": 0.004286,
"decode_resources": 0.001817,
"sanitise_contents": 2.2e-05
}
}
],
"timings": {
"decode_contents": 0.055873,
"decode_contents": 0.05602199999999999,
"decode_dimensions": 1e-06,
"decode_document": 0.171846,
"decode_page": 0.16707800000000003,
"decode_resources": 0.08450799999999999,
"decoding page 0": 0.031881,
"decoding page 1": 0.025748,
"decoding page 10": 0.008766,
"decoding page 11": 0.004867,
"decoding page 2": 0.011285,
"decoding page 3": 0.012362,
"decoding page 4": 0.011464,
"decoding page 5": 0.016272,
"decoding page 6": 0.018668,
"decoding page 7": 0.008406,
"decoding page 8": 0.008877,
"decoding page 9": 0.011179,
"process_document_from_file": 0.001949,
"sanitise_contents": 0.000439
"decode_document": 0.169451,
"decode_page": 0.16453300000000004,
"decode_resources": 0.08202999999999998,
"decoding page 0": 0.031191,
"decoding page 1": 0.025893,
"decoding page 10": 0.007725,
"decoding page 11": 0.004449,
"decoding page 2": 0.01142,
"decoding page 3": 0.012451,
"decoding page 4": 0.01133,
"decoding page 5": 0.016427,
"decoding page 6": 0.01891,
"decoding page 7": 0.007878,
"decoding page 8": 0.008632,
"decoding page 9": 0.011135,
"process_document_from_file": 0.001263,
"sanitise_contents": 0.00045400000000000003
}
}
@@ -16684,22 +16684,22 @@
"lines": []
},
"timings": {
"decode_contents": 0.003576,
"decode_contents": 0.00338,
"decode_dimensions": 1e-06,
"decode_page": 0.01596,
"decode_resources": 0.008973,
"sanitise_contents": 4.7e-05
"decode_page": 0.01523,
"decode_resources": 0.008172,
"sanitise_contents": 3.9e-05
}
}
],
"timings": {
"decode_contents": 0.003576,
"decode_contents": 0.00338,
"decode_dimensions": 1e-06,
"decode_document": 0.016939,
"decode_page": 0.01596,
"decode_resources": 0.008973,
"decoding page 0": 0.016238,
"process_document_from_bytesio": 0.000384,
"sanitise_contents": 4.7e-05
"decode_document": 0.016125,
"decode_page": 0.01523,
"decode_resources": 0.008172,
"decoding page 0": 0.015462,
"process_document_from_bytesio": 0.00043,
"sanitise_contents": 3.9e-05
}
}
@@ -16684,22 +16684,22 @@
"lines": []
},
"timings": {
"decode_contents": 0.003454,
"decode_contents": 0.003508,
"decode_dimensions": 0.0,
"decode_page": 0.015488,
"decode_resources": 0.008604,
"sanitise_contents": 4e-05
"decode_page": 0.015797,
"decode_resources": 0.008661,
"sanitise_contents": 4.2e-05
}
}
],
"timings": {
"decode_contents": 0.003454,
"decode_contents": 0.003508,
"decode_dimensions": 0.0,
"decode_document": 0.016365,
"decode_page": 0.015488,
"decode_resources": 0.008604,
"decoding page 0": 0.015724,
"process_document_from_file": 0.00124,
"sanitise_contents": 4e-05
"decode_document": 0.016818,
"decode_page": 0.015797,
"decode_resources": 0.008661,
"decoding page 0": 0.016078,
"process_document_from_file": 0.001244,
"sanitise_contents": 4.2e-05
}
}
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
+2 -2
View File
@@ -82,8 +82,8 @@ def verify_reference_output(true_doc, pred_doc):
num_true_pages=len(true_doc["pages"])
num_pred_pages=len(pred_doc["pages"])
message = f"len(pred_doc[\"pages\"])!=len(true_doc[\"pages\"]) => {num_true_pages}!={num_pred_pages}"
assert num_true_pages==num_pred_pages, message
message = f"len(pred_doc[\"pages\"])!=len(true_doc[\"pages\"]) => {num_pred_pages}!={num_true_pages}"
assert num_pred_pages==num_true_pages, message
for pred_page,true_page in zip(pred_doc["pages"], true_doc["pages"]):
# print(pred_page.keys())