mirror of
https://github.com/docling-project/docling-parse.git
synced 2026-05-17 13:10:49 +00:00
cd15d00ddb
* updated the visualize script Signed-off-by: Peter Staar <taa@zurich.ibm.com> * replaced all the errors with fatals Signed-off-by: Peter Staar <taa@zurich.ibm.com> * reformatted the python code Signed-off-by: Peter Staar <taa@zurich.ibm.com> --------- Signed-off-by: Peter Staar <taa@zurich.ibm.com>
371 lines
10 KiB
Python
371 lines
10 KiB
Python
import argparse
|
|
import io
|
|
import json
|
|
import os
|
|
|
|
from tabulate import tabulate
|
|
|
|
from docling_parse import pdf_parser_v1, pdf_parser_v2
|
|
|
|
try:
|
|
from PIL import Image, ImageDraw
|
|
|
|
PIL_INSTALLED = True
|
|
except ImportError:
|
|
PIL_INSTALLED = False
|
|
print("Pillow is not installed. Skipping image processing.")
|
|
|
|
|
|
def parse_args():
|
|
parser = argparse.ArgumentParser(description="Process a PDF file.")
|
|
|
|
# Restrict log-level to specific values
|
|
parser.add_argument(
|
|
"-l",
|
|
"--log-level",
|
|
type=str,
|
|
choices=["info", "warning", "error", "fatal"],
|
|
required=False,
|
|
default="error",
|
|
help="Log level [info, warning, error, fatal]",
|
|
)
|
|
|
|
# Restrict version to specific values
|
|
parser.add_argument(
|
|
"-v",
|
|
"--version",
|
|
type=str,
|
|
choices=["v1", "v2"],
|
|
required=False,
|
|
default="v2",
|
|
help="Version [v1, v2]",
|
|
)
|
|
|
|
# Add an argument for the path to the PDF file
|
|
parser.add_argument(
|
|
"-i", "--input-pdf", type=str, help="Path to the PDF file", required=True
|
|
)
|
|
|
|
# Add an optional boolean argument for interactive mode
|
|
parser.add_argument(
|
|
"--interactive",
|
|
action="store_true",
|
|
help="Enable interactive mode (default: False)",
|
|
)
|
|
|
|
# Add an optional boolean argument for interactive mode
|
|
parser.add_argument(
|
|
"--display-text",
|
|
action="store_true",
|
|
help="Enable interactive mode (default: False)",
|
|
)
|
|
|
|
# Add an argument for the output directory, defaulting to "./tmp"
|
|
parser.add_argument(
|
|
"-o",
|
|
"--output-dir",
|
|
type=str,
|
|
required=False,
|
|
default=None,
|
|
help="Path to the output directory (default: None)",
|
|
)
|
|
|
|
# Add an argument for the output directory, defaulting to "./tmp"
|
|
parser.add_argument(
|
|
"-p",
|
|
"--page",
|
|
type=int,
|
|
required=False,
|
|
default=-1,
|
|
help="page to be displayed (default: -1 -> all)",
|
|
)
|
|
|
|
# Parse the command-line arguments
|
|
args = parser.parse_args()
|
|
|
|
# Check if the PDF file exists
|
|
assert os.path.exists(args.input_pdf), f"PDF file does not exist: {args.input_pdf}"
|
|
|
|
# Check if the output directory exists, create it if not
|
|
if (args.output_dir is not None) and (not os.path.exists(args.output_dir)):
|
|
os.makedirs(args.output_dir)
|
|
print(f"Output directory '{args.output_dir}' created.")
|
|
|
|
return (
|
|
args.log_level,
|
|
args.version,
|
|
args.input_pdf,
|
|
args.interactive,
|
|
args.output_dir,
|
|
int(args.page),
|
|
args.display_text,
|
|
)
|
|
|
|
|
|
def visualise_v1(
|
|
log_level: str,
|
|
pdf_path: str,
|
|
interactive: str,
|
|
output_dir: str,
|
|
page_num: int,
|
|
display_text: bool,
|
|
):
|
|
|
|
parser = pdf_parser_v1()
|
|
parser.set_loglevel_with_label(log_level)
|
|
|
|
doc_key = "key"
|
|
success = parser.load_document(doc_key, pdf_path)
|
|
|
|
if success == False:
|
|
return
|
|
|
|
doc = None
|
|
|
|
if page_num == -1:
|
|
doc = parser.parse_pdf_from_key(doc_key)
|
|
else:
|
|
doc = parser.parse_pdf_from_key_on_page(doc_key, page_num)
|
|
|
|
parser.unload_document(doc_key)
|
|
|
|
for pi, page in enumerate(doc["pages"]):
|
|
|
|
H = page["height"]
|
|
W = page["width"]
|
|
|
|
# Create a blank white image
|
|
img = Image.new("RGB", (round(W), round(H)), "white")
|
|
draw = ImageDraw.Draw(img)
|
|
|
|
for cell in page["cells"]:
|
|
bbox = cell["box"]["device"]
|
|
|
|
x0 = bbox[0]
|
|
y0 = bbox[1]
|
|
x1 = bbox[2]
|
|
y1 = bbox[3]
|
|
|
|
# Define the four corners of the rectangle
|
|
bl = (x0, H - y0)
|
|
br = (x1, H - y0)
|
|
tr = (x1, H - y1)
|
|
tl = (x0, H - y1)
|
|
|
|
# Draw the rectangle as a polygon
|
|
draw.polygon([bl, br, tr, tl], outline="black", fill="blue")
|
|
|
|
for image in page["images"]:
|
|
|
|
bbox = image["box"]
|
|
|
|
x0 = bbox[0]
|
|
y0 = bbox[1]
|
|
x1 = bbox[2]
|
|
y1 = bbox[3]
|
|
|
|
# Define the four corners of the rectangle
|
|
bl = (x0, H - y0)
|
|
br = (x1, H - y0)
|
|
tr = (x1, H - y1)
|
|
tl = (x0, H - y1)
|
|
|
|
# Draw the rectangle as a polygon
|
|
draw.polygon([bl, br, tr, tl], outline="black", fill="yellow")
|
|
|
|
for path in page["paths"]:
|
|
i = path["sub-paths"]
|
|
x = path["x-values"]
|
|
y = path["y-values"]
|
|
|
|
for l in range(0, len(i), 2):
|
|
if l + 1 >= len(i):
|
|
continue
|
|
|
|
i0 = i[l + 0]
|
|
i1 = i[l + 1]
|
|
|
|
for k in range(i0, i1 - 1):
|
|
draw.line(
|
|
(x[k], H - y[k], x[k + 1], H - y[k + 1]),
|
|
fill="black",
|
|
width=3,
|
|
)
|
|
|
|
if interactive:
|
|
img.show()
|
|
|
|
if output_dir is not None and page_num == -1:
|
|
oname = os.path.join(
|
|
output_dir, f"{os.path.basename(pdf_path)}_page={pi}.v1.png"
|
|
)
|
|
print(f"output: {oname}")
|
|
|
|
img.save(oname)
|
|
|
|
elif output_dir is not None and page_num != -1:
|
|
oname = os.path.join(
|
|
output_dir, f"{os.path.basename(pdf_path)}_page={page_num}.v1.png"
|
|
)
|
|
print(f"output: {oname}")
|
|
|
|
img.save(oname)
|
|
|
|
|
|
def visualise_v2(
|
|
log_level: str,
|
|
pdf_path: str,
|
|
interactive: str,
|
|
output_dir: str,
|
|
page_num: int,
|
|
display_text: bool,
|
|
):
|
|
|
|
parser = pdf_parser_v2(log_level)
|
|
# parser.set_loglevel_with_label(log_level)
|
|
|
|
doc_key = "key"
|
|
success = parser.load_document(doc_key, pdf_path)
|
|
|
|
if success == False:
|
|
return
|
|
|
|
doc = None
|
|
|
|
try:
|
|
if page_num == -1:
|
|
doc = parser.parse_pdf_from_key(doc_key)
|
|
else:
|
|
doc = parser.parse_pdf_from_key_on_page(doc_key, page_num)
|
|
except Exception as exc:
|
|
print(f"Could not parse pdf-document: {exc}")
|
|
doc = None
|
|
|
|
if doc == None:
|
|
return
|
|
|
|
parser.unload_document(doc_key)
|
|
|
|
for pi, page in enumerate(doc["pages"]):
|
|
|
|
for _ in ["original", "sanitized"]:
|
|
|
|
dimension = page[_]["dimension"]
|
|
|
|
cells = page[_]["cells"]["data"]
|
|
cells_header = page[_]["cells"]["header"]
|
|
|
|
images = page[_]["images"]["data"]
|
|
images_header = page[_]["images"]["header"]
|
|
|
|
lines = page[_]["lines"]
|
|
|
|
if PIL_INSTALLED:
|
|
|
|
W = dimension["width"]
|
|
H = dimension["height"]
|
|
|
|
# Create a blank white image
|
|
img = Image.new("RGB", (round(W), round(H)), "white")
|
|
draw = ImageDraw.Draw(img)
|
|
|
|
# Draw each rectangle by connecting its four points
|
|
for row in images:
|
|
|
|
x0 = row[images_header.index("x0")]
|
|
y0 = row[images_header.index("y0")]
|
|
x1 = row[images_header.index("x1")]
|
|
y1 = row[images_header.index("y1")]
|
|
|
|
# Define the four corners of the rectangle
|
|
bl = (x0, H - y0)
|
|
br = (x1, H - y0)
|
|
tr = (x1, H - y1)
|
|
tl = (x0, H - y1)
|
|
|
|
# Draw the rectangle as a polygon
|
|
draw.polygon([bl, br, tr, tl], outline="green", fill="yellow")
|
|
|
|
# Draw each rectangle by connecting its four points
|
|
for line in lines:
|
|
|
|
i = line["i"]
|
|
x = line["x"]
|
|
y = line["y"]
|
|
|
|
for l in range(0, len(i), 2):
|
|
i0 = i[l + 0]
|
|
i1 = i[l + 1]
|
|
|
|
for k in range(i0, i1 - 1):
|
|
draw.line(
|
|
(x[k], H - y[k], x[k + 1], H - y[k + 1]),
|
|
fill="black",
|
|
width=3,
|
|
)
|
|
|
|
# Draw each rectangle by connecting its four points
|
|
for row in cells:
|
|
|
|
x = []
|
|
y = []
|
|
for i in range(0, 4):
|
|
x.append(row[cells_header.index(f"r_x{i}")])
|
|
y.append(row[cells_header.index(f"r_y{i}")])
|
|
|
|
rect = [
|
|
(x[0], H - y[0]),
|
|
(x[1], H - y[1]),
|
|
(x[2], H - y[2]),
|
|
(x[3], H - y[3]),
|
|
]
|
|
|
|
if display_text:
|
|
print(row[cells_header.index("text")])
|
|
|
|
if "glyph" in row[cells_header.index("text")]:
|
|
print(f" skip cell -> {row}")
|
|
continue
|
|
|
|
# You can change the outline and fill color
|
|
draw.polygon(rect, outline="red", fill="blue")
|
|
|
|
# Show the image
|
|
if interactive:
|
|
img.show()
|
|
|
|
if output_dir is not None and page_num == -1:
|
|
oname = os.path.join(
|
|
output_dir, f"{os.path.basename(pdf_path)}_page={pi}.v2.{_}.png"
|
|
)
|
|
print(f"output: {oname}")
|
|
|
|
img.save(oname)
|
|
|
|
elif output_dir is not None and page_num != -1:
|
|
oname = os.path.join(
|
|
output_dir,
|
|
f"{os.path.basename(pdf_path)}_page={page_num}.v2.{_}.png",
|
|
)
|
|
print(f"output: {oname}")
|
|
|
|
img.save(oname)
|
|
|
|
return 0
|
|
|
|
|
|
def main():
|
|
|
|
log_level, version, pdf, interactive, output_dir, page, display_text = parse_args()
|
|
|
|
if version == "v1":
|
|
visualise_v1(log_level, pdf, interactive, output_dir, page, display_text)
|
|
elif version == "v2":
|
|
visualise_v2(log_level, pdf, interactive, output_dir, page, display_text)
|
|
else:
|
|
return -1
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|