mirror of
https://github.com/docling-project/docling-parse.git
synced 2026-05-17 13:10:49 +00:00
de18986f03
Signed-off-by: Peter Staar <taa@zurich.ibm.com> Signed-off-by: Peter W. J. Staar <91719829+PeterStaar-IBM@users.noreply.github.com> Co-authored-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>
356 lines
9.1 KiB
Python
356 lines
9.1 KiB
Python
import argparse
|
|
import hashlib
|
|
import logging
|
|
import os
|
|
from typing import Dict, Optional
|
|
|
|
from docling_parse.pdf_parser import DoclingPdfParser, ParsedPdfPage, PdfDocument
|
|
from docling_parse.pdf_parsers import ( # type: ignore[import]
|
|
pdf_parser_v1,
|
|
pdf_parser_v2,
|
|
)
|
|
from docling_parse.utils import create_pil_image_of_page_v1, create_pil_image_of_page_v2
|
|
|
|
# Configure logging
|
|
logging.basicConfig(
|
|
level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
|
|
)
|
|
|
|
|
|
def parse_args():
|
|
parser = argparse.ArgumentParser(description="Process a PDF file.")
|
|
|
|
# Restrict log-level to specific values
|
|
parser.add_argument(
|
|
"-l",
|
|
"--log-level",
|
|
type=str,
|
|
choices=["info", "warning", "error", "fatal"],
|
|
required=False,
|
|
default="error",
|
|
help="Log level [info, warning, error, fatal]",
|
|
)
|
|
|
|
# Restrict version to specific values
|
|
parser.add_argument(
|
|
"-v",
|
|
"--version",
|
|
type=str,
|
|
choices=["py", "v1", "v2"],
|
|
required=False,
|
|
default="py",
|
|
help="Version [v1, v2]",
|
|
)
|
|
|
|
# Restrict page-boundary
|
|
parser.add_argument(
|
|
"-b",
|
|
"--page-boundary",
|
|
type=str,
|
|
choices=["crop_box", "media_box"],
|
|
required=False,
|
|
default="crop_box",
|
|
help="page-boundary [crop_box, media_box]",
|
|
)
|
|
|
|
# Restrict page-boundary
|
|
parser.add_argument(
|
|
"-c",
|
|
"--category",
|
|
type=str,
|
|
choices=["both", "original", "sanitized"],
|
|
required=True,
|
|
default="both",
|
|
help="category [`both`, `original`, `sanitized`]",
|
|
)
|
|
|
|
# Add an argument for the path to the PDF file
|
|
parser.add_argument(
|
|
"-i", "--input-pdf", type=str, help="Path to the PDF file", required=True
|
|
)
|
|
|
|
# Add an optional boolean argument for interactive mode
|
|
parser.add_argument(
|
|
"--interactive",
|
|
action="store_true",
|
|
help="Enable interactive mode (default: False)",
|
|
)
|
|
|
|
# Add an optional boolean argument for interactive mode
|
|
parser.add_argument(
|
|
"--display-text",
|
|
action="store_true",
|
|
help="Enable interactive mode (default: False)",
|
|
)
|
|
|
|
# Add an argument for the output directory, defaulting to "./tmp"
|
|
parser.add_argument(
|
|
"-o",
|
|
"--output-dir",
|
|
type=str,
|
|
required=False,
|
|
default=None,
|
|
help="Path to the output directory (default: None)",
|
|
)
|
|
|
|
# Add an argument for the output directory, defaulting to "./tmp"
|
|
parser.add_argument(
|
|
"-p",
|
|
"--page",
|
|
type=int,
|
|
required=False,
|
|
default=-1,
|
|
help="page to be displayed (default: -1 -> all)",
|
|
)
|
|
|
|
# Parse the command-line arguments
|
|
args = parser.parse_args()
|
|
|
|
# Check if the PDF file exists
|
|
assert os.path.exists(args.input_pdf), f"PDF file does not exist: {args.input_pdf}"
|
|
|
|
# Check if the output directory exists, create it if not
|
|
if (args.output_dir is not None) and (not os.path.exists(args.output_dir)):
|
|
os.makedirs(args.output_dir)
|
|
logging.info(f"Output directory '{args.output_dir}' created.")
|
|
|
|
return (
|
|
args.log_level,
|
|
args.version,
|
|
args.input_pdf,
|
|
args.interactive,
|
|
args.output_dir,
|
|
int(args.page),
|
|
args.display_text,
|
|
args.page_boundary,
|
|
args.category,
|
|
)
|
|
|
|
|
|
def visualise_v1(
|
|
log_level: str,
|
|
pdf_path: str,
|
|
interactive: str,
|
|
output_dir: str,
|
|
page_num: int,
|
|
display_text: bool,
|
|
):
|
|
|
|
parser = pdf_parser_v1()
|
|
parser.set_loglevel_with_label(log_level)
|
|
|
|
doc_key = "key"
|
|
success = parser.load_document(doc_key, pdf_path)
|
|
|
|
if success == False:
|
|
return
|
|
|
|
doc = None
|
|
|
|
if page_num == -1:
|
|
doc = parser.parse_pdf_from_key(doc_key)
|
|
else:
|
|
doc = parser.parse_pdf_from_key_on_page(doc_key, page_num)
|
|
|
|
parser.unload_document(doc_key)
|
|
|
|
for pi, page in enumerate(doc["pages"]):
|
|
|
|
img = create_pil_image_of_page_v1(page)
|
|
|
|
if interactive:
|
|
img.show()
|
|
|
|
if output_dir is not None and page_num == -1:
|
|
oname = os.path.join(
|
|
output_dir, f"{os.path.basename(pdf_path)}_page={pi}.v1.png"
|
|
)
|
|
logging.info(f"output: {oname}")
|
|
|
|
img.save(oname)
|
|
|
|
elif output_dir is not None and page_num != -1:
|
|
oname = os.path.join(
|
|
output_dir, f"{os.path.basename(pdf_path)}_page={page_num}.v1.png"
|
|
)
|
|
logging.info(f"output: {oname}")
|
|
|
|
img.save(oname)
|
|
|
|
|
|
def visualise_v2(
|
|
log_level: str,
|
|
pdf_path: str,
|
|
interactive: str,
|
|
output_dir: str,
|
|
page_num: int,
|
|
display_text: bool,
|
|
page_boundary: str = "crop_box", # media_box
|
|
category: str = "both", # "both", "sanitized", "original"
|
|
):
|
|
categories = []
|
|
if category == "both":
|
|
categories = ["sanitized", "original"]
|
|
else:
|
|
categories = [category]
|
|
|
|
parser = pdf_parser_v2(log_level)
|
|
# parser.set_loglevel_with_label(log_level)
|
|
|
|
hash_obj = hashlib.sha256(str(pdf_path).encode())
|
|
doc_key = str(hash_obj.hexdigest())
|
|
|
|
# doc_key = "key"
|
|
logging.info(f"{doc_key}: {pdf_path}")
|
|
|
|
success = parser.load_document(doc_key, pdf_path)
|
|
|
|
if success == False:
|
|
return
|
|
|
|
logging.info(f"page_boundary: {page_boundary}")
|
|
|
|
doc: Optional[Dict] = None
|
|
|
|
try:
|
|
if page_num == -1:
|
|
doc = parser.parse_pdf_from_key(doc_key, page_boundary)
|
|
else:
|
|
doc = parser.parse_pdf_from_key_on_page(doc_key, page_num, page_boundary)
|
|
|
|
except Exception as exc:
|
|
logging.info(f"Could not parse pdf-document: {exc}")
|
|
doc = None
|
|
|
|
if doc is None:
|
|
return
|
|
|
|
parser.unload_document(doc_key)
|
|
|
|
for pi, page in enumerate(doc.get("pages", [])):
|
|
|
|
for category in categories:
|
|
|
|
img = create_pil_image_of_page_v2(
|
|
page, category=category, draw_cells_text=display_text
|
|
)
|
|
|
|
if interactive:
|
|
img.show()
|
|
|
|
if output_dir is not None and page_num == -1:
|
|
oname = os.path.join(
|
|
output_dir,
|
|
f"{os.path.basename(pdf_path)}_page={pi}.v2.{category}.png",
|
|
)
|
|
logging.info(f"output: {oname}")
|
|
|
|
img.save(oname)
|
|
|
|
elif output_dir is not None and page_num != -1:
|
|
oname = os.path.join(
|
|
output_dir,
|
|
f"{os.path.basename(pdf_path)}_page={pi}.v2.{category}.png",
|
|
)
|
|
logging.info(f"output: {oname}")
|
|
|
|
img.save(oname)
|
|
|
|
return 0
|
|
|
|
|
|
def visualise_py(
|
|
log_level: str,
|
|
pdf_path: str,
|
|
interactive: str,
|
|
output_dir: str,
|
|
page_num: int,
|
|
display_text: bool,
|
|
page_boundary: str = "crop_box", # media_box
|
|
category: str = "both", # "both", "sanitized", "original"
|
|
):
|
|
parser = DoclingPdfParser(loglevel=log_level)
|
|
|
|
pdf_doc: PdfDocument = parser.load(path_or_stream=pdf_path, lazy=True)
|
|
|
|
pdf_page: ParsedPdfPage = pdf_doc.get_page(page_no=page_num)
|
|
|
|
if category == "both":
|
|
pdf_page.original.render(
|
|
draw_cells_bbox=(not display_text), draw_cells_text=display_text
|
|
).show()
|
|
pdf_page.sanitized.render(
|
|
draw_cells_bbox=(not display_text), draw_cells_text=display_text
|
|
).show()
|
|
elif category == "sanitized":
|
|
pdf_page.sanitized.render(
|
|
draw_cells_bbox=(not display_text), draw_cells_text=display_text
|
|
).show()
|
|
elif category == "original":
|
|
pdf_page.original.render(
|
|
draw_cells_bbox=(not display_text), draw_cells_text=display_text
|
|
).show()
|
|
|
|
lines = pdf_page.sanitized.export_to_textlines(add_fontkey=True)
|
|
print("text-lines: \n", "\n".join(lines))
|
|
|
|
"""
|
|
lines = pdf_page.original.export_to_textlines(add_fontkey=True)
|
|
print("\n".join(lines))
|
|
"""
|
|
|
|
|
|
def main():
|
|
|
|
(
|
|
log_level,
|
|
version,
|
|
pdf_path,
|
|
interactive,
|
|
output_dir,
|
|
page_num,
|
|
display_text,
|
|
page_boundary,
|
|
category,
|
|
) = parse_args()
|
|
|
|
logging.info(f"page_boundary: {page_boundary}")
|
|
|
|
if version == "v1":
|
|
visualise_v1(
|
|
log_level=log_level,
|
|
pdf_path=pdf_path,
|
|
interactive=interactive,
|
|
output_dir=output_dir,
|
|
page_num=page_num,
|
|
display_text=display_text,
|
|
)
|
|
elif version == "v2":
|
|
visualise_v2(
|
|
log_level=log_level,
|
|
pdf_path=pdf_path,
|
|
interactive=interactive,
|
|
output_dir=output_dir,
|
|
page_num=page_num,
|
|
display_text=display_text,
|
|
page_boundary=page_boundary,
|
|
category=category,
|
|
)
|
|
elif version == "py":
|
|
visualise_py(
|
|
log_level=log_level,
|
|
pdf_path=pdf_path,
|
|
interactive=interactive,
|
|
output_dir=output_dir,
|
|
page_num=page_num,
|
|
display_text=display_text,
|
|
page_boundary=page_boundary,
|
|
category=category,
|
|
)
|
|
else:
|
|
return -1
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|