mirror of
https://github.com/docling-project/docling-parse.git
synced 2026-05-17 13:10:49 +00:00
b066b26215
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
642 lines
21 KiB
Python
642 lines
21 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Performance harness for page-by-page PDF parsing.
|
|
|
|
Outputs a CSV with rows:
|
|
filename,page_number,elapsed_sec,success,error
|
|
|
|
Parsers supported:
|
|
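- docling (default) — uses docling-parse
- docling-threaded — uses docling-parse's threaded parser to decode the pages of all input PDFs in parallel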
|
|
- pdfplumber
|
|
- pypdfium2 (alias: pypdfium)
|
|
- pymupdf (fitz)
|
|
|
|
Install extras for non-docling parsers only when needed, e.g.:
|
|
pip install .[perf-tools]
|
|
or with uv:
|
|
uv sync --group perf-test
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import csv
|
|
import os
|
|
import sys
|
|
import time
|
|
from dataclasses import dataclass
|
|
from pathlib import Path
|
|
from statistics import mean, median
|
|
from typing import Callable, Iterable, List, Tuple
|
|
from tqdm import tqdm
|
|
from tabulate import tabulate
|
|
|
|
|
|
# -------- Utilities --------
|
|
|
|
|
|
def find_pdfs(path: Path, recursive: bool = False) -> List[Path]:
    """Collect the PDF files designated by *path*.

    Args:
        path: A single file or a directory to scan.
        recursive: When *path* is a directory, also descend into its
            subdirectories.

    Returns:
        A sorted list of paths. A file input yields a one-element list, or an
        empty list when its suffix is not ``.pdf``. Any other input is globbed
        and yields every regular file whose suffix is ``.pdf`` in any letter
        case; a path that does not exist yields an empty list.
    """

    def _is_pdf(candidate: Path) -> bool:
        return candidate.suffix.lower() == ".pdf"

    if path.is_file():
        return [path] if _is_pdf(path) else []

    # Glob every entry and filter on the lower-cased suffix: a "*.pdf" pattern
    # is case-sensitive on POSIX and would skip files such as "REPORT.PDF",
    # which the single-file branch above accepts.
    pattern = "**/*" if recursive else "*"
    return sorted(p for p in path.glob(pattern) if _is_pdf(p) and p.is_file())
|
|
|
|
|
|
def ensure_parent_dir(path: Path) -> None:
    """Create the directory that will contain *path*, with any missing ancestors.

    An already existing directory is accepted silently (``exist_ok=True``).
    """
    parent_dir = path.parent
    parent_dir.mkdir(parents=True, exist_ok=True)
|
|
|
|
|
|
def percentile(values: List[float], p: float) -> float:
    """Return the *p*-th percentile of *values*, interpolating linearly.

    Args:
        values: Samples in any order; the list is not modified.
        p: Percentile on a 0-100 scale.

    Returns:
        ``0.0`` for an empty list, the minimum for ``p <= 0``, the maximum for
        ``p >= 100``, and otherwise the value interpolated between the two
        sorted samples that surround the fractional rank.
    """
    if not values:
        return 0.0
    if p <= 0:
        return min(values)
    if p >= 100:
        return max(values)

    ordered = sorted(values)
    last = len(ordered) - 1
    rank = last * (p / 100.0)  # fractional position inside the sorted samples
    lower = int(rank)
    upper = min(lower + 1, last)
    if lower == upper:
        return ordered[lower]

    # Weight each neighbour by its distance from the opposite side of the rank.
    low_part = ordered[lower] * (upper - rank)
    high_part = ordered[upper] * (rank - lower)
    return low_part + high_part
|
|
|
|
|
|
def fmt_seconds(s: float) -> str:
    """Render a duration in seconds as fixed-point text with six decimals."""
    return "{:.6f}".format(s)
|
|
|
|
|
|
@dataclass
class Row:
    """One measurement record, written as one line of the output CSV."""

    # Path or key of the document the measurement belongs to.
    filename: str
    # Page number; the adapters in this file store -1 for failures that are
    # not tied to a specific page (import, open or load errors).
    page_number: int
    # Measured duration in seconds.
    elapsed_sec: float
    # Whether the operation finished without raising.
    success: bool
    # Error text; empty on success.
    error: str
    # Optional per-key timing breakdown (docling only).
    timings_detail: dict = None

    def __post_init__(self):
        # None is only a placeholder default: every instance gets its own dict
        # so records never share one mutable mapping.
        self.timings_detail = {} if self.timings_detail is None else self.timings_detail
|
|
|
|
|
|
# -------- Parser adapters --------
|
|
|
|
|
|
def _get_docling_static_timing_keys() -> List[str]:
    """Return every static timing key of the C++ pybind module, sorted."""
    # Imported at call time, so loading this module does not require
    # docling-parse.
    from docling_parse.pdf_parsers import get_static_timing_keys  # type: ignore[import]

    static_keys = get_static_timing_keys()
    return sorted(static_keys)
|
|
|
|
|
|
def parse_with_docling(use_bytesio: bool = False) -> Callable[[Path], Iterable[Row]]:
    """Build a per-file runner that times page decoding with docling-parse.

    Args:
        use_bytesio: When true, the runner reads the whole PDF into memory and
            hands the parser a ``BytesIO``; otherwise it passes the file path
            as a string.

    Returns:
        A callable that takes one PDF path and returns one ``Row`` per page,
        each carrying the static timing breakdown in ``timings_detail``. A
        document that cannot be loaded, or whose page count cannot be read,
        yields a single failed row with ``page_number == -1``.
    """

    def _runner(pdf_path: Path) -> Iterable[Row]:
        # Imports are deferred to call time, so importing this module does not
        # require docling.
        from io import BytesIO

        from docling_parse.pdf_parser import DoclingPdfParser
        from docling_parse.pdf_parsers import DecodePageConfig  # type: ignore[import]
        from docling_core.types.doc.page import PdfPageBoundaryType

        timing_keys = _get_docling_static_timing_keys()

        rows: List[Row] = []
        try:
            parser = DoclingPdfParser(loglevel="fatal")
            if use_bytesio:
                source = BytesIO(pdf_path.read_bytes())
            else:
                source = str(pdf_path)
            doc = parser.load(
                source,
                lazy=True,
                boundary_type=PdfPageBoundaryType.CROP_BOX,
            )
            try:
                try:
                    n = doc.number_of_pages()
                except Exception as e:  # pragma: no cover
                    rows.append(Row(str(pdf_path), -1, 0.0, False, f"num_pages: {e}"))
                    return rows

                for page_idx in range(1, n + 1):
                    t0 = time.perf_counter()
                    err = ""
                    ok = True
                    detail: dict = {}
                    try:
                        perf_config = DecodePageConfig()
                        perf_config.keep_char_cells = False
                        perf_config.keep_shapes = False
                        perf_config.keep_bitmaps = False
                        perf_config.create_word_cells = False
                        perf_config.create_line_cells = True
                        _, timings_obj = doc.get_page_with_timings(
                            page_idx,
                            config=perf_config,
                        )
                        static_t = timings_obj.get_static_timings()
                        # Keys missing for this page are recorded as 0.0 so
                        # every row carries the same set of columns.
                        for key in timing_keys:
                            detail[key] = static_t.get(key, 0.0)
                    except Exception as e:  # pragma: no cover
                        ok = False
                        err = str(e)
                        print(f"error: {err}")
                    t1 = time.perf_counter()
                    rows.append(Row(str(pdf_path), page_idx, t1 - t0, ok, err, detail))
            finally:
                # Best-effort cleanup. Placed in ``finally`` so the document is
                # also unloaded on the early return above and on interruption.
                try:
                    doc.unload()
                except Exception:
                    pass

        except Exception as e:  # pragma: no cover
            rows.append(Row(str(pdf_path), -1, 0.0, False, f"load: {e}"))

        return rows

    return _runner
|
|
|
|
|
|
def parse_with_pdfplumber(pdf_path: Path) -> Iterable[Row]:
    """Time ``extract_text()`` on every page of *pdf_path* with pdfplumber.

    Returns one ``Row`` per page (1-based page numbers). When pdfplumber
    cannot be imported the result is a single failed row with page number -1;
    a failure while opening or iterating the document appends such a row to
    whatever was measured so far.
    """
    source_name = str(pdf_path)

    try:
        import pdfplumber  # type: ignore
    except Exception as exc:  # pragma: no cover
        return [Row(source_name, -1, 0.0, False, f"import pdfplumber: {exc}")]

    results: List[Row] = []
    try:
        with pdfplumber.open(source_name) as pdf:
            page_count = len(pdf.pages)
            for page_no in range(1, page_count + 1):
                began = time.perf_counter()
                succeeded, message = True, ""
                try:
                    pdf.pages[page_no - 1].extract_text()  # parse text via pdfminer
                except Exception as exc:  # pragma: no cover
                    succeeded, message = False, str(exc)
                    print(f"error: {message}")

                finished = time.perf_counter()
                results.append(Row(source_name, page_no, finished - began, succeeded, message))
    except Exception as exc:  # pragma: no cover
        results.append(Row(source_name, -1, 0.0, False, f"open: {exc}"))
    return results
|
|
|
|
|
|
def parse_with_pypdfium2(pdf_path: Path) -> Iterable[Row]:
    """Time per-page text extraction of *pdf_path* with pypdfium2.

    For every page the text page is built and the text bounded by each of its
    rectangles is fetched; opening and closing the page handles is part of the
    measured time.

    Returns:
        One ``Row`` per page (1-based page numbers). An import or open failure
        yields a single failed row with page number -1.
    """
    try:
        import pypdfium2 as pdfium  # type: ignore
    except Exception as e:  # pragma: no cover
        return [Row(str(pdf_path), -1, 0.0, False, f"import pypdfium2: {e}")]

    rows: List[Row] = []
    try:
        doc = pdfium.PdfDocument(str(pdf_path))
    except Exception as e:  # pragma: no cover
        return [Row(str(pdf_path), -1, 0.0, False, f"open: {e}")]

    try:
        n = len(doc)
        for i in range(n):
            t0 = time.perf_counter()
            ok = True
            err = ""
            page = None
            text_page = None
            try:
                page = doc[i]
                text_page = page.get_textpage()

                # Fetch the text of every rectangle; the result is discarded,
                # only the extraction work is being timed.
                for rect_idx in range(text_page.count_rects()):
                    rect = text_page.get_rect(rect_idx)
                    text_page.get_text_bounded(*rect)
            except Exception as e:  # pragma: no cover
                ok = False
                err = str(e)
                print(f"error: {err}")
            finally:
                # Close the handles on the failure path as well, so a failed
                # page does not keep them open. Same order as on success:
                # text page first, then the page.
                try:
                    if text_page is not None:
                        text_page.close()
                    if page is not None:
                        page.close()
                except Exception as e:  # pragma: no cover
                    if ok:
                        # A close failure after a clean extraction still marks
                        # the page as failed; otherwise keep the first error.
                        ok = False
                        err = str(e)
                        print(f"error: {err}")

            t1 = time.perf_counter()
            rows.append(Row(str(pdf_path), i + 1, t1 - t0, ok, err))
    finally:
        # best-effort cleanup of the document handle
        try:
            doc.close()
        except Exception:
            pass

    return rows
|
|
|
|
|
|
def parse_with_pymupdf(pdf_path: Path) -> Iterable[Row]:
    """Time plain-text extraction of every page of *pdf_path* with PyMuPDF.

    Each page is loaded by index inside the timed and guarded region, so a
    page that fails to load is reported as a failed row instead of aborting
    the whole run, and page loading is measured like in the pypdfium2 adapter.

    Returns:
        One ``Row`` per page (1-based page numbers). An import or open failure
        yields a single failed row with page number -1.
    """
    try:
        import fitz  # PyMuPDF
    except Exception as e:  # pragma: no cover
        return [Row(str(pdf_path), -1, 0.0, False, f"import pymupdf: {e}")]

    rows: List[Row] = []
    try:
        doc = fitz.open(str(pdf_path))
    except Exception as e:  # pragma: no cover
        return [Row(str(pdf_path), -1, 0.0, False, f"open: {e}")]

    try:
        for i in range(len(doc)):
            t0 = time.perf_counter()
            ok = True
            err = ""
            try:
                page = doc[i]
                _ = page.get_text("text")  # plain text extraction
            except Exception as e:  # pragma: no cover
                ok = False
                err = str(e)
                # Report the failure like the other adapters do.
                print(f"error: {err}")
            t1 = time.perf_counter()
            rows.append(Row(str(pdf_path), i + 1, t1 - t0, ok, err))
    finally:
        # best-effort cleanup of the document handle
        try:
            doc.close()
        except Exception:
            pass

    return rows
|
|
|
|
|
|
def parse_with_docling_threaded(
    num_threads: int = 4,
    max_concurrent_results: int = 64,
) -> Callable[[List[Path]], Tuple[List[Row], float]]:
    """Return a runner that loads *all* PDFs, then decodes pages in parallel.

    Unlike the other adapters this one consumes the full list of files at once
    so that the thread pool can work across documents. It returns
    (rows, wall_time) so the caller can report the true parallel wall time.

    Args:
        num_threads: Passed to ``ThreadedPdfParserConfig`` as ``threads``.
        max_concurrent_results: Passed to ``ThreadedPdfParserConfig`` under
            the same name.

    Returns:
        A callable mapping a list of PDF paths to ``(rows, wall_time)``.
        ``wall_time`` covers only the result iteration, not the loading of the
        documents. A document that fails to load contributes one failed row
        with page number -1. For page rows ``elapsed_sec`` is the time the
        consumer waited for that result since the previous one (or since the
        start of iteration), so the values sum to about ``wall_time``; it is
        not the decode time of the individual page.
    """

    def _runner(pdf_paths: List[Path]) -> Tuple[List[Row], float]:
        from docling_parse.pdf_parser import (
            DoclingThreadedPdfParser,
            ThreadedPdfParserConfig,
        )
        from docling_parse.pdf_parsers import DecodePageConfig  # type: ignore[import]

        decode_config = DecodePageConfig()
        decode_config.keep_char_cells = False
        decode_config.keep_shapes = False
        decode_config.keep_bitmaps = False
        decode_config.create_word_cells = False
        decode_config.create_line_cells = True

        parser_config = ThreadedPdfParserConfig(
            loglevel="fatal",
            threads=num_threads,
            max_concurrent_results=max_concurrent_results,
        )

        parser = DoclingThreadedPdfParser(
            parser_config=parser_config,
            decode_config=decode_config,
        )

        rows: List[Row] = []
        for pdf_path in pdf_paths:
            try:
                parser.load(str(pdf_path))
            except Exception as e:
                # Record the failure like the single-threaded docling adapter
                # does; otherwise the document would vanish from the output.
                rows.append(Row(str(pdf_path), -1, 0.0, False, f"load: {e}"))

        wall_start = time.perf_counter()
        previous = wall_start

        for task in parser.iterate_results():
            # Two back-to-back clock reads would always measure ~0, so the
            # interval since the previous result is recorded instead.
            now = time.perf_counter()
            elapsed = now - previous
            previous = now

            if task.success:
                detail = dict(task.get_timings().items())
                rows.append(
                    Row(
                        filename=task.doc_key,
                        page_number=task.page_number,
                        elapsed_sec=elapsed,
                        success=True,
                        error="",
                        timings_detail=detail,
                    )
                )
            else:
                rows.append(
                    Row(
                        filename=task.doc_key,
                        page_number=task.page_number,
                        elapsed_sec=elapsed,
                        success=False,
                        error=task.error_message,
                    )
                )

        wall_end = time.perf_counter()
        return rows, wall_end - wall_start

    return _runner
|
|
|
|
|
|
# Adapters that take a single PDF path, keyed by the name accepted on the
# command line.
NON_DOCLING_PARSERS: dict[str, Callable[[Path], Iterable[Row]]] = {
    "pdfplumber": parse_with_pdfplumber,
    "pypdfium2": parse_with_pypdfium2,
    "pypdfium": parse_with_pypdfium2,  # alias
    "pymupdf": parse_with_pymupdf,
}

# Every selectable parser name: the two docling variants plus the adapters
# above, in sorted order.
ALL_PARSER_NAMES = sorted({"docling", "docling-threaded", *NON_DOCLING_PARSERS})
|
|
|
|
|
|
# -------- Main program --------
|
|
|
|
|
|
def compute_stats(rows: List[Row]) -> dict:
    """Aggregate page timings over all rows into one summary dict.

    Only rows with a positive page number count as pages, and only the
    successful ones contribute to the timing figures. ``files`` counts the
    distinct file names over all rows, failed ones included. Every timing
    figure is 0.0 when no page succeeded.
    """
    page_rows = [r for r in rows if r.page_number > 0]
    durations = [r.elapsed_sec for r in page_rows if r.success]
    pages_total = len(page_rows)
    pages_ok = len(durations)
    distinct_files = {r.filename for r in rows}

    return {
        "files": len(distinct_files),
        "pages_total": pages_total,
        "pages_ok": pages_ok,
        "pages_failed": pages_total - pages_ok,
        "time_total_sec": sum(durations),
        "time_avg_sec": mean(durations) if durations else 0.0,
        "p50_sec": percentile(durations, 50),
        "p90_sec": percentile(durations, 90),
        "p95_sec": percentile(durations, 95),
        "p99_sec": percentile(durations, 99),
        "min_sec": min(durations) if durations else 0.0,
        "max_sec": max(durations) if durations else 0.0,
    }
|
|
|
|
|
|
def print_stats(stats: dict, parser_name: str) -> None:
    """Print the summary produced by ``compute_stats`` for one parser.

    Counts are printed as they are; every duration goes through
    ``fmt_seconds``.
    """
    total = fmt_seconds(stats['time_total_sec'])
    average = fmt_seconds(stats['time_avg_sec'])
    p50 = fmt_seconds(stats['p50_sec'])
    p90 = fmt_seconds(stats['p90_sec'])
    p95 = fmt_seconds(stats['p95_sec'])
    p99 = fmt_seconds(stats['p99_sec'])
    fastest = fmt_seconds(stats['min_sec'])
    slowest = fmt_seconds(stats['max_sec'])

    report = [
        "",
        f"Summary for parser={parser_name}",
        f" - files: {stats['files']}",
        f" - pages total: {stats['pages_total']}",
        f" - pages ok: {stats['pages_ok']}",
        f" - pages failed: {stats['pages_failed']}",
        f" - total sec: {total}",
        f" - avg sec/page: {average}",
        f" - p50: {p50} p90: {p90} p95: {p95} p99: {p99}",
        f" - min: {fastest} max: {slowest}",
    ]
    for line in report:
        print(line)
|
|
|
|
|
|
def compute_per_document_stats(rows: List[Row]) -> List[dict]:
    """Summarize page timings separately for each document.

    Rows without a positive page number are ignored. ``pages`` counts all
    page rows of a document, while the timing figures use its successful
    pages only and are 0.0 when it has none.

    Returns:
        One dict per document, ordered by file name.
    """
    page_counts: dict[str, int] = {}
    ok_durations: dict[str, List[float]] = {}

    for row in rows:
        if row.page_number <= 0:
            continue
        page_counts[row.filename] = page_counts.get(row.filename, 0) + 1
        if row.success:
            ok_durations.setdefault(row.filename, []).append(row.elapsed_sec)

    summaries: List[dict] = []
    # Every file with a successful page also has a page count, so the page
    # counts alone enumerate all documents.
    for name in sorted(page_counts):
        durations = ok_durations.get(name, [])
        summary = {
            "document": name,
            "pages": page_counts[name],
            "total": sum(durations) if durations else 0.0,
            "mean": mean(durations) if durations else 0.0,
            "median": median(durations) if durations else 0.0,
            "min": min(durations) if durations else 0.0,
            "max": max(durations) if durations else 0.0,
            "p90": percentile(durations, 90),
            "p95": percentile(durations, 95),
            "p99": percentile(durations, 99),
        }
        summaries.append(summary)
    return summaries
|
|
|
|
|
|
def print_per_document_table(rows: List[Row]) -> None:
    """Print the per-document statistics as a text table.

    Prints a short notice instead when there is nothing to tabulate.
    """
    per_doc = compute_per_document_stats(rows)
    if not per_doc:
        print("\nNo per-document stats to display (no successful pages).")
        return

    # Columns holding durations; each one is rendered through fmt_seconds.
    timing_columns = ("total", "mean", "median", "min", "max", "p90", "p95", "p99")
    headers = ["document", "pages", *timing_columns]
    table_rows = [
        [entry["document"], entry["pages"], *(fmt_seconds(entry[col]) for col in timing_columns)]
        for entry in per_doc
    ]

    print("\nPer-document statistics (sec/page):")
    print(tabulate(table_rows, headers=headers))
|
|
|
|
|
|
def write_per_document_csv(rows: List[Row], out_path: Path) -> Path:
    """Write the per-document statistics next to the main result CSV.

    Args:
        rows: Measurement rows to summarize.
        out_path: Path of the main CSV; the per-document file reuses its
            directory, stem and suffix with ``_per_doc`` appended to the stem.

    Returns:
        The path of the file that was written.
    """
    per_doc = compute_per_document_stats(rows)
    per_doc_path = out_path.with_name(out_path.stem + "_per_doc" + out_path.suffix)

    timing_columns = ("total", "mean", "median", "min", "max", "p90", "p95", "p99")
    # Explicit UTF-8: document names may contain characters that the platform
    # default encoding cannot represent, which would abort the write.
    with per_doc_path.open("w", newline="", encoding="utf-8") as f:
        w = csv.writer(f)
        w.writerow(["basename", "document", "pages", *timing_columns])
        for s in per_doc:
            w.writerow(
                [
                    Path(s["document"]).name,
                    s["document"],
                    s["pages"],
                    *(fmt_seconds(s[col]) for col in timing_columns),
                ]
            )
    return per_doc_path
|
|
|
|
|
|
def _get_timing_keys_from_rows(rows: List[Row]) -> List[str]:
    """Return the sorted union of the timing detail keys found in *rows*."""
    return sorted({key for row in rows for key in row.timings_detail})
|
|
|
|
|
|
def print_timing_breakdown(rows: List[Row], timing_keys: List[str]) -> None:
    """Print total, per-page average and share of elapsed time for each key.

    Only successful page rows that carry a timing breakdown are considered;
    nothing is printed when there are none.
    """
    ok_rows = [r for r in rows if r.page_number > 0 and r.success and r.timings_detail]
    if not ok_rows:
        return

    page_count = len(ok_rows)
    elapsed_sum = sum(r.elapsed_sec for r in ok_rows)

    def _line(key: str) -> list:
        # A key missing from a row counts as 0.0 for that row.
        key_sum = sum(r.timings_detail.get(key, 0.0) for r in ok_rows)
        share = (key_sum / elapsed_sum * 100.0) if elapsed_sum > 0 else 0.0
        return [key, fmt_seconds(key_sum), fmt_seconds(key_sum / page_count), f"{share:.2f}%"]

    headers = ["timing_key", "total_sec", "avg_sec", "avg_%"]
    table_rows = [_line(key) for key in timing_keys]

    print("\nTiming breakdown (static keys, across all successful pages):")
    print(tabulate(table_rows, headers=headers))
|
|
|
|
|
|
def default_output_path(parser_name: str) -> Path:
    """Build the default CSV path ``perf/results/perf_<parser>_<timestamp>.csv``.

    The timestamp is the current local time formatted as ``YYYYmmdd-HHMMSS``.
    """
    stamp = time.strftime("%Y%m%d-%H%M%S")
    file_name = f"perf_{parser_name}_{stamp}.csv"
    return Path("perf", "results", file_name)
|
|
|
|
|
|
def main(argv: List[str]) -> int:
    """Run the benchmark described by *argv* and write the result CSVs.

    Args:
        argv: Command-line arguments without the program name.

    Returns:
        0 after a completed run, 2 when no PDF was found. Invalid arguments
        end the process through argparse's own error exit.
    """
    ap = argparse.ArgumentParser(description="Page-level PDF parsing perf harness")
    ap.add_argument("input", help="Path to a PDF file or directory of PDFs")
    ap.add_argument(
        "--parser",
        "-p",
        default="docling",
        choices=ALL_PARSER_NAMES,
        # Built from the real choices so the help cannot drift out of date
        # (the hard-coded list omitted docling-threaded).
        help=f"Parser backend to benchmark ({', '.join(ALL_PARSER_NAMES)})",
    )
    ap.add_argument(
        "--recursive",
        "-r",
        action="store_true",
        help="Recurse into subdirectories when input is a directory",
    )
    ap.add_argument(
        "--output",
        "-o",
        type=str,
        default=None,
        help="Output CSV path. Defaults to perf/results/perf_<parser>_<timestamp>.csv",
    )
    ap.add_argument(
        "--limit",
        "-l",
        type=int,
        default=None,
        help="Maximum number of documents to process",
    )
    ap.add_argument(
        "--bytesio",
        action="store_true",
        help="(docling only) Read PDFs into memory and pass as BytesIO instead of file path",
    )
    ap.add_argument(
        "--threads",
        "-t",
        type=int,
        default=4,
        help="(docling-threaded only) Number of worker threads (default: 4)",
    )
    ap.add_argument(
        "--max-concurrent-results",
        type=int,
        default=64,
        help="(docling-threaded only) Max buffered results before workers pause (default: 64)",
    )

    args = ap.parse_args(argv)

    # A negative limit would slice from the end and silently drop documents.
    if args.limit is not None and args.limit < 0:
        ap.error("--limit must be >= 0")

    parser_key = args.parser
    if parser_key == "docling":
        parser_fn = parse_with_docling(use_bytesio=args.bytesio)
    elif parser_key == "docling-threaded":
        if args.bytesio:
            ap.error("--bytesio is not supported with --parser docling-threaded")
        # handled separately below
        parser_fn = None
    else:
        if args.bytesio:
            ap.error("--bytesio is only supported with --parser docling")
        parser_fn = NON_DOCLING_PARSERS[parser_key]

    input_path = Path(args.input)
    pdfs = find_pdfs(input_path, recursive=args.recursive)

    if args.limit is not None:
        pdfs = pdfs[:args.limit]

    if not pdfs:
        print(f"No PDFs found at {input_path}", file=sys.stderr)
        return 2

    out_path = Path(args.output) if args.output else default_output_path(parser_key)
    ensure_parent_dir(out_path)

    rows: List[Row] = []
    started = time.perf_counter()

    if parser_key == "docling-threaded":
        threaded_fn = parse_with_docling_threaded(
            num_threads=args.threads,
            max_concurrent_results=args.max_concurrent_results,
        )
        print(f"Loading {len(pdfs)} PDFs and parsing with {args.threads} threads ...")
        rows, wall_time = threaded_fn(pdfs)
        # Report the wall time measured by the runner, not this call's span.
        ended = started + wall_time
    else:
        for pdf in tqdm(pdfs, desc=f"Parsing PDFs with {parser_key}"):
            rows.extend(parser_fn(pdf))
        ended = time.perf_counter()

    # Collect timing detail keys from the rows (docling only)
    timing_keys = _get_timing_keys_from_rows(rows)

    # Write CSV. Explicit UTF-8: file names and parser error messages may
    # contain characters the platform default encoding cannot represent.
    with out_path.open("w", newline="", encoding="utf-8") as f:
        w = csv.writer(f)
        header = ["filename", "page_number", "elapsed_sec", "success", "error"]
        for key in timing_keys:
            header.append(key)
            header.append(f"{key}_%")
        w.writerow(header)
        for r in rows:
            row_data = [r.filename, r.page_number, f"{r.elapsed_sec:.9f}", int(r.success), r.error]
            for key in timing_keys:
                val = r.timings_detail.get(key, 0.0)
                # Share of this row's elapsed time; 0 when nothing was measured.
                pct = (val / r.elapsed_sec * 100.0) if r.elapsed_sec > 0 else 0.0
                row_data.append(f"{val:.9f}")
                row_data.append(f"{pct:.2f}")
            w.writerow(row_data)

    # Print summary
    stats = compute_stats(rows)
    print_stats(stats, parser_key)
    if timing_keys:
        print_timing_breakdown(rows, timing_keys)
    print_per_document_table(rows)
    per_doc_path = write_per_document_csv(rows, out_path)
    print(f"\nWrote: {out_path}")
    print(f"Wrote: {per_doc_path}")
    print(f"Total wall time: {fmt_seconds(ended - started)} sec")

    return 0
|
|
|
|
|
|
# Script entry point: the return value of main() becomes the exit status.
if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))
|