mirror of
https://github.com/docling-project/docling.git
synced 2026-05-17 13:10:38 +00:00
c23622f6f5
* docs: add agent skill bundle with convert/evaluate helpers - Add docs/examples/agent_skill/docling-document-intelligence/ with SKILL.md, pipelines.md, EXAMPLE.md, improvement-log template, and scripts/docling-convert.py + docling-evaluate.py (standard/vlm-local/vlm-api). - Document InputFormat.PDF + PdfFormatOption for explicit PdfPipelineOptions. - Link from examples index and mkdocs nav. Made-with: Cursor * docs: align agent skill README and EXAMPLE with Cursor bundle - Document both ~/.cursor/skills and docs/examples paths. - README notes repo parity for PRs and local installs. Made-with: Cursor * DCO Remediation Commit for jehlum11 <jehlum11@gmail.com> I, jehlum11 <jehlum11@gmail.com>, hereby add my Signed-off-by to this commit:2d268ffb6fI, jehlum11 <jehlum11@gmail.com>, hereby add my Signed-off-by to this commit:041e709c66Signed-off-by: jehlum11 <jehlum11@gmail.com> Made-with: Cursor * docs: refactor agent skill to use docling CLI for conversion Address maintainer feedback: the custom docling-convert.py script was largely redundant with the existing docling CLI. This commit: - Removes scripts/docling-convert.py (redundant with `docling` CLI) - Refactors SKILL.md (v1.4 → v2.0) to use `docling` CLI for all conversion tasks, reserving the Python API only for features the CLI does not expose (chunking, VLM API endpoint config, force_backend_text hybrid mode) - Updates docling-evaluate.py recommended_actions to reference `docling` CLI flags instead of the removed script - Updates README.md, EXAMPLE.md, pipelines.md to use `docling` CLI examples throughout - Simplifies requirements.txt (removes packaging dependency) The only custom script retained is docling-evaluate.py, which provides heuristic quality evaluation — functionality the CLI does not cover. 
Signed-off-by: jehlum11 <jehlum11@gmail.com> Made-with: Cursor * docs: fix ruff format on docling-evaluate.py Signed-off-by: jehlum11 <jehlum11@gmail.com> Made-with: Cursor --------- Signed-off-by: jehlum11 <jehlum11@gmail.com>
297 lines
9.4 KiB
Python
Vendored
297 lines
9.4 KiB
Python
Vendored
#!/usr/bin/env python3
|
|
# SPDX-License-Identifier: MIT
|
|
"""
|
|
Evaluate a Docling JSON export and suggest pipeline / option changes.
|
|
|
|
Typical flow (agent or human):
|
|
|
|
docling input.pdf --to json --output /tmp/
|
|
docling input.pdf --to md --output /tmp/
|
|
python3 scripts/docling-evaluate.py /tmp/input.json --markdown /tmp/input.md
|
|
|
|
Exit codes: 0 = pass; 1 = fail or --fail-on-warn with status warn
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
import sys
|
|
from collections import Counter
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
|
|
def load_document(path: Path):
    """Read a JSON export and try to turn it into a ``DoclingDocument``.

    Args:
        path: Location of the UTF-8 encoded JSON file.

    Returns:
        A ``(document, payload)`` pair. ``payload`` is the parsed JSON value.
        ``document`` is the validated ``DoclingDocument``, or ``None`` when
        importing ``docling_core`` or validating the payload raises.

    Raises:
        Whatever ``Path.read_text`` / ``json.loads`` raise; reading and
        parsing happen outside the best-effort section.
    """
    raw_text = path.read_text(encoding="utf-8")
    payload = json.loads(raw_text)

    document = None
    try:
        # Imported lazily so the script still works (in heuristic mode)
        # when docling-core is not importable.
        from docling_core.types.doc.document import DoclingDocument

        document = DoclingDocument.model_validate(payload)
    except Exception:
        # Best effort: any import/validation problem means "no document".
        document = None
    return document, payload
|
|
|
|
|
|
def page_numbers_from_doc(doc) -> set[int]:
    """Return the distinct page numbers found in item provenance.

    Walks ``doc.iterate_items()`` and, for every provenance entry of every
    item, collects its ``page_no`` (converted with ``int``). Items without a
    ``prov`` attribute, and provenance entries whose ``page_no`` is missing
    or ``None``, contribute nothing.
    """
    found: set[int] = set()
    for item, _level in doc.iterate_items():
        provenance = getattr(item, "prov", None) or []
        page_values = (getattr(entry, "page_no", None) for entry in provenance)
        found.update(int(value) for value in page_values if value is not None)
    return found
|
|
|
|
|
|
def collect_text_samples(doc, limit: int = 200) -> list[str]:
    """Gather stripped, non-empty item texts in iteration order.

    Args:
        doc: Object exposing ``iterate_items()`` that yields 2-tuples whose
            first element is the item.
        limit: Iteration stops once this many samples have been collected.

    Returns:
        The collected texts, each converted with ``str`` and stripped of
        surrounding whitespace.
    """
    samples: list[str] = []
    for item, _level in doc.iterate_items():
        candidate = getattr(item, "text", None)
        cleaned = str(candidate).strip() if candidate else ""
        if cleaned:
            samples.append(cleaned)
        if len(samples) >= limit:
            break
    return samples
|
|
|
|
|
|
def metrics_from_doc(doc) -> dict[str, Any]:
    """Compute quality metrics from a validated document object.

    Args:
        doc: Object exposing ``iterate_items()``, optional ``tables`` /
            ``pictures`` collections and ``export_to_markdown()``.

    Returns:
        A dict with these keys:

        * ``page_count``: number of distinct pages seen in item provenance.
        * ``section_headers``: items whose label name is ``SECTION_HEADER``.
        * ``text_items`` / ``total_text_chars``: count and summed length of
          items with a truthy ``text``.
        * ``chars_per_page``: ``total_text_chars / page_count`` rounded to two
          decimals, or the raw character total when no page is known.
        * ``tables`` / ``pictures``: sizes of the respective collections.
        * ``markdown_chars``: length of the markdown export (0 if it failed).
        * ``replacement_chars``: number of U+FFFD characters found.
        * ``most_repeated_text_count``: occurrences of the most common sample.
        * ``duplicate_heavy``: whether repeated samples dominate (see below).
    """
    n_tables = len(getattr(doc, "tables", []) or [])
    n_pictures = len(getattr(doc, "pictures", []) or [])

    n_headers = 0
    n_text_items = 0
    total_chars = 0
    for item, _ in doc.iterate_items():
        label = getattr(getattr(item, "label", None), "name", None) or ""
        if label == "SECTION_HEADER":
            n_headers += 1
        t = getattr(item, "text", None)
        if t:
            n_text_items += 1
            total_chars += len(str(t))

    n_pages = len(page_numbers_from_doc(doc))
    # Without page provenance there is nothing to divide by, so the raw
    # character total is reported instead of a per-page figure.
    density = (total_chars / n_pages) if n_pages else total_chars

    samples = collect_text_samples(doc)
    rep = Counter(samples)
    most_repeated = rep.most_common(1)[0][1] if rep else 0
    # Occurrences belonging to texts seen more than twice, relative to the
    # number of distinct texts.
    dup_ratio = sum(c for c in rep.values() if c > 2) / max(len(rep), 1)

    md = ""
    try:
        md = doc.export_to_markdown()
    except Exception:
        # Best effort: a failed export only disables markdown-based metrics.
        pass

    # The markdown export and the sampled item texts are two views of the
    # same content, so adding their U+FFFD counts would count every
    # replacement character twice. Take the larger of the two instead; the
    # samples still cover the case where the markdown export failed.
    replacement = max(
        md.count("\ufffd"),
        sum(str(t).count("\ufffd") for t in samples),
    )

    return {
        "page_count": n_pages,
        "section_headers": n_headers,
        "text_items": n_text_items,
        "total_text_chars": total_chars,
        "chars_per_page": round(density, 2),
        "tables": n_tables,
        "pictures": n_pictures,
        "markdown_chars": len(md),
        "replacement_chars": replacement,
        "most_repeated_text_count": int(most_repeated),
        "duplicate_heavy": dup_ratio > 0.15 and len(samples) > 10,
    }
|
|
|
|
|
|
def heuristic_metrics(data: dict) -> dict[str, Any]:
    """Fallback when DoclingDocument cannot be validated (older export / drift).

    Works directly on the raw parsed JSON and never raises on unexpected
    shapes: a non-object top-level value, or ``texts`` / ``tables`` /
    ``pictures`` fields that are not lists, are treated as empty.

    Args:
        data: Parsed JSON export (normally a dict).

    Returns:
        A metrics dict with the same keys as the full metrics, where values
        that cannot be derived from raw JSON here are zero/``False``, plus
        ``heuristic_only`` (always ``True``) and ``body_children`` (length of
        ``body.children`` when that is a list, else 0).
    """
    # Valid JSON is not necessarily an object; this path exists precisely
    # for inputs that failed validation, so tolerate anything.
    if not isinstance(data, dict):
        data = {}

    def _list_field(key: str) -> list:
        """Return ``data[key]`` if it is a list, otherwise an empty list."""
        value = data.get(key)
        return value if isinstance(value, list) else []

    texts = _list_field("texts")
    tables = _list_field("tables")
    pictures = _list_field("pictures")

    body = data.get("body")
    children = body.get("children") if isinstance(body, dict) else None
    n_children = len(children) if isinstance(children, list) else 0

    # Only dict-shaped text entries carry a "text" field worth measuring.
    char_sum = sum(
        len(str(t.get("text") or "")) for t in texts if isinstance(t, dict)
    )

    return {
        "page_count": 0,
        "section_headers": 0,
        "text_items": len(texts),
        "total_text_chars": char_sum,
        "chars_per_page": 0.0,
        "tables": len(tables),
        "pictures": len(pictures),
        "markdown_chars": 0,
        "replacement_chars": 0,
        "most_repeated_text_count": 0,
        "duplicate_heavy": False,
        "heuristic_only": True,
        "body_children": n_children,
    }
|
|
|
|
|
|
def evaluate(
    m: dict[str, Any],
    *,
    expect_tables: bool,
    min_chars_per_page: float,
    min_markdown_chars: int,
) -> tuple[str, list[str], list[str]]:
    """Turn a metrics dict into a status, a list of issues and retry advice.

    Args:
        m: Metrics dict; missing keys are read as 0 / falsy.
        expect_tables: Flag an issue when no table was detected.
        min_chars_per_page: Density floor, applied when at least two pages
            are known.
        min_markdown_chars: Length floor for a non-empty markdown export when
            at least one page is known.

    Returns:
        ``(status, issues, actions)``. ``status`` is ``"pass"`` when nothing
        was flagged (both lists are then empty); otherwise ``"fail"`` when
        there are no text items, when a non-empty markdown export is shorter
        than 50 characters on a document with at least one page, or when
        more than 20 replacement characters were counted; else ``"warn"``.
        ``actions`` keeps first-seen order with duplicates removed.
    """
    issues: list[str] = []
    actions: list[str] = []

    def report(problem: str, *remedies: str) -> None:
        # Record one issue together with the actions suggested for it.
        issues.append(problem)
        actions.extend(remedies)

    page_count = m.get("page_count", 0)
    text_items = m.get("text_items", 0)
    replacement_chars = m.get("replacement_chars", 0)
    markdown_chars = m.get("markdown_chars", 0)
    density = m.get("chars_per_page") or 0

    if m.get("heuristic_only"):
        report(
            "Could not load full DoclingDocument; metrics are partial.",
            "Ensure docling-core matches export; re-export with: docling <source> --to json --output <dir>",
        )

    if page_count >= 2 and density < min_chars_per_page:
        report(
            f"Low text density ({density} chars/page); likely scan, image-heavy PDF, or extraction gap.",
            "Retry: docling <source> --ocr-engine tesserocr (or rapidocr, ocrmac)",
            "Retry: docling <source> --pipeline vlm",
        )

    if replacement_chars > 5:
        report(
            "Unicode replacement characters detected; OCR may be garbling text.",
            "Retry: docling <source> --ocr-engine tesserocr (or rapidocr)",
            "Retry: docling <source> --pipeline vlm (use force_backend_text=True via Python API for hybrid)",
        )

    if m.get("duplicate_heavy") or (m.get("most_repeated_text_count", 0) > 8):
        report(
            "Repeated text blocks; possible layout/OCR loop or bad reading order.",
            "Retry: docling <source> --pipeline vlm",
            "If using VLM: try force_backend_text=True via Python API for text-heavy pages",
        )

    if expect_tables and m.get("tables", 0) == 0:
        report(
            "No tables detected but tables were expected.",
            "Retry: docling <source> (tables are enabled by default; remove --no-tables if set)",
            "Retry: docling <source> --pipeline vlm (better for merged-cell or visual tables)",
        )

    # A length of 0 is not flagged here; only a short, non-empty export is.
    if 0 < markdown_chars < min_markdown_chars and page_count >= 1:
        report(
            f"Markdown export is very short ({markdown_chars} chars) for the page count.",
            "Retry: docling <source> --pipeline vlm (or try different --ocr-engine)",
        )

    if text_items == 0 and page_count == 0:
        report(
            "No text items and no page provenance; export may be empty or invalid.",
            "Verify source file opens correctly; retry with: docling <source> --pipeline standard",
        )

    if not issues:
        return "pass", [], []

    # dict preserves insertion order, so this drops repeats but keeps order.
    distinct_actions = list(dict.fromkeys(actions))

    nearly_empty_markdown = page_count >= 1 and 0 < markdown_chars < 50
    if text_items == 0 or nearly_empty_markdown or replacement_chars > 20:
        status = "fail"
    else:
        status = "warn"
    return status, issues, distinct_actions
|
|
|
|
|
|
def parse_args():
    """Parse the command line of the evaluation script.

    Returns:
        The ``argparse.Namespace`` with ``json_path``, ``markdown``,
        ``expect_tables``, ``min_chars_per_page``, ``min_markdown_chars``,
        ``fail_on_warn`` and ``quiet``.
    """
    parser = argparse.ArgumentParser(
        description="Evaluate Docling JSON export quality"
    )
    # (flags, keyword arguments) in the order they should be registered.
    option_specs = (
        (
            ("json_path",),
            {"type": Path, "help": "Path to DoclingDocument JSON (export_to_dict)"},
        ),
        (
            ("--markdown",),
            {
                "type": Path,
                "default": None,
                "help": "Optional markdown file to cross-check length",
            },
        ),
        (("--expect-tables",), {"action": "store_true"}),
        (("--min-chars-per-page",), {"type": float, "default": 120.0}),
        (("--min-markdown-chars",), {"type": int, "default": 200}),
        (("--fail-on-warn",), {"action": "store_true"}),
        (
            ("--quiet",),
            {"action": "store_true", "help": "Only print JSON report to stdout"},
        ),
    )
    for flags, spec in option_specs:
        parser.add_argument(*flags, **spec)
    return parser.parse_args()
|
|
|
|
|
|
def main() -> None:
    """Run the evaluation CLI and exit with the resulting status code.

    Loads the JSON export, computes metrics (full metrics when the document
    validates, heuristic ones otherwise), optionally cross-checks a markdown
    file, prints the JSON report to stdout and, unless ``--quiet`` is given,
    a short summary to stderr.

    Exit codes: 0 on ``pass`` (or ``warn`` without ``--fail-on-warn``); 1 on
    ``fail``, on ``warn`` with ``--fail-on-warn``, or when the input file is
    missing, unreadable, not valid JSON, or neither a valid document nor a
    JSON object.
    """
    args = parse_args()
    if not args.json_path.is_file():
        print(json.dumps({"error": f"not found: {args.json_path}"}), file=sys.stderr)
        sys.exit(1)

    try:
        doc, raw = load_document(args.json_path)
    except (OSError, ValueError) as exc:
        # json.JSONDecodeError and UnicodeDecodeError both derive from
        # ValueError; report them like the "not found" case instead of
        # letting a traceback escape.
        print(
            json.dumps({"error": f"cannot parse {args.json_path}: {exc}"}),
            file=sys.stderr,
        )
        sys.exit(1)

    if doc is not None:
        m = metrics_from_doc(doc)
    elif isinstance(raw, dict):
        m = heuristic_metrics(raw)
    else:
        # Valid JSON whose top-level value is not an object cannot be a
        # document export at all.
        print(
            json.dumps({"error": f"not a JSON object: {args.json_path}"}),
            file=sys.stderr,
        )
        sys.exit(1)

    if args.markdown and args.markdown.is_file():
        md_len = len(args.markdown.read_text(encoding="utf-8"))
        m["markdown_file_chars"] = md_len
        # Only fill in the markdown length when the metrics have none.
        if m.get("markdown_chars", 0) == 0:
            m["markdown_chars"] = md_len

    status, issues, actions = evaluate(
        m,
        expect_tables=args.expect_tables,
        min_chars_per_page=args.min_chars_per_page,
        min_markdown_chars=args.min_markdown_chars,
    )

    report = {
        "status": status,
        "metrics": m,
        "issues": issues,
        "recommended_actions": actions,
        "next_steps_for_agent": [
            "Re-run docling with flags from recommended_actions.",
            "Re-export JSON and run this script again until status is pass.",
            "Append a row to improvement-log.md (see SKILL.md).",
        ],
    }

    print(json.dumps(report, indent=2, ensure_ascii=False))
    if not args.quiet:
        print(f"\nstatus={status}", file=sys.stderr)
        if issues:
            print("issues:", file=sys.stderr)
            for i in issues:
                print(f" - {i}", file=sys.stderr)
        if actions:
            print("recommended_actions:", file=sys.stderr)
            for a in actions:
                print(f" - {a}", file=sys.stderr)

    if status == "fail":
        sys.exit(1)
    if status == "warn" and args.fail_on_warn:
        sys.exit(1)
    sys.exit(0)
|
|
|
|
|
|
# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|