Files
Jehlum Pandit c23622f6f5 docs: add agent skill bundle for coding assistants (SKILL.md, pipelines, convert/evaluate) (#3174)
* docs: add agent skill bundle with convert/evaluate helpers

- Add docs/examples/agent_skill/docling-document-intelligence/ with
  SKILL.md, pipelines.md, EXAMPLE.md, improvement-log template, and
  scripts/docling-convert.py + docling-evaluate.py (standard/vlm-local/vlm-api).
- Document InputFormat.PDF + PdfFormatOption for explicit PdfPipelineOptions.
- Link from examples index and mkdocs nav.

Made-with: Cursor

* docs: align agent skill README and EXAMPLE with Cursor bundle

- Document both ~/.cursor/skills and docs/examples paths.
- README notes repo parity for PRs and local installs.

Made-with: Cursor

* DCO Remediation Commit for jehlum11 <jehlum11@gmail.com>

I, jehlum11 <jehlum11@gmail.com>, hereby add my Signed-off-by to this commit: 2d268ffb6f
I, jehlum11 <jehlum11@gmail.com>, hereby add my Signed-off-by to this commit: 041e709c66

Signed-off-by: jehlum11 <jehlum11@gmail.com>
Made-with: Cursor

* docs: refactor agent skill to use docling CLI for conversion

Address maintainer feedback: the custom docling-convert.py script was
largely redundant with the existing docling CLI. This commit:

- Removes scripts/docling-convert.py (redundant with `docling` CLI)
- Refactors SKILL.md (v1.4 → v2.0) to use `docling` CLI for all
  conversion tasks, reserving the Python API only for features the
  CLI does not expose (chunking, VLM API endpoint config,
  force_backend_text hybrid mode)
- Updates docling-evaluate.py recommended_actions to reference
  `docling` CLI flags instead of the removed script
- Updates README.md, EXAMPLE.md, pipelines.md to use `docling` CLI
  examples throughout
- Simplifies requirements.txt (removes packaging dependency)

The only custom script retained is docling-evaluate.py, which provides
heuristic quality evaluation — functionality the CLI does not cover.

Signed-off-by: jehlum11 <jehlum11@gmail.com>
Made-with: Cursor

* docs: fix ruff format on docling-evaluate.py

Signed-off-by: jehlum11 <jehlum11@gmail.com>
Made-with: Cursor

---------

Signed-off-by: jehlum11 <jehlum11@gmail.com>
2026-04-13 15:02:51 +02:00

297 lines
9.4 KiB
Python
Vendored

#!/usr/bin/env python3
# SPDX-License-Identifier: MIT
"""
Evaluate a Docling JSON export and suggest pipeline / option changes.

Typical flow (agent or human):

    docling input.pdf --to json --output /tmp/
    docling input.pdf --to md --output /tmp/
    python3 scripts/docling-evaluate.py /tmp/input.json --markdown /tmp/input.md

Exit codes: 0 = pass; 1 = fail, input file not found, or --fail-on-warn with status warn
"""
from __future__ import annotations
import argparse
import json
import sys
from collections import Counter
from pathlib import Path
from typing import Any
def load_document(path: Path):
    """Read a Docling JSON export and try to validate it as a DoclingDocument.

    Args:
        path: Location of the UTF-8 encoded JSON file.

    Returns:
        A ``(document, raw)`` pair. ``raw`` is always the parsed JSON payload.
        ``document`` is the validated ``DoclingDocument``, or ``None`` when
        docling-core cannot be imported or validation raises, so the caller
        can fall back to metrics computed from ``raw``.
    """
    raw = json.loads(path.read_text(encoding="utf-8"))
    document = None
    try:
        # Imported inside the try so a missing docling-core is handled the
        # same way as a failed validation.
        from docling_core.types.doc.document import DoclingDocument

        document = DoclingDocument.model_validate(raw)
    except Exception:
        # Deliberate best-effort: any failure here means "no document".
        document = None
    return document, raw
def page_numbers_from_doc(doc) -> set[int]:
    """Collect the distinct page numbers referenced by item provenance.

    Walks ``doc.iterate_items()``, reads each item's ``prov`` entries (a
    missing or empty ``prov`` contributes nothing) and gathers every
    ``page_no`` that is not ``None``, converted with ``int``.
    """
    provenance = (
        prov
        for item, _level in doc.iterate_items()
        for prov in (getattr(item, "prov", None) or [])
    )
    raw_pages = (getattr(prov, "page_no", None) for prov in provenance)
    return {int(page) for page in raw_pages if page is not None}
def collect_text_samples(doc, limit: int = 200) -> list[str]:
    """Return up to ``limit`` stripped, non-empty text snippets from the document.

    Args:
        doc: Object exposing ``iterate_items()`` that yields ``(item, level)``
            pairs; only each item's optional ``text`` attribute is read.
        limit: Maximum number of samples to collect. A value of zero or less
            yields an empty list.

    Returns:
        The stripped texts in iteration order. Items without text, or whose
        text is only whitespace, are skipped.
    """
    samples: list[str] = []
    if limit <= 0:
        # Without this guard the first text would be appended before the
        # limit is ever checked, returning one sample for limit=0.
        return samples
    for item, _level in doc.iterate_items():
        raw = getattr(item, "text", None)
        if not raw:
            continue
        cleaned = str(raw).strip()
        if not cleaned:
            continue
        samples.append(cleaned)
        if len(samples) >= limit:
            break
    return samples
def metrics_from_doc(doc) -> dict[str, Any]:
    """Compute quality metrics from a validated document object.

    Reads ``doc.tables`` / ``doc.pictures`` (missing or falsy counts as
    empty), walks ``doc.iterate_items()`` for header and text statistics, and
    attempts ``doc.export_to_markdown()``; if that call raises, the markdown
    is treated as an empty string.

    Returns:
        A dict with page, header, text, table and picture counts, characters
        per page (the raw character total when no page is known), markdown
        length, the number of U+FFFD replacement characters found in the
        markdown plus the sampled texts, the highest repeat count among the
        sampled texts, and a ``duplicate_heavy`` flag.
    """
    table_count = len(getattr(doc, "tables", []) or [])
    picture_count = len(getattr(doc, "pictures", []) or [])

    header_count = 0
    text_lengths: list[int] = []
    for item, _level in doc.iterate_items():
        label_name = getattr(getattr(item, "label", None), "name", None) or ""
        if label_name == "SECTION_HEADER":
            header_count += 1
        text = getattr(item, "text", None)
        if text:
            text_lengths.append(len(str(text)))
    char_total = sum(text_lengths)

    page_count = len(page_numbers_from_doc(doc))
    # With no page provenance there is nothing to divide by, so the density
    # falls back to the raw character total.
    density = char_total / page_count if page_count else char_total

    samples = collect_text_samples(doc)
    frequency = Counter(samples)
    most_repeated = max(frequency.values(), default=0)
    if frequency:
        # Occurrences belonging to texts seen more than twice, relative to
        # the number of distinct sampled texts.
        heavy_repeats = sum(count for count in frequency.values() if count > 2)
        dup_ratio = heavy_repeats / len(frequency)
    else:
        dup_ratio = 0.0

    try:
        markdown = doc.export_to_markdown()
    except Exception:
        # Best-effort: a failing export is reported as zero markdown chars.
        markdown = ""

    replacement = markdown.count("\ufffd") + sum(
        sample.count("\ufffd") for sample in samples
    )

    return {
        "page_count": page_count,
        "section_headers": header_count,
        "text_items": len(text_lengths),
        "total_text_chars": char_total,
        "chars_per_page": round(density, 2),
        "tables": table_count,
        "pictures": picture_count,
        "markdown_chars": len(markdown),
        "replacement_chars": replacement,
        "most_repeated_text_count": int(most_repeated),
        "duplicate_heavy": dup_ratio > 0.15 and len(samples) > 10,
    }
def heuristic_metrics(data: dict) -> dict[str, Any]:
    """Fallback when DoclingDocument cannot be validated (older export / drift).

    Works on the raw parsed JSON only. Because this path exists for exports
    that failed validation, the input shape is not trusted: a non-dict
    payload is treated as empty, and ``texts`` / ``tables`` / ``pictures``
    values that are not lists count as zero items instead of raising.

    Args:
        data: Parsed JSON payload of the export (expected to be a dict).

    Returns:
        A metrics dict with the same keys the document-based metrics use,
        plus ``heuristic_only`` (always ``True``) and ``body_children``.
        Values that cannot be derived from raw JSON here (page count, section
        headers, markdown length, replacement and repetition statistics) are
        reported as zero / ``False``.
    """
    payload = data if isinstance(data, dict) else {}

    def as_list(value: Any) -> list:
        # Anything that is not a JSON array is counted as "no items".
        return value if isinstance(value, list) else []

    texts = as_list(payload.get("texts"))
    tables = as_list(payload.get("tables"))
    pictures = as_list(payload.get("pictures"))

    body = payload.get("body")
    children = body.get("children") if isinstance(body, dict) else None
    n_children = len(children) if isinstance(children, list) else 0

    # Only dict entries can carry a "text" field; a missing or null text
    # contributes zero characters.
    char_sum = sum(
        len(str(entry.get("text") or ""))
        for entry in texts
        if isinstance(entry, dict)
    )

    return {
        "page_count": 0,
        "section_headers": 0,
        "text_items": len(texts),
        "total_text_chars": char_sum,
        "chars_per_page": 0.0,
        "tables": len(tables),
        "pictures": len(pictures),
        "markdown_chars": 0,
        "replacement_chars": 0,
        "most_repeated_text_count": 0,
        "duplicate_heavy": False,
        "heuristic_only": True,
        "body_children": n_children,
    }
def evaluate(
    m: dict[str, Any],
    *,
    expect_tables: bool,
    min_chars_per_page: float,
    min_markdown_chars: int,
) -> tuple[str, list[str], list[str]]:
    """Turn a metrics dict into a status, a list of issues and suggested actions.

    Args:
        m: Metrics dict; missing keys are read with zero / falsy defaults.
        expect_tables: Report an issue when no tables were detected.
        min_chars_per_page: Density threshold, applied only when the
            document has at least two pages.
        min_markdown_chars: Markdown length below which a non-empty export
            is reported as very short.

    Returns:
        ``(status, issues, actions)``. ``status`` is ``"pass"`` when nothing
        was flagged (both lists are then empty); ``"fail"`` when there are no
        text items, the markdown is non-empty but under 50 characters on a
        document with at least one page, or more than 20 replacement
        characters were counted; otherwise ``"warn"``. ``actions`` keeps the
        first occurrence of each suggestion, in the order it was raised.
    """
    findings: list[str] = []
    suggestions: list[str] = []

    def flag(issue: str, *fixes: str) -> None:
        # Record one issue together with the retries suggested for it.
        findings.append(issue)
        suggestions.extend(fixes)

    page_count = m.get("page_count", 0)
    text_items = m.get("text_items", 0)
    bad_chars = m.get("replacement_chars", 0)
    markdown_len = m.get("markdown_chars", 0)
    density = m.get("chars_per_page") or 0

    if m.get("heuristic_only"):
        flag(
            "Could not load full DoclingDocument; metrics are partial.",
            "Ensure docling-core matches export; re-export with: docling <source> --to json --output <dir>",
        )

    if page_count >= 2 and density < min_chars_per_page:
        flag(
            f"Low text density ({density} chars/page); likely scan, image-heavy PDF, or extraction gap.",
            "Retry: docling <source> --ocr-engine tesserocr (or rapidocr, ocrmac)",
            "Retry: docling <source> --pipeline vlm",
        )

    if bad_chars > 5:
        flag(
            "Unicode replacement characters detected; OCR may be garbling text.",
            "Retry: docling <source> --ocr-engine tesserocr (or rapidocr)",
            "Retry: docling <source> --pipeline vlm (use force_backend_text=True via Python API for hybrid)",
        )

    if m.get("duplicate_heavy") or m.get("most_repeated_text_count", 0) > 8:
        flag(
            "Repeated text blocks; possible layout/OCR loop or bad reading order.",
            "Retry: docling <source> --pipeline vlm",
            "If using VLM: try force_backend_text=True via Python API for text-heavy pages",
        )

    if expect_tables and m.get("tables", 0) == 0:
        flag(
            "No tables detected but tables were expected.",
            "Retry: docling <source> (tables are enabled by default; remove --no-tables if set)",
            "Retry: docling <source> --pipeline vlm (better for merged-cell or visual tables)",
        )

    if 0 < markdown_len < min_markdown_chars and page_count >= 1:
        flag(
            f"Markdown export is very short ({markdown_len} chars) for the page count.",
            "Retry: docling <source> --pipeline vlm (or try different --ocr-engine)",
        )

    if text_items == 0 and page_count == 0:
        flag(
            "No text items and no page provenance; export may be empty or invalid.",
            "Verify source file opens correctly; retry with: docling <source> --pipeline standard",
        )

    if not findings:
        return "pass", [], []

    # dict.fromkeys keeps first-seen order, so repeated retries collapse
    # without reordering the remaining suggestions.
    unique_actions = list(dict.fromkeys(suggestions))
    severe = text_items == 0 or (page_count >= 1 and 0 < markdown_len < 50)
    status = "fail" if severe or bad_chars > 20 else "warn"
    return status, findings, unique_actions
def parse_args():
    """Parse the command line (``sys.argv``) for the evaluator.

    Returns:
        An ``argparse.Namespace`` with ``json_path`` (Path), ``markdown``
        (Path or ``None``), ``expect_tables``, ``fail_on_warn`` and ``quiet``
        (booleans), ``min_chars_per_page`` (float, default 120.0) and
        ``min_markdown_chars`` (int, default 200).
    """
    parser = argparse.ArgumentParser(
        description="Evaluate Docling JSON export quality"
    )
    # Registered in this order so the generated --help output is unchanged.
    argument_specs = [
        (
            "json_path",
            {"type": Path, "help": "Path to DoclingDocument JSON (export_to_dict)"},
        ),
        (
            "--markdown",
            {
                "type": Path,
                "default": None,
                "help": "Optional markdown file to cross-check length",
            },
        ),
        ("--expect-tables", {"action": "store_true"}),
        ("--min-chars-per-page", {"type": float, "default": 120.0}),
        ("--min-markdown-chars", {"type": int, "default": 200}),
        ("--fail-on-warn", {"action": "store_true"}),
        (
            "--quiet",
            {"action": "store_true", "help": "Only print JSON report to stdout"},
        ),
    ]
    for flag, options in argument_specs:
        parser.add_argument(flag, **options)
    return parser.parse_args()
def main() -> None:
    """Command-line entry point: evaluate one JSON export and print a report.

    The JSON report is written to stdout. Unless ``--quiet`` is given, a
    short human-readable summary follows on stderr. The process always ends
    through ``sys.exit``: 1 when the input file is missing or cannot be
    read/parsed, when the status is ``fail``, or when the status is ``warn``
    and ``--fail-on-warn`` is set; 0 otherwise.
    """
    args = parse_args()
    if not args.json_path.is_file():
        print(json.dumps({"error": f"not found: {args.json_path}"}), file=sys.stderr)
        sys.exit(1)

    try:
        doc, raw = load_document(args.json_path)
    except (OSError, ValueError) as exc:
        # json.JSONDecodeError and UnicodeDecodeError are ValueError
        # subclasses; report them like the "not found" case instead of
        # letting a traceback escape.
        print(
            json.dumps({"error": f"cannot read {args.json_path}: {exc}"}),
            file=sys.stderr,
        )
        sys.exit(1)

    if doc is not None:
        m = metrics_from_doc(doc)
    else:
        m = heuristic_metrics(raw)

    if args.markdown and args.markdown.is_file():
        md_len = len(args.markdown.read_text(encoding="utf-8"))
        m["markdown_file_chars"] = md_len
        # The markdown file only stands in when no markdown length was
        # computed from the document itself.
        if m.get("markdown_chars", 0) == 0:
            m["markdown_chars"] = md_len

    status, issues, actions = evaluate(
        m,
        expect_tables=args.expect_tables,
        min_chars_per_page=args.min_chars_per_page,
        min_markdown_chars=args.min_markdown_chars,
    )
    report = {
        "status": status,
        "metrics": m,
        "issues": issues,
        "recommended_actions": actions,
        "next_steps_for_agent": [
            "Re-run docling with flags from recommended_actions.",
            "Re-export JSON and run this script again until status is pass.",
            "Append a row to improvement-log.md (see SKILL.md).",
        ],
    }
    print(json.dumps(report, indent=2, ensure_ascii=False))

    if not args.quiet:
        print(f"\nstatus={status}", file=sys.stderr)
        if issues:
            print("issues:", file=sys.stderr)
            for i in issues:
                print(f" - {i}", file=sys.stderr)
        if actions:
            print("recommended_actions:", file=sys.stderr)
            for a in actions:
                print(f" - {a}", file=sys.stderr)

    if status == "fail":
        sys.exit(1)
    if status == "warn" and args.fail_on_warn:
        sys.exit(1)
    sys.exit(0)


if __name__ == "__main__":
    main()