mirror of
https://github.com/docling-project/docling.git
synced 2026-05-17 13:10:38 +00:00
5b1df788ef
* ci: tighten pre-commit guardrails
  Signed-off-by: Georg Heiler <georg.kf.heiler@gmail.com>
* ci: validate pre-commit guardrail changes
  Signed-off-by: Georg Heiler <georg.kf.heiler@gmail.com>
* ci: switch hook validation to prek
  Signed-off-by: Georg Heiler <georg.kf.heiler@gmail.com>
* ci: exempt active slim plan from max-lines
  Signed-off-by: Georg Heiler <georg.kf.heiler@gmail.com>
* ci: move max-lines config under github
  Signed-off-by: Georg Heiler <georg.kf.heiler@gmail.com>
* ci: fail on uncovered tach modules
  Signed-off-by: Georg Heiler <georg.kf.heiler@gmail.com>
* ci: ignore generated docs in max-lines check
  Signed-off-by: Georg Heiler <georg.kf.heiler@gmail.com>
* ci: clarify local validation tasks
  Signed-off-by: Georg Heiler <georg.kf.heiler@gmail.com>
* docs: refine agent instructions
  Signed-off-by: Georg Heiler <georg.kf.heiler@gmail.com>
* ci: replace mypy with ty
  (cherry picked from commit 382afbde8f00abfaeba95ea9c8e9cc603f27a2d9)
  Signed-off-by: Georg Heiler <georg.kf.heiler@gmail.com>
* ci: replace justfile with makefile
  Signed-off-by: Georg Heiler <georg.kf.heiler@gmail.com>
---------
Signed-off-by: Georg Heiler <georg.kf.heiler@gmail.com>
200 lines
5.3 KiB
Python
200 lines
5.3 KiB
Python
#!/usr/bin/env python3
|
|
"""Pre-commit hook to enforce a maximum number of lines per file."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import fnmatch
|
|
import os
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
# File suffixes subject to the line limit; compared against ``Path.suffix``,
# so every entry includes the leading dot and matching is case-sensitive.
CHECKED_EXTENSIONS = {
    # Program source
    ".py",
    ".rs",
    ".sql",
    ".js",
    ".jsx",
    ".ts",
    ".tsx",
    # Structured data and configuration
    ".json",
    ".yaml",
    ".yml",
    # Plain text and documentation
    ".md",
    ".txt",
}
|
|
|
|
# Directory names excluded from the check: the repository walk does not
# descend into them, and explicitly passed files with any such path
# component are dropped.
SKIP_DIRS = {
    # Dot-prefixed directories
    ".cache",
    ".git",
    ".hypothesis",
    ".nox",
    ".pytest_cache",
    ".ruff_cache",
    ".tox",
    ".venv",
    # Other directories
    "__pycache__",
    "build",
    "dist",
    "node_modules",
    "site",
}
|
|
|
|
# Line limit used when ``--max-lines`` is not given on the command line.
DEFAULT_MAX_LINES: int = 1000

# Ignore-file path (joined onto the repository root) used when
# ``--ignore-file`` is not given on the command line.
DEFAULT_IGNORE_FILE: str = ".github/max-lines-ignore"
|
|
|
|
|
|
def load_ignore_patterns(ignore_file: Path) -> tuple[list[str], list[str]]:
    """Read the ignore file and split its glob patterns by section.

    Each line is stripped; blank lines and lines starting with ``#`` are
    skipped. A ``[silent]`` or ``[warn]`` header (matched case-insensitively)
    selects the list that subsequent patterns are appended to. Patterns that
    appear before any header go to the silent list.

    Returns:
        ``(silent_patterns, warn_patterns)``. Both lists are empty when
        *ignore_file* does not exist.
    """
    buckets: dict[str, list[str]] = {"silent": [], "warn": []}
    section_headers = {"[silent]": "silent", "[warn]": "warn"}

    if ignore_file.exists():
        active = "silent"
        for raw in ignore_file.read_text(encoding="utf-8").splitlines():
            entry = raw.strip()
            if entry == "" or entry[0] == "#":
                continue

            target = section_headers.get(entry.lower())
            if target is not None:
                # Header line: switch section, do not record it as a pattern.
                active = target
            else:
                buckets[active].append(entry)

    return buckets["silent"], buckets["warn"]
|
|
|
|
|
|
def is_ignored(file_path: str, patterns: list[str]) -> bool:
    """Tell whether *file_path* is covered by at least one glob in *patterns*.

    Every pattern is tried twice with ``fnmatch.fnmatch``: once against the
    whole path string and once against its final component only, so a bare
    file-name pattern matches in any directory.
    """
    basename = Path(file_path).name
    for candidate in patterns:
        if fnmatch.fnmatch(file_path, candidate):
            return True
        if fnmatch.fnmatch(basename, candidate):
            return True
    return False
|
|
|
|
|
|
def count_lines(file_path: Path) -> int:
    """Count the text lines in *file_path*.

    The file is decoded as UTF-8 with undecodable bytes replaced, so fixtures
    with broken encodings never raise. It is streamed in universal-newline
    text mode, which means only LF, CR and CRLF terminate a line. Characters
    that ``str.splitlines()`` would also split on (form feed, vertical tab,
    U+2028, ...) stay inside their line and therefore do not inflate the
    count. A final line without a trailing newline is still counted.

    Returns:
        The number of lines, or ``0`` when the file cannot be opened or read
        (deliberate best-effort: unreadable files never trip the limit).
    """
    try:
        # Iterating the handle yields one item per line without loading the
        # whole file into memory.
        with file_path.open(encoding="utf-8", errors="replace") as handle:
            return sum(1 for _ in handle)
    except OSError:
        return 0
|
|
|
|
|
|
def find_files(repo_root: Path) -> list[Path]:
    """Collect every file below *repo_root* that has a checked suffix.

    Directories whose name is in ``SKIP_DIRS`` are pruned from the walk, so
    nothing beneath them is visited. A file is kept when its ``Path.suffix``
    is a member of ``CHECKED_EXTENSIONS``.
    """
    collected: list[Path] = []
    for current_dir, subdirs, names in os.walk(repo_root):
        # Slice assignment mutates the list os.walk holds, which is what
        # stops it from descending into the removed directories.
        subdirs[:] = [sub for sub in subdirs if sub not in SKIP_DIRS]

        base = Path(current_dir)
        candidates = (base / name for name in names)
        collected.extend(
            entry for entry in candidates if entry.suffix in CHECKED_EXTENSIONS
        )
    return collected
|
|
|
|
|
|
def filter_candidate_files(repo_root: Path, file_args: list[str]) -> list[Path]:
    """Resolve pre-commit file arguments to checked files under the repo root.

    Every argument is normalised with ``Path.resolve()`` before inspection,
    whether it was given relative (interpreted against *repo_root*) or
    absolute. An argument is dropped when, after normalisation, it

    * is not an existing regular file,
    * does not lie under *repo_root*,
    * has a path component listed in ``SKIP_DIRS``, or
    * has a suffix that is not in ``CHECKED_EXTENSIONS``.

    Args:
        repo_root: Directory the arguments are interpreted against.
        file_args: Raw path strings as received on the command line.

    Returns:
        The surviving paths, de-duplicated in first-seen order. Each one is
        expressed as ``repo_root / <relative path>``, so calling
        ``relative_to(repo_root)`` on a result always succeeds.
    """
    # Compare against the resolved root so symlinked spellings of the
    # repository directory are still recognised as "inside the repo".
    resolved_root = repo_root.resolve()

    files: list[Path] = []
    for raw_path in file_args:
        path = Path(raw_path)
        if not path.is_absolute():
            path = repo_root / path
        # Absolute arguments are resolved too: otherwise ``..`` segments
        # would survive into the relative parts (e.g. ``build/../src/x.py``
        # would be skipped because of ``build``) and the relative path would
        # not match ignore patterns.
        path = path.resolve()

        if not path.is_file():
            continue

        try:
            rel_path = path.relative_to(resolved_root)
        except ValueError:
            # Outside the repository.
            continue

        if any(part in SKIP_DIRS for part in rel_path.parts):
            continue
        if path.suffix not in CHECKED_EXTENSIONS:
            continue

        files.append(repo_root / rel_path)

    # dict.fromkeys drops duplicates while keeping first-seen order.
    return list(dict.fromkeys(files))
|
|
|
|
|
|
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the hook's command-line options.

    Args:
        argv: Argument strings to parse. When ``None`` (the default) argparse
            reads ``sys.argv[1:]``, which keeps existing zero-argument calls
            working; passing a list allows programmatic use.

    Returns:
        A namespace with ``max_lines`` (int), ``ignore_file`` (str) and
        ``files`` (list of str, empty when no paths were given).
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--max-lines",
        type=int,
        default=DEFAULT_MAX_LINES,
        help=f"Maximum allowed lines per file (default: {DEFAULT_MAX_LINES}).",
    )
    parser.add_argument(
        "--ignore-file",
        default=DEFAULT_IGNORE_FILE,
        help=f"Path to ignore file (default: {DEFAULT_IGNORE_FILE}).",
    )
    parser.add_argument(
        "files",
        nargs="*",
        help="Optional file paths to check. Defaults to scanning the repo.",
    )
    return parser.parse_args(argv)
|
|
|
|
|
|
def main() -> int:
    """Run the max-lines check and return the process exit status.

    The current working directory is taken as the repository root. Files
    passed on the command line are checked if any were given; otherwise the
    repository is scanned. A file over the limit is reported as a warning
    when it matches a warn pattern, is skipped quietly when it matches a
    silent pattern, and is a violation otherwise.

    Returns:
        ``1`` when at least one violation was found, else ``0``.
    """
    options = parse_args()
    limit = options.max_lines
    root = Path.cwd().resolve()
    quiet_globs, warn_globs = load_ignore_patterns(root / options.ignore_file)

    if options.files:
        candidates = filter_candidate_files(root, options.files)
    else:
        candidates = find_files(root)

    failing: list[tuple[str, int]] = []
    tolerated: list[tuple[str, int]] = []

    for candidate in sorted(candidates):
        relative = candidate.relative_to(root).as_posix()
        total = count_lines(candidate)
        if total > limit:
            # Warn patterns take precedence over silent ones.
            if is_ignored(relative, warn_globs):
                tolerated.append((relative, total))
            elif is_ignored(relative, quiet_globs):
                pass
            else:
                failing.append((relative, total))

    if tolerated:
        print(f"Ignored files exceeding {limit} line limit (TODO split or refactor):")
        for name, total in tolerated:
            print(f" WARN: {name} has {total} lines")
        print()

    if not failing:
        return 0

    print(f"Files exceeding {limit} line limit:")
    for name, total in failing:
        print(f" FAIL: {name} has {total} lines")
    print(f"\n{len(failing)} file(s) over the limit.")
    print(f"To exempt existing debt, add patterns to {options.ignore_file}.")
    return 1
|
|
|
|
|
|
# Script entry point: propagate main()'s return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|