Files
docling/scripts/check_max_lines.py
geoHeil 5b1df788ef ci: tighten pre-commit guardrails (#3346)
* ci: tighten pre-commit guardrails

Signed-off-by: Georg Heiler <georg.kf.heiler@gmail.com>

* ci: validate pre-commit guardrail changes

Signed-off-by: Georg Heiler <georg.kf.heiler@gmail.com>

* ci: switch hook validation to prek

Signed-off-by: Georg Heiler <georg.kf.heiler@gmail.com>

* ci: exempt active slim plan from max-lines

Signed-off-by: Georg Heiler <georg.kf.heiler@gmail.com>

* ci: move max-lines config under github

Signed-off-by: Georg Heiler <georg.kf.heiler@gmail.com>

* ci: fail on uncovered tach modules

Signed-off-by: Georg Heiler <georg.kf.heiler@gmail.com>

* ci: ignore generated docs in max-lines check

Signed-off-by: Georg Heiler <georg.kf.heiler@gmail.com>

* ci: clarify local validation tasks

Signed-off-by: Georg Heiler <georg.kf.heiler@gmail.com>

* docs: refine agent instructions

Signed-off-by: Georg Heiler <georg.kf.heiler@gmail.com>

* ci: replace mypy with ty

(cherry picked from commit 382afbde8f00abfaeba95ea9c8e9cc603f27a2d9)
Signed-off-by: Georg Heiler <georg.kf.heiler@gmail.com>

* ci: replace justfile with makefile

Signed-off-by: Georg Heiler <georg.kf.heiler@gmail.com>

---------

Signed-off-by: Georg Heiler <georg.kf.heiler@gmail.com>
2026-05-08 15:07:11 +02:00

200 lines
5.3 KiB
Python

#!/usr/bin/env python3
"""Pre-commit hook to enforce a maximum number of lines per file."""
from __future__ import annotations
import argparse
import fnmatch
import os
import sys
from pathlib import Path
# File suffixes that are subject to the line limit; any other suffix is skipped.
CHECKED_EXTENSIONS = {
    # source code
    ".js",
    ".jsx",
    ".py",
    ".rs",
    ".sql",
    ".ts",
    ".tsx",
    # data and configuration
    ".json",
    ".yaml",
    ".yml",
    # documentation and plain text
    ".md",
    ".txt",
}

# Directory names that are never descended into or accepted as a path component.
SKIP_DIRS = {
    # version control and tool caches
    ".cache",
    ".git",
    ".hypothesis",
    ".nox",
    ".pytest_cache",
    ".ruff_cache",
    ".tox",
    "__pycache__",
    # environments and third-party packages
    ".venv",
    "node_modules",
    # build and generated output
    "build",
    "dist",
    "site",
}

# Defaults used by the command-line options when they are not given.
DEFAULT_MAX_LINES = 1_000
DEFAULT_IGNORE_FILE = ".github/max-lines-ignore"
def load_ignore_patterns(ignore_file: Path) -> tuple[list[str], list[str]]:
    """Load silent and warning-only glob patterns from the ignore file.

    Blank lines and lines whose first non-space character is ``#`` are
    skipped. A line reading ``[silent]`` or ``[warn]`` (case-insensitive)
    switches the section that following patterns are added to; patterns that
    appear before any header are treated as silent.

    Args:
        ignore_file: Path of the ignore file to parse.

    Returns:
        ``(silent_patterns, warn_patterns)``, each in file order. Both lists
        are empty when ``ignore_file`` is not an existing regular file.
    """
    patterns: dict[str, list[str]] = {"silent": [], "warn": []}

    # is_file() instead of exists(): a directory at this path is treated like a
    # missing ignore file instead of raising from read_text().
    if not ignore_file.is_file():
        return patterns["silent"], patterns["warn"]

    section = "silent"
    for raw_line in ignore_file.read_text(encoding="utf-8").splitlines():
        entry = raw_line.strip()
        if not entry or entry.startswith("#"):
            continue
        header = entry.lower()
        if header in ("[silent]", "[warn]"):
            # Drop the surrounding brackets to get the section key.
            section = header[1:-1]
            continue
        patterns[section].append(entry)

    return patterns["silent"], patterns["warn"]
def is_ignored(file_path: str, patterns: list[str]) -> bool:
    """Tell whether ``file_path`` is covered by at least one glob pattern.

    Each pattern is tried against the full path string first and then against
    the final path component alone, so ``*.md`` and ``docs/*.md`` both work.
    """
    base_name = Path(file_path).name
    for candidate in patterns:
        if fnmatch.fnmatch(file_path, candidate):
            return True
        if fnmatch.fnmatch(base_name, candidate):
            return True
    return False
def count_lines(file_path: Path) -> int:
    """Count text lines while tolerating encoding errors in fixtures.

    The file is streamed in text mode, so only ``\\n``, ``\\r`` and ``\\r\\n``
    end a line. Characters such as form feed or U+2028, which
    ``str.splitlines()`` also treats as line boundaries, no longer inflate
    the count. A final line without a trailing newline is counted.

    Args:
        file_path: File to measure.

    Returns:
        The number of lines, or ``0`` when the file cannot be opened or read.
    """
    try:
        # errors="replace" keeps undecodable bytes from aborting the count.
        with file_path.open(encoding="utf-8", errors="replace") as handle:
            return sum(1 for _ in handle)
    except OSError:
        # Best effort: an unreadable path is reported as having no lines.
        return 0
def find_files(repo_root: Path) -> list[Path]:
    """Collect every file below ``repo_root`` whose suffix is checked.

    Directories named in ``SKIP_DIRS`` are pruned from the walk, so nothing
    beneath them is visited.
    """
    collected: list[Path] = []
    for current_dir, subdirs, names in os.walk(repo_root):
        # Slice assignment edits the list os.walk recurses into, which prunes it.
        subdirs[:] = [entry for entry in subdirs if entry not in SKIP_DIRS]
        base = Path(current_dir)
        candidates = (base / name for name in names)
        collected.extend(
            item for item in candidates if item.suffix in CHECKED_EXTENSIONS
        )
    return collected
def filter_candidate_files(repo_root: Path, file_args: list[str]) -> list[Path]:
    """Resolve pre-commit file arguments to checked files under the repo root.

    Relative arguments are interpreted against ``repo_root``. Every argument,
    relative or absolute, is then resolved so that ``..`` segments and
    symlinks cannot cause a false ``SKIP_DIRS`` match or defeat
    de-duplication.

    Args:
        repo_root: Repository root; expected to be an already resolved path.
        file_args: Raw path strings as passed on the command line.

    Returns:
        Unique resolved paths, in first-seen order, that are regular files
        under ``repo_root``, have a suffix in ``CHECKED_EXTENSIONS`` and do
        not pass through a directory named in ``SKIP_DIRS``.
    """
    selected: dict[Path, None] = {}  # dict keys keep first-seen order
    for raw_path in file_args:
        path = Path(raw_path)
        if not path.is_absolute():
            path = repo_root / path
        # Normalise absolute paths too; previously only relative ones were.
        path = path.resolve()

        if path.suffix not in CHECKED_EXTENSIONS or not path.is_file():
            continue
        try:
            rel_parts = path.relative_to(repo_root).parts
        except ValueError:
            # The path lies outside the repository root.
            continue
        if SKIP_DIRS.isdisjoint(rel_parts):
            selected[path] = None
    return list(selected)
def parse_args() -> argparse.Namespace:
    """Parse the hook's command-line options and optional file list."""
    cli = argparse.ArgumentParser(description=__doc__)

    # (flags, keyword settings) in the order they are registered.
    argument_specs = (
        (
            ("--max-lines",),
            {
                "type": int,
                "default": DEFAULT_MAX_LINES,
                "help": f"Maximum allowed lines per file (default: {DEFAULT_MAX_LINES}).",
            },
        ),
        (
            ("--ignore-file",),
            {
                "default": DEFAULT_IGNORE_FILE,
                "help": f"Path to ignore file (default: {DEFAULT_IGNORE_FILE}).",
            },
        ),
        (
            ("files",),
            {
                "nargs": "*",
                "help": "Optional file paths to check. Defaults to scanning the repo.",
            },
        ),
    )
    for flags, settings in argument_specs:
        cli.add_argument(*flags, **settings)

    return cli.parse_args()
def main() -> int:
    """Check file lengths and report the ones over the limit.

    Files come from the command-line arguments when any are given, otherwise
    from a scan of the current working directory. Over-limit files matching a
    warn pattern are listed as warnings, those matching a silent pattern are
    dropped, and the rest are failures.

    Returns:
        ``1`` when at least one failure was reported, otherwise ``0``.
    """
    options = parse_args()
    limit = options.max_lines
    root = Path.cwd().resolve()
    silent_globs, warn_globs = load_ignore_patterns(root / options.ignore_file)

    if options.files:
        candidates = filter_candidate_files(root, options.files)
    else:
        candidates = find_files(root)

    soft_hits: list[tuple[str, int]] = []
    hard_hits: list[tuple[str, int]] = []
    for candidate in sorted(candidates):
        relative = candidate.relative_to(root).as_posix()
        total = count_lines(candidate)
        if total <= limit:
            continue
        # A warn pattern takes precedence over a silent one.
        if is_ignored(relative, warn_globs):
            soft_hits.append((relative, total))
            continue
        if is_ignored(relative, silent_globs):
            continue
        hard_hits.append((relative, total))

    if soft_hits:
        print(f"Ignored files exceeding {limit} line limit (TODO split or refactor):")
        for name, total in soft_hits:
            print(f" WARN: {name} has {total} lines")
        print()

    if not hard_hits:
        return 0

    print(f"Files exceeding {limit} line limit:")
    for name, total in hard_hits:
        print(f" FAIL: {name} has {total} lines")
    print(f"\n{len(hard_hits)} file(s) over the limit.")
    print(f"To exempt existing debt, add patterns to {options.ignore_file}.")
    return 1
# Script entry point: use the value returned by main() as the exit status.
if __name__ == "__main__":
    exit_status = main()
    sys.exit(exit_status)