diff --git a/.github/workflows/detector-corpora-test.yml b/.github/workflows/detector-corpora-test.yml new file mode 100644 index 000000000..265eb3787 --- /dev/null +++ b/.github/workflows/detector-corpora-test.yml @@ -0,0 +1,266 @@ +name: Corpora Test + +on: + workflow_dispatch: + pull_request: + types: [opened] + paths: + - 'pkg/detectors/**' + - 'pkg/engine/defaults/defaults.go' + - '.github/workflows/detector-corpora-test.yml' + - 'scripts/test/detector_corpora_test.sh' + - 'scripts/test/diff_corpora_results.py' + - 'scripts/test/detect_changed_detectors.sh' + +env: + DATASETS: | + s3://trufflehog-corpora-datasets/contents.2025-11-04.jsonl.zstd + +jobs: + corpora-test: + if: ${{ github.repository == 'trufflesecurity/trufflehog' && !github.event.pull_request.head.repo.fork }} + runs-on: ubuntu-latest + permissions: + contents: read + pull-requests: write + steps: + - name: Checkout code + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + with: + fetch-depth: 0 + persist-credentials: false + + - name: Install Go + uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6 + with: + go-version: "1.25" + + - name: Install dependencies + run: sudo apt-get install -y zstd jq + + - name: Resolve merge-base + id: merge_base + shell: bash + run: | + set -o pipefail + git fetch --no-tags --prune origin main + MERGE_BASE=$(git merge-base origin/main HEAD) + echo "Merge base: $MERGE_BASE" + echo "sha=$MERGE_BASE" >> "$GITHUB_OUTPUT" + + # Determine which detectors changed in this PR. The PR build scopes its + # scan to the full set; the main build excludes detectors that don't + # exist there yet (new detectors). If the set is empty, the workflow + # short-circuits with a skip comment — scoping is the entire point of + # Phase 2, falling back to scan-all defeats it. 
+ - name: Detect changed detectors + id: detect + shell: bash + env: + BASE_REF: ${{ steps.merge_base.outputs.sha }} + run: | + set -o pipefail + chmod +x scripts/test/detect_changed_detectors.sh + PR_CSV=$(./scripts/test/detect_changed_detectors.sh --pr-csv || true) + MAIN_CSV=$(./scripts/test/detect_changed_detectors.sh --main-csv || true) + NEW_LIST=$(./scripts/test/detect_changed_detectors.sh --new-only || true) + NEW_CSV=$(echo "$NEW_LIST" | paste -sd, -) + echo "PR detectors: $PR_CSV" + echo "Main detectors: $MAIN_CSV" + echo "New detectors: $NEW_CSV" + echo "pr_csv=$PR_CSV" >> "$GITHUB_OUTPUT" + echo "main_csv=$MAIN_CSV" >> "$GITHUB_OUTPUT" + echo "new_csv=$NEW_CSV" >> "$GITHUB_OUTPUT" + if [[ -n "$PR_CSV" ]]; then + echo "any_changed=true" >> "$GITHUB_OUTPUT" + else + echo "any_changed=false" >> "$GITHUB_OUTPUT" + fi + + # Sticky comment: find any prior detector-bench comment on the PR by + # the marker substring and update it in place. The marker — kept in + # sync with STICKY_COMMENT_MARKER in scripts/test/diff_corpora_results.py — + # has to appear in BOTH the skip body and the diff body so the same + # comment flips between them as iterative pushes change which path + # fires. Skip body is only posted on pull_request events; workflow_dispatch + # runs with no changed detectors silently finish without posting. 
+ - name: Find existing skip comment + if: steps.detect.outputs.any_changed != 'true' && github.event_name == 'pull_request' + id: find_skip_comment + uses: peter-evans/find-comment@b30e6a3c0ed37e7c023ccd3f1db5c6c0b0c23aad # v4 + with: + issue-number: ${{ github.event.pull_request.number }} + comment-author: 'github-actions[bot]' + body-includes: '' + + - name: Post or update skip comment + if: steps.detect.outputs.any_changed != 'true' && github.event_name == 'pull_request' + uses: peter-evans/create-or-update-comment@e8674b075228eee787fea43ef493e45ece1004c9 # v5 + with: + comment-id: ${{ steps.find_skip_comment.outputs.comment-id }} + issue-number: ${{ github.event.pull_request.number }} + edit-mode: replace + body: | + + ## Corpora Test Results + + No detector regex or keyword changes in this PR. Bench skipped. + + - name: Configure AWS credentials + if: steps.detect.outputs.any_changed == 'true' + uses: aws-actions/configure-aws-credentials@ec61189d14ec14c8efccab744f656cffd0e33f37 # v6 + with: + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + aws-region: us-east-1 + + # Cache the main scan results by merge-base + scoped detector set. + # On subsequent pushes to the same PR without a rebase, both are + # identical, so the main scan (35 GB of S3 streaming + trufflehog) is + # skipped entirely. + - name: Restore main scan cache + id: main_scan_cache + if: steps.detect.outputs.any_changed == 'true' && steps.detect.outputs.main_csv != '' + uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5 + with: + path: /tmp/results-main.jsonl + key: main-scan-v1-${{ steps.merge_base.outputs.sha }}-${{ steps.detect.outputs.main_csv }} + + # Two independent builds run in parallel: + # A) prepare main worktree → build main binary (git I/O then CPU) + # Skipped on main scan cache hit or when main_csv is empty + # (all changed detectors are new — no baseline needed). 
+ # B) build PR binary (CPU, no dependencies) + - name: Build binaries + if: steps.detect.outputs.any_changed == 'true' + shell: bash + env: + MERGE_BASE: ${{ steps.merge_base.outputs.sha }} + MAIN_CSV: ${{ steps.detect.outputs.main_csv }} + MAIN_SCAN_CACHE_HIT: ${{ steps.main_scan_cache.outputs.cache-hit }} + run: | + set -o pipefail + + # Chain A: prepare worktree, then build main binary. + # Skipped when main scan results are already cached, or when all + # changed detectors are new (main_csv empty — no baseline needed). + if [[ -n "$MAIN_CSV" && "$MAIN_SCAN_CACHE_HIT" != 'true' ]]; then + ( + git worktree add /tmp/trufflehog-main-src "$MERGE_BASE" + cd /tmp/trufflehog-main-src + CGO_ENABLED=0 go build -o /tmp/trufflehog-main . + ) & + PID_MAIN_BUILD=$! + fi + + # Chain B: build PR binary (no dependencies). + CGO_ENABLED=0 go build -o /tmp/trufflehog-pr . & + PID_PR_BUILD=$! + + [[ -n "${PID_MAIN_BUILD:-}" ]] && { wait $PID_MAIN_BUILD || { echo "Main binary build failed" >&2; exit 1; }; } + wait $PID_PR_BUILD || { echo "PR binary build failed" >&2; exit 1; } + + # PR and main scans share a single S3 stream per dataset file, teed to + # both binaries simultaneously. The main side is skipped on a cache hit + # (results already in /tmp/results-main.jsonl) or when main_csv is empty + # (PR adds only new detectors — no overlap with main). 
+ - name: Run corpora tests + if: steps.detect.outputs.any_changed == 'true' + shell: bash + env: + PR_CSV: ${{ steps.detect.outputs.pr_csv }} + MAIN_CSV: ${{ steps.detect.outputs.main_csv }} + MAIN_SCAN_CACHE_HIT: ${{ steps.main_scan_cache.outputs.cache-hit }} + run: | + set -o pipefail + files=() + while IFS= read -r dataset; do + [[ -z "$dataset" ]] && continue + files+=("$dataset") + done <<< "$DATASETS" + + export TRUFFLEHOG_BIN=/tmp/trufflehog-pr + export OUTPUT_JSONL=/tmp/results-pr.jsonl + export INCLUDE_DETECTORS="$PR_CSV" + + if [[ -n "$MAIN_CSV" && "$MAIN_SCAN_CACHE_HIT" != 'true' ]]; then + # Dual-binary: single S3 download teed to both PR and main binaries. + export TRUFFLEHOG_BIN_MAIN=/tmp/trufflehog-main + export OUTPUT_JSONL_MAIN=/tmp/results-main.jsonl + export INCLUDE_DETECTORS_MAIN="$MAIN_CSV" + elif [[ -z "$MAIN_CSV" ]]; then + echo "No overlapping detectors in main; skipping main scan." + : > /tmp/results-main.jsonl + else + echo "Main scan cache hit; skipping main scan." 
+ fi + + ./scripts/test/detector_corpora_test.sh "${files[@]}" \ + || { echo "Corpora scan failed" >&2; exit 1; } + + - name: Save main scan cache + if: steps.detect.outputs.any_changed == 'true' && steps.detect.outputs.main_csv != '' && steps.main_scan_cache.outputs.cache-hit != 'true' + uses: actions/cache/save@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5 + with: + path: /tmp/results-main.jsonl + key: main-scan-v1-${{ steps.merge_base.outputs.sha }}-${{ steps.detect.outputs.main_csv }} + + - name: Diff results + if: steps.detect.outputs.any_changed == 'true' + shell: bash + env: + CHANGED: ${{ steps.detect.outputs.pr_csv }} + NEW_DETECTORS: ${{ steps.detect.outputs.new_csv }} + run: | + set -o pipefail + python3 scripts/test/diff_corpora_results.py \ + /tmp/results-main.jsonl /tmp/results-pr.jsonl \ + --changed-detectors="$CHANGED" \ + --new-detectors="$NEW_DETECTORS" \ + > /tmp/diff-report.md + cat /tmp/diff-report.md + + # workflow_dispatch runs don't carry an issue context, so resolve the + # PR number by branch lookup. pull_request events fall through to the + # event's issue number. Output feeds the find/update pair below. 
+ - name: Resolve PR number + if: steps.detect.outputs.any_changed == 'true' + id: resolve_pr + uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9 + with: + script: | + let issue_number; + if (context.eventName === 'workflow_dispatch') { + const pulls = await github.rest.pulls.list({ + owner: context.repo.owner, + repo: context.repo.repo, + head: `${context.repo.owner}:${context.ref.replace('refs/heads/', '')}`, + state: 'open', + }); + if (pulls.data.length === 0) { + core.setFailed(`No open PR found for branch ${context.ref}`); + return; + } + issue_number = pulls.data[0].number; + } else { + issue_number = context.issue.number; + } + core.setOutput('issue_number', issue_number); + + - name: Find existing diff comment + if: steps.detect.outputs.any_changed == 'true' + id: find_diff_comment + uses: peter-evans/find-comment@b30e6a3c0ed37e7c023ccd3f1db5c6c0b0c23aad # v4 + with: + issue-number: ${{ steps.resolve_pr.outputs.issue_number }} + comment-author: 'github-actions[bot]' + body-includes: '' + + - name: Post or update diff comment + if: steps.detect.outputs.any_changed == 'true' + uses: peter-evans/create-or-update-comment@e8674b075228eee787fea43ef493e45ece1004c9 # v5 + with: + comment-id: ${{ steps.find_diff_comment.outputs.comment-id }} + issue-number: ${{ steps.resolve_pr.outputs.issue_number }} + edit-mode: replace + body-path: /tmp/diff-report.md diff --git a/.gitignore b/.gitignore index 6abf4e766..48d76df3f 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,7 @@ tmp/go-test.json .captain/detectors/quarantines.yaml .captain/detectors/flakes.yaml .vscode + +# Python +__pycache__/ +*.pyc diff --git a/scripts/test/detect_changed_detectors.sh b/scripts/test/detect_changed_detectors.sh new file mode 100755 index 000000000..1e1bfbc7f --- /dev/null +++ b/scripts/test/detect_changed_detectors.sh @@ -0,0 +1,206 @@ +#!/usr/bin/env bash +# +# detect_changed_detectors.sh — Phase 2 +# +# Emits the list of detectors changed between two git 
refs, formatted for +# trufflehog's --include-detectors flag (comma-separated, lowercase protobuf +# enum names, optional ".v" version suffix). +# +# Source of truth for each detector's identifier: +# - Proto enum name comes from the detector's Type() implementation in its +# source files (e.g. `return detectorspb.DetectorType_AzureBatch` → +# `azurebatch`). Necessary because the package directory often differs +# from the enum name (azure_batch vs AzureBatch, npmtokenv2 vs NpmToken, +# close vs closecrm, etc.). +# - Version comes from the directory suffix only (`/v`). Detectors that +# encode the version in the dir name (e.g. `npmtokenv2`) are emitted +# without a version suffix; trufflehog then matches all versions of that +# proto type — wider scope but correct. +# +# "New detector" detection compares pkg/engine/defaults/defaults.go imports +# between the two refs. A detector imported at HEAD but not at BASE is new. +# +# Modes: +# (none) List all changed detectors at HEAD, one per line, in +# [.v] form. +# --pr-csv Same set as default mode, comma-joined. +# --main-csv Changed detectors that also exist at BASE (excludes new), +# comma-joined. Use as --include-detectors for the main build. +# --new-only Just the new detectors (in HEAD but not BASE), one per line. +# +# Env: +# BASE_REF default origin/main +# HEAD_REF default HEAD + +set -euo pipefail + +MODE="${1:-list}" +BASE_REF="${BASE_REF:-origin/main}" +HEAD_REF="${HEAD_REF:-HEAD}" + +REPO_ROOT="$(git rev-parse --show-toplevel)" +cd "$REPO_ROOT" + +# Resolve BASE to a concrete commit. Workflow already runs `git fetch origin +# main`; locally that may not be true, so we fall back to `main` if the +# remote-tracking ref is missing. +if ! 
git rev-parse --verify "$BASE_REF" >/dev/null 2>&1; then + if git rev-parse --verify main >/dev/null 2>&1; then + BASE_REF=main + else + echo "error: cannot resolve BASE_REF=$BASE_REF and no local 'main'" >&2 + exit 1 + fi +fi + +MERGE_BASE=$(git merge-base "$BASE_REF" "$HEAD_REF") + +# Step 1 — changed detector dirs (relative to repo root). +# Pattern: pkg/detectors/(/v)?/.go, excludes _test.go and +# files inside common/, custom_detectors/. +mapfile -t CHANGED_DIRS < <( + git diff --name-only "$MERGE_BASE...$HEAD_REF" -- 'pkg/detectors/**/*.go' \ + | grep -Ev '_test\.go$' \ + | grep -Ev '^pkg/detectors/(common|custom_detectors)/' \ + | sed -E 's|^(pkg/detectors/[^/]+(/v[0-9]+)?)/[^/]+\.go$|\1|' \ + | sort -u +) + +# Step 2 — defaults.go imports at each ref. Each line has form +# "github.com/trufflesecurity/trufflehog/v3/pkg/detectors/(/v)?" +# We extract just the (/v)? portion to use as the dir identifier. +parse_defaults_imports() { + local ref="$1" + git show "$ref:pkg/engine/defaults/defaults.go" 2>/dev/null \ + | grep -oE '"github\.com/trufflesecurity/trufflehog/v3/pkg/detectors/[^"]+"' \ + | sed -E 's|.*/pkg/detectors/||; s|"$||' \ + | sort -u +} + +mapfile -t HEAD_IMPORTS < <(parse_defaults_imports "$HEAD_REF") +mapfile -t BASE_IMPORTS < <(parse_defaults_imports "$MERGE_BASE") + +# Set difference: detectors imported at HEAD but not at BASE. The dir +# identifier (e.g. "github/v2", "stripe") matches the form we extracted in +# step 1, so we can intersect directly without re-mapping. +NEW_DIRS_FILE=$(mktemp) +trap 'rm -f "$NEW_DIRS_FILE"' EXIT +comm -23 \ + <(printf '%s\n' "${HEAD_IMPORTS[@]+"${HEAD_IMPORTS[@]}"}") \ + <(printf '%s\n' "${BASE_IMPORTS[@]+"${BASE_IMPORTS[@]}"}") \ + > "$NEW_DIRS_FILE" + +is_new_detector() { + grep -qxF "$1" "$NEW_DIRS_FILE" +} + +# Step 2b — skip detectors whose diff doesn't touch regex patterns or Keywords. 
# Corpora results only change when the matching logic changes; verification,
# redaction, or structural changes don't affect match counts.

# _keywords_func_body REF FILE
#   Print the lines following the `func ... Keywords() []string` line in FILE
#   as it exists at REF, up to and including the first line that starts with
#   `}`. Prints nothing when the file or the function is absent at REF.
_keywords_func_body() {
    local ref="$1" file="$2"
    git show "$ref:$file" 2>/dev/null \
        | awk '/func[[:space:]].*Keywords\(\)[[:space:]]*\[\]string/,/^[[:space:]]*\}/' \
        | tail -n +2
}

# has_pattern_change DIR
#   Return 0 when the diff between $MERGE_BASE and $HEAD_REF touches a regex
#   or the Keywords() list in a non-test .go file directly inside DIR;
#   return 1 otherwise. Reads the MERGE_BASE and HEAD_REF globals.
has_pattern_change() {
    local dir="$1"

    # Ask git which files changed instead of shell-globbing the working tree,
    # so files deleted by the PR are seen too. Keep only non-test .go files
    # that are direct children of DIR (a subdirectory such as DIR/v2 is not
    # part of this directory's file set).
    local -a files=()
    local path rel
    while IFS= read -r path; do
        rel="${path#"$dir"/}"
        if [[ "$rel" == "$path" || "$rel" == */* ]]; then
            continue
        fi
        if [[ "$rel" != *.go || "$rel" == *_test.go ]]; then
            continue
        fi
        files+=("$path")
    done < <(git diff --name-only "$MERGE_BASE...$HEAD_REF" -- "$dir" 2>/dev/null)

    if (( ${#files[@]} == 0 )); then
        return 1
    fi

    # Fast path: regex or Keywords() signature on a changed line.
    # The diff is captured first and fed to grep via a here-string: piping
    # `git diff` straight into `grep -q` under `set -o pipefail` can report
    # failure (git killed by SIGPIPE once grep exits early) even though grep
    # found a match.
    local diff_text
    diff_text=$(git diff "$MERGE_BASE...$HEAD_REF" -- "${files[@]}" 2>/dev/null) || true
    if grep -qE '^[+-][^+-].*(regexp\.|MustCompile|Keywords)' <<<"$diff_text"; then
        return 0
    fi

    # Slow path: compare the Keywords() function body between refs to catch
    # changes to the return value (e.g. []string{"old"} → []string{"new"})
    # where the changed lines don't mention "Keywords" themselves.
    local file head_body base_body
    for file in "${files[@]}"; do
        head_body=$(_keywords_func_body "$HEAD_REF" "$file") || true
        base_body=$(_keywords_func_body "$MERGE_BASE" "$file") || true
        if [[ "$head_body" != "$base_body" ]]; then
            return 0
        fi
    done

    return 1
}

# Step 3 — for a dir, derive `name[.vN]`.
#
# detector_id_for_dir DIR
#   Print the lowercase proto enum name for the detector in DIR, followed by
#   `.vN` when DIR ends in a `/vN` component. Returns 1 (printing nothing)
#   when no `return ...DetectorType_X` statement is found in DIR's non-test
#   .go files.
detector_id_for_dir() {
    local dir="$1"
    local version=""
    if [[ "$dir" =~ ^pkg/detectors/[^/]+/v([0-9]+)$ ]]; then
        version=".v${BASH_REMATCH[1]}"
    fi

    # Select non-test sources explicitly. Filtering grep's output on the
    # filename prefix is unreliable: grep omits the prefix when it is given a
    # single file.
    local -a sources=()
    local src
    for src in "$dir"/*.go; do
        if [[ -f "$src" && "$src" != *_test.go ]]; then
            sources+=("$src")
        fi
    done
    if (( ${#sources[@]} == 0 )); then
        return 1
    fi

    # Extract proto enum name. Multiple matches are possible (a detector may
    # also reference related types in helpers); the Type() return is by far
    # the most common, so the modal value wins.
    # `[^[:space:]]` replaces the GNU-only `\S`, and awk picks the first row
    # while still draining its input, so no stage exits early under pipefail.
    local proto
    proto=$(
        grep -hE 'return[[:space:]]+[^[:space:]]*DetectorType_[A-Za-z0-9]+' "${sources[@]}" 2>/dev/null \
            | grep -oE 'DetectorType_[A-Za-z0-9]+' \
            | sort | uniq -c | sort -rn \
            | awk 'NR == 1 { print $2 }' \
            | sed 's/^DetectorType_//' \
            | tr '[:upper:]' '[:lower:]'
    ) || true
    if [[ -z "$proto" ]]; then
        return 1
    fi
    printf '%s\n' "${proto}${version}"
}

# Step 4 — emit per mode.
+emit_list() { + local dir id + for dir in "${CHANGED_DIRS[@]:-}"; do + [[ -z "$dir" ]] && continue + has_pattern_change "$dir" || continue + if id=$(detector_id_for_dir "$dir"); then + echo "$id" + else + echo "warning: could not resolve detector id for $dir" >&2 + fi + done | sort -u +} + +emit_main_list() { + local dir id + for dir in "${CHANGED_DIRS[@]:-}"; do + [[ -z "$dir" ]] && continue + has_pattern_change "$dir" || continue + # Strip `pkg/detectors/` prefix to get the import-path form, then + # check against the new-detector set. + local import_form="${dir#pkg/detectors/}" + if is_new_detector "$import_form"; then + continue + fi + if id=$(detector_id_for_dir "$dir"); then + echo "$id" + fi + done | sort -u +} + +emit_new_list() { + local dir id + for dir in "${CHANGED_DIRS[@]:-}"; do + [[ -z "$dir" ]] && continue + has_pattern_change "$dir" || continue + local import_form="${dir#pkg/detectors/}" + if ! is_new_detector "$import_form"; then + continue + fi + if id=$(detector_id_for_dir "$dir"); then + echo "$id" + fi + done | sort -u +} + +case "$MODE" in + list) emit_list ;; + --pr-csv) emit_list | paste -sd, - ;; + --main-csv) emit_main_list | paste -sd, - ;; + --new-only) emit_new_list ;; + *) echo "Usage: $0 [--pr-csv|--main-csv|--new-only]" >&2; exit 2 ;; +esac diff --git a/scripts/test/detector_corpora_test.sh b/scripts/test/detector_corpora_test.sh new file mode 100755 index 000000000..9b17eacfc --- /dev/null +++ b/scripts/test/detector_corpora_test.sh @@ -0,0 +1,127 @@ +#!/bin/bash +set -euo pipefail + +if [[ $# -lt 1 ]]; then + echo "Usage: $0 [ ...]" + exit 1 +fi + +# CI sets OUTPUT_JSONL to per-run paths and skips the human-readable DuckDB +# summary. Local invocations leave it unset and get the summary table for +# debugging. 
+if [[ -z "${OUTPUT_JSONL+x}" ]]; then + OUTPUT_JSONL="/tmp/corpora_results.jsonl" + RUN_DUCKDB_SUMMARY=1 +else + RUN_DUCKDB_SUMMARY=0 +fi +> "$OUTPUT_JSONL" + +REPO_ROOT="$(git rev-parse --show-toplevel)" +TRUFFLEHOG_BIN="${TRUFFLEHOG_BIN:-${REPO_ROOT}/trufflehog}" + +if [[ ! -x "$TRUFFLEHOG_BIN" ]]; then + CGO_ENABLED=0 go build -o "$TRUFFLEHOG_BIN" "$REPO_ROOT" +fi + +# When set, scope the scan to specific detectors. Comma-separated, lowercase +# proto enum names with optional ".v" suffix (matches the format produced +# by scripts/test/detect_changed_detectors.sh). +INCLUDE_DETECTORS="${INCLUDE_DETECTORS:-}" +INCLUDE_FLAG=() +if [[ -n "$INCLUDE_DETECTORS" ]]; then + INCLUDE_FLAG=(--include-detectors="$INCLUDE_DETECTORS") +fi + +if [[ -n "${OUTPUT_JSONL_MAIN:-}" ]]; then + > "$OUTPUT_JSONL_MAIN" +fi + +# --no-verification avoids network calls against a large corpus where thousands +# of matches could trigger API calls, dominating runtime. Verifier behavior is +# covered by detector unit and integration tests. +# +# Dual-binary mode: when TRUFFLEHOG_BIN_MAIN / OUTPUT_JSONL_MAIN / +# INCLUDE_DETECTORS_MAIN are set, the corpus stream is teed to both the PR +# binary (stdout side) and the main binary (process substitution) so S3 is +# only downloaded once. +scan() { + local input="$1" + set +e + + local main_include_flag=() + if [[ -n "${INCLUDE_DETECTORS_MAIN:-}" ]]; then + main_include_flag=(--include-detectors="$INCLUDE_DETECTORS_MAIN") + fi + + local rc=0 + if [[ -n "${TRUFFLEHOG_BIN_MAIN:-}" ]]; then + # Single S3 download teed to both binaries simultaneously. 
+ unzstd -c "$input" \ + | jq -r .content \ + | tee >( + "${TRUFFLEHOG_BIN_MAIN}" \ + --no-update \ + --no-verification \ + --allow-verification-overlap \ + --log-level=3 \ + --concurrency=8 \ + --json \ + --archive-timeout=2h \ + "${main_include_flag[@]}" \ + stdin >> "${OUTPUT_JSONL_MAIN}" + ) \ + | "$TRUFFLEHOG_BIN" \ + --no-update \ + --no-verification \ + --allow-verification-overlap \ + --log-level=3 \ + --concurrency=8 \ + --json \ + --print-avg-detector-time \ + --archive-timeout=2h \ + "${INCLUDE_FLAG[@]}" \ + stdin >> "$OUTPUT_JSONL" + rc=$? + wait + else + unzstd -c "$input" \ + | jq -r .content \ + | "$TRUFFLEHOG_BIN" \ + --no-update \ + --no-verification \ + --allow-verification-overlap \ + --log-level=3 \ + --concurrency=8 \ + --json \ + --print-avg-detector-time \ + --archive-timeout=2h \ + "${INCLUDE_FLAG[@]}" \ + stdin >> "$OUTPUT_JSONL" + rc=$? + fi + set -e + return $rc +} + +for CORPORA_FILE in "$@"; do + if [[ "$CORPORA_FILE" == s3://* ]]; then + aws s3 cp "$CORPORA_FILE" - | scan /dev/stdin + else + scan "$CORPORA_FILE" + fi +done + +if [[ "$RUN_DUCKDB_SUMMARY" == "1" ]]; then + duckdb -c " +CREATE TABLE t AS FROM read_json_auto('$OUTPUT_JSONL', ignore_errors=true); + +SELECT + t.DetectorName detector, + COUNT(*) total +FROM t +GROUP BY all +ORDER BY total DESC, detector +LIMIT 50; +" +fi diff --git a/scripts/test/diff_corpora_results.py b/scripts/test/diff_corpora_results.py new file mode 100755 index 000000000..63b5f771a --- /dev/null +++ b/scripts/test/diff_corpora_results.py @@ -0,0 +1,272 @@ +#!/usr/bin/env python3 +""" +Diffs two trufflehog JSONL outputs (main vs PR build) and emits a Markdown +report to stdout. + +Identity per finding: (DetectorName, Raw or RawV2 fallback). Set semantics — +duplicates within a single scan collapse into one identity, so a regex change +either adds a new (detector, secret) identity or removes one. 
+ +Verification is disabled at scan time (--no-verification) to avoid network +calls against a large corpus where thousands of matches could dominate runtime. +The diff measures regex match changes only. + +When --changed-detectors is provided, the report focuses on the detectors +changed by the PR. Detectors flagged via --new-detectors are rendered with 🆕 +status and absolute density (no main baseline). When --corpus-bytes is +provided, a blast-radius column projects matches per 10 GB of scanned content. + +Usage: + diff_corpora_results.py + [--changed-detectors=] + [--new-detectors=] + [--corpus-bytes=] +""" +import argparse +import json +import sys +from collections import defaultdict + + +PREAMBLE = ( + "Scans a corpus of real-world public code against only the detectors " + "changed in this PR, then compares unique match counts between the PR " + "build and the main baseline to catch regex regressions. Verification " + "is disabled — each detector's regex is measured independently." +) + +STATUS_KEY = ( + "- 🔴 regression: >5 new, >20% increase over main, or any removed\n" + "- ⚠️ warning: 1–5 new and ≤20% increase over main\n" + "- ✅ clean\n" + "- 🆕 new detector (no baseline)" +) + +# Marker on the very first line of the body so peter-evans/find-comment can +# locate the sticky comment via substring match. Workflow file references the +# same literal — keep the two in sync. +STICKY_COMMENT_MARKER = "" + + +def parse_csv(s): + """Parse a comma-separated detector list into normalized name set. + + Strips ``.v`` version suffixes and lowercases. JSONL DetectorName is the + proto enum name (e.g., ``JDBC``); we match case-insensitively by name only, + since version doesn't appear in the output. Versioned scoping happens at + the trufflehog --include-detectors level. + """ + if not s: + return set() + out = set() + for item in s.split(","): + item = item.strip() + if not item: + continue + if "." 
in item: + item = item.split(".", 1)[0] + out.add(item.lower()) + return out + + +def load_findings(path): + """Returns dict: detector_name -> {"identities": set[str], "total": int}.""" + by_detector = defaultdict(lambda: {"identities": set(), "total": 0}) + with open(path, "r", encoding="utf-8", errors="replace") as f: + for line in f: + line = line.strip() + if not line: + continue + try: + obj = json.loads(line) + except json.JSONDecodeError: + continue + detector = obj.get("DetectorName") or "" + if not detector: + continue + raw = obj.get("Raw") or obj.get("RawV2") or "" + by_detector[detector]["identities"].add(raw) + by_detector[detector]["total"] += 1 + return by_detector + + +def status_emoji(new_count, removed_count, unique_main): + """Hybrid threshold: 🔴 on absolute (>5) OR relative (>20% of main) NEW, OR any REMOVED.""" + if removed_count > 0: + return "🔴" + if new_count > 5 or new_count > 0.20 * max(unique_main, 1): + return "🔴" + if new_count > 0: + return "⚠️" + return "✅" + + +def build_top_line_summary(rows, changed): + regressed = sum(1 for r in rows if not r["is_new"] and r["emoji"] == "🔴") + warned = sum(1 for r in rows if not r["is_new"] and r["emoji"] == "⚠️") + new_count = sum(1 for r in rows if r["is_new"]) + clean = sum(1 for r in rows if r["emoji"] == "✅") + scoped = ", ".join(f"`{d}`" for d in sorted(changed)) if changed else "" + parts = [] + if regressed: + parts.append(f"{regressed} regressed") + if warned: + parts.append(f"{warned} warned") + parts += [f"{new_count} new", f"{clean} clean"] + summary = f"**{' · '.join(parts)}**" + if scoped: + summary += f" \u00a0|\u00a0 Scoped to: {scoped}" + return summary + + +def render(main, pr, changed=None, new_detectors=None): + new_detectors = new_detectors or set() + + if changed: + all_names = {d for d in (set(main) | set(pr)) + if d.lower() in changed} + # Detectors that the PR claims to have changed (or added) but that + # produced zero matches on either side. 
These don't appear in JSONL, + # so we surface them as a warning row. + seen_lower = {d.lower() for d in (set(main) | set(pr))} + missing = sorted(d for d in changed if d not in seen_lower) + else: + all_names = set(main) | set(pr) + missing = [] + + _empty = {"identities": set(), "total": 0} + rows = [] + has_diff = False + for d in sorted(all_names): + # A detector is only treated as fully new if the new_detectors set + # says so AND main produced no findings for it. When a PR modifies an + # existing version and adds a new version of the same detector (e.g. + # jdbc.v1 + jdbc.v2), both collapse to "jdbc" in new_detectors but + # main still ran against the existing version — its results must not + # be discarded. + is_new = d.lower() in new_detectors and d not in main + m = main.get(d, _empty) + p = pr.get(d, _empty) + new_ids = p["identities"] - m["identities"] + removed_ids = m["identities"] - p["identities"] + + if is_new: + emoji = "🆕" + else: + emoji = status_emoji(len(new_ids), len(removed_ids), len(m["identities"])) + + if new_ids or removed_ids or m["total"] != p["total"]: + has_diff = True + + rows.append({ + "detector": d, + "is_new": is_new, + "emoji": emoji, + "total_main": m["total"], + "total_pr": p["total"], + "unique_main": len(m["identities"]), + "unique_pr": len(p["identities"]), + "new_count": len(new_ids), + "removed_count": len(removed_ids), + }) + + parts = [ + STICKY_COMMENT_MARKER, + "## Corpora Test Results", + "", + PREAMBLE, + "", + ] + if rows: + parts += [build_top_line_summary(rows, changed), ""] + + if not rows and not missing: + parts += ["_(No findings on either side for the changed detectors.)_", ""] + return "\n".join(parts) + + if rows: + if has_diff or any(r["is_new"] for r in rows): + rows.sort( + key=lambda r: ( + 0 if r["is_new"] else 1, + -(r["new_count"] + r["removed_count"]), + r["detector"], + ) + ) + else: + rows.sort(key=lambda r: r["detector"]) + + cols = ["Status", "Detector", "Unique matches (main)", "Unique 
matches (PR)", + "New", "Removed"] + aligns = ["", "", "---:", "---:", "---:", "---:"] + parts += [ + "| " + " | ".join(cols) + " |", + "|" + "|".join(a if a else "---" for a in aligns) + "|", + ] + + for r in rows: + if r["is_new"]: + cells = [ + r["emoji"], + r["detector"], + "—", + str(r["unique_pr"]), + "—", + "—", + ] + else: + cells = [ + r["emoji"], + r["detector"], + str(r["unique_main"]), + str(r["unique_pr"]), + str(r["new_count"]), + str(r["removed_count"]), + ] + parts.append("| " + " | ".join(cells) + " |") + parts.append("") + parts.append(STATUS_KEY) + parts.append("") + + if missing: + parts += [ + "### ⚠️ Changed detectors with zero matches in both builds", + "", + "These detectors were modified by the PR but produced no matches " + "against the corpus on either side. Could be a deliberate scope " + "narrowing, or — more concerning — a regex so loose the engine " + "silently filtered the flood (issue #3578). Worth a manual look.", + "", + ] + for d in missing: + parts.append(f"- `{d}`") + parts.append("") + + return "\n".join(parts) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("main_jsonl") + parser.add_argument("pr_jsonl") + parser.add_argument("--changed-detectors", default="", + help="CSV of detectors changed in PR; filters report.") + parser.add_argument("--new-detectors", default="", + help="CSV of detectors present in PR but not main; rendered with 🆕.") + args = parser.parse_args() + + main_findings = load_findings(args.main_jsonl) + pr_findings = load_findings(args.pr_jsonl) + changed = parse_csv(args.changed_detectors) + new_detectors = parse_csv(args.new_detectors) + + sys.stdout.write(render( + main_findings, + pr_findings, + changed=changed if changed else None, + new_detectors=new_detectors, + )) + + +if __name__ == "__main__": + main()