diff --git a/.github/workflows/detector-corpora-test.yml b/.github/workflows/detector-corpora-test.yml
new file mode 100644
index 000000000..265eb3787
--- /dev/null
+++ b/.github/workflows/detector-corpora-test.yml
@@ -0,0 +1,266 @@
+name: Corpora Test
+
+on:
+  workflow_dispatch:
+  pull_request:
+    types: [opened]  # NOTE(review): fires only when a PR is opened; later pushes need a manual workflow_dispatch — confirm intended
+    paths:
+      - 'pkg/detectors/**'
+      - 'pkg/engine/defaults/defaults.go'
+      - '.github/workflows/detector-corpora-test.yml'
+      - 'scripts/test/detector_corpora_test.sh'
+      - 'scripts/test/diff_corpora_results.py'
+      - 'scripts/test/detect_changed_detectors.sh'
+
+env:  # DATASETS: newline-separated corpus files, split into arguments by the "Run corpora tests" step
+  DATASETS: |
+    s3://trufflehog-corpora-datasets/contents.2025-11-04.jsonl.zstd
+
+jobs:
+  corpora-test:  # gated by `if:` to the upstream repository and to PRs that do not come from forks
+    if: ${{ github.repository == 'trufflesecurity/trufflehog' && !github.event.pull_request.head.repo.fork }}
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      pull-requests: write  # the comment steps below create/update a PR comment
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6
+        with:
+          fetch-depth: 0  # full history so `git merge-base origin/main HEAD` can resolve
+          persist-credentials: false
+
+      - name: Install Go
+        uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6
+        with:
+          go-version: "1.25"
+
+      - name: Install dependencies
+        run: sudo apt-get update && sudo apt-get install -y zstd jq  # refresh the package index first; a stale index can make install fail
+
+      - name: Resolve merge-base
+        shell: bash
+        id: merge_base
+        run: |
+          set -o pipefail
+          git fetch --no-tags --prune origin main  # make origin/main current before computing the base
+          base_sha="$(git merge-base origin/main HEAD)"
+          printf 'Merge base: %s\n' "$base_sha"
+          printf 'sha=%s\n' "$base_sha" >> "$GITHUB_OUTPUT"
+
+      # Determine which detectors changed in this PR. The PR build scopes its
+      # scan to the full set; the main build excludes detectors that don't
+      # exist there yet (new detectors). If the set is empty, the workflow
+      # short-circuits with a skip comment — scoping is the entire point of
+      # Phase 2, falling back to scan-all defeats it.
+      - name: Detect changed detectors
+        id: detect
+        shell: bash
+        env:
+          BASE_REF: ${{ steps.merge_base.outputs.sha }}
+        run: |
+          set -eo pipefail
+          chmod +x scripts/test/detect_changed_detectors.sh
+          # Let a failing detection script fail this step (errexit): masking
+          # it with `|| true` would leave every list empty and post a false
+          # "no detector changes" skip comment on the PR.
+          PR_CSV=$(./scripts/test/detect_changed_detectors.sh --pr-csv)
+          MAIN_CSV=$(./scripts/test/detect_changed_detectors.sh --main-csv)
+          NEW_LIST=$(./scripts/test/detect_changed_detectors.sh --new-only)
+          NEW_CSV=$(echo "$NEW_LIST" | paste -sd, -)
+          echo "PR detectors:   $PR_CSV"
+          echo "Main detectors: $MAIN_CSV"
+          echo "New detectors:  $NEW_CSV"
+          echo "pr_csv=$PR_CSV" >> "$GITHUB_OUTPUT"
+          echo "main_csv=$MAIN_CSV" >> "$GITHUB_OUTPUT"
+          echo "new_csv=$NEW_CSV" >> "$GITHUB_OUTPUT"
+          if [[ -n "$PR_CSV" ]]; then any_changed=true; else any_changed=false; fi
+          echo "any_changed=$any_changed" >> "$GITHUB_OUTPUT"
+
+      # Sticky comment: one bot comment per PR, located by the marker
+      # substring and replaced in place. The marker must stay identical to
+      # STICKY_COMMENT_MARKER in scripts/test/diff_corpora_results.py and be
+      # present in BOTH the skip body and the diff body, so whichever path a
+      # run takes rewrites the same comment. The skip body is posted only for
+      # pull_request events; a workflow_dispatch run with no changed
+      # detectors finishes without commenting.
+      - name: Find existing skip comment
+        id: find_skip_comment
+        if: github.event_name == 'pull_request' && steps.detect.outputs.any_changed != 'true'
+        uses: peter-evans/find-comment@b30e6a3c0ed37e7c023ccd3f1db5c6c0b0c23aad # v4
+        with:
+          body-includes: "<!-- detector-bench -->"
+          comment-author: "github-actions[bot]"
+          issue-number: ${{ github.event.pull_request.number }}
+
+      - name: Post or update skip comment
+        if: github.event_name == 'pull_request' && steps.detect.outputs.any_changed != 'true'
+        uses: peter-evans/create-or-update-comment@e8674b075228eee787fea43ef493e45ece1004c9 # v5
+        with:
+          issue-number: ${{ github.event.pull_request.number }}
+          comment-id: ${{ steps.find_skip_comment.outputs.comment-id }}
+          edit-mode: replace
+          body: |
+            <!-- detector-bench -->
+            ## Corpora Test Results
+
+            No detector regex or keyword changes in this PR. Bench skipped.
+
+      - name: Configure AWS credentials
+        uses: aws-actions/configure-aws-credentials@ec61189d14ec14c8efccab744f656cffd0e33f37 # v6
+        if: steps.detect.outputs.any_changed == 'true'  # credentials are only needed when a scan will run
+        with:
+          aws-region: us-east-1
+          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY }}
+          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+
+      # Main-side results are cached under merge-base + scoped detector set.
+      # When neither changed since an earlier run, the restore hits and the
+      # main scan (35 GB of S3 streaming + trufflehog) is skipped entirely.
+      # NOTE(review): the key embeds the raw CSV — confirm it stays within the cache key length limit.
+      - name: Restore main scan cache
+        id: main_scan_cache
+        uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5
+        if: steps.detect.outputs.any_changed == 'true' && steps.detect.outputs.main_csv != ''
+        with:
+          key: main-scan-v1-${{ steps.merge_base.outputs.sha }}-${{ steps.detect.outputs.main_csv }}
+          path: /tmp/results-main.jsonl
+
+      # Up to two builds run concurrently:
+      #   A) main binary — add a worktree at the merge-base, then build it.
+      #      Not needed on a main scan cache hit, nor when main_csv is empty
+      #      (every changed detector is new, so there is no baseline).
+      #   B) PR binary — built from the checked-out tree.
+      - name: Build binaries
+        shell: bash
+        if: steps.detect.outputs.any_changed == 'true'
+        env:
+          MERGE_BASE: ${{ steps.merge_base.outputs.sha }}
+          MAIN_CSV: ${{ steps.detect.outputs.main_csv }}
+          MAIN_SCAN_CACHE_HIT: ${{ steps.main_scan_cache.outputs.cache-hit }}
+        run: |
+          set -o pipefail
+
+          # Chain A (background): worktree + main build, only when a fresh
+          # main scan is actually going to run.
+          main_pid=""
+          if [[ "$MAIN_SCAN_CACHE_HIT" != 'true' && -n "$MAIN_CSV" ]]; then
+            (
+              git worktree add /tmp/trufflehog-main-src "$MERGE_BASE"
+              cd /tmp/trufflehog-main-src
+              CGO_ENABLED=0 go build -o /tmp/trufflehog-main .
+            ) &
+            main_pid=$!
+          fi
+
+          # Chain B (background): PR build.
+          CGO_ENABLED=0 go build -o /tmp/trufflehog-pr . &
+          pr_pid=$!
+
+          if [[ -n "$main_pid" ]] && ! wait "$main_pid"; then echo "Main binary build failed" >&2; exit 1; fi
+          if ! wait "$pr_pid"; then echo "PR binary build failed" >&2; exit 1; fi
+
+      # One S3 stream per dataset file feeds the PR binary and, when a fresh
+      # main scan is needed, is teed to the main binary as well. The main
+      # side is skipped on a cache hit (/tmp/results-main.jsonl was restored)
+      # or when main_csv is empty (the PR only adds new detectors).
+      - name: Run corpora tests
+        shell: bash
+        if: steps.detect.outputs.any_changed == 'true'
+        env:
+          PR_CSV: ${{ steps.detect.outputs.pr_csv }}
+          MAIN_CSV: ${{ steps.detect.outputs.main_csv }}
+          MAIN_SCAN_CACHE_HIT: ${{ steps.main_scan_cache.outputs.cache-hit }}
+        run: |
+          set -o pipefail
+          files=()
+          while IFS= read -r dataset; do
+            if [[ -n "$dataset" ]]; then files+=("$dataset"); fi
+          done <<< "$DATASETS"
+
+          # PR side: always scanned.
+          export TRUFFLEHOG_BIN=/tmp/trufflehog-pr
+          export OUTPUT_JSONL=/tmp/results-pr.jsonl
+          export INCLUDE_DETECTORS="$PR_CSV"
+
+          if [[ -z "$MAIN_CSV" ]]; then
+            echo "No overlapping detectors in main; skipping main scan."
+            : > /tmp/results-main.jsonl
+          elif [[ "$MAIN_SCAN_CACHE_HIT" == 'true' ]]; then
+            echo "Main scan cache hit; skipping main scan."
+          else
+            # Dual-binary: the script tees the same download to the main binary.
+            export TRUFFLEHOG_BIN_MAIN=/tmp/trufflehog-main
+            export OUTPUT_JSONL_MAIN=/tmp/results-main.jsonl
+            export INCLUDE_DETECTORS_MAIN="$MAIN_CSV"
+          fi
+
+          ./scripts/test/detector_corpora_test.sh "${files[@]}" \
+            || { echo "Corpora scan failed" >&2; exit 1; }
+
+      - name: Save main scan cache
+        uses: actions/cache/save@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5
+        if: steps.main_scan_cache.outputs.cache-hit != 'true' && steps.detect.outputs.any_changed == 'true' && steps.detect.outputs.main_csv != ''
+        with:  # same key expression as the restore step, so a later run can hit this entry
+          key: main-scan-v1-${{ steps.merge_base.outputs.sha }}-${{ steps.detect.outputs.main_csv }}
+          path: /tmp/results-main.jsonl
+
+      - name: Diff results
+        shell: bash
+        if: steps.detect.outputs.any_changed == 'true'
+        env:
+          CHANGED: ${{ steps.detect.outputs.pr_csv }}
+          NEW_DETECTORS: ${{ steps.detect.outputs.new_csv }}
+        run: |
+          set -o pipefail
+          # Write the Markdown report to a file for the comment step, then echo it to the log.
+          python3 scripts/test/diff_corpora_results.py \
+            --changed-detectors="$CHANGED" --new-detectors="$NEW_DETECTORS" \
+            /tmp/results-main.jsonl /tmp/results-pr.jsonl \
+            > /tmp/diff-report.md
+          cat /tmp/diff-report.md
+
+      # A workflow_dispatch run has no issue context, so the PR number is
+      # looked up from the dispatched branch; pull_request events use the
+      # event's own issue number. The output feeds the find/update pair below.
+      - name: Resolve PR number
+        id: resolve_pr
+        if: steps.detect.outputs.any_changed == 'true'
+        uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9
+        with:
+          script: |
+            if (context.eventName !== 'workflow_dispatch') {
+              core.setOutput('issue_number', context.issue.number);
+              return;
+            }
+            // Manual run: find the open PR whose head is this branch.
+            const branch = context.ref.replace('refs/heads/', '');
+            const { data: openPulls } = await github.rest.pulls.list({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              head: `${context.repo.owner}:${branch}`,
+              state: 'open',
+            });
+            if (openPulls.length === 0) {
+              core.setFailed(`No open PR found for branch ${context.ref}`);
+              return;
+            }
+            core.setOutput('issue_number', openPulls[0].number);
+
+      - name: Find existing diff comment
+        id: find_diff_comment
+        if: steps.detect.outputs.any_changed == 'true'
+        uses: peter-evans/find-comment@b30e6a3c0ed37e7c023ccd3f1db5c6c0b0c23aad # v4
+        with:
+          body-includes: "<!-- detector-bench -->"
+          comment-author: "github-actions[bot]"
+          issue-number: ${{ steps.resolve_pr.outputs.issue_number }}
+
+      - name: Post or update diff comment
+        uses: peter-evans/create-or-update-comment@e8674b075228eee787fea43ef493e45ece1004c9 # v5
+        if: steps.detect.outputs.any_changed == 'true'
+        with:
+          issue-number: ${{ steps.resolve_pr.outputs.issue_number }}
+          comment-id: ${{ steps.find_diff_comment.outputs.comment-id }}
+          body-path: /tmp/diff-report.md
+          edit-mode: replace  # overwrite rather than append, so the comment shows only the latest report
diff --git a/.gitignore b/.gitignore
index 6abf4e766..48d76df3f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,3 +10,7 @@ tmp/go-test.json
 .captain/detectors/quarantines.yaml
 .captain/detectors/flakes.yaml
 .vscode
+
+# Python
+__pycache__/
+*.pyc
diff --git a/scripts/test/detect_changed_detectors.sh b/scripts/test/detect_changed_detectors.sh
new file mode 100755
index 000000000..1e1bfbc7f
--- /dev/null
+++ b/scripts/test/detect_changed_detectors.sh
@@ -0,0 +1,206 @@
+#!/usr/bin/env bash
+#
+# detect_changed_detectors.sh — Phase 2
+#
+# Emits the list of detectors changed between two git refs, formatted for
+# trufflehog's --include-detectors flag (comma-separated, lowercase protobuf
+# enum names, optional ".v<n>" version suffix).
+#
+# Source of truth for each detector's identifier:
+#   - Proto enum name comes from the detector's Type() implementation in its
+#     source files (e.g. `return detectorspb.DetectorType_AzureBatch` →
+#     `azurebatch`). Necessary because the package directory often differs
+#     from the enum name (azure_batch vs AzureBatch, npmtokenv2 vs NpmToken,
+#     close vs closecrm, etc.).
+#   - Version comes from the directory suffix only (`/v<n>`). Detectors that
+#     encode the version in the dir name (e.g. `npmtokenv2`) are emitted
+#     without a version suffix; trufflehog then matches all versions of that
+#     proto type — wider scope but correct.
+#
+# "New detector" detection compares pkg/engine/defaults/defaults.go imports
+# between the two refs. A detector imported at HEAD but not at BASE is new.
+#
+# Modes:
+#   (none)       List all changed detectors at HEAD, one per line, in
+#                <name>[.v<n>] form.
+#   --pr-csv     Same set as default mode, comma-joined.
+#   --main-csv   Changed detectors that also exist at BASE (excludes new),
+#                comma-joined. Use as --include-detectors for the main build.
+#   --new-only   Just the new detectors (in HEAD but not BASE), one per line.
+#
+# Env:
+#   BASE_REF   default origin/main
+#   HEAD_REF   default HEAD
+
+set -euo pipefail
+
+MODE="${1:-list}"
+: "${BASE_REF:=origin/main}"
+: "${HEAD_REF:=HEAD}"
+
+# Everything below uses repo-relative paths, so work from the repo root.
+REPO_ROOT="$(git rev-parse --show-toplevel)"
+cd "$REPO_ROOT"
+
+# BASE must resolve to a commit. If the requested ref is unknown (e.g. no
+# fetched origin/main in a local clone), a local `main` is used instead.
+if ! git rev-parse --verify "$BASE_REF" >/dev/null 2>&1; then
+    if ! git rev-parse --verify main >/dev/null 2>&1; then
+        echo "error: cannot resolve BASE_REF=$BASE_REF and no local 'main'" >&2
+        exit 1
+    fi
+    BASE_REF=main
+fi
+
+# All diffs and "at BASE" lookups below are taken against this commit.
+MERGE_BASE=$(git merge-base "$BASE_REF" "$HEAD_REF")
+
+# Step 1 — changed detector dirs (relative to repo root). Every changed
+# non-test .go file below pkg/detectors/<name>/ maps to its containing dir, so
+# nested packages yield a dir too; unmatched paths are dropped (sed -n ... p).
+mapfile -t CHANGED_DIRS < <(
+    git diff --name-only "$MERGE_BASE...$HEAD_REF" -- 'pkg/detectors/**/*.go' \
+        | grep -Ev '_test\.go$' \
+        | grep -Ev '^pkg/detectors/(common|custom_detectors)/' \
+        | sed -nE 's|^(pkg/detectors/[^/]+(/.*)?)/[^/]+\.go$|\1|p' \
+        | sort -u
+)
+
+# Step 2 — detector packages imported by defaults.go at a given ref. Import
+# lines look like "github.com/trufflesecurity/trufflehog/v3/pkg/detectors/<name>(/v<n>)?";
+# only the <name>(/v<n>)? tail is kept, matching the dir identifier form.
+parse_defaults_imports() {
+    local rev="$1" import_re='"github\.com/trufflesecurity/trufflehog/v3/pkg/detectors/[^"]+"'
+    git show "$rev:pkg/engine/defaults/defaults.go" 2>/dev/null \
+        | grep -oE "$import_re" \
+        | sed -E -e 's|.*/pkg/detectors/||' -e 's|"$||' \
+        | sort -u
+}
+
+readarray -t HEAD_IMPORTS < <(parse_defaults_imports "$HEAD_REF")
+readarray -t BASE_IMPORTS < <(parse_defaults_imports "$MERGE_BASE")
+
+# Set difference: detectors imported at HEAD but not at BASE, one import-path
+# identifier per line (e.g. "github/v2", "stripe"). The ${arr[@]+"${arr[@]}"}
+# form expands an empty array to nothing instead of tripping `set -u`.
+NEW_DIRS_FILE=$(mktemp)
+trap 'rm -f "$NEW_DIRS_FILE"' EXIT  # remove the temp file when the script exits
+comm -23 \
+    <(printf '%s\n' "${HEAD_IMPORTS[@]+"${HEAD_IMPORTS[@]}"}") \
+    <(printf '%s\n' "${BASE_IMPORTS[@]+"${BASE_IMPORTS[@]}"}") \
+    > "$NEW_DIRS_FILE"
+
+is_new_detector() {  # $1: import-path form; succeeds iff it is an exact whole line of the new set
+    grep -qxF "$1" "$NEW_DIRS_FILE"
+}
+
+# Step 2b — skip detectors whose diff doesn't touch regex patterns or Keywords.
+# Corpora results only change when the matching logic changes; verification,
+# redaction, or structural changes don't affect match counts.
+has_pattern_change() {
+    local dir="$1" file head_body base_body
+
+    # Fast path: regex or Keywords() signature on a changed line. grep runs
+    # without -q so it drains the whole diff: an early exit could SIGPIPE
+    # `git diff`, and under pipefail that would mask a successful match.
+    git diff "$MERGE_BASE...$HEAD_REF" -- "$dir"/*.go 2>/dev/null \
+        | grep -E '^[+-][^+-].*(regexp\.|MustCompile|Keywords)' >/dev/null && return 0
+
+    # Slow path: compare the Keywords() function body between refs to catch
+    # changes to the return value (e.g. []string{"old"} → []string{"new"})
+    # where the changed lines don't mention "Keywords" themselves.
+    while IFS= read -r file; do
+        [[ "$file" == *_test.go ]] && continue
+        head_body=$(git show "$HEAD_REF:$file" 2>/dev/null \
+            | awk '/func[[:space:]].*Keywords\(\)[[:space:]]*\[\]string/,/^[[:space:]]*\}/' \
+            | tail -n +2)
+        base_body=$(git show "$MERGE_BASE:$file" 2>/dev/null \
+            | awk '/func[[:space:]].*Keywords\(\)[[:space:]]*\[\]string/,/^[[:space:]]*\}/' \
+            | tail -n +2)
+        [[ "$head_body" != "$base_body" ]] && return 0
+    done < <(git diff --name-only "$MERGE_BASE...$HEAD_REF" -- "$dir"/*.go 2>/dev/null)
+
+    return 1
+}
+
+# Step 3 — map a detector dir to its `<protoname>[.v<n>]` identifier.
+# Prints the id, or prints nothing and returns 1 when no proto name is found.
+detector_id_for_dir() {
+    local dir="$1" version="" proto
+
+    # Only a `/v<n>` directory directly under the detector name adds a version.
+    if [[ "$dir" =~ ^pkg/detectors/[^/]+/v([0-9]+)$ ]]; then
+        version=".v${BASH_REMATCH[1]}"
+    fi
+
+    # Proto enum name: collect every `return ...DetectorType_X` in the dir's
+    # non-test sources and keep the most frequent X. Helpers may return
+    # related types too, but the detector's own Type() return dominates.
+    # awk takes the top-ranked line, strips the prefix and lowercases.
+    proto=$(
+        grep -E 'return[[:space:]]+\S*DetectorType_[A-Za-z0-9]+' "$dir"/*.go 2>/dev/null \
+            | grep -v '_test\.go' \
+            | grep -oE 'DetectorType_[A-Za-z0-9]+' \
+            | sort | uniq -c | sort -rn \
+            | awk 'NR == 1 { sub(/^DetectorType_/, "", $2); print tolower($2) }'
+    )
+
+    # An empty result means no matching `return ...DetectorType_X` line was
+    # found in this directory.
+    [[ -n "$proto" ]] || return 1
+    echo "${proto}${version}"
+}
+
+# Step 4 — emit per mode.
+#
+# emit_filtered <scope> prints the sorted, de-duplicated ids of changed
+# detectors whose diff touches matching logic. <scope> picks the subset:
+#   all       every such detector; an unresolvable id logs a warning
+#   existing  only detectors that are not in the new-detector set
+#   new       only detectors that are in the new-detector set
+emit_filtered() {
+    local scope="$1" dir id import_form
+    for dir in "${CHANGED_DIRS[@]:-}"; do
+        [[ -z "$dir" ]] && continue
+        has_pattern_change "$dir" || continue
+        # The new-detector set is keyed by import-path form, i.e. the dir
+        # without its `pkg/detectors/` prefix.
+        import_form="${dir#pkg/detectors/}"
+        if [[ "$scope" == existing ]] && is_new_detector "$import_form"; then
+            continue
+        fi
+        if [[ "$scope" == new ]] && ! is_new_detector "$import_form"; then
+            continue
+        fi
+        if id=$(detector_id_for_dir "$dir"); then
+            echo "$id"
+        elif [[ "$scope" == all ]]; then
+            echo "warning: could not resolve detector id for $dir" >&2
+        fi
+    done | sort -u
+}
+
+# Default and --pr-csv modes: every changed detector at HEAD, in
+# <name>[.v<n>] form.
+emit_list() {
+    emit_filtered all
+}
+
+# --main-csv mode: changed detectors that are not new, i.e. the ones the
+# main build can be scoped to.
+emit_main_list() {
+    emit_filtered existing
+}
+
+# --new-only mode: changed detectors imported at HEAD but not at BASE.
+emit_new_list() {
+    emit_filtered new
+}
+
+case "$MODE" in
+    list)       emit_list ;;                        # default when no argument is given: one id per line
+    --pr-csv)   emit_list | paste -sd, - ;;         # same set, comma-joined
+    --main-csv) emit_main_list | paste -sd, - ;;    # detectors that are not new, comma-joined
+    --new-only) emit_new_list ;;                    # new detectors, one per line
+    *)          echo "Usage: $0 [--pr-csv|--main-csv|--new-only]" >&2; exit 2 ;;
+esac
diff --git a/scripts/test/detector_corpora_test.sh b/scripts/test/detector_corpora_test.sh
new file mode 100755
index 000000000..9b17eacfc
--- /dev/null
+++ b/scripts/test/detector_corpora_test.sh
@@ -0,0 +1,127 @@
+#!/bin/bash
+set -euo pipefail
+
+if (( $# < 1 )); then
+    echo "Usage: $0 <corpora_file.jsonl.zstd> [<corpora_file2.jsonl.zstd> ...]"
+    exit 1
+fi
+
+# An OUTPUT_JSONL supplied by the caller (CI uses per-run paths) disables the
+# human-readable DuckDB summary. When it is unset, as in a local run, a
+# default path is used and the summary table is printed for debugging.
+if [[ -n "${OUTPUT_JSONL+x}" ]]; then
+    RUN_DUCKDB_SUMMARY=0
+else
+    OUTPUT_JSONL="/tmp/corpora_results.jsonl"
+    RUN_DUCKDB_SUMMARY=1
+fi
+: > "$OUTPUT_JSONL"
+
+REPO_ROOT="$(git rev-parse --show-toplevel)"
+TRUFFLEHOG_BIN="${TRUFFLEHOG_BIN:-${REPO_ROOT}/trufflehog}"
+
+# Build from the repo root on demand when nothing executable exists at the
+# chosen binary path.
+[[ -x "$TRUFFLEHOG_BIN" ]] || CGO_ENABLED=0 go build -o "$TRUFFLEHOG_BIN" "$REPO_ROOT"
+
+# Optional detector scope: comma-separated, lowercase proto enum names with
+# an optional ".v<n>" suffix (the format produced by
+# scripts/test/detect_changed_detectors.sh). Empty means no scoping flag.
+INCLUDE_DETECTORS="${INCLUDE_DETECTORS:-}"
+INCLUDE_FLAG=()
+[[ -z "$INCLUDE_DETECTORS" ]] || INCLUDE_FLAG=(--include-detectors="$INCLUDE_DETECTORS")
+
+# When a main-side output path is given (dual-binary mode), that file also
+# starts out empty.
+if [[ -n "${OUTPUT_JSONL_MAIN:-}" ]]; then
+    : > "$OUTPUT_JSONL_MAIN"
+fi
+
+# scan <input>: decompress a .jsonl.zstd corpus, pipe each record's .content
+# into trufflehog over stdin, append JSON findings to $OUTPUT_JSONL, and
+# return the pipeline's exit status. --no-verification avoids network calls
+# that would dominate runtime on a large corpus; verifier behavior is covered
+# by detector unit and integration tests.
+#
+# Dual-binary mode: when TRUFFLEHOG_BIN_MAIN is set, the same stream is teed
+# to the main binary (process substitution), so S3 is only downloaded once.
+scan() {
+    local input="$1"
+    set +e
+
+    local main_include_flag=()
+    if [[ -n "${INCLUDE_DETECTORS_MAIN:-}" ]]; then
+        main_include_flag=(--include-detectors="$INCLUDE_DETECTORS_MAIN")
+    fi
+
+    local rc=0
+    if [[ -n "${TRUFFLEHOG_BIN_MAIN:-}" ]]; then
+        # ${arr[@]+"${arr[@]}"}: bash < 4.4 aborts on an empty array under set -u.
+        unzstd -c "$input" \
+            | jq -r .content \
+            | tee >(
+                "${TRUFFLEHOG_BIN_MAIN}" \
+                    --no-update \
+                    --no-verification \
+                    --allow-verification-overlap \
+                    --log-level=3 \
+                    --concurrency=8 \
+                    --json \
+                    --archive-timeout=2h \
+                    ${main_include_flag[@]+"${main_include_flag[@]}"} \
+                    stdin >> "${OUTPUT_JSONL_MAIN}"
+              ) \
+            | "$TRUFFLEHOG_BIN" \
+                --no-update \
+                --no-verification \
+                --allow-verification-overlap \
+                --log-level=3 \
+                --concurrency=8 \
+                --json \
+                --print-avg-detector-time \
+                --archive-timeout=2h \
+                ${INCLUDE_FLAG[@]+"${INCLUDE_FLAG[@]}"} \
+                stdin >> "$OUTPUT_JSONL"
+        rc=$?
+        wait
+    else
+        unzstd -c "$input" \
+            | jq -r .content \
+            | "$TRUFFLEHOG_BIN" \
+                --no-update \
+                --no-verification \
+                --allow-verification-overlap \
+                --log-level=3 \
+                --concurrency=8 \
+                --json \
+                --print-avg-detector-time \
+                --archive-timeout=2h \
+                ${INCLUDE_FLAG[@]+"${INCLUDE_FLAG[@]}"} \
+                stdin >> "$OUTPUT_JSONL"
+        rc=$?
+    fi
+    set -e
+    return $rc
+}
+
+# s3:// inputs are streamed through `aws s3 cp`; anything else is a local path.
+for CORPORA_FILE in "$@"; do
+    case "$CORPORA_FILE" in
+        s3://*) aws s3 cp "$CORPORA_FILE" - | scan /dev/stdin ;;
+        *)      scan "$CORPORA_FILE" ;;
+    esac
+done
+
+if (( RUN_DUCKDB_SUMMARY == 1 )); then
+    duckdb -c "
+CREATE TABLE t AS FROM read_json_auto('$OUTPUT_JSONL', ignore_errors=true);
+
+SELECT
+    t.DetectorName detector,
+    COUNT(*) total
+FROM t
+GROUP BY all
+ORDER BY total DESC, detector
+LIMIT 50;
+"
+fi
diff --git a/scripts/test/diff_corpora_results.py b/scripts/test/diff_corpora_results.py
new file mode 100755
index 000000000..63b5f771a
--- /dev/null
+++ b/scripts/test/diff_corpora_results.py
@@ -0,0 +1,272 @@
+#!/usr/bin/env python3
+"""
+Diffs two trufflehog JSONL outputs (main vs PR build) and emits a Markdown
+report to stdout.
+
+Identity per finding: (DetectorName, Raw or RawV2 fallback). Set semantics —
+duplicates within a single scan collapse into one identity, so a regex change
+either adds a new (detector, secret) identity or removes one.
+
+Verification is disabled at scan time (--no-verification) to avoid network
+calls against a large corpus where thousands of matches could dominate runtime.
+The diff measures regex match changes only.
+
+When --changed-detectors is provided, the report focuses on the detectors
+changed by the PR. Detectors flagged via --new-detectors are rendered with 🆕
+status and no main baseline: their main, New and Removed cells show "—".
+Note that no --corpus-bytes option (blast-radius projection) is implemented
+by this script; passing one is rejected by the argument parser.
+
+Usage:
+    diff_corpora_results.py <main.jsonl> <pr.jsonl>
+        [--changed-detectors=<csv>]
+        [--new-detectors=<csv>]
+"""
+import argparse
+import json
+import sys
+from collections import defaultdict
+
+
+PREAMBLE = (  # intro paragraph that render() places under the report heading
+    "Scans a corpus of real-world public code against only the detectors "
+    "changed in this PR, then compares unique match counts between the PR "
+    "build and the main baseline to catch regex regressions. Verification "
+    "is disabled — each detector's regex is measured independently."
+)
+
+STATUS_KEY = (  # legend that render() appends after the results table
+    "- 🔴 regression: >5 new, >20% increase over main, or any removed\n"
+    "- ⚠️ warning: 1–5 new and ≤20% increase over main\n"
+    "- ✅ clean\n"
+    "- 🆕 new detector (no baseline)"
+)
+
+# Emitted as the first line of the report body so peter-evans/find-comment
+# can locate the sticky comment by substring match. The workflow file uses
+# the same literal — keep the two in sync.
+STICKY_COMMENT_MARKER = "<!-- detector-bench -->"
+
+
+def parse_csv(s):
+    """Turn a comma-separated detector list into a set of normalized names.
+
+    Each entry is trimmed, cut at its first ``.`` (dropping any ``.v<n>``
+    version suffix) and lowercased; blank entries are skipped. JSONL
+    DetectorName is the proto enum name (e.g., ``JDBC``) with no version, so
+    matching is by case-insensitive name only; versioned scoping happens at
+    the trufflehog --include-detectors level.
+
+    Returns an empty set for ``None`` or an empty string.
+    """
+    names = set()
+    # ``s or ""`` folds None/empty input into a single blank entry, skipped below.
+    for entry in (s or "").split(","):
+        entry = entry.strip()
+        if entry:
+            # Text before the first "." is the bare detector name.
+            names.add(entry.partition(".")[0].lower())
+    return names
+
+
+def load_findings(path):
+    """Returns dict: detector_name -> {"identities": set[str], "total": int}.
+
+    Skips blank or non-JSON lines, non-object JSON values, and nameless records.
+    """
+    by_detector = defaultdict(lambda: {"identities": set(), "total": 0})
+    with open(path, "r", encoding="utf-8", errors="replace") as f:
+        for line in f:
+            try:
+                obj = json.loads(line.strip())
+            except json.JSONDecodeError:
+                continue
+            # Valid JSON need not be an object (e.g. a bare number); only dicts are findings.
+            detector = obj.get("DetectorName") if isinstance(obj, dict) else None
+            if not detector:
+                continue
+            raw = obj.get("Raw") or obj.get("RawV2") or ""
+            by_detector[detector]["identities"].add(raw)
+            by_detector[detector]["total"] += 1
+    return by_detector
+
+
+def status_emoji(new_count, removed_count, unique_main):
+    """Classify a detector row: 🔴 when anything was removed, more than 5
+    identities are new, or new exceeds 20% of main's unique count; ⚠️ for any
+    smaller number of new identities; ✅ otherwise."""
+    # max(..., 1) keeps the relative threshold non-zero when main has no matches.
+    relative_limit = 0.20 * max(unique_main, 1)
+    if removed_count > 0 or new_count > 5 or new_count > relative_limit:
+        return "🔴"
+    return "⚠️" if new_count > 0 else "✅"
+
+
+def build_top_line_summary(rows, changed):
+    """Return the bold one-line tally shown above the results table.
+
+    Regressed/warned counts appear only when non-zero; new and clean always
+    appear. A non-empty ``changed`` appends the sorted names as "Scoped to".
+    """
+    baseline_emojis = [r["emoji"] for r in rows if not r["is_new"]]
+    regressed = baseline_emojis.count("🔴")
+    warned = baseline_emojis.count("⚠️")
+    new_total = len(rows) - len(baseline_emojis)
+    clean = [r["emoji"] for r in rows].count("✅")
+    optional = [f"{n} {label}" for n, label in ((regressed, "regressed"), (warned, "warned")) if n]
+    line = "**" + " · ".join(optional + [f"{new_total} new", f"{clean} clean"]) + "**"
+    if changed:
+        line += " \u00a0|\u00a0 Scoped to: " + ", ".join(f"`{d}`" for d in sorted(changed))
+    return line
+
+
+def render(main, pr, changed=None, new_detectors=None):
+    """Render the Markdown report comparing ``main`` and ``pr`` findings (as
+    returned by load_findings); ``changed`` and ``new_detectors`` are
+    lowercase name sets that scope the rows and mark 🆕 detectors."""
+    new_detectors = new_detectors or set()
+
+    if changed:
+        all_names = {d for d in (set(main) | set(pr))
+                     if d.lower() in changed}
+        # Changed (or added) detectors with zero matches on either side never
+        # appear in JSONL, so they are surfaced separately as a warning list.
+        seen_lower = {d.lower() for d in (set(main) | set(pr))}
+        missing = sorted(d for d in changed if d not in seen_lower)
+    else:
+        all_names = set(main) | set(pr)
+        missing = []
+
+    _empty = {"identities": set(), "total": 0}
+    rows = []
+    has_diff = False
+    for d in sorted(all_names):
+        # Fully new only if new_detectors says so AND main has no findings.
+        # A PR that modifies one version and adds another (e.g. jdbc.v1 +
+        # jdbc.v2) collapses both to "jdbc" in new_detectors, yet main still
+        # scanned the existing version — its results must not be discarded.
+        is_new = d.lower() in new_detectors and d not in main
+        m = main.get(d, _empty)
+        p = pr.get(d, _empty)
+        new_ids = p["identities"] - m["identities"]
+        removed_ids = m["identities"] - p["identities"]
+
+        if is_new:
+            emoji = "🆕"
+        else:
+            emoji = status_emoji(len(new_ids), len(removed_ids), len(m["identities"]))
+
+        if new_ids or removed_ids or m["total"] != p["total"]:
+            has_diff = True
+
+        rows.append({
+            "detector": d,
+            "is_new": is_new,
+            "emoji": emoji,
+            "total_main": m["total"],
+            "total_pr": p["total"],
+            "unique_main": len(m["identities"]),
+            "unique_pr": len(p["identities"]),
+            "new_count": len(new_ids),
+            "removed_count": len(removed_ids),
+        })
+
+    parts = [
+        STICKY_COMMENT_MARKER,
+        "## Corpora Test Results",
+        "",
+        PREAMBLE,
+        "",
+    ]
+    if rows:
+        parts += [build_top_line_summary(rows, changed), ""]
+
+    if not rows and not missing:
+        parts += ["_(No findings on either side for the changed detectors.)_", ""]
+        return "\n".join(parts)
+
+    if rows:
+        if has_diff or any(r["is_new"] for r in rows):
+            rows.sort(
+                key=lambda r: (
+                    0 if r["is_new"] else 1,
+                    -(r["new_count"] + r["removed_count"]),
+                    r["detector"],
+                )
+            )
+        else:
+            rows.sort(key=lambda r: r["detector"])
+
+        cols = ["Status", "Detector", "Unique matches (main)", "Unique matches (PR)",
+                "New", "Removed"]
+        aligns = ["", "", "---:", "---:", "---:", "---:"]
+        parts += [
+            "| " + " | ".join(cols) + " |",
+            "|" + "|".join(a if a else "---" for a in aligns) + "|",
+        ]
+
+        for r in rows:
+            if r["is_new"]:
+                cells = [
+                    r["emoji"],
+                    r["detector"],
+                    "—",
+                    str(r["unique_pr"]),
+                    "—",
+                    "—",
+                ]
+            else:
+                cells = [
+                    r["emoji"],
+                    r["detector"],
+                    str(r["unique_main"]),
+                    str(r["unique_pr"]),
+                    str(r["new_count"]),
+                    str(r["removed_count"]),
+                ]
+            parts.append("| " + " | ".join(cells) + " |")
+        parts.append("")
+        parts.append(STATUS_KEY)
+        parts.append("")
+
+    if missing:
+        parts += [
+            "### ⚠️ Changed detectors with zero matches in both builds",
+            "",
+            "These detectors were modified by the PR but produced no matches "
+            "against the corpus on either side. Could be a deliberate scope "
+            "narrowing, or — more concerning — a regex so loose the engine "
+            "silently filtered the flood (issue #3578). Worth a manual look.",
+            "",
+        ]
+        for d in missing:
+            parts.append(f"- `{d}`")
+        parts.append("")
+
+    return "\n".join(parts)
+
+
+def main():
+    """Parse CLI arguments, load both JSONL result files, and write the report to stdout."""
+    cli = argparse.ArgumentParser()
+    cli.add_argument("main_jsonl")
+    cli.add_argument("pr_jsonl")
+    cli.add_argument("--changed-detectors", default="",
+                     help="CSV of detectors changed in PR; filters report.")
+    cli.add_argument("--new-detectors", default="",
+                     help="CSV of detectors present in PR but not main; rendered with 🆕.")
+    opts = cli.parse_args()
+
+    # An empty --changed-detectors list becomes None, which makes render()
+    # report on every detector found in either file.
+    scope = parse_csv(opts.changed_detectors) or None
+    report = render(
+        load_findings(opts.main_jsonl),
+        load_findings(opts.pr_jsonl),
+        changed=scope,
+        new_detectors=parse_csv(opts.new_detectors),
+    )
+    sys.stdout.write(report)
+
+
+if __name__ == "__main__":
+    main()