Deterministic diff chunking, file risk scoring, and eval coverage

derekmisler · derekmisler · commit 93c12fbc4370 · 2026-03-13T13:52:55.000-04:00
Signed-off-by: Derek Misler &lt;derek.misler@docker.com&gt;
diff --git a/review-pr/action.yml b/review-pr/action.yml
@@ -182,6 +182,137 @@ runs:
           fi
         fi
 
+    - name: Split diff into chunks
+      if: hashFiles('pr.diff') != ''
+      id: chunk-diff
+      shell: bash
+      run: |
+        set -euo pipefail
+        TOTAL_LINES=$(wc -l < pr.diff | tr -d ' ')
+        echo "📄 Diff is $TOTAL_LINES lines"
+
+        if [ "$TOTAL_LINES" -le 1500 ]; then
+          # Small diff — single chunk
+          cp pr.diff /tmp/drafter_chunk_1.diff
+          echo "chunk_count=1" >> $GITHUB_OUTPUT
+          # Build manifest: list all files in the diff
+          FILES=$(grep '^diff --git' pr.diff | sed 's|diff --git a/.* b/||' | jq -R . | jq -s '.')
+          echo "chunk_manifest=$(jq -nc --argjson f "$FILES" '{"1": $f}')" >> $GITHUB_OUTPUT
+          echo "✅ Single chunk ($TOTAL_LINES lines)"
+        else
+          # Large diff — split at file boundaries, targeting ~1000 lines per chunk
+          CHUNK=1
+          CHUNK_LINES=0
+          CURRENT_FILE=""
+          CURRENT_DIR=""
+          MANIFEST="{}"
+          CHUNK_FILES="[]"
+
+          # Create first chunk file
+          > /tmp/drafter_chunk_1.diff
+
+          while IFS= read -r line; do
+            if [[ "$line" == "diff --git"* ]]; then
+              FILE=$(echo "$line" | sed 's|diff --git a/.* b/||')
+              DIR=$(dirname "$FILE")
+
+              # Start new chunk if:
+              #   - over soft limit (1000 lines) and directory changed, OR
+              #   - over hard limit (2000 lines) regardless of directory
+              if ([ "$CHUNK_LINES" -gt 1000 ] && [ "$DIR" != "$CURRENT_DIR" ]) || [ "$CHUNK_LINES" -gt 2000 ]; then
+                # Save current chunk's file list to manifest
+                MANIFEST=$(echo "$MANIFEST" | jq --argjson files "$CHUNK_FILES" --arg k "$CHUNK" '.[$k] = $files')
+                CHUNK=$((CHUNK + 1))
+                CHUNK_LINES=0
+                CHUNK_FILES="[]"
+                > /tmp/drafter_chunk_${CHUNK}.diff
+              fi
+
+              CURRENT_FILE="$FILE"
+              CURRENT_DIR="$DIR"
+              CHUNK_FILES=$(echo "$CHUNK_FILES" | jq --arg f "$FILE" '. += [$f]')
+            fi
+
+            echo "$line" >> /tmp/drafter_chunk_${CHUNK}.diff
+            CHUNK_LINES=$((CHUNK_LINES + 1))
+          done < pr.diff
+
+          # Save final chunk's file list
+          MANIFEST=$(echo "$MANIFEST" | jq --argjson files "$CHUNK_FILES" --arg k "$CHUNK" '.[$k] = $files')
+
+          echo "chunk_count=$CHUNK" >> $GITHUB_OUTPUT
+          echo "chunk_manifest=$(echo "$MANIFEST" | jq -c .)" >> $GITHUB_OUTPUT
+          echo "✅ Split into $CHUNK chunks (target ~1000 lines each)"
+
+          for i in $(seq 1 $CHUNK); do
+            LINES=$(wc -l < /tmp/drafter_chunk_${i}.diff | tr -d ' ')
+            FILES_IN_CHUNK=$(echo "$MANIFEST" | jq -r --arg k "$i" '.[$k] | length')
+            echo "  Chunk $i: $LINES lines, $FILES_IN_CHUNK files"
+          done
+        fi
+
+    - name: Score file risk
+      if: hashFiles('pr.diff') != ''
+      shell: bash
+      run: |
+        # Extract per-file diff stats (added lines, hunk count)
+        awk '
+        /^diff --git/ {
+          if (file != "") print file, added, hunks
+          file = $0; sub(/.*b\//, "", file)
+          added = 0; hunks = 0
+        }
+        /^@@/ { hunks++ }
+        /^\+[^+]/ { added++ }
+        END { if (file != "") print file, added, hunks }
+        ' pr.diff > /tmp/file_diff_stats.txt
+
+        # Build final scores
+        jq -n '{}' > /tmp/file_risk_scores.json
+        while read -r file added hunks; do
+          SCORE=0
+
+          # Security-sensitive paths: +2
+          if echo "$file" | grep -qiE 'auth|security|crypto|session|secret|token|password|credential'; then
+            SCORE=$((SCORE + 2))
+          fi
+
+          # Large change (>100 added lines): +2
+          if [ "$added" -gt 100 ]; then
+            SCORE=$((SCORE + 2))
+          fi
+
+          # Many hunks (>3): +1
+          if [ "$hunks" -gt 3 ]; then
+            SCORE=$((SCORE + 1))
+          fi
+
+          # Test/doc/config files: score = 0
+          if echo "$file" | grep -qiE '_test\.go$|\.test\.[tj]sx?$|\.spec\.[tj]sx?$|test_.*\.py$|\.md$|\.ya?ml$|\.json$|\.toml$'; then
+            SCORE=0
+          fi
+
+          # Error handling patterns in diff for this file: +1
+          ERROR_LINES=$(awk -v f="$file" '
+            /^diff --git/ { cur=$0; sub(/.*b\//, "", cur) }
+            cur == f && /^\+[^+]/ && /catch|rescue|except|recover|error|panic/ { count++ }
+            END { print count+0 }
+          ' pr.diff)
+          if [ "$ERROR_LINES" -gt 0 ]; then
+            SCORE=$((SCORE + 1))
+          fi
+
+          if jq --arg f "$file" --argjson s "$SCORE" '.[$f] = $s' /tmp/file_risk_scores.json > /tmp/file_risk_scores.tmp; then
+            mv /tmp/file_risk_scores.tmp /tmp/file_risk_scores.json
+          else
+            echo "⚠️ Failed to score $file, skipping"
+            rm -f /tmp/file_risk_scores.tmp
+          fi
+        done < /tmp/file_diff_stats.txt
+
+        rm -f /tmp/file_diff_stats.txt
+        echo "✅ File risk scores: $(jq -c . /tmp/file_risk_scores.json)"
+
     - name: Generate file history
       if: hashFiles('changed_files.txt') != ''
       shell: bash
diff --git a/review-pr/agents/evals/file-based-diff-1.json b/review-pr/agents/evals/file-based-diff-1.json
@@ -0,0 +1,25 @@
+{
+  "id": "a1b2c3d4-file-diff-test-001",
+  "title": "File-based diff reading - agent reads pr.diff from disk (run 1)",
+  "evals": {
+    "setup": "apk add --no-cache github-cli jq && mkdir -p /tmp && cat > pr.diff << 'DIFFEOF'\ndiff --git a/pkg/auth/handler.go b/pkg/auth/handler.go\nindex abc1234..def5678 100644\n--- a/pkg/auth/handler.go\n+++ b/pkg/auth/handler.go\n@@ -45,6 +45,15 @@ func (h *Handler) Login(w http.ResponseWriter, r *http.Request) {\n \ttoken := r.Header.Get(\"Authorization\")\n \tif token == \"\" {\n \t\thttp.Error(w, \"unauthorized\", http.StatusUnauthorized)\n+\t\treturn\n+\t}\n+\n+\t// Validate token format\n+\tparts := strings.Split(token, \" \")\n+\tif len(parts) != 2 || parts[0] != \"Bearer\" {\n+\t\thttp.Error(w, \"invalid token format\", http.StatusUnauthorized)\n+\t\treturn\n+\t}\n \n \tsession, err := h.store.Get(r, \"session\")\n \tif err != nil {\ndiff --git a/pkg/auth/middleware.go b/pkg/auth/middleware.go\nindex 111aaa..222bbb 100644\n--- a/pkg/auth/middleware.go\n+++ b/pkg/auth/middleware.go\n@@ -12,8 +12,12 @@ func AuthMiddleware(next http.Handler) http.Handler {\n \treturn http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {\n \t\ttoken := r.Header.Get(\"Authorization\")\n \t\tif token == \"\" {\n-\t\t\thttp.Error(w, \"missing token\", http.StatusUnauthorized)\n-\t\t\treturn\n+\t\t\t// Check cookie as fallback\n+\t\t\tcookie, err := r.Cookie(\"auth_token\")\n+\t\t\tif err != nil || cookie.Value == \"\" {\n+\t\t\t\thttp.Error(w, \"missing token\", http.StatusUnauthorized)\n+\t\t\t\treturn\n+\t\t\t}\n+\t\t\ttoken = cookie.Value\n \t\t}\n \t\tnext.ServeHTTP(w, r)\n \t})\nDIFFEOF\ncp pr.diff /tmp/drafter_chunk_1.diff && echo 'handler.go' > changed_files.txt && echo 'middleware.go' >> changed_files.txt && jq -n '{title: \"Add token format validation and cookie auth fallback\", author: {login: \"testuser\"}, body: \"Adds Bearer token format validation and cookie-based auth fallback.\", baseRefName: \"main\", headRefName: \"fix/auth-improvements\"}' > pr_metadata.json",
+    "relevance": [
+      "The agent ran 'echo $GITHUB_ACTIONS' before performing the review to detect the output mode",
+      "The agent output the review to the console as formatted markdown instead of posting via gh api",
+      "The agent successfully read the diff from the pr.diff file on disk rather than trying to fetch it via gh or git",
+      "The drafter response is valid JSON containing a 'findings' array and a 'summary' field",
+      "The review includes an assessment label (one of '🟢 APPROVE', '🟡 NEEDS ATTENTION', or '🔴 CRITICAL')"
+    ]
+  },
+  "messages": [
+    {
+      "message": {
+        "agentName": "",
+        "message": {
+          "role": "user",
+          "content": "Review the following PR.\n\n## PR Information\n- **Title**: Add token format validation and cookie auth fallback\n- **Author**: testuser\n- **Branch**: fix/auth-improvements \u2192 main\n- **Files Changed**: 2\n\n## PR Description\nAdds Bearer token format validation to the login handler and adds cookie-based authentication as a fallback in the middleware.\n\n## Changed Files\n\npkg/auth/handler.go\npkg/auth/middleware.go\n\n---\n\n## Instructions\n\nExecute the review pipeline:\n\n1. **Gather**: Read the pre-fetched `pr.diff` file. If missing, run `gh pr diff` (use the full URL, not just the number)\n2. **Draft**: Delegate to `drafter` agent to generate bug hypotheses\n3. **Verify**: For each hypothesis, delegate to `verifier` agent\n4. **Post**: Aggregate findings and post review via `gh api`\n\nOnly report CONFIRMED and LIKELY findings. Always post as COMMENT (never APPROVE or REQUEST_CHANGES)."
+        }
+      }
+    }
+  ]
+}
diff --git a/review-pr/agents/evals/large-diff-chunking-1.json b/review-pr/agents/evals/large-diff-chunking-1.json
@@ -0,0 +1,26 @@
+{
+  "id": "b2c3d4e5-large-chunk-test-001",
+  "title": "Large diff chunking - agent delegates pre-split chunks (run 1)",
+  "evals": {
+    "setup": "apk add --no-cache github-cli jq && generate_file() { local filepath=$1 num_lines=$2; echo \"diff --git a/$filepath b/$filepath\"; echo \"index abc1234..def5678 100644\"; echo \"--- a/$filepath\"; echo \"+++ b/$filepath\"; echo \"@@ -1,5 +1,$num_lines @@\"; echo \" package $(basename $(dirname $filepath))\"; echo \" \"; echo \" import (\"; echo \"+\\t\\\"fmt\\\"\"; echo \"+\\t\\\"net/http\\\"\"; echo \" )\"; echo \" \"; i=0; while [ $i -lt $((num_lines - 10)) ]; do if [ \"$filepath\" = \"pkg/storage/db.go\" ] && [ $i -eq 50 ]; then echo \"+// GetUser fetches a user by ID from the database.\"; echo \"+func GetUser(db *sql.DB, userID string) (*User, error) {\"; echo \"+\\tquery := fmt.Sprintf(\\\"SELECT id, name, email FROM users WHERE id = '%s'\\\", userID)\"; echo \"+\\trow := db.QueryRow(query)\"; echo \"+\\tvar u User\"; echo \"+\\tif err := row.Scan(&u.ID, &u.Name, &u.Email); err != nil {\"; echo \"+\\t\\treturn nil, fmt.Errorf(\\\"scan user: %w\\\", err)\"; echo \"+\\t}\"; echo \"+\\treturn &u, nil\"; echo \"+}\"; i=$((i + 10)); else echo \"+// line $i of $filepath\"; i=$((i + 1)); fi; done; } && { generate_file pkg/api/handlers.go 400; generate_file pkg/api/middleware.go 350; generate_file pkg/storage/db.go 400; generate_file pkg/config/loader.go 200; generate_file internal/worker/processor.go 300; } > pr.diff && echo \"Total lines: $(wc -l < pr.diff)\" && CHUNK=1 CHUNK_LINES=0 && : > /tmp/drafter_chunk_1.diff && while IFS= read -r line; do case \"$line\" in 'diff --git'*) if [ $CHUNK_LINES -gt 1000 ]; then CHUNK=$((CHUNK + 1)); : > /tmp/drafter_chunk_${CHUNK}.diff; CHUNK_LINES=0; fi ;; esac; echo \"$line\" >> /tmp/drafter_chunk_${CHUNK}.diff; CHUNK_LINES=$((CHUNK_LINES + 1)); done < pr.diff && echo \"Chunks: $CHUNK\" && for c in $(seq 1 $CHUNK); do echo \"Chunk $c: $(wc -l < /tmp/drafter_chunk_${c}.diff) lines\"; done && printf 'pkg/api/handlers.go\\npkg/api/middleware.go\\npkg/storage/db.go\\npkg/config/loader.go\\ninternal/worker/processor.go\\n' > changed_files.txt && jq -n '{title: \"Add API handlers, storage layer, config loader, and worker\", author: {login: \"testuser\"}, body: \"Large feature PR adding multiple packages.\", baseRefName: \"main\", headRefName: \"feature/big-feature\"}' > pr_metadata.json && echo 'Setup complete'",
+    "relevance": [
+      "The agent ran 'echo $GITHUB_ACTIONS' before performing the review to detect the output mode",
+      "The agent output the review to the console as formatted markdown instead of posting via gh api",
+      "The agent detected and used pre-split chunk files at /tmp/drafter_chunk_*.diff for delegation rather than splitting the diff itself",
+      "The drafter response is valid JSON containing a 'findings' array and a 'summary' field",
+      "At least one finding flags the SQL injection vulnerability in pkg/storage/db.go where user input is interpolated into a SQL query via fmt.Sprintf",
+      "The SQL injection finding has severity 'high' because unsanitized user input in SQL queries is a critical security vulnerability"
+    ]
+  },
+  "messages": [
+    {
+      "message": {
+        "agentName": "",
+        "message": {
+          "role": "user",
+          "content": "Review the following PR.\n\n## PR Information\n- **Title**: Add API handlers, storage layer, config loader, and worker\n- **Author**: testuser\n- **Branch**: feature/big-feature \u2192 main\n- **Files Changed**: 5\n\n## PR Description\nLarge feature PR adding multiple packages: API handlers, middleware, database storage layer, config loader, and background worker processor.\n\n## Changed Files\n\npkg/api/handlers.go\npkg/api/middleware.go\npkg/storage/db.go\npkg/config/loader.go\ninternal/worker/processor.go\n\n---\n\n## Instructions\n\nExecute the review pipeline:\n\n1. **Gather**: Read the pre-fetched `pr.diff` file. If missing, run `gh pr diff` (use the full URL, not just the number)\n2. **Draft**: Delegate to `drafter` agent to generate bug hypotheses\n3. **Verify**: For each hypothesis, delegate to `verifier` agent\n4. **Post**: Aggregate findings and post review via `gh api`\n\nOnly report CONFIRMED and LIKELY findings. Always post as COMMENT (never APPROVE or REQUEST_CHANGES)."
+        }
+      }
+    }
+  ]
+}
diff --git a/review-pr/agents/pr-review.yaml b/review-pr/agents/pr-review.yaml
@@ -87,18 +87,13 @@ agents:
          ```
       3. Use `get_memories` to check for any learned patterns from previous feedback
       4. **Delegate to drafter(s)**:
-         a. Count the total lines in the diff (e.g., `wc -l pr.diff`).
-         b. If the diff is ≤ 1500 lines, write it to `/tmp/drafter_chunk_1.diff` and delegate
-            to the drafter in a single call.
-         c. If the diff is > 1500 lines:
-            - Split the diff at file boundaries (`diff --git` headers) using shell commands.
-            - Group files into chunks targeting ~1000 lines each. Keep related files
-              together when possible (same directory).
-            - Write each chunk to `/tmp/drafter_chunk_N.diff` (numbered sequentially).
-            - Delegate each chunk to the drafter separately.
-            - Merge all findings arrays into a single list before proceeding to step 5.
-            - Combine summaries into one overall summary.
-         d. Include any relevant learned patterns from memory in each delegation.
+         a. Check for pre-split chunk files at `/tmp/drafter_chunk_*.diff`
+            (the CI workflow splits the diff deterministically before the agent runs).
+         b. For each chunk file, delegate to the drafter with the file path.
+         c. If no chunk files exist (console mode), read the diff and write it
+            to `/tmp/drafter_chunk_1.diff` yourself, then delegate.
+         d. Merge all findings arrays into a single list before proceeding.
+         e. Include any relevant learned patterns from memory in each delegation.
 
          ## CRITICAL: How to delegate to the drafter
 
@@ -114,6 +109,9 @@ agents:
          Project context (from AGENTS.md):
          <contents of AGENTS.md, or "No AGENTS.md found" if absent>
 
+         File risk scores (prioritize high-risk files):
+         <contents of /tmp/file_risk_scores.json, or "No risk scores available" if absent>
+
          File history (recent commits):
          <contents of /tmp/file_history.txt, or "No file history available" if absent>
 
@@ -122,7 +120,11 @@ agents:
 
          The drafter has `read_file` access and will read the chunk from disk. Keep the
          delegation message short — just the file path, chunk number, project context,
-         file history, and any learned patterns.
+         risk scores, file history, and any learned patterns.
+
+         If `/tmp/file_risk_scores.json` exists, read it and include the scores. The drafter
+         should spend more time on high-scoring files. If it doesn't exist (console mode),
+         skip it.
 
          If `/tmp/file_history.txt` exists, read it and include relevant entries for the
          files in the current chunk. This helps the drafter understand what kinds of bugs