feat: apply MOSS-grounded self-evolution improvements to ADAM

Implements 7 improvements grounded in MOSS paper (arXiv 2605.22794): 1. Transcript capture (§3.4): context_ring buffer in adam-observe.mjs captures last 8 events around struggle signals as context_window. 2. Evidence batching (§3.1): new adam-batch.mjs pre-clusters windowed journal entries into coherent failure batches by (signal_type, cluster_key). 3. Multi-stage analysis (§3.3): SKILL.md dispatches adam agent in two stages (diagnose+plan → implement) with inter-stage validation gate. 4. Pre-apply verification (§3.4): 4-check deterministic gate before auto-apply (source entries exist, diagnosis grounded, type-evidence match, no conflicting recent proposals). 5. Auto-rollback (§3.5): new adam-rollback.mjs reverts regressed proposals detected by A/B measurement, creates regression nudges. 6. Harness self-modification (§1 Table 1): new harness_edit proposal type targeting adam's own scripts with stricter gates (confidence≥5, never auto-apply, test-suite-gated). 7. Keypoint matrix evaluation (§4.2): 5 capability dimensions (tool_selection, scope_discipline, error_recovery, first_attempt, build_reliability) scored per batch for structured evaluation. Test suite: 94 → 114 tests (20 new), all passing.
2026-06-08 23:09:16 +00:00 · 2026-05-24 11:15:32 +01:00
parent a48c705c0a
commit 440fb52eb1
7 changed files with 1038 additions and 20 deletions
@@ -0,0 +1,184 @@
+#!/usr/bin/env node
+// adam-batch.mjs — pre-clusters windowed journal entries into coherent failure
+// batches before analyst dispatch. Implements MOSS §3.1: "anchored to an
+// automatically curated batch of production-failure evidence."
+//
+// Each batch groups entries by (signal_type, cluster_key) where cluster_key
+// follows the same clustering rules as agents/adam.md §4:
+//   correction     → tokenized phrase (cross-cwd)
+//   retry_loop     → tool
+//   weak_agent     → subagent_type
+//   tool_error_loop→ fp
+//   dead_end       → session
+//   edit_churn     → file basename
+//   build_loop     → session
+//   subagent_dispatch_pattern → subagent_type
+//   silent_drift   → active_skills[0]
+//   error_after_recovery → (recovered_from, original_fp)
+//   correction_free_streak → active_skills[0]
+//   clean_recovery → (recovered_from, active_skills[0])
+//   task_completed → sorted tool_kinds tuple
+//
+// CLI:
+//   adam-batch.mjs [--input <jsonl-path>] [--min-entries N] [--min-sessions N]
+//
+// Output: JSON object with `batches` array, `unbatched` count (entries in
+// clusters below the thresholds), and `total` (number of input entries).
+
+import { readFileSync } from "node:fs";
+import { readJsonlSafe } from "./adam-utils.mjs";
+
+// Thresholds buildBatches falls back to when the caller/CLI supplies none:
+// a cluster needs at least this many entries / distinct sessions to be emitted.
+const DEFAULT_MIN_ENTRIES = 1;
+const DEFAULT_MIN_SESSIONS = 1;
+
+// Tokens that tokenizePhrase discards before building a correction cluster key.
+const CORRECTION_STOPWORDS = new Set([
+  "the", "a", "an", "and", "or", "but", "of", "to", "for", "in", "on",
+  "with", "use", "when", "where", "what", "why", "how", "this", "that",
+  "these", "those", "is", "are", "was", "were", "be", "been", "being",
+  "do", "does", "did", "doing", "has", "have", "had", "your", "you",
+  "i", "it", "as", "at", "by", "from", "not", "no",
+]);
+
+/**
+ * Turns a correction phrase into an order-independent key.
+ *
+ * The phrase is lower-cased and split on whitespace; each token has leading and
+ * trailing characters that are neither word characters nor apostrophes removed;
+ * empty tokens and stopwords are dropped; the rest is sorted and joined by "|".
+ *
+ * @param {*} phrase - Candidate phrase; anything that is not a non-empty string yields "".
+ * @returns {string} The "|"-joined token key (possibly "").
+ */
+function tokenizePhrase(phrase) {
+  if (typeof phrase !== "string" || phrase === "") return "";
+  const kept = [];
+  for (const raw of phrase.toLowerCase().split(/\s+/)) {
+    const token = raw.replace(/^[^\w']+|[^\w']+$/g, "");
+    if (token === "" || CORRECTION_STOPWORDS.has(token)) continue;
+    kept.push(token);
+  }
+  kept.sort();
+  return kept.join("|");
+}
+
+/**
+ * Derives the per-signal cluster key used to group journal entries into batches.
+ *
+ * Which field is consulted depends on `entry.type` (see the switch). Types not
+ * listed fall back to the entry's session. Most branches return "unknown" when
+ * their field is missing; the active_skills-based branches use "" instead.
+ *
+ * @param {object} entry - Parsed journal entry.
+ * @returns {string|null} The cluster key, or null when `entry` is not an object.
+ */
+function clusterKey(entry) {
+  if (!entry || typeof entry !== "object") return null;
+  // First active skill, or "" when the list is absent or empty.
+  const firstSkill = Array.isArray(entry.active_skills) ? (entry.active_skills[0] || "") : "";
+  switch (entry.type) {
+    case "correction":
+      return tokenizePhrase(entry.phrase) || "unknown";
+    case "retry_loop":
+      return entry.tool || "unknown";
+    case "weak_agent":
+    case "subagent_dispatch_pattern":
+      return entry.subagent_type || "unknown";
+    case "tool_error_loop":
+      return entry.fp || "unknown";
+    case "dead_end":
+    case "build_loop":
+      return entry.session || "unknown";
+    case "edit_churn":
+      // Journal lines are arbitrary JSON: a non-string `file` must not throw,
+      // otherwise one malformed entry aborts the whole batch run. An empty
+      // basename (e.g. a path ending in "/") is treated as unknown too.
+      if (typeof entry.file !== "string") return "unknown";
+      return entry.file.split("/").pop() || "unknown";
+    case "silent_drift":
+    case "correction_free_streak":
+      return firstSkill;
+    case "error_after_recovery":
+      return `${entry.recovered_from || "?"}:${entry.original_fp || "?"}`;
+    case "clean_recovery":
+      return `${entry.recovered_from || "?"}:${firstSkill}`;
+    case "task_completed":
+      // Sort a copy so the caller's array is left untouched.
+      return Array.isArray(entry.tool_kinds) ? entry.tool_kinds.slice().sort().join(",") : "unknown";
+    default:
+      return entry.session || "unknown";
+  }
+}
+
+/**
+ * Parses the adam-batch command line.
+ *
+ * Recognised flags: --input <path>, --min-entries N, --min-sessions N, --help/-h.
+ * A value flag that is the last argument is ignored, as is a threshold that is
+ * NaN or not greater than zero (the default is kept). Unknown arguments are skipped.
+ *
+ * @param {string[]} argv - Arguments after the script name.
+ * @returns {{input: (string|null), minEntries: number, minSessions: number, help: boolean}}
+ */
+function parseArgs(argv) {
+  const parsed = {
+    input: null,
+    minEntries: DEFAULT_MIN_ENTRIES,
+    minSessions: DEFAULT_MIN_SESSIONS,
+    help: false,
+  };
+  // Returns the numeric value when it is a usable threshold, otherwise null.
+  const threshold = (raw) => {
+    const value = Number(raw);
+    return !Number.isNaN(value) && value > 0 ? value : null;
+  };
+  let pos = 0;
+  while (pos < argv.length) {
+    const flag = argv[pos];
+    const hasValue = pos + 1 < argv.length;
+    switch (flag) {
+      case "--input":
+        if (hasValue) parsed.input = argv[++pos];
+        break;
+      case "--min-entries":
+        if (hasValue) {
+          const value = threshold(argv[++pos]);
+          if (value !== null) parsed.minEntries = value;
+        }
+        break;
+      case "--min-sessions":
+        if (hasValue) {
+          const value = threshold(argv[++pos]);
+          if (value !== null) parsed.minSessions = value;
+        }
+        break;
+      case "--help":
+      case "-h":
+        parsed.help = true;
+        break;
+      default:
+        break;
+    }
+    pos += 1;
+  }
+  return parsed;
+}
+
+/**
+ * Groups journal entries into batches keyed by (type, cluster key).
+ *
+ * Entries that are not objects or have no `type` are ignored for grouping (they
+ * still count toward `total`). Groups below either threshold are not emitted;
+ * their entries are counted in `unbatched`. Batch ids (`b1`, `b2`, …) are handed
+ * out in first-seen order before the result is sorted by entry_count descending.
+ *
+ * @param {object[]} entries - Parsed journal entries (null/undefined treated as empty).
+ * @param {{minEntries?: number, minSessions?: number}} [opts] - Emission thresholds.
+ * @returns {{batches: object[], unbatched: number, total: number}}
+ */
+export function buildBatches(entries, opts = {}) {
+  const entryFloor = opts.minEntries || DEFAULT_MIN_ENTRIES;
+  const sessionFloor = opts.minSessions || DEFAULT_MIN_SESSIONS;
+  const input = entries || [];
+
+  // Pass 1: bucket entries, preserving the order in which keys first appear.
+  const groups = new Map();
+  for (const entry of input) {
+    if (!entry || typeof entry !== "object" || !entry.type) continue;
+    const ck = clusterKey(entry);
+    const mapKey = `${entry.type}::${ck}`;
+    let group = groups.get(mapKey);
+    if (group === undefined) {
+      group = {
+        signal_type: entry.type,
+        cluster_key: ck,
+        entries: [],
+        sessions: new Set(),
+        cwds: new Set(),
+      };
+      groups.set(mapKey, group);
+    }
+    group.entries.push(entry);
+    if (entry.session) group.sessions.add(entry.session);
+    if (entry.cwd) group.cwds.add(entry.cwd);
+  }
+
+  // Pass 2: emit the groups that clear both thresholds, tally the rest.
+  const batches = [];
+  let unbatched = 0;
+  let nextId = 1;
+  for (const group of groups.values()) {
+    const tooSmall = group.entries.length < entryFloor || group.sessions.size < sessionFloor;
+    if (tooSmall) {
+      unbatched += group.entries.length;
+      continue;
+    }
+    const hasContext = group.entries.some(
+      (item) => Array.isArray(item.context_window) && item.context_window.length > 0,
+    );
+    batches.push({
+      batch_id: `b${nextId++}`,
+      signal_type: group.signal_type,
+      cluster_key: group.cluster_key,
+      entry_count: group.entries.length,
+      session_count: group.sessions.size,
+      cwd_count: group.cwds.size,
+      has_context_window: hasContext,
+      entries: group.entries,
+    });
+  }
+
+  batches.sort((left, right) => right.entry_count - left.entry_count);
+  return { batches, unbatched, total: input.length };
+}
+
+/**
+ * CLI entry point: reads JSONL entries from --input or piped stdin, batches
+ * them with buildBatches, and prints the result as one JSON line.
+ *
+ * Exit status is 0 on success/--help and 1 on missing input or any error.
+ * The process is terminated from the write callback rather than immediately
+ * after write(): calling process.exit() right after a stream write can drop
+ * output that has not been flushed yet, and the JSON here embeds every entry.
+ */
+function main() {
+  // Writes `text`, then exits with `code` once the stream reports the write done.
+  const finish = (stream, text, code) => {
+    process.exitCode = code;
+    stream.write(text, () => process.exit(code));
+  };
+
+  const args = parseArgs(process.argv.slice(2));
+  if (args.help) {
+    finish(process.stdout, "usage: adam-batch.mjs [--input <jsonl-path>] [--min-entries N] [--min-sessions N]\n", 0);
+    return;
+  }
+  try {
+    let entries;
+    if (args.input) {
+      entries = readJsonlSafe(args.input);
+    } else if (!process.stdin.isTTY) {
+      // fd 0 is stdin; read it synchronously and parse line by line.
+      const buf = readFileSync(0, "utf8");
+      entries = [];
+      for (const line of buf.split("\n")) {
+        if (!line) continue;
+        try { entries.push(JSON.parse(line)); } catch { /* skip malformed line */ }
+      }
+    } else {
+      finish(process.stderr, "adam-batch: no input (use --input or pipe)\n", 1);
+      return;
+    }
+    const result = buildBatches(entries, { minEntries: args.minEntries, minSessions: args.minSessions });
+    finish(process.stdout, JSON.stringify(result) + "\n", 0);
+  } catch (e) {
+    finish(process.stderr, `adam-batch error: ${e.message}\n`, 1);
+  }
+}
+
+// Run main() only when this module's URL equals a file:// URL built from argv[1].
+// NOTE(review): the right-hand side is a hand-built string, so it will not
+// match when the script path contains characters that import.meta.url
+// percent-encodes (e.g. spaces) — confirm install paths never do.
+if (import.meta.url === `file://${process.argv[1]}`) {
+  main();
+}
+
+// Exported in addition to buildBatches.
+export { clusterKey, tokenizePhrase };
@@ -0,0 +1,225 @@
+#!/usr/bin/env node
+// adam-rollback.mjs — auto-reverts proposals that regressed after apply.
+//
+// Implements MOSS §3.5: "rollback is mandatory because... a candidate that
+// passes trial can still regress live."
+//
+// For each regressed proposal (detected by adam-ab-measure.mjs):
+//   1. Reads the applied proposal from applied/
+//   2. Parses the `# Rollback` section for undo commands
+//   3. Writes a copy to proposals/ as `<name>-rollback.md` with
+//      `rolled_back: true`, and renames the applied file to `<name>.md.rolled-back`
+//   4. Creates a regression nudge for next SessionStart
+//   5. Leaves ab-tracking.jsonl untouched; the proposal does not get rolled
+//      back again because the renamed file no longer matches the `*.md` lookup
+//
+// CLI:
+//   adam-rollback.mjs --proposal-id <id> [--home <path>] [--dry-run]
+//   adam-rollback.mjs --auto [--home <path>] [--dry-run]
+//
+//   --auto mode: reads ab-measure output, rolls back all regressed proposals.
+//
+// Output: JSON object with rollback results per proposal.
+// Does NOT execute the undo commands itself — outputs them for the skill to
+// execute in-context (safety: undo commands may reference files the script
+// can't safely modify).
+
+import { readFileSync, writeFileSync, renameSync, readdirSync, existsSync, mkdirSync } from "node:fs";
+import { join, basename } from "node:path";
+import { homedir } from "node:os";
+import { parseFrontmatter, readJsonlSafe, listJsonlFiles } from "./adam-utils.mjs";
+
+/**
+ * Parses the adam-rollback command line.
+ *
+ * Recognised flags: --home <path>, --proposal-id <id>, --auto, --dry-run,
+ * --help/-h. A value flag given as the last argument is ignored; unknown
+ * arguments are skipped.
+ *
+ * @param {string[]} argv - Arguments after the script name.
+ * @returns {{home: (string|null), proposalId: (string|null), auto: boolean, dryRun: boolean, help: boolean}}
+ */
+function parseArgs(argv) {
+  const parsed = { home: null, proposalId: null, auto: false, dryRun: false, help: false };
+  let pos = 0;
+  while (pos < argv.length) {
+    const flag = argv[pos];
+    const hasValue = pos + 1 < argv.length;
+    switch (flag) {
+      case "--home":
+        if (hasValue) parsed.home = argv[++pos];
+        break;
+      case "--proposal-id":
+        if (hasValue) parsed.proposalId = argv[++pos];
+        break;
+      case "--auto":
+        parsed.auto = true;
+        break;
+      case "--dry-run":
+        parsed.dryRun = true;
+        break;
+      case "--help":
+      case "-h":
+        parsed.help = true;
+        break;
+      default:
+        break;
+    }
+    pos += 1;
+  }
+  return parsed;
+}
+
+/**
+ * Locates the applied proposal file for `proposalId` inside `appliedDir`.
+ *
+ * Only `*.md` files are considered. A file whose name (without ".md") equals
+ * the id or ends with "-<id>" wins; this keeps id "x-001" from resolving to
+ * "…-x-0010.md". If no such file exists, the first file whose name merely
+ * contains the id is used (the previous behaviour). Candidates are sorted so
+ * the choice does not depend on directory listing order.
+ *
+ * @param {string} appliedDir - Directory holding applied proposals.
+ * @param {string} proposalId - Proposal id to look up.
+ * @returns {string|null} Full path of the match; null when the id is not a
+ *   non-empty string, the directory is missing/unreadable, or nothing matches.
+ */
+function findAppliedProposal(appliedDir, proposalId) {
+  // An empty id would otherwise "match" every file name.
+  if (typeof proposalId !== "string" || proposalId === "") return null;
+  if (!existsSync(appliedDir)) return null;
+  let files;
+  try {
+    files = readdirSync(appliedDir).filter(n => n.endsWith(".md")).sort();
+  } catch {
+    // Unreadable directory is treated the same as "no such proposal".
+    return null;
+  }
+  const suffix = `-${proposalId}`;
+  const precise = files.find((f) => {
+    const stem = f.slice(0, -".md".length);
+    return stem === proposalId || stem.endsWith(suffix);
+  });
+  const chosen = precise || files.find(f => f.includes(proposalId));
+  return chosen ? join(appliedDir, chosen) : null;
+}
+
+/**
+ * Returns the body of the `# Rollback` section of a proposal, or null.
+ *
+ * The section starts after the first line that is exactly `# Rollback`
+ * (trailing whitespace/CR tolerated, so a heading on the last line or in a
+ * CRLF file is still found) and ends at the next line starting with "# " or
+ * "---". Lines inside a ``` fenced block never end the section: undo scripts
+ * routinely contain `# comments`, which previously truncated the section and
+ * lost the commands.
+ *
+ * @param {string} content - Full proposal text.
+ * @returns {string|null} Trimmed section body; null when the heading is
+ *   absent, the body is empty, or `content` is not a string.
+ */
+function extractRollbackSection(content) {
+  if (typeof content !== "string") return null;
+  const lines = content.split("\n");
+  const headingAt = lines.findIndex(line => line.trimEnd() === "# Rollback");
+  if (headingAt === -1) return null;
+
+  const body = [];
+  let inFence = false;
+  for (let i = headingAt + 1; i < lines.length; i++) {
+    const line = lines[i];
+    if (line.startsWith("```")) {
+      inFence = !inFence;
+    } else if (!inFence && (line.startsWith("# ") || line.startsWith("---"))) {
+      break;
+    }
+    body.push(line);
+  }
+  return body.join("\n").trim() || null;
+}
+
+/**
+ * Collects the contents of every closed ``` fenced block in a rollback section.
+ *
+ * Each block becomes one string (its lines joined with "\n"). Empty blocks and
+ * a trailing block that is never closed contribute nothing.
+ *
+ * @param {string|null} rollbackSection - Section body, or a falsy value.
+ * @returns {string[]} One entry per non-empty fenced block, in document order.
+ */
+function extractUndoCommands(rollbackSection) {
+  const found = [];
+  if (!rollbackSection) return found;
+  // `open` holds the lines of the fence being read, or null when outside one.
+  let open = null;
+  rollbackSection.split("\n").forEach((text) => {
+    if (!text.startsWith("```")) {
+      if (open !== null) open.push(text);
+      return;
+    }
+    if (open === null) {
+      open = [];
+      return;
+    }
+    if (open.length > 0) found.push(open.join("\n"));
+    open = null;
+  });
+  return found;
+}
+
+/**
+ * Builds a rollback plan for one applied proposal without changing anything.
+ *
+ * @param {string} appliedDir - Directory holding applied proposals.
+ * @param {string} proposalId - Id of the proposal to roll back.
+ * @returns {object} `{status: "not_found", proposal_id}` when no applied file
+ *   matches; otherwise a `status: "planned"` object carrying the file path,
+ *   selected frontmatter fields, the undo commands taken from fenced blocks in
+ *   the `# Rollback` section, and whether that section exists.
+ */
+export function planRollback(appliedDir, proposalId) {
+  const appliedPath = findAppliedProposal(appliedDir, proposalId);
+  if (!appliedPath) {
+    return { status: "not_found", proposal_id: proposalId };
+  }
+
+  const text = readFileSync(appliedPath, "utf8");
+  const meta = parseFrontmatter(text);
+  const section = extractRollbackSection(text);
+  const commands = extractUndoCommands(section);
+
+  const plan = { status: "planned", proposal_id: proposalId, applied_path: appliedPath };
+  plan.type = meta.type || "unknown";
+  plan.target = meta.target || null;
+  plan.target_skill = meta.target_skill || null;
+  plan.undo_commands = commands;
+  plan.has_rollback_section = Boolean(section);
+  return plan;
+}
+
+// Inserts the rollback markers as the last keys of the leading frontmatter
+// block. Content that does not start with a `---` line is returned unchanged.
+function markRolledBack(content, isoTimestamp) {
+  if (!content.startsWith("---\n")) return content;
+  // Search from the newline that ends the opening fence so an empty
+  // frontmatter block ("---\n---") is handled as well.
+  const closeAt = content.indexOf("\n---", 3);
+  if (closeAt === -1) return content;
+  const meta = `\nrolled_back: true\nrolled_back_at: "${isoTimestamp}"`;
+  return content.slice(0, closeAt) + meta + content.slice(closeAt);
+}
+
+/**
+ * Carries out a plan produced by planRollback.
+ *
+ * Writes a copy of the applied proposal to `<adamRoot>/proposals/` as
+ * `<name>-rollback.md` with `rolled_back` markers added to its frontmatter,
+ * renames the applied file to `<name>.md.rolled-back`, and appends a
+ * `regression_rollback` nudge to `<adamRoot>/active-nudges.json`. The undo
+ * commands are only echoed back in the result; they are never executed here.
+ *
+ * Fixes over the previous version: the markers are inserted only into the
+ * frontmatter at the very start of the file (no stray blank line, no match on
+ * a later `---` rule), and a nudges file holding valid JSON that is not an
+ * array is treated like a corrupt one instead of making the nudge fail.
+ *
+ * @param {object} plan - Result of planRollback.
+ * @param {string} adamRoot - Adam state directory.
+ * @param {{dryRun?: boolean}} [opts] - With dryRun, nothing is written.
+ * @returns {object} For a non-"planned" plan, the plan plus `action: "skipped"`.
+ *   Otherwise a result whose `status` is "dry_run", "move_failed" (with
+ *   `error`), or "rolled_back"; `actions` lists what was done. A nudge failure
+ *   is recorded in `actions` and does not change the status.
+ */
+export function executeRollback(plan, adamRoot, opts = {}) {
+  const dryRun = opts.dryRun || false;
+  const proposalsDir = join(adamRoot, "proposals");
+  const nudgesPath = join(adamRoot, "active-nudges.json");
+  const now = Date.now();
+
+  if (plan.status !== "planned") return { ...plan, action: "skipped" };
+
+  const result = {
+    proposal_id: plan.proposal_id,
+    type: plan.type,
+    target: plan.target,
+    undo_commands: plan.undo_commands,
+    actions: [],
+  };
+
+  if (dryRun) {
+    result.actions.push("dry_run: would move applied → proposals");
+    if (plan.undo_commands.length) {
+      result.actions.push(`dry_run: would output ${plan.undo_commands.length} undo command(s)`);
+    }
+    result.actions.push("dry_run: would create regression nudge");
+    result.status = "dry_run";
+    return result;
+  }
+
+  mkdirSync(proposalsDir, { recursive: true });
+  const destName = `${basename(plan.applied_path).replace(/\.md$/, "")}-rollback.md`;
+  const destPath = join(proposalsDir, destName);
+
+  const content = markRolledBack(
+    readFileSync(plan.applied_path, "utf8"),
+    new Date(now).toISOString(),
+  );
+
+  try {
+    writeFileSync(destPath, content);
+    // The new suffix takes the file out of the `*.md` applied-proposal lookup.
+    renameSync(plan.applied_path, plan.applied_path + ".rolled-back");
+    result.actions.push(`moved ${plan.applied_path} → ${destPath}`);
+  } catch (e) {
+    result.status = "move_failed";
+    result.error = e.message;
+    return result;
+  }
+
+  try {
+    let nudges = [];
+    if (existsSync(nudgesPath)) {
+      try {
+        const parsed = JSON.parse(readFileSync(nudgesPath, "utf8"));
+        // Anything other than an array cannot be appended to; start fresh.
+        if (Array.isArray(parsed)) nudges = parsed;
+      } catch {
+        nudges = [];
+      }
+    }
+    nudges.push({
+      kind: "regression_rollback",
+      message: `adam: rolled back "${plan.proposal_id}" (type: ${plan.type}) — regression detected in A/B measurement. Review with /reflect.`,
+      created_at: now,
+      expires_at_ts: now + 7 * 86400000, // seven days, in milliseconds
+      max_displays: 3,
+      displays_used: 0,
+      source_proposal: plan.proposal_id,
+    });
+    writeFileSync(nudgesPath, JSON.stringify(nudges, null, 2));
+    result.actions.push("regression nudge created");
+  } catch (e) {
+    result.actions.push(`nudge failed: ${e.message}`);
+  }
+
+  result.status = "rolled_back";
+  return result;
+}
+
+/**
+ * CLI entry point for adam-rollback.
+ *
+ * With --proposal-id, plans and executes a rollback for that proposal. With
+ * --auto, loads ab-tracking.jsonl and the journal files, asks
+ * adam-ab-measure.mjs for deltas, and rolls back every proposal whose delta
+ * status is "regressed". Prints `{"rollbacks": [...]}` as one JSON line.
+ *
+ * Exit status is 0 on success/--help and 1 on a usage error or exception. The
+ * process is terminated from the write callback rather than immediately after
+ * write(), because process.exit() right after a stream write can drop output
+ * that has not been flushed yet.
+ */
+async function main() {
+  // Writes `text`, then exits with `code` once the stream reports the write done.
+  const finish = (stream, text, code) => {
+    process.exitCode = code;
+    stream.write(text, () => process.exit(code));
+  };
+
+  const args = parseArgs(process.argv.slice(2));
+  if (args.help) {
+    finish(
+      process.stdout,
+      "usage: adam-rollback.mjs --proposal-id <id> [--home <path>] [--dry-run]\n" +
+      "       adam-rollback.mjs --auto [--home <path>] [--dry-run]\n",
+      0,
+    );
+    return;
+  }
+  if (!args.auto && !args.proposalId) {
+    finish(process.stderr, "adam-rollback: specify --proposal-id or --auto\n", 1);
+    return;
+  }
+
+  const claudeHome = args.home || join(homedir(), ".claude");
+  const adamRoot = join(claudeHome, "adam");
+  const appliedDir = join(adamRoot, "applied");
+
+  try {
+    const results = [];
+
+    if (args.auto) {
+      const abPath = join(adamRoot, "ab-tracking.jsonl");
+      const entries = readJsonlSafe(abPath);
+      // Loaded lazily so the --proposal-id path does not need the module.
+      const { computeDeltas } = await import("./adam-ab-measure.mjs");
+      const sources = [join(adamRoot, "journal.jsonl"), ...listJsonlFiles(join(adamRoot, "journal"))];
+      const journalAll = [];
+      for (const p of sources) for (const e of readJsonlSafe(p)) journalAll.push(e);
+      const deltas = computeDeltas(entries, journalAll);
+      const regressed = deltas.filter(d => d.status === "regressed");
+
+      for (const d of regressed) {
+        const plan = planRollback(appliedDir, d.proposal_id);
+        results.push(executeRollback(plan, adamRoot, { dryRun: args.dryRun }));
+      }
+    } else {
+      const plan = planRollback(appliedDir, args.proposalId);
+      results.push(executeRollback(plan, adamRoot, { dryRun: args.dryRun }));
+    }
+
+    finish(process.stdout, JSON.stringify({ rollbacks: results }) + "\n", 0);
+  } catch (e) {
+    finish(process.stderr, `adam-rollback error: ${e.message}\n`, 1);
+  }
+}
+
+// Run main() only when this module's URL equals a file:// URL built from argv[1].
+// NOTE(review): the right-hand side is a hand-built string, so it will not
+// match when the script path contains characters that import.meta.url
+// percent-encodes (e.g. spaces) — confirm install paths never do.
+if (import.meta.url === `file://${process.argv[1]}`) {
+  main();
+}
@@ -16,6 +16,8 @@ SCORE="$REAL_HOME/.claude/adam/scripts/adam-score.mjs"
 ABMEASURE="$REAL_HOME/.claude/adam/scripts/adam-ab-measure.mjs"
 APPLYREIN="$REAL_HOME/.claude/adam/scripts/adam-apply-reinforcement.mjs"
 UPGRADE="$REAL_HOME/.claude/adam/scripts/adam-upgrade.mjs"
+BATCH="$REAL_HOME/.claude/adam/scripts/adam-batch.mjs"
+ROLLBACK="$REAL_HOME/.claude/adam/scripts/adam-rollback.mjs"

 TMP_HOME="$(mktemp -d -t adam-test.XXXXXX)"
 trap 'rm -rf "$TMP_HOME"' EXIT INT TERM
@@ -33,6 +35,8 @@ SCORE_RUN()   { HOME="$TMP_HOME" node "$SCORE" --home "$TMP_HOME/.claude" "$@";
 ABMEASURE_RUN(){ HOME="$TMP_HOME" node "$ABMEASURE" --home "$TMP_HOME/.claude" "$@"; }
 APPLYREIN_RUN(){ HOME="$TMP_HOME" node "$APPLYREIN" "$@" --home "$TMP_HOME/.claude"; }
 UPGRADE_RUN() { HOME="$TMP_HOME" node "$UPGRADE" "$@"; }
+BATCH_RUN()   { HOME="$TMP_HOME" node "$BATCH" "$@"; }
+ROLLBACK_RUN(){ HOME="$TMP_HOME" node "$ROLLBACK" "$@"; }

 PASS=0
 FAIL=0
@@ -1487,6 +1491,354 @@ else
  echo "  FAIL: severity_by_type.dead_end missing/wrong (got: $out)"; FAIL=$((FAIL+1))
 fi

+# ============================================================
+# MOSS-grounded tests: context_window, adam-batch, adam-rollback
+# ============================================================
+
+# --- Test 83: context_window attached to tool_error_loop struggle signal ---
+echo "Test 83: context_window attached to tool_error_loop"
+reset_state
+# Fire a user prompt first so the context ring has something.
+echo '{"hook_event_name":"UserPromptSubmit","prompt":"run the tests","session_id":"sCW1","cwd":"/tmp/x"}' \
+  | HOOK_RUN >/dev/null 2>&1 || true
+for i in 1 2 3; do
+  echo '{"hook_event_name":"PostToolUse","tool_name":"Bash","tool_input":{"command":"failing-cmd"},"tool_response":{"is_error":true,"content":"Error: command not found: failing-cmd"},"session_id":"sCW1","cwd":"/tmp/x"}' \
+    | HOOK_RUN >/dev/null 2>&1 || true
+done
+assert_grep "$ROOT/journal.jsonl" '"context_window":\[' "tool_error_loop carries context_window"
+
+# --- Test 84: context_window captures preceding user prompt ---
+echo "Test 84: context_window captures user prompt text"
+# Re-use the journal from test 83
+assert_grep "$ROOT/journal.jsonl" '"prompt":"run the tests"' "context_window includes user prompt excerpt"
+
+# --- Test 85: context_window includes tool response excerpts ---
+echo "Test 85: context_window includes tool response excerpts"
+assert_grep "$ROOT/journal.jsonl" '"response_excerpt"' "context_window entries have response_excerpt"
+
+# --- Test 86: context_window on dead_end signal ---
+echo "Test 86: context_window on dead_end signal"
+reset_state
+echo '{"hook_event_name":"UserPromptSubmit","prompt":"start working","session_id":"sCW2","cwd":"/tmp/x"}' \
+  | HOOK_RUN >/dev/null 2>&1 || true
+for i in 1 2 3 4 5 6 7 8; do
+  echo "{\"hook_event_name\":\"PostToolUse\",\"tool_name\":\"Bash\",\"tool_input\":{\"command\":\"step$i\"},\"session_id\":\"sCW2\",\"cwd\":\"/tmp/x\"}" \
+    | HOOK_RUN >/dev/null 2>&1 || true
+done
+# Check the dead_end line itself: grepping the whole journal twice would pass
+# even if the context_window belonged to some other entry.
+dead_line=$(grep '"type":"dead_end"' "$ROOT/journal.jsonl" | head -1)
+if [ -n "$dead_line" ] && echo "$dead_line" | grep -q '"context_window":\['; then
+  echo "  PASS: dead_end carries context_window"; PASS=$((PASS+1))
+else
+  echo "  FAIL: dead_end missing context_window"; FAIL=$((FAIL+1))
+fi
+
+# --- Test 87: context_window NOT on non-struggle signals ---
+echo "Test 87: context_window absent from correction_free_streak"
+reset_state
+for i in 1 2 3 4 5; do
+  echo "{\"hook_event_name\":\"UserPromptSubmit\",\"prompt\":\"step $i please\",\"session_id\":\"sCW3\",\"cwd\":\"/tmp/x\"}" \
+    | HOOK_RUN >/dev/null 2>&1 || true
+done
+# correction_free_streak should have fired; report its absence separately so a
+# missing signal is not misreported as an unexpected context_window.
+streak_line=$(grep '"type":"correction_free_streak"' "$ROOT/journal.jsonl" | head -1)
+if [ -z "$streak_line" ]; then
+  echo "  FAIL: correction_free_streak was never journaled"; FAIL=$((FAIL+1))
+elif echo "$streak_line" | grep -q '"context_window"'; then
+  echo "  FAIL: unexpected context_window on non-struggle signal"; FAIL=$((FAIL+1))
+else
+  echo "  PASS: correction_free_streak has no context_window"; PASS=$((PASS+1))
+fi
+
+# --- Test 88: adam-batch clusters same signal_type + fp into one batch ---
+echo "Test 88: adam-batch clusters same (type, fp) into one batch"
+batch_input=$(cat <<'EOF'
+{"ts":"2026-05-20T10:00:00Z","type":"tool_error_loop","session":"s1","cwd":"/a","tool":"Bash","fp":"ENOENT:abc","count":3}
+{"ts":"2026-05-21T10:00:00Z","type":"tool_error_loop","session":"s2","cwd":"/a","tool":"Bash","fp":"ENOENT:abc","count":4}
+{"ts":"2026-05-22T10:00:00Z","type":"tool_error_loop","session":"s3","cwd":"/b","tool":"Bash","fp":"ENOENT:abc","count":3}
+EOF
+)
+out=$(echo "$batch_input" | BATCH_RUN 2>/dev/null)
+batch_count=$(echo "$out" | node -e 'let b="";process.stdin.on("data",d=>b+=d).on("end",()=>{const j=JSON.parse(b);console.log(j.batches.length)})')
+entry_count=$(echo "$out" | node -e 'let b="";process.stdin.on("data",d=>b+=d).on("end",()=>{const j=JSON.parse(b);console.log(j.batches[0]?j.batches[0].entry_count:0)})')
+if [ "$batch_count" = "1" ] && [ "$entry_count" = "3" ]; then
+  echo "  PASS: 3 same-fp entries → 1 batch with entry_count=3"; PASS=$((PASS+1))
+else
+  echo "  FAIL: expected 1 batch / 3 entries (got batches=$batch_count entries=$entry_count)"; FAIL=$((FAIL+1))
+fi
+
+# --- Test 89: adam-batch creates separate batches for different signal types ---
+echo "Test 89: adam-batch separates different signal types"
+batch_input=$(cat <<'EOF'
+{"ts":"2026-05-20T10:00:00Z","type":"correction","session":"s1","cwd":"/a","phrase":"no wrong"}
+{"ts":"2026-05-21T10:00:00Z","type":"tool_error_loop","session":"s1","cwd":"/a","fp":"ENOENT:abc","count":3}
+EOF
+)
+out=$(echo "$batch_input" | BATCH_RUN 2>/dev/null)
+batch_count=$(echo "$out" | node -e 'let b="";process.stdin.on("data",d=>b+=d).on("end",()=>{const j=JSON.parse(b);console.log(j.batches.length)})')
+if [ "$batch_count" = "2" ]; then
+  echo "  PASS: 2 different types → 2 batches"; PASS=$((PASS+1))
+else
+  echo "  FAIL: expected 2 batches (got $batch_count)"; FAIL=$((FAIL+1))
+fi
+
+# --- Test 90: adam-batch reports session_count correctly ---
+echo "Test 90: adam-batch tracks session_count per batch"
+batch_input=$(cat <<'EOF'
+{"ts":"2026-05-20T10:00:00Z","type":"correction","session":"s1","cwd":"/a","phrase":"no wrong"}
+{"ts":"2026-05-21T10:00:00Z","type":"correction","session":"s2","cwd":"/a","phrase":"no wrong"}
+{"ts":"2026-05-22T10:00:00Z","type":"correction","session":"s1","cwd":"/b","phrase":"no wrong"}
+EOF
+)
+out=$(echo "$batch_input" | BATCH_RUN 2>/dev/null)
+sessions=$(echo "$out" | node -e 'let b="";process.stdin.on("data",d=>b+=d).on("end",()=>{const j=JSON.parse(b);console.log(j.batches[0]?j.batches[0].session_count:0)})')
+if [ "$sessions" = "2" ]; then
+  echo "  PASS: session_count=2 for entries from s1+s2"; PASS=$((PASS+1))
+else
+  echo "  FAIL: expected session_count=2 (got $sessions)"; FAIL=$((FAIL+1))
+fi
+
+# --- Test 91: adam-batch reports has_context_window ---
+echo "Test 91: adam-batch reports has_context_window flag"
+batch_input=$(cat <<'EOF'
+{"ts":"2026-05-20T10:00:00Z","type":"dead_end","session":"s1","cwd":"/a","count":8,"context_window":[{"event":"user","prompt":"hi","ts":"2026-05-20T09:59:00Z"}]}
+EOF
+)
+out=$(echo "$batch_input" | BATCH_RUN 2>/dev/null)
+has_cw=$(echo "$out" | node -e 'let b="";process.stdin.on("data",d=>b+=d).on("end",()=>{const j=JSON.parse(b);console.log(j.batches[0]?j.batches[0].has_context_window:"false")})')
+if [ "$has_cw" = "true" ]; then
+  echo "  PASS: has_context_window=true when entries have context_window"; PASS=$((PASS+1))
+else
+  echo "  FAIL: expected has_context_window=true (got $has_cw)"; FAIL=$((FAIL+1))
+fi
+
+# --- Test 92: adam-batch empty input → no batches ---
+echo "Test 92: adam-batch produces empty output on empty input"
+out=$(echo '' | BATCH_RUN 2>/dev/null)
+batch_count=$(echo "$out" | node -e 'let b="";process.stdin.on("data",d=>b+=d).on("end",()=>{try{const j=JSON.parse(b);console.log(j.batches.length)}catch{console.log("parse-error")}})')
+total=$(echo "$out" | node -e 'let b="";process.stdin.on("data",d=>b+=d).on("end",()=>{try{const j=JSON.parse(b);console.log(j.total)}catch{console.log("parse-error")}})')
+if [ "$batch_count" = "0" ] && [ "$total" = "0" ]; then
+  echo "  PASS: empty input → 0 batches, total=0"; PASS=$((PASS+1))
+else
+  echo "  FAIL: expected 0 batches (got batches=$batch_count total=$total)"; FAIL=$((FAIL+1))
+fi
+
+# --- Test 93: adam-rollback --proposal-id moves applied proposal to proposals ---
+echo "Test 93: adam-rollback moves applied proposal to proposals/"
+reset_state
+rm -f "$ROOT/proposals/"*rollback* "$ROOT/active-nudges.json"
+cat > "$ROOT/applied/2026-05-20T00-00-00Z-rb-test-001.md" <<'EOF'
+---
+id: rb-test-001
+type: skill_new
+target: ~/.claude/skills/test-skill/SKILL.md
+confidence: 5
+blast_radius: low
+auto_apply_eligible: true
+status: applied
+source_entries:
+  - "2026-05-18T10:00:00Z"
+---
+# Why
+test rollback
+
+# Rollback
+```bash
+rm -rf ~/.claude/skills/test-skill/
+```
+EOF
+out=$(ROLLBACK_RUN --proposal-id rb-test-001 --home "$TMP_HOME/.claude" 2>/dev/null)
+if echo "$out" | grep -q '"status":"rolled_back"'; then
+  rb_ok=1
+else
+  rb_ok=0
+fi
+# Verify proposal moved to proposals/
+if ls "$ROOT/proposals/"*rb-test-001* >/dev/null 2>&1; then
+  moved_ok=1
+else
+  moved_ok=0
+fi
+# Verify original file renamed
+if [ -f "$ROOT/applied/2026-05-20T00-00-00Z-rb-test-001.md.rolled-back" ]; then
+  renamed_ok=1
+else
+  renamed_ok=0
+fi
+if [ "$rb_ok" = "1" ] && [ "$moved_ok" = "1" ] && [ "$renamed_ok" = "1" ]; then
+  echo "  PASS: rollback moved proposal and renamed applied file"; PASS=$((PASS+1))
+else
+  echo "  FAIL: rollback incomplete (status=$rb_ok moved=$moved_ok renamed=$renamed_ok out=$out)"; FAIL=$((FAIL+1))
+fi
+
+# --- Test 94: adam-rollback creates regression nudge ---
+echo "Test 94: adam-rollback creates regression nudge in active-nudges.json"
+if [ -f "$ROOT/active-nudges.json" ]; then
+  nudge_kind=$(node -e "const j=JSON.parse(require('fs').readFileSync('$ROOT/active-nudges.json','utf8'));console.log((j[0]||{}).kind||'')")
+  if [ "$nudge_kind" = "regression_rollback" ]; then
+    echo "  PASS: regression nudge created with kind=regression_rollback"; PASS=$((PASS+1))
+  else
+    echo "  FAIL: nudge kind wrong (got $nudge_kind)"; FAIL=$((FAIL+1))
+  fi
+else
+  echo "  FAIL: active-nudges.json not created"; FAIL=$((FAIL+1))
+fi
+rm -f "$ROOT/proposals/"*rb-test* "$ROOT/applied/"*rb-test* "$ROOT/active-nudges.json"
+
+# --- Test 95: adam-rollback rolled_back field in proposal frontmatter ---
+echo "Test 95: rolled-back proposal has rolled_back: true in frontmatter"
+reset_state
+rm -f "$ROOT/proposals/"*rollback* "$ROOT/active-nudges.json"
+cat > "$ROOT/applied/2026-05-20T00-00-00Z-rb-test-002.md" <<'EOF'
+---
+id: rb-test-002
+type: memory
+target: ~/.claude/projects/-Users-nvm/memory/test.md
+confidence: 4
+blast_radius: low
+---
+# Why
+test
+# Rollback
+delete the memory file
+EOF
+ROLLBACK_RUN --proposal-id rb-test-002 --home "$TMP_HOME/.claude" >/dev/null 2>&1 || true
+rb_file=$(ls "$ROOT/proposals/"*rb-test-002* 2>/dev/null | head -1)
+if [ -n "$rb_file" ] && grep -q 'rolled_back: true' "$rb_file"; then
+  echo "  PASS: rolled-back proposal has rolled_back: true"; PASS=$((PASS+1))
+else
+  echo "  FAIL: rolled_back marker missing (file=$rb_file)"; FAIL=$((FAIL+1))
+fi
+rm -f "$ROOT/proposals/"*rb-test* "$ROOT/applied/"*rb-test* "$ROOT/active-nudges.json"
+
+# --- Test 96: adam-rollback not_found on missing proposal ---
+echo "Test 96: adam-rollback returns not_found for missing proposal"
+reset_state
+out=$(ROLLBACK_RUN --proposal-id nonexistent-999 --home "$TMP_HOME/.claude" 2>/dev/null)
+if echo "$out" | grep -q '"status":"not_found"'; then
+  echo "  PASS: not_found status for missing proposal"; PASS=$((PASS+1))
+else
+  echo "  FAIL: expected not_found (got: $out)"; FAIL=$((FAIL+1))
+fi
+
+# --- Test 97: adam-rollback --dry-run does not move files ---
+echo "Test 97: adam-rollback --dry-run leaves files in place"
+reset_state
+rm -f "$ROOT/proposals/"*rollback* "$ROOT/active-nudges.json"
+cat > "$ROOT/applied/2026-05-20T00-00-00Z-rb-dry-001.md" <<'EOF'
+---
+id: rb-dry-001
+type: skill_edit
+target: ~/.claude/skills/foo/SKILL.md
+confidence: 4
+---
+# Why
+test dry run
+# Rollback
+revert edit
+EOF
+out=$(ROLLBACK_RUN --proposal-id rb-dry-001 --dry-run --home "$TMP_HOME/.claude" 2>/dev/null)
+if echo "$out" | grep -q '"status":"dry_run"' && [ -f "$ROOT/applied/2026-05-20T00-00-00Z-rb-dry-001.md" ]; then
+  echo "  PASS: dry-run did not move files"; PASS=$((PASS+1))
+else
+  echo "  FAIL: dry-run moved files or wrong status (out=$out)"; FAIL=$((FAIL+1))
+fi
+rm -f "$ROOT/applied/2026-05-20T00-00-00Z-rb-dry-001.md"
+
+# --- Test 98: context_window on edit_churn signal ---
+echo "Test 98: context_window on edit_churn signal"
+reset_state
+echo '{"hook_event_name":"UserPromptSubmit","prompt":"fix the tests","session_id":"sCW4","cwd":"/tmp/x"}' \
+  | HOOK_RUN >/dev/null 2>&1 || true
+for i in 1 2 3 4; do
+  echo '{"hook_event_name":"PostToolUse","tool_name":"Edit","tool_input":{"file_path":"/tmp/churn.py"},"session_id":"sCW4","cwd":"/tmp/x"}' \
+    | HOOK_RUN >/dev/null 2>&1 || true
+done
+churn_line=$(grep '"type":"edit_churn"' "$ROOT/journal.jsonl" | head -1)
+if [ -n "$churn_line" ] && echo "$churn_line" | grep -q '"context_window"'; then
+  echo "  PASS: edit_churn carries context_window"; PASS=$((PASS+1))
+else
+  echo "  FAIL: edit_churn missing context_window"; FAIL=$((FAIL+1))
+fi
+
+# --- Test 99: context_window on build_loop signal ---
+echo "Test 99: context_window on build_loop signal"
+reset_state
+echo '{"hook_event_name":"UserPromptSubmit","prompt":"run the build","session_id":"sCW5","cwd":"/tmp/x"}' \
+  | HOOK_RUN >/dev/null 2>&1 || true
+for i in 1 2; do
+  echo '{"hook_event_name":"PostToolUse","tool_name":"Bash","tool_input":{"command":"npm run build"},"tool_response":{"is_error":true,"content":"Build failed: TypeError"},"session_id":"sCW5","cwd":"/tmp/x"}' \
+    | HOOK_RUN >/dev/null 2>&1 || true
+done
+build_line=$(grep '"type":"build_loop"' "$ROOT/journal.jsonl" | head -1)
+if [ -n "$build_line" ] && echo "$build_line" | grep -q '"context_window"'; then
+  echo "  PASS: build_loop carries context_window"; PASS=$((PASS+1))
+else
+  echo "  FAIL: build_loop missing context_window"; FAIL=$((FAIL+1))
+fi
+
+# --- Test 100: adam-batch --min-entries filter ---
+echo "Test 100: adam-batch --min-entries filters small batches"
+batch_input=$(cat <<'EOF'
+{"ts":"2026-05-20T10:00:00Z","type":"correction","session":"s1","cwd":"/a","phrase":"no wrong"}
+{"ts":"2026-05-21T10:00:00Z","type":"tool_error_loop","session":"s1","cwd":"/a","fp":"ENOENT:abc","count":3}
+{"ts":"2026-05-22T10:00:00Z","type":"tool_error_loop","session":"s2","cwd":"/a","fp":"ENOENT:abc","count":4}
+{"ts":"2026-05-23T10:00:00Z","type":"tool_error_loop","session":"s3","cwd":"/a","fp":"ENOENT:abc","count":5}
+EOF
+)
+out=$(echo "$batch_input" | BATCH_RUN --min-entries 3 2>/dev/null)
+batch_count=$(echo "$out" | node -e 'let b="";process.stdin.on("data",d=>b+=d).on("end",()=>{const j=JSON.parse(b);console.log(j.batches.length)})')
+unbatched=$(echo "$out" | node -e 'let b="";process.stdin.on("data",d=>b+=d).on("end",()=>{const j=JSON.parse(b);console.log(j.unbatched)})')
+if [ "$batch_count" = "1" ] && [ "$unbatched" = "1" ]; then
+  echo "  PASS: --min-entries=3 keeps 1 batch (3 entries), drops 1 singleton"; PASS=$((PASS+1))
+else
+  echo "  FAIL: expected 1 batch + 1 unbatched (got batches=$batch_count unbatched=$unbatched)"; FAIL=$((FAIL+1))
+fi
+
+# --- Test 101: adam-rollback extracts undo commands from Rollback section ---
+echo "Test 101: adam-rollback extracts undo commands from code blocks"
+reset_state
+rm -f "$ROOT/proposals/"*rollback* "$ROOT/active-nudges.json"
+cat > "$ROOT/applied/2026-05-20T00-00-00Z-rb-undo-001.md" <<'HEREDOC'
+---
+id: rb-undo-001
+type: skill_new
+target: ~/.claude/skills/test-undo/SKILL.md
+confidence: 5
+blast_radius: low
+---
+# Why
+test
+# Rollback
+```bash
+rm -rf ~/.claude/skills/test-undo/
+```
+HEREDOC
+out=$(ROLLBACK_RUN --proposal-id rb-undo-001 --home "$TMP_HOME/.claude" 2>/dev/null)
+undo_count=$(echo "$out" | node -e 'let b="";process.stdin.on("data",d=>b+=d).on("end",()=>{try{const j=JSON.parse(b);console.log((j.rollbacks[0]||{}).undo_commands?j.rollbacks[0].undo_commands.length:0)}catch{console.log("err")}})')
+if [ "$undo_count" = "1" ]; then
+  echo "  PASS: extracted 1 undo command from Rollback section"; PASS=$((PASS+1))
+else
+  echo "  FAIL: expected 1 undo command (got $undo_count, out=$out)"; FAIL=$((FAIL+1))
+fi
+rm -f "$ROOT/proposals/"*rb-undo* "$ROOT/applied/"*rb-undo* "$ROOT/active-nudges.json"
+
+# --- Test 102: context_ring size bounded at 8 ---
+echo "Test 102: context_ring bounded at CONTEXT_RING_SIZE=8"
+reset_state
+# Fire 12 PostToolUse events, then a struggle signal
+for i in $(seq 1 12); do
+  echo "{\"hook_event_name\":\"PostToolUse\",\"tool_name\":\"Read\",\"tool_input\":{\"file_path\":\"/tmp/f-$i\"},\"tool_response\":{\"content\":\"ok\"},\"session_id\":\"sCR\",\"cwd\":\"/tmp/x\"}" \
+    | HOOK_RUN >/dev/null 2>&1 || true
+done
+# Next 3 errors to trigger tool_error_loop with context_window
+for i in 1 2 3; do
+  echo '{"hook_event_name":"PostToolUse","tool_name":"Bash","tool_input":{"command":"fail"},"tool_response":{"is_error":true,"content":"Error: fail"},"session_id":"sCR","cwd":"/tmp/x"}' \
+    | HOOK_RUN >/dev/null 2>&1 || true
+done
+cw_len=$(grep '"type":"tool_error_loop"' "$ROOT/journal.jsonl" | head -1 | node -e 'let b="";process.stdin.on("data",d=>b+=d).on("end",()=>{try{const j=JSON.parse(b);console.log(j.context_window?j.context_window.length:0)}catch{console.log("err")}})')
+if [ "$cw_len" = "8" ]; then
+  echo "  PASS: context_window capped at 8 entries"; PASS=$((PASS+1))
+else
+  echo "  FAIL: expected 8 context_window entries (got $cw_len)"; FAIL=$((FAIL+1))
+fi
+
 echo
 echo "Results: $PASS passed, $FAIL failed"
 [ "$FAIL" = "0" ]
@@ -8,6 +8,71 @@ tools: Read, Write, Edit, Grep, Glob, Bash

 You analyse Claude Code's own behaviour to propose targeted, surgical improvements. You operate offline (no LLM round-trips outside this run) and produce **files**, not actions. Main-thread Claude reviews and applies changes with the user.

+## Stage mode
+
+The skill dispatches you in one of two stages (MOSS-inspired multi-stage pipeline — §3.3: "a single prompt asked to diagnose, plan, implement, verify, and decide overloads context and produces lower-quality output than a sequenced flow"):
+
+- **`stage=diagnose`**: Read batched journal entries, cluster, diagnose root causes, plan fix types. Output diagnoses JSON to `/tmp/adam-diagnoses.json`. Do NOT draft proposals.
+- **`stage=implement`**: Read approved diagnoses from `/tmp/adam-diagnoses.json`. Draft full proposal files to `proposals_dir/`. Emit the clustering trace and punch list.
+
+If no `stage` is specified in the dispatch prompt, run **both stages sequentially** within a single pass (backward-compatible with pre-MOSS flow).
+
+### Diagnose-stage output format
+
+When `stage=diagnose`, write `/tmp/adam-diagnoses.json` containing:
+
+```json
+{
+  "diagnoses": [
+    {
+      "cluster_id": "c1",
+      "signal_type": "correction",
+      "cluster_key": "wrong|approach",
+      "count": 5,
+      "sessions": 3,
+      "diagnosis": {
+        "trigger": "...",
+        "action": "...",
+        "mismatch": "...",
+        "outcome": "... `verbatim quote` ..."
+      },
+      "plan": {
+        "type": "memory",
+        "target": "~/.claude/projects/-Users-nvm/memory/go-test-cache.md",
+        "scope": "add feedback memory about go test -count=1"
+      },
+      "keypoints": {
+        "tool_selection": 1,
+        "scope_discipline": 2,
+        "error_recovery": 0,
+        "first_attempt": 0,
+        "build_reliability": 1
+      },
+      "gates": {
+        "threshold": "pass",
+        "cross_session": "pass",
+        "window": "in:5/out:0",
+        "contradiction": "none"
+      },
+      "source_entries": ["2026-05-20T10:00:00Z", "2026-05-21T11:00:00Z"],
+      "context_evidence": ["... excerpts from context_window ..."]
+    }
+  ],
+  "skipped": [
+    {"cluster_id": "c3", "signal_type": "retry_loop", "reason": "threshold", "count": 2}
+  ],
+  "summary": "considered=4 diagnosed=2 skipped=2"
+}
+```
+
+The skill validates diagnoses between stages (see SKILL.md §2 "Inter-stage validation").
+
+## Context window evidence
+
+Journal entries for struggle signals now carry a `context_window` field — an array of up to 8 of the most recent events (user prompts and tool calls with truncated input/response excerpts) leading up to and including the friction point. This is the ADAM equivalent of MOSS's "original transcript captured by auto-scan at evidence time" (§3.4).
+
+When drafting diagnoses, **prefer `context_window` evidence over transcript file lookups** when it is present. The `context_window` is already scoped to the friction point and more reliable than file-based transcript pulls. Fall back to `transcripts_root` only when `context_window` is absent (pre-upgrade entries).
+
 ## Karpathy constraints (mandatory)

 You MUST obey these on every proposal:
@@ -325,10 +390,29 @@ After ≥7 days, `~/.claude/adam/scripts/adam-ab-measure.mjs` reads each entry a

 The `/reflect` skill runs `adam-ab-measure.mjs --format json` before dispatching this agent, filters to `status == "regressed"`, and passes the list as `ab_regressions` (each object has `proposal_id`, `target_skill`, `proposal_type`, `delta_pct`, `pre_count`, `post_count`).

-**When `ab_regressions` is non-empty, you MUST emit a `## Regressions` section at the TOP of your output (above the proposals listing).** One bullet per regressed proposal listing `proposal_id`, `target_skill`, `delta_pct`, plus the short suggestion `consider revert via /reflect --revert <proposal_id>` (the revert mechanism itself is out of scope for this release — the message stands as a hint).
+**When `ab_regressions` is non-empty, you MUST emit a `## Regressions` section at the TOP of your output (above the proposals listing).** One bullet per regressed proposal listing `proposal_id`, `target_skill`, `delta_pct`. The skill auto-rolls back regressed proposals via `adam-rollback.mjs` before dispatching you — this section is your record of what was rolled back and why.

 The clustering trace summary (see §"Clustering trace") adds an extra `regressions=<N>` key alongside `considered/emitted/skipped`. When no `ab_regressions` arrive (or list is empty), emit `regressions=0`.

+## Keypoint matrix (MOSS §3.3/§4.2)
+
+When running in `stage=diagnose`, you MUST produce a **keypoint matrix** alongside each batch diagnosis. This structured evaluation replaces ad-hoc confidence with per-capability scoring.
+
+Capability dimensions (score each 0–2 per batch: 0=no signal, 1=partial, 2=strong evidence):
+
+| dimension | description | positive signals | negative signals |
+|---|---|---|---|
+| `tool_selection` | correct tool chosen first try | low `retry_loop` | high `retry_loop`, `weak_agent` |
+| `scope_discipline` | stays within requested scope | low `edit_churn`, low `dead_end` | high `edit_churn`, `dead_end`, `silent_drift` |
+| `error_recovery` | recovers from errors without user help | `clean_recovery` | `error_after_recovery`, `tool_error_loop` |
+| `first_attempt` | succeeds without corrections | `correction_free_streak` | `correction` |
+| `build_reliability` | builds/tests pass on first try | `task_completed` with build tools | `build_loop` |
+
+The matrix goes into the diagnosis output as `keypoints: {tool_selection: N, scope_discipline: N, ...}`. The implement stage uses it to:
+1. Prioritize proposals targeting the weakest dimensions.
+2. Include `keypoint_target: "<dimension>"` in proposal frontmatter.
+3. Feed dimension-trend tracking across `/reflect` runs (the skill, not this agent, appends the aggregate scores to `~/.claude/adam/keypoint-history.jsonl`).
+
 ## Confidence rubric (deterministic — do NOT vibe)

 Sum:
@@ -364,6 +448,7 @@ Sum:
 | `agent_edit` | existing agent file | medium | no |
 | `claude_md_edit` | `~/.claude/CLAUDE.md` | high | no |
 | `hook_new` / `hook_edit` | `settings.json` hooks | high | no |
+| `harness_edit` | adam's own scripts/agent/hooks (see "`harness_edit` proposals") | high | **never** |
 | `deletion` | any skill/agent (soft delete) | high | no |

 ### `nudge` proposals
@@ -392,6 +477,42 @@ A `reinforcement` proposal is logged when `adam-score.mjs` reports `count >= 3`

 Note that `task_completed` alone — without an adjacent negative signal cluster — is NOT a proposal source. It is an urgency *modifier* (see "Scoring: task_completed dampener") and a reinforcement input only.

+### `harness_edit` proposals (MOSS §1 Table 1)
+
+MOSS's core thesis: "routing, hook ordering, state invariants, and dispatch live in code rather than in any text artifact, an entire class of structural failure is physically unreachable from the text layer." This proposal type extends ADAM's evolution scope to its own harness.
+
+**Allowed targets** (harness files that ADAM may propose edits to):
+
+| target | what it controls |
+|---|---|
+| `~/.claude/adam/scripts/adam-observe.mjs` | signal detection regexes, thresholds, counters |
+| `~/.claude/adam/scripts/adam-score.mjs` | severity divisors, dampener thresholds |
+| `~/.claude/adam/scripts/adam-window.mjs` | per-signal sliding window durations |
+| `~/.claude/adam/scripts/adam-batch.mjs` | evidence batching logic |
+| `~/.claude/agents/adam.md` | this agent's own rubric, clustering, proposal rules |
+| `~/.claude/hooks/adam-observe.mjs` | hook integration, event routing |
+
+**Gates (all must hold — stricter than any other type):**
+
+1. `confidence ≥ 5`
+2. `cross_session_evidence == true` (≥5 occurrences across ≥3 sessions)
+3. `auto_apply_eligible: false` — **always**. Harness edits are never auto-applied.
+4. `blast_radius: high`
+5. Proposal includes a `# Test verification` section with the command `bash ~/.claude/adam/tests/run-tests.sh` and the expected result "<N> passed, 0 failed", where `<N>` is the suite's current pass count (e.g. "114 passed, 0 failed" as of this change). The skill runs this test before applying.
+6. Change is surgical: ≤30 LOC diff, single file.
+7. `# Diagnosis` reconstructs the causal chain from harness-level behavior (not from text-artifact behavior). The mismatch must name a specific code path (function, regex, threshold) in the target file.
+
+**When to propose `harness_edit`:**
+- Signal detection misses a recurring friction pattern (false negative in adam-observe.mjs)
+- A/B measurement shows systematic bias (e.g., windows too short/long in adam-window.mjs)
+- Scoring thresholds produce consistently over/under-weighted proposals (adam-score.mjs)
+- Batch clustering produces too-coarse or too-fine groupings (adam-batch.mjs)
+
+**When NOT to propose `harness_edit`:**
+- The fix is achievable via a text-mutable type (skill, memory, nudge)
+- Evidence is from a single session only
+- The change would affect test outcomes without clear improvement evidence
+
 ## Special handling

 ### CLAUDE.md edits
@@ -418,7 +539,7 @@ Filename: `proposals_dir/YYYY-MM-DD-NNN-<type>-<slug>.md` (NNN is daily counter
 ```markdown
 ---
 id: YYYY-MM-DD-NNN
-type: skill_new | memory | skill_edit | nudge | reinforcement | agent_new | agent_edit | claude_md_edit | hook_new | hook_edit | deletion
+type: skill_new | memory | skill_edit | nudge | reinforcement | agent_new | agent_edit | claude_md_edit | hook_new | hook_edit | harness_edit | deletion
 target: <absolute path — for skill_new, the will-be path: ~/.claude/skills/<slug>/SKILL.md>
 confidence: <int>
 blast_radius: low | medium | high
@@ -444,6 +565,11 @@ bytes_after: <int>
 contradiction_flag: "<one-line summary or null>"
 # optional — auto-populated from Diagnosis Mismatch line
 diagnosis_summary: "<≤120 chars, single sentence>"
+# keypoint matrix — which capability dimension this proposal targets (MOSS §4.2)
+keypoint_target: "<tool_selection | scope_discipline | error_recovery | first_attempt | build_reliability>"
+# harness_edit only — test command and expected output
+test_command: "bash ~/.claude/adam/tests/run-tests.sh"
+test_expected: "<N> passed, 0 failed"
 ---

 # Why
@@ -482,7 +608,7 @@ Print a single JSON line to stdout:
 ## What you must NOT do

 - Do not call other agents.
- Do not write to `~/.claude/skills/`, `~/.claude/agents/`, `settings.json`, `CLAUDE.md`, or any existing skill/agent file directly. All changes go through proposal files for main-thread review and apply.
+- Do not write to `~/.claude/skills/`, `~/.claude/agents/`, `settings.json`, `CLAUDE.md`, adam scripts, or any existing skill/agent/harness file directly. All changes go through proposal files for main-thread review and apply. This includes `harness_edit` proposals — you draft the diff, the skill applies it after test verification.
 - Do not delete files. Deletion proposals describe a soft-move; the main thread executes it.
 - Do not write outside `proposals_dir/` and `state_path`.
 - Do not invent trigger phrases for `skill_new` — every trigger must come from observed user input.
@@ -107,10 +107,15 @@ const CLEAN_RECOVERY_WINDOW = 3;
 const SILENT_DRIFT_THRESHOLD = 5;
 const ERROR_AFTER_RECOVERY_WINDOW = 5;
 const RECENT_RECOVERIES_MAX = 3;
-const STRUGGLE_TYPES = new Set(["tool_error_loop", "dead_end", "retry_loop"]);
+const STRUGGLE_TYPES = new Set([ // journal entry types treated as struggle signals; emit() attaches a context_window snapshot to entries of these types
+  "tool_error_loop", "dead_end", "retry_loop", "weak_agent",
+  "edit_churn", "build_loop", "silent_drift", "error_after_recovery",
+]);
 const ACTIVE_SKILLS_LOOKBACK = 10;
 const TASK_TOOL_MIN = 5;
 const TASK_DIVERSITY_MIN = 3;
+const CONTEXT_RING_SIZE = 8; // max entries kept in state.context_ring; pushContext evicts the oldest entry beyond this
+const CONTEXT_EXCERPT_LEN = 200; // max length handed to excerpt() for prompt, tool-input and tool-response text
 const STATE_MAX_BYTES = 1_000_000;

 function safeRead(path, fallback) {
@@ -226,6 +231,20 @@ function activeNames(state, kind) {
  return [...seen];
 }

+function excerpt(text, len) { // Truncate `text` to at most `len` UTF-16 code units, appending "…" only when something was cut.
+  if (!text || typeof text !== "string") return null; // empty string, null/undefined and non-strings all yield null
+  return text.length > len ? text.slice(0, len) + "…" : text; // short-enough input is returned unchanged
+}
+
+function pushContext(state, entry) { // Append `entry` to the bounded state.context_ring buffer (mutates `state`).
+  state.context_ring.push(entry); // assumes context_ring is already an array; ensureStateDefaults/resetSessionLocal set it to [] — confirm they run before this
+  if (state.context_ring.length > CONTEXT_RING_SIZE) state.context_ring.shift(); // over capacity: drop the oldest entry
+}
+
+function snapshotContext(state) { // Return a shallow copy of the current context ring, or undefined when the ring is empty.
+  return state.context_ring.length ? state.context_ring.slice() : undefined; // copy so later push/shift calls don't alter the snapshot; undefined presumably keeps the key out of the serialized entry — confirm in appendJournal
+}
+
 function errorFingerprint(toolResponse) {
  if (!toolResponse) return null;
  let text = "";
@@ -290,6 +309,7 @@ function resetSessionLocal(state) {
  state.recentRecoveries = [];
  state.session_post_count = 0;
  state.tool_window = [];
+  state.context_ring = [];
  state.task_tool_kinds = {};
  state.task_tool_count = 0;
  state.task_corrections = 0;
@@ -316,6 +336,7 @@ function ensureStateDefaults(state) {
  if (typeof state.silentDriftEmitted !== "boolean") state.silentDriftEmitted = false;
  if (!Array.isArray(state.recentRecoveries)) state.recentRecoveries = [];
  if (typeof state.session_post_count !== "number") state.session_post_count = 0;
+  if (!Array.isArray(state.context_ring)) state.context_ring = [];
 }

 function main() {
@@ -340,6 +361,7 @@ function main() {

  if (event === "UserPromptSubmit") {
    const prompt = (input.prompt || "").slice(0, 200);
+    pushContext(state, { event: "user", prompt: excerpt(prompt, CONTEXT_EXCERPT_LEN), ts });
    if (isCorrection(prompt)) {
      const last = state.tool_window[state.tool_window.length - 1] || {};
      appendJournal({
@@ -406,9 +428,27 @@ function main() {
    const argsHash = djb2(JSON.stringify(input.tool_input || {}));
    const file = (input.tool_input && (input.tool_input.file_path || input.tool_input.path)) || null;

+    const toolResponse = input.tool_response;
+    const respExcerpt = (() => {
+      if (!toolResponse) return null;
+      const text = typeof toolResponse === "string" ? toolResponse
+        : typeof toolResponse.content === "string" ? toolResponse.content
+        : null;
+      return excerpt(text, CONTEXT_EXCERPT_LEN);
+    })();
+    pushContext(state, {
+      event: "tool", tool, ts,
+      input_excerpt: excerpt(JSON.stringify(input.tool_input || {}), CONTEXT_EXCERPT_LEN),
+      response_excerpt: respExcerpt,
+      is_error: !!(toolResponse && toolResponse.is_error),
+    });
+
    let struggleEmittedThisTurn = null;
    const emit = (entry) => {
-      if (STRUGGLE_TYPES.has(entry.type)) struggleEmittedThisTurn = entry.type;
+      if (STRUGGLE_TYPES.has(entry.type)) {
+        entry.context_window = snapshotContext(state);
+        struggleEmittedThisTurn = entry.type;
+      }
      appendJournal(entry);
    };

@@ -126,7 +126,7 @@ copy_file "$SRC/adam/scripts/adam-archive.mjs"                       "$DEST/adam
 copy_file "$SRC/adam/scripts/adam-upgrade.mjs"                       "$DEST/adam/scripts/adam-upgrade.mjs"
 # v0.3.3 helper scripts — invoked from SKILL.md / hooks / analyst flow
 for _adam_script in adam-utils adam-window adam-explain adam-nudge-eligibility adam-cooldown \
-                    adam-score adam-ab-measure adam-apply-reinforcement; do
+                    adam-score adam-ab-measure adam-apply-reinforcement adam-batch adam-rollback; do
  copy_file "$SRC/adam/scripts/${_adam_script}.mjs" \
            "$DEST/adam/scripts/${_adam_script}.mjs"
  run "chmod +x \"$DEST/adam/scripts/${_adam_script}.mjs\""
@@ -65,32 +65,86 @@ Filter to `status == "regressed"` before passing to the analyst as
 effectiveness") to surface a `## Regressions` section at the top of its output
 when this list is non-empty. If the script fails: log stderr, pass `[]`.

-### 2. Dispatch the analyst
+**Auto-rollback** (MOSS §3.5): if any entries have `status == "regressed"`, run the rollback script to auto-revert them before analyst dispatch:

-Use the Agent tool with `subagent_type: "adam"` and prompt:
+```bash
+node ~/.claude/adam/scripts/adam-rollback.mjs --auto --home ~/.claude > /tmp/adam-rollback-results.json 2> /tmp/adam-rollback.log
+```
+
+For each rolled-back proposal, print to user: `adam: rolled back "<proposal_id>" — regression detected (delta: <delta_pct>%)`. The rollback script moves the proposal from `applied/` back to `proposals/` with `rolled_back: true` and creates a regression nudge. If the script fails: log stderr, continue (rollback is best-effort).
+
+**Evidence batching** (MOSS §3.1): pre-cluster the windowed journal into coherent failure batches:
+
+```bash
+node ~/.claude/adam/scripts/adam-batch.mjs --input /tmp/adam-windowed-journal.jsonl > /tmp/adam-batches.json 2> /tmp/adam-batch.log
+```
+
+This groups entries by (signal_type, cluster_key) and reports per-batch metadata including `has_context_window` (whether transcript evidence is attached). If the script fails: log stderr, pass `null` to the analyst (graceful degradation — analyst falls back to raw journal clustering).
+
+### 2. Dispatch the analyst (two-stage pipeline)
+
+MOSS §3.3: "A single prompt asked to diagnose, plan, implement, verify, and decide overloads context and produces lower-quality output than a sequenced flow." The analyst is dispatched in two stages with a validation gate between them.
+
+**Stage 1 — Diagnose + Plan**: Use the Agent tool with `subagent_type: "adam"` and prompt:

 ```
-Run a single analysis pass.
+stage=diagnose
+
+Read the batched journal entries, cluster by signal type, diagnose root causes,
+plan fix types, and score the keypoint matrix. Write diagnoses to /tmp/adam-diagnoses.json.
+Do NOT draft proposal files.

 Inputs:
- windowed_journal_path: /tmp/adam-windowed-journal.jsonl  # pre-filtered by adam-window.mjs
- scores_path: /tmp/adam-scores.json                       # per-session dampeners + reinforcement candidates
- ab_regressions_path: /tmp/adam-ab-regressions.json       # A/B deltas for prior auto-applied proposals
+- windowed_journal_path: /tmp/adam-windowed-journal.jsonl
+- batches_path: /tmp/adam-batches.json                     # pre-clustered evidence batches
+- scores_path: /tmp/adam-scores.json
+- ab_regressions_path: /tmp/adam-ab-regressions.json
 - journal_path: ~/.claude/adam/journal.jsonl               # raw — fallback only
 - state_path: ~/.claude/adam/state.json
 - usage_path: ~/.claude/adam/usage.json
+- applied_dir: ~/.claude/adam/applied/
+- rejected_dir: ~/.claude/adam/rejected/
+- transcripts_root: ~/.claude/projects/
+- skills_root: ~/.claude/skills/
+
+Use batches_path for pre-clustered evidence when available. Prefer context_window
+fields in journal entries over transcript file lookups. Write /tmp/adam-diagnoses.json
+per the "Diagnose-stage output format" in your system prompt.
+```
+
+Wait for return.
+
+**Inter-stage validation** (§2a): after stage 1 returns, read `/tmp/adam-diagnoses.json` and validate each diagnosis:
+
+1. Every `source_entries` timestamp exists in the windowed journal (read `/tmp/adam-windowed-journal.jsonl`, check timestamps match).
+2. Every diagnosis has all four fields (`trigger`, `action`, `mismatch`, `outcome`).
+3. The planned `type` is a valid proposal type.
+4. Remove diagnoses that fail validation — log a one-line warning per removal.
+
+If all diagnoses are removed or the file is missing/empty, print "adam: no valid diagnoses — nothing to implement" and skip to §6.
+
+**Stage 2 — Implement**: Use the Agent tool with `subagent_type: "adam"` and prompt:
+
+```
+stage=implement
+
+Read the validated diagnoses and draft full proposal files.
+
+Inputs:
+- diagnoses_path: /tmp/adam-diagnoses.json                 # validated stage-1 output
+- windowed_journal_path: /tmp/adam-windowed-journal.jsonl
+- scores_path: /tmp/adam-scores.json
+- ab_regressions_path: /tmp/adam-ab-regressions.json
+- state_path: ~/.claude/adam/state.json
+- usage_path: ~/.claude/adam/usage.json
 - proposals_dir: ~/.claude/adam/proposals/
 - applied_dir: ~/.claude/adam/applied/
 - rejected_dir: ~/.claude/adam/rejected/
 - transcripts_root: ~/.claude/projects/
 - skills_root: ~/.claude/skills/

-The windowed_journal is already filtered by per-signal age (see
-SIGNAL_WINDOWS_DAYS in adam-window.mjs) AND by actioned-exclusion. Read it as
-your primary input — do not re-apply window math. Fall back to journal_path
-only if windowed_journal_path is missing or empty.
-
-Follow your system prompt exactly. Emit a single JSON punch list as your final message.
+Draft proposal files to proposals_dir/ for each diagnosis. Score against the
+confidence rubric. Emit the clustering trace and punch list as your final message.
 ```

 Wait for return.
@@ -112,11 +166,29 @@ node ~/.claude/adam/scripts/adam-explain.mjs --mode full       # verbatim trace
 node ~/.claude/adam/scripts/adam-explain.mjs --mode json       # machine-readable
 ```

-### 3. Auto-apply high-confidence items
+### 3. Pre-apply verification gate (MOSS §3.4)
+
+MOSS §3.4: "Verification must therefore be runtime, on a production-equivalent environment, and against the same prompts that produced the failure evidence." Before auto-applying, verify each proposal deterministically:

 For each id in `high_confidence`:
 - Read the proposal file from `~/.claude/adam/proposals/<id>-*.md`.
- Verify in front of the user: print `id`, `target`, `confidence`, `blast_radius`, `cross_session_evidence`, `auto_apply_eligible`.
+- **Verification checks** (all must pass for auto-apply to proceed):
+  1. **Source entries exist**: every timestamp in `source_entries` frontmatter must appear in `/tmp/adam-windowed-journal.jsonl`. If any are missing, the evidence is stale or was already actioned — demote to `queued`.
+  2. **Diagnosis grounded**: the `# Diagnosis` section must have all four fields (Trigger, Action, Mismatch, Outcome) with ≥1 backtick-wrapped quote. If malformed, demote to `queued`.
+  3. **Type-evidence match**: the proposal `type` must match what the evidence supports:
+     - `correction` signals → `memory`, `skill_new`, `skill_edit` (not `nudge`)
+     - `dead_end` signals → `nudge`, `skill_new`, `skill_edit` (not `memory`)
+     - `tool_error_loop` signals → `memory`, `skill_new`, `skill_edit`
+     - `harness_edit` → must cite harness-level evidence (false negative, scoring bias, window miscalibration)
+     If mismatch, demote to `queued`.
+  4. **No conflicting applied proposal**: grep `~/.claude/adam/applied/` for any proposal with the same `target` applied in the last 7 days. If found, demote to `queued` (prevents stacking rapid edits).
+- Print verification result: `verified: <id> (4/4 checks passed)` or `demoted: <id> (failed: <check_name>)`.
+- Demoted proposals are moved from `high_confidence` to `queued` for manual review.
+
+### 3a. Apply verified high-confidence items
+
+For each id that passed verification:
+- Print `id`, `target`, `confidence`, `blast_radius`, `cross_session_evidence`, `auto_apply_eligible`.
 - Apply the change:
  - **For `skill_new`**: `mkdir -p ~/.claude/skills/<slug>/`, then `Write` the proposal's `# Proposed change` body to `~/.claude/skills/<slug>/SKILL.md`. After write, print: "skill `<slug>` written to `~/.claude/skills/<slug>/SKILL.md` — activates immediately — Claude Code v2.1.0+ auto-hot-reloads user-level skills, no restart needed."
  - **For `memory`**: `Write` the proposal's `# Proposed change` body (which MUST include the auto-memory frontmatter — see "Memory drafting protocol" in `agents/adam.md`) to the path in `target`. Then update `MEMORY.md` index with a one-line pointer.
@@ -174,6 +246,12 @@ c. On **approve**:
   - For `skill_new`: `mkdir -p ~/.claude/skills/<slug>/`, then write `# Proposed change` body to `<slug>/SKILL.md`. Tell user: "skill `<slug>` written — activates immediately (CC v2.1.0+ auto-hot-reload)."
   - For `skill_edit`: apply the unified diff in `# Proposed change` to the existing SKILL.md at `target` (append-only — never replace existing content).
   - For `memory`: write `# Proposed change` body (must include auto-memory frontmatter) to `target` and update `MEMORY.md` index with a one-line pointer.
+   - For `harness_edit` (MOSS §1): apply the unified diff to the target harness file, gated by the test suite before and after the edit:
+     1. Run `bash ~/.claude/adam/tests/run-tests.sh` — capture pass count.
+     2. Apply the diff via `Edit`.
+     3. Run `bash ~/.claude/adam/tests/run-tests.sh` again — verify pass count is equal or higher and 0 failures.
+     4. If test regression: revert the edit, print "harness_edit reverted — test regression detected", leave proposal in `proposals/`.
+     5. If tests pass: tell user "harness edit applied to `<target>` — tests pass (<N> passed)."
   - For all others: apply via Write/Edit per the proposal's `# Proposed change`.
   - Move proposal to `~/.claude/adam/applied/<ts>-<id>.md`.
   - Archive: `node ~/.claude/adam/scripts/adam-archive.mjs ~/.claude/adam/applied/<ts>-<id>.md`.
@@ -191,6 +269,10 @@ End with one block:
 ```
 adam reflect summary:
  observations processed: <new>
+  batches formed: <N>
+  diagnoses validated: <N>/<total>
+  rolled back (regression): <N>
+  verification passed: <N>/<total high_confidence>
  auto-applied: <N>
  approved: <N>
  rejected: <N>
@@ -198,6 +280,14 @@ adam reflect summary:
  failed: <N>
 ```

+**Keypoint history**: after all proposals are processed, append one JSON line to `~/.claude/adam/keypoint-history.jsonl` with the aggregate keypoint scores from the diagnose stage:
+
+```json
+{"ts":"<iso>","session":"<session_id>","keypoints":{"tool_selection":N,"scope_discipline":N,"error_recovery":N,"first_attempt":N,"build_reliability":N},"proposals_emitted":N,"proposals_applied":N}
+```
+
+This builds a longitudinal record of which capabilities are improving across `/reflect` runs.
+
 ## Karpathy constraints (you must enforce on each apply)

 Before writing any proposal:
@@ -211,6 +301,7 @@ Before writing any proposal:
 - For `skill_edit`: confirm the diff is append-only (no `-` lines that remove existing content) and that target SKILL.md exists. When auto-applying, ALSO re-verify the eligibility gate steps in §3 (cooldown, blacklist, byte cap) before any `Edit` call — never trust frontmatter alone.
 - For `skill_edit` with `auto_apply_eligible: true`: confirm `contradiction_flag` is absent or null in frontmatter. Refuse auto-apply if `contradiction_flag` is set with any non-empty value (treat the agent's flag as a hard veto on auto-apply; user can still manually approve in walk-the-queue if they disagree with the heuristic).
 - For `memory`: confirm `# Proposed change` body starts with `---` frontmatter containing required fields `name`, `description`, `type`, `originSessionId`. Refuse if frontmatter missing — agent must redraft per the Memory drafting protocol.
+- For `harness_edit`: confirm `auto_apply_eligible: false` (never auto-apply). Confirm `confidence ≥ 5`. Confirm `# Test verification` section names the test command. Confirm diff is ≤30 LOC and targets a single allowed harness file (see `agents/adam.md` §"`harness_edit` proposals"). Run test suite before AND after applying — revert on any regression.
 - Confirm `source_entries` is present in proposal frontmatter as a non-empty list (used for archive). Warn (do not refuse) if missing — legacy proposals from before v0.2.0 won't have it.

 If any check fails, refuse to apply and ask the user how to proceed.