feat: apply MOSS-grounded self-evolution improvements to ADAM

Implements 7 improvements grounded in MOSS paper (arXiv 2605.22794): 1. Transcript capture (§3.4): context_ring buffer in adam-observe.mjs captures last 8 events around struggle signals as context_window. 2. Evidence batching (§3.1): new adam-batch.mjs pre-clusters windowed journal entries into coherent failure batches by (signal_type, cluster_key). 3. Multi-stage analysis (§3.3): SKILL.md dispatches adam agent in two stages (diagnose+plan → implement) with inter-stage validation gate. 4. Pre-apply verification (§3.4): 4-check deterministic gate before auto-apply (source entries exist, diagnosis grounded, type-evidence match, no conflicting recent proposals). 5. Auto-rollback (§3.5): new adam-rollback.mjs reverts regressed proposals detected by A/B measurement, creates regression nudges. 6. Harness self-modification (§1 Table 1): new harness_edit proposal type targeting adam's own scripts with stricter gates (confidence≥5, never auto-apply, test-suite-gated). 7. Keypoint matrix evaluation (§4.2): 5 capability dimensions (tool_selection, scope_discipline, error_recovery, first_attempt, build_reliability) scored per batch for structured evaluation. Test suite: 94 → 114 tests (20 new), all passing.
2026-06-10 23:29:03 +00:00 · 2026-05-24 11:15:32 +01:00
parent a48c705c0a
commit 440fb52eb1
7 changed files with 1038 additions and 20 deletions
@@ -0,0 +1,184 @@
+#!/usr/bin/env node
+// adam-batch.mjs — pre-clusters windowed journal entries into coherent failure
+// batches before analyst dispatch. Implements MOSS §3.1: "anchored to an
+// automatically curated batch of production-failure evidence."
+//
+// Each batch groups entries by (signal_type, cluster_key) where cluster_key
+// follows the same clustering rules as agents/adam.md §4:
+//   correction     → tokenized phrase (cross-cwd)
+//   retry_loop     → tool
+//   weak_agent     → subagent_type
+//   tool_error_loop→ fp
+//   dead_end       → session
+//   edit_churn     → file basename
+//   build_loop     → session
+//   subagent_dispatch_pattern → subagent_type
+//   silent_drift   → active_skills[0]
+//   error_after_recovery → (recovered_from, original_fp)
+//   correction_free_streak → active_skills[0]
+//   clean_recovery → (recovered_from, active_skills[0])
+//   task_completed → sorted tool_kinds tuple
+//
+// CLI:
+//   adam-batch.mjs [--input <jsonl-path>] [--min-entries N] [--min-sessions N]
+//
+// Output: JSON object with a `batches` array plus `unbatched` and `total` counts.
+
+import { readFileSync } from "node:fs";
+import { readJsonlSafe } from "./adam-utils.mjs";
+
+// Thresholds used when the caller supplies none: buildBatches() counts a
+// cluster as unbatched if it falls below either one.
+const DEFAULT_MIN_ENTRIES = 1;
+const DEFAULT_MIN_SESSIONS = 1;
+
+// Tokens that tokenizePhrase() discards before building a correction cluster key.
+const CORRECTION_STOPWORDS = new Set([
+  "the", "a", "an", "and", "or", "but", "of", "to", "for", "in", "on",
+  "with", "use", "when", "where", "what", "why", "how", "this", "that",
+  "these", "those", "is", "are", "was", "were", "be", "been", "being",
+  "do", "does", "did", "doing", "has", "have", "had", "your", "you",
+  "i", "it", "as", "at", "by", "from", "not", "no",
+]);
+
+/**
+ * Reduce a correction phrase to an order-independent key.
+ *
+ * The phrase is lower-cased and split on whitespace; each token has leading
+ * and trailing characters other than word characters and apostrophes removed;
+ * empty tokens and stopwords are dropped; the rest is sorted and joined
+ * with "|".
+ *
+ * @param {*} phrase - Raw phrase. Anything but a non-empty string yields "".
+ * @returns {string} The "|"-joined key, or "" when no token survives.
+ */
+function tokenizePhrase(phrase) {
+  if (typeof phrase !== "string" || phrase === "") return "";
+  const kept = [];
+  for (const raw of phrase.toLowerCase().split(/\s+/)) {
+    const token = raw.replace(/^[^\w']+|[^\w']+$/g, "");
+    if (token && !CORRECTION_STOPWORDS.has(token)) kept.push(token);
+  }
+  return kept.sort().join("|");
+}
+
+/**
+ * Compute the cluster key that, together with `entry.type`, decides which
+ * batch a journal entry belongs to.
+ *
+ * Key per type:
+ *   correction                           → tokenized phrase
+ *   retry_loop                           → tool
+ *   weak_agent, subagent_dispatch_pattern→ subagent_type
+ *   tool_error_loop                      → fp
+ *   dead_end, build_loop, any other type → session
+ *   edit_churn                           → basename of `file`
+ *   silent_drift, correction_free_streak → active_skills[0] ("" when absent)
+ *   error_after_recovery                 → "recovered_from:original_fp"
+ *   clean_recovery                       → "recovered_from:active_skills[0]"
+ *   task_completed                       → sorted tool_kinds joined with ","
+ *
+ * @param {object} entry - Journal entry.
+ * @returns {string|null} The key, or null when `entry` is not an object.
+ */
+function clusterKey(entry) {
+  if (!entry || typeof entry !== "object") return null;
+  // First active skill, or "" when the entry lists none.
+  const firstSkill = () =>
+    Array.isArray(entry.active_skills) ? (entry.active_skills[0] || "") : "";
+  switch (entry.type) {
+    case "correction":
+      return tokenizePhrase(entry.phrase) || "unknown";
+    case "retry_loop":
+      return entry.tool || "unknown";
+    case "weak_agent":
+    case "subagent_dispatch_pattern":
+      return entry.subagent_type || "unknown";
+    case "tool_error_loop":
+      return entry.fp || "unknown";
+    case "dead_end":
+    case "build_loop":
+      return entry.session || "unknown";
+    case "edit_churn": {
+      // A non-string `file` must not throw, and both "/" and "\" separate
+      // path segments so Windows paths cluster by basename too.
+      if (typeof entry.file !== "string") return "unknown";
+      const base = entry.file.split(/[\\/]/).pop();
+      // A trailing separator leaves an empty basename; do not use "" as a key.
+      return base || "unknown";
+    }
+    case "silent_drift":
+    case "correction_free_streak":
+      return firstSkill();
+    case "error_after_recovery":
+      return `${entry.recovered_from || "?"}:${entry.original_fp || "?"}`;
+    case "clean_recovery":
+      return `${entry.recovered_from || "?"}:${firstSkill()}`;
+    case "task_completed":
+      return Array.isArray(entry.tool_kinds) ? entry.tool_kinds.slice().sort().join(",") : "unknown";
+    default:
+      return entry.session || "unknown";
+  }
+}
+
+/**
+ * Parse the adam-batch command line.
+ *
+ * Recognises `--input <path>`, `--min-entries N`, `--min-sessions N` and
+ * `--help`/`-h`. A value-taking flag in last position is ignored, as is any
+ * unknown argument. A numeric value that is NaN or not greater than zero is
+ * consumed but leaves the current setting untouched.
+ *
+ * @param {string[]} argv - Arguments after the script path.
+ * @returns {{input: (string|null), minEntries: number, minSessions: number, help: boolean}}
+ */
+function parseArgs(argv) {
+  const parsed = {
+    input: null,
+    minEntries: DEFAULT_MIN_ENTRIES,
+    minSessions: DEFAULT_MIN_SESSIONS,
+    help: false,
+  };
+  // Accept the raw value only when it parses to a number above zero.
+  const positiveOr = (raw, current) => {
+    const n = Number(raw);
+    return !Number.isNaN(n) && n > 0 ? n : current;
+  };
+  let i = 0;
+  while (i < argv.length) {
+    const hasValue = i + 1 < argv.length;
+    switch (argv[i]) {
+      case "--input":
+        if (hasValue) parsed.input = argv[++i];
+        break;
+      case "--min-entries":
+        if (hasValue) parsed.minEntries = positiveOr(argv[++i], parsed.minEntries);
+        break;
+      case "--min-sessions":
+        if (hasValue) parsed.minSessions = positiveOr(argv[++i], parsed.minSessions);
+        break;
+      case "--help":
+      case "-h":
+        parsed.help = true;
+        break;
+      default:
+        break;
+    }
+    i += 1;
+  }
+  return parsed;
+}
+
+/**
+ * Group journal entries into batches keyed by (type, clusterKey(entry)).
+ *
+ * Entries that are not objects or have no `type` are skipped. They still
+ * count towards `total` but not towards `unbatched`. A group with fewer than
+ * `minEntries` entries or fewer than `minSessions` distinct `session` values
+ * is not emitted; its entries are added to `unbatched`. Emitted batches are
+ * numbered b1, b2, … in first-seen order and then sorted by descending
+ * `entry_count`.
+ *
+ * @param {object[]} entries - Journal entries; a falsy value is treated as [].
+ * @param {{minEntries?: number, minSessions?: number}} [opts] - Thresholds;
+ *   falsy values fall back to the module defaults.
+ * @returns {{batches: object[], unbatched: number, total: number}}
+ */
+export function buildBatches(entries, opts = {}) {
+  const minEntries = opts.minEntries || DEFAULT_MIN_ENTRIES;
+  const minSessions = opts.minSessions || DEFAULT_MIN_SESSIONS;
+  const source = entries || [];
+  const groups = new Map();
+
+  for (const entry of source) {
+    if (!entry || typeof entry !== "object" || !entry.type) continue;
+    const cluster = clusterKey(entry);
+    const mapKey = `${entry.type}::${cluster}`;
+    let group = groups.get(mapKey);
+    if (!group) {
+      group = {
+        signal_type: entry.type,
+        cluster_key: cluster,
+        entries: [],
+        sessions: new Set(),
+        cwds: new Set(),
+      };
+      groups.set(mapKey, group);
+    }
+    group.entries.push(entry);
+    if (entry.session) group.sessions.add(entry.session);
+    if (entry.cwd) group.cwds.add(entry.cwd);
+  }
+
+  let unbatched = 0;
+  const batches = [];
+  for (const group of groups.values()) {
+    const tooSmall = group.entries.length < minEntries || group.sessions.size < minSessions;
+    if (tooSmall) {
+      unbatched += group.entries.length;
+      continue;
+    }
+    const hasContext = group.entries.some(
+      e => Array.isArray(e.context_window) && e.context_window.length > 0
+    );
+    batches.push({
+      batch_id: `b${batches.length + 1}`,
+      signal_type: group.signal_type,
+      cluster_key: group.cluster_key,
+      entry_count: group.entries.length,
+      session_count: group.sessions.size,
+      cwd_count: group.cwds.size,
+      has_context_window: hasContext,
+      entries: group.entries,
+    });
+  }
+
+  // Largest clusters first.
+  batches.sort((left, right) => right.entry_count - left.entry_count);
+  return { batches, unbatched, total: source.length };
+}
+
+/**
+ * CLI entry point: read JSONL entries from `--input` or piped stdin, batch
+ * them with buildBatches(), and print the result as one JSON line.
+ *
+ * Exit code 0 on success or `--help`; 1 when there is no input or an error
+ * is thrown.
+ */
+function main() {
+  const args = parseArgs(process.argv.slice(2));
+  if (args.help) {
+    // Exit from the write callback so the text is flushed before the process ends.
+    process.stdout.write(
+      "usage: adam-batch.mjs [--input <jsonl-path>] [--min-entries N] [--min-sessions N]\n",
+      () => process.exit(0)
+    );
+    return;
+  }
+  try {
+    let entries;
+    if (args.input) {
+      entries = readJsonlSafe(args.input);
+    } else if (!process.stdin.isTTY) {
+      // File descriptor 0 is stdin; read it in full, then parse line by line.
+      const buf = readFileSync(0, "utf8");
+      entries = [];
+      for (const line of buf.split("\n")) {
+        if (!line) continue;
+        try { entries.push(JSON.parse(line)); } catch { /* skip malformed line */ }
+      }
+    } else {
+      process.stderr.write("adam-batch: no input (use --input or pipe)\n");
+      process.exit(1);
+    }
+    const result = buildBatches(entries, { minEntries: args.minEntries, minSessions: args.minSessions });
+    // The payload embeds every batched entry and can be large. Calling
+    // process.exit() right after write() may drop output that is still
+    // buffered when stdout is asynchronous, so exit once the write completes.
+    process.stdout.write(JSON.stringify(result) + "\n", () => process.exit(0));
+  } catch (e) {
+    process.stderr.write(`adam-batch error: ${e.message}\n`);
+    process.exit(1);
+  }
+}
+
+// Run main() only when this file is the script node was started with.
+// process.argv[1] is a filesystem path while import.meta.url is a
+// percent-encoded file: URL, so the path is converted with pathToFileURL()
+// before comparing. Plain `file://` concatenation never matches for paths
+// with spaces or other escaped characters, or for Windows drive paths.
+// The URL helper is imported lazily so importers of this module pay nothing.
+if (process.argv[1]) {
+  import("node:url")
+    .then(({ pathToFileURL }) => {
+      if (import.meta.url === pathToFileURL(process.argv[1]).href) main();
+    })
+    .catch(e => {
+      process.stderr.write(`adam-batch error: ${e.message}\n`);
+      process.exit(1);
+    });
+}
+
+export { clusterKey, tokenizePhrase };
@@ -0,0 +1,225 @@
+#!/usr/bin/env node
+// adam-rollback.mjs — auto-reverts proposals that regressed after apply.
+//
+// Implements MOSS §3.5: "rollback is mandatory because... a candidate that
+// passes trial can still regress live."
+//
+// For each regressed proposal (detected by adam-ab-measure.mjs):
+//   1. Reads the applied proposal from applied/
+//   2. Parses the `# Rollback` section for undo commands
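+//   3. Writes a copy to proposals/<name>-rollback.md with `rolled_back: true`
+//      and renames the applied file to <name>.md.rolled-back
+//   4. Creates a regression nudge for next SessionStart
+//   5. Leaves the ab-tracking entry in place; the renamed applied file no
+//      longer matches the *.md lookup, so the proposal is not rolled back twice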
+//
+// CLI:
+//   adam-rollback.mjs --proposal-id <id> [--home <path>] [--dry-run]
+//   adam-rollback.mjs --auto [--home <path>] [--dry-run]
+//
+//   --auto mode: reads ab-measure output, rolls back all regressed proposals.
+//
+// Output: JSON object with rollback results per proposal.
+// Does NOT execute the undo commands itself — outputs them for the skill to
+// execute in-context (safety: undo commands may reference files the script
+// can't safely modify).
+
+import { readFileSync, writeFileSync, renameSync, readdirSync, existsSync, mkdirSync } from "node:fs";
+import { join, basename } from "node:path";
+import { homedir } from "node:os";
+import { parseFrontmatter, readJsonlSafe, listJsonlFiles } from "./adam-utils.mjs";
+
+/**
+ * Parse the adam-rollback command line.
+ *
+ * Recognises `--home <path>`, `--proposal-id <id>`, `--auto`, `--dry-run`
+ * and `--help`/`-h`. A value-taking flag in last position is ignored, as is
+ * any unknown argument.
+ *
+ * @param {string[]} argv - Arguments after the script path.
+ * @returns {{home: (string|null), proposalId: (string|null), auto: boolean, dryRun: boolean, help: boolean}}
+ */
+function parseArgs(argv) {
+  const parsed = { home: null, proposalId: null, auto: false, dryRun: false, help: false };
+  let i = 0;
+  while (i < argv.length) {
+    const hasValue = i + 1 < argv.length;
+    switch (argv[i]) {
+      case "--home":
+        if (hasValue) parsed.home = argv[++i];
+        break;
+      case "--proposal-id":
+        if (hasValue) parsed.proposalId = argv[++i];
+        break;
+      case "--auto":
+        parsed.auto = true;
+        break;
+      case "--dry-run":
+        parsed.dryRun = true;
+        break;
+      case "--help":
+      case "-h":
+        parsed.help = true;
+        break;
+      default:
+        break;
+    }
+    i += 1;
+  }
+  return parsed;
+}
+
+/**
+ * Locate the applied proposal file for `proposalId`.
+ *
+ * Only `*.md` files directly inside `appliedDir` are considered. A file
+ * named exactly `<proposalId>.md` wins; otherwise the first file (in sorted
+ * order) whose name contains the id is used. Preferring the exact name stops
+ * "p1" from selecting "p10.md", and sorting makes the fallback independent
+ * of directory listing order.
+ *
+ * @param {string} appliedDir - Directory holding applied proposals.
+ * @param {string} proposalId - Proposal id; an empty or missing id matches nothing.
+ * @returns {string|null} Full path of the match, or null when there is none
+ *   or the directory is missing or unreadable.
+ */
+function findAppliedProposal(appliedDir, proposalId) {
+  // An empty id would be a substring of every file name; refuse it outright.
+  const id = proposalId == null ? "" : String(proposalId);
+  if (id === "") return null;
+  if (!existsSync(appliedDir)) return null;
+
+  let candidates;
+  try {
+    candidates = readdirSync(appliedDir).filter(n => n.endsWith(".md")).sort();
+  } catch {
+    // An unreadable directory is reported the same way as a missing proposal.
+    return null;
+  }
+
+  const match =
+    candidates.find(f => f === `${id}.md`) || candidates.find(f => f.includes(id));
+  return match ? join(appliedDir, match) : null;
+}
+
+/**
+ * Return the body of the `# Rollback` section of a proposal document.
+ *
+ * The section starts after a line reading `# Rollback` and ends before the
+ * next line that starts with `# ` or `---`. Lines inside ``` fences are never
+ * treated as a heading or a terminator, so shell comments (`# ...`) and
+ * `---` inside an undo code block do not cut the section short. Both LF and
+ * CRLF line endings are accepted; the returned text uses LF.
+ *
+ * @param {string} content - Full proposal text.
+ * @returns {string|null} Trimmed section body, or null when the section is
+ *   absent or empty.
+ */
+function extractRollbackSection(content) {
+  if (typeof content !== "string") return null;
+  const lines = content.split(/\r?\n/);
+  let inFence = false;
+  let start = -1; // index of the first body line, -1 until the heading is seen
+
+  for (let i = 0; i < lines.length; i++) {
+    const line = lines[i];
+    if (line.startsWith("```")) {
+      inFence = !inFence;
+      continue;
+    }
+    if (inFence) continue;
+    if (start === -1) {
+      if (/^# Rollback\s*$/.test(line)) start = i + 1;
+    } else if (line.startsWith("# ") || line.startsWith("---")) {
+      return lines.slice(start, i).join("\n").trim() || null;
+    }
+  }
+
+  if (start === -1) return null;
+  // No terminator found: the section runs to the end of the document.
+  return lines.slice(start).join("\n").trim() || null;
+}
+
+/**
+ * Collect the contents of every closed ``` fenced block in a rollback section.
+ *
+ * Each block becomes one string, its lines joined with "\n". Empty blocks
+ * are skipped, and a block that is still open at the end of the text is
+ * discarded.
+ *
+ * @param {string|null} rollbackSection - Section text; falsy yields [].
+ * @returns {string[]} One entry per non-empty fenced block, in document order.
+ */
+function extractUndoCommands(rollbackSection) {
+  if (!rollbackSection) return [];
+  const commands = [];
+  // Lines of the fence being read, or null while outside any fence.
+  let open = null;
+  for (const line of rollbackSection.split("\n")) {
+    const isFence = line.startsWith("```");
+    if (isFence && open === null) {
+      open = [];
+    } else if (isFence) {
+      if (open.length > 0) commands.push(open.join("\n"));
+      open = null;
+    } else if (open !== null) {
+      open.push(line);
+    }
+  }
+  return commands;
+}
+
+/**
+ * Build a rollback plan for one applied proposal without changing anything
+ * on disk.
+ *
+ * @param {string} appliedDir - Directory holding applied proposals.
+ * @param {string} proposalId - Id to look up via findAppliedProposal().
+ * @returns {object} `{status: "not_found", proposal_id}` when no file matches;
+ *   otherwise a `status: "planned"` object with the file path, the `type`,
+ *   `target` and `target_skill` read from the frontmatter, the undo commands
+ *   taken from the `# Rollback` section, and whether that section exists.
+ */
+export function planRollback(appliedDir, proposalId) {
+  const appliedPath = findAppliedProposal(appliedDir, proposalId);
+  if (!appliedPath) {
+    return { status: "not_found", proposal_id: proposalId };
+  }
+
+  const text = readFileSync(appliedPath, "utf8");
+  const meta = parseFrontmatter(text);
+  const section = extractRollbackSection(text);
+  const undo = extractUndoCommands(section);
+
+  return {
+    status: "planned",
+    proposal_id: proposalId,
+    applied_path: appliedPath,
+    type: meta.type || "unknown",
+    target: meta.target || null,
+    target_skill: meta.target_skill || null,
+    undo_commands: undo,
+    has_rollback_section: Boolean(section),
+  };
+}
+
+/**
+ * Carry out a plan produced by planRollback().
+ *
+ * For a `planned` plan (and unless `opts.dryRun` is set) this:
+ *   - writes a copy of the applied proposal to
+ *     `<adamRoot>/proposals/<name>-rollback.md`, with `rolled_back` and
+ *     `rolled_back_at` appended to its frontmatter;
+ *   - renames the applied file to `<applied_path>.rolled-back`;
+ *   - appends a `regression_rollback` nudge to `<adamRoot>/active-nudges.json`.
+ * The undo commands are returned, never executed.
+ *
+ * @param {object} plan - Result of planRollback().
+ * @param {string} adamRoot - Root directory of adam's state.
+ * @param {{dryRun?: boolean}} [opts] - `dryRun` reports the actions without
+ *   touching the filesystem.
+ * @returns {object} The plan plus `action: "skipped"` when it is not planned;
+ *   otherwise a result whose `status` is `dry_run`, `rolled_back` or
+ *   `move_failed` (with `error`), together with an `actions` log.
+ * @throws When the proposals directory cannot be created or the applied file
+ *   cannot be read.
+ */
+export function executeRollback(plan, adamRoot, opts = {}) {
+  const dryRun = opts.dryRun || false;
+  const proposalsDir = join(adamRoot, "proposals");
+  const nudgesPath = join(adamRoot, "active-nudges.json");
+  const now = Date.now();
+
+  if (plan.status !== "planned") return { ...plan, action: "skipped" };
+
+  const result = {
+    proposal_id: plan.proposal_id,
+    type: plan.type,
+    target: plan.target,
+    undo_commands: plan.undo_commands,
+    actions: [],
+  };
+
+  if (dryRun) {
+    result.actions.push("dry_run: would move applied → proposals");
+    if (plan.undo_commands.length) {
+      result.actions.push(`dry_run: would output ${plan.undo_commands.length} undo command(s)`);
+    }
+    result.actions.push("dry_run: would create regression nudge");
+    result.status = "dry_run";
+    return result;
+  }
+
+  mkdirSync(proposalsDir, { recursive: true });
+  const destName = `${basename(plan.applied_path).replace(/\.md$/, "")}-rollback.md`;
+  const destPath = join(proposalsDir, destName);
+
+  let content = readFileSync(plan.applied_path, "utf8");
+  // Match only a frontmatter block at the very start of the file, and only a
+  // closing delimiter that is a whole `---` line. A `---` inside a value must
+  // not be taken for the end of the block, or the metadata lands mid-line.
+  const frontmatter = /^(---\r?\n(?:[\s\S]*?\r?\n)??)(---[ \t]*(?:\r?\n|$))/.exec(content);
+  if (frontmatter) {
+    const [whole, head, closing] = frontmatter;
+    const rollbackMeta = `rolled_back: true\nrolled_back_at: "${new Date(now).toISOString()}"\n`;
+    content = head + rollbackMeta + closing + content.slice(whole.length);
+  } else {
+    result.actions.push("no frontmatter found: rolled_back metadata not written");
+  }
+
+  try {
+    writeFileSync(destPath, content);
+    renameSync(plan.applied_path, plan.applied_path + ".rolled-back");
+    result.actions.push(`moved ${plan.applied_path} → ${destPath}`);
+  } catch (e) {
+    result.status = "move_failed";
+    result.error = e.message;
+    return result;
+  }
+
+  // The nudge is best effort: a failure is logged in `actions` and does not
+  // change the rollback status.
+  try {
+    let nudges = [];
+    if (existsSync(nudgesPath)) {
+      try {
+        const parsed = JSON.parse(readFileSync(nudgesPath, "utf8"));
+        // Valid JSON that is not an array cannot be appended to; start fresh.
+        if (Array.isArray(parsed)) nudges = parsed;
+      } catch {
+        nudges = [];
+      }
+    }
+    nudges.push({
+      kind: "regression_rollback",
+      message: `adam: rolled back "${plan.proposal_id}" (type: ${plan.type}) — regression detected in A/B measurement. Review with /reflect.`,
+      created_at: now,
+      expires_at_ts: now + 7 * 86400000,
+      max_displays: 3,
+      displays_used: 0,
+      source_proposal: plan.proposal_id,
+    });
+    writeFileSync(nudgesPath, JSON.stringify(nudges, null, 2));
+    result.actions.push("regression nudge created");
+  } catch (e) {
+    result.actions.push(`nudge failed: ${e.message}`);
+  }
+
+  result.status = "rolled_back";
+  return result;
+}
+
+/**
+ * CLI entry point.
+ *
+ * With `--auto`, computes A/B deltas from `ab-tracking.jsonl` and the journal
+ * files and rolls back every proposal whose delta status is "regressed".
+ * With `--proposal-id <id>`, rolls back that one proposal. Prints
+ * `{"rollbacks": [...]}` as one JSON line.
+ *
+ * Exit code 0 on success or `--help`; 1 when neither mode is given or an
+ * error is thrown.
+ */
+async function main() {
+  const args = parseArgs(process.argv.slice(2));
+  if (args.help) {
+    // Exit from the write callback so the text is flushed before the process ends.
+    process.stdout.write(
+      "usage: adam-rollback.mjs --proposal-id <id> [--home <path>] [--dry-run]\n" +
+      "       adam-rollback.mjs --auto [--home <path>] [--dry-run]\n",
+      () => process.exit(0)
+    );
+    return;
+  }
+
+  const claudeHome = args.home || join(homedir(), ".claude");
+  const adamRoot = join(claudeHome, "adam");
+  const appliedDir = join(adamRoot, "applied");
+
+  try {
+    const results = [];
+
+    if (args.auto) {
+      const abPath = join(adamRoot, "ab-tracking.jsonl");
+      const entries = readJsonlSafe(abPath);
+      // Loaded on demand: only --auto needs the measurement module.
+      const { computeDeltas } = await import("./adam-ab-measure.mjs");
+      const sources = [join(adamRoot, "journal.jsonl"), ...listJsonlFiles(join(adamRoot, "journal"))];
+      const journalAll = [];
+      for (const p of sources) for (const e of readJsonlSafe(p)) journalAll.push(e);
+      const deltas = computeDeltas(entries, journalAll);
+      const regressed = deltas.filter(d => d.status === "regressed");
+
+      for (const d of regressed) {
+        const plan = planRollback(appliedDir, d.proposal_id);
+        const result = executeRollback(plan, adamRoot, { dryRun: args.dryRun });
+        results.push(result);
+      }
+    } else if (args.proposalId) {
+      const plan = planRollback(appliedDir, args.proposalId);
+      const result = executeRollback(plan, adamRoot, { dryRun: args.dryRun });
+      results.push(result);
+    } else {
+      process.stderr.write("adam-rollback: specify --proposal-id or --auto\n");
+      process.exit(1);
+    }
+
+    // Calling process.exit() right after write() may drop output that is
+    // still buffered when stdout is asynchronous, so exit once the write
+    // completes.
+    process.stdout.write(JSON.stringify({ rollbacks: results }) + "\n", () => process.exit(0));
+  } catch (e) {
+    process.stderr.write(`adam-rollback error: ${e.message}\n`);
+    process.exit(1);
+  }
+}
+
+// Run main() only when this file is the script node was started with.
+// process.argv[1] is a filesystem path while import.meta.url is a
+// percent-encoded file: URL, so the path is converted with pathToFileURL()
+// before comparing. Plain `file://` concatenation never matches for paths
+// with spaces or other escaped characters, or for Windows drive paths.
+// main() is async; returning its promise routes any rejection to the catch.
+if (process.argv[1]) {
+  import("node:url")
+    .then(({ pathToFileURL }) => {
+      if (import.meta.url === pathToFileURL(process.argv[1]).href) return main();
+      return undefined;
+    })
+    .catch(e => {
+      process.stderr.write(`adam-rollback error: ${e.message}\n`);
+      process.exit(1);
+    });
+}