mirror of
https://github.com/lukaszraczylo/claude-adam.git
synced 2026-06-10 23:29:03 +00:00
feat: apply MOSS-grounded self-evolution improvements to ADAM
Implements 7 improvements grounded in MOSS paper (arXiv 2605.22794): 1. Transcript capture (§3.4): context_ring buffer in adam-observe.mjs captures last 8 events around struggle signals as context_window. 2. Evidence batching (§3.1): new adam-batch.mjs pre-clusters windowed journal entries into coherent failure batches by (signal_type, cluster_key). 3. Multi-stage analysis (§3.3): SKILL.md dispatches adam agent in two stages (diagnose+plan → implement) with inter-stage validation gate. 4. Pre-apply verification (§3.4): 4-check deterministic gate before auto-apply (source entries exist, diagnosis grounded, type-evidence match, no conflicting recent proposals). 5. Auto-rollback (§3.5): new adam-rollback.mjs reverts regressed proposals detected by A/B measurement, creates regression nudges. 6. Harness self-modification (§1 Table 1): new harness_edit proposal type targeting adam's own scripts with stricter gates (confidence≥5, never auto-apply, test-suite-gated). 7. Keypoint matrix evaluation (§4.2): 5 capability dimensions (tool_selection, scope_discipline, error_recovery, first_attempt, build_reliability) scored per batch for structured evaluation. Test suite: 94 → 114 tests (20 new), all passing.
This commit is contained in:
Executable
+184
@@ -0,0 +1,184 @@
|
||||
#!/usr/bin/env node
|
||||
// adam-batch.mjs — pre-clusters windowed journal entries into coherent failure
|
||||
// batches before analyst dispatch. Implements MOSS §3.1: "anchored to an
|
||||
// automatically curated batch of production-failure evidence."
|
||||
//
|
||||
// Each batch groups entries by (signal_type, cluster_key) where cluster_key
|
||||
// follows the same clustering rules as agents/adam.md §4:
|
||||
// correction → tokenized phrase (cross-cwd)
|
||||
// retry_loop → tool
|
||||
// weak_agent → subagent_type
|
||||
// tool_error_loop→ fp
|
||||
// dead_end → session
|
||||
// edit_churn → file basename
|
||||
// build_loop → session
|
||||
// subagent_dispatch_pattern → subagent_type
|
||||
// silent_drift → active_skills[0]
|
||||
// error_after_recovery → (recovered_from, original_fp)
|
||||
// correction_free_streak → active_skills[0]
|
||||
// clean_recovery → (recovered_from, active_skills[0])
|
||||
// task_completed → sorted tool_kinds tuple
|
||||
//
|
||||
// CLI:
|
||||
// adam-batch.mjs [--input <jsonl-path>] [--min-entries N] [--min-sessions N]
|
||||
//
|
||||
// Output: JSON object with `batches` array and `unbatched` count.
|
||||
|
||||
import { readFileSync } from "node:fs";
|
||||
import { readJsonlSafe } from "./adam-utils.mjs";
|
||||
|
||||
// Fallback for the minimum number of entries a cluster needs before it is emitted as a batch.
const DEFAULT_MIN_ENTRIES = 1;
|
||||
// Fallback for the minimum number of distinct sessions a cluster needs before it is emitted as a batch.
const DEFAULT_MIN_SESSIONS = 1;
|
||||
|
||||
// Filler words removed from correction phrases before clustering, so phrases
// that differ only in these words produce the same cluster key.
const CORRECTION_STOPWORDS = new Set([
  "the", "a", "an", "and", "or", "but", "of", "to", "for", "in", "on",
  "with", "use", "when", "where", "what", "why", "how", "this", "that",
  "these", "those", "is", "are", "was", "were", "be", "been", "being",
  "do", "does", "did", "doing", "has", "have", "had", "your", "you",
  "i", "it", "as", "at", "by", "from", "not", "no",
]);

// Matches the run of characters at either edge of a token that are not
// letters, combining marks, digits, underscores or apostrophes. The `u` flag
// makes this Unicode-aware: with ASCII-only `\w`, non-ASCII words were
// truncated ("café" -> "caf") or erased entirely (e.g. Cyrillic words).
const TOKEN_EDGE_PUNCTUATION = /^[^\p{L}\p{M}\p{N}_']+|[^\p{L}\p{M}\p{N}_']+$/gu;

// Normalizes a correction phrase into an order-independent cluster key.
//
// The phrase is lower-cased, split on whitespace, stripped of punctuation at
// token edges, filtered against CORRECTION_STOPWORDS, sorted, and joined
// with "|".
//
// Returns "" when `phrase` is not a non-empty string or when no tokens
// survive filtering.
function tokenizePhrase(phrase) {
  if (!phrase || typeof phrase !== "string") return "";
  return phrase
    .toLowerCase()
    .split(/\s+/)
    .map((token) => token.replace(TOKEN_EDGE_PUNCTUATION, ""))
    .filter((token) => token && !CORRECTION_STOPWORDS.has(token))
    .sort()
    .join("|");
}
|
||||
|
||||
// Derives the cluster key used to group a journal entry with similar entries
// of the same signal type.
//
// Returns null when `entry` is not an object. Otherwise returns a string whose
// source depends on `entry.type`:
//   correction                          -> tokenized `phrase`
//   retry_loop                          -> `tool`
//   weak_agent / subagent_dispatch_pattern -> `subagent_type`
//   tool_error_loop                     -> `fp`
//   dead_end / build_loop / any other   -> `session`
//   edit_churn                          -> last "/"-separated segment of `file`
//   silent_drift / correction_free_streak -> first element of `active_skills` ("" if none)
//   error_after_recovery                -> "<recovered_from>:<original_fp>"
//   clean_recovery                      -> "<recovered_from>:<first active skill>"
//   task_completed                      -> sorted `tool_kinds`, comma-joined
// Missing fields fall back to "unknown" (or "?" inside the compound keys).
function clusterKey(entry) {
  if (!entry || typeof entry !== "object") return null;

  const firstActiveSkill = () =>
    Array.isArray(entry.active_skills) ? (entry.active_skills[0] || "") : "";

  switch (entry.type) {
    case "correction":
      return tokenizePhrase(entry.phrase) || "unknown";
    case "retry_loop":
      return entry.tool || "unknown";
    case "weak_agent":
    case "subagent_dispatch_pattern":
      return entry.subagent_type || "unknown";
    case "tool_error_loop":
      return entry.fp || "unknown";
    case "dead_end":
    case "build_loop":
      return entry.session || "unknown";
    case "edit_churn":
      // Journal entries come from JSONL and may be malformed: a non-string
      // `file` must not throw (that would abort the whole batching run), and a
      // path ending in "/" must not produce an empty key.
      if (typeof entry.file !== "string") return "unknown";
      return entry.file.split("/").pop() || "unknown";
    case "silent_drift":
    case "correction_free_streak":
      return firstActiveSkill();
    case "error_after_recovery":
      return `${entry.recovered_from || "?"}:${entry.original_fp || "?"}`;
    case "clean_recovery":
      return `${entry.recovered_from || "?"}:${firstActiveSkill()}`;
    case "task_completed":
      // Copy before sorting so the caller's array is not reordered.
      return Array.isArray(entry.tool_kinds) ? entry.tool_kinds.slice().sort().join(",") : "unknown";
    default:
      return entry.session || "unknown";
  }
}
|
||||
|
||||
// Parses the adam-batch CLI arguments.
//
// Recognized flags: `--input <path>`, `--min-entries N`, `--min-sessions N`,
// `--help` / `-h`. A value-taking flag that is the last argument is ignored.
// Numeric values are accepted only when they parse to a number greater than
// zero; otherwise the default is kept. Unknown arguments are skipped.
function parseArgs(argv) {
  const parsed = {
    input: null,
    minEntries: DEFAULT_MIN_ENTRIES,
    minSessions: DEFAULT_MIN_SESSIONS,
    help: false,
  };

  // Stores the value under `field` only when it is a number greater than zero.
  const setIfPositive = (field, raw) => {
    const value = Number(raw);
    if (!Number.isNaN(value) && value > 0) parsed[field] = value;
  };

  let idx = 0;
  while (idx < argv.length) {
    const flag = argv[idx];
    const hasValue = idx + 1 < argv.length;
    switch (flag) {
      case "--input":
        if (hasValue) {
          idx += 1;
          parsed.input = argv[idx];
        }
        break;
      case "--min-entries":
        if (hasValue) {
          idx += 1;
          setIfPositive("minEntries", argv[idx]);
        }
        break;
      case "--min-sessions":
        if (hasValue) {
          idx += 1;
          setIfPositive("minSessions", argv[idx]);
        }
        break;
      case "--help":
      case "-h":
        parsed.help = true;
        break;
      default:
        break;
    }
    idx += 1;
  }
  return parsed;
}
|
||||
|
||||
// Groups journal entries into batches keyed by (entry.type, clusterKey(entry)).
//
// entries: iterable of journal entry objects; entries that are not objects or
//          have no truthy `type` are ignored.
// opts.minEntries / opts.minSessions: thresholds a group must meet to be
//          emitted; falsy values fall back to the module defaults.
//
// Returns { batches, unbatched, total } where `batches` is sorted by
// descending entry_count, `unbatched` counts entries in groups that missed a
// threshold, and `total` is the length of the input.
export function buildBatches(entries, opts = {}) {
  const entryThreshold = opts.minEntries || DEFAULT_MIN_ENTRIES;
  const sessionThreshold = opts.minSessions || DEFAULT_MIN_SESSIONS;
  const groups = new Map();

  for (const entry of entries || []) {
    if (!entry || typeof entry !== "object" || !entry.type) continue;

    const cluster = clusterKey(entry);
    const groupKey = `${entry.type}::${cluster}`;
    let group = groups.get(groupKey);
    if (group === undefined) {
      group = {
        signal_type: entry.type,
        cluster_key: cluster,
        entries: [],
        sessions: new Set(),
        cwds: new Set(),
      };
      groups.set(groupKey, group);
    }

    group.entries.push(entry);
    if (entry.session) group.sessions.add(entry.session);
    if (entry.cwd) group.cwds.add(entry.cwd);
  }

  const batches = [];
  let unbatched = 0;
  let nextId = 1;
  for (const group of groups.values()) {
    const belowThreshold =
      group.entries.length < entryThreshold || group.sessions.size < sessionThreshold;
    if (belowThreshold) {
      unbatched += group.entries.length;
      continue;
    }

    // Ids are assigned in first-seen group order, before the sort below.
    const batchId = `b${nextId}`;
    nextId += 1;
    const hasContextWindow = group.entries.some(
      (item) => Array.isArray(item.context_window) && item.context_window.length > 0,
    );
    batches.push({
      batch_id: batchId,
      signal_type: group.signal_type,
      cluster_key: group.cluster_key,
      entry_count: group.entries.length,
      session_count: group.sessions.size,
      cwd_count: group.cwds.size,
      has_context_window: hasContextWindow,
      entries: group.entries,
    });
  }

  batches.sort((left, right) => right.entry_count - left.entry_count);
  return { batches, unbatched, total: (entries || []).length };
}
|
||||
|
||||
// CLI entry point: reads journal entries from `--input <path>` or from piped
// stdin (one JSON object per line), batches them with buildBatches, and writes
// the result as a single JSON line to stdout.
//
// Exit status is 0 on success or `--help`, 1 when no input is available or an
// error is thrown. The status is set through `process.exitCode` rather than
// `process.exit()`: writes to stdout can be asynchronous, and forcing an exit
// right after a large write can truncate the JSON that downstream consumers
// read from the pipe.
function main() {
  const args = parseArgs(process.argv.slice(2));
  if (args.help) {
    process.stdout.write("usage: adam-batch.mjs [--input <jsonl-path>] [--min-entries N] [--min-sessions N]\n");
    return;
  }

  try {
    let entries;
    if (args.input) {
      entries = readJsonlSafe(args.input);
    } else if (!process.stdin.isTTY) {
      // fd 0 is stdin; read it fully and synchronously.
      const buf = readFileSync(0, "utf8");
      entries = [];
      for (const line of buf.split("\n")) {
        if (!line.trim()) continue;
        // Best-effort parsing: malformed lines are skipped on purpose so one
        // bad journal line does not discard the rest of the input.
        try { entries.push(JSON.parse(line)); } catch { /* skip */ }
      }
    } else {
      process.stderr.write("adam-batch: no input (use --input or pipe)\n");
      process.exitCode = 1;
      return;
    }

    const result = buildBatches(entries, { minEntries: args.minEntries, minSessions: args.minSessions });
    process.stdout.write(JSON.stringify(result) + "\n");
  } catch (e) {
    process.stderr.write(`adam-batch error: ${e.message}\n`);
    process.exitCode = 1;
  }
}
|
||||
|
||||
// Run the CLI only when this file is the script Node was started with, not
// when it is imported. `import.meta.url` is a percent-encoded file URL, so it
// is compared against the URL form of argv[1]; a hand-built `file://` string
// never matches when the path contains spaces or other encoded characters, or
// on Windows, and the script would then silently do nothing.
if (process.argv[1]) {
  const { pathToFileURL } = await import("node:url");
  if (import.meta.url === pathToFileURL(process.argv[1]).href) {
    main();
  }
}
|
||||
|
||||
export { clusterKey, tokenizePhrase };
|
||||
Executable
+225
@@ -0,0 +1,225 @@
|
||||
#!/usr/bin/env node
|
||||
// adam-rollback.mjs — auto-reverts proposals that regressed after apply.
|
||||
//
|
||||
// Implements MOSS §3.5: "rollback is mandatory because... a candidate that
|
||||
// passes trial can still regress live."
|
||||
//
|
||||
// For each regressed proposal (detected by adam-ab-measure.mjs):
|
||||
// 1. Reads the applied proposal from applied/
|
||||
// 2. Parses the `# Rollback` section for undo commands
|
||||
// 3. Writes a copy to proposals/<name>-rollback.md with `rolled_back: true` and renames the applied file to <name>.md.rolled-back
|
||||
// 4. Creates a regression nudge for next SessionStart
|
||||
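// 5. Intended: remove the ab-tracking entry (so it doesn't re-trigger). NOTE: the code in this file does not do this; a repeat run is skipped because the renamed applied file is no longer found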
|
||||
//
|
||||
// CLI:
|
||||
// adam-rollback.mjs --proposal-id <id> [--home <path>] [--dry-run]
|
||||
// adam-rollback.mjs --auto [--home <path>] [--dry-run]
|
||||
//
|
||||
// --auto mode: reads ab-measure output, rolls back all regressed proposals.
|
||||
//
|
||||
// Output: JSON object with rollback results per proposal.
|
||||
// Does NOT execute the undo commands itself — outputs them for the skill to
|
||||
// execute in-context (safety: undo commands may reference files the script
|
||||
// can't safely modify).
|
||||
|
||||
import { readFileSync, writeFileSync, renameSync, readdirSync, existsSync, mkdirSync } from "node:fs";
|
||||
import { join, basename } from "node:path";
|
||||
import { homedir } from "node:os";
|
||||
import { parseFrontmatter, readJsonlSafe, listJsonlFiles } from "./adam-utils.mjs";
|
||||
|
||||
// Parses the adam-rollback CLI arguments.
//
// Recognized flags: `--home <path>`, `--proposal-id <id>`, `--auto`,
// `--dry-run`, `--help` / `-h`. A value-taking flag that is the last argument
// is ignored, as are unknown arguments.
function parseArgs(argv) {
  const parsed = { home: null, proposalId: null, auto: false, dryRun: false, help: false };

  let idx = 0;
  while (idx < argv.length) {
    const flag = argv[idx];
    const hasValue = idx + 1 < argv.length;
    switch (flag) {
      case "--home":
        if (hasValue) {
          idx += 1;
          parsed.home = argv[idx];
        }
        break;
      case "--proposal-id":
        if (hasValue) {
          idx += 1;
          parsed.proposalId = argv[idx];
        }
        break;
      case "--auto":
        parsed.auto = true;
        break;
      case "--dry-run":
        parsed.dryRun = true;
        break;
      case "--help":
      case "-h":
        parsed.help = true;
        break;
      default:
        break;
    }
    idx += 1;
  }
  return parsed;
}
|
||||
|
||||
// Locates the applied proposal file (*.md) for `proposalId` inside `appliedDir`.
//
// Matching is deliberately strict because the result decides which proposal
// gets rolled back:
//   1. a file named exactly `<id>.md` (or `<id>` when the id already ends in .md);
//   2. otherwise a file whose name contains the id bounded on both sides by a
//      non-alphanumeric character or the start/end of the name, so "fix-1"
//      does not select "fix-10.md".
// File names are sorted first so the choice is deterministic.
//
// Returns the joined path, or null when the directory is missing or
// unreadable, the id is empty, or nothing matches.
function findAppliedProposal(appliedDir, proposalId) {
  const id = proposalId == null ? "" : String(proposalId);
  // An empty id would "match" every file name.
  if (id === "") return null;
  if (!existsSync(appliedDir)) return null;

  let names;
  try {
    names = readdirSync(appliedDir).filter((n) => n.endsWith(".md")).sort();
  } catch {
    // Best-effort lookup: an unreadable directory is reported as "not found".
    return null;
  }

  const exact = names.find((n) => n === `${id}.md` || n === id);
  if (exact) return join(appliedDir, exact);

  const isAlphanumeric = (ch) => /[A-Za-z0-9]/.test(ch);
  const containsDelimitedId = (name) => {
    let from = 0;
    for (;;) {
      const at = name.indexOf(id, from);
      if (at === -1) return false;
      const before = at === 0 ? "" : name[at - 1];
      const after = name[at + id.length] ?? "";
      if (!isAlphanumeric(before) && !isAlphanumeric(after)) return true;
      from = at + 1;
    }
  };

  const delimited = names.find(containsDelimitedId);
  return delimited ? join(appliedDir, delimited) : null;
}
|
||||
|
||||
// Returns the trimmed text of the `# Rollback` section of a proposal, or null
// when the section is missing or empty.
//
// The section starts after the first line that is exactly `# Rollback` and
// ends at the next line starting with "# " or "---". Terminator lines inside
// a ``` fenced block are ignored, so shell comments ("# restore file") or diff
// headers ("--- a/file") within undo commands do not cut the section short.
// Both LF and CRLF line endings are accepted; the result uses LF.
function extractRollbackSection(content) {
  if (typeof content !== "string") return null;

  const lines = content.split(/\r?\n/);
  const headingIndex = lines.findIndex((line) => line.trimEnd() === "# Rollback");
  if (headingIndex === -1) return null;

  const body = [];
  let inFence = false;
  for (let i = headingIndex + 1; i < lines.length; i++) {
    const line = lines[i];
    if (line.startsWith("```")) {
      inFence = !inFence;
    } else if (!inFence && (line.startsWith("# ") || line.startsWith("---"))) {
      break;
    }
    body.push(line);
  }
  return body.join("\n").trim() || null;
}
|
||||
|
||||
// Collects the contents of every closed ``` fenced block in a rollback section.
//
// Returns an array with one string per block (lines joined with "\n"), in
// document order. Blocks that are empty or whitespace-only are skipped so no
// blank "command" is reported, a trailing CR on each line is dropped, and a
// fence that is never closed contributes nothing. Returns [] for a missing or
// non-string section.
function extractUndoCommands(rollbackSection) {
  if (!rollbackSection || typeof rollbackSection !== "string") return [];

  const commands = [];
  // null while outside a fence; the lines collected so far while inside one.
  let openBlock = null;
  for (const rawLine of rollbackSection.split("\n")) {
    const line = rawLine.endsWith("\r") ? rawLine.slice(0, -1) : rawLine;
    if (line.startsWith("```")) {
      if (openBlock === null) {
        openBlock = [];
      } else {
        const command = openBlock.join("\n");
        if (command.trim()) commands.push(command);
        openBlock = null;
      }
      continue;
    }
    if (openBlock !== null) openBlock.push(line);
  }
  return commands;
}
|
||||
|
||||
// Builds a rollback plan for `proposalId` without modifying anything on disk.
//
// Returns { status: "not_found", proposal_id } when no applied proposal file
// is located. Otherwise reads the file and returns a "planned" object carrying
// its path, the `type` / `target` / `target_skill` frontmatter fields, the undo
// commands taken from the `# Rollback` section, and whether that section exists.
export function planRollback(appliedDir, proposalId) {
  const appliedPath = findAppliedProposal(appliedDir, proposalId);
  if (!appliedPath) {
    return { status: "not_found", proposal_id: proposalId };
  }

  const text = readFileSync(appliedPath, "utf8");
  const frontmatter = parseFrontmatter(text);
  const section = extractRollbackSection(text);

  return {
    status: "planned",
    proposal_id: proposalId,
    applied_path: appliedPath,
    type: frontmatter.type || "unknown",
    target: frontmatter.target || null,
    target_skill: frontmatter.target_skill || null,
    undo_commands: extractUndoCommands(section),
    has_rollback_section: Boolean(section),
  };
}
|
||||
|
||||
// Carries out the bookkeeping for a planned rollback. It does not run the undo
// commands; they are only echoed back in the result.
//
// plan:     object produced by planRollback; anything whose status is not
//           "planned" is returned unchanged plus `action: "skipped"`.
// adamRoot: directory holding `proposals/` and `active-nudges.json`.
// opts.dryRun: when true, nothing is written and the intended actions are listed.
//
// On a real run it:
//   1. writes a copy of the applied proposal to proposals/<name>-rollback.md
//      with `rolled_back` / `rolled_back_at` added to its frontmatter;
//   2. renames the applied file to <applied_path>.rolled-back;
//   3. appends a "regression_rollback" nudge to active-nudges.json
//      (7-day expiry, at most 3 displays).
//
// Returns a result object whose `status` is "dry_run", "move_failed" (with
// `error`) or "rolled_back". A failure to write the nudge is recorded in
// `actions` but does not change the status.
export function executeRollback(plan, adamRoot, opts = {}) {
  const dryRun = opts.dryRun || false;
  const proposalsDir = join(adamRoot, "proposals");
  const nudgesPath = join(adamRoot, "active-nudges.json");
  const now = Date.now();

  if (plan.status !== "planned") return { ...plan, action: "skipped" };

  const result = {
    proposal_id: plan.proposal_id,
    type: plan.type,
    target: plan.target,
    undo_commands: plan.undo_commands,
    actions: [],
  };

  if (dryRun) {
    result.actions.push("dry_run: would move applied → proposals");
    if (plan.undo_commands.length) {
      result.actions.push(`dry_run: would output ${plan.undo_commands.length} undo command(s)`);
    }
    result.actions.push("dry_run: would create regression nudge");
    result.status = "dry_run";
    return result;
  }

  mkdirSync(proposalsDir, { recursive: true });
  const destName = `${basename(plan.applied_path).replace(/\.md$/, "")}-rollback.md`;
  const destPath = join(proposalsDir, destName);

  const content = markRolledBack(
    readFileSync(plan.applied_path, "utf8"),
    new Date(now).toISOString(),
  );

  try {
    writeFileSync(destPath, content);
    // Renaming drops the .md suffix from the applied file's extension, so a
    // later lookup that filters on *.md no longer finds it.
    renameSync(plan.applied_path, plan.applied_path + ".rolled-back");
    result.actions.push(`moved ${plan.applied_path} → ${destPath}`);
  } catch (e) {
    result.status = "move_failed";
    result.error = e.message;
    return result;
  }

  try {
    let nudges = [];
    if (existsSync(nudgesPath)) {
      // An unparsable nudge file is treated as empty.
      try { nudges = JSON.parse(readFileSync(nudgesPath, "utf8")); } catch { nudges = []; }
    }
    nudges.push({
      kind: "regression_rollback",
      message: `adam: rolled back "${plan.proposal_id}" (type: ${plan.type}) — regression detected in A/B measurement. Review with /reflect.`,
      created_at: now,
      expires_at_ts: now + 7 * 86400000,
      max_displays: 3,
      displays_used: 0,
      source_proposal: plan.proposal_id,
    });
    writeFileSync(nudgesPath, JSON.stringify(nudges, null, 2));
    result.actions.push("regression nudge created");
  } catch (e) {
    result.actions.push(`nudge failed: ${e.message}`);
  }

  result.status = "rolled_back";
  return result;
}

// Inserts `rolled_back` metadata immediately before the closing `---` line of
// a frontmatter block that opens on the first line of `content`. Only whole
// `---` lines count as delimiters, so a "---" inside a frontmatter value or a
// horizontal rule later in the body is never mistaken for one. Content
// without such a block is returned unchanged.
function markRolledBack(content, isoTimestamp) {
  const lines = content.split("\n");
  if (lines[0].trimEnd() !== "---") return content;
  const closing = lines.findIndex((line, i) => i > 0 && line.trimEnd() === "---");
  if (closing === -1) return content;
  lines.splice(closing, 0, "rolled_back: true", `rolled_back_at: "${isoTimestamp}"`);
  return lines.join("\n");
}
|
||||
|
||||
// CLI entry point for adam-rollback.
//
// `--auto` reads ab-tracking.jsonl and the journal files under the adam root,
// asks computeDeltas (from adam-ab-measure.mjs) for deltas, and rolls back
// every proposal whose delta status is "regressed". `--proposal-id <id>` rolls
// back a single proposal. `--auto` takes precedence when both are given.
//
// Writes `{ rollbacks: [...] }` as one JSON line to stdout. Exit status is 0
// on success or `--help`, 1 when neither mode is selected or an error is
// thrown. The status is set through `process.exitCode` instead of calling
// `process.exit()` right after the write, because stdout writes can be
// asynchronous and a forced exit may truncate the JSON for a piped consumer.
//
// NOTE(review): nothing here removes the ab-tracking entry of a rolled-back
// proposal; a later --auto run reports it as not found instead — confirm this
// is the intended behaviour.
async function main() {
  const args = parseArgs(process.argv.slice(2));
  if (args.help) {
    process.stdout.write(
      "usage: adam-rollback.mjs --proposal-id <id> [--home <path>] [--dry-run]\n" +
      "       adam-rollback.mjs --auto [--home <path>] [--dry-run]\n"
    );
    return;
  }

  const claudeHome = args.home || join(homedir(), ".claude");
  const adamRoot = join(claudeHome, "adam");
  const appliedDir = join(adamRoot, "applied");

  try {
    const results = [];

    if (args.auto) {
      const abPath = join(adamRoot, "ab-tracking.jsonl");
      const entries = readJsonlSafe(abPath);
      // Loaded lazily so the single-proposal mode does not need this module.
      const { computeDeltas } = await import("./adam-ab-measure.mjs");
      const sources = [join(adamRoot, "journal.jsonl"), ...listJsonlFiles(join(adamRoot, "journal"))];
      const journalAll = [];
      for (const p of sources) for (const e of readJsonlSafe(p)) journalAll.push(e);
      const deltas = computeDeltas(entries, journalAll);
      const regressed = deltas.filter(d => d.status === "regressed");

      for (const d of regressed) {
        const plan = planRollback(appliedDir, d.proposal_id);
        results.push(executeRollback(plan, adamRoot, { dryRun: args.dryRun }));
      }
    } else if (args.proposalId) {
      const plan = planRollback(appliedDir, args.proposalId);
      results.push(executeRollback(plan, adamRoot, { dryRun: args.dryRun }));
    } else {
      process.stderr.write("adam-rollback: specify --proposal-id or --auto\n");
      process.exitCode = 1;
      return;
    }

    process.stdout.write(JSON.stringify({ rollbacks: results }) + "\n");
  } catch (e) {
    process.stderr.write(`adam-rollback error: ${e.message}\n`);
    process.exitCode = 1;
  }
}
|
||||
|
||||
// Run the CLI only when this file is the script Node was started with, not
// when it is imported. `import.meta.url` is a percent-encoded file URL, so it
// is compared against the URL form of argv[1]; a hand-built `file://` string
// never matches when the path contains spaces or other encoded characters, or
// on Windows, and the script would then silently do nothing.
if (process.argv[1]) {
  const { pathToFileURL } = await import("node:url");
  if (import.meta.url === pathToFileURL(process.argv[1]).href) {
    await main();
  }
}
|
||||
Reference in New Issue
Block a user