feat: apply MOSS-grounded self-evolution improvements to ADAM

Implements 7 improvements grounded in the MOSS paper (arXiv 2605.22794):

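1. Transcript capture (§3.4): context_ring buffer in adam-observe.mjs
   keeps the 8 most recent events (user prompts and tool calls, with
   text excerpts truncated to 200 chars); when a struggle signal is
   emitted, a snapshot of the ring is attached to its journal entry as
   context_window.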

2. Evidence batching (§3.1): new adam-batch.mjs pre-clusters windowed
   journal entries into coherent failure batches by (signal_type, cluster_key).

3. Multi-stage analysis (§3.3): SKILL.md dispatches adam agent in two
   stages (diagnose+plan → implement) with inter-stage validation gate.

4. Pre-apply verification (§3.4): 4-check deterministic gate before
   auto-apply (source entries exist, diagnosis grounded, type-evidence
   match, no conflicting recent proposals).

5. Auto-rollback (§3.5): new adam-rollback.mjs reverts regressed proposals
   detected by A/B measurement, creates regression nudges.

6. Harness self-modification (§1 Table 1): new harness_edit proposal type
   targeting adam's own scripts with stricter gates (confidence≥5, never
   auto-apply, test-suite-gated).

7. Keypoint matrix evaluation (§4.2): 5 capability dimensions
   (tool_selection, scope_discipline, error_recovery, first_attempt,
   build_reliability) scored per batch for structured evaluation.

Test suite: 94 → 114 tests (20 new), all passing.
This commit is contained in:
2026-05-24 11:15:32 +01:00
parent a48c705c0a
commit 440fb52eb1
7 changed files with 1038 additions and 20 deletions
+42 -2
View File
@@ -107,10 +107,15 @@ const CLEAN_RECOVERY_WINDOW = 3;
const SILENT_DRIFT_THRESHOLD = 5;
const ERROR_AFTER_RECOVERY_WINDOW = 5;
const RECENT_RECOVERIES_MAX = 3;
const STRUGGLE_TYPES = new Set(["tool_error_loop", "dead_end", "retry_loop"]);
const STRUGGLE_TYPES = new Set([
"tool_error_loop", "dead_end", "retry_loop", "weak_agent",
"edit_churn", "build_loop", "silent_drift", "error_after_recovery",
]);
const ACTIVE_SKILLS_LOOKBACK = 10;
const TASK_TOOL_MIN = 5;
const TASK_DIVERSITY_MIN = 3;
const CONTEXT_RING_SIZE = 8;
const CONTEXT_EXCERPT_LEN = 200;
const STATE_MAX_BYTES = 1_000_000;
function safeRead(path, fallback) {
@@ -226,6 +231,20 @@ function activeNames(state, kind) {
return [...seen];
}
/**
 * Return a short excerpt of `text`, at most `len` UTF-16 code units long,
 * followed by "…" when anything was cut off.
 *
 * @param {*} text - Value to excerpt; anything that is not a non-empty
 *   string yields `null`.
 * @param {number} len - Maximum number of code units to keep before the
 *   ellipsis.
 * @returns {string|null} The original string when it already fits, the
 *   truncated string plus "…" otherwise, or `null` for empty/non-string input.
 */
function excerpt(text, len) {
  if (!text || typeof text !== "string") return null;
  if (!(text.length > len)) return text;
  let end = len;
  // If the cut would land between the two halves of a surrogate pair, back
  // off by one so the excerpt never ends in a lone high surrogate.
  if (end > 0) {
    const lastUnit = text.charCodeAt(end - 1);
    if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) end -= 1;
  }
  return `${text.slice(0, end)}…`;
}
/**
 * Append `entry` to the rolling context ring on `state`, discarding the
 * oldest entries so the ring never holds more than CONTEXT_RING_SIZE items.
 *
 * @param {{context_ring: Array}} state - State object; its `context_ring`
 *   array is mutated in place.
 * @param {*} entry - Event record to append.
 */
function pushContext(state, entry) {
  const ring = state.context_ring;
  ring.push(entry);
  // Trim all overflow at once: a ring that is already oversized would
  // otherwise never shrink back to the cap with one removal per push.
  const overflow = ring.length - CONTEXT_RING_SIZE;
  if (overflow > 0) ring.splice(0, overflow);
}
/**
 * Take a shallow copy of the current context ring.
 *
 * @param {{context_ring: Array}} state - State object holding the ring.
 * @returns {Array|undefined} A new array with the same entry objects, or
 *   `undefined` when the ring is empty.
 */
function snapshotContext(state) {
  const ring = state.context_ring;
  if (!ring.length) return undefined;
  return ring.slice();
}
function errorFingerprint(toolResponse) {
if (!toolResponse) return null;
let text = "";
@@ -290,6 +309,7 @@ function resetSessionLocal(state) {
state.recentRecoveries = [];
state.session_post_count = 0;
state.tool_window = [];
state.context_ring = [];
state.task_tool_kinds = {};
state.task_tool_count = 0;
state.task_corrections = 0;
@@ -316,6 +336,7 @@ function ensureStateDefaults(state) {
if (typeof state.silentDriftEmitted !== "boolean") state.silentDriftEmitted = false;
if (!Array.isArray(state.recentRecoveries)) state.recentRecoveries = [];
if (typeof state.session_post_count !== "number") state.session_post_count = 0;
if (!Array.isArray(state.context_ring)) state.context_ring = [];
}
function main() {
@@ -340,6 +361,7 @@ function main() {
if (event === "UserPromptSubmit") {
const prompt = (input.prompt || "").slice(0, 200);
pushContext(state, { event: "user", prompt: excerpt(prompt, CONTEXT_EXCERPT_LEN), ts });
if (isCorrection(prompt)) {
const last = state.tool_window[state.tool_window.length - 1] || {};
appendJournal({
@@ -406,9 +428,27 @@ function main() {
const argsHash = djb2(JSON.stringify(input.tool_input || {}));
const file = (input.tool_input && (input.tool_input.file_path || input.tool_input.path)) || null;
const toolResponse = input.tool_response;
const respExcerpt = (() => {
if (!toolResponse) return null;
const text = typeof toolResponse === "string" ? toolResponse
: typeof toolResponse.content === "string" ? toolResponse.content
: null;
return excerpt(text, CONTEXT_EXCERPT_LEN);
})();
pushContext(state, {
event: "tool", tool, ts,
input_excerpt: excerpt(JSON.stringify(input.tool_input || {}), CONTEXT_EXCERPT_LEN),
response_excerpt: respExcerpt,
is_error: !!(toolResponse && toolResponse.is_error),
});
let struggleEmittedThisTurn = null;
const emit = (entry) => {
if (STRUGGLE_TYPES.has(entry.type)) struggleEmittedThisTurn = entry.type;
if (STRUGGLE_TYPES.has(entry.type)) {
entry.context_window = snapshotContext(state);
struggleEmittedThisTurn = entry.type;
}
appendJournal(entry);
};