Files
claude-adam/adam/scripts/adam-ab-measure.mjs
T
lukaszraczylo d929101af4 fix(v0.6.2): A/B volume normalization + memory frontmatter schema
Two issues surfaced by running ADAM's /reflect loop on a large real journal
(4015 entries, 119 sessions) — both caused false/broken auto-apply behavior.

1. A/B over-reported regressions (adam-ab-measure.mjs).
   Regressions were measured on RAW originating-signal counts pre vs post. On a
   busy, growing journal almost every signal count rises post-apply regardless
   of whether the proposal helped — so the loop flagged 9 false "regressions"
   (and would auto-roll-back good proposals). Now the delta is computed on the
   signal's SHARE of total activity (rate = count / window-total). Falls back to
   the raw-count delta when the signal is the only activity in the window
   (preserves prior behavior + all existing A/B tests). Output adds
   raw_delta_pct, pre_total, post_total, normalized for transparency.

2. Memory frontmatter drift (agents/adam.md, SKILL.md).
   The drafting protocol emitted flat `type:`/`originSessionId:` with a prose
   `name`, but the live auto-memory store uses `name` = slug plus a
   `metadata: {node_type, type, originSessionId}` block. Auto-applied memories
   could fail to load/categorize. Protocol + apply-time validation now require
   the live metadata.* schema and cross-checking against an existing file.

Tests: 132 -> 134. New: volume growth (raw +200%) with flat activity-share
classifies neutral, not regressed; a genuine share increase still classifies
regressed.
2026-05-29 12:37:10 +01:00

228 lines
8.8 KiB
JavaScript
Executable File

#!/usr/bin/env node
// adam-ab-measure.mjs — A/B effectiveness measurement on auto-applied edits.
//
// Reads ~/.claude/adam/ab-tracking.jsonl (one line per auto-apply event,
// written by adam-self-improvement/SKILL.md), then for each entry old enough
// (>= --min-age-days; default 7) compares the originating signals in the window
// BEFORE applied_at against the equally long window AFTER applied_at across the
// full journal corpus (active + rotated). The window length is the entry's
// `pre_window_days` (default 7 days). Surfaces regressions so /reflect can flag
// proposals that made things worse.
//
// Volume normalization: when the windows contain other (non-originating)
// activity, the delta is computed on the signal's SHARE of total activity
// (rate = count / total), not its raw count — so a generally busier journal
// after apply does not masquerade as a regression. When the signal is the only
// activity in the windows, it falls back to the raw-count delta. Output carries
// both `delta_pct` (drives status) and `raw_delta_pct` + `normalized` for
// transparency.
//
// CLI:
// adam-ab-measure.mjs [--home <path>] [--format json|table] [--min-age-days N]
//
// Output (default `table`): aligned columns sorted regressed-first.
// Output (`json`): array of deltas.
// Empty / missing tracking file → empty output, exit 0.
// Exit 1 only on I/O failure.
import { join } from "node:path";
import { homedir } from "node:os";
import { pathToFileURL } from "node:url";
import { readJsonlSafe, listJsonlFiles } from "./adam-utils.mjs";
// Milliseconds in one day; converts day-based windows and ages to epoch-ms spans.
const DAY_MS = 24 * 60 * 60 * 1000;

// Default length, in days, of each comparison window (before and after
// applied_at) when a tracking entry carries no numeric `pre_window_days`.
export const DEFAULT_PRE_WINDOW_DAYS = 7;

// Default minimum age, in days, an applied proposal must reach before it is
// measured; younger entries are reported with status "pending".
export const DEFAULT_MIN_AGE_DAYS = 7;

// Status thresholds applied to delta_pct: at or above REGRESSED_PCT the row is
// "regressed", at or below IMPROVED_PCT it is "improved", otherwise "neutral".
const REGRESSED_PCT = 25;
const IMPROVED_PCT = -25;
/**
 * Parse the CLI argument list into an options record.
 *
 * Recognised flags: `--home <path>`, `--format <value>`, `--min-age-days <N>`
 * and `--help` / `-h`. A value-taking flag that appears last (with no value
 * after it) is ignored, as is any unrecognised argument. `--min-age-days`
 * always consumes its value but only stores it when it converts to a
 * non-negative number.
 *
 * @param {string[]} argv - Arguments after the script name.
 * @returns {{home: (string|null), format: string, minAgeDays: number, help: boolean}}
 */
function parseArgs(argv) {
  const parsed = {
    home: null,
    format: "table",
    minAgeDays: DEFAULT_MIN_AGE_DAYS,
    help: false,
  };
  let idx = 0;
  while (idx < argv.length) {
    const flag = argv[idx];
    const hasValue = idx + 1 < argv.length;
    switch (flag) {
      case "--home":
        if (hasValue) parsed.home = argv[++idx];
        break;
      case "--format":
        if (hasValue) parsed.format = argv[++idx];
        break;
      case "--min-age-days":
        if (hasValue) {
          const days = Number(argv[++idx]);
          if (!Number.isNaN(days) && days >= 0) parsed.minAgeDays = days;
        }
        break;
      case "--help":
      case "-h":
        parsed.help = true;
        break;
      default:
        break;
    }
    idx += 1;
  }
  return parsed;
}
/**
 * Load every journal entry under `<claudeHome>/adam` into one flat array.
 *
 * Reads `<claudeHome>/adam/journal.jsonl` first, then each file returned by
 * listJsonlFiles() for `<claudeHome>/adam/journal`, preserving that order.
 *
 * @param {string} claudeHome - Root directory that contains the `adam` folder.
 * @returns {Array} Entries from all sources, concatenated in read order.
 */
function loadJournalAll(claudeHome) {
  const adamDir = join(claudeHome, "adam");
  const activeJournal = join(adamDir, "journal.jsonl");
  const otherJournals = listJsonlFiles(join(adamDir, "journal"));
  return [activeJournal, ...otherJournals].flatMap((file) => [...readJsonlSafe(file)]);
}
/**
 * Convert a journal entry's `ts` field to epoch milliseconds.
 *
 * @param {*} e - Journal entry; only a string `ts` property is used.
 * @returns {number} Date.parse(e.ts), or NaN when `e` is falsy or `ts` is not
 *   a string (Date.parse itself yields NaN for unparseable strings).
 */
function tsMs(e) {
  const stamp = e?.ts;
  return typeof stamp === "string" ? Date.parse(stamp) : NaN;
}
/**
 * Compute the pre/post effectiveness delta for each auto-applied proposal.
 *
 * Pure function (no I/O). For every usable tracking entry it counts journal
 * entries in the window before `applied_at` and in the equally long window
 * after it, then classifies the change of the originating signals.
 *
 * Entries are skipped silently when they are not objects, when `applied_at`
 * does not convert to a non-zero number, or when it lies outside the range a
 * Date can represent (e.g. Infinity) — the latter would otherwise make
 * toISOString() throw and abort the whole run.
 *
 * @param {Array<object>} entries - ab-tracking records. Fields read:
 *   `applied_at` (epoch ms), `pre_window_days`, `originating_signals`
 *   (array of `{type}`), `proposal_id`, `proposal_type`, `target_skill`.
 * @param {Array<object>} journal - Journal entries from any source; `ts`
 *   (via tsMs) and `type` are read.
 * @param {{now?: number, minAgeDays?: number}} [opts] - `now` is unix ms
 *   (defaults to Date.now()); `minAgeDays` is the minimum age for an entry to
 *   be measured rather than reported as "pending".
 * @returns {Array<object>} One row per usable entry. Pending rows carry null
 *   `pre_count` / `post_count` / `delta_pct`; measured rows also carry
 *   `pre_total`, `post_total`, `raw_delta_pct` and `normalized`. `status` is
 *   one of "pending", "no_baseline", "improved", "regressed", "neutral".
 */
export function computeDeltas(entries, journal, opts = {}) {
  const now = typeof opts.now === "number" ? opts.now : Date.now();
  const minAgeDays = typeof opts.minAgeDays === "number" ? opts.minAgeDays : DEFAULT_MIN_AGE_DAYS;

  // Parse every journal timestamp once, instead of once per tracking entry.
  // Non-object entries and entries without a parseable timestamp can never
  // fall into any window, so they are dropped here.
  const timeline = [];
  for (const je of journal || []) {
    if (!je || typeof je !== "object") continue;
    const t = tsMs(je);
    if (Number.isNaN(t)) continue;
    timeline.push({ t, type: je.type });
  }

  const out = [];
  for (const e of entries || []) {
    if (!e || typeof e !== "object") continue;
    const appliedAt = Number(e.applied_at);
    if (!appliedAt || Number.isNaN(appliedAt)) continue;
    // Out-of-range values (Infinity, 1e300, ...) produce an invalid Date whose
    // toISOString() throws RangeError; treat them as malformed and skip.
    const appliedDate = new Date(appliedAt);
    if (Number.isNaN(appliedDate.getTime())) continue;
    const ageDays = (now - appliedAt) / DAY_MS;
    // Symmetric window: same span applied to pre AND post sides. JSONL schema
    // field stays `pre_window_days` for backward compat with existing
    // ab-tracking.jsonl entries — local name reflects symmetry.
    const windowDays = typeof e.pre_window_days === "number" ? e.pre_window_days : DEFAULT_PRE_WINDOW_DAYS;
    const signals = Array.isArray(e.originating_signals)
      ? e.originating_signals.map((s) => (s && typeof s === "object" ? s.type : null)).filter(Boolean)
      : [];
    const sigSet = new Set(signals);
    const base = {
      proposal_id: e.proposal_id || "",
      proposal_type: e.proposal_type || "",
      target_skill: e.target_skill || "",
      applied_at: appliedAt,
      applied_at_iso: appliedDate.toISOString(),
      signal_types: [...sigSet],
    };
    if (ageDays < minAgeDays) {
      out.push({ ...base, pre_count: null, post_count: null, delta_pct: null, status: "pending" });
      continue;
    }
    const preStart = appliedAt - windowDays * DAY_MS;
    const postEnd = appliedAt + windowDays * DAY_MS;
    // preCount/postCount = originating-signal occurrences; preTotal/postTotal =
    // ALL journal entries in the window (the activity denominator).
    let preCount = 0;
    let postCount = 0;
    let preTotal = 0;
    let postTotal = 0;
    for (const { t, type } of timeline) {
      // Half-open windows: [preStart, appliedAt) and [appliedAt, postEnd).
      const inPre = t >= preStart && t < appliedAt;
      const inPost = t >= appliedAt && t < postEnd;
      if (!inPre && !inPost) continue;
      if (inPre) preTotal++; else postTotal++;
      if (!sigSet.has(type)) continue;
      if (inPre) preCount++; else postCount++;
    }
    let status;
    let deltaPct;
    let rawDeltaPct = null;
    let normalized = false;
    if (preCount === 0) {
      // Nothing to compare against: a percentage change from zero is undefined.
      status = "no_baseline";
      deltaPct = null;
    } else {
      rawDeltaPct = Math.round(((postCount - preCount) / preCount) * 10000) / 100;
      // Volume normalization: when the windows contain non-originating activity,
      // compare the signal's SHARE of activity (rate), not its absolute count —
      // otherwise a generally busier post-window masquerades as a regression.
      // No background (signal IS the only activity) → fall back to raw delta,
      // preserving prior behavior.
      const hasBackground = (preTotal - preCount) + (postTotal - postCount) > 0;
      if (hasBackground && postTotal > 0) {
        const preRate = preCount / preTotal; // preTotal >= preCount > 0
        const postRate = postCount / postTotal;
        deltaPct = ((postRate - preRate) / preRate) * 100;
        normalized = true;
      } else {
        deltaPct = ((postCount - preCount) / preCount) * 100;
      }
      // Round to 2 dp for stable comparison + presentation.
      deltaPct = Math.round(deltaPct * 100) / 100;
      if (deltaPct <= IMPROVED_PCT) status = "improved";
      else if (deltaPct >= REGRESSED_PCT) status = "regressed";
      else status = "neutral";
    }
    out.push({
      ...base,
      pre_count: preCount, post_count: postCount,
      pre_total: preTotal, post_total: postTotal,
      raw_delta_pct: rawDeltaPct, normalized,
      delta_pct: deltaPct, status,
    });
  }
  return out;
}
// Display priority per status (lower sorts first); statuses not listed here
// sort after all known ones.
const STATUS_ORDER = { regressed: 0, neutral: 1, no_baseline: 2, improved: 3, pending: 4 };

/**
 * Return a sorted copy of the delta rows for table display: ordered by status
 * priority (regressed first), then by ascending `applied_at` within a status.
 * The input array is not modified.
 *
 * @param {Array<object>} deltas - Rows with `status` and `applied_at`.
 * @returns {Array<object>} New array in display order.
 */
function sortForTable(deltas) {
  const ordered = Array.from(deltas);
  ordered.sort((left, right) => {
    const leftRank = STATUS_ORDER[left.status] ?? 99;
    const rightRank = STATUS_ORDER[right.status] ?? 99;
    if (leftRank === rightRank) return left.applied_at - right.applied_at;
    return leftRank - rightRank;
  });
  return ordered;
}
/**
 * Stringify `s` and right-pad it with spaces to at least `n` characters.
 * Values already `n` characters or longer are returned unpadded (never cut).
 *
 * @param {*} s - Value to render.
 * @param {number} n - Minimum width.
 * @returns {string}
 */
function padRight(s, n) {
  return String(s).padEnd(n);
}
/**
 * Render delta rows as an aligned plain-text table.
 *
 * Rows are ordered by sortForTable(); each column is as wide as its longest
 * cell or header. Empty string fields and missing values are shown as "-".
 *
 * @param {Array<object>} deltas - Rows as produced by computeDeltas().
 * @returns {string} Header line, divider line and one line per row joined by
 *   "\n" (no trailing newline); "" when there are no rows.
 */
export function formatTable(deltas) {
  if (!deltas || !deltas.length) return "";

  const headers = ["proposal_id", "target", "type", "applied_at(iso)", "pre/post", "delta%", "status"];
  const orDash = (value) => value || "-";

  const cells = sortForTable(deltas).map((row) => {
    const counts = row.pre_count == null ? "-" : `${row.pre_count}/${row.post_count}`;
    const delta = row.delta_pct == null ? "-" : `${row.delta_pct.toFixed(2)}`;
    return [
      orDash(row.proposal_id),
      orDash(row.target_skill),
      orDash(row.proposal_type),
      orDash(row.applied_at_iso),
      counts,
      delta,
      orDash(row.status),
    ];
  });

  // Column width = longest of the header and every cell in that column.
  const widths = headers.map((title, col) => {
    let widest = title.length;
    for (const line of cells) widest = Math.max(widest, String(line[col]).length);
    return widest;
  });

  const renderRow = (fields) => fields.map((field, col) => padRight(field, widths[col])).join(" | ");
  const divider = widths.map((w) => "-".repeat(w)).join("-+-");

  return [renderRow(headers), divider, ...cells.map((line) => renderRow(line))].join("\n");
}
/**
 * Serialise delta rows as compact (single-line) JSON.
 *
 * @param {Array<object>} deltas - Rows to serialise; any falsy value is
 *   rendered as an empty array.
 * @returns {string} JSON text.
 */
export function formatJson(deltas) {
  const payload = deltas ? deltas : [];
  return JSON.stringify(payload);
}
/**
 * CLI entry point: read the tracking file, compute deltas against the full
 * journal and print them in the requested format.
 *
 * Exit status is communicated through process.exitCode rather than
 * process.exit(): process.exit() terminates immediately and can drop output
 * that process.stdout.write() has queued but not yet flushed (stdout is
 * asynchronous for some stream types, e.g. pipes on Windows). Setting
 * exitCode and returning lets the process end on its own once the writes
 * have completed.
 *
 * Exit code is 0 on success (including --help and an empty tracking file)
 * and 1 when reading or computing throws.
 */
function main() {
  const args = parseArgs(process.argv.slice(2));
  if (args.help) {
    process.stdout.write("usage: adam-ab-measure.mjs [--home <path>] [--format json|table] [--min-age-days N]\n");
    process.exitCode = 0;
    return;
  }
  const claudeHome = args.home || join(homedir(), ".claude");
  const trackingPath = join(claudeHome, "adam", "ab-tracking.jsonl");
  try {
    const entries = readJsonlSafe(trackingPath);
    if (!entries.length) {
      // Empty input: json mode still emits a valid empty array, table mode
      // prints nothing.
      if (args.format === "json") process.stdout.write("[]\n");
      process.exitCode = 0;
      return;
    }
    const journal = loadJournalAll(claudeHome);
    const deltas = computeDeltas(entries, journal, { minAgeDays: args.minAgeDays });
    // Any --format value other than "json" renders the table.
    const out = args.format === "json" ? formatJson(deltas) : formatTable(deltas);
    if (out) process.stdout.write(out + "\n");
    process.exitCode = 0;
  } catch (e) {
    // Non-Error throwables have no .message; fall back to their string form.
    const reason = e instanceof Error ? e.message : String(e);
    process.stderr.write(`adam-ab-measure error: ${reason}\n`);
    process.exitCode = 1;
  }
}
// Run main() only when this file is the script Node was started with (not
// when it is imported, e.g. by tests). import.meta.url is a percent-encoded
// file: URL, so the script path must be converted with pathToFileURL() before
// comparing — plain `file://${path}` concatenation never matches for paths
// containing spaces or other characters that need encoding, nor for Windows
// drive-letter paths, and the CLI would then silently do nothing.
if (process.argv[1] && import.meta.url === pathToFileURL(process.argv[1]).href) {
  main();
}