Files
claude-adam/adam/scripts/adam-ab-measure.mjs
T
lukaszraczylo 012c40b9ab chore(v0.3.3): analyst observability, A/B measurement, journal hygiene
Storage/window/exclusion split (#7): ISO-week journal rotation with safety
fuse replaces size-based rotation (fixes silent under-counting when clusters
straddle boundaries). Per-signal sliding windows via adam-window.mjs guard
against stale signal accumulation. Legacy YYYY-MM-DD-<ts>.jsonl files remain
readable.

Error fingerprint normalization (#3): adam-observe.mjs extracts canonical
error codes (ENOENT, ECONNREFUSED, etc.) and normalizes paths/timestamps/hex
before hashing. 'Connection refused' and 'ECONNREFUSED' now cluster identically.

Correction corpus expansion (#1): strong tokens (stop, wrong, undo, try again,
different approach, etc.) fire on any occurrence. Weak tokens (no, actually,
wait) require negation/contrast co-occurrence within 8 tokens. Kills the
'actually, I think...' false positive.

Analyst observability (#6): mandatory clustering trace block; adam-explain.mjs
parses to summary/full/json. Cluster decisions now surface rejection reasons
(threshold, contradiction, window). Persisted to ~/.claude/adam/last-trace.txt.

Dead_end nudge proposal type (#2): single-session auto-apply gate (>=3
dead_end events). Action appends to active-nudges.json, surfaced via
adam-nudge.mjs at next SessionStart. Lower blast radius than skill_edit.

Per-(skill, fingerprint) cooldown (#4): adam-cooldown.mjs replaces coarse
per-skill check. proposal_fingerprint = djb2(skill_slug + cluster_id +
normalized_diff_body). Legacy applied/rejected records gate via 'legacy'
fingerprint fallback through resolveSkill helper (handles target_skill,
skill, or target: <path>).

task_completed scoring integration (#8): adam-score.mjs computes per-session
urgency dampener (3 task_completed -> 0.5) and reinforcement candidates
(skills cited in >=3 clean completions). New 'reinforcement' proposal type
appends to reinforcements.jsonl on apply (no code/memory mutation).

A/B effectiveness measurement (#5): every auto-applied edit appends to
ab-tracking.jsonl. adam-ab-measure.mjs computes 7d pre/post signal-count
delta per entry (improved / neutral / regressed / no_baseline / pending).
Analyst surfaces regressions at top of /reflect output.

Upgrade UX overhaul (#9): adam-upgrade.mjs implements --list/--diff/--accept
/--accept-all. SessionStart nudge prints pending-merge warning when
.adam-new files exist (latency ~20ms via fixed shortlist). install.sh
emits unmissable final-message hint after creating any .adam-new file.

Simplify pass: adam-utils.mjs deduplicates readJsonlSafe / listJsonlFiles /
parseFrontmatter across 8 scripts. Net -46 LOC.

Test coverage: 30 -> 87 tests. Every new feature has feature-validating
assertions (false-case coverage included). T77 statically verifies install.sh
references every adam-*.mjs source script (would have caught the missing
adam-utils inclusion that review #2 surfaced).
2026-05-13 01:02:33 +01:00

191 lines
7.1 KiB
JavaScript
Executable File

#!/usr/bin/env node
// adam-ab-measure.mjs — A/B effectiveness measurement on auto-applied edits.
//
// Reads ~/.claude/adam/ab-tracking.jsonl (one line per auto-apply event,
// written by adam-self-improvement/SKILL.md), then for each entry old enough
// (>= --min-age-days; default 7) compares signal counts in the window BEFORE
// applied_at against the equally sized window AFTER applied_at (span = the
// entry's pre_window_days, default 7 days) across the
// full journal corpus (active + rotated). Surfaces regressions so /reflect
// can flag proposals that made things worse.
//
// CLI:
// adam-ab-measure.mjs [--home <path>] [--format json|table] [--min-age-days N]
//
// Output (default `table`): aligned columns sorted regressed-first.
// Output (`json`): array of deltas.
// Empty / missing tracking file → empty output, exit 0.
// Exit 1 only on I/O failure.
import { realpathSync } from "node:fs";
import { homedir } from "node:os";
import { join } from "node:path";
import { pathToFileURL } from "node:url";
import { readJsonlSafe, listJsonlFiles } from "./adam-utils.mjs";
// Milliseconds in one day (24 * 60 * 60 * 1000).
const DAY_MS = 86400000;
// Window span in days used when a tracking entry has no numeric `pre_window_days`.
export const DEFAULT_PRE_WINDOW_DAYS = 7;
// Minimum entry age in days before it is measured; younger entries are reported as "pending".
export const DEFAULT_MIN_AGE_DAYS = 7;
// A delta_pct at or above this value is classified "regressed".
const REGRESSED_PCT = 25;
// A delta_pct at or below this value is classified "improved".
const IMPROVED_PCT = -25;
/**
 * Parse the CLI argument vector into an options object.
 *
 * Recognised flags: `--home <path>`, `--format <value>`, `--min-age-days <N>`,
 * `--help` / `-h`. A value-taking flag that is the last argument is ignored,
 * as are unknown arguments. `--min-age-days` only takes effect when its value
 * converts to a number >= 0; otherwise the default is kept.
 *
 * @param {string[]} argv - arguments after the script name.
 * @returns {{home: (string|null), format: string, minAgeDays: number, help: boolean}}
 */
function parseArgs(argv) {
  const parsed = { home: null, format: "table", minAgeDays: DEFAULT_MIN_AGE_DAYS, help: false };
  let pos = 0;
  while (pos < argv.length) {
    const flag = argv[pos];
    const hasValue = pos + 1 < argv.length;
    switch (flag) {
      case "--home":
        if (hasValue) parsed.home = argv[++pos];
        break;
      case "--format":
        if (hasValue) parsed.format = argv[++pos];
        break;
      case "--min-age-days":
        if (hasValue) {
          // The value is consumed even when it turns out to be unusable.
          const days = Number(argv[++pos]);
          // NaN fails this comparison, so no separate NaN check is needed.
          if (days >= 0) parsed.minAgeDays = days;
        }
        break;
      case "--help":
      case "-h":
        parsed.help = true;
        break;
      default:
        break;
    }
    pos += 1;
  }
  return parsed;
}
/**
 * Collect every journal entry under `<claudeHome>/adam`: first the entries of
 * `journal.jsonl`, then those of each file returned by `listJsonlFiles` for
 * the `journal` subdirectory, in that order.
 *
 * @param {string} claudeHome - directory that contains the `adam` folder.
 * @returns {Array} all entries yielded by `readJsonlSafe`, concatenated.
 */
function loadJournalAll(claudeHome) {
  const adamRoot = join(claudeHome, "adam");
  const activePath = join(adamRoot, "journal.jsonl");
  const rotatedPaths = listJsonlFiles(join(adamRoot, "journal"));
  // Copy each file's entries so flatMap only flattens the per-file level.
  return [activePath, ...rotatedPaths].flatMap((file) => [...readJsonlSafe(file)]);
}
/**
 * Read a journal entry's `ts` field as epoch milliseconds.
 *
 * @param {*} e - journal entry; only a string `ts` property is considered.
 * @returns {number} `Date.parse(e.ts)`, or NaN when `e` is falsy or `ts` is not a string.
 */
function tsMs(e) {
  const stamp = e ? e.ts : undefined;
  return typeof stamp === "string" ? Date.parse(stamp) : NaN;
}
/**
 * Compute the pre/post signal-count delta for each A/B tracking entry.
 *
 * Pure function: reads its arguments (and the clock only when `opts.now` is
 * not supplied) and returns a new array.
 *
 * For every usable entry the journal is scanned for events whose `type` is one
 * of the entry's `originating_signals[].type` values. Events are counted in
 * the half-open windows [applied_at - span, applied_at) ("pre") and
 * [applied_at, applied_at + span) ("post"), where span is the entry's numeric
 * `pre_window_days` or DEFAULT_PRE_WINDOW_DAYS.
 *
 * Status values:
 *   - "pending"     entry is younger than `minAgeDays`; counts are null.
 *   - "no_baseline" no matching events in the pre window; delta_pct is null.
 *   - "improved"    delta_pct <= IMPROVED_PCT.
 *   - "regressed"   delta_pct >= REGRESSED_PCT.
 *   - "neutral"     anything in between.
 *
 * Entries are skipped when they are not objects or when `applied_at` is
 * missing, zero, NaN, infinite, or outside the range a Date can represent.
 *
 * @param {Array<object>} entries - ab-tracking objects (null/undefined is treated as empty).
 * @param {Array<object>} journal - journal entries from any source (null/undefined is treated as empty).
 * @param {{now?: number, minAgeDays?: number}} [opts] - `now` is unix ms; `minAgeDays` is the floor for non-pending.
 * @returns {Array<object>} one delta record per usable entry, in input order.
 */
export function computeDeltas(entries, journal, opts = {}) {
  // `opts = {}` only covers undefined; tolerate an explicit null as well.
  const options = opts ?? {};
  const now = typeof options.now === "number" ? options.now : Date.now();
  const minAgeDays = typeof options.minAgeDays === "number" ? options.minAgeDays : DEFAULT_MIN_AGE_DAYS;

  // Parse journal timestamps once instead of once per tracking entry.
  const events = [];
  for (const je of journal || []) {
    if (!je || typeof je !== "object") continue;
    const t = tsMs(je);
    if (Number.isNaN(t)) continue;
    events.push({ type: je.type, t });
  }

  const out = [];
  for (const e of entries || []) {
    if (!e || typeof e !== "object") continue;
    const appliedAt = Number(e.applied_at);
    if (!appliedAt || !Number.isFinite(appliedAt)) continue;
    // A finite number can still be outside the Date range; toISOString() would
    // throw RangeError and abort the whole report because of one bad record.
    const appliedDate = new Date(appliedAt);
    if (Number.isNaN(appliedDate.getTime())) continue;
    const ageDays = (now - appliedAt) / DAY_MS;
    // Symmetric window: same span applied to pre AND post sides. JSONL schema
    // field stays `pre_window_days` for backward compat with existing
    // ab-tracking.jsonl entries — local name reflects symmetry.
    const windowDays = typeof e.pre_window_days === "number" ? e.pre_window_days : DEFAULT_PRE_WINDOW_DAYS;
    const signals = Array.isArray(e.originating_signals)
      ? e.originating_signals.map((s) => (s && typeof s === "object" ? s.type : null)).filter(Boolean)
      : [];
    const sigSet = new Set(signals);
    const base = {
      proposal_id: e.proposal_id || "",
      proposal_type: e.proposal_type || "",
      target_skill: e.target_skill || "",
      applied_at: appliedAt,
      applied_at_iso: appliedDate.toISOString(),
      signal_types: [...sigSet],
    };
    if (ageDays < minAgeDays) {
      out.push({ ...base, pre_count: null, post_count: null, delta_pct: null, status: "pending" });
      continue;
    }
    const preStart = appliedAt - windowDays * DAY_MS;
    const postEnd = appliedAt + windowDays * DAY_MS;
    let preCount = 0;
    let postCount = 0;
    for (const ev of events) {
      if (!sigSet.has(ev.type)) continue;
      if (ev.t >= preStart && ev.t < appliedAt) preCount++;
      else if (ev.t >= appliedAt && ev.t < postEnd) postCount++;
    }
    let status;
    let deltaPct;
    if (preCount === 0) {
      // No pre-window events: a percentage change is undefined.
      status = "no_baseline";
      deltaPct = null;
    } else {
      deltaPct = ((postCount - preCount) / preCount) * 100;
      // Round to 2 dp for stable comparison + presentation.
      deltaPct = Math.round(deltaPct * 100) / 100;
      if (deltaPct <= IMPROVED_PCT) status = "improved";
      else if (deltaPct >= REGRESSED_PCT) status = "regressed";
      else status = "neutral";
    }
    out.push({ ...base, pre_count: preCount, post_count: postCount, delta_pct: deltaPct, status });
  }
  return out;
}
// Display rank per status: lower ranks are listed first, so regressions lead.
const STATUS_ORDER = { regressed: 0, neutral: 1, no_baseline: 2, improved: 3, pending: 4 };
/**
 * Return a sorted copy of `deltas`: by status rank first (unknown statuses
 * last, rank 99), then by ascending `applied_at`. The input is not mutated.
 *
 * @param {Iterable<object>} deltas - delta records.
 * @returns {Array<object>} new sorted array.
 */
function sortForTable(deltas) {
  const rankOf = (delta) => STATUS_ORDER[delta.status] ?? 99;
  const byRankThenTime = (left, right) => {
    const leftRank = rankOf(left);
    const rightRank = rankOf(right);
    return leftRank !== rightRank ? leftRank - rightRank : left.applied_at - right.applied_at;
  };
  const copy = [...deltas];
  copy.sort(byRankThenTime);
  return copy;
}
/**
 * Stringify `s` and pad it on the right with spaces up to `n` characters.
 * Values already `n` characters or longer are returned unpadded.
 *
 * @param {*} s - value to render.
 * @param {number} n - target width.
 * @returns {string}
 */
function padRight(s, n) {
  return String(s).padEnd(n, " ");
}
/**
 * Render delta records as an aligned text table.
 *
 * Rows are ordered by `sortForTable`. Empty or missing fields print as "-";
 * `pre/post` prints "-" when `pre_count` is null/undefined and `delta%`
 * prints "-" when `delta_pct` is null/undefined. Columns are separated by
 * " | ", and a dashed rule joined with "-+-" follows the header line.
 *
 * @param {Array<object>} deltas - delta records.
 * @returns {string} the table without a trailing newline, or "" for empty input.
 */
export function formatTable(deltas) {
  if (!deltas || !deltas.length) return "";
  const headers = ["proposal_id", "target", "type", "applied_at(iso)", "pre/post", "delta%", "status"];
  const orDash = (value) => value || "-";
  const body = sortForTable(deltas).map((d) => {
    const counts = d.pre_count == null ? "-" : `${d.pre_count}/${d.post_count}`;
    const pct = d.delta_pct == null ? "-" : d.delta_pct.toFixed(2);
    return [
      orDash(d.proposal_id),
      orDash(d.target_skill),
      orDash(d.proposal_type),
      orDash(d.applied_at_iso),
      counts,
      pct,
      orDash(d.status),
    ];
  });
  // Each column is as wide as its header or its longest cell, whichever is larger.
  const widths = headers.map((header, col) =>
    body.reduce((widest, row) => Math.max(widest, String(row[col]).length), header.length),
  );
  const renderRow = (cells) => cells.map((cell, col) => padRight(cell, widths[col])).join(" | ");
  const rule = widths.map((w) => "-".repeat(w)).join("-+-");
  const lines = [renderRow(headers), rule];
  for (const row of body) lines.push(renderRow(row));
  return lines.join("\n");
}
/**
 * Serialise delta records as compact JSON; a falsy argument yields "[]".
 *
 * @param {Array<object>} deltas - delta records.
 * @returns {string} JSON text without a trailing newline.
 */
export function formatJson(deltas) {
  const payload = deltas ? deltas : [];
  return JSON.stringify(payload);
}
/**
 * CLI entry point: parse arguments, load the tracking file and journals,
 * compute deltas, and print them in the requested format.
 *
 * Output: usage text for --help; "[]" in json mode or nothing in table mode
 * when the tracking file yields no entries; otherwise the formatted deltas.
 * Any format value other than "json" is rendered as a table.
 *
 * Exit status is 0 on success and 1 when an exception is caught. The status
 * is set through `process.exitCode` and the function returns, rather than
 * calling `process.exit()`: a forced exit can drop stdout data that has not
 * been flushed yet (for example when the output is piped to another process).
 */
function main() {
  const args = parseArgs(process.argv.slice(2));
  if (args.help) {
    process.stdout.write("usage: adam-ab-measure.mjs [--home <path>] [--format json|table] [--min-age-days N]\n");
    return;
  }
  const claudeHome = args.home || join(homedir(), ".claude");
  const trackingPath = join(claudeHome, "adam", "ab-tracking.jsonl");
  try {
    const entries = readJsonlSafe(trackingPath);
    if (!entries.length) {
      // table mode prints nothing on empty input — exit 0.
      if (args.format === "json") process.stdout.write("[]\n");
      return;
    }
    const journal = loadJournalAll(claudeHome);
    const deltas = computeDeltas(entries, journal, { minAgeDays: args.minAgeDays });
    const out = args.format === "json" ? formatJson(deltas) : formatTable(deltas);
    if (out) process.stdout.write(out + "\n");
  } catch (e) {
    // Non-Error throwables have no `.message`; fall back to their string form.
    const reason = e instanceof Error ? e.message : String(e);
    process.stderr.write(`adam-ab-measure error: ${reason}\n`);
    process.exitCode = 1;
  }
}
/**
 * Report whether this module is the script Node was launched with.
 *
 * `import.meta.url` is a percent-encoded file URL, so comparing it against a
 * hand-built `file://${process.argv[1]}` string fails for paths containing
 * spaces or other encoded characters and for Windows paths. The launch path
 * is therefore converted with `pathToFileURL`. Its symlink-resolved form is
 * tried as well, since the module URL may refer to the real path when the
 * script is started through a symlink.
 *
 * @returns {boolean} true when the module URL matches the launch path.
 */
function isMainModule() {
  const entry = process.argv[1];
  if (!entry) return false;
  const candidates = [entry];
  try {
    candidates.push(realpathSync(entry));
  } catch {
    // The launch path may not resolve on disk; compare the raw path only.
  }
  return candidates.some((candidate) => pathToFileURL(candidate).href === import.meta.url);
}
if (isMainModule()) {
  main();
}