mirror of
https://github.com/lukaszraczylo/claude-adam.git
synced 2026-06-30 02:54:34 +00:00
fix(v0.6.2): A/B volume normalization + memory frontmatter schema
Two issues surfaced by running ADAM's /reflect loop on a large real journal
(4015 entries, 119 sessions) — both caused false/broken auto-apply behavior.
1. A/B over-reported regressions (adam-ab-measure.mjs).
Regressions were measured on RAW originating-signal counts pre vs post. On a
busy, growing journal almost every signal count rises post-apply regardless
of whether the proposal helped — so the loop flagged 9 false "regressions"
(and would auto-roll-back good proposals). Now the delta is computed on the
signal's SHARE of total activity (rate = count / window-total). Falls back to
the raw-count delta when the signal is the only activity in both windows, or
when the post window is empty and a rate cannot be computed (preserves prior
behavior + all existing A/B tests). Output adds `raw_delta_pct`, `pre_total`,
`post_total`, and `normalized` for transparency.
2. Memory frontmatter drift (agents/adam.md, SKILL.md).
The drafting protocol emitted flat `type:`/`originSessionId:` with a prose
`name`, but the live auto-memory store uses `name` = slug plus a
`metadata: {node_type, type, originSessionId}` block. Auto-applied memories
could fail to load/categorize. Protocol + apply-time validation now require
the live metadata.* schema and cross-checking against an existing file.
Tests: 132 -> 134. New: volume growth (raw +200%) with a flat activity share
is classified as neutral, not regressed; a genuine share increase is still
classified as regressed.
This commit is contained in:
@@ -3,11 +3,19 @@
|
||||
//
|
||||
// Reads ~/.claude/adam/ab-tracking.jsonl (one line per auto-apply event,
|
||||
// written by adam-self-improvement/SKILL.md), then for each entry old enough
|
||||
// (>= --min-age-days; default 7) compares signal counts in the 7-day window
|
||||
// BEFORE applied_at against the 7-day window AFTER applied_at across the
|
||||
// (>= --min-age-days; default 7) compares the originating signal in the 7-day
|
||||
// window BEFORE applied_at against the 7-day window AFTER applied_at across the
|
||||
// full journal corpus (active + rotated). Surfaces regressions so /reflect
|
||||
// can flag proposals that made things worse.
|
||||
//
|
||||
// Volume normalization: when the windows contain other (non-originating)
|
||||
// activity, the delta is computed on the signal's SHARE of total activity
|
||||
// (rate = count / total), not its raw count — so a generally busier journal
|
||||
// after apply does not masquerade as a regression. When the signal is the only
|
||||
// activity in the windows (or the post window is empty), it falls back to the raw-count delta. Output carries
|
||||
// both `delta_pct` (drives status) and `raw_delta_pct` + `normalized` for
|
||||
// transparency.
|
||||
//
|
||||
// CLI:
|
||||
// adam-ab-measure.mjs [--home <path>] [--format json|table] [--min-age-days N]
|
||||
//
|
||||
@@ -92,31 +100,60 @@ export function computeDeltas(entries, journal, opts = {}) {
|
||||
|
||||
const preStart = appliedAt - windowDays * DAY_MS;
|
||||
const postEnd = appliedAt + windowDays * DAY_MS;
|
||||
// preCount/postCount = originating-signal occurrences; preTotal/postTotal =
|
||||
// ALL journal entries in the window (the activity denominator).
|
||||
let preCount = 0;
|
||||
let postCount = 0;
|
||||
let preTotal = 0;
|
||||
let postTotal = 0;
|
||||
for (const je of journal || []) {
|
||||
if (!je || typeof je !== "object") continue;
|
||||
if (!sigSet.has(je.type)) continue;
|
||||
const t = tsMs(je);
|
||||
if (Number.isNaN(t)) continue;
|
||||
if (t >= preStart && t < appliedAt) preCount++;
|
||||
else if (t >= appliedAt && t < postEnd) postCount++;
|
||||
const inPre = t >= preStart && t < appliedAt;
|
||||
const inPost = t >= appliedAt && t < postEnd;
|
||||
if (!inPre && !inPost) continue;
|
||||
if (inPre) preTotal++; else postTotal++;
|
||||
if (!sigSet.has(je.type)) continue;
|
||||
if (inPre) preCount++; else postCount++;
|
||||
}
|
||||
|
||||
let status;
|
||||
let deltaPct;
|
||||
let rawDeltaPct = null;
|
||||
let normalized = false;
|
||||
if (preCount === 0) {
|
||||
status = "no_baseline";
|
||||
deltaPct = null;
|
||||
} else {
|
||||
deltaPct = ((postCount - preCount) / preCount) * 100;
|
||||
rawDeltaPct = Math.round(((postCount - preCount) / preCount) * 10000) / 100;
|
||||
// Volume normalization: when the windows contain non-originating activity,
|
||||
// compare the signal's SHARE of activity (rate), not its absolute count —
|
||||
// otherwise a generally busier post-window masquerades as a regression.
|
||||
// No background (signal IS the only activity) → fall back to raw delta,
|
||||
// preserving prior behavior.
|
||||
const hasBackground = (preTotal - preCount) + (postTotal - postCount) > 0;
|
||||
if (hasBackground && postTotal > 0) {
|
||||
const preRate = preCount / preTotal; // preTotal >= preCount > 0
|
||||
const postRate = postCount / postTotal;
|
||||
deltaPct = ((postRate - preRate) / preRate) * 100;
|
||||
normalized = true;
|
||||
} else {
|
||||
deltaPct = ((postCount - preCount) / preCount) * 100;
|
||||
}
|
||||
// Round to 2 dp for stable comparison + presentation.
|
||||
deltaPct = Math.round(deltaPct * 100) / 100;
|
||||
if (deltaPct <= IMPROVED_PCT) status = "improved";
|
||||
else if (deltaPct >= REGRESSED_PCT) status = "regressed";
|
||||
else status = "neutral";
|
||||
}
|
||||
out.push({ ...base, pre_count: preCount, post_count: postCount, delta_pct: deltaPct, status });
|
||||
out.push({
|
||||
...base,
|
||||
pre_count: preCount, post_count: postCount,
|
||||
pre_total: preTotal, post_total: postTotal,
|
||||
raw_delta_pct: rawDeltaPct, normalized,
|
||||
delta_pct: deltaPct, status,
|
||||
});
|
||||
}
|
||||
return out;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user