From 4d1276a73f613ec70c0c4cb95892fe202f2f9903 Mon Sep 17 00:00:00 2001 From: Lukasz Raczylo Date: Tue, 2 Jun 2026 01:47:40 +0100 Subject: [PATCH] feat(v0.6.5): execution-grounded skill-utility report (adam-skill-utility) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ranks skills by good:bad outcome co-occurrence (Wilson LB + lift vs baseline) over the journal's active_skills payloads — the SkillsInjector (arXiv 2605.29794) execution-grounded utility signal Δ(s), computed from data already collected, no training. - reuses adam-score NEGATIVE_SIGNAL_TYPES + entrySeverity (single source of truth) - registered in install.sh helper-script copy loop - /reflect pre-step surfaces worst below-baseline skills to the USER as advisory (co-occurrence != causation; not fed to the analyst's proposal machinery) - Test 119 added; full suite 141/141 green --- adam/scripts/adam-skill-utility.mjs | 272 ++++++++++++++++++++++++++ adam/tests/run-tests.sh | 28 +++ install.sh | 3 +- skills/adam-self-improvement/SKILL.md | 8 + 4 files changed, 310 insertions(+), 1 deletion(-) create mode 100755 adam/scripts/adam-skill-utility.mjs diff --git a/adam/scripts/adam-skill-utility.mjs b/adam/scripts/adam-skill-utility.mjs new file mode 100755 index 0000000..b3d6d85 --- /dev/null +++ b/adam/scripts/adam-skill-utility.mjs @@ -0,0 +1,272 @@ +#!/usr/bin/env node +// adam-skill-utility.mjs — execution-grounded per-skill utility report. +// +// Inspired by SkillsInjector (arXiv 2605.29794v1), which shows skill injection +// should be driven by execution-grounded *utility* Δ(t,s), not surface keyword +// match — and that some topically-relevant skills actively *lower* success. +// The paper learns Δ(t,s) from rollout outcomes. We don't train anything: the +// adam journal already attaches `active_skills` to both positive outcome events +// (task_completed, clean_recovery, correction_free_streak) and negative ones +// (dead_end, tool_error_loop, …). 
So we approximate Δ(s) as a co-occurrence +// ratio over the data we already collect. +// +// CAVEAT (honest): this is CO-OCCURRENCE, not causation. A skill active during +// a dead_end did not necessarily cause it. Read the report as "which skills +// correlate with friction", a prompt for review — never as proof. +// +// Metric, per skill active during scored events: +// pos / neg — count of positive / negative outcome events it co-occurred with +// share — pos / (pos+neg) +// lift — share − global_baseline (>0 above baseline, <0 below) +// wLB — Wilson 95% lower bound of the positive proportion; ranks +// *reliably* below-baseline skills to the top (sample-aware) +// sevNeg — severity-weighted negative sum (adam SEVERITY_DIVISORS) +// topNeg — dominant negative event type +// Rows sorted worst-first (lowest wLB) so harmful/over-eager skills surface. +// +// CLI: +// adam-skill-utility.mjs [--home DIR] [--input FILE] +// [--min N] [--days D] [--json] +// --min N min event count (n) to treat a skill's signal as confident (default 8) +// --days D only consider events within the last D days (default: all) +// --json emit machine-readable JSON instead of the text table +// +// Reuses adam-utils (jsonl IO) and adam-score (canonical NEGATIVE set + +// severity), so the positive/negative taxonomy stays single-sourced. + +import { readFileSync } from "node:fs"; +import { join } from "node:path"; +import { homedir } from "node:os"; +import { readJsonlSafe, listJsonlFiles } from "./adam-utils.mjs"; +import { NEGATIVE_SIGNAL_TYPES, entrySeverity } from "./adam-score.mjs"; + +// Positive outcome signals (mirror adam's vocabulary; task_completed is adam's +// canonical "clean task", the same one adam-score uses for reinforcement).
+export const POSITIVE_SIGNAL_TYPES = new Set([
+  "task_completed",
+  "clean_recovery",
+  "correction_free_streak",
+]);
+
+// Fallback for `min`: rows with fewer scored events than this are flagged
+// `lowSample` by computeSkillUtility.
+export const DEFAULT_MIN_SAMPLE = 8;
+
+// Round to 3 decimal places.
+function round(x) {
+  return Math.round(x * 1000) / 1000;
+}
+
+/**
+ * Wilson score interval lower bound for a binomial proportion. Sample-aware:
+ * a skill with 1 pos / 0 neg does NOT outrank one with 40 pos / 2 neg.
+ *
+ * @param {number} pos - count of positive outcomes (expected to be 0..n)
+ * @param {number} n - total count of outcomes
+ * @param {number} [z=1.96] - normal quantile (1.96 is the 95% interval)
+ * @returns {number} lower bound, never below 0 for finite inputs; 0 when `n`
+ *   is not a positive number
+ */
+export function wilsonLower(pos, n, z = 1.96) {
+  // `!(n > 0)` also rejects NaN, which a plain `n <= 0` test lets through.
+  if (!(n > 0)) return 0;
+  // Clamp the observed proportion: an inconsistent pos/n pair (pos > n or
+  // pos < 0) would otherwise make the square root argument negative -> NaN.
+  const p = Math.min(1, Math.max(0, pos / n));
+  const z2 = z * z;
+  const denom = 1 + z2 / n;
+  const center = p + z2 / (2 * n);
+  const margin = z * Math.sqrt((p * (1 - p) + z2 / (4 * n)) / n);
+  // With pos = 0 the two terms cancel and floating-point error can leave a
+  // tiny negative value or -0; clamp so the bound is a clean non-negative.
+  return Math.max(0, (center - margin) / denom);
+}
+
+/**
+ * computeSkillUtility: pure. Tallies, per skill slug, how many positive and
+ * negative outcome events it was active in, and derives the report rows.
+ *
+ * Only entries whose `type` is in POSITIVE_SIGNAL_TYPES or
+ * NEGATIVE_SIGNAL_TYPES are scored; everything else is ignored.
+ *
+ * @param {Array<object>} entries - journal entries; non-objects are skipped
+ * @param {{min?: number}} [opts] - `min`: event count below which a row is
+ *   flagged `lowSample` (falls back to DEFAULT_MIN_SAMPLE when not finite)
+ * @returns {{baseline: number, totalPos: number, totalNeg: number, min: number,
+ *   skills: Array<object>}} `baseline` is the positive share of all scored
+ *   events; `skills` is sorted worst-first (lowest wLB, then most negatives,
+ *   then slug).
+ */
+export function computeSkillUtility(entries, opts = {}) {
+  // `opts?.` so an explicit null is treated like "no options".
+  const min = Number.isFinite(opts?.min) ? opts.min : DEFAULT_MIN_SAMPLE;
+  const per = new Map();
+  let totalPos = 0;
+  let totalNeg = 0;
+
+  for (const e of entries || []) {
+    if (!e || typeof e !== "object") continue;
+    const isPos = POSITIVE_SIGNAL_TYPES.has(e.type);
+    const isNeg = NEGATIVE_SIGNAL_TYPES.has(e.type);
+    if (!isPos && !isNeg) continue;
+
+    if (isPos) totalPos++;
+    else totalNeg++;
+    const sev = isNeg ? entrySeverity(e) : 0;
+    const ts = typeof e.ts === "string" ? e.ts : null;
+
+    // De-duplicate slugs per event. The baseline counts each event once, so a
+    // slug repeated inside one `active_skills` array must not be tallied
+    // twice or that skill's n/pos/neg stop being comparable to the baseline.
+    const slugs = Array.isArray(e.active_skills) ? new Set(e.active_skills) : [];
+    for (const slug of slugs) {
+      if (!slug || typeof slug !== "string") continue;
+      let s = per.get(slug);
+      if (!s) {
+        s = { pos: 0, neg: 0, sevNeg: 0, negTypes: {}, recent_ts: null };
+        per.set(slug, s);
+      }
+      if (isPos) {
+        s.pos++;
+      } else {
+        s.neg++;
+        s.sevNeg += sev;
+        s.negTypes[e.type] = (s.negTypes[e.type] || 0) + 1;
+      }
+      // Plain string comparison: keeps the lexicographically greatest ts.
+      if (ts && (!s.recent_ts || ts > s.recent_ts)) s.recent_ts = ts;
+    }
+  }
+
+  const scored = totalPos + totalNeg;
+  const baseline = scored ? totalPos / scored : 0;
+
+  const skills = [];
+  for (const [slug, s] of per) {
+    const n = s.pos + s.neg;
+    const share = n ? s.pos / n : 0;
+    // Most frequent negative type; ties keep the first one seen.
+    const topNeg = Object.entries(s.negTypes).sort((a, b) => b[1] - a[1])[0];
+    skills.push({
+      skill: slug,
+      n,
+      pos: s.pos,
+      neg: s.neg,
+      share: round(share),
+      lift: round(share - baseline),
+      wLB: round(wilsonLower(s.pos, n)),
+      sevNeg: s.sevNeg,
+      topNeg: topNeg ? topNeg[0] : null,
+      lowSample: n < min,
+      recent_ts: s.recent_ts,
+    });
+  }
+  // Worst-first: lowest Wilson lower bound, then most negatives, then slug
+  // so the order is deterministic.
+  skills.sort(
+    (a, b) =>
+      a.wLB - b.wLB ||
+      b.neg - a.neg ||
+      (a.skill < b.skill ? -1 : a.skill > b.skill ? 1 : 0),
+  );
+
+  return { baseline: round(baseline), totalPos, totalNeg, min, skills };
+}
+
+/**
+ * Parse CLI arguments. Unknown arguments, and a value-taking flag given as
+ * the last argument without a value, are ignored. `--min` / `--days` are
+ * converted with Number() and may therefore be NaN; the consumers of those
+ * fields check Number.isFinite themselves.
+ *
+ * @param {string[]} argv - arguments after the script path
+ * @returns {{home: ?string, input: ?string, min: number, days: ?number,
+ *   json: boolean, help: boolean}}
+ */
+function parseArgs(argv) {
+  const args = { home: null, input: null, min: DEFAULT_MIN_SAMPLE, days: null, json: false, help: false };
+  for (let i = 0; i < argv.length; i++) {
+    const a = argv[i];
+    if (a === "--home" && i + 1 < argv.length) args.home = argv[++i];
+    else if (a === "--input" && i + 1 < argv.length) args.input = argv[++i];
+    else if (a === "--min" && i + 1 < argv.length) args.min = Number(argv[++i]);
+    else if (a === "--days" && i + 1 < argv.length) args.days = Number(argv[++i]);
+    else if (a === "--json") args.json = true;
+    else if (a === "--help" || a === "-h") args.help = true;
+  }
+  return args;
+}
+
+// Read all of stdin (fd 0) synchronously. Best-effort: any read error is
+// treated as "no piped input" and yields an empty string.
+function readAllStdin() {
+  try { return readFileSync(0, "utf8"); } catch { return ""; }
+}
+
+// Parse JSONL text into an array of values. Empty lines and lines that are
+// not valid JSON are skipped rather than aborting the whole read.
+function entriesFromText(text) {
+  const out = [];
+  for (const line of (text || "").split("\n")) {
+    if (!line) continue;
+    try { out.push(JSON.parse(line)); } catch { /* skip */ }
+  }
+  return out;
+}
+
+// Same gathering strategy as adam-score.mjs: explicit --input, else piped
+// stdin (e.g. from adam-window.mjs), else the active journal + rotated files.
+/**
+ * Collect journal entries from the first available source: the --input file,
+ * then non-empty piped stdin, then `<home>/adam/journal.jsonl` plus the files
+ * listed under `<home>/adam/journal`.
+ *
+ * @param {{input: ?string, home: ?string}} args - parsed CLI arguments
+ * @returns {Array} parsed entries
+ */
+function gatherInputEntries(args) {
+  if (args.input) return readJsonlSafe(args.input);
+  if (!process.stdin.isTTY) {
+    const piped = readAllStdin();
+    // Whitespace-only stdin counts as "nothing piped": fall through to disk.
+    if (piped && piped.trim()) return entriesFromText(piped);
+  }
+  const home = args.home || join(homedir(), ".claude");
+  const adamRoot = join(home, "adam");
+  const sources = [join(adamRoot, "journal.jsonl"), ...listJsonlFiles(join(adamRoot, "journal"))];
+  const all = [];
+  for (const p of sources) {
+    for (const e of readJsonlSafe(p)) all.push(e);
+  }
+  return all;
+}
+
+/**
+ * Keep only entries whose `ts` lies within `days` days of the newest `ts`.
+ * Returns `entries` unchanged when `days` is not a positive finite number or
+ * when no entry carries a parseable timestamp. While a window is active,
+ * entries without a parseable `ts` are dropped.
+ *
+ * @param {Array} entries
+ * @param {?number} days
+ * @returns {Array}
+ */
+function filterByDays(entries, days) {
+  if (!Number.isFinite(days) || days <= 0) return entries;
+  const MS_PER_DAY = 86400000;
+  // Parse every timestamp once and reuse it for both passes.
+  const stamped = entries.map((e) => ({
+    e,
+    ms: e && typeof e.ts === "string" ? Date.parse(e.ts) : NaN,
+  }));
+  // Anchor the window to the newest ts in the data (avoids Date.now()
+  // nondeterminism and works on historical exports). -Infinity rather than 0
+  // as the "none found" sentinel so a timestamp at or before the epoch still
+  // anchors the window; NaN never compares greater, so it is skipped.
+  let maxMs = -Infinity;
+  for (const { ms } of stamped) {
+    if (ms > maxMs) maxMs = ms;
+  }
+  if (!Number.isFinite(maxMs)) return entries;
+  const cutoff = maxMs - days * MS_PER_DAY;
+  // NaN >= cutoff is false, so unparseable timestamps fall out here.
+  return stamped.filter(({ ms }) => ms >= cutoff).map(({ e }) => e);
+}
+
+// Left-align: pad `s` with trailing spaces to width `w` (never truncates).
+function pad(s, w) {
+  return String(s).padEnd(w);
+}
+// Right-align: pad `s` with leading spaces to width `w` (never truncates).
+function padL(s, w) {
+  return String(s).padStart(w);
+}
+
+/**
+ * Render the report produced by computeSkillUtility as a fixed-width table.
+ *
+ * @param {{baseline: number, totalPos: number, totalNeg: number, min: number,
+ *   skills: Array<object>}} report
+ * @returns {string} multi-line text without a trailing newline
+ */
+function renderText(report) {
+  const { baseline, totalPos, totalNeg, min, skills } = report;
+  // sevNeg is a running sum and may be fractional (assumption: severities
+  // need not be integers — confirm against adam-score); cap it at 2 decimals
+  // so it fits its 8-char column. Whole numbers print exactly as before.
+  const fmtSev = (v) => (Number.isFinite(v) ? String(Number(v.toFixed(2))) : String(v));
+  const lines = [];
+  lines.push("adam skill-utility report — execution-grounded Δ(skill) proxy");
+  lines.push(
+    `baseline positive-rate ${(baseline * 100).toFixed(1)}% ` +
+      `(${totalPos} positive / ${totalNeg} negative outcome events) min-sample n≥${min}`,
+  );
+  lines.push("CAVEAT: co-occurrence, not causation. Worst-first. ⚠ = below baseline with n≥min.");
+  lines.push("");
+  const head =
+    pad("skill", 44) + padL("n", 5) + padL("pos", 6) + padL("neg", 6) +
+    padL("share", 8) + padL("lift", 8) + padL("wLB", 7) + padL("sevNeg", 8) +
+    " " + pad("topNeg", 18) + "flag";
+  lines.push(head);
+  lines.push("-".repeat(head.length));
+  for (const s of skills) {
+    // Only flag rows that are below baseline AND have enough samples.
+    const below = s.lift < 0 && !s.lowSample;
+    const flag = below ? "⚠" : s.lowSample ? "·(low n)" : "";
+    lines.push(
+      pad(s.skill, 44) +
+        padL(s.n, 5) +
+        padL(s.pos, 6) +
+        padL(s.neg, 6) +
+        padL((s.share * 100).toFixed(0) + "%", 8) +
+        padL((s.lift >= 0 ? "+" : "") + (s.lift * 100).toFixed(0) + "%", 8) +
+        padL(s.wLB.toFixed(2), 7) +
+        padL(fmtSev(s.sevNeg), 8) +
+        " " +
+        pad(s.topNeg || "-", 18) +
+        flag,
+    );
+  }
+  return lines.join("\n");
+}
+
+// Write `text` to `stream`, then exit with `code` from the write callback.
+// Calling process.exit() straight after write() can drop output that is still
+// queued when the stream is asynchronous.
+function writeThenExit(stream, text, code) {
+  stream.write(text, () => process.exit(code));
+}
+
+/**
+ * CLI entry point: parse arguments, gather and window the journal entries,
+ * compute the report and print it as JSON (--json) or as a text table.
+ * Exits 0 on success or --help, 1 after printing the error to stderr.
+ */
+function main() {
+  const args = parseArgs(process.argv.slice(2));
+  if (args.help) {
+    writeThenExit(
+      process.stdout,
+      "usage: adam-skill-utility.mjs [--home DIR] [--input FILE] " +
+        "[--min N] [--days D] [--json]\n",
+      0,
+    );
+    return;
+  }
+  try {
+    const entries = filterByDays(gatherInputEntries(args), args.days);
+    const report = computeSkillUtility(entries, { min: args.min });
+    const out = args.json ? JSON.stringify(report) : renderText(report);
+    writeThenExit(process.stdout, out + "\n", 0);
+  } catch (e) {
+    const msg = e && e.message ? e.message : String(e);
+    writeThenExit(process.stderr, `adam-skill-utility error: ${msg}\n`, 1);
+  }
+}
+
+// True when this module is the script node was started with. import.meta.url
+// is percent-encoded, so the raw `file://` + path comparison alone misses
+// script paths containing spaces or other characters that get encoded; the
+// decoded pathname is compared as well.
+function isMainModule() {
+  const entry = process.argv[1];
+  if (!entry) return false;
+  if (import.meta.url === `file://${entry}`) return true;
+  try {
+    return decodeURIComponent(new URL(import.meta.url).pathname) === entry;
+  } catch {
+    return false;
+  }
+}
+
+if (isMainModule()) {
+  main();
+}
diff --git a/adam/tests/run-tests.sh b/adam/tests/run-tests.sh index f9956c4..fcaef51 100755 --- a/adam/tests/run-tests.sh +++ b/adam/tests/run-tests.sh @@ -18,6 +18,7 @@ APPLYREIN="$REAL_HOME/.claude/adam/scripts/adam-apply-reinforcement.mjs" UPGRADE="$REAL_HOME/.claude/adam/scripts/adam-upgrade.mjs" BATCH="$REAL_HOME/.claude/adam/scripts/adam-batch.mjs" ROLLBACK="$REAL_HOME/.claude/adam/scripts/adam-rollback.mjs"
+SKILLUTIL="$REAL_HOME/.claude/adam/scripts/adam-skill-utility.mjs" TMP_HOME="$(mktemp -d -t adam-test.XXXXXX)" trap 'rm -rf "$TMP_HOME"' EXIT INT TERM @@ -37,6 +38,7 @@ APPLYREIN_RUN(){ HOME="$TMP_HOME" node "$APPLYREIN" "$@" --home "$TMP_HOME/.clau UPGRADE_RUN() { HOME="$TMP_HOME" node "$UPGRADE" "$@"; } BATCH_RUN() { HOME="$TMP_HOME" node "$BATCH" "$@"; } ROLLBACK_RUN(){ HOME="$TMP_HOME" node "$ROLLBACK" "$@"; } +SKILLUTIL_RUN(){ HOME="$TMP_HOME" node "$SKILLUTIL" "$@"; } PASS=0 FAIL=0 @@ -2148,6 +2150,32 @@ else fi rm -f "$ROOT/proposals/"*rb-ab-001* "$ROOT/applied/"*rb-ab-001* "$ROOT/ab-tracking.jsonl" "$ROOT/active-nudges.json" +# --- Test 119: adam-skill-utility ranks friction-correlated skills below baseline --- +echo "Test 119: adam-skill-utility computes per-skill good:bad utility (execution-grounded Δ)" +reset_state +SU_INPUT="$TMP_HOME/su-input.jsonl" +{ + for i in 1 2 3 4 5; do echo "{\"ts\":\"2026-05-20T0$i:00:00Z\",\"session\":\"sSU\",\"type\":\"task_completed\",\"active_skills\":[\"goodskill\"]}"; done + for i in 1 2 3 4 5; do echo "{\"ts\":\"2026-05-20T1$i:00:00Z\",\"session\":\"sSU\",\"type\":\"dead_end\",\"count\":8,\"active_skills\":[\"badskill\"]}"; done +} > "$SU_INPUT" +su_out=$(SKILLUTIL_RUN --input "$SU_INPUT" --json --min 3 2>/dev/null) +su_check=$(echo "$su_out" | node -e ' +let buf=""; process.stdin.on("data",d=>buf+=d).on("end",()=>{ + try { + const p=JSON.parse(buf); + const bad=p.skills.find(s=>s.skill==="badskill"); + const good=p.skills.find(s=>s.skill==="goodskill"); + const ok = bad && good && bad.lift<0 && good.lift>0 && p.skills[0].skill==="badskill" && bad.neg===5 && good.pos===5; + console.log(ok?"ok":"bad:"+JSON.stringify({bad,good,first:p.skills[0]&&p.skills[0].skill})); + } catch(e){ console.log("parse-error:"+e.message); } +});') +if [ "$su_check" = "ok" ]; then + echo " PASS: badskill below baseline + ranked worst-first, goodskill above"; PASS=$((PASS+1)) +else + echo " FAIL: skill-utility ranking wrong ($su_check)"; 
FAIL=$((FAIL+1)) +fi +rm -f "$SU_INPUT" + echo echo "Results: $PASS passed, $FAIL failed" [ "$FAIL" = "0" ] diff --git a/install.sh b/install.sh index b45c01a..c16f058 100755 --- a/install.sh +++ b/install.sh @@ -126,7 +126,8 @@ copy_file "$SRC/adam/scripts/adam-archive.mjs" "$DEST/adam copy_file "$SRC/adam/scripts/adam-upgrade.mjs" "$DEST/adam/scripts/adam-upgrade.mjs" # v0.3.3 helper scripts — invoked from SKILL.md / hooks / analyst flow for _adam_script in adam-utils adam-window adam-explain adam-nudge-eligibility adam-cooldown \ - adam-score adam-ab-measure adam-apply-reinforcement adam-batch adam-rollback; do + adam-score adam-ab-measure adam-apply-reinforcement adam-batch adam-rollback \ + adam-skill-utility; do copy_file "$SRC/adam/scripts/${_adam_script}.mjs" \ "$DEST/adam/scripts/${_adam_script}.mjs" run "chmod +x \"$DEST/adam/scripts/${_adam_script}.mjs\"" diff --git a/skills/adam-self-improvement/SKILL.md b/skills/adam-self-improvement/SKILL.md index b1e20dc..8dfce85 100644 --- a/skills/adam-self-improvement/SKILL.md +++ b/skills/adam-self-improvement/SKILL.md @@ -81,6 +81,14 @@ node ~/.claude/adam/scripts/adam-batch.mjs --input /tmp/adam-windowed-journal.js This groups entries by (signal_type, cluster_key) and reports per-batch metadata including `has_context_window` (whether transcript evidence is attached). If the script fails: log stderr, pass `null` to the analyst (graceful degradation — analyst falls back to raw journal clustering). 
+**Skill utility** (execution-grounded selection signal, in the spirit of SkillsInjector arXiv 2605.29794 — utility Δ(s), not surface match): compute per-skill good:bad outcome ratios over the windowed journal: + +```bash +node ~/.claude/adam/scripts/adam-skill-utility.mjs --input /tmp/adam-windowed-journal.jsonl --json > /tmp/adam-skill-utility.json 2> /tmp/adam-skill-utility.log +``` + +This ranks skills by how often they co-occur with positive (`task_completed`, `clean_recovery`, `correction_free_streak`) vs negative outcome events, surfacing skills below the baseline positive rate (with sufficient sample) — advisory candidates for description disambiguation or archival. **CO-OCCURRENCE, NOT CAUSATION**: display the worst 3 below-baseline skills (`lift < 0`, not low-sample) to the *user* as a one-line advisory before listing proposals (e.g. `skill-utility: chezmoi 9% pos n=85, ghostty-config 14% pos n=50, …`). Do NOT feed this into the analyst's proposal machinery or auto-draft skill-archival from it — the human decides. If the script fails: log stderr, skip (best-effort). + ### 2. Dispatch the analyst (two-stage pipeline) MOSS §3.3: "A single prompt asked to diagnose, plan, implement, verify, and decide overloads context and produces lower-quality output than a sequenced flow." The analyst is dispatched in two stages with a validation gate between them.