diff --git a/adam/tests/run-tests.sh b/adam/tests/run-tests.sh index 9be83d5..22e20b4 100755 --- a/adam/tests/run-tests.sh +++ b/adam/tests/run-tests.sh @@ -364,6 +364,48 @@ for i in 1 2 3 4 5; do done assert_grep "$ROOT/journal.jsonl" '"active_skills":\["caveman"\]' "active_skills payload includes invoked skill" +# --- Test 24: task_completed fires on diverse multi-tool task --- +echo "Test 24: task_completed after 5 tools / 3 kinds / no corrections" +reset_state +for kind in Bash Read Edit Write Grep; do + echo "{\"hook_event_name\":\"PostToolUse\",\"tool_name\":\"$kind\",\"tool_input\":{},\"session_id\":\"sT\",\"cwd\":\"/tmp/x\"}" \ + | HOOK_RUN >/dev/null 2>&1 || true +done +echo '{"hook_event_name":"UserPromptSubmit","prompt":"go on","session_id":"sT","cwd":"/tmp/x"}' \ + | HOOK_RUN >/dev/null 2>&1 || true +assert_grep "$ROOT/journal.jsonl" '"type":"task_completed"' "5 tools + 5 kinds + 0 corrections emits task_completed" + +# --- Test 25: task_completed suppressed when tool diversity < 3 --- +echo "Test 25: task_completed suppressed on single-tool run" +reset_state +for i in 1 2 3 4 5; do + echo "{\"hook_event_name\":\"PostToolUse\",\"tool_name\":\"Edit\",\"tool_input\":{\"file_path\":\"/tmp/$i\"},\"session_id\":\"sT2\",\"cwd\":\"/tmp/x\"}" \ + | HOOK_RUN >/dev/null 2>&1 || true +done +echo '{"hook_event_name":"UserPromptSubmit","prompt":"go on","session_id":"sT2","cwd":"/tmp/x"}' \ + | HOOK_RUN >/dev/null 2>&1 || true +if grep -qE '"type":"task_completed"' "$ROOT/journal.jsonl"; then + echo " FAIL: task_completed fired on single-tool task"; FAIL=$((FAIL+1)) +else + echo " PASS: task_completed suppressed (low tool diversity)"; PASS=$((PASS+1)) +fi + +# --- Test 26: task_completed suppressed when correction fires mid-task --- +echo "Test 26: task_completed suppressed after correction" +reset_state +for kind in Bash Read Edit Write Grep; do + echo "{\"hook_event_name\":\"PostToolUse\",\"tool_name\":\"$kind\",\"tool_input\":{},\"session_id\":\"sT3\",\"cwd\":\"/tmp/x\"}" \ + | HOOK_RUN >/dev/null 2>&1 || true +done +# Correction phrase resets task_corrections inside the same UserPromptSubmit cycle, so the prior run is disqualified. +echo '{"hook_event_name":"UserPromptSubmit","prompt":"no, undo that","session_id":"sT3","cwd":"/tmp/x"}' \ + | HOOK_RUN >/dev/null 2>&1 || true +if grep -qE '"type":"task_completed"' "$ROOT/journal.jsonl"; then + echo " FAIL: task_completed fired despite correction on the closing prompt"; FAIL=$((FAIL+1)) +else + echo " PASS: task_completed suppressed by correction"; PASS=$((PASS+1)) +fi + echo echo "Results: $PASS passed, $FAIL failed" [ "$FAIL" = "0" ] diff --git a/agents/adam.md b/agents/adam.md index aaec2c8..22644de 100644 --- a/agents/adam.md +++ b/agents/adam.md @@ -38,6 +38,7 @@ The hook emits these `type` values into the journal: | `subagent_dispatch_pattern` | same subagent dispatched ≥3× cumulatively | subagent_type | | `correction_free_streak` | 5 clean UserPromptSubmits in a row (no correction phrase) | `active_skills[0]` | | `clean_recovery` | 3 clean PostToolUse events after a `tool_error_loop`/`dead_end`/`retry_loop` | (`recovered_from`, `active_skills[0]`) | +| `task_completed` | UserPromptSubmit closes a run of ≥5 tool calls with ≥3 distinct tool kinds and 0 corrections | sorted `tool_kinds` tuple | ## Process @@ -62,6 +63,7 @@ The hook emits these `type` values into the journal: - `subagent_dispatch_pattern`: cluster by `subagent_type`. - `correction_free_streak`: cluster by `active_skills[0]`. Treat ≥3 streaks across ≥2 sessions naming the same skill as cross-session evidence. - `clean_recovery`: cluster by (`recovered_from`, `active_skills[0]`). A win cluster qualifies for `skill_edit` only when the named skill exists in `skills_root`. + - `task_completed`: cluster by sorted `tool_kinds` tuple (the multi-tool recipe). Single entry qualifies for `skill_new` proposal (drafting protocol applies). Cross-session evidence requires ≥2 entries from distinct sessions with same tuple — without it, proposal queues, never auto-applies. Run the existing skill-overlap rule before drafting: if the recipe matches an existing skill's name/description tokens, route to `skill_edit` instead. 5. **Multi-axis correlation**: for each session that produced ≥2 distinct struggle types (`tool_error_loop`, `dead_end`, `weak_agent`, `retry_loop`, `edit_churn`, `build_loop`), tag clusters from that session as `multi_axis: true`. This grants +1 confidence at scoring. 6. For each cluster qualifying under the rubric — ≥3 occurrences across ≥2 sessions, OR (for struggle types) ≥1 entry within a single session, OR (for `correction`) ≥3 occurrences across ≥2 cwds: a. If cluster topic matches a rejected idea via the rejected-ideas fuzzy set (≥2 token overlap with rejection's `# Why`), skip with reason `"rejected-similar"`. diff --git a/hooks/adam-observe.mjs b/hooks/adam-observe.mjs index eab494e..261ea63 100755 --- a/hooks/adam-observe.mjs +++ b/hooks/adam-observe.mjs @@ -32,6 +32,8 @@ const CORRECTION_FREE_THRESHOLD = 5; const CLEAN_RECOVERY_WINDOW = 3; const STRUGGLE_TYPES = new Set(["tool_error_loop", "dead_end", "retry_loop"]); const ACTIVE_SKILLS_LOOKBACK = 10; +const TASK_TOOL_MIN = 5; +const TASK_DIVERSITY_MIN = 3; const STATE_MAX_BYTES = 1_000_000; function safeRead(path, fallback) { @@ -133,6 +135,9 @@ function resetSessionLocal(state) { state.correctionFreeCounter = 0; state.recoveryWatch = null; state.tool_window = []; + state.task_tool_kinds = {}; + state.task_tool_count = 0; + state.task_corrections = 0; } function ensureStateDefaults(state) { @@ -149,6 +154,9 @@ function ensureStateDefaults(state) { if (typeof state.correctionFreeCounter !== "number") state.correctionFreeCounter = 0; if (state.recoveryWatch === undefined) state.recoveryWatch = null; if (!Array.isArray(state.activity_ring)) state.activity_ring = []; + if (!state.task_tool_kinds || typeof state.task_tool_kinds !== "object") state.task_tool_kinds = {}; + if (typeof state.task_tool_count !== "number") state.task_tool_count = 0; + if (typeof state.task_corrections !== "number") state.task_corrections = 0; } function main() { @@ -178,6 +186,7 @@ function main() { prev_file: last.file || null, }); state.correctionFreeCounter = 0; + state.task_corrections += 1; } else { state.correctionFreeCounter += 1; if (state.correctionFreeCounter >= CORRECTION_FREE_THRESHOLD) { @@ -190,6 +199,22 @@ function main() { state.correctionFreeCounter = 0; } } + // Evaluate prior task (work between previous UserPromptSubmit and this one). + const taskKinds = Object.keys(state.task_tool_kinds); + if (state.task_tool_count >= TASK_TOOL_MIN && + taskKinds.length >= TASK_DIVERSITY_MIN && + state.task_corrections === 0) { + appendJournal({ + ts, session, cwd, type: "task_completed", + tool_count: state.task_tool_count, + tool_kinds: taskKinds, + active_skills: activeNames(state, "skill"), + active_agents: activeNames(state, "agent"), + }); + } + state.task_tool_kinds = {}; + state.task_tool_count = 0; + state.task_corrections = 0; resetFrictionCounters(state); } else if (event === "PreToolUse") { const tool = input.tool_name; @@ -293,6 +318,9 @@ function main() { state.dead_end_emitted = true; } + state.task_tool_count += 1; + state.task_tool_kinds[tool] = (state.task_tool_kinds[tool] || 0) + 1; + if (struggleEmittedThisTurn) { state.recoveryWatch = { recovered_from: struggleEmittedThisTurn, since_ts: ts, clean_count: 0, window_tools: [] }; } else if (state.recoveryWatch) {