mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-20 13:32:30 +00:00
test(kanban): cover sticky blocks for worker-initiated kanban_block (#28712)
Six regression tests pinning the dispatcher contract that was broken in #28712: * test_worker_block_is_not_auto_promoted_by_recompute_ready — kanban_block survives five back-to-back ticks (compressed dispatcher loop). * test_worker_block_on_child_with_done_parents_is_still_sticky — the parent-completion code path was the worst false-positive; even when every parent is done, an explicit worker block stays blocked. * test_circuit_breaker_block_still_auto_promotes — preserves the pre-#28712 recovery semantics for circuit-breaker blocks (direct UPDATE + no "blocked" event). * test_gave_up_event_alone_does_not_make_block_sticky — explicit guard so the gave_up event is never accidentally treated as sticky; covers the second leg of the protocol_violation loop. * test_unblock_clears_sticky_state_and_lets_block_recover — only unblock_task resolves the sticky state; subsequent circuit-breaker blocks recover normally. * test_protocol_violation_loop_is_broken — full bug-shaped reproduction: block → tick → (would-be) crash + gave_up → next tick still blocked. Without the fix this would loop indefinitely. The seventh test from the original PR (legacy-DB init recovery) was dropped during salvage — the schema-init half of #28712 is already fixed on main by #28754 and #28781, and the contract is covered by test_kanban_db.py::test_connect_migrates_legacy_db_before_optional_column_indexes.
This commit is contained in:
@@ -0,0 +1,268 @@
|
||||
"""Regression tests for #28712 — kanban dispatcher must not auto-promote
|
||||
worker-initiated ``kanban_block`` (sticky blocks), but must keep
|
||||
auto-recovering circuit-breaker blocks.
|
||||
|
||||
The bug: when a worker called ``kanban_block(reason="review-required:
|
||||
...")`` to hand off to a human, the dispatcher's ``recompute_ready``
|
||||
would promote the task back to ``ready`` on the next tick. The fresh
|
||||
worker found nothing to do (work already applied), exited cleanly, and
|
||||
got recorded as a ``protocol_violation`` → ``gave_up`` → promote → loop
|
||||
until manual intervention.
|
||||
|
||||
These tests pin down:
|
||||
|
||||
* Worker / operator-initiated blocks are sticky and survive
|
||||
``recompute_ready``.
|
||||
* Circuit-breaker blocks (``gave_up`` event, status flipped via
|
||||
``_record_task_failure``) still auto-recover — the original intent
|
||||
of #40c1decb3 is preserved.
|
||||
* An explicit ``kanban_unblock`` clears the sticky state.
|
||||
* The full block → promote → crash → ``gave_up`` loop is broken after
|
||||
this fix: subsequent ticks leave the task blocked.
|
||||
|
||||
The tangentially related schema-init ordering bug originally reported
|
||||
in #28712 (``init_db`` crashing on legacy DBs that pre-dated the
|
||||
``session_id`` migration) is covered separately by
|
||||
``test_kanban_db.py::test_connect_migrates_legacy_db_before_optional_column_indexes``,
|
||||
landed via #28754 / #28781 ahead of this fix.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from hermes_cli import kanban_db as kb
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def kanban_home(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> Path:
|
||||
"""Isolated HERMES_HOME with an empty kanban DB."""
|
||||
home = tmp_path / ".hermes"
|
||||
home.mkdir()
|
||||
monkeypatch.setenv("HERMES_HOME", str(home))
|
||||
monkeypatch.setattr(Path, "home", lambda: tmp_path)
|
||||
kb.init_db()
|
||||
return home
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Worker-initiated kanban_block must be sticky
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_worker_block_is_not_auto_promoted_by_recompute_ready(kanban_home: Path) -> None:
|
||||
"""A standalone task that a worker explicitly blocks for review
|
||||
must stay blocked across an arbitrary number of dispatcher ticks.
|
||||
Before #28712's fix, ``recompute_ready`` would silently flip it
|
||||
back to ``ready`` on the very next tick."""
|
||||
with kb.connect() as conn:
|
||||
tid = kb.create_task(conn, title="needs human review")
|
||||
kb.claim_task(conn, tid)
|
||||
assert kb.block_task(
|
||||
conn, tid,
|
||||
reason="review-required: please verify ACL change",
|
||||
expected_run_id=kb.get_task(conn, tid).current_run_id,
|
||||
)
|
||||
assert kb.get_task(conn, tid).status == "blocked"
|
||||
|
||||
# Hammer the promotion code — exactly the dispatcher loop's
|
||||
# behaviour, just compressed in time.
|
||||
for _ in range(5):
|
||||
promoted = kb.recompute_ready(conn)
|
||||
assert promoted == 0, "worker-blocked task must not auto-promote"
|
||||
assert kb.get_task(conn, tid).status == "blocked"
|
||||
|
||||
|
||||
def test_worker_block_on_child_with_done_parents_is_still_sticky(kanban_home: Path) -> None:
|
||||
"""The parent-completion path is the one ``recompute_ready`` was
|
||||
designed for, so it's the most dangerous false-positive: even when
|
||||
every parent is done, a worker-initiated block on the child must
|
||||
stay blocked."""
|
||||
with kb.connect() as conn:
|
||||
parent = kb.create_task(conn, title="parent")
|
||||
child = kb.create_task(conn, title="child", parents=[parent])
|
||||
kb.complete_task(conn, parent, result="parent ok")
|
||||
|
||||
kb.claim_task(conn, child)
|
||||
kb.block_task(
|
||||
conn, child,
|
||||
reason="review-required: child needs sign-off",
|
||||
expected_run_id=kb.get_task(conn, child).current_run_id,
|
||||
)
|
||||
assert kb.get_task(conn, child).status == "blocked"
|
||||
|
||||
promoted = kb.recompute_ready(conn)
|
||||
assert promoted == 0
|
||||
assert kb.get_task(conn, child).status == "blocked"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Circuit-breaker blocks still auto-recover (preserve #40c1decb3 intent)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_circuit_breaker_block_still_auto_promotes(kanban_home: Path) -> None:
|
||||
"""A child that was put into ``blocked`` *without* a worker-issued
|
||||
``kanban_block`` (e.g. circuit-breaker after repeated spawn
|
||||
failures, manual DB triage) must still get auto-promoted when its
|
||||
parents complete — preserves the pre-#28712 recovery semantics."""
|
||||
with kb.connect() as conn:
|
||||
parent = kb.create_task(conn, title="parent")
|
||||
child = kb.create_task(conn, title="child", parents=[parent])
|
||||
kb.complete_task(conn, parent, result="ok")
|
||||
|
||||
# Simulate a circuit-breaker / direct triage that flips status
|
||||
# without emitting a ``blocked`` event — exactly what
|
||||
# ``_record_task_failure`` does after a ``gave_up``.
|
||||
conn.execute(
|
||||
"UPDATE tasks SET status='blocked', consecutive_failures=5, "
|
||||
"last_failure_error='persistent error' WHERE id=?",
|
||||
(child,),
|
||||
)
|
||||
conn.commit()
|
||||
|
||||
promoted = kb.recompute_ready(conn)
|
||||
assert promoted == 1
|
||||
task = kb.get_task(conn, child)
|
||||
assert task.status == "ready"
|
||||
assert task.consecutive_failures == 0
|
||||
assert task.last_failure_error is None
|
||||
|
||||
|
||||
def test_gave_up_event_alone_does_not_make_block_sticky(kanban_home: Path) -> None:
|
||||
"""The circuit-breaker emits ``gave_up`` (not ``blocked``). Make
|
||||
sure ``_has_sticky_block`` doesn't accidentally treat ``gave_up``
|
||||
as sticky — otherwise we'd regress the safety net for genuinely
|
||||
transient crashes."""
|
||||
with kb.connect() as conn:
|
||||
parent = kb.create_task(conn, title="parent")
|
||||
child = kb.create_task(conn, title="child", parents=[parent])
|
||||
kb.complete_task(conn, parent, result="ok")
|
||||
|
||||
# Status + event match what _record_task_failure writes when
|
||||
# the breaker trips.
|
||||
conn.execute(
|
||||
"UPDATE tasks SET status='blocked' WHERE id=?", (child,),
|
||||
)
|
||||
conn.execute(
|
||||
"INSERT INTO task_events (task_id, kind, payload, created_at) "
|
||||
"VALUES (?, 'gave_up', NULL, ?)",
|
||||
(child, int(time.time())),
|
||||
)
|
||||
conn.commit()
|
||||
|
||||
promoted = kb.recompute_ready(conn)
|
||||
assert promoted == 1
|
||||
assert kb.get_task(conn, child).status == "ready"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# unblock_task clears the sticky state
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_unblock_clears_sticky_state_and_lets_block_recover(kanban_home: Path) -> None:
|
||||
"""``hermes kanban unblock`` (or the ``kanban_unblock`` tool) is
|
||||
the only legitimate way out of a worker-initiated block. After
|
||||
unblock, a *subsequent* circuit-breaker block on the same task
|
||||
must again be eligible for auto-recovery."""
|
||||
with kb.connect() as conn:
|
||||
tid = kb.create_task(conn, title="t")
|
||||
kb.claim_task(conn, tid)
|
||||
kb.block_task(
|
||||
conn, tid,
|
||||
reason="review-required: ...",
|
||||
expected_run_id=kb.get_task(conn, tid).current_run_id,
|
||||
)
|
||||
assert kb.unblock_task(conn, tid)
|
||||
# After unblock the task is no longer blocked at all.
|
||||
assert kb.get_task(conn, tid).status == "ready"
|
||||
|
||||
# Now simulate a *later* circuit-breaker block (no new
|
||||
# ``blocked`` event, just status flip). The most recent
|
||||
# block/unblock event is ``unblocked`` → guard does not fire
|
||||
# → recompute can recover.
|
||||
conn.execute(
|
||||
"UPDATE tasks SET status='blocked' WHERE id=?", (tid,),
|
||||
)
|
||||
conn.commit()
|
||||
|
||||
promoted = kb.recompute_ready(conn)
|
||||
assert promoted == 1
|
||||
assert kb.get_task(conn, tid).status == "ready"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Full bug-shaped loop: block → promote → crash → gave_up → next tick
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_protocol_violation_loop_is_broken(kanban_home: Path) -> None:
|
||||
"""Reproduces the exact #28712 loop and asserts the dispatcher
|
||||
leaves the task blocked instead of cycling.
|
||||
|
||||
Loop shape from the issue:
|
||||
|
||||
1. Worker calls ``kanban_block`` → status='blocked',
|
||||
``task_runs.outcome='blocked'``, ``blocked`` event.
|
||||
2. (Bug) Dispatcher promotes back to ``ready``.
|
||||
3. Fresh worker exits cleanly without terminal tool call →
|
||||
``protocol_violation`` event.
|
||||
4. ``_record_task_failure(failure_limit=1)`` → ``gave_up`` event,
|
||||
status='blocked' again.
|
||||
5. (Bug) Dispatcher promotes again → infinite loop.
|
||||
|
||||
With the fix in place, step 2 never happens — the test simulates
|
||||
one would-be loop cycle by faking the crash-then-gave_up entries
|
||||
that *would* have been written and asserts the *next* tick still
|
||||
leaves the task blocked.
|
||||
"""
|
||||
with kb.connect() as conn:
|
||||
tid = kb.create_task(conn, title="loop reproducer")
|
||||
kb.claim_task(conn, tid)
|
||||
kb.block_task(
|
||||
conn, tid,
|
||||
reason="review-required: human eyes please",
|
||||
expected_run_id=kb.get_task(conn, tid).current_run_id,
|
||||
)
|
||||
assert kb.get_task(conn, tid).status == "blocked"
|
||||
|
||||
# First dispatcher tick — must NOT promote.
|
||||
assert kb.recompute_ready(conn) == 0
|
||||
assert kb.get_task(conn, tid).status == "blocked"
|
||||
|
||||
# Simulate the (hypothetical) protocol_violation + gave_up
|
||||
# entries that the dispatcher would have written if the bug
|
||||
# were still present. Even with those event rows in place,
|
||||
# the worker-initiated ``blocked`` event is the most recent
|
||||
# of the ``{blocked, unblocked}`` pair, so the sticky guard
|
||||
# still fires.
|
||||
now = int(time.time())
|
||||
conn.execute(
|
||||
"INSERT INTO task_events (task_id, kind, payload, created_at) "
|
||||
"VALUES (?, 'protocol_violation', NULL, ?)",
|
||||
(tid, now),
|
||||
)
|
||||
conn.execute(
|
||||
"INSERT INTO task_events (task_id, kind, payload, created_at) "
|
||||
"VALUES (?, 'gave_up', NULL, ?)",
|
||||
(tid, now + 1),
|
||||
)
|
||||
conn.commit()
|
||||
|
||||
# Subsequent ticks must still leave it blocked.
|
||||
for _ in range(3):
|
||||
promoted = kb.recompute_ready(conn)
|
||||
assert promoted == 0
|
||||
assert kb.get_task(conn, tid).status == "blocked"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Schema-init recovery on legacy DBs is covered by
|
||||
# tests/hermes_cli/test_kanban_db.py::test_connect_migrates_legacy_db_before_optional_column_indexes
|
||||
# (landed via #28754 / #28781). The original PR shipped a duplicate test
|
||||
# here; dropped during salvage to avoid two assertions of the same contract.
|
||||
# ---------------------------------------------------------------------------
|
||||
Reference in New Issue
Block a user