fix: code review fixes - remove dead verdict variable, fix safety valve, fix fragile test, use tool factories

Co-authored-by: Kenearos <86194771+Kenearos@users.noreply.github.com>
This commit is contained in:
copilot-swe-agent[bot] 2026-03-12 22:27:14 +00:00
parent d4cfb34423
commit 071f994e20
4 changed files with 49 additions and 27 deletions

View file

@ -34,26 +34,29 @@ Scoring criteria:
Be strict. Only award 8+ if the document genuinely meets high quality standards."""
def _parse_critic_response(content: str) -> tuple[float, str, str]:
def _parse_critic_response(content: str) -> tuple[float, str]:
"""
Parse the structured critic response.
The routing decision is derived exclusively from the numeric score so that
the APPROVAL_THRESHOLD constant is the single source of truth the
LLM-reported VERDICT string is intentionally not returned.
Returns:
(score, verdict, feedback) tuple.
Falls back to ("rework", full content) on parse failure.
(score, feedback) tuple.
score: float clamped to 010, defaults to 0.0 on parse failure.
feedback: FEEDBACK block text, or full content on parse failure.
"""
score_match = re.search(r"SCORE:\s*(\d+(?:\.\d+)?)", content)
verdict_match = re.search(r"VERDICT:\s*(approve|rework)", content, re.IGNORECASE)
feedback_match = re.search(r"FEEDBACK:\s*(.*)", content, re.DOTALL)
score = float(score_match.group(1)) if score_match else 0.0
verdict = verdict_match.group(1).lower() if verdict_match else "rework"
feedback = feedback_match.group(1).strip() if feedback_match else content.strip()
# Clamp score to 010
score = max(0.0, min(10.0, score))
return score, verdict, feedback
return score, feedback
def critic_agent_node(state: CouncilState) -> dict:
@ -80,9 +83,6 @@ def critic_agent_node(state: CouncilState) -> dict:
return {
"route_decision": "approve",
"critic_score": APPROVAL_THRESHOLD,
"feedback_history": [
f"[Auto-approved after {MAX_ITERATIONS} iterations]"
],
"messages": [],
"active_node": "critic_agent",
}
@ -103,9 +103,10 @@ def critic_agent_node(state: CouncilState) -> dict:
)
response = llm.invoke([system_msg, user_msg])
score, verdict, feedback = _parse_critic_response(response.content)
score, feedback = _parse_critic_response(response.content)
# Override verdict based on threshold to ensure consistency
# Route decision is derived solely from the numeric score against
# APPROVAL_THRESHOLD — the LLM's own VERDICT string is not trusted.
if score >= APPROVAL_THRESHOLD:
route_decision = "approve"
else: