fix: code review fixes - remove dead verdict variable, fix safety valve, fix fragile test, use tool factories
Co-authored-by: Kenearos <86194771+Kenearos@users.noreply.github.com>
This commit is contained in:
parent
d4cfb34423
commit
071f994e20
4 changed files with 49 additions and 27 deletions
|
|
@ -34,26 +34,29 @@ Scoring criteria:
|
||||||
Be strict. Only award 8+ if the document genuinely meets high quality standards."""
|
Be strict. Only award 8+ if the document genuinely meets high quality standards."""
|
||||||
|
|
||||||
|
|
||||||
def _parse_critic_response(content: str) -> tuple[float, str, str]:
|
def _parse_critic_response(content: str) -> tuple[float, str]:
|
||||||
"""
|
"""
|
||||||
Parse the structured critic response.
|
Parse the structured critic response.
|
||||||
|
|
||||||
|
The routing decision is derived exclusively from the numeric score so that
|
||||||
|
the APPROVAL_THRESHOLD constant is the single source of truth — the
|
||||||
|
LLM-reported VERDICT string is intentionally not returned.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
(score, verdict, feedback) tuple.
|
(score, feedback) tuple.
|
||||||
Falls back to ("rework", full content) on parse failure.
|
score: float clamped to 0–10, defaults to 0.0 on parse failure.
|
||||||
|
feedback: FEEDBACK block text, or full content on parse failure.
|
||||||
"""
|
"""
|
||||||
score_match = re.search(r"SCORE:\s*(\d+(?:\.\d+)?)", content)
|
score_match = re.search(r"SCORE:\s*(\d+(?:\.\d+)?)", content)
|
||||||
verdict_match = re.search(r"VERDICT:\s*(approve|rework)", content, re.IGNORECASE)
|
|
||||||
feedback_match = re.search(r"FEEDBACK:\s*(.*)", content, re.DOTALL)
|
feedback_match = re.search(r"FEEDBACK:\s*(.*)", content, re.DOTALL)
|
||||||
|
|
||||||
score = float(score_match.group(1)) if score_match else 0.0
|
score = float(score_match.group(1)) if score_match else 0.0
|
||||||
verdict = verdict_match.group(1).lower() if verdict_match else "rework"
|
|
||||||
feedback = feedback_match.group(1).strip() if feedback_match else content.strip()
|
feedback = feedback_match.group(1).strip() if feedback_match else content.strip()
|
||||||
|
|
||||||
# Clamp score to 0–10
|
# Clamp score to 0–10
|
||||||
score = max(0.0, min(10.0, score))
|
score = max(0.0, min(10.0, score))
|
||||||
|
|
||||||
return score, verdict, feedback
|
return score, feedback
|
||||||
|
|
||||||
|
|
||||||
def critic_agent_node(state: CouncilState) -> dict:
|
def critic_agent_node(state: CouncilState) -> dict:
|
||||||
|
|
@ -80,9 +83,6 @@ def critic_agent_node(state: CouncilState) -> dict:
|
||||||
return {
|
return {
|
||||||
"route_decision": "approve",
|
"route_decision": "approve",
|
||||||
"critic_score": APPROVAL_THRESHOLD,
|
"critic_score": APPROVAL_THRESHOLD,
|
||||||
"feedback_history": [
|
|
||||||
f"[Auto-approved after {MAX_ITERATIONS} iterations]"
|
|
||||||
],
|
|
||||||
"messages": [],
|
"messages": [],
|
||||||
"active_node": "critic_agent",
|
"active_node": "critic_agent",
|
||||||
}
|
}
|
||||||
|
|
@ -103,9 +103,10 @@ def critic_agent_node(state: CouncilState) -> dict:
|
||||||
)
|
)
|
||||||
|
|
||||||
response = llm.invoke([system_msg, user_msg])
|
response = llm.invoke([system_msg, user_msg])
|
||||||
score, verdict, feedback = _parse_critic_response(response.content)
|
score, feedback = _parse_critic_response(response.content)
|
||||||
|
|
||||||
# Override verdict based on threshold to ensure consistency
|
# Route decision is derived solely from the numeric score against
|
||||||
|
# APPROVAL_THRESHOLD — the LLM's own VERDICT string is not trusted.
|
||||||
if score >= APPROVAL_THRESHOLD:
|
if score >= APPROVAL_THRESHOLD:
|
||||||
route_decision = "approve"
|
route_decision = "approve"
|
||||||
else:
|
else:
|
||||||
|
|
|
||||||
|
|
@ -22,8 +22,8 @@ from langchain_openai import ChatOpenAI
|
||||||
from langgraph.graph import END, StateGraph
|
from langgraph.graph import END, StateGraph
|
||||||
|
|
||||||
from state import CouncilState, APPROVAL_THRESHOLD, MAX_ITERATIONS
|
from state import CouncilState, APPROVAL_THRESHOLD, MAX_ITERATIONS
|
||||||
from tools.web_search import web_search
|
from tools.web_search import create_web_search_tool
|
||||||
from tools.pdf_reader import pdf_search
|
from tools.pdf_reader import create_pdf_search_tool
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
@ -76,9 +76,13 @@ def _resolve_tools(tools_config: Optional[dict]) -> list:
|
||||||
|
|
||||||
resolved = []
|
resolved = []
|
||||||
if tools_config.get("webSearch"):
|
if tools_config.get("webSearch"):
|
||||||
resolved.append(web_search)
|
tool = create_web_search_tool()
|
||||||
|
if tool is not None:
|
||||||
|
resolved.append(tool)
|
||||||
if tools_config.get("pdfReader"):
|
if tools_config.get("pdfReader"):
|
||||||
resolved.append(pdf_search)
|
tool = create_pdf_search_tool()
|
||||||
|
if tool is not None:
|
||||||
|
resolved.append(tool)
|
||||||
return resolved
|
return resolved
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -110,8 +110,12 @@ class TestToolResolution:
|
||||||
|
|
||||||
def test_resolve_tools_web_search_only(self):
|
def test_resolve_tools_web_search_only(self):
|
||||||
from services.dynamic_graph_builder import _resolve_tools
|
from services.dynamic_graph_builder import _resolve_tools
|
||||||
|
import os
|
||||||
|
|
||||||
tools = _resolve_tools({"webSearch": True, "pdfReader": False})
|
with __import__("unittest.mock", fromlist=["patch"]).patch.dict(
|
||||||
|
os.environ, {"TAVILY_API_KEY": "test-key"}
|
||||||
|
):
|
||||||
|
tools = _resolve_tools({"webSearch": True, "pdfReader": False})
|
||||||
assert len(tools) == 1
|
assert len(tools) == 1
|
||||||
assert tools[0].name == "web_search"
|
assert tools[0].name == "web_search"
|
||||||
|
|
||||||
|
|
@ -124,12 +128,27 @@ class TestToolResolution:
|
||||||
|
|
||||||
def test_resolve_tools_both(self):
|
def test_resolve_tools_both(self):
|
||||||
from services.dynamic_graph_builder import _resolve_tools
|
from services.dynamic_graph_builder import _resolve_tools
|
||||||
|
import os
|
||||||
|
|
||||||
tools = _resolve_tools({"webSearch": True, "pdfReader": True})
|
with __import__("unittest.mock", fromlist=["patch"]).patch.dict(
|
||||||
|
os.environ, {"TAVILY_API_KEY": "test-key"}
|
||||||
|
):
|
||||||
|
tools = _resolve_tools({"webSearch": True, "pdfReader": True})
|
||||||
assert len(tools) == 2
|
assert len(tools) == 2
|
||||||
names = {t.name for t in tools}
|
names = {t.name for t in tools}
|
||||||
assert names == {"web_search", "pdf_search"}
|
assert names == {"web_search", "pdf_search"}
|
||||||
|
|
||||||
|
def test_resolve_tools_web_search_skipped_when_no_api_key(self):
|
||||||
|
from services.dynamic_graph_builder import _resolve_tools
|
||||||
|
import os
|
||||||
|
|
||||||
|
env = {k: v for k, v in os.environ.items() if k != "TAVILY_API_KEY"}
|
||||||
|
with __import__("unittest.mock", fromlist=["patch"]).patch.dict(
|
||||||
|
os.environ, env, clear=True
|
||||||
|
):
|
||||||
|
tools = _resolve_tools({"webSearch": True, "pdfReader": False})
|
||||||
|
assert tools == []
|
||||||
|
|
||||||
|
|
||||||
class TestInvokeWithTools:
|
class TestInvokeWithTools:
|
||||||
"""Tests for the _invoke_with_tools helper."""
|
"""Tests for the _invoke_with_tools helper."""
|
||||||
|
|
|
||||||
|
|
@ -48,42 +48,40 @@ class TestCriticAgentParsing:
|
||||||
from agents.critic_agent import _parse_critic_response
|
from agents.critic_agent import _parse_critic_response
|
||||||
|
|
||||||
content = "SCORE: 9\nVERDICT: approve\nFEEDBACK:\nExcellent work."
|
content = "SCORE: 9\nVERDICT: approve\nFEEDBACK:\nExcellent work."
|
||||||
score, verdict, feedback = _parse_critic_response(content)
|
score, feedback = _parse_critic_response(content)
|
||||||
assert score == 9.0
|
assert score == 9.0
|
||||||
assert verdict == "approve"
|
|
||||||
assert "Excellent" in feedback
|
assert "Excellent" in feedback
|
||||||
|
|
||||||
def test_parse_valid_rework_response(self):
|
def test_parse_valid_rework_response(self):
|
||||||
from agents.critic_agent import _parse_critic_response
|
from agents.critic_agent import _parse_critic_response
|
||||||
|
|
||||||
content = "SCORE: 5\nVERDICT: rework\nFEEDBACK:\nNeeds more detail."
|
content = "SCORE: 5\nVERDICT: rework\nFEEDBACK:\nNeeds more detail."
|
||||||
score, verdict, feedback = _parse_critic_response(content)
|
score, feedback = _parse_critic_response(content)
|
||||||
assert score == 5.0
|
assert score == 5.0
|
||||||
assert verdict == "rework"
|
|
||||||
assert "detail" in feedback
|
assert "detail" in feedback
|
||||||
|
|
||||||
def test_parse_score_clamped_to_0_10(self):
|
def test_parse_score_clamped_to_0_10(self):
|
||||||
from agents.critic_agent import _parse_critic_response
|
from agents.critic_agent import _parse_critic_response
|
||||||
|
|
||||||
content = "SCORE: 15\nVERDICT: approve\nFEEDBACK:\nToo high score."
|
content = "SCORE: 15\nVERDICT: approve\nFEEDBACK:\nToo high score."
|
||||||
score, verdict, feedback = _parse_critic_response(content)
|
score, feedback = _parse_critic_response(content)
|
||||||
assert score == 10.0
|
assert score == 10.0
|
||||||
|
|
||||||
def test_parse_missing_score_defaults_to_0(self):
|
def test_parse_missing_score_defaults_to_0(self):
|
||||||
from agents.critic_agent import _parse_critic_response
|
from agents.critic_agent import _parse_critic_response
|
||||||
|
|
||||||
content = "No structured response at all."
|
content = "No structured response at all."
|
||||||
score, verdict, feedback = _parse_critic_response(content)
|
score, feedback = _parse_critic_response(content)
|
||||||
assert score == 0.0
|
assert score == 0.0
|
||||||
assert verdict == "rework"
|
# No structured response → full content returned as feedback
|
||||||
|
assert content.strip() in feedback
|
||||||
|
|
||||||
def test_threshold_boundary_exactly_8_approves(self):
|
def test_threshold_boundary_exactly_8_approves(self):
|
||||||
from agents.critic_agent import _parse_critic_response
|
from agents.critic_agent import _parse_critic_response
|
||||||
|
|
||||||
content = f"SCORE: {APPROVAL_THRESHOLD}\nVERDICT: approve\nFEEDBACK:\nGood."
|
content = f"SCORE: {APPROVAL_THRESHOLD}\nVERDICT: approve\nFEEDBACK:\nGood."
|
||||||
score, verdict, _ = _parse_critic_response(content)
|
score, _ = _parse_critic_response(content)
|
||||||
assert score == APPROVAL_THRESHOLD
|
assert score == APPROVAL_THRESHOLD
|
||||||
assert verdict == "approve"
|
|
||||||
|
|
||||||
|
|
||||||
class TestMasterAgentPromptBuilding:
|
class TestMasterAgentPromptBuilding:
|
||||||
|
|
@ -95,7 +93,7 @@ class TestMasterAgentPromptBuilding:
|
||||||
state = create_initial_state("Test topic", "run-1")
|
state = create_initial_state("Test topic", "run-1")
|
||||||
prompt = _build_master_prompt(state)
|
prompt = _build_master_prompt(state)
|
||||||
assert "Test topic" in prompt
|
assert "Test topic" in prompt
|
||||||
assert "feedback" not in prompt.lower() or "Feedback" not in prompt
|
assert "feedback" not in prompt.lower()
|
||||||
|
|
||||||
def test_rework_prompt_includes_feedback(self):
|
def test_rework_prompt_includes_feedback(self):
|
||||||
from agents.master_agent import _build_master_prompt
|
from agents.master_agent import _build_master_prompt
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue