From 071f994e20192004b583fdfe31e7dec7697f91b6 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 12 Mar 2026 22:27:14 +0000 Subject: [PATCH] fix: code review fixes - remove dead verdict variable, fix safety valve, fix fragile test, use tool factories Co-authored-by: Kenearos <86194771+Kenearos@users.noreply.github.com> --- backend/agents/critic_agent.py | 23 ++++++++++++----------- backend/services/dynamic_graph_builder.py | 12 ++++++++---- backend/tests/test_god_mode.py | 23 +++++++++++++++++++++-- backend/tests/test_routing.py | 18 ++++++++---------- 4 files changed, 49 insertions(+), 27 deletions(-) diff --git a/backend/agents/critic_agent.py b/backend/agents/critic_agent.py index 7681c81..1899360 100644 --- a/backend/agents/critic_agent.py +++ b/backend/agents/critic_agent.py @@ -34,26 +34,29 @@ Scoring criteria: Be strict. Only award 8+ if the document genuinely meets high quality standards.""" -def _parse_critic_response(content: str) -> tuple[float, str, str]: +def _parse_critic_response(content: str) -> tuple[float, str]: """ Parse the structured critic response. + The routing decision is derived exclusively from the numeric score so that + the APPROVAL_THRESHOLD constant is the single source of truth — the + LLM-reported VERDICT string is intentionally not returned. + Returns: - (score, verdict, feedback) tuple. - Falls back to ("rework", full content) on parse failure. + (score, feedback) tuple. + score: float clamped to 0–10, defaults to 0.0 on parse failure. + feedback: FEEDBACK block text, or full content on parse failure. """ score_match = re.search(r"SCORE:\s*(\d+(?:\.\d+)?)", content) - verdict_match = re.search(r"VERDICT:\s*(approve|rework)", content, re.IGNORECASE) feedback_match = re.search(r"FEEDBACK:\s*(.*)", content, re.DOTALL) score = float(score_match.group(1)) if score_match else 0.0 - verdict = verdict_match.group(1).lower() if verdict_match else "rework" feedback = feedback_match.group(1).strip() if feedback_match else content.strip() # Clamp score to 0–10 score = max(0.0, min(10.0, score)) - return score, verdict, feedback + return score, feedback def critic_agent_node(state: CouncilState) -> dict: @@ -80,9 +83,6 @@ def critic_agent_node(state: CouncilState) -> dict: return { "route_decision": "approve", "critic_score": APPROVAL_THRESHOLD, - "feedback_history": [ - f"[Auto-approved after {MAX_ITERATIONS} iterations]" - ], "messages": [], "active_node": "critic_agent", } @@ -103,9 +103,10 @@ def critic_agent_node(state: CouncilState) -> dict: ) response = llm.invoke([system_msg, user_msg]) - score, verdict, feedback = _parse_critic_response(response.content) + score, feedback = _parse_critic_response(response.content) - # Override verdict based on threshold to ensure consistency + # Route decision is derived solely from the numeric score against + # APPROVAL_THRESHOLD — the LLM's own VERDICT string is not trusted. if score >= APPROVAL_THRESHOLD: route_decision = "approve" else: diff --git a/backend/services/dynamic_graph_builder.py b/backend/services/dynamic_graph_builder.py index 78d2bdb..460ed6a 100644 --- a/backend/services/dynamic_graph_builder.py +++ b/backend/services/dynamic_graph_builder.py @@ -22,8 +22,8 @@ from langchain_openai import ChatOpenAI from langgraph.graph import END, StateGraph from state import CouncilState, APPROVAL_THRESHOLD, MAX_ITERATIONS -from tools.web_search import web_search -from tools.pdf_reader import pdf_search +from tools.web_search import create_web_search_tool +from tools.pdf_reader import create_pdf_search_tool # --------------------------------------------------------------------------- @@ -76,9 +76,13 @@ def _resolve_tools(tools_config: Optional[dict]) -> list: resolved = [] if tools_config.get("webSearch"): - resolved.append(web_search) + tool = create_web_search_tool() + if tool is not None: + resolved.append(tool) if tools_config.get("pdfReader"): - resolved.append(pdf_search) + tool = create_pdf_search_tool() + if tool is not None: + resolved.append(tool) return resolved diff --git a/backend/tests/test_god_mode.py b/backend/tests/test_god_mode.py index 72bf813..604e647 100644 --- a/backend/tests/test_god_mode.py +++ b/backend/tests/test_god_mode.py @@ -110,8 +110,12 @@ class TestToolResolution: def test_resolve_tools_web_search_only(self): from services.dynamic_graph_builder import _resolve_tools + import os - tools = _resolve_tools({"webSearch": True, "pdfReader": False}) + with __import__("unittest.mock", fromlist=["patch"]).patch.dict( + os.environ, {"TAVILY_API_KEY": "test-key"} + ): + tools = _resolve_tools({"webSearch": True, "pdfReader": False}) assert len(tools) == 1 assert tools[0].name == "web_search" @@ -124,12 +128,27 @@ class TestToolResolution: def test_resolve_tools_both(self): from services.dynamic_graph_builder import _resolve_tools + import os - tools = _resolve_tools({"webSearch": True, "pdfReader": True}) + with __import__("unittest.mock", fromlist=["patch"]).patch.dict( + os.environ, {"TAVILY_API_KEY": "test-key"} + ): + tools = _resolve_tools({"webSearch": True, "pdfReader": True}) assert len(tools) == 2 names = {t.name for t in tools} assert names == {"web_search", "pdf_search"} + def test_resolve_tools_web_search_skipped_when_no_api_key(self): + from services.dynamic_graph_builder import _resolve_tools + import os + + env = {k: v for k, v in os.environ.items() if k != "TAVILY_API_KEY"} + with __import__("unittest.mock", fromlist=["patch"]).patch.dict( + os.environ, env, clear=True + ): + tools = _resolve_tools({"webSearch": True, "pdfReader": False}) + assert tools == [] + class TestInvokeWithTools: """Tests for the _invoke_with_tools helper.""" diff --git a/backend/tests/test_routing.py b/backend/tests/test_routing.py index 31bc4cb..d305015 100644 --- a/backend/tests/test_routing.py +++ b/backend/tests/test_routing.py @@ -48,42 +48,40 @@ class TestCriticAgentParsing: from agents.critic_agent import _parse_critic_response content = "SCORE: 9\nVERDICT: approve\nFEEDBACK:\nExcellent work." - score, verdict, feedback = _parse_critic_response(content) + score, feedback = _parse_critic_response(content) assert score == 9.0 - assert verdict == "approve" assert "Excellent" in feedback def test_parse_valid_rework_response(self): from agents.critic_agent import _parse_critic_response content = "SCORE: 5\nVERDICT: rework\nFEEDBACK:\nNeeds more detail." - score, verdict, feedback = _parse_critic_response(content) + score, feedback = _parse_critic_response(content) assert score == 5.0 - assert verdict == "rework" assert "detail" in feedback def test_parse_score_clamped_to_0_10(self): from agents.critic_agent import _parse_critic_response content = "SCORE: 15\nVERDICT: approve\nFEEDBACK:\nToo high score." - score, verdict, feedback = _parse_critic_response(content) + score, feedback = _parse_critic_response(content) assert score == 10.0 def test_parse_missing_score_defaults_to_0(self): from agents.critic_agent import _parse_critic_response content = "No structured response at all." - score, verdict, feedback = _parse_critic_response(content) + score, feedback = _parse_critic_response(content) assert score == 0.0 - assert verdict == "rework" + # No structured response → full content returned as feedback + assert content.strip() in feedback def test_threshold_boundary_exactly_8_approves(self): from agents.critic_agent import _parse_critic_response content = f"SCORE: {APPROVAL_THRESHOLD}\nVERDICT: approve\nFEEDBACK:\nGood." - score, verdict, _ = _parse_critic_response(content) + score, _ = _parse_critic_response(content) assert score == APPROVAL_THRESHOLD - assert verdict == "approve" class TestMasterAgentPromptBuilding: @@ -95,7 +93,7 @@ class TestMasterAgentPromptBuilding: state = create_initial_state("Test topic", "run-1") prompt = _build_master_prompt(state) assert "Test topic" in prompt - assert "feedback" not in prompt.lower() or "Feedback" not in prompt + assert "feedback" not in prompt.lower() def test_rework_prompt_includes_feedback(self): from agents.master_agent import _build_master_prompt