Implement Phase 4: tools, God Mode, and missing features
Backend: - Add Tavily web search tool wrapper (tools/web_search.py) - Add PDF reader + ChromaDB vector store tool (tools/pdf_reader.py) - Bind tools to LLM calls via .bind_tools() in dynamic_graph_builder - Implement God Mode using LangGraph interrupt_before + MemorySaver - Add approve/reject/modify API endpoints for God Mode - Add PDF upload endpoint with ingestion pipeline - Add persistent run history (CouncilRun model + run_service + API) - Add Alembic migration for council_runs table - Enhance WebSocket to emit run_paused and run_resumed events - Add tests for tools, God Mode, and run history Frontend: - Add God Mode approval UI (GodModePanel component) - Add Auto-Pilot / God Mode toggle in Konferenzzimmer - Add functional PDF upload handler - Add Conditional Edge editor (EdgeSettingsPanel component) - Add edge click selection in ArchitectCanvas - Update Zustand store with edge selection and update actions - Update types for God Mode, execution modes, and WS events - Update API client with God Mode, PDF upload, and blueprint run endpoints - Update WebSocket hook for paused/resumed events - Add Vitest config and frontend tests (store, parser, types, API) https://claude.ai/code/session_017U6idFgaqnYTXzPxA7mxMv
This commit is contained in:
parent
c6d0c4a636
commit
001649a364
31 changed files with 2502 additions and 81 deletions
140
backend/tools/pdf_reader.py
Normal file
140
backend/tools/pdf_reader.py
Normal file
|
|
@ -0,0 +1,140 @@
|
|||
"""
|
||||
PDF Reader Tool — PyPDF + ChromaDB vector store wrapper for agent nodes.
|
||||
|
||||
Loads PDF files, splits them into chunks, stores embeddings in a local
|
||||
ChromaDB collection, and performs similarity search against queries.
|
||||
The storage location is read from the CHROMA_PERSIST_DIR environment variable and falls back to ./chroma_db when it is not set.
|
||||
"""
|
||||
|
||||
import os
|
||||
from typing import List, Optional
|
||||
|
||||
from langchain_core.tools import tool
|
||||
|
||||
# Module-level collection cache to avoid re-initializing on every call
|
||||
_collection_cache: dict = {}
|
||||
|
||||
|
||||
def _get_chroma_collection(collection_name: str = "council_pdfs"):
    """Return the ChromaDB collection named *collection_name*, creating it if needed.

    Collections are memoised in the module-level ``_collection_cache`` so a
    persistent client is only built on the first request for a given name.
    The storage directory comes from the ``CHROMA_PERSIST_DIR`` environment
    variable and falls back to ``./chroma_db`` when the variable is unset.
    """
    try:
        return _collection_cache[collection_name]
    except KeyError:
        pass  # first request for this name: build the collection below

    # Imported lazily, on the first cache miss.
    import chromadb

    storage_path = os.environ.get("CHROMA_PERSIST_DIR", "./chroma_db")
    handle = chromadb.PersistentClient(path=storage_path).get_or_create_collection(
        name=collection_name,
        metadata={"hnsw:space": "cosine"},
    )

    _collection_cache[collection_name] = handle
    return handle
|
||||
|
||||
|
||||
def ingest_pdf(
    file_path: str,
    collection_name: str = "council_pdfs",
    *,
    chunk_size: int = 100,
    overlap: int = 20,
) -> int:
    """
    Read a PDF file, split it into overlapping word chunks, and store them in ChromaDB.

    Each page is split on whitespace and cut into windows of ``chunk_size``
    words; consecutive windows on the same page share ``overlap`` words.
    Pages with no extractable text are skipped. Every chunk is stored with
    the file's base name and its 1-based page number as metadata.

    Args:
        file_path: Path to the PDF file.
        collection_name: ChromaDB collection name.
        chunk_size: Maximum number of words per chunk (default 100).
        overlap: Number of words shared by consecutive chunks (default 20).

    Returns:
        Number of chunks ingested (0 when the PDF has no extractable text).

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            in the range ``0 <= overlap < chunk_size``.
    """
    # Validate before touching the file so bad arguments fail fast; a
    # non-positive step would otherwise surface as an opaque range() error.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, got {overlap}"
        )

    from pypdf import PdfReader

    reader = PdfReader(file_path)
    source_name = os.path.basename(file_path)
    step = chunk_size - overlap

    chunks: List[str] = []
    metadata_list: List[dict] = []

    for page_num, page in enumerate(reader.pages, start=1):
        text = page.extract_text()
        if not text or not text.strip():
            continue

        words = text.split()
        for start in range(0, len(words), step):
            chunks.append(" ".join(words[start:start + chunk_size]))
            metadata_list.append({"source": source_name, "page": page_num})
            # Once a window reaches the end of the page, any further window
            # would only repeat words already contained in this chunk.
            if start + chunk_size >= len(words):
                break

    if not chunks:
        return 0

    collection = _get_chroma_collection(collection_name)

    # Deterministic IDs based on file name and chunk position, so ingesting
    # the same file again reuses the same IDs.
    # NOTE(review): IDs use only the base name, so two different files with
    # the same name share IDs; and if a re-ingest yields fewer chunks, the
    # higher-numbered IDs from the earlier run are presumably left in place.
    ids = [f"{source_name}_chunk_{i}" for i in range(len(chunks))]

    collection.upsert(
        documents=chunks,
        metadatas=metadata_list,
        ids=ids,
    )

    return len(chunks)
|
||||
|
||||
|
||||
@tool
def pdf_search(query: str, n_results: int = 5) -> str:
    """
    Search the PDF knowledge base for information relevant to a query.

    Args:
        query: The search query to find relevant PDF content.
        n_results: Number of results to return (default 5).

    Returns:
        A formatted string with relevant passages from ingested PDFs.
    """
    # NOTE(review): with @tool the docstring above is presumably surfaced as
    # the tool description, so its wording is kept as-is; implementation
    # notes live in comments instead.

    # Opening the store and counting its entries can both fail (missing
    # directory, locked database, ...), so both are guarded together and
    # the count is taken only once.
    try:
        collection = _get_chroma_collection()
        total = collection.count()
    except Exception as exc:  # noqa: BLE001
        return f"[PDF Search Error] Could not access vector store: {exc}"

    if total == 0:
        return "[PDF Search] No documents have been ingested yet."

    # Clamp the request to [1, total]: never ask for more entries than
    # exist, and never forward a zero or negative count.
    wanted = max(1, min(n_results, total))

    try:
        results = collection.query(
            query_texts=[query],
            n_results=wanted,
        )
    except Exception as exc:  # noqa: BLE001
        return f"[PDF Search Error] {exc}"

    # A single query text was sent, so only the first result batch matters.
    # Fields may be missing or None; treat those as empty.
    documents = (results.get("documents") or [[]])[0] or []
    metadatas = (results.get("metadatas") or [[]])[0] or []

    if not documents:
        return f"No relevant passages found for: {query}"

    # Pad metadata so every document is reported even when its metadata
    # entry is absent.
    metadatas = list(metadatas) + [None] * (len(documents) - len(metadatas))

    formatted = []
    for i, (doc, meta) in enumerate(zip(documents, metadatas), 1):
        meta = meta or {}
        source = meta.get("source", "unknown")
        page = meta.get("page", "?")
        formatted.append(f"{i}. [Source: {source}, Page {page}]\n {doc}")

    return "\n\n".join(formatted)
|
||||
|
||||
|
||||
def create_pdf_search_tool() -> Optional[tool]:
    """Return the ``pdf_search`` tool when a ChromaDB storage directory is set.

    The directory is read from ``CHROMA_PERSIST_DIR`` with ``./chroma_db`` as
    the fallback, so ``None`` is only returned when the variable is
    explicitly set to an empty string.
    """
    configured_dir = os.environ.get("CHROMA_PERSIST_DIR", "./chroma_db")
    return pdf_search if configured_dir else None
|
||||
Loading…
Add table
Add a link
Reference in a new issue