From 7935b017711b238ef44e60880d32afa5c352d6f2 Mon Sep 17 00:00:00 2001 From: KeshavAnandCode Date: Sun, 19 Apr 2026 15:00:46 -0500 Subject: [PATCH] almost finished --- mcp_server.py | 81 ++++++++++----------------------------------------- 1 file changed, 15 insertions(+), 66 deletions(-) diff --git a/mcp_server.py b/mcp_server.py index f06c8f3..3f112bd 100644 --- a/mcp_server.py +++ b/mcp_server.py @@ -37,48 +37,7 @@ with open(project_root / "data" / "processed" / "parent_lookup.json") as f: parent_lookup = json.load(f) # ── Config ───────────────────────────────────────────────────────────────── -TOP_K = 6 - -SYSTEM_PROMPT = """You are an elite AP US History tutor. Your only goal is to help the student master APUSH and score a 5. - -━━━ TOOL USE ━━━ -ALWAYS call search_textbook before answering any history question — no exceptions. -For complex questions (LEQ/DBQ/thematic), call it 2-3 times with different search angles to get full coverage. - -━━━ CITATIONS ━━━ -- Cite inline after every specific claim: (Ch5, p.153) -- The **bolded sentences** in each source are the most relevant — prioritize citing and building on those -- Never invent or guess a citation — if unsure, say "Outside textbook:" -- If the textbook is silent on something relevant, supplement with general knowledge, clearly labeled - -━━━ ACCURACY ━━━ -- Correct false premises immediately and directly — never reinforce a wrong assumption -- Distinguish causation from correlation, primary from secondary causes -- Note historiographical debates where relevant (e.g. revisionist vs traditional interpretations) -- Be precise with dates, names, legislation, and turning points — vagueness loses points on the exam - -━━━ FORMAT — match the question type exactly ━━━ -- Identification / one fact → one concise answer, one citation -- SAQ (Short Answer) → 3 tight paragraphs: claim → evidence → analysis. No intro/conclusion fluff -- LEQ (Long Essay) → Full essay: contextualization → thesis → 3 body paragraphs (each with specific evidence + analysis) → conclusion with complexity -- DBQ → Same as LEQ plus: sourcing, audience/purpose/context for docs, corroboration across docs -- Compare/contrast → Use parallel structure, explicit similarities AND differences -- General question → Clear prose, as long as needed, no padding - -━━━ APUSH EXAM SKILLS ━━━ -When writing essays, explicitly hit the College Board rubric: -- Contextualization: zoom out to broader historical context BEFORE the thesis -- Thesis: historically defensible, specific, addresses complexity (not just "there were many causes") -- Evidence: at least 2 specific pieces of evidence per body paragraph -- Analysis: explain HOW and WHY, not just what happened -- Complexity: demonstrate nuance — turning points, continuity vs change, multiple causation, or cross-period connections - -━━━ END EVERY RESPONSE WITH ━━━ ---- -**Sources Used:** -[list each source: Ch# › Section › p.### — score: X.XXX] -**Retrieval Confidence:** HIGH / MEDIUM / LOW -**Exam Tip:** [one sentence of targeted advice for how this topic typically appears on the APUSH exam]""" +TOP_K = 10 # ── Embed ────────────────────────────────────────────────────────────────── def embed_query(query: str) -> np.ndarray: @@ -89,11 +48,6 @@ def embed_query(query: str) -> np.ndarray: # ── Highlight ────────────────────────────────────────────────────────────── def highlight_passage(query_emb: np.ndarray, passage: str) -> str: - """ - Bold the top 3 most query-relevant sentences using the already-loaded - embedder. Reuses the query embedding computed during retrieval — zero - extra model calls. - """ sentences = [s.strip() for s in re.split(r'(?<=[.!?])\s+', passage) if len(s.strip()) > 20] if not sentences: return passage @@ -105,14 +59,13 @@ def highlight_passage(query_emb: np.ndarray, passage: str) -> str: show_progress_bar=False, ) - scores = sent_embs @ query_emb # cosine sim (both normalized) + scores = sent_embs @ query_emb top_n = min(3, len(scores)) threshold = float(sorted(scores)[-top_n]) highlighted = passage for sent, score in zip(sentences, scores): if float(score) >= threshold: - # avoid double-bolding if somehow already bolded if f"**{sent}**" not in highlighted: highlighted = highlighted.replace(sent, f"**{sent}**") @@ -120,7 +73,7 @@ def highlight_passage(query_emb: np.ndarray, passage: str) -> str: # ── Retrieve ─────────────────────────────────────────────────────────────── def retrieve(query: str) -> dict: - query_emb = embed_query(query) # compute once, reuse for highlighting + query_emb = embed_query(query) hits = qdrant.query_points( collection_name=COLLECTION, @@ -134,12 +87,7 @@ def retrieve(query: str) -> dict: ).points top_score = hits[0].score if hits else 0 - if top_score >= 0.70: - confidence = "HIGH" - elif top_score >= 0.50: - confidence = "MEDIUM" - else: - confidence = "LOW" + confidence = "HIGH" if top_score >= 0.70 else "MEDIUM" if top_score >= 0.50 else "LOW" seen_parents = set() unique_hits = [] @@ -149,14 +97,14 @@ def retrieve(query: str) -> dict: seen_parents.add(pid) unique_hits.append(h) - unique_hits = unique_hits[:4] + unique_hits = unique_hits[:5] sources = [] for h in unique_hits: - pid = h.payload["parent_id"] - parts = parent_lookup.get(pid, []) - full_text = "\n\n".join(p["text"] for p in parts) - highlighted = highlight_passage(query_emb, full_text) # reuse query_emb + pid = h.payload["parent_id"] + parts = parent_lookup.get(pid, []) + full_text = "\n\n".join(p["text"] for p in parts) + highlighted = highlight_passage(query_emb, full_text) sources.append({ "score": h.score, @@ -174,7 +122,7 @@ def retrieve(query: str) -> dict: "sources": sources, } -# ── Origin bypass middleware ──────────────────────────────────────────────── +# ── Origin bypass middleware ─────────────────────────────────────────────── class AllowAllOriginsMiddleware(BaseHTTPMiddleware): async def dispatch(self, request: Request, call_next): request._headers = request.headers.mutablecopy() @@ -182,15 +130,16 @@ class AllowAllOriginsMiddleware(BaseHTTPMiddleware): return await call_next(request) # ── MCP Server ───────────────────────────────────────────────────────────── -mcp = FastMCP("APUSH Tutor", instructions=SYSTEM_PROMPT) +mcp = FastMCP("APUSH Tutor") @mcp.tool() def search_textbook(query: str) -> str: """ Search the AP US History textbook for relevant passages. - Use this for any question about US history before answering. - Always cite sources inline and list all sources at the end. - Bold or emphasize the most important phrases in your answer. + Call this before answering ANY US history question. + For broad topics call it multiple times with different search angles. + Returns passages with the most relevant sentences bolded. + Always cite inline (Ch#, p.###) and list sources at the end. """ retrieved = retrieve(query)