From 7935b017711b238ef44e60880d32afa5c352d6f2 Mon Sep 17 00:00:00 2001
From: KeshavAnandCode <keshavanand.dev@gmail.com>
Date: Sun, 19 Apr 2026 15:00:46 -0500
Subject: [PATCH] Remove SYSTEM_PROMPT; raise TOP_K to 10, sources to 5

---
 mcp_server.py | 81 ++++++++++-----------------------------------------
 1 file changed, 15 insertions(+), 66 deletions(-)

diff --git a/mcp_server.py b/mcp_server.py
index f06c8f3..3f112bd 100644
--- a/mcp_server.py
+++ b/mcp_server.py
@@ -37,48 +37,7 @@ with open(project_root / "data" / "processed" / "parent_lookup.json") as f:
     parent_lookup = json.load(f)
 
 # ── Config ─────────────────────────────────────────────────────────────────
-TOP_K = 6
-
-SYSTEM_PROMPT = """You are an elite AP US History tutor. Your only goal is to help the student master APUSH and score a 5.
-
-━━━ TOOL USE ━━━
-ALWAYS call search_textbook before answering any history question — no exceptions.
-For complex questions (LEQ/DBQ/thematic), call it 2-3 times with different search angles to get full coverage.
-
-━━━ CITATIONS ━━━
-- Cite inline after every specific claim: (Ch5, p.153)
-- The **bolded sentences** in each source are the most relevant — prioritize citing and building on those
-- Never invent or guess a citation — if unsure, say "Outside textbook:"
-- If the textbook is silent on something relevant, supplement with general knowledge, clearly labeled
-
-━━━ ACCURACY ━━━
-- Correct false premises immediately and directly — never reinforce a wrong assumption
-- Distinguish causation from correlation, primary from secondary causes
-- Note historiographical debates where relevant (e.g. revisionist vs traditional interpretations)
-- Be precise with dates, names, legislation, and turning points — vagueness loses points on the exam
-
-━━━ FORMAT — match the question type exactly ━━━
-- Identification / one fact → one concise answer, one citation
-- SAQ (Short Answer) → 3 tight paragraphs: claim → evidence → analysis. No intro/conclusion fluff
-- LEQ (Long Essay) → Full essay: contextualization → thesis → 3 body paragraphs (each with specific evidence + analysis) → conclusion with complexity
-- DBQ → Same as LEQ plus: sourcing, audience/purpose/context for docs, corroboration across docs
-- Compare/contrast → Use parallel structure, explicit similarities AND differences
-- General question → Clear prose, as long as needed, no padding
-
-━━━ APUSH EXAM SKILLS ━━━
-When writing essays, explicitly hit the College Board rubric:
-- Contextualization: zoom out to broader historical context BEFORE the thesis
-- Thesis: historically defensible, specific, addresses complexity (not just "there were many causes")
-- Evidence: at least 2 specific pieces of evidence per body paragraph
-- Analysis: explain HOW and WHY, not just what happened
-- Complexity: demonstrate nuance — turning points, continuity vs change, multiple causation, or cross-period connections
-
-━━━ END EVERY RESPONSE WITH ━━━
----
-**Sources Used:**
-[list each source: Ch# › Section › p.### — score: X.XXX]
-**Retrieval Confidence:** HIGH / MEDIUM / LOW
-**Exam Tip:** [one sentence of targeted advice for how this topic typically appears on the APUSH exam]"""
+TOP_K = 10
 
 # ── Embed ──────────────────────────────────────────────────────────────────
 def embed_query(query: str) -> np.ndarray:
@@ -89,11 +48,6 @@ def embed_query(query: str) -> np.ndarray:
 
 # ── Highlight ──────────────────────────────────────────────────────────────
 def highlight_passage(query_emb: np.ndarray, passage: str) -> str:
-    """
-    Bold the top 3 most query-relevant sentences using the already-loaded
-    embedder. Reuses the query embedding computed during retrieval — zero
-    extra model calls.
-    """
     sentences = [s.strip() for s in re.split(r'(?<=[.!?])\s+', passage) if len(s.strip()) > 20]
     if not sentences:
         return passage
@@ -105,14 +59,13 @@ def highlight_passage(query_emb: np.ndarray, passage: str) -> str:
         show_progress_bar=False,
     )
 
-    scores    = sent_embs @ query_emb  # cosine sim (both normalized)
+    scores    = sent_embs @ query_emb
     top_n     = min(3, len(scores))
     threshold = float(sorted(scores)[-top_n])
 
     highlighted = passage
     for sent, score in zip(sentences, scores):
         if float(score) >= threshold:
-            # avoid double-bolding if somehow already bolded
             if f"**{sent}**" not in highlighted:
                 highlighted = highlighted.replace(sent, f"**{sent}**")
 
@@ -120,7 +73,7 @@ def highlight_passage(query_emb: np.ndarray, passage: str) -> str:
 
 # ── Retrieve ───────────────────────────────────────────────────────────────
 def retrieve(query: str) -> dict:
-    query_emb = embed_query(query)  # compute once, reuse for highlighting
+    query_emb = embed_query(query)
 
     hits = qdrant.query_points(
         collection_name=COLLECTION,
@@ -134,12 +87,7 @@ def retrieve(query: str) -> dict:
     ).points
 
     top_score = hits[0].score if hits else 0
-    if top_score >= 0.70:
-        confidence = "HIGH"
-    elif top_score >= 0.50:
-        confidence = "MEDIUM"
-    else:
-        confidence = "LOW"
+    confidence = "HIGH" if top_score >= 0.70 else "MEDIUM" if top_score >= 0.50 else "LOW"
 
     seen_parents = set()
     unique_hits  = []
@@ -149,14 +97,14 @@ def retrieve(query: str) -> dict:
             seen_parents.add(pid)
             unique_hits.append(h)
 
-    unique_hits = unique_hits[:4]
+    unique_hits = unique_hits[:5]
 
     sources = []
     for h in unique_hits:
-        pid       = h.payload["parent_id"]
-        parts     = parent_lookup.get(pid, [])
-        full_text = "\n\n".join(p["text"] for p in parts)
-        highlighted = highlight_passage(query_emb, full_text)  # reuse query_emb
+        pid         = h.payload["parent_id"]
+        parts       = parent_lookup.get(pid, [])
+        full_text   = "\n\n".join(p["text"] for p in parts)
+        highlighted = highlight_passage(query_emb, full_text)
 
         sources.append({
             "score":         h.score,
@@ -174,7 +122,7 @@ def retrieve(query: str) -> dict:
         "sources":    sources,
     }
 
-# ── Origin bypass middleware ────────────────────────────────────────────────
+# ── Origin bypass middleware ───────────────────────────────────────────────
 class AllowAllOriginsMiddleware(BaseHTTPMiddleware):
     async def dispatch(self, request: Request, call_next):
         request._headers = request.headers.mutablecopy()
@@ -182,15 +130,16 @@ class AllowAllOriginsMiddleware(BaseHTTPMiddleware):
         return await call_next(request)
 
 # ── MCP Server ─────────────────────────────────────────────────────────────
-mcp = FastMCP("APUSH Tutor", instructions=SYSTEM_PROMPT)
+mcp = FastMCP("APUSH Tutor")
 
 @mcp.tool()
 def search_textbook(query: str) -> str:
     """
     Search the AP US History textbook for relevant passages.
-    Use this for any question about US history before answering.
-    Always cite sources inline and list all sources at the end.
-    Bold or emphasize the most important phrases in your answer.
+    Call this before answering ANY US history question.
+    For broad topics call it multiple times with different search angles.
+    Returns passages with the most relevant sentences bolded.
+    Always cite inline (Ch#, p.###) and list sources at the end.
     """
     retrieved = retrieve(query)