Fix retrieval issues and add the MCP server module

This commit is contained in:
2026-04-12 23:13:16 -05:00
parent 3378ad7328
commit 50b4c1c905

155
mcp_server.py Normal file
View File

@@ -0,0 +1,155 @@
import json
import re
from pathlib import Path
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.models import Filter, FieldCondition, MatchValue
from mcp.server.fastmcp import FastMCP
# ── Paths ──────────────────────────────────────────────────────────────────
project_root = Path(__file__).resolve().parent

# ── Models / Clients ───────────────────────────────────────────────────────
# The Nomic embedder ships custom modeling code, hence trust_remote_code.
model = SentenceTransformer("nomic-ai/nomic-embed-text-v1.5", trust_remote_code=True)
# Embedded (file-based) Qdrant instance — no separate server process needed.
qdrant = QdrantClient(path=str(project_root / "data" / "qdrant_local"))
COLLECTION = "apush_chunks"

# parent_id -> list of child-chunk dicts; used to reassemble full passages.
# read_text with an explicit encoding avoids locale-dependent decoding of
# the JSON file (the original open() used the platform default encoding).
parent_lookup = json.loads(
    (project_root / "data" / "processed" / "parent_lookup.json").read_text(encoding="utf-8")
)
# ── Config (same as notebook) ──────────────────────────────────────────────
# Number of nearest-neighbour chunk hits requested from Qdrant per query.
TOP_K = 10
# Runtime text delivered to the LLM via the `system_prompt` MCP prompt —
# the exact wording is behavior; do not edit casually.
SYSTEM_PROMPT = """You are an expert AP US History tutor helping a student ace their APUSH exam.
You have access to the search_textbook tool. Call it before answering ANY history question.
ANSWERING:
- Cite inline like (Ch5, p.153) after every specific claim
- **Bold** key terms, dates, names, and critical facts
- Correct false premises directly — don't reinforce wrong assumptions
- If the textbook doesn't cover it, answer from general knowledge and prefix with "Outside textbook:"
FORMAT — match the question type:
- One word/fact → one word
- SAQ → 1 focused paragraph, dense with evidence
- LEQ/DBQ → full essay: context, thesis, body paragraphs with evidence, nuance
- General question → clear prose, as long as needed
END EVERY RESPONSE WITH:
---
**Sources Used:**
[list every source from the tool output with chapter, section, page, and score]
**Retrieval Confidence:** HIGH/MEDIUM/LOW"""
# ── Embed ──────────────────────────────────────────────────────────────────
def embed_query(query: str) -> list[float]:
    """Embed *query* as a unit-norm vector suitable for Qdrant search.

    Nomic embed models expect a task prefix on the input text; retrieval
    queries use the "search_query: " prefix.
    """
    prefixed = f"search_query: {query}"
    vector = model.encode(prefixed, normalize_embeddings=True)
    return vector.tolist()
# ── Retrieve (same as notebook) ────────────────────────────────────────────
def retrieve(query: str, max_sources: int = 5) -> dict:
    """Retrieve textbook passages relevant to *query*.

    Embeds the query, fetches TOP_K chunk hits from Qdrant (excluding
    chapter-review chunks), deduplicates hits by their parent section, and
    reassembles each surviving parent's full text from ``parent_lookup``.

    Args:
        query: Natural-language question to search for.
        max_sources: Maximum number of deduplicated parent passages to
            return. Defaults to 5, matching the previous hard-coded cap.

    Returns:
        dict with keys ``query``, ``confidence`` ("HIGH"/"MEDIUM"/"LOW",
        judged on the best raw score), ``top_score``, and ``sources``
        (list of dicts with score/chapter/section/page/text).
    """
    hits = qdrant.query_points(
        collection_name=COLLECTION,
        query=embed_query(query),
        limit=TOP_K,
        # Chapter-review chunks are excluded so retrieval stays focused on
        # primary narrative text rather than summary material.
        query_filter=Filter(
            must_not=[
                FieldCondition(key="is_chapter_review", match=MatchValue(value=True))
            ]
        ),
    ).points

    # Confidence is based on the single best raw score, before dedup.
    top_score = hits[0].score if hits else 0
    if top_score >= 0.70:
        confidence = "HIGH"
    elif top_score >= 0.50:
        confidence = "MEDIUM"
    else:
        confidence = "LOW"

    # Deduplicate by parent_id, keeping only the first (highest-scoring)
    # hit per parent section; hits arrive sorted by score from Qdrant.
    seen_parents = set()
    unique_hits = []
    for h in hits:
        pid = h.payload["parent_id"]
        if pid not in seen_parents:
            seen_parents.add(pid)
            unique_hits.append(h)
    unique_hits = unique_hits[:max_sources]

    sources = []
    for h in unique_hits:
        pid = h.payload["parent_id"]
        # Rebuild the complete parent passage from its stored child parts;
        # unknown parent_ids fall back to an empty passage rather than raising.
        parts = parent_lookup.get(pid, [])
        full_text = "\n\n".join(p["text"] for p in parts)
        sources.append({
            "score": h.score,
            "chapter_num": h.payload["chapter_num"],
            "chapter_title": h.payload["chapter_title"],
            "section_title": h.payload["section_title"],
            "textbook_page": h.payload["textbook_page"],
            "text": full_text,
        })

    return {
        "query": query,
        "confidence": confidence,
        "top_score": top_score,
        "sources": sources,
    }
# ── MCP Server ─────────────────────────────────────────────────────────────
# FastMCP instance; tools/prompts below register themselves via decorators.
mcp = FastMCP("APUSH Tutor")
@mcp.tool()
def search_textbook(query: str) -> str:
    """
    Search the AP US History textbook for relevant passages.
    Use this for any question about US history before answering.
    Always cite sources inline and list all sources at the end.
    Bold or emphasize the most important phrases in your answer.
    """
    # (The docstring above is the tool description shown to MCP clients —
    # keep it instruction-oriented, not developer-oriented.)
    def _cite(s: dict) -> str:
        # Single "ChN Section p.P" citation format shared by the passage
        # headers and the sources footer, so the two can never drift apart.
        return f"Ch{s['chapter_num']} {s['section_title']} p.{s['textbook_page']}"

    retrieved = retrieve(query)
    if not retrieved["sources"]:
        return "No relevant passages found in the textbook."

    header = f"[Confidence: {retrieved['confidence']} | Top score: {retrieved['top_score']:.3f}]\n\n"
    passages = "\n\n---\n\n".join(
        f"[SOURCE {i+1} | {_cite(s)} | score: {s['score']:.3f}]\n{s['text']}"
        for i, s in enumerate(retrieved["sources"])
    )
    footer = "\n\n===SOURCES===\n" + "\n".join(
        f"[{i+1}] {_cite(s)} (score: {s['score']:.3f})"
        for i, s in enumerate(retrieved["sources"])
    )
    return header + passages + footer
@mcp.prompt()
def system_prompt() -> str:
    """The APUSH tutor system prompt."""
    # NOTE(review): the docstring doubles as the prompt's description in
    # FastMCP — treat it as runtime-visible text, not internal docs.
    return SYSTEM_PROMPT
# ── Run ────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # Imported here so the module can be imported as a library without
    # pulling in the HTTP server stack.
    import uvicorn
    from starlette.middleware.cors import CORSMiddleware
    # Streamable-HTTP transport so browser-based MCP clients can connect.
    app = mcp.streamable_http_app()
    # NOTE(review): wildcard CORS (any origin/method/header) is acceptable
    # for loopback-only use, but tighten allow_origins before binding this
    # server to anything other than 127.0.0.1.
    app.add_middleware(
        CORSMiddleware,
        allow_origins=["*"],
        allow_methods=["*"],
        allow_headers=["*"],
    )
    # Bound to loopback on a fixed port; only local clients can reach it.
    uvicorn.run(app, host="127.0.0.1", port=52437)