"""MCP server exposing an AP US History textbook search tool.

At import time this module loads a sentence-embedding model, opens a local
Qdrant store, and reads a parent-chunk lookup table from disk.  It then
registers one MCP tool (``search_textbook``) and one MCP prompt
(``system_prompt``).  Run as a script, it serves the MCP app over
streamable HTTP on 127.0.0.1:52437.
"""

import json
import re
from pathlib import Path
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.models import Filter, FieldCondition, MatchValue
from mcp.server.fastmcp import FastMCP

# ── Paths ──────────────────────────────────────────────────────────────────
project_root = Path(__file__).resolve().parent

# ── Models / Clients ───────────────────────────────────────────────────────
# NOTE(review): trust_remote_code=True lets the model repository run its own
# Python code at load time.  Consider pinning a revision — confirm with owner.
model = SentenceTransformer("nomic-ai/nomic-embed-text-v1.5", trust_remote_code=True)

qdrant = QdrantClient(path=str(project_root / "data" / "qdrant_local"))
COLLECTION = "apush_chunks"

# Explicit UTF-8: without it open() falls back to the platform locale, which
# can mis-decode non-ASCII text in the JSON on some systems.
with open(
    project_root / "data" / "processed" / "parent_lookup.json",
    encoding="utf-8",
) as f:
    parent_lookup = json.load(f)

# ── Config (same as notebook) ──────────────────────────────────────────────
TOP_K = 10

# Cap on distinct parent passages returned per query.
MAX_SOURCES = 5

# Thresholds on the best hit's score used to label retrieval confidence.
HIGH_CONFIDENCE_SCORE = 0.70
MEDIUM_CONFIDENCE_SCORE = 0.50

SYSTEM_PROMPT = """You are an expert AP US History tutor helping a student ace their APUSH exam.

You have access to the search_textbook tool. Call it before answering ANY history question.

ANSWERING:
- Cite inline like (Ch5, p.153) after every specific claim
- **Bold** key terms, dates, names, and critical facts
- Correct false premises directly — don't reinforce wrong assumptions
- If the textbook doesn't cover it, answer from general knowledge and prefix with "Outside textbook:"

FORMAT — match the question type:
- One word/fact → one word
- SAQ → 1 focused paragraph, dense with evidence
- LEQ/DBQ → full essay: context, thesis, body paragraphs with evidence, nuance
- General question → clear prose, as long as needed

END EVERY RESPONSE WITH:
---
**Sources Used:**
[list every source from the tool output with chapter, section, page, and score]
**Retrieval Confidence:** HIGH/MEDIUM/LOW"""


# ── Embed ──────────────────────────────────────────────────────────────────
def embed_query(query: str) -> list[float]:
    """Return the normalized embedding of *query* as a plain list of floats.

    The text is prefixed with ``search_query: `` before encoding
    (presumably the query-side task prefix this embedding model expects —
    confirm against the model card).
    """
    return model.encode(
        f"search_query: {query}",
        normalize_embeddings=True,
    ).tolist()


# ── Retrieve (same as notebook) ────────────────────────────────────────────
def _confidence_label(top_score: float) -> str:
    """Map the best hit's score to "HIGH", "MEDIUM" or "LOW"."""
    if top_score >= HIGH_CONFIDENCE_SCORE:
        return "HIGH"
    if top_score >= MEDIUM_CONFIDENCE_SCORE:
        return "MEDIUM"
    return "LOW"


def _dedupe_by_parent(hits, limit: int) -> list:
    """Keep the first hit per ``parent_id``, in order, up to *limit* hits."""
    seen_parents = set()
    unique_hits = []
    for hit in hits:
        pid = hit.payload["parent_id"]
        if pid in seen_parents:
            continue
        seen_parents.add(pid)
        unique_hits.append(hit)
        if len(unique_hits) >= limit:
            # Later hits could not make the cut anyway.
            break
    return unique_hits


def retrieve(query: str) -> dict:
    """Search the collection for *query* and expand hits to parent passages.

    Fetches up to ``TOP_K`` points, excluding those whose payload has
    ``is_chapter_review`` set to True, keeps the first hit for each distinct
    ``parent_id`` (at most ``MAX_SOURCES``), and replaces each hit's text
    with the joined texts stored under that parent in ``parent_lookup``.

    Returns:
        A dict with keys ``query``, ``confidence`` ("HIGH"/"MEDIUM"/"LOW",
        derived from the first hit's score), ``top_score`` (0 when there
        are no hits) and ``sources`` (a list of dicts with ``score``,
        ``chapter_num``, ``chapter_title``, ``section_title``,
        ``textbook_page`` and ``text``).

    Raises:
        KeyError: If a retained hit's payload lacks one of the expected keys.
    """
    hits = qdrant.query_points(
        collection_name=COLLECTION,
        query=embed_query(query),
        limit=TOP_K,
        query_filter=Filter(
            must_not=[
                FieldCondition(key="is_chapter_review", match=MatchValue(value=True))
            ]
        ),
    ).points

    # Confidence is judged on the first hit only (assumes hits arrive sorted
    # by descending score — confirm against the Qdrant client docs).
    top_score = hits[0].score if hits else 0

    sources = []
    for hit in _dedupe_by_parent(hits, MAX_SOURCES):
        payload = hit.payload
        # NOTE(review): a parent_id missing from parent_lookup yields an
        # empty "text" for that source rather than an error.
        parts = parent_lookup.get(payload["parent_id"], [])
        sources.append({
            "score": hit.score,
            "chapter_num": payload["chapter_num"],
            "chapter_title": payload["chapter_title"],
            "section_title": payload["section_title"],
            "textbook_page": payload["textbook_page"],
            "text": "\n\n".join(p["text"] for p in parts),
        })

    return {
        "query": query,
        "confidence": _confidence_label(top_score),
        "top_score": top_score,
        "sources": sources,
    }


# ── MCP Server ─────────────────────────────────────────────────────────────
mcp = FastMCP("APUSH Tutor")


def _source_label(source: dict) -> str:
    """Format a source's chapter, section and page as one citation label."""
    return (
        f"Ch{source['chapter_num']} › {source['section_title']}"
        f" › p.{source['textbook_page']}"
    )


# The docstring below is deliberately left untouched: it appears to be what
# FastMCP publishes as the tool description, so it is runtime text.
@mcp.tool()
def search_textbook(query: str) -> str:
    """
    Search the AP US History textbook for relevant passages.
    Use this for any question about US history before answering.
    Always cite sources inline and list all sources at the end.
    Bold or emphasize the most important phrases in your answer.
    """
    retrieved = retrieve(query)
    sources = retrieved["sources"]

    if not sources:
        return "No relevant passages found in the textbook."

    header = f"[Confidence: {retrieved['confidence']} | Top score: {retrieved['top_score']:.3f}]\n\n"

    # Full passage text, one block per source, each under a citation header.
    passages = "\n\n---\n\n".join(
        f"[SOURCE {i+1} | {_source_label(s)} | score: {s['score']:.3f}]\n{s['text']}"
        for i, s in enumerate(sources)
    )

    # Compact numbered list of the same sources, without passage text.
    footer = "\n\n===SOURCES===\n" + "\n".join(
        f"[{i+1}] {_source_label(s)} (score: {s['score']:.3f})"
        for i, s in enumerate(sources)
    )

    return header + passages + footer


# Docstring left untouched for the same reason as search_textbook above.
@mcp.prompt()
def system_prompt() -> str:
    """The APUSH tutor system prompt."""
    return SYSTEM_PROMPT


# ── Run ────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # Imported here so the server dependencies are only needed when this
    # file is run as a script.
    import uvicorn
    from starlette.middleware.cors import CORSMiddleware

    app = mcp.streamable_http_app()
    # NOTE(review): the wildcard CORS policy allows any origin, method and
    # header, so any web page open in a local browser can call this server.
    # Consider restricting allow_origins — confirm with owner.
    app.add_middleware(
        CORSMiddleware,
        allow_origins=["*"],
        allow_methods=["*"],
        allow_headers=["*"],
    )

    uvicorn.run(app, host="127.0.0.1", port=52437)