# apush-rag/mcp_server.py
import json
import re
from pathlib import Path
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.models import Filter, FieldCondition, MatchValue
from mcp.server.fastmcp import FastMCP
# ── Paths ──────────────────────────────────────────────────────────────────
project_root = Path(__file__).resolve().parent
# ── Models / Clients ───────────────────────────────────────────────────────
model = SentenceTransformer("nomic-ai/nomic-embed-text-v1.5", trust_remote_code=True)
qdrant = QdrantClient(path=str(project_root / "data" / "qdrant_local"))
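# Passing path= runs Qdrant in embedded local mode (on-disk storage, no separate server process).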
COLLECTION = "apush_chunks"
with open(project_root / "data" / "processed" / "parent_lookup.json") as f:
    parent_lookup = json.load(f)
# ── Config (same as notebook) ──────────────────────────────────────────────
TOP_K = 10
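# TOP_K is the number of chunk-level candidates fetched per query; retrieve()
# then deduplicates them down to at most 5 unique parent sections.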
SYSTEM_PROMPT = """You are an expert AP US History tutor helping a student ace their APUSH exam.
You have access to the search_textbook tool. Call it before answering ANY history question.
ANSWERING:
- Cite inline like (Ch5, p.153) after every specific claim
- **Bold** key terms, dates, names, and critical facts
- Correct false premises directly — don't reinforce wrong assumptions
- If the textbook doesn't cover it, answer from general knowledge and prefix with "Outside textbook:"
FORMAT — match the question type:
- One word/fact → one word
- SAQ → 1 focused paragraph, dense with evidence
- LEQ/DBQ → full essay: context, thesis, body paragraphs with evidence, nuance
- General question → clear prose, as long as needed
END EVERY RESPONSE WITH:
---
**Sources Used:**
[list every source from the tool output with chapter, section, page, and score]
**Retrieval Confidence:** HIGH/MEDIUM/LOW"""
# ── Embed ──────────────────────────────────────────────────────────────────
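# nomic-embed-text-v1.5 expects a task prefix on its inputs: "search_query: "
# marks retrieval queries (the matching document-side prefix is "search_document: ").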
def embed_query(query: str) -> list[float]:
    return model.encode(
        f"search_query: {query}",
        normalize_embeddings=True,
    ).tolist()
# ── Retrieve (same as notebook) ────────────────────────────────────────────
def retrieve(query: str) -> dict:
    hits = qdrant.query_points(
        collection_name=COLLECTION,
        query=embed_query(query),
        limit=TOP_K,
        query_filter=Filter(
            must_not=[
                FieldCondition(key="is_chapter_review", match=MatchValue(value=True))
            ]
        ),
    ).points
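    # Map the top hit's similarity score to a coarse confidence label; the
    # 0.70 / 0.50 cutoffs are heuristic, not calibrated probabilities.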
    top_score = hits[0].score if hits else 0
    if top_score >= 0.70:
        confidence = "HIGH"
    elif top_score >= 0.50:
        confidence = "MEDIUM"
    else:
        confidence = "LOW"
    # Deduplicate by parent_id
    seen_parents = set()
    unique_hits = []
    for h in hits:
        pid = h.payload["parent_id"]
        if pid not in seen_parents:
            seen_parents.add(pid)
            unique_hits.append(h)
    unique_hits = unique_hits[:5]
    sources = []
    for h in unique_hits:
        pid = h.payload["parent_id"]
        parts = parent_lookup.get(pid, [])
        full_text = "\n\n".join(p["text"] for p in parts)
        sources.append({
            "score": h.score,
            "chapter_num": h.payload["chapter_num"],
            "chapter_title": h.payload["chapter_title"],
            "section_title": h.payload["section_title"],
            "textbook_page": h.payload["textbook_page"],
            "text": full_text,
        })
    return {
        "query": query,
        "confidence": confidence,
        "top_score": top_score,
        "sources": sources,
    }
# ── MCP Server ─────────────────────────────────────────────────────────────
mcp = FastMCP("APUSH Tutor")
@mcp.tool()
def search_textbook(query: str) -> str:
    """
    Search the AP US History textbook for relevant passages.
    Use this for any question about US history before answering.
    Always cite sources inline and list all sources at the end.
    Bold or emphasize the most important phrases in your answer.
    """
    retrieved = retrieve(query)
    if not retrieved["sources"]:
        return "No relevant passages found in the textbook."
    header = f"[Confidence: {retrieved['confidence']} | Top score: {retrieved['top_score']:.3f}]\n\n"
    passages = "\n\n---\n\n".join(
        f"[SOURCE {i+1} | Ch{s['chapter_num']} {s['section_title']} p.{s['textbook_page']} | score: {s['score']:.3f}]\n{s['text']}"
        for i, s in enumerate(retrieved["sources"])
    )
    footer = "\n\n===SOURCES===\n" + "\n".join(
        f"[{i+1}] Ch{s['chapter_num']} {s['section_title']} p.{s['textbook_page']} (score: {s['score']:.3f})"
        for i, s in enumerate(retrieved["sources"])
    )
    return header + passages + footer
@mcp.prompt()
def system_prompt() -> str:
    """The APUSH tutor system prompt."""
    return SYSTEM_PROMPT
# ── Run ────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    import uvicorn
    from starlette.middleware.cors import CORSMiddleware

    app = mcp.streamable_http_app()
    app.add_middleware(
        CORSMiddleware,
        allow_origins=["*"],
        allow_methods=["*"],
        allow_headers=["*"],
    )
    uvicorn.run(app, host="127.0.0.1", port=52437)
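# FastMCP mounts the streamable HTTP transport at /mcp by default, so a client
# (e.g. the MCP Inspector or a desktop MCP host) would connect to
# http://127.0.0.1:52437/mcp. The port above is arbitrary; adjust as needed.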