226 lines
10 KiB
Python
226 lines
10 KiB
Python
import os

# Must be set BEFORE any mcp imports so the transport layer sees it at
# import time; together with the patches below it relaxes Origin checking.
os.environ["MCP_ALLOW_ALL_ORIGINS"] = "1"

import re
import json
import numpy as np
from pathlib import Path
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.models import Filter, FieldCondition, MatchValue
from mcp.server.fastmcp import FastMCP
from mcp.server import streamable_http

# SECURITY NOTE(review): everything from here to the TransportSecurityMiddleware
# patch disables the MCP transport's Origin / DNS-rebinding protections.
# This is only tolerable because the server binds to 127.0.0.1 (see __main__);
# do not expose this process on a public interface.
streamable_http.ALLOWED_ORIGINS = None

import mcp.server.streamable_http as _sh
_sh.is_valid_origin = lambda origin, allowed: True  # accept ANY Origin header

import uvicorn
from starlette.middleware.cors import CORSMiddleware
from starlette.middleware.base import BaseHTTPMiddleware
from starlette.requests import Request
from mcp.server.transport_security import TransportSecuritySettings, TransportSecurityMiddleware

# Replace the middleware constructor outright so every instance is created
# with DNS-rebinding protection off, regardless of the settings callers pass.
TransportSecurityMiddleware.__init__ = lambda self, settings=None: setattr(
    self, "settings", TransportSecuritySettings(enable_dns_rebinding_protection=False)
)
|
||
|
||
# ── Paths ──────────────────────────────────────────────────────────────────
project_root = Path(__file__).resolve().parent

# ── Models / Clients ───────────────────────────────────────────────────────
# NOTE: the embedder load and Qdrant open happen at import time, so the first
# import of this module is slow and requires the model weights plus ./data to
# be present on disk.
model = SentenceTransformer("nomic-ai/nomic-embed-text-v1.5", trust_remote_code=True)

# Embedded (file-backed) Qdrant instance — no separate server process.
qdrant = QdrantClient(path=str(project_root / "data" / "qdrant_local"))
COLLECTION = "apush_chunks"

# Maps parent_id -> list of parts with a "text" field (see retrieve()), used
# to expand a retrieved child chunk back into its full parent passage.
with open(project_root / "data" / "processed" / "parent_lookup.json") as f:
    parent_lookup = json.load(f)

# ── Config ─────────────────────────────────────────────────────────────────
TOP_K = 6  # candidate chunks fetched from Qdrant per query
|
||
|
||
SYSTEM_PROMPT = """You are an elite AP US History tutor. Your only goal is to help the student master APUSH and score a 5.
|
||
|
||
━━━ TOOL USE ━━━
|
||
ALWAYS call search_textbook before answering any history question — no exceptions.
|
||
For complex questions (LEQ/DBQ/thematic), call it 2-3 times with different search angles to get full coverage.
|
||
|
||
━━━ CITATIONS ━━━
|
||
- Cite inline after every specific claim: (Ch5, p.153)
|
||
- The **bolded sentences** in each source are the most relevant — prioritize citing and building on those
|
||
- Never invent or guess a citation — if unsure, say "Outside textbook:"
|
||
- If the textbook is silent on something relevant, supplement with general knowledge, clearly labeled
|
||
|
||
━━━ ACCURACY ━━━
|
||
- Correct false premises immediately and directly — never reinforce a wrong assumption
|
||
- Distinguish causation from correlation, primary from secondary causes
|
||
- Note historiographical debates where relevant (e.g. revisionist vs traditional interpretations)
|
||
- Be precise with dates, names, legislation, and turning points — vagueness loses points on the exam
|
||
|
||
━━━ FORMAT — match the question type exactly ━━━
|
||
- Identification / one fact → one concise answer, one citation
|
||
- SAQ (Short Answer) → 3 tight paragraphs: claim → evidence → analysis. No intro/conclusion fluff
|
||
- LEQ (Long Essay) → Full essay: contextualization → thesis → 3 body paragraphs (each with specific evidence + analysis) → conclusion with complexity
|
||
- DBQ → Same as LEQ plus: sourcing, audience/purpose/context for docs, corroboration across docs
|
||
- Compare/contrast → Use parallel structure, explicit similarities AND differences
|
||
- General question → Clear prose, as long as needed, no padding
|
||
|
||
━━━ APUSH EXAM SKILLS ━━━
|
||
When writing essays, explicitly hit the College Board rubric:
|
||
- Contextualization: zoom out to broader historical context BEFORE the thesis
|
||
- Thesis: historically defensible, specific, addresses complexity (not just "there were many causes")
|
||
- Evidence: at least 2 specific pieces of evidence per body paragraph
|
||
- Analysis: explain HOW and WHY, not just what happened
|
||
- Complexity: demonstrate nuance — turning points, continuity vs change, multiple causation, or cross-period connections
|
||
|
||
━━━ END EVERY RESPONSE WITH ━━━
|
||
---
|
||
**Sources Used:**
|
||
[list each source: Ch# › Section › p.### — score: X.XXX]
|
||
**Retrieval Confidence:** HIGH / MEDIUM / LOW
|
||
**Exam Tip:** [one sentence of targeted advice for how this topic typically appears on the APUSH exam]"""
|
||
|
||
# ── Embed ──────────────────────────────────────────────────────────────────
|
||
def embed_query(query: str) -> np.ndarray:
    """Embed *query* using the nomic "search_query:" task prefix.

    Returns an L2-normalized vector, so a plain dot product against other
    normalized embeddings gives cosine similarity.
    """
    prefixed = f"search_query: {query}"
    return model.encode(prefixed, normalize_embeddings=True)
|
||
|
||
# ── Highlight ──────────────────────────────────────────────────────────────
|
||
def highlight_passage(query_emb: np.ndarray, passage: str) -> str:
    """Bold the (up to) 3 sentences of *passage* most relevant to the query.

    Reuses the query embedding computed during retrieval, so the only extra
    model work is one batched encode of the passage's sentences.

    Args:
        query_emb: L2-normalized query embedding from embed_query().
        passage: Plain-text passage to annotate.

    Returns:
        The passage with the top sentences wrapped in ``**...**``, or the
        passage unchanged when no sentence is long enough to score.
    """
    # Split on terminal punctuation; fragments of <= 20 chars are skipped —
    # they are rarely informative and would skew the similarity ranking.
    sentences = [s.strip() for s in re.split(r'(?<=[.!?])\s+', passage) if len(s.strip()) > 20]
    if not sentences:
        return passage

    sent_embs = model.encode(
        [f"search_document: {s}" for s in sentences],
        normalize_embeddings=True,
        batch_size=32,
        show_progress_bar=False,
    )

    scores = np.asarray(sent_embs) @ query_emb  # cosine sim (both normalized)

    # FIX: select exactly the top-N indices via argsort instead of comparing
    # every score against the N-th best — the old threshold test bolded ALL
    # tied sentences and could exceed the documented limit of 3.
    top_n = min(3, len(sentences))
    top_idx = set(np.argsort(scores)[-top_n:].tolist())

    highlighted = passage
    for i, sent in enumerate(sentences):
        if i in top_idx:
            # avoid double-bolding if somehow already bolded
            if f"**{sent}**" not in highlighted:
                # FIX: count=1 so a sentence duplicated in the passage is
                # bolded once, not rewritten at every occurrence.
                highlighted = highlighted.replace(sent, f"**{sent}**", 1)

    return highlighted
|
||
|
||
# ── Retrieve ───────────────────────────────────────────────────────────────
|
||
def retrieve(query: str, top_k: int = TOP_K, max_sources: int = 4) -> dict:
    """Retrieve, deduplicate, and highlight textbook passages for *query*.

    Args:
        query: Natural-language question from the student/LLM.
        top_k: Candidate chunks to fetch from Qdrant (default: TOP_K).
        max_sources: Maximum distinct parent passages returned after
            deduplication (default 4, the original hard-coded cap).

    Returns:
        dict with keys:
            query       — the original query string
            confidence  — "HIGH" / "MEDIUM" / "LOW" from the best hit's score
            top_score   — best hit's similarity (0 when no hits)
            sources     — list of dicts: score, chapter/section/page
                          metadata, and the highlighted passage text
    """
    query_emb = embed_query(query)  # compute once, reuse for highlighting

    # Chapter-review chunks are filtered out via must_not.
    hits = qdrant.query_points(
        collection_name=COLLECTION,
        query=query_emb.tolist(),
        limit=top_k,
        query_filter=Filter(
            must_not=[
                FieldCondition(key="is_chapter_review", match=MatchValue(value=True))
            ]
        ),
    ).points

    # Heuristic similarity cut-offs for the confidence label.
    top_score = hits[0].score if hits else 0
    if top_score >= 0.70:
        confidence = "HIGH"
    elif top_score >= 0.50:
        confidence = "MEDIUM"
    else:
        confidence = "LOW"

    # Keep only the first (highest-scoring) chunk per parent passage.
    seen_parents = set()
    unique_hits = []
    for h in hits:
        pid = h.payload["parent_id"]
        if pid not in seen_parents:
            seen_parents.add(pid)
            unique_hits.append(h)

    unique_hits = unique_hits[:max_sources]

    sources = []
    for h in unique_hits:
        pid = h.payload["parent_id"]
        # Expand the child chunk into its full parent passage for context;
        # missing parents degrade gracefully to an empty passage.
        parts = parent_lookup.get(pid, [])
        full_text = "\n\n".join(p["text"] for p in parts)
        highlighted = highlight_passage(query_emb, full_text)  # reuse query_emb

        sources.append({
            "score": h.score,
            "chapter_num": h.payload["chapter_num"],
            "chapter_title": h.payload["chapter_title"],
            "section_title": h.payload["section_title"],
            "textbook_page": h.payload["textbook_page"],
            "text": highlighted,
        })

    return {
        "query": query,
        "confidence": confidence,
        "top_score": top_score,
        "sources": sources,
    }
|
||
|
||
# ── Origin bypass middleware ────────────────────────────────────────────────
class AllowAllOriginsMiddleware(BaseHTTPMiddleware):
    # SECURITY NOTE(review): rewrites every request's Origin header to a
    # loopback value, defeating any downstream Origin validation. Only
    # tolerable while the server binds to 127.0.0.1.
    async def dispatch(self, request: Request, call_next):
        # NOTE(review): this relies on Starlette caching parsed headers in
        # the private `request._headers` attribute so `request.headers`
        # reflects the change — confirm this holds across Starlette versions.
        request._headers = request.headers.mutablecopy()
        request._headers["origin"] = "http://127.0.0.1:11434"
        return await call_next(request)
|
||
|
||
# ── MCP Server ─────────────────────────────────────────────────────────────
# SYSTEM_PROMPT is advertised to connecting clients as the server's
# instructions; the tutoring behavior lives in the client LLM, not here.
mcp = FastMCP("APUSH Tutor", instructions=SYSTEM_PROMPT)
|
||
|
||
@mcp.tool()
def search_textbook(query: str) -> str:
    """
    Search the AP US History textbook for relevant passages.
    Use this for any question about US history before answering.
    Always cite sources inline and list all sources at the end.
    Bold or emphasize the most important phrases in your answer.
    """
    result = retrieve(query)
    sources = result["sources"]

    if not sources:
        return "No relevant passages found in the textbook."

    # Overall retrieval quality shown once at the top of the tool output.
    header = f"[Confidence: {result['confidence']} | Top score: {result['top_score']:.3f}]\n\n"

    # Build the passage blocks and the citation index in a single pass; both
    # share the same "Ch › section › page" locator string.
    passage_blocks = []
    citation_lines = []
    for idx, src in enumerate(sources, start=1):
        locator = f"Ch{src['chapter_num']} › {src['section_title']} › p.{src['textbook_page']}"
        passage_blocks.append(
            f"[SOURCE {idx} | {locator} | score: {src['score']:.3f}]\n{src['text']}"
        )
        citation_lines.append(f"[{idx}] {locator} (score: {src['score']:.3f})")

    footer = "\n\n===SOURCES===\n" + "\n".join(citation_lines)
    return header + "\n\n---\n\n".join(passage_blocks) + footer
|
||
|
||
# ── Run ────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # Streamable-HTTP ASGI app from FastMCP, wrapped with the Origin rewrite
    # and wide-open CORS configured below.
    app = mcp.streamable_http_app()
    app.add_middleware(AllowAllOriginsMiddleware)
    app.add_middleware(
        CORSMiddleware,
        allow_origins=["*"],  # wide-open CORS — acceptable only on loopback
        allow_methods=["*"],
        allow_headers=["*"],
    )

    # Loopback bind only; the security bypasses above assume this.
    print("Starting APUSH MCP server on http://127.0.0.1:52437/mcp")
    uvicorn.run(app, host="127.0.0.1", port=52437)