almost finished

This commit is contained in:
2026-04-19 15:00:46 -05:00
parent 248317c959
commit 7935b01771

View File

@@ -37,48 +37,7 @@ with open(project_root / "data" / "processed" / "parent_lookup.json") as f:
parent_lookup = json.load(f)
# ── Config ─────────────────────────────────────────────────────────────────
TOP_K = 6
SYSTEM_PROMPT = """You are an elite AP US History tutor. Your only goal is to help the student master APUSH and score a 5.
━━━ TOOL USE ━━━
ALWAYS call search_textbook before answering any history question — no exceptions.
For complex questions (LEQ/DBQ/thematic), call it 2-3 times with different search angles to get full coverage.
━━━ CITATIONS ━━━
- Cite inline after every specific claim: (Ch5, p.153)
- The **bolded sentences** in each source are the most relevant — prioritize citing and building on those
- Never invent or guess a citation — if unsure, say "Outside textbook:"
- If the textbook is silent on something relevant, supplement with general knowledge, clearly labeled
━━━ ACCURACY ━━━
- Correct false premises immediately and directly — never reinforce a wrong assumption
- Distinguish causation from correlation, primary from secondary causes
- Note historiographical debates where relevant (e.g. revisionist vs traditional interpretations)
- Be precise with dates, names, legislation, and turning points — vagueness loses points on the exam
━━━ FORMAT — match the question type exactly ━━━
- Identification / one fact → one concise answer, one citation
- SAQ (Short Answer) → 3 tight paragraphs: claim → evidence → analysis. No intro/conclusion fluff
- LEQ (Long Essay) → Full essay: contextualization → thesis → 3 body paragraphs (each with specific evidence + analysis) → conclusion with complexity
- DBQ → Same as LEQ plus: sourcing, audience/purpose/context for docs, corroboration across docs
- Compare/contrast → Use parallel structure, explicit similarities AND differences
- General question → Clear prose, as long as needed, no padding
━━━ APUSH EXAM SKILLS ━━━
When writing essays, explicitly hit the College Board rubric:
- Contextualization: zoom out to broader historical context BEFORE the thesis
- Thesis: historically defensible, specific, addresses complexity (not just "there were many causes")
- Evidence: at least 2 specific pieces of evidence per body paragraph
- Analysis: explain HOW and WHY, not just what happened
- Complexity: demonstrate nuance — turning points, continuity vs change, multiple causation, or cross-period connections
━━━ END EVERY RESPONSE WITH ━━━
---
**Sources Used:**
[list each source: Ch# Section p.### — score: X.XXX]
**Retrieval Confidence:** HIGH / MEDIUM / LOW
**Exam Tip:** [one sentence of targeted advice for how this topic typically appears on the APUSH exam]"""
TOP_K = 10
# ── Embed ──────────────────────────────────────────────────────────────────
def embed_query(query: str) -> np.ndarray:
@@ -89,11 +48,6 @@ def embed_query(query: str) -> np.ndarray:
# ── Highlight ──────────────────────────────────────────────────────────────
def highlight_passage(query_emb: np.ndarray, passage: str) -> str:
"""
Bold the top 3 most query-relevant sentences using the already-loaded
embedder. Reuses the query embedding computed during retrieval — zero
extra model calls.
"""
sentences = [s.strip() for s in re.split(r'(?<=[.!?])\s+', passage) if len(s.strip()) > 20]
if not sentences:
return passage
@@ -105,14 +59,13 @@ def highlight_passage(query_emb: np.ndarray, passage: str) -> str:
show_progress_bar=False,
)
scores = sent_embs @ query_emb # cosine sim (both normalized)
scores = sent_embs @ query_emb
top_n = min(3, len(scores))
threshold = float(sorted(scores)[-top_n])
highlighted = passage
for sent, score in zip(sentences, scores):
if float(score) >= threshold:
# avoid double-bolding if somehow already bolded
if f"**{sent}**" not in highlighted:
highlighted = highlighted.replace(sent, f"**{sent}**")
@@ -120,7 +73,7 @@ def highlight_passage(query_emb: np.ndarray, passage: str) -> str:
# ── Retrieve ───────────────────────────────────────────────────────────────
def retrieve(query: str) -> dict:
query_emb = embed_query(query) # compute once, reuse for highlighting
query_emb = embed_query(query)
hits = qdrant.query_points(
collection_name=COLLECTION,
@@ -134,12 +87,7 @@ def retrieve(query: str) -> dict:
).points
top_score = hits[0].score if hits else 0
if top_score >= 0.70:
confidence = "HIGH"
elif top_score >= 0.50:
confidence = "MEDIUM"
else:
confidence = "LOW"
confidence = "HIGH" if top_score >= 0.70 else "MEDIUM" if top_score >= 0.50 else "LOW"
seen_parents = set()
unique_hits = []
@@ -149,14 +97,14 @@ def retrieve(query: str) -> dict:
seen_parents.add(pid)
unique_hits.append(h)
unique_hits = unique_hits[:4]
unique_hits = unique_hits[:5]
sources = []
for h in unique_hits:
pid = h.payload["parent_id"]
parts = parent_lookup.get(pid, [])
full_text = "\n\n".join(p["text"] for p in parts)
highlighted = highlight_passage(query_emb, full_text) # reuse query_emb
pid = h.payload["parent_id"]
parts = parent_lookup.get(pid, [])
full_text = "\n\n".join(p["text"] for p in parts)
highlighted = highlight_passage(query_emb, full_text)
sources.append({
"score": h.score,
@@ -174,7 +122,7 @@ def retrieve(query: str) -> dict:
"sources": sources,
}
# ── Origin bypass middleware ───────────────────────────────────────────────
# ── Origin bypass middleware ───────────────────────────────────────────────
class AllowAllOriginsMiddleware(BaseHTTPMiddleware):
async def dispatch(self, request: Request, call_next):
request._headers = request.headers.mutablecopy()
@@ -182,15 +130,16 @@ class AllowAllOriginsMiddleware(BaseHTTPMiddleware):
return await call_next(request)
# ── MCP Server ─────────────────────────────────────────────────────────────
mcp = FastMCP("APUSH Tutor", instructions=SYSTEM_PROMPT)
mcp = FastMCP("APUSH Tutor")
@mcp.tool()
def search_textbook(query: str) -> str:
"""
Search the AP US History textbook for relevant passages.
Use this for any question about US history before answering.
Always cite sources inline and list all sources at the end.
Bold or emphasize the most important phrases in your answer.
Call this before answering ANY US history question.
For broad topics call it multiple times with different search angles.
Returns passages with the most relevant sentences bolded.
Always cite inline (Ch#, p.###) and list sources at the end.
"""
retrieved = retrieve(query)