yes

fixed and added mcp
working
2026-04-12 23:56:49 -05:00 · 2026-04-12 23:13:16 -05:00 · 2026-04-12 22:50:16 -05:00
4 changed files with 737 additions and 200 deletions
--- a/mcp_server.py
+++ b/mcp_server.py
@@ -0,0 +1,179 @@
 import os
 # Set before any mcp module is imported (the mcp imports follow further down).
 # NOTE(review): nothing in this file reads MCP_ALLOW_ALL_ORIGINS; confirm the
 # mcp package actually consults it, otherwise this assignment is dead.
 os.environ["MCP_ALLOW_ALL_ORIGINS"] = "1"
 import json
 from pathlib import Path
 from sentence_transformers import SentenceTransformer
 from qdrant_client import QdrantClient
 from qdrant_client.models import Filter, FieldCondition, MatchValue
 from mcp.server.fastmcp import FastMCP
 # First attempt at switching off origin checking in the streamable-HTTP
 # transport: overwrite a module-level attribute with None.
 # NOTE(review): this only has an effect if mcp.server.streamable_http reads
 # ALLOWED_ORIGINS; that cannot be verified from this file — confirm or delete.
 from mcp.server import streamable_http
 streamable_http.ALLOWED_ORIGINS = None  # clear the allow-list
 # Second attempt on the same module (imported again under the alias _sh):
 # install a check function that accepts every origin.
 # NOTE(review): likewise depends on the library calling is_valid_origin — confirm.
 import mcp.server.streamable_http as _sh
 _sh.is_valid_origin = lambda origin, allowed: True
 import uvicorn
 from starlette.middleware.cors import CORSMiddleware
 from starlette.middleware.base import BaseHTTPMiddleware
 from starlette.requests import Request
 from mcp.server.transport_security import TransportSecuritySettings, TransportSecurityMiddleware
 # Monkey-patch: replace TransportSecurityMiddleware.__init__ so that every
 # instance ignores the settings it is given and instead stores settings built
 # with enable_dns_rebinding_protection=False.
 # SECURITY: this applies process-wide and disables DNS rebinding protection
 # for every transport created after this line; keep it to local development.
 # NOTE(review): the replacement sets only `self.settings`; anything else the
 # real __init__ would initialise is skipped — confirm nothing else is needed.
 TransportSecurityMiddleware.__init__ = lambda self, settings=None: setattr(
    self, "settings", TransportSecuritySettings(enable_dns_rebinding_protection=False)
 )
 # ── Paths ──────────────────────────────────────────────────────────────────
 # Directory containing this file; every data path below is resolved against it.
 project_root = Path(__file__).resolve().parent
 # ── Models / Clients ───────────────────────────────────────────────────────
 # NOTE(review): trust_remote_code=True permits code shipped with the model
 # repository to run at load time — consider pinning a revision.
 model = SentenceTransformer("nomic-ai/nomic-embed-text-v1.5", trust_remote_code=True)
 # Qdrant client opened on a local on-disk path under data/ (no server URL is given).
 qdrant     = QdrantClient(path=str(project_root / "data" / "qdrant_local"))
 COLLECTION = "apush_chunks"
 # Maps parent_id -> list of parts, each carrying a "text" field (see retrieve()).
 # Decoded explicitly as UTF-8: without `encoding`, open() falls back to the
 # locale's default codec, which can fail or garble non-ASCII textbook text.
 with open(
    project_root / "data" / "processed" / "parent_lookup.json",
    encoding="utf-8",
 ) as f:
    parent_lookup = json.load(f)
 # ── Config ─────────────────────────────────────────────────────────────────
 # Number of hits requested from Qdrant per query; retrieve() de-duplicates
 # them by parent afterwards, so fewer sources may be returned.
 TOP_K = 10
 # Prompt text returned verbatim by the system_prompt() MCP prompt. It is
 # runtime text sent to the model, so edit it deliberately: it names the
 # search_textbook tool and the citation / footer format the tool output feeds.
 SYSTEM_PROMPT = """You are an expert AP US History tutor helping a student ace their APUSH exam.
 You have access to the search_textbook tool. Call it before answering ANY history question.
 ANSWERING:
 - Cite inline like (Ch5, p.153) after every specific claim
 - **Bold** key terms, dates, names, and critical facts
 - Correct false premises directly — don't reinforce wrong assumptions
 - If the textbook doesn't cover it, answer from general knowledge and prefix with "Outside textbook:"
 FORMAT — match the question type:
 - One word/fact → one word
 - SAQ → 1 focused paragraph, dense with evidence
 - LEQ/DBQ → full essay: context, thesis, body paragraphs with evidence, nuance
 - General question → clear prose, as long as needed
 END EVERY RESPONSE WITH:
 ---
 **Sources Used:**
 [list every source from the tool output with chapter, section, page, and score]
 **Retrieval Confidence:** HIGH/MEDIUM/LOW"""
 # ── Embed ──────────────────────────────────────────────────────────────────
 def embed_query(query: str) -> list[float]:
    """Return the normalized embedding of *query* as a plain list of floats.

    The text is prefixed with ``"search_query: "`` before encoding
    (presumably the query-side task prefix the embedding model expects —
    confirm against the model card).
    """
    prefixed = f"search_query: {query}"
    vector = model.encode(prefixed, normalize_embeddings=True)
    return vector.tolist()
 # ── Retrieve ───────────────────────────────────────────────────────────────
 def retrieve(query: str) -> dict:
    """Search the textbook collection for *query* and return parent-level passages.

    The query is embedded with embed_query() and sent to Qdrant (collection
    COLLECTION, at most TOP_K hits, skipping points whose
    ``is_chapter_review`` payload field is true).  Hits are collapsed to the
    first hit seen per ``parent_id`` in the order Qdrant returned them, capped
    at five parents, and each parent's full text is rebuilt by joining its
    parts from ``parent_lookup``.

    Returns a dict with the keys ``query``, ``confidence`` ("HIGH", "MEDIUM"
    or "LOW", derived from the first hit's score), ``top_score`` and
    ``sources`` (one dict per kept parent).
    """
    response = qdrant.query_points(
        collection_name=COLLECTION,
        query=embed_query(query),
        limit=TOP_K,
        query_filter=Filter(
            must_not=[
                FieldCondition(key="is_chapter_review", match=MatchValue(value=True))
            ]
        ),
    )
    ranked = response.points
    # The first hit's score drives the confidence label; 0 when nothing matched.
    best = ranked[0].score if ranked else 0
    confidence = next(
        (label for floor, label in ((0.70, "HIGH"), (0.50, "MEDIUM")) if best >= floor),
        "LOW",
    )
    # Keep only the first hit for each parent; dicts preserve insertion order.
    first_per_parent = {}
    for hit in ranked:
        first_per_parent.setdefault(hit.payload["parent_id"], hit)
    kept = list(first_per_parent.values())[:5]
    def as_source(hit):
        # A parent missing from parent_lookup yields an empty text.
        meta = hit.payload
        sections = parent_lookup.get(meta["parent_id"], [])
        body = "\n\n".join(section["text"] for section in sections)
        return {
            "score":         hit.score,
            "chapter_num":   meta["chapter_num"],
            "chapter_title": meta["chapter_title"],
            "section_title": meta["section_title"],
            "textbook_page": meta["textbook_page"],
            "text":          body,
        }
    return {
        "query":      query,
        "confidence": confidence,
        "top_score":  best,
        "sources":    [as_source(hit) for hit in kept],
    }
 # ── Origin bypass middleware ────────────────────────────────────────────────
 class AllowAllOriginsMiddleware(BaseHTTPMiddleware):
    """Present a fixed, local ``Origin`` header to the wrapped application.

    Purpose: let FastMCP's internal origin check pass regardless of the
    caller's real ``Origin``.

    The header is rewritten in ``request.scope["headers"]`` rather than on the
    ``Request`` object: BaseHTTPMiddleware invokes the wrapped app with the
    ASGI connection scope, so replacing the Request's cached header view
    alone never reaches the downstream app.

    SECURITY: this deliberately defeats Origin validation for every request.
    Keep it to local development on a loopback-bound server.
    """
    # Origin shown to the wrapped app; ASGI header values are byte strings.
    SPOOFED_ORIGIN = b"http://127.0.0.1:11434"
    async def dispatch(self, request: Request, call_next):
        """Replace the request's Origin header in the ASGI scope, then continue."""
        scope = request.scope
        # ASGI carries headers as (lower-cased name, value) byte pairs: drop any
        # Origin sent by the client and append the spoofed one.
        forwarded = [
            (name, value)
            for name, value in scope["headers"]
            if name.lower() != b"origin"
        ]
        forwarded.append((b"origin", self.SPOOFED_ORIGIN))
        scope["headers"] = forwarded
        return await call_next(request)
 # ── MCP Server ─────────────────────────────────────────────────────────────
 mcp = FastMCP("APUSH Tutor")
 # The docstring of search_textbook is kept verbatim: FastMCP presumably
 # publishes it to clients as the tool description — confirm before editing.
 @mcp.tool()
 def search_textbook(query: str) -> str:
    """
    Search the AP US History textbook for relevant passages.
    Use this for any question about US history before answering.
    Always cite sources inline and list all sources at the end.
    Bold or emphasize the most important phrases in your answer.
    """
    # Output layout: a confidence banner, the passages separated by "---",
    # then a "===SOURCES===" index repeating each passage's location and score.
    result = retrieve(query)
    found = result["sources"]
    if not found:
        return "No relevant passages found in the textbook."
    banner = f"[Confidence: {result['confidence']} | Top score: {result['top_score']:.3f}]\n\n"
    blocks = []
    index_lines = []
    for number, src in enumerate(found, start=1):
        # Location and score are formatted once and shared by both sections.
        where = f"Ch{src['chapter_num']} › {src['section_title']} › p.{src['textbook_page']}"
        rating = f"{src['score']:.3f}"
        blocks.append(f"[SOURCE {number} | {where} | score: {rating}]\n{src['text']}")
        index_lines.append(f"[{number}] {where} (score: {rating})")
    return banner + "\n\n---\n\n".join(blocks) + "\n\n===SOURCES===\n" + "\n".join(index_lines)
# Registers SYSTEM_PROMPT with the MCP server as a prompt via @mcp.prompt().
# The docstring is left unchanged: FastMCP presumably uses it as the prompt's
# description shown to clients — confirm before rewording it.
@mcp.prompt()
 def system_prompt() -> str:
    """The APUSH tutor system prompt."""
    return SYSTEM_PROMPT
 # ── Run ────────────────────────────────────────────────────────────────────
 if __name__ == "__main__":
    # Script entry point: build the streamable-HTTP ASGI app, attach the two
    # middlewares, and serve it with uvicorn on the loopback interface.
    app = mcp.streamable_http_app()
    # Registration order is unchanged: origin rewriting first, CORS second.
    app.add_middleware(AllowAllOriginsMiddleware)
    # CORS is fully permissive: any origin, method and header is allowed.
    cors_options = dict(
        allow_origins=["*"],
        allow_methods=["*"],
        allow_headers=["*"],
    )
    app.add_middleware(CORSMiddleware, **cors_options)
    # NOTE(review): the /mcp path in the banner is presumably the default mount
    # path of streamable_http_app() — confirm if the path is ever customised.
    print("Starting APUSH MCP server on http://127.0.0.1:52437/mcp")
    uvicorn.run(app, host="127.0.0.1", port=52437)
--- a/notebooks/embed.ipynb
+++ b/notebooks/embed.ipynb
@@ -393,7 +393,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.14.3"
+   "version": "3.14.4"
  }
 },
 "nbformat": 4,
--- a/notebooks/pdf_parse.ipynb
+++ b/notebooks/pdf_parse.ipynb
@@ -1511,7 +1511,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.14.3"
+   "version": "3.14.4"
  }
 },
 "nbformat": 4,
--- a/notebooks/query.ipynb
+++ b/notebooks/query.ipynb
Author	SHA1	Message	Date
KeshavAnandCode	1511423057	yes	2026-04-12 23:56:49 -05:00
KeshavAnandCode	50b4c1c905	fixed and added mcp	2026-04-12 23:13:16 -05:00
KeshavAnandCode	3378ad7328	working	2026-04-12 22:50:16 -05:00