finished beta

2026-03-30 19:38:37 -05:00
parent c6bc1216df
commit ebe592a6b1
4 changed files with 1186 additions and 1 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -2,6 +2,7 @@
 data/raw/
 data/processed/
 data/vectorstore/
+data/qdrant_local/
 *.pdf
 *.epub
 *.mobi
--- a/notebooks/embed.ipynb
+++ b/notebooks/embed.ipynb
@@ -0,0 +1,401 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "1b771833",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/keshav/code/apush-rag/.venv/lib/python3.14/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+      "  from .autonotebook import tqdm as notebook_tqdm\n"
+     ]
+    }
+   ],
+   "source": [
+    "import json\n",
+    "from pathlib import Path\n",
+    "from tqdm import tqdm\n",
+    "from sentence_transformers import SentenceTransformer\n",
+    "from qdrant_client import QdrantClient\n",
+    "from qdrant_client.models import Distance, VectorParams, PointStruct"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "4ffbf396",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Loaded 5530 child chunks\n",
+      "Sample: ch01_s000_p000 — Chapter 1: Old Worlds and New\n"
+     ]
+    }
+   ],
+   "source": [
+    "project_root = Path().resolve().parent\n",
+    "children_path = project_root / \"data\" / \"processed\" / \"chunks_children.json\"\n",
+    "\n",
+    "with open(children_path) as f:\n",
+    "    children = json.load(f)\n",
+    "\n",
+    "print(f\"Loaded {len(children)} child chunks\")\n",
+    "print(f\"Sample: {children[0]['chunk_id']} — {children[0]['section_title']}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "da5ff826",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "2.11.0+cu130\n",
+      "True\n",
+      "NVIDIA GeForce RTX 5080\n"
+     ]
+    }
+   ],
+   "source": [
+    "import torch\n",
+    "print(torch.__version__)\n",
+    "print(torch.cuda.is_available())       # must be True\n",
+    "print(torch.cuda.get_device_name(0))   # should say RTX 5080"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "6261dd22",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "<All keys matched successfully>\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Model loaded ✓\n",
+      "Device: cuda:0\n"
+     ]
+    }
+   ],
+   "source": [
+    "model = SentenceTransformer(\"nomic-ai/nomic-embed-text-v1.5\", trust_remote_code=True)\n",
+    "# First run downloads ~300MB then caches it\n",
+    "# Will automatically use your GPU if torch sees it\n",
+    "\n",
+    "VECTOR_DIM = 768\n",
+    "COLLECTION = \"apush_chunks\"\n",
+    "\n",
+    "print(\"Model loaded ✓\")\n",
+    "print(f\"Device: {model.device}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "7bde5821",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Query vector dim: 768 ✓\n"
+     ]
+    }
+   ],
+   "source": [
+    "def embed_batch(texts: list[str]) -> list[list[float]]:\n",
+    "    prefixed = [f\"search_document: {t}\" for t in texts]\n",
+    "    return model.encode(\n",
+    "        prefixed,\n",
+    "        normalize_embeddings=True,\n",
+    "        batch_size=64,\n",
+    "        show_progress_bar=False,\n",
+    "    ).tolist()\n",
+    "\n",
+    "def embed_query(query: str) -> list[float]:\n",
+    "    return model.encode(\n",
+    "        f\"search_query: {query}\",\n",
+    "        normalize_embeddings=True,\n",
+    "    ).tolist()\n",
+    "\n",
+    "# Quick test\n",
+    "test_vec = embed_query(\"causes of the American Revolution\")\n",
+    "print(f\"Query vector dim: {len(test_vec)} ✓\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "9812826b",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Collection created ✓\n"
+     ]
+    }
+   ],
+   "source": [
+    "qdrant = QdrantClient(path=str(project_root / \"data\" / \"qdrant_local\"))\n",
+    "\n",
+    "# Delete and recreate if it already exists\n",
+    "if qdrant.collection_exists(COLLECTION):\n",
+    "    qdrant.delete_collection(COLLECTION)\n",
+    "    print(\"Deleted existing collection\")\n",
+    "\n",
+    "qdrant.create_collection(\n",
+    "    collection_name=COLLECTION,\n",
+    "    vectors_config=VectorParams(size=VECTOR_DIM, distance=Distance.COSINE),\n",
+    ")\n",
+    "print(\"Collection created ✓\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "e40d06a0",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Embedding: 100%|██████████| 87/87 [00:18<00:00,  4.78it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Indexed 5530 chunks ✓\n"
+     ]
+    }
+   ],
+   "source": [
+    "BATCH_SIZE = 64\n",
+    "points = []\n",
+    "\n",
+    "for i in tqdm(range(0, len(children), BATCH_SIZE), desc=\"Embedding\"):\n",
+    "    batch   = children[i : i + BATCH_SIZE]\n",
+    "    vectors = embed_batch([c[\"text\"] for c in batch])\n",
+    "\n",
+    "    for j, (chunk, vector) in enumerate(zip(batch, vectors)):\n",
+    "        points.append(PointStruct(\n",
+    "            id      = i + j,\n",
+    "            vector  = vector,\n",
+    "            payload = {\n",
+    "                \"chunk_id\":          chunk[\"chunk_id\"],\n",
+    "                \"parent_id\":         chunk[\"parent_id\"],\n",
+    "                \"chapter_num\":       chunk[\"chapter_num\"],\n",
+    "                \"chapter_title\":     chunk[\"chapter_title\"],\n",
+    "                \"section_title\":     chunk[\"section_title\"],\n",
+    "                \"textbook_page\":     chunk[\"textbook_page\"],\n",
+    "                \"is_chapter_review\": chunk[\"is_chapter_review\"],\n",
+    "                \"text\":              chunk[\"text\"],   # store text too for quick access\n",
+    "            }\n",
+    "        ))\n",
+    "\n",
+    "# Upload in one shot\n",
+    "qdrant.upsert(collection_name=COLLECTION, points=points)\n",
+    "print(f\"\\nIndexed {len(points)} chunks ✓\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "0b21d67f",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Vectors in collection: 5530\n",
+      "Expected:              5530\n",
+      "Match: True ✓\n"
+     ]
+    }
+   ],
+   "source": [
+    "info = qdrant.get_collection(COLLECTION)\n",
+    "count = info.points_count\n",
+    "print(f\"Vectors in collection: {count}\")\n",
+    "print(f\"Expected:              {len(children)}\")\n",
+    "print(f\"Match: {count == len(children)} ✓\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "0766bbd2",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Query: 'causes of the American Revolution'\n",
+      "  0.767 | Ch5 › The Coming of Independence › p.153\n",
+      "  0.737 | Ch6 › Democratizing Freedom › p.178\n",
+      "\n",
+      "Query: 'how did slavery expand in the south'\n",
+      "  0.814 | Ch11 › The Old South › p.327\n",
+      "  0.802 | Ch9 › The Rise of the West › p.265\n",
+      "\n",
+      "Query: 'what was the New Deal'\n",
+      "  0.779 | Ch21 › The Second New Deal › p.691\n",
+      "  0.762 | Ch23 › The Truman Presidency › p.765\n",
+      "\n",
+      "Query: 'civil rights movement Rosa Parks'\n",
+      "  0.801 | Ch24 › The Freedom Movement › p.799\n",
+      "  0.756 | Ch24 › Chapter Review › p.810\n",
+      "\n",
+      "Query: 'manifest destiny westward expansion'\n",
+      "  0.795 | Ch9 › The Free Individual › p.279\n",
+      "  0.730 | Ch17 › Becoming a World Power › p.558\n"
+     ]
+    }
+   ],
+   "source": [
+    "from qdrant_client.models import QueryRequest\n",
+    "\n",
+    "for q in test_queries:\n",
+    "    results = qdrant.query_points(\n",
+    "        collection_name=COLLECTION,\n",
+    "        query=embed_query(q),\n",
+    "        limit=2,\n",
+    "    ).points\n",
+    "\n",
+    "    print(f\"\\nQuery: '{q}'\")\n",
+    "    for r in results:\n",
+    "        p = r.payload\n",
+    "        print(f\"  {r.score:.3f} | Ch{p['chapter_num']} › {p['section_title']} › p.{p['textbook_page']}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "1832d0c6",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Ready for notebook 3 — query pipeline ✓\n",
+      "Qdrant db path: /home/keshav/code/apush-rag/data/qdrant_local\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Nothing to save — just note the query prefix for notebook 3\n",
+    "# When searching always use:  embed_query(\"your question here\")\n",
+    "# The model lives at:         nomic-ai/nomic-embed-text-v1.5\n",
+    "# The collection is at:       project_root / data / qdrant_local\n",
+    "# Collection name:            apush_chunks\n",
+    "\n",
+    "print(\"Ready for notebook 3 — query pipeline ✓\")\n",
+    "print(f\"Qdrant db path: {project_root / 'data' / 'qdrant_local'}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "fd5119ca",
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "NameError",
+     "evalue": "name 'qdrant' is not defined",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
+      "\u001b[31mNameError\u001b[39m                                 Traceback (most recent call last)",
+      "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[2]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m qdrant.close()\n",
+      "\u001b[31mNameError\u001b[39m: name 'qdrant' is not defined"
+     ]
+    }
+   ],
+   "source": [
+    "qdrant.close()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "16058429",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "VRAM free: 11.2 GB\n"
+     ]
+    },
+    {
+     "ename": "",
+     "evalue": "",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[1;31mThe Kernel crashed while executing code in the current cell or a previous cell. \n",
+      "\u001b[1;31mPlease review the code in the cell(s) to identify a possible cause of the failure. \n",
+      "\u001b[1;31mClick <a href='https://aka.ms/vscodeJupyterKernelCrash'>here</a> for more info. \n",
+      "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
+     ]
+    }
+   ],
+   "source": [
+    "import torch\n",
+    "\n",
+    "del model  # delete the sentence transformer\n",
+    "torch.cuda.empty_cache()\n",
+    "print(f\"VRAM free: {torch.cuda.mem_get_info()[0]/1e9:.1f} GB\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.14.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/notebooks/pdf_parse.ipynb
+++ b/notebooks/pdf_parse.ipynb
@@ -1394,7 +1394,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": null,
   "id": "0ef0c08a",
   "metadata": {},
   "outputs": [
@@ -1414,6 +1414,17 @@
      "  Parent parts found: 1\n",
      "  Citation:    Chapter 2: European Colonies and Native Nations, 1600⠍1660 › New Englanders Divided\n"
     ]
+    },
+    {
+     "ename": "",
+     "evalue": "",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[1;31mThe Kernel crashed while executing code in the current cell or a previous cell. \n",
+      "\u001b[1;31mPlease review the code in the cell(s) to identify a possible cause of the failure. \n",
+      "\u001b[1;31mClick <a href='https://aka.ms/vscodeJupyterKernelCrash'>here</a> for more info. \n",
+      "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
+     ]
    }
   ],
   "source": [
--- a/notebooks/query.ipynb
+++ b/notebooks/query.ipynb