This repository was archived on 2026-04-23. You can view and clone its files, but you cannot open issues, create pull requests, or push commits.
Files
apush-rag/notebooks/embed.ipynb
2026-04-12 22:50:16 -05:00

402 lines
11 KiB
Plaintext
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "1b771833",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/keshav/code/apush-rag/.venv/lib/python3.14/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
}
],
"source": [
"import json\n",
"from pathlib import Path\n",
"from tqdm import tqdm\n",
"from sentence_transformers import SentenceTransformer\n",
"from qdrant_client import QdrantClient\n",
"from qdrant_client.models import Distance, VectorParams, PointStruct"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "4ffbf396",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Loaded 5530 child chunks\n",
"Sample: ch01_s000_p000 — Chapter 1: Old Worlds and New\n"
]
}
],
"source": [
"# Load the pre-chunked child passages produced by the chunking notebook.\n",
"project_root = Path().resolve().parent\n",
"children_path = project_root / \"data\" / \"processed\" / \"chunks_children.json\"\n",
"\n",
"children = json.loads(children_path.read_text())\n",
"\n",
"print(f\"Loaded {len(children)} child chunks\")\n",
"print(f\"Sample: {children[0]['chunk_id']} — {children[0]['section_title']}\")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "da5ff826",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2.11.0+cu130\n",
"True\n",
"NVIDIA GeForce RTX 5080\n"
]
}
],
"source": [
"import torch\n",
"\n",
"# Sanity-check the CUDA toolchain before embedding ~5.5k chunks on GPU:\n",
"# expect a +cuXXX build string, True, and the local GPU's name.\n",
"for diagnostic in (torch.__version__, torch.cuda.is_available(), torch.cuda.get_device_name(0)):\n",
"    print(diagnostic)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "6261dd22",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"<All keys matched successfully>\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Model loaded ✓\n",
"Device: cuda:0\n"
]
}
],
"source": [
"# Embedding model + collection constants.\n",
"# The first call downloads ~300MB of weights, then serves from the HF cache;\n",
"# sentence-transformers places the model on the GPU automatically when torch sees one.\n",
"VECTOR_DIM = 768             # nomic-embed-text-v1.5 output dimensionality\n",
"COLLECTION = \"apush_chunks\"  # Qdrant collection name, reused in notebook 3\n",
"\n",
"model = SentenceTransformer(\"nomic-ai/nomic-embed-text-v1.5\", trust_remote_code=True)\n",
"\n",
"print(\"Model loaded ✓\")\n",
"print(f\"Device: {model.device}\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "7bde5821",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Query vector dim: 768 ✓\n"
]
}
],
"source": [
"def embed_batch(texts: list[str]) -> list[list[float]]:\n",
"    \"\"\"Embed document chunks; nomic models require the 'search_document:' prefix.\"\"\"\n",
"    docs = [\"search_document: \" + t for t in texts]\n",
"    vectors = model.encode(\n",
"        docs,\n",
"        normalize_embeddings=True,\n",
"        batch_size=64,\n",
"        show_progress_bar=False,\n",
"    )\n",
"    return vectors.tolist()\n",
"\n",
"\n",
"def embed_query(query: str) -> list[float]:\n",
"    \"\"\"Embed a search query; asymmetric retrieval uses the 'search_query:' prefix.\"\"\"\n",
"    vector = model.encode(f\"search_query: {query}\", normalize_embeddings=True)\n",
"    return vector.tolist()\n",
"\n",
"\n",
"# Quick test\n",
"test_vec = embed_query(\"causes of the American Revolution\")\n",
"print(f\"Query vector dim: {len(test_vec)} ✓\")"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "9812826b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Collection created ✓\n"
]
}
],
"source": [
"# Embedded (local-file) Qdrant: vectors persist on disk under data/qdrant_local.\n",
"qdrant = QdrantClient(path=str(project_root / \"data\" / \"qdrant_local\"))\n",
"\n",
"# Start from a clean slate so re-running the notebook never double-indexes.\n",
"if qdrant.collection_exists(COLLECTION):\n",
"    qdrant.delete_collection(COLLECTION)\n",
"    print(\"Deleted existing collection\")\n",
"\n",
"qdrant.create_collection(\n",
"    collection_name=COLLECTION,\n",
"    vectors_config=VectorParams(size=VECTOR_DIM, distance=Distance.COSINE),\n",
")\n",
"print(\"Collection created ✓\")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "e40d06a0",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Embedding: 100%|██████████| 87/87 [00:18<00:00, 4.78it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Indexed 5530 chunks ✓\n"
]
}
],
"source": [
"# Embed every child chunk in batches and collect Qdrant points.\n",
"BATCH_SIZE = 64\n",
"points: list[PointStruct] = []\n",
"\n",
"for start in tqdm(range(0, len(children), BATCH_SIZE), desc=\"Embedding\"):\n",
"    batch = children[start : start + BATCH_SIZE]\n",
"    vectors = embed_batch([c[\"text\"] for c in batch])\n",
"\n",
"    for offset, (chunk, vector) in enumerate(zip(batch, vectors)):\n",
"        payload = {\n",
"            \"chunk_id\": chunk[\"chunk_id\"],\n",
"            \"parent_id\": chunk[\"parent_id\"],\n",
"            \"chapter_num\": chunk[\"chapter_num\"],\n",
"            \"chapter_title\": chunk[\"chapter_title\"],\n",
"            \"section_title\": chunk[\"section_title\"],\n",
"            \"textbook_page\": chunk[\"textbook_page\"],\n",
"            \"is_chapter_review\": chunk[\"is_chapter_review\"],\n",
"            \"text\": chunk[\"text\"],  # store text too for quick access\n",
"        }\n",
"        # Point id = global position of the chunk in the children list.\n",
"        points.append(PointStruct(id=start + offset, vector=vector, payload=payload))\n",
"\n",
"# Upload in one shot\n",
"qdrant.upsert(collection_name=COLLECTION, points=points)\n",
"print(f\"\\nIndexed {len(points)} chunks ✓\")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "0b21d67f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Vectors in collection: 5530\n",
"Expected: 5530\n",
"Match: True ✓\n"
]
}
],
"source": [
"# Verify every chunk made it into the collection.\n",
"info = qdrant.get_collection(COLLECTION)\n",
"count = info.points_count\n",
"expected = len(children)\n",
"print(f\"Vectors in collection: {count}\")\n",
"print(f\"Expected: {expected}\")\n",
"print(f\"Match: {count == expected} ✓\")"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "0766bbd2",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Query: 'causes of the American Revolution'\n",
" 0.767 | Ch5 The Coming of Independence p.153\n",
" 0.737 | Ch6 Democratizing Freedom p.178\n",
"\n",
"Query: 'how did slavery expand in the south'\n",
" 0.814 | Ch11 The Old South p.327\n",
" 0.802 | Ch9 The Rise of the West p.265\n",
"\n",
"Query: 'what was the New Deal'\n",
" 0.779 | Ch21 The Second New Deal p.691\n",
" 0.762 | Ch23 The Truman Presidency p.765\n",
"\n",
"Query: 'civil rights movement Rosa Parks'\n",
" 0.801 | Ch24 The Freedom Movement p.799\n",
" 0.756 | Ch24 Chapter Review p.810\n",
"\n",
"Query: 'manifest destiny westward expansion'\n",
" 0.795 | Ch9 The Free Individual p.279\n",
" 0.730 | Ch17 Becoming a World Power p.558\n"
]
}
],
"source": [
"# Smoke-test retrieval quality with a few representative questions.\n",
"# FIX: test_queries was never defined anywhere in the notebook, so this cell\n",
"# raised NameError on a fresh Restart-&-Run-All; the queries are defined here.\n",
"# The unused QueryRequest import was removed as well.\n",
"test_queries = [\n",
"    \"causes of the American Revolution\",\n",
"    \"how did slavery expand in the south\",\n",
"    \"what was the New Deal\",\n",
"    \"civil rights movement Rosa Parks\",\n",
"    \"manifest destiny westward expansion\",\n",
"]\n",
"\n",
"for q in test_queries:\n",
"    results = qdrant.query_points(\n",
"        collection_name=COLLECTION,\n",
"        query=embed_query(q),\n",
"        limit=2,\n",
"    ).points\n",
"\n",
"    print(f\"\\nQuery: '{q}'\")\n",
"    for r in results:\n",
"        p = r.payload\n",
"        print(f\"  {r.score:.3f} | Ch{p['chapter_num']} {p['section_title']} p.{p['textbook_page']}\")"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "1832d0c6",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Ready for notebook 3 — query pipeline ✓\n",
"Qdrant db path: /home/keshav/code/apush-rag/data/qdrant_local\n"
]
}
],
"source": [
"# Nothing to persist here. Handoff checklist for notebook 3 (query pipeline):\n",
"#   * embed searches with embed_query(\"your question here\")\n",
"#   * model:      nomic-ai/nomic-embed-text-v1.5\n",
"#   * vector db:  project_root / data / qdrant_local\n",
"#   * collection: apush_chunks\n",
"\n",
"print(\"Ready for notebook 3 — query pipeline ✓\")\n",
"print(f\"Qdrant db path: {project_root / 'data' / 'qdrant_local'}\")"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "fd5119ca",
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'qdrant' is not defined",
"output_type": "error",
"traceback": [
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mNameError\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[2]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m qdrant.close()\n",
"\u001b[31mNameError\u001b[39m: name 'qdrant' is not defined"
]
}
],
"source": [
"# Release the embedded Qdrant's file lock so other processes can open the db.\n",
"# Guarded: the recorded output shows a NameError when this cell was run in a\n",
"# fresh session where the client had never been created.\n",
"if \"qdrant\" in globals():\n",
"    qdrant.close()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "16058429",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"VRAM free: 11.2 GB\n"
]
},
{
"ename": "",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[1;31mThe Kernel crashed while executing code in the current cell or a previous cell. \n",
"\u001b[1;31mPlease review the code in the cell(s) to identify a possible cause of the failure. \n",
"\u001b[1;31mClick <a href='https://aka.ms/vscodeJupyterKernelCrash'>here</a> for more info. \n",
"\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
]
}
],
"source": [
"import gc\n",
"\n",
"import torch\n",
"\n",
"# Free GPU memory held by the embedding model. Guarded so the cell runs\n",
"# cleanly even when the model was never loaded or CUDA is unavailable\n",
"# (the recorded output shows this cell crashing the kernel).\n",
"if \"model\" in globals():\n",
"    del model  # delete the sentence transformer\n",
"gc.collect()  # drop lingering references before asking CUDA to release blocks\n",
"\n",
"if torch.cuda.is_available():\n",
"    torch.cuda.empty_cache()\n",
"    print(f\"VRAM free: {torch.cuda.mem_get_info()[0]/1e9:.1f} GB\")\n",
"else:\n",
"    print(\"CUDA not available; nothing to free\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.14.4"
}
},
"nbformat": 4,
"nbformat_minor": 5
}