{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "a1f0c3d2",
   "metadata": {},
   "source": [
    "# Embed child chunks and index them in Qdrant\n",
    "\n",
    "This notebook loads the child chunks from `data/processed/chunks_children.json`, embeds them with `nomic-ai/nomic-embed-text-v1.5`, and writes them to the local Qdrant collection `apush_chunks` (768-dim vectors, cosine distance). Notebook 3 (the query pipeline) reads that collection.\n",
    "\n",
    "**Warning:** the collection is deleted and rebuilt on every run.\n",
    "\n",
    "Run top to bottom on a fresh kernel (*Restart Kernel → Run All*). Stored outputs were cleared because the previous run was executed out of order; the retrieval results from that run are kept in a markdown cell below."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1b771833",
   "metadata": {},
   "outputs": [],
   "source": [
    "import gc\n",
    "import json\n",
    "from pathlib import Path\n",
    "\n",
    "import torch\n",
    "from qdrant_client import QdrantClient\n",
    "from qdrant_client.models import Distance, PointStruct, VectorParams\n",
    "from sentence_transformers import SentenceTransformer\n",
    "from tqdm import tqdm"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b2e1d4c3",
   "metadata": {},
   "source": [
    "## Configuration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c3d2e5f4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Paths are resolved from the kernel's working directory, whose parent is\n",
    "# treated as the project root (so run this from a direct subfolder of the repo).\n",
    "project_root = Path().resolve().parent\n",
    "children_path = project_root / \"data\" / \"processed\" / \"chunks_children.json\"\n",
    "qdrant_path = project_root / \"data\" / \"qdrant_local\"\n",
    "\n",
    "MODEL_NAME = \"nomic-ai/nomic-embed-text-v1.5\"\n",
    "VECTOR_DIM = 768  # vector size the collection is created with\n",
    "COLLECTION = \"apush_chunks\"\n",
    "BATCH_SIZE = 64  # chunks per embedding batch\n",
    "\n",
    "# Chunk fields copied verbatim into each point's payload.\n",
    "PAYLOAD_FIELDS = (\n",
    "    \"chunk_id\",\n",
    "    \"parent_id\",\n",
    "    \"chapter_num\",\n",
    "    \"chapter_title\",\n",
    "    \"section_title\",\n",
    "    \"textbook_page\",\n",
    "    \"is_chapter_review\",\n",
    "    \"text\",  # store text too for quick access\n",
    ")\n",
    "\n",
    "# Smoke-test queries used to eyeball retrieval quality after indexing.\n",
    "test_queries = [\n",
    "    \"causes of the American Revolution\",\n",
    "    \"how did slavery expand in the south\",\n",
    "    \"what was the New Deal\",\n",
    "    \"civil rights movement Rosa Parks\",\n",
    "    \"manifest destiny westward expansion\",\n",
    "]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d4e3f6a5",
   "metadata": {},
   "source": [
    "## Load child chunks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4ffbf396",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Explicit encoding: the default is locale-dependent and the chunks contain non-ASCII text.\n",
    "with open(children_path, encoding=\"utf-8\") as f:\n",
    "    children = json.load(f)\n",
    "\n",
    "assert children, f\"No child chunks found in {children_path}\"\n",
    "\n",
    "print(f\"Loaded {len(children)} child chunks\")\n",
    "print(f\"Sample: {children[0]['chunk_id']} — {children[0]['section_title']}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e5f4a7b6",
   "metadata": {},
   "source": [
    "## Check the GPU and load the embedding model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "da5ff826",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(f\"torch {torch.__version__}\")\n",
    "if torch.cuda.is_available():\n",
    "    print(f\"CUDA device: {torch.cuda.get_device_name(0)}\")\n",
    "else:\n",
    "    # Report instead of crashing: get_device_name(0) needs a visible CUDA device.\n",
    "    print(\"CUDA not available — embedding will run on CPU and be much slower\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6261dd22",
   "metadata": {},
   "outputs": [],
   "source": [
    "# First run downloads ~300MB then caches it.\n",
    "# Will automatically use the GPU if torch sees it.\n",
    "# NOTE: trust_remote_code=True runs Python code downloaded from the model\n",
    "# repository — only keep it for repositories you trust.\n",
    "model = SentenceTransformer(MODEL_NAME, trust_remote_code=True)\n",
    "\n",
    "print(\"Model loaded ✓\")\n",
    "print(f\"Device: {model.device}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f6a5b8c7",
   "metadata": {},
   "source": [
    "## Embedding helpers\n",
    "\n",
    "Documents and queries get different task prefixes (`search_document: ` and `search_query: `). Always embed questions with `embed_query` so they match the indexed vectors."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7bde5821",
   "metadata": {},
   "outputs": [],
   "source": [
    "def embed_batch(texts: list[str]) -> list[list[float]]:\n",
    "    \"\"\"Embed document texts for indexing.\n",
    "\n",
    "    Each text is prefixed with ``search_document: `` before encoding and the\n",
    "    model is asked for normalized embeddings.\n",
    "\n",
    "    Args:\n",
    "        texts: Raw chunk texts, without the task prefix.\n",
    "\n",
    "    Returns:\n",
    "        One embedding (list of floats) per input text. An empty input\n",
    "        returns an empty list without calling the model.\n",
    "    \"\"\"\n",
    "    if not texts:\n",
    "        return []\n",
    "    prefixed = [f\"search_document: {text}\" for text in texts]\n",
    "    embeddings = model.encode(\n",
    "        prefixed,\n",
    "        normalize_embeddings=True,\n",
    "        batch_size=BATCH_SIZE,\n",
    "        show_progress_bar=False,\n",
    "    )\n",
    "    return embeddings.tolist()\n",
    "\n",
    "\n",
    "def embed_query(query: str) -> list[float]:\n",
    "    \"\"\"Embed a single search query.\n",
    "\n",
    "    The query is prefixed with ``search_query: `` before encoding and the\n",
    "    model is asked for a normalized embedding.\n",
    "\n",
    "    Args:\n",
    "        query: The user's question, without the task prefix.\n",
    "\n",
    "    Returns:\n",
    "        The query embedding as a flat list of floats.\n",
    "    \"\"\"\n",
    "    embedding = model.encode(\n",
    "        f\"search_query: {query}\",\n",
    "        normalize_embeddings=True,\n",
    "    )\n",
    "    return embedding.tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a7b6c9d8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Quick test: the collection is created with VECTOR_DIM, so fail early on a mismatch.\n",
    "test_vec = embed_query(\"causes of the American Revolution\")\n",
    "assert len(test_vec) == VECTOR_DIM, (\n",
    "    f\"Model returned {len(test_vec)}-dim vectors, expected {VECTOR_DIM}\"\n",
    ")\n",
    "print(f\"Query vector dim: {len(test_vec)} ✓\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b8c7d0e9",
   "metadata": {},
   "source": [
    "## Create the Qdrant collection\n",
    "\n",
    "Any existing `apush_chunks` collection is dropped first, so re-running the notebook always rebuilds the index from scratch."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9812826b",
   "metadata": {},
   "outputs": [],
   "source": [
    "qdrant = QdrantClient(path=str(qdrant_path))\n",
    "\n",
    "# Delete and recreate if it already exists\n",
    "if qdrant.collection_exists(COLLECTION):\n",
    "    qdrant.delete_collection(COLLECTION)\n",
    "    print(\"Deleted existing collection\")\n",
    "\n",
    "qdrant.create_collection(\n",
    "    collection_name=COLLECTION,\n",
    "    vectors_config=VectorParams(size=VECTOR_DIM, distance=Distance.COSINE),\n",
    ")\n",
    "print(\"Collection created ✓\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c9d8e1f0",
   "metadata": {},
   "source": [
    "## Embed and index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e40d06a0",
   "metadata": {},
   "outputs": [],
   "source": [
    "points = []\n",
    "\n",
    "for start in tqdm(range(0, len(children), BATCH_SIZE), desc=\"Embedding\"):\n",
    "    batch = children[start : start + BATCH_SIZE]\n",
    "    vectors = embed_batch([chunk[\"text\"] for chunk in batch])\n",
    "\n",
    "    # strict=True fails loudly if the model returns a different number of\n",
    "    # vectors than chunks, instead of silently dropping the tail.\n",
    "    for offset, (chunk, vector) in enumerate(zip(batch, vectors, strict=True)):\n",
    "        points.append(\n",
    "            PointStruct(\n",
    "                id=start + offset,  # index of the chunk in `children`\n",
    "                vector=vector,\n",
    "                payload={field: chunk[field] for field in PAYLOAD_FIELDS},\n",
    "            )\n",
    "        )\n",
    "\n",
    "# Upload in one shot\n",
    "qdrant.upsert(collection_name=COLLECTION, points=points)\n",
    "print(f\"\\nIndexed {len(points)} chunks ✓\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d0e9f2a1",
   "metadata": {},
   "source": [
    "## Verify the index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0b21d67f",
   "metadata": {},
   "outputs": [],
   "source": [
    "indexed_count = qdrant.get_collection(COLLECTION).points_count\n",
    "print(f\"Vectors in collection: {indexed_count}\")\n",
    "print(f\"Expected: {len(children)}\")\n",
    "\n",
    "# Assert rather than print the comparison, so a mismatch stops the run.\n",
    "assert indexed_count == len(children), (\n",
    "    f\"Collection holds {indexed_count} points but {len(children)} chunks were loaded\"\n",
    ")\n",
    "print(\"Match: True ✓\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0766bbd2",
   "metadata": {},
   "outputs": [],
   "source": [
    "for query in test_queries:\n",
    "    hits = qdrant.query_points(\n",
    "        collection_name=COLLECTION,\n",
    "        query=embed_query(query),\n",
    "        limit=2,\n",
    "    ).points\n",
    "\n",
    "    print(f\"\\nQuery: '{query}'\")\n",
    "    for hit in hits:\n",
    "        payload = hit.payload\n",
    "        print(\n",
    "            f\"  {hit.score:.3f} | Ch{payload['chapter_num']} › \"\n",
    "            f\"{payload['section_title']} › p.{payload['textbook_page']}\"\n",
    "        )"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e1f0a3b2",
   "metadata": {},
   "source": [
    "### Reference results from the last full run\n",
    "\n",
    "Recorded with 5530 indexed chunks. Re-run the cell above to refresh them.\n",
    "\n",
    "| Query | Score | Chapter | Section | Page |\n",
    "|---|---|---|---|---|\n",
    "| causes of the American Revolution | 0.767 | 5 | The Coming of Independence | 153 |\n",
    "| causes of the American Revolution | 0.737 | 6 | Democratizing Freedom | 178 |\n",
    "| how did slavery expand in the south | 0.814 | 11 | The Old South | 327 |\n",
    "| how did slavery expand in the south | 0.802 | 9 | The Rise of the West | 265 |\n",
    "| what was the New Deal | 0.779 | 21 | The Second New Deal | 691 |\n",
    "| what was the New Deal | 0.762 | 23 | The Truman Presidency | 765 |\n",
    "| civil rights movement Rosa Parks | 0.801 | 24 | The Freedom Movement | 799 |\n",
    "| civil rights movement Rosa Parks | 0.756 | 24 | Chapter Review | 810 |\n",
    "| manifest destiny westward expansion | 0.795 | 9 | The Free Individual | 279 |\n",
    "| manifest destiny westward expansion | 0.730 | 17 | Becoming a World Power | 558 |"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f2a1b4c3",
   "metadata": {},
   "source": [
    "## Hand-off to notebook 3\n",
    "\n",
    "Nothing to save — the index lives on disk. For the query pipeline:\n",
    "\n",
    "- When searching, always use `embed_query(\"your question here\")`.\n",
    "- Model: `nomic-ai/nomic-embed-text-v1.5`\n",
    "- Qdrant storage: `project_root / data / qdrant_local`\n",
    "- Collection name: `apush_chunks`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1832d0c6",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"Ready for notebook 3 — query pipeline ✓\")\n",
    "print(f\"Qdrant db path: {qdrant_path}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a3b2c5d4",
   "metadata": {},
   "source": [
    "## Cleanup\n",
    "\n",
    "Close the Qdrant client and release the model. Both cells are guarded so they can be re-run, or run on a kernel where the objects were never created."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fd5119ca",
   "metadata": {},
   "outputs": [],
   "source": [
    "# The previous run hit a NameError here because the kernel had been restarted.\n",
    "if \"qdrant\" in globals():\n",
    "    qdrant.close()\n",
    "    del qdrant\n",
    "    print(\"Qdrant client closed ✓\")\n",
    "else:\n",
    "    print(\"No Qdrant client in this kernel — nothing to close\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "16058429",
   "metadata": {},
   "outputs": [],
   "source": [
    "if \"model\" in globals():\n",
    "    del model  # delete the sentence transformer\n",
    "gc.collect()\n",
    "\n",
    "if torch.cuda.is_available():\n",
    "    torch.cuda.empty_cache()\n",
    "    free_bytes, _total_bytes = torch.cuda.mem_get_info()\n",
    "    print(f\"VRAM free: {free_bytes / 1e9:.1f} GB\")\n",
    "else:\n",
    "    print(\"CUDA not available — no VRAM to report\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.14.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}