This repository was archived on 2026-04-23. You can view and clone its files, but you cannot open issues, create pull requests, or push commits.
Files
apush-rag/notebooks/embed.ipynb
2026-04-12 22:50:16 -05:00

402 lines
11 KiB
Plaintext
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "1b771833",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/keshav/code/apush-rag/.venv/lib/python3.14/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
}
],
"source": [
"import json\n",
"from pathlib import Path\n",
"from tqdm import tqdm\n",
"from sentence_transformers import SentenceTransformer\n",
"from qdrant_client import QdrantClient\n",
"from qdrant_client.models import Distance, VectorParams, PointStruct"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "4ffbf396",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Loaded 5530 child chunks\n",
"Sample: ch01_s000_p000 — Chapter 1: Old Worlds and New\n"
]
}
],
"source": [
"# Load the pre-chunked child passages produced by the chunking notebook.\n",
"project_root = Path().resolve().parent\n",
"children_path = project_root / \"data\" / \"processed\" / \"chunks_children.json\"\n",
"\n",
"children = json.loads(children_path.read_text())\n",
"\n",
"print(f\"Loaded {len(children)} child chunks\")\n",
"print(f\"Sample: {children[0]['chunk_id']} — {children[0]['section_title']}\")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "da5ff826",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2.11.0+cu130\n",
"True\n",
"NVIDIA GeForce RTX 5080\n"
]
}
],
"source": [
"import torch\n",
"\n",
"# Sanity-check the CUDA toolchain before embedding ~5.5k chunks on GPU:\n",
"# expect a +cuXXX build string, True, and the local GPU's name.\n",
"for diagnostic in (torch.__version__, torch.cuda.is_available(), torch.cuda.get_device_name(0)):\n",
"    print(diagnostic)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "6261dd22",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"<All keys matched successfully>\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Model loaded ✓\n",
"Device: cuda:0\n"
]
}
],
"source": [
"# Embedding model + collection constants.\n",
"# The first call downloads ~300MB of weights, then serves from the HF cache;\n",
"# sentence-transformers places the model on the GPU automatically when torch sees one.\n",
"VECTOR_DIM = 768             # nomic-embed-text-v1.5 output dimensionality\n",
"COLLECTION = \"apush_chunks\"  # Qdrant collection name, reused in notebook 3\n",
"\n",
"model = SentenceTransformer(\"nomic-ai/nomic-embed-text-v1.5\", trust_remote_code=True)\n",
"\n",
"print(\"Model loaded ✓\")\n",
"print(f\"Device: {model.device}\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "7bde5821",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Query vector dim: 768 ✓\n"
]
}
],
"source": [
"def embed_batch(texts: list[str]) -> list[list[float]]:\n",
"    \"\"\"Embed document chunks; nomic models require the 'search_document:' prefix.\"\"\"\n",
"    docs = [\"search_document: \" + t for t in texts]\n",
"    vectors = model.encode(\n",
"        docs,\n",
"        normalize_embeddings=True,\n",
"        batch_size=64,\n",
"        show_progress_bar=False,\n",
"    )\n",
"    return vectors.tolist()\n",
"\n",
"\n",
"def embed_query(query: str) -> list[float]:\n",
"    \"\"\"Embed a search query; asymmetric retrieval uses the 'search_query:' prefix.\"\"\"\n",
"    vector = model.encode(f\"search_query: {query}\", normalize_embeddings=True)\n",
"    return vector.tolist()\n",
"\n",
"\n",
"# Quick test\n",
"test_vec = embed_query(\"causes of the American Revolution\")\n",
"print(f\"Query vector dim: {len(test_vec)} ✓\")"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "9812826b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Collection created ✓\n"
]
}
],
"source": [
"# Embedded (local-file) Qdrant: vectors persist on disk under data/qdrant_local.\n",
"qdrant = QdrantClient(path=str(project_root / \"data\" / \"qdrant_local\"))\n",
"\n",
"# Start from a clean slate so re-running the notebook never double-indexes.\n",
"if qdrant.collection_exists(COLLECTION):\n",
"    qdrant.delete_collection(COLLECTION)\n",
"    print(\"Deleted existing collection\")\n",
"\n",
"qdrant.create_collection(\n",
"    collection_name=COLLECTION,\n",
"    vectors_config=VectorParams(size=VECTOR_DIM, distance=Distance.COSINE),\n",
")\n",
"print(\"Collection created ✓\")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "e40d06a0",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Embedding: 100%|██████████| 87/87 [00:18<00:00, 4.78it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Indexed 5530 chunks ✓\n"
]
}
],
"source": [
"# Embed every child chunk in batches and collect Qdrant points.\n",
"BATCH_SIZE = 64\n",
"points: list[PointStruct] = []\n",
"\n",
"for start in tqdm(range(0, len(children), BATCH_SIZE), desc=\"Embedding\"):\n",
"    batch = children[start : start + BATCH_SIZE]\n",
"    vectors = embed_batch([c[\"text\"] for c in batch])\n",
"\n",
"    for offset, (chunk, vector) in enumerate(zip(batch, vectors)):\n",
"        payload = {\n",
"            \"chunk_id\": chunk[\"chunk_id\"],\n",
"            \"parent_id\": chunk[\"parent_id\"],\n",
"            \"chapter_num\": chunk[\"chapter_num\"],\n",
"            \"chapter_title\": chunk[\"chapter_title\"],\n",
"            \"section_title\": chunk[\"section_title\"],\n",
"            \"textbook_page\": chunk[\"textbook_page\"],\n",
"            \"is_chapter_review\": chunk[\"is_chapter_review\"],\n",
"            \"text\": chunk[\"text\"],  # store text too for quick access\n",
"        }\n",
"        # Point id = global position of the chunk in the children list.\n",
"        points.append(PointStruct(id=start + offset, vector=vector, payload=payload))\n",
"\n",
"# Upload in one shot\n",
"qdrant.upsert(collection_name=COLLECTION, points=points)\n",
"print(f\"\\nIndexed {len(points)} chunks ✓\")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "0b21d67f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Vectors in collection: 5530\n",
"Expected: 5530\n",
"Match: True ✓\n"
]
}
],
"source": [
"# Verify every chunk made it into the collection.\n",
"info = qdrant.get_collection(COLLECTION)\n",
"count = info.points_count\n",
"expected = len(children)\n",
"print(f\"Vectors in collection: {count}\")\n",
"print(f\"Expected: {expected}\")\n",
"print(f\"Match: {count == expected} ✓\")"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "0766bbd2",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Query: 'causes of the American Revolution'\n",
" 0.767 | Ch5 The Coming of Independence p.153\n",
" 0.737 | Ch6 Democratizing Freedom p.178\n",
"\n",
"Query: 'how did slavery expand in the south'\n",
" 0.814 | Ch11 The Old South p.327\n",
" 0.802 | Ch9 The Rise of the West p.265\n",
"\n",
"Query: 'what was the New Deal'\n",
" 0.779 | Ch21 The Second New Deal p.691\n",
" 0.762 | Ch23 The Truman Presidency p.765\n",
"\n",
"Query: 'civil rights movement Rosa Parks'\n",
" 0.801 | Ch24 The Freedom Movement p.799\n",
" 0.756 | Ch24 Chapter Review p.810\n",
"\n",
"Query: 'manifest destiny westward expansion'\n",
" 0.795 | Ch9 The Free Individual p.279\n",
" 0.730 | Ch17 Becoming a World Power p.558\n"
]
}
],
"source": [
"# Smoke-test retrieval quality with a few representative questions.\n",
"# FIX: test_queries was never defined anywhere in the notebook, so this cell\n",
"# raised NameError on a fresh Restart-&-Run-All; the queries are defined here.\n",
"# The unused QueryRequest import was removed as well.\n",
"test_queries = [\n",
"    \"causes of the American Revolution\",\n",
"    \"how did slavery expand in the south\",\n",
"    \"what was the New Deal\",\n",
"    \"civil rights movement Rosa Parks\",\n",
"    \"manifest destiny westward expansion\",\n",
"]\n",
"\n",
"for q in test_queries:\n",
"    results = qdrant.query_points(\n",
"        collection_name=COLLECTION,\n",
"        query=embed_query(q),\n",
"        limit=2,\n",
"    ).points\n",
"\n",
"    print(f\"\\nQuery: '{q}'\")\n",
"    for r in results:\n",
"        p = r.payload\n",
"        print(f\"  {r.score:.3f} | Ch{p['chapter_num']} {p['section_title']} p.{p['textbook_page']}\")"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "1832d0c6",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Ready for notebook 3 — query pipeline ✓\n",
"Qdrant db path: /home/keshav/code/apush-rag/data/qdrant_local\n"
]
}
],
"source": [
"# Nothing to persist here. Handoff checklist for notebook 3 (query pipeline):\n",
"#   * embed searches with embed_query(\"your question here\")\n",
"#   * model:      nomic-ai/nomic-embed-text-v1.5\n",
"#   * vector db:  project_root / data / qdrant_local\n",
"#   * collection: apush_chunks\n",
"\n",
"print(\"Ready for notebook 3 — query pipeline ✓\")\n",
"print(f\"Qdrant db path: {project_root / 'data' / 'qdrant_local'}\")"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "fd5119ca",
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'qdrant' is not defined",
"output_type": "error",
"traceback": [
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mNameError\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[2]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m qdrant.close()\n",
"\u001b[31mNameError\u001b[39m: name 'qdrant' is not defined"
]
}
],
"source": [
"# Release the embedded Qdrant's file lock so other processes can open the db.\n",
"# Guarded: the recorded output shows a NameError when this cell was run in a\n",
"# fresh session where the client had never been created.\n",
"if \"qdrant\" in globals():\n",
"    qdrant.close()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "16058429",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"VRAM free: 11.2 GB\n"
]
},
{
"ename": "",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[1;31mThe Kernel crashed while executing code in the current cell or a previous cell. \n",
"\u001b[1;31mPlease review the code in the cell(s) to identify a possible cause of the failure. \n",
"\u001b[1;31mClick <a href='https://aka.ms/vscodeJupyterKernelCrash'>here</a> for more info. \n",
"\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
]
}
],
"source": [
"import gc\n",
"\n",
"import torch\n",
"\n",
"# Free GPU memory held by the embedding model. Guarded so the cell runs\n",
"# cleanly even when the model was never loaded or CUDA is unavailable\n",
"# (the recorded output shows this cell crashing the kernel).\n",
"if \"model\" in globals():\n",
"    del model  # delete the sentence transformer\n",
"gc.collect()  # drop lingering references before asking CUDA to release blocks\n",
"\n",
"if torch.cuda.is_available():\n",
"    torch.cuda.empty_cache()\n",
"    print(f\"VRAM free: {torch.cuda.mem_get_info()[0]/1e9:.1f} GB\")\n",
"else:\n",
"    print(\"CUDA not available; nothing to free\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.14.4"
}
},
"nbformat": 4,
"nbformat_minor": 5
}