finished beta
This commit is contained in:
1
.gitignore
vendored
1
.gitignore
vendored
@@ -2,6 +2,7 @@
|
||||
data/raw/
|
||||
data/processed/
|
||||
data/vectorstore/
|
||||
data/qdrant_local/
|
||||
*.pdf
|
||||
*.epub
|
||||
*.mobi
|
||||
|
||||
401
notebooks/embed.ipynb
Normal file
401
notebooks/embed.ipynb
Normal file
@@ -0,0 +1,401 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "1b771833",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/home/keshav/code/apush-rag/.venv/lib/python3.14/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
|
||||
" from .autonotebook import tqdm as notebook_tqdm\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import json\n",
|
||||
"from pathlib import Path\n",
|
||||
"from tqdm import tqdm\n",
|
||||
"from sentence_transformers import SentenceTransformer\n",
|
||||
"from qdrant_client import QdrantClient\n",
|
||||
"from qdrant_client.models import Distance, VectorParams, PointStruct"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "4ffbf396",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Loaded 5530 child chunks\n",
|
||||
"Sample: ch01_s000_p000 — Chapter 1: Old Worlds and New\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"project_root = Path().resolve().parent\n",
|
||||
"children_path = project_root / \"data\" / \"processed\" / \"chunks_children.json\"\n",
|
||||
"\n",
|
||||
"with open(children_path) as f:\n",
|
||||
" children = json.load(f)\n",
|
||||
"\n",
|
||||
"print(f\"Loaded {len(children)} child chunks\")\n",
|
||||
"print(f\"Sample: {children[0]['chunk_id']} — {children[0]['section_title']}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "da5ff826",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"2.11.0+cu130\n",
|
||||
"True\n",
|
||||
"NVIDIA GeForce RTX 5080\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import torch\n",
|
||||
"print(torch.__version__)\n",
|
||||
"print(torch.cuda.is_available()) # must be True\n",
|
||||
"print(torch.cuda.get_device_name(0)) # should say RTX 5080"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "6261dd22",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"<All keys matched successfully>\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Model loaded ✓\n",
|
||||
"Device: cuda:0\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"model = SentenceTransformer(\"nomic-ai/nomic-embed-text-v1.5\", trust_remote_code=True)\n",
|
||||
"# First run downloads ~300MB then caches it\n",
|
||||
"# Will automatically use your GPU if torch sees it\n",
|
||||
"\n",
|
||||
"VECTOR_DIM = 768\n",
|
||||
"COLLECTION = \"apush_chunks\"\n",
|
||||
"\n",
|
||||
"print(\"Model loaded ✓\")\n",
|
||||
"print(f\"Device: {model.device}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "7bde5821",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Query vector dim: 768 ✓\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"def embed_batch(texts: list[str]) -> list[list[float]]:\n",
|
||||
" prefixed = [f\"search_document: {t}\" for t in texts]\n",
|
||||
" return model.encode(\n",
|
||||
" prefixed,\n",
|
||||
" normalize_embeddings=True,\n",
|
||||
" batch_size=64,\n",
|
||||
" show_progress_bar=False,\n",
|
||||
" ).tolist()\n",
|
||||
"\n",
|
||||
"def embed_query(query: str) -> list[float]:\n",
|
||||
" return model.encode(\n",
|
||||
" f\"search_query: {query}\",\n",
|
||||
" normalize_embeddings=True,\n",
|
||||
" ).tolist()\n",
|
||||
"\n",
|
||||
"# Quick test\n",
|
||||
"test_vec = embed_query(\"causes of the American Revolution\")\n",
|
||||
"print(f\"Query vector dim: {len(test_vec)} ✓\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "9812826b",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Collection created ✓\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"qdrant = QdrantClient(path=str(project_root / \"data\" / \"qdrant_local\"))\n",
|
||||
"\n",
|
||||
"# Delete and recreate if it already exists\n",
|
||||
"if qdrant.collection_exists(COLLECTION):\n",
|
||||
" qdrant.delete_collection(COLLECTION)\n",
|
||||
" print(\"Deleted existing collection\")\n",
|
||||
"\n",
|
||||
"qdrant.create_collection(\n",
|
||||
" collection_name=COLLECTION,\n",
|
||||
" vectors_config=VectorParams(size=VECTOR_DIM, distance=Distance.COSINE),\n",
|
||||
")\n",
|
||||
"print(\"Collection created ✓\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"id": "e40d06a0",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Embedding: 100%|██████████| 87/87 [00:18<00:00, 4.78it/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n",
|
||||
"Indexed 5530 chunks ✓\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"BATCH_SIZE = 64\n",
|
||||
"points = []\n",
|
||||
"\n",
|
||||
"for i in tqdm(range(0, len(children), BATCH_SIZE), desc=\"Embedding\"):\n",
|
||||
" batch = children[i : i + BATCH_SIZE]\n",
|
||||
" vectors = embed_batch([c[\"text\"] for c in batch])\n",
|
||||
"\n",
|
||||
" for j, (chunk, vector) in enumerate(zip(batch, vectors)):\n",
|
||||
" points.append(PointStruct(\n",
|
||||
" id = i + j,\n",
|
||||
" vector = vector,\n",
|
||||
" payload = {\n",
|
||||
" \"chunk_id\": chunk[\"chunk_id\"],\n",
|
||||
" \"parent_id\": chunk[\"parent_id\"],\n",
|
||||
" \"chapter_num\": chunk[\"chapter_num\"],\n",
|
||||
" \"chapter_title\": chunk[\"chapter_title\"],\n",
|
||||
" \"section_title\": chunk[\"section_title\"],\n",
|
||||
" \"textbook_page\": chunk[\"textbook_page\"],\n",
|
||||
" \"is_chapter_review\": chunk[\"is_chapter_review\"],\n",
|
||||
" \"text\": chunk[\"text\"], # store text too for quick access\n",
|
||||
" }\n",
|
||||
" ))\n",
|
||||
"\n",
|
||||
"# Upload in one shot\n",
|
||||
"qdrant.upsert(collection_name=COLLECTION, points=points)\n",
|
||||
"print(f\"\\nIndexed {len(points)} chunks ✓\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"id": "0b21d67f",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Vectors in collection: 5530\n",
|
||||
"Expected: 5530\n",
|
||||
"Match: True ✓\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"info = qdrant.get_collection(COLLECTION)\n",
|
||||
"count = info.points_count\n",
|
||||
"print(f\"Vectors in collection: {count}\")\n",
|
||||
"print(f\"Expected: {len(children)}\")\n",
|
||||
"print(f\"Match: {count == len(children)} ✓\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"id": "0766bbd2",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n",
|
||||
"Query: 'causes of the American Revolution'\n",
|
||||
" 0.767 | Ch5 › The Coming of Independence › p.153\n",
|
||||
" 0.737 | Ch6 › Democratizing Freedom › p.178\n",
|
||||
"\n",
|
||||
"Query: 'how did slavery expand in the south'\n",
|
||||
" 0.814 | Ch11 › The Old South › p.327\n",
|
||||
" 0.802 | Ch9 › The Rise of the West › p.265\n",
|
||||
"\n",
|
||||
"Query: 'what was the New Deal'\n",
|
||||
" 0.779 | Ch21 › The Second New Deal › p.691\n",
|
||||
" 0.762 | Ch23 › The Truman Presidency › p.765\n",
|
||||
"\n",
|
||||
"Query: 'civil rights movement Rosa Parks'\n",
|
||||
" 0.801 | Ch24 › The Freedom Movement › p.799\n",
|
||||
" 0.756 | Ch24 › Chapter Review › p.810\n",
|
||||
"\n",
|
||||
"Query: 'manifest destiny westward expansion'\n",
|
||||
" 0.795 | Ch9 › The Free Individual › p.279\n",
|
||||
" 0.730 | Ch17 › Becoming a World Power › p.558\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from qdrant_client.models import QueryRequest\n",
|
||||
"\n",
|
||||
"for q in test_queries:\n",
|
||||
" results = qdrant.query_points(\n",
|
||||
" collection_name=COLLECTION,\n",
|
||||
" query=embed_query(q),\n",
|
||||
" limit=2,\n",
|
||||
" ).points\n",
|
||||
"\n",
|
||||
" print(f\"\\nQuery: '{q}'\")\n",
|
||||
" for r in results:\n",
|
||||
" p = r.payload\n",
|
||||
" print(f\" {r.score:.3f} | Ch{p['chapter_num']} › {p['section_title']} › p.{p['textbook_page']}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"id": "1832d0c6",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Ready for notebook 3 — query pipeline ✓\n",
|
||||
"Qdrant db path: /home/keshav/code/apush-rag/data/qdrant_local\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Nothing to save — just note the query prefix for notebook 3\n",
|
||||
"# When searching always use: embed_query(\"your question here\")\n",
|
||||
"# The model lives at: nomic-ai/nomic-embed-text-v1.5\n",
|
||||
"# The collection is at: project_root / data / qdrant_local\n",
|
||||
"# Collection name: apush_chunks\n",
|
||||
"\n",
|
||||
"print(\"Ready for notebook 3 — query pipeline ✓\")\n",
|
||||
"print(f\"Qdrant db path: {project_root / 'data' / 'qdrant_local'}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "fd5119ca",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"ename": "NameError",
|
||||
"evalue": "name 'qdrant' is not defined",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
|
||||
"\u001b[31mNameError\u001b[39m Traceback (most recent call last)",
|
||||
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[2]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m qdrant.close()\n",
|
||||
"\u001b[31mNameError\u001b[39m: name 'qdrant' is not defined"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"qdrant.close()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "16058429",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"VRAM free: 11.2 GB\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"ename": "",
|
||||
"evalue": "",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[1;31mThe Kernel crashed while executing code in the current cell or a previous cell. \n",
|
||||
"\u001b[1;31mPlease review the code in the cell(s) to identify a possible cause of the failure. \n",
|
||||
"\u001b[1;31mClick <a href='https://aka.ms/vscodeJupyterKernelCrash'>here</a> for more info. \n",
|
||||
"\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import torch\n",
|
||||
"\n",
|
||||
"del model # delete the sentence transformer\n",
|
||||
"torch.cuda.empty_cache()\n",
|
||||
"print(f\"VRAM free: {torch.cuda.mem_get_info()[0]/1e9:.1f} GB\")"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": ".venv",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.14.3"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
@@ -1394,7 +1394,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 18,
|
||||
"execution_count": null,
|
||||
"id": "0ef0c08a",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@@ -1414,6 +1414,17 @@
|
||||
" Parent parts found: 1\n",
|
||||
" Citation: Chapter 2: European Colonies and Native Nations, 1600⠍1660 › New Englanders Divided\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"ename": "",
|
||||
"evalue": "",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[1;31mThe Kernel crashed while executing code in the current cell or a previous cell. \n",
|
||||
"\u001b[1;31mPlease review the code in the cell(s) to identify a possible cause of the failure. \n",
|
||||
"\u001b[1;31mClick <a href='https://aka.ms/vscodeJupyterKernelCrash'>here</a> for more info. \n",
|
||||
"\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
|
||||
772
notebooks/query.ipynb
Normal file
772
notebooks/query.ipynb
Normal file
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user