working documentation parse
This commit is contained in:
@@ -2,7 +2,7 @@
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"execution_count": 59,
|
||||
"id": "e91fd8c7",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@@ -20,7 +20,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"execution_count": 60,
|
||||
"id": "11896305",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@@ -262,7 +262,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"execution_count": 61,
|
||||
"id": "991dbad2",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@@ -496,7 +496,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"execution_count": 62,
|
||||
"id": "43e20197",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@@ -561,7 +561,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"execution_count": 63,
|
||||
"id": "149bc714",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@@ -596,7 +596,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"execution_count": 64,
|
||||
"id": "c2563864",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@@ -641,6 +641,750 @@
|
||||
"print(f\"Generated → {output_path}\")\n",
|
||||
"print(\"Now open that file and fill in the real page numbers. Leave as null if unknown.\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 65,
|
||||
"id": "f1830bd0",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Loaded 177 entries from page map\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import yaml\n",
|
||||
"\n",
|
||||
"page_map_path = project_root / \"config\" / \"page_map.yaml\"\n",
|
||||
"\n",
|
||||
"with open(page_map_path) as f:\n",
|
||||
" page_map = yaml.safe_load(f)\n",
|
||||
"\n",
|
||||
"# Build fast lookup: section title → real page (will be None for now)\n",
|
||||
"section_page_lookup = {}\n",
|
||||
"for ch_num, ch_data in page_map[\"chapters\"].items():\n",
|
||||
" # Chapter-level entry\n",
|
||||
" section_page_lookup[ch_data[\"title\"]] = ch_data.get(\"real_page\")\n",
|
||||
" # Section-level entries\n",
|
||||
" if ch_data.get(\"sections\"):\n",
|
||||
" for section_title, real_page in ch_data[\"sections\"].items():\n",
|
||||
" section_page_lookup[section_title] = real_page\n",
|
||||
"\n",
|
||||
"print(f\"Loaded {len(section_page_lookup)} entries from page map\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 66,
|
||||
"id": "3065642e",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"('Half-title Page', 2) → y=0\n",
|
||||
"('Physical/Political Map of The United States', 5) → y=0\n",
|
||||
"('Political Map of The World', 6) → y=0\n",
|
||||
"('Title Page', 7) → y=0\n",
|
||||
"('Copyright', 10) → y=0\n",
|
||||
"('Dedication', 13) → y=0\n",
|
||||
"('Contents', 14) → y=0\n",
|
||||
"('List of Maps, Tables, and Figures', 22) → y=0\n",
|
||||
"('About the Authors', 32) → y=0\n",
|
||||
"('Preface', 34) → y=0\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Build a lookup from (title, pdf_page_1indexed) → y coordinate\n",
|
||||
"# This is what gives us pixel-precise section boundaries\n",
|
||||
"toc_full = doc.get_toc(simple=False)\n",
|
||||
"\n",
|
||||
"toc_coord_lookup = {}\n",
|
||||
"for item in toc_full:\n",
|
||||
" level = item[0]\n",
|
||||
" title = item[1]\n",
|
||||
" page_1idx = item[2]\n",
|
||||
" dest = item[3] if len(item) > 3 else {}\n",
|
||||
" y = dest.get(\"y\", 0) if isinstance(dest, dict) else 0\n",
|
||||
" toc_coord_lookup[(title, page_1idx)] = y\n",
|
||||
"\n",
|
||||
"# Sanity check — print a few to confirm y values are non-zero\n",
|
||||
"sample = list(toc_coord_lookup.items())[:10]\n",
|
||||
"for k, v in sample:\n",
|
||||
" print(f\"{k} → y={v}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 67,
|
||||
"id": "ada206a9",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"'\"Hello world\"\\n\\nSome text'\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import re\n",
|
||||
"\n",
|
||||
"def clean_text(text: str) -> str:\n",
|
||||
" # Fix encoding artifacts from pirated PDF\n",
|
||||
" replacements = {\n",
|
||||
" '“': '\"', 'â€\\x9d': '\"', '’': \"'\",\n",
|
||||
" 'â€\"': '—', '‘': \"'\", '…': '…',\n",
|
||||
" '⠍': '-', '⠒': \"'\",\n",
|
||||
" }\n",
|
||||
" for bad, good in replacements.items():\n",
|
||||
" text = text.replace(bad, good)\n",
|
||||
"\n",
|
||||
" # Collapse excessive whitespace\n",
|
||||
" text = re.sub(r'\\n{3,}', '\\n\\n', text)\n",
|
||||
" text = re.sub(r'[ \\t]+', ' ', text)\n",
|
||||
" text = re.sub(r'\\n ', '\\n', text)\n",
|
||||
"\n",
|
||||
" return text.strip()\n",
|
||||
"\n",
|
||||
"# Quick test\n",
|
||||
"test = clean_text(\"“Hello worldâ€\\x9d\\n\\n\\n\\nSome text\")\n",
|
||||
"print(repr(test)) # should be: '\"Hello world\"\\n\\nSome text'"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 68,
|
||||
"id": "b4b3ffea",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Ch28 last section ends at PDF page: 1843\n",
|
||||
"Back matter starts at PDF page: 1844\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# %%\n",
|
||||
"# Fix back matter bleed BEFORE chunking\n",
|
||||
"back_matter_start = next(item[2] for item in toc if item[1] == 'Suggested Reading')\n",
|
||||
"\n",
|
||||
"for section in structured:\n",
|
||||
" if section[\"end_pdf\"] >= back_matter_start - 1:\n",
|
||||
" section[\"end_pdf\"] = back_matter_start - 2\n",
|
||||
"\n",
|
||||
"# Verify Ch28 is now sane\n",
|
||||
"ch28 = [s for s in structured if s[\"chapter_num\"] == 28]\n",
|
||||
"print(f\"Ch28 last section ends at PDF page: {ch28[-1]['end_pdf']}\")\n",
|
||||
"print(f\"Back matter starts at PDF page: {back_matter_start - 1}\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 69,
|
||||
"id": "f7d6a626",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Section: An Old World: West Africa\n",
|
||||
"Paragraphs extracted: 4\n",
|
||||
"============================================================\n",
|
||||
"\n",
|
||||
"[Para 0 | PDF page 72]\n",
|
||||
"Like Native Americans and Europeans, West Africans did not consider themselves all one people. West Africans spoke dozens of different languages and hundreds of dialects. They lived under a variety of different political systems. In the late medieval and early modern eras, most West Africans lived in towns centered on kinship and run by elders. As in Native America, women in many parts of West Afr\n",
|
||||
"----------------------------------------\n",
|
||||
"\n",
|
||||
"[Para 1 | PDF page 72]\n",
|
||||
"Some parts of West Africa were ruled by large empires. Gaining power in the thirteenth century, the Mali empire became the largest in West Africa, with major cities at Jenne, Gao, and Timbuktu. To the south was the smaller kingdom of Benin, in what is now Nigeria. Its capital, Edo, was an imposing city whose craftspeople produced bronze sculptures that still inspire admiration for their artistic b\n",
|
||||
"----------------------------------------\n",
|
||||
"\n",
|
||||
"[Para 2 | PDF page 72]\n",
|
||||
"The wealth of West African empires was built on trans-Saharan trade. Starting around the year 1000, Muslim traders from North Africa and the Middle East crossed the Sahara to trade with West Africa. Camel caravans carried spices, silks, and cotton south to exchange for West African products, including textiles, gold, copper, grains, nuts, and art. From North Africa, West African products reached m\n",
|
||||
"----------------------------------------\n",
|
||||
"\n",
|
||||
"[Para 3 | PDF page 72]\n",
|
||||
"Although connected to trading networks and regional politics, most West Africans farmed, herded, and fished locally for their living. The rice, millet,\n",
|
||||
"----------------------------------------\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"def extract_section_text(section_idx: int) -> list[tuple[str, int]]:\n",
|
||||
" section = structured[section_idx]\n",
|
||||
" title = section[\"title\"]\n",
|
||||
" start_pdf = section[\"start_pdf\"]\n",
|
||||
" end_pdf = section[\"end_pdf\"]\n",
|
||||
"\n",
|
||||
" start_y = toc_coord_lookup.get((title, start_pdf + 1), 0)\n",
|
||||
" if section_idx + 1 < len(structured):\n",
|
||||
" next_s = structured[section_idx + 1]\n",
|
||||
" end_y = toc_coord_lookup.get((next_s[\"title\"], next_s[\"start_pdf\"] + 1), 9999)\n",
|
||||
" else:\n",
|
||||
" end_y = 9999\n",
|
||||
"\n",
|
||||
" paragraphs = []\n",
|
||||
"\n",
|
||||
" for page_num in range(start_pdf, end_pdf + 1):\n",
|
||||
" page = doc[page_num]\n",
|
||||
" page_width = page.rect.width\n",
|
||||
" page_height = page.rect.height\n",
|
||||
"\n",
|
||||
" if start_pdf == end_pdf:\n",
|
||||
" clip = fitz.Rect(0, start_y, page_width, end_y)\n",
|
||||
" elif page_num == start_pdf:\n",
|
||||
" clip = fitz.Rect(0, start_y, page_width, page_height)\n",
|
||||
" elif page_num == end_pdf:\n",
|
||||
" clip = fitz.Rect(0, 0, page_width, end_y)\n",
|
||||
" else:\n",
|
||||
" clip = page.rect\n",
|
||||
"\n",
|
||||
" # blocks returns: (x0, y0, x1, y1, text, block_no, block_type)\n",
|
||||
" # block_type 0 = text, 1 = image\n",
|
||||
" blocks = page.get_text(\"blocks\", clip=clip)\n",
|
||||
"\n",
|
||||
" for block in blocks:\n",
|
||||
" block_type = block[6]\n",
|
||||
" if block_type != 0:\n",
|
||||
" continue # skip image blocks\n",
|
||||
"\n",
|
||||
" raw_text = block[4]\n",
|
||||
" text = clean_text(raw_text)\n",
|
||||
" text = text.replace('\\n', ' ').strip() # join wrapped lines within block\n",
|
||||
"\n",
|
||||
" if len(text) < 80:\n",
|
||||
" continue # skip headers, page numbers, captions that are too short\n",
|
||||
"\n",
|
||||
" paragraphs.append((text, page_num))\n",
|
||||
"\n",
|
||||
" return paragraphs\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# ── Test ─────────────────────────────────────────────────────────────────────\n",
|
||||
"test_idx = next(i for i, s in enumerate(structured) if 'West Africa' in s['title'])\n",
|
||||
"test_paras = extract_section_text(test_idx)\n",
|
||||
"\n",
|
||||
"print(f\"Section: {structured[test_idx]['title']}\")\n",
|
||||
"print(f\"Paragraphs extracted: {len(test_paras)}\")\n",
|
||||
"print(\"=\" * 60)\n",
|
||||
"\n",
|
||||
"for i, (text, page) in enumerate(test_paras):\n",
|
||||
" print(f\"\\n[Para {i} | PDF page {page}]\")\n",
|
||||
" print(text[:400])\n",
|
||||
" print(\"-\" * 40)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 70,
|
||||
"id": "01a14318",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"✓ Rebuilt 5474 children with parent_id field\n",
|
||||
"✗ Failed: 0\n",
|
||||
"\n",
|
||||
"Sample child:\n",
|
||||
" chunk_id: ch02_s014_p008\n",
|
||||
" parent_id: ch02_s014_parent\n",
|
||||
" type: child\n",
|
||||
"Child chunks: 5474\n",
|
||||
"Parent chunks: 204\n",
|
||||
"Total: 5678\n",
|
||||
"\n",
|
||||
"Expected parents: 204 (one per unique section)\n",
|
||||
"\n",
|
||||
"Sample parent:\n",
|
||||
" ID: ch04_s026_parent\n",
|
||||
" Citation: Chapter 4: Slavery, Freedom, and the Struggle for Empire to 1763 › Slavery and Empire › page number not yet added\n",
|
||||
" Word count: 2754\n",
|
||||
" Preview: Of the estimated 10 million Africans transported to the Americas between 1492 and 1820, more than half arrived between 1700 and 1800. The Atlantic slave trade would later be condemned as a crime against humanity. But in the eighteenth century, it was a regularized business in which European merchant\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# %%\n",
|
||||
"# Build parent chunks — one per section, containing all child text concatenated\n",
|
||||
"# Children point to their parent via parent_id field\n",
|
||||
"\n",
|
||||
"from dataclasses import dataclass, asdict, field\n",
|
||||
"from typing import Optional\n",
|
||||
"\n",
|
||||
"@dataclass\n",
|
||||
"class Chunk:\n",
|
||||
" chunk_id: str\n",
|
||||
" text: str\n",
|
||||
" chapter_num: int\n",
|
||||
" chapter_title: str\n",
|
||||
" section_title: str\n",
|
||||
" paragraph_index: int\n",
|
||||
" pdf_page: int\n",
|
||||
" textbook_page: Optional[int]\n",
|
||||
" is_chapter_header: bool\n",
|
||||
" is_chapter_review: bool\n",
|
||||
" chunk_type: str # \"child\" | \"parent\"\n",
|
||||
" parent_id: Optional[str] # set on children, None on parents\n",
|
||||
"\n",
|
||||
" def citation(self) -> str:\n",
|
||||
" page_str = f\"approx. p. {self.textbook_page}\" if self.textbook_page else \"page number not yet added\"\n",
|
||||
" return f\"{self.chapter_title} › {self.section_title} › {page_str}\"\n",
|
||||
"\n",
|
||||
"# %%\n",
|
||||
"# Rebuild all_chunks using the FINAL Chunk definition (with chunk_type + parent_id)\n",
|
||||
"# This replaces the old all_chunks that were built without those fields\n",
|
||||
"\n",
|
||||
"all_chunks_final = []\n",
|
||||
"failed = []\n",
|
||||
"\n",
|
||||
"for section_idx, section in enumerate(structured):\n",
|
||||
" try:\n",
|
||||
" title = section[\"title\"]\n",
|
||||
" ch_num = section[\"chapter_num\"]\n",
|
||||
" ch_title = section[\"chapter_title\"]\n",
|
||||
" is_header = section[\"is_chapter_header\"]\n",
|
||||
" is_review = \"Chapter Review\" in title\n",
|
||||
" real_page = section_page_lookup.get(title)\n",
|
||||
"\n",
|
||||
" # Pre-compute parent_id for this section\n",
|
||||
" parent_id = f\"ch{ch_num:02d}_s{section_idx:03d}_parent\"\n",
|
||||
"\n",
|
||||
" paragraphs = extract_section_text(section_idx)\n",
|
||||
"\n",
|
||||
" for para_idx, (text, pdf_page) in enumerate(paragraphs):\n",
|
||||
" chunk_id = f\"ch{ch_num:02d}_s{section_idx:03d}_p{para_idx:03d}\"\n",
|
||||
" all_chunks_final.append(Chunk(\n",
|
||||
" chunk_id = chunk_id,\n",
|
||||
" text = text,\n",
|
||||
" chapter_num = ch_num,\n",
|
||||
" chapter_title = ch_title,\n",
|
||||
" section_title = title,\n",
|
||||
" paragraph_index = para_idx,\n",
|
||||
" pdf_page = pdf_page,\n",
|
||||
" textbook_page = real_page,\n",
|
||||
" is_chapter_header = is_header,\n",
|
||||
" is_chapter_review = is_review,\n",
|
||||
" chunk_type = \"child\",\n",
|
||||
" parent_id = parent_id,\n",
|
||||
" ))\n",
|
||||
"\n",
|
||||
" except Exception as e:\n",
|
||||
" failed.append({\"section\": section[\"title\"], \"error\": str(e)})\n",
|
||||
" print(f\"FAILED: {section['title']} → {e}\")\n",
|
||||
"\n",
|
||||
"# Replace old all_chunks\n",
|
||||
"all_chunks = all_chunks_final\n",
|
||||
"\n",
|
||||
"print(f\"✓ Rebuilt {len(all_chunks)} children with parent_id field\")\n",
|
||||
"print(f\"✗ Failed: {len(failed)}\")\n",
|
||||
"\n",
|
||||
"# Verify parent_id is present\n",
|
||||
"sample = all_chunks[300]\n",
|
||||
"print(f\"\\nSample child:\")\n",
|
||||
"print(f\" chunk_id: {sample.chunk_id}\")\n",
|
||||
"print(f\" parent_id: {sample.parent_id}\")\n",
|
||||
"print(f\" type: {sample.chunk_type}\")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# ── Second pass: build one parent per unique section ─────────────────────────\n",
|
||||
"from collections import defaultdict\n",
|
||||
"\n",
|
||||
"# Group children by section\n",
|
||||
"section_children = defaultdict(list)\n",
|
||||
"for chunk in all_chunks:\n",
|
||||
" key = (chunk.chapter_num, chunk.section_title)\n",
|
||||
" section_children[key].append(chunk)\n",
|
||||
"\n",
|
||||
"parent_chunks = []\n",
|
||||
"\n",
|
||||
"for (ch_num, section_title), children in section_children.items():\n",
|
||||
" # Sort children by paragraph index so text is in reading order\n",
|
||||
" children_sorted = sorted(children, key=lambda c: c.paragraph_index)\n",
|
||||
"\n",
|
||||
" # Concatenate all paragraph text for this section\n",
|
||||
" full_text = \"\\n\\n\".join(c.text for c in children_sorted)\n",
|
||||
"\n",
|
||||
" # Inherit metadata from first child\n",
|
||||
" first = children_sorted[0]\n",
|
||||
"\n",
|
||||
" parent_id = first.parent_id\n",
|
||||
"\n",
|
||||
" parent = Chunk(\n",
|
||||
" chunk_id = parent_id,\n",
|
||||
" text = full_text,\n",
|
||||
" chapter_num = ch_num,\n",
|
||||
" chapter_title = first.chapter_title,\n",
|
||||
" section_title = section_title,\n",
|
||||
" paragraph_index = -1, # not applicable for parent\n",
|
||||
" pdf_page = first.pdf_page,\n",
|
||||
" textbook_page = first.textbook_page,\n",
|
||||
" is_chapter_header = first.is_chapter_header,\n",
|
||||
" is_chapter_review = first.is_chapter_review,\n",
|
||||
" chunk_type = \"parent\",\n",
|
||||
" parent_id = None, # parents have no parent\n",
|
||||
" )\n",
|
||||
" parent_chunks.append(parent)\n",
|
||||
"\n",
|
||||
"print(f\"Child chunks: {len(all_chunks)}\")\n",
|
||||
"print(f\"Parent chunks: {len(parent_chunks)}\")\n",
|
||||
"print(f\"Total: {len(all_chunks) + len(parent_chunks)}\")\n",
|
||||
"\n",
|
||||
"# Sanity check — one parent per section\n",
|
||||
"print(f\"\\nExpected parents: {len(section_children)} (one per unique section)\")\n",
|
||||
"\n",
|
||||
"# Show one parent end to end\n",
|
||||
"sample_parent = next(p for p in parent_chunks if p.chapter_num == 4 \n",
|
||||
" and 'Slavery and Empire' in p.section_title)\n",
|
||||
"print(f\"\\nSample parent:\")\n",
|
||||
"print(f\" ID: {sample_parent.chunk_id}\")\n",
|
||||
"print(f\" Citation: {sample_parent.citation()}\")\n",
|
||||
"print(f\" Word count: {len(sample_parent.text.split())}\")\n",
|
||||
"print(f\" Preview: {sample_parent.text[:300]}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 71,
|
||||
"id": "87f74abb",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Chunks per chapter:\n",
|
||||
" Ch 01: 186 chunks\n",
|
||||
" Ch 02: 195 chunks\n",
|
||||
" Ch 03: 173 chunks\n",
|
||||
" Ch 04: 194 chunks\n",
|
||||
" Ch 05: 185 chunks\n",
|
||||
" Ch 06: 153 chunks\n",
|
||||
" Ch 07: 161 chunks\n",
|
||||
" Ch 08: 192 chunks\n",
|
||||
" Ch 09: 166 chunks\n",
|
||||
" Ch 10: 193 chunks\n",
|
||||
" Ch 11: 158 chunks\n",
|
||||
" Ch 12: 171 chunks\n",
|
||||
" Ch 13: 189 chunks\n",
|
||||
" Ch 14: 217 chunks\n",
|
||||
" Ch 15: 188 chunks\n",
|
||||
" Ch 16: 209 chunks\n",
|
||||
" Ch 17: 200 chunks\n",
|
||||
" Ch 18: 239 chunks\n",
|
||||
" Ch 19: 206 chunks\n",
|
||||
" Ch 20: 177 chunks\n",
|
||||
" Ch 21: 205 chunks\n",
|
||||
" Ch 22: 221 chunks\n",
|
||||
" Ch 23: 172 chunks\n",
|
||||
" Ch 24: 212 chunks\n",
|
||||
" Ch 25: 234 chunks\n",
|
||||
" Ch 26: 220 chunks\n",
|
||||
" Ch 27: 213 chunks\n",
|
||||
" Ch 28: 245 chunks\n",
|
||||
"\n",
|
||||
"Text length — min: 80, max: 2251, avg: 510\n",
|
||||
"\n",
|
||||
"======================================================================\n",
|
||||
"SAMPLE CHUNKS — FULL TEXT\n",
|
||||
"======================================================================\n",
|
||||
"\n",
|
||||
"[ch01_s001_p000]\n",
|
||||
"Citation : Chapter 1: Old Worlds and New › An Old World: North America › page number not yet added\n",
|
||||
"PDF page : 62\n",
|
||||
"Text :\n",
|
||||
"The most striking feature of Native American society at the time Europeans arrived was its sheer diversity. Each group had its own political system and set of religious beliefs, and North America was home to hundreds of mutually unintelligible languages. Indians did not define “America” as a continent or hemisphere. They did not think of themselves as a single people, and Native Americans still today identify primarily as separate nations. Identity centered on the immediate social group—a family, clan, town, nation, or confederacy. When Europeans first arrived, many Indians saw them as simply one group among many. Their first thought was how to use the newcomers to enhance their standing in relation to other Native peoples rather than to unite against them. The sharp dichotomy between “Indians” and “white” persons did not emerge until later in the colonial era.\n",
|
||||
"----------------------------------------------------------------------\n",
|
||||
"\n",
|
||||
"[ch04_s026_p000]\n",
|
||||
"Citation : Chapter 4: Slavery, Freedom, and the Struggle for Empire to 1763 › Slavery and Empire › page number not yet added\n",
|
||||
"PDF page : 256\n",
|
||||
"Text :\n",
|
||||
"Of the estimated 10 million Africans transported to the Americas between 1492 and 1820, more than half arrived between 1700 and 1800. The Atlantic slave trade would later be condemned as a crime against humanity. But in the eighteenth century, it was a regularized business in which European merchants, African traders, and American planters engaged in complex bargaining over human lives, all with the expectation of securing a profit. The slave trade was a vital part of world commerce. Every European empire in the Americas utilized slave labor and battled for control of this lucrative trade. The asiento—an agreement whereby Spain subcontracted to a foreign power the right to provide slaves to Spanish America—was an important diplomatic prize. Britain’s acquisition of the asiento from the Dutch in the Treaty of Utrecht of 1713 was a major step in its rise to commercial supremacy.\n",
|
||||
"----------------------------------------------------------------------\n",
|
||||
"\n",
|
||||
"[ch11_s076_p000]\n",
|
||||
"Citation : Chapter 11: The Peculiar Institution › The Old South › page number not yet added\n",
|
||||
"PDF page : 671\n",
|
||||
"Text :\n",
|
||||
"When Frederick Douglass was born, slavery was already an old institution in America. Two centuries had passed since the first twenty Africans were landed in Virginia from a Dutch ship. After abolition in the North, slavery had become the “peculiar institution” of the South—that is, an institution unique to southern society. The Mason-Dixon Line, drawn by two surveyors in the eighteenth century to settle a boundary dispute between Maryland and Pennsylvania, eventually became the dividing line between slavery and freedom.\n",
|
||||
"----------------------------------------------------------------------\n",
|
||||
"\n",
|
||||
"[ch18_s122_p000]\n",
|
||||
"Citation : Chapter 18: The Progressive Era, 1900⠍1916 › An Urban Age and a Consumer Society › page number not yet added\n",
|
||||
"PDF page : 1110\n",
|
||||
"Text :\n",
|
||||
"The Progressive era was a period of explosive economic growth, fueled by increasing industrial production, a rapid rise in population, and the continued expansion of the consumer marketplace. In the first decade of the twentieth century, the economy’s total output rose by about 85 percent. For the last time in American history, farms and cities grew together. As farm prices recovered from their low point during the depression of the 1890s, American agriculture entered what would later be remembered as its “golden age.” The expansion of urban areas stimulated demand for farm goods. Farm families poured into the western Great Plains. More than 1 million claims for free government land were filed under the Homestead Act of 1862—more than in the previous forty years. Between 1900 and 1910, the combined population of Texas and Oklahoma rose by nearly 2 million, and Kansas, Nebraska, and the Dakotas added 800,000. Irrigation transformed the Imperial Valley of California and parts of Arizona into major areas of commercial farming.\n",
|
||||
"----------------------------------------------------------------------\n",
|
||||
"\n",
|
||||
"[ch26_s178_p000]\n",
|
||||
"Citation : Chapter 26: The Conservative Turn, 1969⠍1988 › President Nixon › page number not yet added\n",
|
||||
"PDF page : 1630\n",
|
||||
"Text :\n",
|
||||
"Richard Nixon’s presidency bridged the eras of liberalism under Kennedy and Johnson and the conservatism of the Reagan era. Nixon was the first president from California, and his victory signaled the growing power of the conservative Sunbelt in national politics. From the vantage point of the early twenty-first century, it is difficult to recall how marginal conservatism seemed at the end of World War II. Associated in many minds with conspiracy theories, anti-Semitism, and preference for social hierarchy over democracy and equality, conservatism seemed a relic of a discredited past.\n",
|
||||
"----------------------------------------------------------------------\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"\n",
|
||||
"# %%\n",
|
||||
"# Sanity check\n",
|
||||
"from collections import Counter\n",
|
||||
"\n",
|
||||
"by_chapter = Counter(c.chapter_num for c in all_chunks)\n",
|
||||
"print(\"Chunks per chapter:\")\n",
|
||||
"for ch_num in sorted(by_chapter):\n",
|
||||
" print(f\" Ch {ch_num:02d}: {by_chapter[ch_num]} chunks\")\n",
|
||||
"\n",
|
||||
"lengths = [len(c.text) for c in all_chunks]\n",
|
||||
"print(f\"\\nText length — min: {min(lengths)}, max: {max(lengths)}, avg: {sum(lengths)//len(lengths)}\")\n",
|
||||
"\n",
|
||||
"# ── Print 5 real full chunks from different chapters ──────────────────────────\n",
|
||||
"print(\"\\n\" + \"=\"*70)\n",
|
||||
"print(\"SAMPLE CHUNKS — FULL TEXT\")\n",
|
||||
"print(\"=\"*70)\n",
|
||||
"\n",
|
||||
"test_chapters = [1, 4, 11, 18, 26]\n",
|
||||
"for ch in test_chapters:\n",
|
||||
" sample = next((c for c in all_chunks if c.chapter_num == ch and not c.is_chapter_header), None)\n",
|
||||
" if sample:\n",
|
||||
" print(f\"\\n[{sample.chunk_id}]\")\n",
|
||||
" print(f\"Citation : {sample.citation()}\")\n",
|
||||
" print(f\"PDF page : {sample.pdf_page}\")\n",
|
||||
" print(f\"Text :\\n{sample.text}\")\n",
|
||||
" print(\"-\"*70)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 72,
|
||||
"id": "f1bc3f74",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Top 10 largest parents:\n",
|
||||
" 6,944 words — The American Dilemma\n",
|
||||
" 6,690 words — The Transformation of the West\n",
|
||||
" 6,226 words — The Golden Age\n",
|
||||
" 5,697 words — The New Movements and the Rights Revolution\n",
|
||||
" 5,279 words — The Anticommunist Crusade\n",
|
||||
" 5,278 words — The Making of Radical Reconstruction\n",
|
||||
" 5,071 words — Varieties of Progressivism\n",
|
||||
" 4,972 words — Culture Wars\n",
|
||||
" 4,885 words — The Meaning of Freedom\n",
|
||||
" 4,823 words — The Segregated South\n",
|
||||
"\n",
|
||||
"Bottom 5 smallest parents:\n",
|
||||
" 337 words — Chapter 13: A House Divided, 1840⠍1861\n",
|
||||
" 295 words — The Attacks of September 11\n",
|
||||
" 268 words — Chapter Review\n",
|
||||
" 251 words — An Old World: West Africa\n",
|
||||
" 196 words — Chapter Review\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"parent_word_counts = [(p.section_title, len(p.text.split())) for p in parent_chunks]\n",
|
||||
"parent_word_counts.sort(key=lambda x: x[1], reverse=True)\n",
|
||||
"\n",
|
||||
"print(\"Top 10 largest parents:\")\n",
|
||||
"for title, count in parent_word_counts[:10]:\n",
|
||||
" print(f\" {count:,} words — {title}\")\n",
|
||||
"\n",
|
||||
"print(\"\\nBottom 5 smallest parents:\")\n",
|
||||
"for title, count in parent_word_counts[-5:]:\n",
|
||||
" print(f\" {count:,} words — {title}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 73,
|
||||
"id": "8355e686",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Before capping: 204 parents\n",
|
||||
"After capping: 229 parents\n",
|
||||
"New max parent: 3,973 words\n",
|
||||
"New avg parent: 1,915 words\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# %%\n",
|
||||
"MAX_PARENT_WORDS = 4000\n",
|
||||
"\n",
|
||||
"capped_parents = []\n",
|
||||
"\n",
|
||||
"for parent in parent_chunks:\n",
|
||||
" words = parent.text.split()\n",
|
||||
" \n",
|
||||
" if len(words) <= MAX_PARENT_WORDS:\n",
|
||||
" capped_parents.append(parent)\n",
|
||||
" continue\n",
|
||||
" \n",
|
||||
" # Split into halves, preserve all metadata\n",
|
||||
" mid = len(words) // 2\n",
|
||||
" half_a = ' '.join(words[:mid])\n",
|
||||
" half_b = ' '.join(words[mid:])\n",
|
||||
"\n",
|
||||
" for i, half_text in enumerate([half_a, half_b]):\n",
|
||||
" capped_parents.append(Chunk(\n",
|
||||
" chunk_id = f\"{parent.chunk_id}_part{i}\",\n",
|
||||
" text = half_text,\n",
|
||||
" chapter_num = parent.chapter_num,\n",
|
||||
" chapter_title = parent.chapter_title,\n",
|
||||
" section_title = parent.section_title,\n",
|
||||
" paragraph_index = -1,\n",
|
||||
" pdf_page = parent.pdf_page,\n",
|
||||
" textbook_page = parent.textbook_page,\n",
|
||||
" is_chapter_header = parent.is_chapter_header,\n",
|
||||
" is_chapter_review = parent.is_chapter_review,\n",
|
||||
" chunk_type = \"parent\",\n",
|
||||
" parent_id = None,\n",
|
||||
" ))\n",
|
||||
"\n",
|
||||
"print(f\"Before capping: {len(parent_chunks)} parents\")\n",
|
||||
"print(f\"After capping: {len(capped_parents)} parents\")\n",
|
||||
"\n",
|
||||
"word_counts = sorted([len(p.text.split()) for p in capped_parents], reverse=True)\n",
|
||||
"print(f\"New max parent: {word_counts[0]:,} words\")\n",
|
||||
"print(f\"New avg parent: {sum(word_counts)//len(word_counts):,} words\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 74,
|
||||
"id": "dad7c6d3",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Duplicate parent IDs: set()\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Check for any duplicate parent IDs right now\n",
|
||||
"parent_ids = [p.chunk_id for p in parent_chunks]\n",
|
||||
"dupes = [pid for pid in parent_ids if parent_ids.count(pid) > 1]\n",
|
||||
"print(f\"Duplicate parent IDs: {set(dupes)}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 76,
|
||||
"id": "0ef0c08a",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Parent lookup entries: 204\n",
|
||||
"Children with missing parent: 0\n",
|
||||
"Duplicate capped parent IDs: none ✓\n",
|
||||
"Saved 5474 children → /home/keshav/code/apush-rag/data/processed/chunks_children.json\n",
|
||||
"Saved 204 parent entries → /home/keshav/code/apush-rag/data/processed/parent_lookup.json\n",
|
||||
"\n",
|
||||
"Round-trip test:\n",
|
||||
" Child ID: ch02_s014_p008\n",
|
||||
" Parent ID: ch02_s014_parent\n",
|
||||
" Parent parts found: 1\n",
|
||||
" Citation: Chapter 2: European Colonies and Native Nations, 1600⠍1660 › New Englanders Divided\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import json\n",
|
||||
"\n",
|
||||
"# %%\n",
|
||||
"# Build parent lookup: original_parent_id → list of capped parts\n",
|
||||
"# Handles both unsplit (1 entry) and split (2 entries) parents\n",
|
||||
"parent_lookup = {}\n",
|
||||
"\n",
|
||||
"for capped in capped_parents:\n",
|
||||
" # Strip _part0 / _part1 suffix to get original parent_id\n",
|
||||
" original_id = re.sub(r'_part\\d+$', '', capped.chunk_id)\n",
|
||||
" if original_id not in parent_lookup:\n",
|
||||
" parent_lookup[original_id] = []\n",
|
||||
" parent_lookup[original_id].append(asdict(capped))\n",
|
||||
"\n",
|
||||
"# Verify all children can find their parent\n",
|
||||
"missing = []\n",
|
||||
"for chunk in all_chunks:\n",
|
||||
" if chunk.parent_id not in parent_lookup:\n",
|
||||
" missing.append(chunk.chunk_id)\n",
|
||||
"\n",
|
||||
"print(f\"Parent lookup entries: {len(parent_lookup)}\")\n",
|
||||
"print(f\"Children with missing parent: {len(missing)}\")\n",
|
||||
"if missing:\n",
|
||||
" print(f\" Examples: {missing[:5]}\")\n",
|
||||
"\n",
|
||||
"# Duplicate check on the right list\n",
|
||||
"capped_ids = [p.chunk_id for p in capped_parents]\n",
|
||||
"dupes = {pid for pid in capped_ids if capped_ids.count(pid) > 1}\n",
|
||||
"print(f\"Duplicate capped parent IDs: {dupes if dupes else 'none ✓'}\")\n",
|
||||
"\n",
|
||||
"# %%\n",
|
||||
"# Save everything\n",
|
||||
"output_dir = project_root / \"data\" / \"processed\"\n",
|
||||
"output_dir.mkdir(parents=True, exist_ok=True)\n",
|
||||
"\n",
|
||||
"# Children — these go into Qdrant for vector search\n",
|
||||
"children_path = output_dir / \"chunks_children.json\"\n",
|
||||
"with open(children_path, \"w\", encoding=\"utf-8\") as f:\n",
|
||||
" json.dump([asdict(c) for c in all_chunks], f, indent=2, ensure_ascii=False)\n",
|
||||
"\n",
|
||||
"# Parent lookup — keyed by original parent_id, value is list of parts\n",
|
||||
"# At query time: retrieve child → look up parent_id → get all parts → send to LLM\n",
|
||||
"lookup_path = output_dir / \"parent_lookup.json\"\n",
|
||||
"with open(lookup_path, \"w\", encoding=\"utf-8\") as f:\n",
|
||||
" json.dump(parent_lookup, f, indent=2, ensure_ascii=False)\n",
|
||||
"\n",
|
||||
"print(f\"Saved {len(all_chunks)} children → {children_path}\")\n",
|
||||
"print(f\"Saved {len(parent_lookup)} parent entries → {lookup_path}\")\n",
|
||||
"\n",
|
||||
"# Verify round-trip\n",
|
||||
"with open(children_path) as f:\n",
|
||||
" check_children = json.load(f)\n",
|
||||
"with open(lookup_path) as f:\n",
|
||||
" check_lookup = json.load(f)\n",
|
||||
"\n",
|
||||
"# Test one full retrieval cycle works\n",
|
||||
"sample_child = check_children[300]\n",
|
||||
"sample_parent = check_lookup.get(sample_child[\"parent_id\"])\n",
|
||||
"\n",
|
||||
"print(f\"\\nRound-trip test:\")\n",
|
||||
"print(f\" Child ID: {sample_child['chunk_id']}\")\n",
|
||||
"print(f\" Parent ID: {sample_child['parent_id']}\")\n",
|
||||
"print(f\" Parent parts found: {len(sample_parent) if sample_parent else 'MISSING ✗'}\")\n",
|
||||
"print(f\" Citation: {sample_child['chapter_title']} › {sample_child['section_title']}\")"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
|
||||
Reference in New Issue
Block a user