cleaned up and run
This commit is contained in:
@@ -20,7 +20,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 18,
|
||||
"execution_count": 2,
|
||||
"id": "11896305",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@@ -262,7 +262,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 19,
|
||||
"execution_count": 3,
|
||||
"id": "991dbad2",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@@ -496,7 +496,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 20,
|
||||
"execution_count": 4,
|
||||
"id": "43e20197",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@@ -596,7 +596,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 22,
|
||||
"execution_count": 6,
|
||||
"id": "c2563864",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@@ -652,7 +652,7 @@
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Loaded 177 entries from page map\n"
|
||||
"Loaded 353 entries from page map\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
@@ -664,15 +664,14 @@
|
||||
"with open(page_map_path) as f:\n",
|
||||
" page_map = yaml.safe_load(f)\n",
|
||||
"\n",
|
||||
"# Build fast lookup: section title → real page (will be None for now)\n",
|
||||
"section_page_lookup = {}\n",
|
||||
"for ch_num, ch_data in page_map[\"chapters\"].items():\n",
|
||||
" # Chapter-level entry\n",
|
||||
" section_page_lookup[ch_data[\"title\"]] = ch_data.get(\"real_page\")\n",
|
||||
" # Section-level entries\n",
|
||||
" if ch_data.get(\"sections\"):\n",
|
||||
" for section_title, real_page in ch_data[\"sections\"].items():\n",
|
||||
" section_page_lookup[section_title] = real_page\n",
|
||||
" section_page_lookup[(ch_num, section_title)] = real_page\n",
|
||||
" if section_title not in section_page_lookup:\n",
|
||||
" section_page_lookup[section_title] = real_page\n",
|
||||
"\n",
|
||||
"print(f\"Loaded {len(section_page_lookup)} entries from page map\")"
|
||||
]
|
||||
@@ -687,37 +686,78 @@
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"('Half-title Page', 2) → y=0\n",
|
||||
"('Physical/Political Map of The United States', 5) → y=0\n",
|
||||
"('Political Map of The World', 6) → y=0\n",
|
||||
"('Title Page', 7) → y=0\n",
|
||||
"('Copyright', 10) → y=0\n",
|
||||
"('Dedication', 13) → y=0\n",
|
||||
"('Contents', 14) → y=0\n",
|
||||
"('List of Maps, Tables, and Figures', 22) → y=0\n",
|
||||
"('About the Authors', 32) → y=0\n",
|
||||
"('Preface', 34) → y=0\n"
|
||||
"Total entries: 221\n",
|
||||
"Still y=0: 36\n",
|
||||
"\n",
|
||||
"Sample (should now be non-zero for section headers):\n",
|
||||
" ('An Old World: Western Europe', 75) → y=93.6\n",
|
||||
" ('Contact', 80) → y=92.1\n",
|
||||
" ('The Spanish Empire', 88) → y=90.9\n",
|
||||
" ('The French and Dutch Empires', 108) → y=92.1\n",
|
||||
" ('Chapter Review', 120) → y=93.6\n",
|
||||
" ('Chapter 2: European Colonies and Native Nations, 1600⠍1660', 124) → y=0.0\n",
|
||||
" ('England and the Americas', 129) → y=92.1\n",
|
||||
" ('Early English Exploration and Colonization', 138) → y=92.1\n",
|
||||
" ('The Chesapeake', 142) → y=92.1\n",
|
||||
" ('Origins of American Slavery', 150) → y=92.1\n",
|
||||
"\n",
|
||||
"Could not locate on page (will use y=0 fallback):\n",
|
||||
" p2: Half-title Page\n",
|
||||
" p7: Title Page\n",
|
||||
" p13: Dedication\n",
|
||||
" p59: Chapter 1: Old Worlds and New\n",
|
||||
" p124: Chapter 2: European Colonies and Native Nations, 1600⠍1660\n",
|
||||
" p193: Chapter 3: Creating Anglo-America, 1660⠍1750\n",
|
||||
" p246: North America at Mid-Century\n",
|
||||
" p253: Chapter 4: Slavery, Freedom, and the Struggle for Empire to 1763\n",
|
||||
" p325: Chapter 5: The American Revolution, 1763⠍1783\n",
|
||||
" p381: Chapter 6: The Revolution Within\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Build a lookup from (title, pdf_page_1indexed) → y coordinate\n",
|
||||
"# This is what gives us pixel-precise section boundaries\n",
|
||||
"toc_full = doc.get_toc(simple=False)\n",
|
||||
"\n",
|
||||
"toc_coord_lookup = {}\n",
|
||||
"not_found = []\n",
|
||||
"\n",
|
||||
"for item in toc_full:\n",
|
||||
" level = item[0]\n",
|
||||
" title = item[1]\n",
|
||||
" page_1idx = item[2]\n",
|
||||
" dest = item[3] if len(item) > 3 else {}\n",
|
||||
" y = dest.get(\"y\", 0) if isinstance(dest, dict) else 0\n",
|
||||
"\n",
|
||||
" # If y=0 (page-only link), search for the title text on the page\n",
|
||||
" if y == 0:\n",
|
||||
" page = doc[page_1idx - 1] # 0-indexed\n",
|
||||
" hits = page.search_for(title)\n",
|
||||
" if hits:\n",
|
||||
" y = hits[0].y0 # top of first match\n",
|
||||
" else:\n",
|
||||
" # Try searching for just the first few words (TOC titles sometimes\n",
|
||||
" # differ slightly from the printed heading)\n",
|
||||
" short = \" \".join(title.split()[:4])\n",
|
||||
" hits = page.search_for(short)\n",
|
||||
" if hits:\n",
|
||||
" y = hits[0].y0\n",
|
||||
" else:\n",
|
||||
" not_found.append((title, page_1idx))\n",
|
||||
" y = 0 # fallback — whole page\n",
|
||||
"\n",
|
||||
" toc_coord_lookup[(title, page_1idx)] = y\n",
|
||||
"\n",
|
||||
"# Sanity check — print a few to confirm y values are non-zero\n",
|
||||
"sample = list(toc_coord_lookup.items())[:10]\n",
|
||||
"for k, v in sample:\n",
|
||||
" print(f\"{k} → y={v}\")"
|
||||
"# Sanity check\n",
|
||||
"print(f\"Total entries: {len(toc_coord_lookup)}\")\n",
|
||||
"print(f\"Still y=0: {len(not_found)}\")\n",
|
||||
"print(\"\\nSample (should now be non-zero for section headers):\")\n",
|
||||
"for k, v in list(toc_coord_lookup.items())[14:24]: # skip front matter, show ch1+\n",
|
||||
" print(f\" {k} → y={v:.1f}\")\n",
|
||||
"\n",
|
||||
"if not_found:\n",
|
||||
" print(f\"\\nCould not locate on page (will use y=0 fallback):\")\n",
|
||||
" for t, p in not_found[:10]:\n",
|
||||
" print(f\" p{p}: {t}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -762,6 +802,63 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"id": "75737655",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"✓ \"We the People\" → y=92.1 (matched 'We the People')\n",
|
||||
"✗ The \"Second War of Independence\" — check page 531 manually\n",
|
||||
"✗ The Post-Cold War World — check page 1700 manually\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import re\n",
|
||||
"\n",
|
||||
"def clean_for_search(title: str) -> str:\n",
|
||||
" replacements = {\n",
|
||||
" '“': '\"', 'â€\\x9d': '\"', '’': \"'\",\n",
|
||||
" 'â€\"': '—', '‘': \"'\",\n",
|
||||
" '⠍': '-', '⠒': \"'\",\n",
|
||||
" }\n",
|
||||
" for bad, good in replacements.items():\n",
|
||||
" title = title.replace(bad, good)\n",
|
||||
" return title\n",
|
||||
"\n",
|
||||
"# The 3 known stragglers — fix them directly\n",
|
||||
"stragglers = [\n",
|
||||
" ('\"We the People\"', 472),\n",
|
||||
" ('The \"Second War of Independence\"', 531),\n",
|
||||
" ('The Post-Cold War World', 1700),\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"for clean_title, page_1idx in stragglers:\n",
|
||||
" page = doc[page_1idx - 1]\n",
|
||||
" \n",
|
||||
" # Try progressively shorter substrings of the cleaned title\n",
|
||||
" words = clean_title.split()\n",
|
||||
" for n in range(len(words), 1, -1):\n",
|
||||
" snippet = \" \".join(words[:n]).strip('\"') # strip quotes too\n",
|
||||
" hits = page.search_for(snippet)\n",
|
||||
" if hits:\n",
|
||||
" # Update the ORIGINAL (dirty) key in the lookup\n",
|
||||
" dirty_key = next(\n",
|
||||
" k for k in toc_coord_lookup \n",
|
||||
" if k[1] == page_1idx and clean_for_search(k[0]) == clean_title\n",
|
||||
" )\n",
|
||||
" toc_coord_lookup[dirty_key] = hits[0].y0\n",
|
||||
" print(f\"✓ {clean_title} → y={hits[0].y0:.1f} (matched '{snippet}')\")\n",
|
||||
" break\n",
|
||||
" else:\n",
|
||||
" print(f\"✗ {clean_title} — check page {page_1idx} manually\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"id": "b4b3ffea",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@@ -791,7 +888,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"execution_count": 12,
|
||||
"id": "f7d6a626",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@@ -888,7 +985,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"execution_count": 13,
|
||||
"id": "01a14318",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@@ -896,23 +993,23 @@
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"✓ Rebuilt 5474 children with parent_id field\n",
|
||||
"✓ Rebuilt 5530 children with parent_id field\n",
|
||||
"✗ Failed: 0\n",
|
||||
"\n",
|
||||
"Sample child:\n",
|
||||
" chunk_id: ch02_s014_p008\n",
|
||||
" chunk_id: ch02_s014_p002\n",
|
||||
" parent_id: ch02_s014_parent\n",
|
||||
" type: child\n",
|
||||
"Child chunks: 5474\n",
|
||||
"Child chunks: 5530\n",
|
||||
"Parent chunks: 204\n",
|
||||
"Total: 5678\n",
|
||||
"Total: 5734\n",
|
||||
"\n",
|
||||
"Expected parents: 204 (one per unique section)\n",
|
||||
"\n",
|
||||
"Sample parent:\n",
|
||||
" ID: ch04_s026_parent\n",
|
||||
" Citation: Chapter 4: Slavery, Freedom, and the Struggle for Empire to 1763 › Slavery and Empire › page number not yet added\n",
|
||||
" Word count: 2754\n",
|
||||
" Citation: Chapter 4: Slavery, Freedom, and the Struggle for Empire to 1763 › Slavery and Empire › approx. p. 111\n",
|
||||
" Word count: 2787\n",
|
||||
" Preview: Of the estimated 10 million Africans transported to the Americas between 1492 and 1820, more than half arrived between 1700 and 1800. The Atlantic slave trade would later be condemned as a crime against humanity. But in the eighteenth century, it was a regularized business in which European merchant\n"
|
||||
]
|
||||
}
|
||||
@@ -958,7 +1055,7 @@
|
||||
" ch_title = section[\"chapter_title\"]\n",
|
||||
" is_header = section[\"is_chapter_header\"]\n",
|
||||
" is_review = \"Chapter Review\" in title\n",
|
||||
" real_page = section_page_lookup.get(title)\n",
|
||||
" real_page = section_page_lookup.get((ch_num, title)) or section_page_lookup.get(title)\n",
|
||||
"\n",
|
||||
" # Pre-compute parent_id for this section\n",
|
||||
" parent_id = f\"ch{ch_num:02d}_s{section_idx:03d}_parent\"\n",
|
||||
@@ -1060,7 +1157,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"execution_count": 14,
|
||||
"id": "87f74abb",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@@ -1069,71 +1166,71 @@
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Chunks per chapter:\n",
|
||||
" Ch 01: 186 chunks\n",
|
||||
" Ch 02: 195 chunks\n",
|
||||
" Ch 03: 173 chunks\n",
|
||||
" Ch 04: 194 chunks\n",
|
||||
" Ch 01: 189 chunks\n",
|
||||
" Ch 02: 199 chunks\n",
|
||||
" Ch 03: 176 chunks\n",
|
||||
" Ch 04: 197 chunks\n",
|
||||
" Ch 05: 185 chunks\n",
|
||||
" Ch 06: 153 chunks\n",
|
||||
" Ch 07: 161 chunks\n",
|
||||
" Ch 08: 192 chunks\n",
|
||||
" Ch 09: 166 chunks\n",
|
||||
" Ch 10: 193 chunks\n",
|
||||
" Ch 11: 158 chunks\n",
|
||||
" Ch 12: 171 chunks\n",
|
||||
" Ch 13: 189 chunks\n",
|
||||
" Ch 14: 217 chunks\n",
|
||||
" Ch 15: 188 chunks\n",
|
||||
" Ch 16: 209 chunks\n",
|
||||
" Ch 17: 200 chunks\n",
|
||||
" Ch 06: 155 chunks\n",
|
||||
" Ch 07: 164 chunks\n",
|
||||
" Ch 08: 193 chunks\n",
|
||||
" Ch 09: 168 chunks\n",
|
||||
" Ch 10: 195 chunks\n",
|
||||
" Ch 11: 160 chunks\n",
|
||||
" Ch 12: 173 chunks\n",
|
||||
" Ch 13: 191 chunks\n",
|
||||
" Ch 14: 218 chunks\n",
|
||||
" Ch 15: 189 chunks\n",
|
||||
" Ch 16: 210 chunks\n",
|
||||
" Ch 17: 201 chunks\n",
|
||||
" Ch 18: 239 chunks\n",
|
||||
" Ch 19: 206 chunks\n",
|
||||
" Ch 20: 177 chunks\n",
|
||||
" Ch 21: 205 chunks\n",
|
||||
" Ch 22: 221 chunks\n",
|
||||
" Ch 23: 172 chunks\n",
|
||||
" Ch 24: 212 chunks\n",
|
||||
" Ch 25: 234 chunks\n",
|
||||
" Ch 26: 220 chunks\n",
|
||||
" Ch 27: 213 chunks\n",
|
||||
" Ch 28: 245 chunks\n",
|
||||
" Ch 19: 208 chunks\n",
|
||||
" Ch 20: 179 chunks\n",
|
||||
" Ch 21: 207 chunks\n",
|
||||
" Ch 22: 224 chunks\n",
|
||||
" Ch 23: 174 chunks\n",
|
||||
" Ch 24: 215 chunks\n",
|
||||
" Ch 25: 237 chunks\n",
|
||||
" Ch 26: 221 chunks\n",
|
||||
" Ch 27: 216 chunks\n",
|
||||
" Ch 28: 247 chunks\n",
|
||||
"\n",
|
||||
"Text length — min: 80, max: 2251, avg: 510\n",
|
||||
"Text length — min: 80, max: 2251, avg: 506\n",
|
||||
"\n",
|
||||
"======================================================================\n",
|
||||
"SAMPLE CHUNKS — FULL TEXT\n",
|
||||
"======================================================================\n",
|
||||
"\n",
|
||||
"[ch01_s001_p000]\n",
|
||||
"Citation : Chapter 1: Old Worlds and New › An Old World: North America › page number not yet added\n",
|
||||
"Citation : Chapter 1: Old Worlds and New › An Old World: North America › approx. p. 4\n",
|
||||
"PDF page : 62\n",
|
||||
"Text :\n",
|
||||
"The most striking feature of Native American society at the time Europeans arrived was its sheer diversity. Each group had its own political system and set of religious beliefs, and North America was home to hundreds of mutually unintelligible languages. Indians did not define “America” as a continent or hemisphere. They did not think of themselves as a single people, and Native Americans still today identify primarily as separate nations. Identity centered on the immediate social group—a family, clan, town, nation, or confederacy. When Europeans first arrived, many Indians saw them as simply one group among many. Their first thought was how to use the newcomers to enhance their standing in relation to other Native peoples rather than to unite against them. The sharp dichotomy between “Indians” and “white” persons did not emerge until later in the colonial era.\n",
|
||||
"----------------------------------------------------------------------\n",
|
||||
"\n",
|
||||
"[ch04_s026_p000]\n",
|
||||
"Citation : Chapter 4: Slavery, Freedom, and the Struggle for Empire to 1763 › Slavery and Empire › page number not yet added\n",
|
||||
"Citation : Chapter 4: Slavery, Freedom, and the Struggle for Empire to 1763 › Slavery and Empire › approx. p. 111\n",
|
||||
"PDF page : 256\n",
|
||||
"Text :\n",
|
||||
"Of the estimated 10 million Africans transported to the Americas between 1492 and 1820, more than half arrived between 1700 and 1800. The Atlantic slave trade would later be condemned as a crime against humanity. But in the eighteenth century, it was a regularized business in which European merchants, African traders, and American planters engaged in complex bargaining over human lives, all with the expectation of securing a profit. The slave trade was a vital part of world commerce. Every European empire in the Americas utilized slave labor and battled for control of this lucrative trade. The asiento—an agreement whereby Spain subcontracted to a foreign power the right to provide slaves to Spanish America—was an important diplomatic prize. Britain’s acquisition of the asiento from the Dutch in the Treaty of Utrecht of 1713 was a major step in its rise to commercial supremacy.\n",
|
||||
"----------------------------------------------------------------------\n",
|
||||
"\n",
|
||||
"[ch11_s076_p000]\n",
|
||||
"Citation : Chapter 11: The Peculiar Institution › The Old South › page number not yet added\n",
|
||||
"Citation : Chapter 11: The Peculiar Institution › The Old South › approx. p. 327\n",
|
||||
"PDF page : 671\n",
|
||||
"Text :\n",
|
||||
"When Frederick Douglass was born, slavery was already an old institution in America. Two centuries had passed since the first twenty Africans were landed in Virginia from a Dutch ship. After abolition in the North, slavery had become the “peculiar institution” of the South—that is, an institution unique to southern society. The Mason-Dixon Line, drawn by two surveyors in the eighteenth century to settle a boundary dispute between Maryland and Pennsylvania, eventually became the dividing line between slavery and freedom.\n",
|
||||
"----------------------------------------------------------------------\n",
|
||||
"\n",
|
||||
"[ch18_s122_p000]\n",
|
||||
"Citation : Chapter 18: The Progressive Era, 1900⠍1916 › An Urban Age and a Consumer Society › page number not yet added\n",
|
||||
"Citation : Chapter 18: The Progressive Era, 1900⠍1916 › An Urban Age and a Consumer Society › approx. p. 578\n",
|
||||
"PDF page : 1110\n",
|
||||
"Text :\n",
|
||||
"The Progressive era was a period of explosive economic growth, fueled by increasing industrial production, a rapid rise in population, and the continued expansion of the consumer marketplace. In the first decade of the twentieth century, the economy’s total output rose by about 85 percent. For the last time in American history, farms and cities grew together. As farm prices recovered from their low point during the depression of the 1890s, American agriculture entered what would later be remembered as its “golden age.” The expansion of urban areas stimulated demand for farm goods. Farm families poured into the western Great Plains. More than 1 million claims for free government land were filed under the Homestead Act of 1862—more than in the previous forty years. Between 1900 and 1910, the combined population of Texas and Oklahoma rose by nearly 2 million, and Kansas, Nebraska, and the Dakotas added 800,000. Irrigation transformed the Imperial Valley of California and parts of Arizona into major areas of commercial farming.\n",
|
||||
"----------------------------------------------------------------------\n",
|
||||
"\n",
|
||||
"[ch26_s178_p000]\n",
|
||||
"Citation : Chapter 26: The Conservative Turn, 1969⠍1988 › President Nixon › page number not yet added\n",
|
||||
"Citation : Chapter 26: The Conservative Turn, 1969⠍1988 › President Nixon › approx. p. 853\n",
|
||||
"PDF page : 1630\n",
|
||||
"Text :\n",
|
||||
"Richard Nixon’s presidency bridged the eras of liberalism under Kennedy and Johnson and the conservatism of the Reagan era. Nixon was the first president from California, and his victory signaled the growing power of the conservative Sunbelt in national politics. From the vantage point of the early twenty-first century, it is difficult to recall how marginal conservatism seemed at the end of World War II. Associated in many minds with conspiracy theories, anti-Semitism, and preference for social hierarchy over democracy and equality, conservatism seemed a relic of a discredited past.\n",
|
||||
@@ -1173,7 +1270,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"execution_count": 15,
|
||||
"id": "f1bc3f74",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@@ -1182,12 +1279,12 @@
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Top 10 largest parents:\n",
|
||||
" 6,944 words — The American Dilemma\n",
|
||||
" 6,972 words — The American Dilemma\n",
|
||||
" 6,690 words — The Transformation of the West\n",
|
||||
" 6,226 words — The Golden Age\n",
|
||||
" 6,247 words — The Golden Age\n",
|
||||
" 5,697 words — The New Movements and the Rights Revolution\n",
|
||||
" 5,279 words — The Anticommunist Crusade\n",
|
||||
" 5,278 words — The Making of Radical Reconstruction\n",
|
||||
" 5,301 words — The Making of Radical Reconstruction\n",
|
||||
" 5,298 words — The Anticommunist Crusade\n",
|
||||
" 5,071 words — Varieties of Progressivism\n",
|
||||
" 4,972 words — Culture Wars\n",
|
||||
" 4,885 words — The Meaning of Freedom\n",
|
||||
@@ -1217,7 +1314,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"execution_count": 16,
|
||||
"id": "8355e686",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@@ -1228,7 +1325,7 @@
|
||||
"Before capping: 204 parents\n",
|
||||
"After capping: 229 parents\n",
|
||||
"New max parent: 3,973 words\n",
|
||||
"New avg parent: 1,915 words\n"
|
||||
"New avg parent: 1,921 words\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
@@ -1276,7 +1373,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"execution_count": 17,
|
||||
"id": "dad7c6d3",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@@ -1297,7 +1394,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 17,
|
||||
"execution_count": 18,
|
||||
"id": "0ef0c08a",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@@ -1308,11 +1405,11 @@
|
||||
"Parent lookup entries: 204\n",
|
||||
"Children with missing parent: 0\n",
|
||||
"Duplicate capped parent IDs: none ✓\n",
|
||||
"Saved 5474 children → /home/keshav/code/apush-rag/data/processed/chunks_children.json\n",
|
||||
"Saved 5530 children → /home/keshav/code/apush-rag/data/processed/chunks_children.json\n",
|
||||
"Saved 204 parent entries → /home/keshav/code/apush-rag/data/processed/parent_lookup.json\n",
|
||||
"\n",
|
||||
"Round-trip test:\n",
|
||||
" Child ID: ch02_s014_p008\n",
|
||||
" Child ID: ch02_s014_p002\n",
|
||||
" Parent ID: ch02_s014_parent\n",
|
||||
" Parent parts found: 1\n",
|
||||
" Citation: Chapter 2: European Colonies and Native Nations, 1600⠍1660 › New Englanders Divided\n"
|
||||
|
||||
Reference in New Issue
Block a user