Parsed input from textbook PDF

This commit is contained in:
Keshav Anand
2026-03-10 19:33:16 -05:00
parent 158b09416d
commit 9cc57da50e
3 changed files with 957 additions and 3 deletions

4
.gitignore vendored
View File

@@ -64,9 +64,7 @@ htmlcov/
.dmypy.json
pyrightconfig.json
# ── Jupyter ───────────────────────────────────────────────────────────────────
.ipynb_checkpoints/
*.ipynb
# ── Logs ──────────────────────────────────────────────────────────────────────
logs/

289
config/page_map.yaml Normal file
View File

@@ -0,0 +1,289 @@
chapters:
1:
title: 'Chapter 1: Old Worlds and New'
real_page: null
sections:
'An Old World: North America': null
'An Old World: West Africa': null
'An Old World: Western Europe': null
Contact: null
The Spanish Empire: null
The French and Dutch Empires: null
Chapter Review: null
2:
title: 'Chapter 2: European Colonies and Native Nations, 1600–1660'
real_page: null
sections:
England and the Americas: null
Early English Exploration and Colonization: null
The Chesapeake: null
Origins of American Slavery: null
The New England Way: null
New Englanders Divided: null
Religion, Politics, and Freedom: null
Chapter Review: null
3:
title: 'Chapter 3: Creating Anglo-America, 1660–1750'
real_page: null
sections:
Global Competition and the Expansion of England’s Empire: null
Entrenchment of American Slavery: null
Colonies in Crisis: null
The Growth of Colonial America: null
Social Classes in the British Colonies: null
North America at Mid-Century: null
Chapter Review: null
4:
title: 'Chapter 4: Slavery, Freedom, and the Struggle for Empire to 1763'
real_page: null
sections:
Slavery and Empire: null
Slave Cultures and Slave Resistance: null
An Empire of Freedom: null
The Public Sphere: null
The Great Awakening: null
Imperial Rivalries: null
Battle for the Continent: null
Chapter Review: null
5:
title: 'Chapter 5: The American Revolution, 1763–1783'
real_page: null
sections:
The Crisis Begins: null
The Road to Revolution: null
The Coming of Independence: null
Securing Independence: null
Chapter Review: null
6:
title: 'Chapter 6: The Revolution Within'
real_page: null
sections:
Democratizing Freedom: null
Toward Religious Toleration: null
Defining Economic Freedom: null
The Limits of Liberty: null
Slavery and the Revolution: null
Daughters of Liberty: null
Chapter Review: null
7:
title: 'Chapter 7: Founding a Nation, 1783–1791'
real_page: null
sections:
America Under the Confederation: null
A New Constitution: null
The Ratification Debate and the Origin of the Bill of Rights: null
'“We the People”': null
Chapter Review: null
8:
title: 'Chapter 8: Securing the Republic, 1791–1815'
real_page: null
sections:
Politics in an Age of Passion: null
The Adams Presidency: null
Jefferson in Power: null
'The “Second War of Independence”': null
Chapter Review: null
9:
title: 'Chapter 9: The Market Revolution, 1800–1840'
real_page: null
sections:
A New Economy: null
The Rise of the West: null
Market Society: null
The Free Individual: null
The Limits of Prosperity: null
Chapter Review: null
10:
title: 'Chapter 10: Democracy in America, 1815–1840'
real_page: null
sections:
The Triumph of Democracy: null
Nationalism and Its Discontents: null
Nation, Section, and Party: null
The Age of Jackson: null
Indian Removal: null
The Bank War and After: null
Chapter Review: null
11:
title: 'Chapter 11: The Peculiar Institution'
real_page: null
sections:
The Old South: null
Life Under Slavery: null
Slave Culture: null
Resistance to Slavery: null
Chapter Review: null
12:
title: 'Chapter 12: An Age of Reform, 1820–1840'
real_page: null
sections:
The Reform Impulse: null
The Crusade Against Slavery: null
Black and White Abolitionism: null
The Origins of Feminism: null
Chapter Review: null
13:
title: 'Chapter 13: A House Divided, 1840–1861'
real_page: null
sections:
Fruits of Manifest Destiny: null
A Dose of Arsenic: null
The Rise of the Republican Party: null
The Emergence of Lincoln: null
The Impending Crisis: null
Chapter Review: null
14:
title: 'Chapter 14: A New Birth of Freedom: The Civil War, 1861–1865'
real_page: null
sections:
The First Modern War: null
The Coming of Emancipation: null
The Second American Revolution: null
The Confederate Nation: null
Turning Points: null
Rehearsals for Reconstruction and the End of the War: null
Chapter Review: null
15:
title: 'Chapter 15: “What Is Freedom?”: Reconstruction'
real_page: null
sections:
The Meaning of Freedom: null
The Making of Radical Reconstruction: null
Radical Reconstruction in the South: null
The Overthrow of Reconstruction: null
Chapter Review: null
16:
title: 'Chapter 16: America’s Gilded Age, 1870–1890'
real_page: null
sections:
The Second Industrial Revolution: null
Freedom in the Gilded Age: null
Labor and the Republic: null
The Transformation of the West: null
Politics in a Gilded Age: null
Chapter Review: null
17:
title: 'Chapter 17: Freedom’s Boundaries, at Home and Abroad, 1890–1900'
real_page: null
sections:
The Populist Challenge: null
The Segregated South: null
Redrawing the Boundaries: null
Becoming a World Power: null
Chapter Review: null
18:
title: 'Chapter 18: The Progressive Era, 1900–1916'
real_page: null
sections:
An Urban Age and a Consumer Society: null
Varieties of Progressivism: null
The Politics of Progressivism: null
The Progressive Presidents: null
Chapter Review: null
19:
title: 'Chapter 19: Safe for Democracy: The United States and World War I'
real_page: null
sections:
An Era of Intervention: null
America and the Great War: null
The War at Home: null
Who Is an American?: null
'1919': null
Chapter Review: null
20:
title: 'Chapter 20: From Business Culture to Great Depression: The Twenties, 1920–1932'
real_page: null
sections:
The Business of America: null
Business and Government: null
The Birth of Civil Liberties: null
The Culture Wars: null
The Great Depression: null
Chapter Review: null
21:
title: 'Chapter 21: The New Deal, 1932–1940'
real_page: null
sections:
The First New Deal: null
The Grassroots Revolt: null
The Second New Deal: null
A Reckoning With Liberty: null
The Limits of Change: null
A New Conception of America: null
Chapter Review: null
22:
title: 'Chapter 22: Fighting for the Four Freedoms: World War II, 1941–1945'
real_page: null
sections:
Fighting World War II: null
The Home Front: null
Visions of Postwar Freedom: null
The American Dilemma: null
The End of the War: null
Chapter Review: null
23:
title: 'Chapter 23: The United States and the Cold War, 1945–1953'
real_page: null
sections:
Origins of the Cold War: null
The Cold War and the Idea of Freedom: null
The Truman Presidency: null
The Anticommunist Crusade: null
Chapter Review: null
24:
title: 'Chapter 24: An Affluent Society, 1953–1960'
real_page: null
sections:
The Golden Age: null
The Eisenhower Era: null
The Freedom Movement: null
The Election of 1960: null
Chapter Review: null
25:
title: 'Chapter 25: The Sixties, 1960–1968'
real_page: null
sections:
The Civil Rights Revolution: null
The Kennedy Years: null
Lyndon Johnson’s Presidency: null
The Changing Black Movement: null
Vietnam and the New Left: null
The New Movements and the Rights Revolution: null
'1968': null
Chapter Review: null
26:
title: 'Chapter 26: The Conservative Turn, 1969–1988'
real_page: null
sections:
President Nixon: null
Grassroots Rights Movements: null
Foreign Policy and Watergate: null
The End of the Golden Age: null
The Rising Tide of Conservatism: null
The Reagan Revolution: null
Chapter Review: null
27:
title: 'Chapter 27: A New World Order, 1989–2004'
real_page: null
sections:
The Post–Cold War World: null
Globalization and Its Discontents: null
Culture Wars: null
Impeachment and the Election of 2000: null
The Attacks of September 11: null
The War on Terrorism: null
An American Empire?: null
The Aftermath of September 11 at Home: null
Chapter Review: null
28:
title: 'Chapter 28: A Divided Nation'
real_page: null
sections:
The Winds of Change: null
The Great Recession: null
Obama in Office: null
The Obama Presidency: null
President Trump: null
'2020: Year of Crisis': null
Freedom in the Twenty-First Century: null
Chapter Review: null

667
notebooks/pdf_parse.ipynb Normal file
View File

@@ -0,0 +1,667 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "e91fd8c7",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Hello, World!\n"
]
}
],
"source": [
"print(\"Hello, World!\")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "11896305",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[1, 'Half-title Page', 2]\n",
"[1, 'Physical/Political Map of The United States', 5]\n",
"[1, 'Political Map of The World', 6]\n",
"[1, 'Title Page', 7]\n",
"[1, 'Copyright', 10]\n",
"[1, 'Dedication', 13]\n",
"[1, 'Contents', 14]\n",
"[1, 'List of Maps, Tables, and Figures', 22]\n",
"[1, 'About the Authors', 32]\n",
"[1, 'Preface', 34]\n",
"[1, 'Resources For Students And Instructors', 54]\n",
"[1, 'Chapter 1: Old Worlds and New', 59]\n",
"[1, 'An Old World: North America', 63]\n",
"[1, 'An Old World: West Africa', 73]\n",
"[1, 'An Old World: Western Europe', 75]\n",
"[1, 'Contact', 80]\n",
"[1, 'The Spanish Empire', 88]\n",
"[1, 'The French and Dutch Empires', 108]\n",
"[1, 'Chapter Review', 120]\n",
"[1, 'Chapter 2: European Colonies and Native Nations, 1600⠍1660', 124]\n",
"[1, 'England and the Americas', 129]\n",
"[1, 'Early English Exploration and Colonization', 138]\n",
"[1, 'The Chesapeake', 142]\n",
"[1, 'Origins of American Slavery', 150]\n",
"[1, 'The New England Way', 157]\n",
"[1, 'New Englanders Divided', 169]\n",
"[1, 'Religion, Politics, and Freedom', 180]\n",
"[1, 'Chapter Review', 188]\n",
"[1, 'Chapter 3: Creating Anglo-America, 1660⠍1750', 193]\n",
"[1, 'Global Competition and the Expansion of England⠒s Empire', 197]\n",
"[1, 'Entrenchment of American Slavery', 206]\n",
"[1, 'Colonies in Crisis', 216]\n",
"[1, 'The Growth of Colonial America', 223]\n",
"[1, 'Social Classes in the British Colonies', 238]\n",
"[1, 'North America at Mid-Century', 246]\n",
"[1, 'Chapter Review', 249]\n",
"[1, 'Chapter 4: Slavery, Freedom, and the Struggle for Empire to 1763', 253]\n",
"[1, 'Slavery and Empire', 257]\n",
"[1, 'Slave Cultures and Slave Resistance', 274]\n",
"[1, 'An Empire of Freedom', 280]\n",
"[1, 'The Public Sphere', 285]\n",
"[1, 'The Great Awakening', 294]\n",
"[1, 'Imperial Rivalries', 298]\n",
"[1, 'Battle for the Continent', 306]\n",
"[1, 'Chapter Review', 320]\n",
"[1, 'Chapter 5: The American Revolution, 1763⠍1783', 325]\n",
"[1, 'The Crisis Begins', 329]\n",
"[1, 'The Road to Revolution', 339]\n",
"[1, 'The Coming of Independence', 345]\n",
"[1, 'Securing Independence', 359]\n",
"[1, 'Chapter Review', 375]\n",
"[1, 'Chapter 6: The Revolution Within', 381]\n",
"[1, 'Democratizing Freedom', 384]\n",
"[1, 'Toward Religious Toleration', 392]\n",
"[1, 'Defining Economic Freedom', 399]\n",
"[1, 'The Limits of Liberty', 404]\n",
"[1, 'Slavery and the Revolution', 410]\n",
"[1, 'Daughters of Liberty', 422]\n",
"[1, 'Chapter Review', 432]\n",
"[1, 'Chapter 7: Founding a Nation, 1783⠍1791', 435]\n",
"[1, 'America Under the Confederation', 439]\n",
"[1, 'A New Constitution', 450]\n",
"[1, 'The Ratification Debate and the Origin of the Bill of Rights', 460]\n",
"[1, '“We the Peopleâ€\\x9d', 472]\n",
"[1, 'Chapter Review', 486]\n",
"[1, 'Chapter 8: Securing the Republic, 1791⠍1815', 491]\n",
"[1, 'Politics in an Age of Passion', 494]\n",
"[1, 'The Adams Presidency', 508]\n",
"[1, 'Jefferson in Power', 522]\n",
"[1, 'The “Second War of Independenceâ€\\x9d', 531]\n",
"[1, 'Chapter Review', 542]\n",
"[1, 'Chapter 9: The Market Revolution, 1800⠍1840', 548]\n",
"[1, 'A New Economy', 552]\n",
"[1, 'The Rise of the West', 558]\n",
"[1, 'Market Society', 566]\n",
"[1, 'The Free Individual', 582]\n",
"[1, 'The Limits of Prosperity', 591]\n",
"[1, 'Chapter Review', 601]\n",
"[1, 'Chapter 10: Democracy in America, 1815⠍1840', 606]\n",
"[1, 'The Triumph of Democracy', 610]\n",
"[1, 'Nationalism and Its Discontents', 623]\n",
"[1, 'Nation, Section, and Party', 630]\n",
"[1, 'The Age of Jackson', 639]\n",
"[1, 'Indian Removal', 647]\n",
"[1, 'The Bank War and After', 657]\n",
"[1, 'Chapter Review', 664]\n",
"[1, 'Chapter 11: The Peculiar Institution', 669]\n",
"[1, 'The Old South', 672]\n",
"[1, 'Life Under Slavery', 690]\n",
"[1, 'Slave Culture', 704]\n",
"[1, 'Resistance to Slavery', 712]\n",
"[1, 'Chapter Review', 722]\n",
"[1, 'Chapter 12: An Age of Reform, 1820⠍1840', 725]\n",
"[1, 'The Reform Impulse', 728]\n",
"[1, 'The Crusade Against Slavery', 740]\n",
"[1, 'Black and White Abolitionism', 755]\n",
"[1, 'The Origins of Feminism', 761]\n",
"[1, 'Chapter Review', 775]\n",
"[1, 'Chapter 13: A House Divided, 1840⠍1861', 780]\n",
"[1, 'Fruits of Manifest Destiny', 783]\n",
"[1, 'A Dose of Arsenic', 803]\n",
"[1, 'The Rise of the Republican Party', 814]\n",
"[1, 'The Emergence of Lincoln', 821]\n",
"[1, 'The Impending Crisis', 837]\n",
"[1, 'Chapter Review', 844]\n",
"[1, 'Chapter 14: A New Birth of Freedom: The Civil War, 1861⠍1865', 849]\n",
"[1, 'The First Modern War', 853]\n",
"[1, 'The Coming of Emancipation', 864]\n",
"[1, 'The Second American Revolution', 876]\n",
"[1, 'The Confederate Nation', 891]\n",
"[1, 'Turning Points', 900]\n",
"[1, 'Rehearsals for Reconstruction and the End of the War', 904]\n",
"[1, 'Chapter Review', 912]\n",
"[1, 'Chapter 15: “What Is Freedom?â€\\x9d: Reconstruction', 917]\n",
"[1, 'The Meaning of Freedom', 921]\n",
"[1, 'The Making of Radical Reconstruction', 938]\n",
"[1, 'Radical Reconstruction in the South', 956]\n",
"[1, 'The Overthrow of Reconstruction', 963]\n",
"[1, 'Chapter Review', 972]\n",
"[1, 'Chapter 16: America⠒s Gilded Age, 1870⠍1890', 976]\n",
"[1, 'The Second Industrial Revolution', 980]\n",
"[1, 'Freedom in the Gilded Age', 992]\n",
"[1, 'Labor and the Republic', 999]\n",
"[1, 'The Transformation of the West', 1009]\n",
"[1, 'Politics in a Gilded Age', 1032]\n",
"[1, 'Chapter Review', 1039]\n",
"[1, 'Chapter 17: Freedom⠒s Boundaries, at Home and Abroad, 1890⠍1900', 1044]\n",
"[1, 'The Populist Challenge', 1048]\n",
"[1, 'The Segregated South', 1059]\n",
"[1, 'Redrawing the Boundaries', 1075]\n",
"[1, 'Becoming a World Power', 1082]\n",
"[1, 'Chapter Review', 1101]\n",
"[1, 'Chapter 18: The Progressive Era, 1900⠍1916', 1106]\n",
"[1, 'An Urban Age and a Consumer Society', 1111]\n",
"[1, 'Varieties of Progressivism', 1128]\n",
"[1, 'The Politics of Progressivism', 1144]\n",
"[1, 'The Progressive Presidents', 1158]\n",
"[1, 'Chapter Review', 1170]\n",
"[1, 'Chapter 19: Safe for Democracy: The United States and World War I', 1176]\n",
"[1, 'An Era of Intervention', 1181]\n",
"[1, 'America and the Great War', 1189]\n",
"[1, 'The War at Home', 1195]\n",
"[1, 'Who Is an American?', 1210]\n",
"[1, '1919', 1227]\n",
"[1, 'Chapter Review', 1239]\n",
"[1, 'Chapter 20: From Business Culture to Great Depression: The Twenties, 1920⠍1932', 1244]\n",
"[1, 'The Business of America', 1248]\n",
"[1, 'Business and Government', 1258]\n",
"[1, 'The Birth of Civil Liberties', 1267]\n",
"[1, 'The Culture Wars', 1273]\n",
"[1, 'The Great Depression', 1290]\n",
"[1, 'Chapter Review', 1298]\n",
"[1, 'Chapter 21: The New Deal, 1932⠍1940', 1303]\n",
"[1, 'The First New Deal', 1308]\n",
"[1, 'The Grassroots Revolt', 1321]\n",
"[1, 'The Second New Deal', 1328]\n",
"[1, 'A Reckoning With Liberty', 1333]\n",
"[1, 'The Limits of Change', 1343]\n",
"[1, 'A New Conception of America', 1353]\n",
"[1, 'Chapter Review', 1362]\n",
"[1, 'Chapter 22: Fighting for the Four Freedoms: World War II, 1941⠍1945', 1368]\n",
"[1, 'Fighting World War II', 1374]\n",
"[1, 'The Home Front', 1386]\n",
"[1, 'Visions of Postwar Freedom', 1398]\n",
"[1, 'The American Dilemma', 1403]\n",
"[1, 'The End of the War', 1424]\n",
"[1, 'Chapter Review', 1432]\n",
"[1, 'Chapter 23: The United States and the Cold War, 1945⠍1953', 1437]\n",
"[1, 'Origins of the Cold War', 1442]\n",
"[1, 'The Cold War and the Idea of Freedom', 1456]\n",
"[1, 'The Truman Presidency', 1463]\n",
"[1, 'The Anticommunist Crusade', 1471]\n",
"[1, 'Chapter Review', 1488]\n",
"[1, 'Chapter 24: An Affluent Society, 1953⠍1960', 1493]\n",
"[1, 'The Golden Age', 1497]\n",
"[1, 'The Eisenhower Era', 1519]\n",
"[1, 'The Freedom Movement', 1533]\n",
"[1, 'The Election of 1960', 1548]\n",
"[1, 'Chapter Review', 1552]\n",
"[1, 'Chapter 25: The Sixties, 1960⠍1968', 1557]\n",
"[1, 'The Civil Rights Revolution', 1561]\n",
"[1, 'The Kennedy Years', 1566]\n",
"[1, 'Lyndon Johnson⠒s Presidency', 1571]\n",
"[1, 'The Changing Black Movement', 1581]\n",
"[1, 'Vietnam and the New Left', 1586]\n",
"[1, 'The New Movements and the Rights Revolution', 1596]\n",
"[1, '1968', 1617]\n",
"[1, 'Chapter Review', 1622]\n",
"[1, 'Chapter 26: The Conservative Turn, 1969⠍1988', 1628]\n",
"[1, 'President Nixon', 1631]\n",
"[1, 'Grassroots Rights Movements', 1638]\n",
"[1, 'Foreign Policy and Watergate', 1643]\n",
"[1, 'The End of the Golden Age', 1654]\n",
"[1, 'The Rising Tide of Conservatism', 1667]\n",
"[1, 'The Reagan Revolution', 1677]\n",
"[1, 'Chapter Review', 1691]\n",
"[1, 'Chapter 27: A New World Order, 1989⠍2004', 1696]\n",
"[1, 'The Post⠍Cold War World', 1700]\n",
"[1, 'Globalization and Its Discontents', 1709]\n",
"[1, 'Culture Wars', 1720]\n",
"[1, 'Impeachment and the Election of 2000', 1743]\n",
"[1, 'The Attacks of September 11', 1747]\n",
"[1, 'The War on Terrorism', 1750]\n",
"[1, 'An American Empire?', 1754]\n",
"[1, 'The Aftermath of September 11 at Home', 1759]\n",
"[1, 'Chapter Review', 1764]\n",
"[1, 'Chapter 28: A Divided Nation', 1769]\n",
"[1, 'The Winds of Change', 1772]\n",
"[1, 'The Great Recession', 1780]\n",
"[1, 'Obama in Office', 1789]\n",
"[1, 'The Obama Presidency', 1798]\n",
"[1, 'President Trump', 1807]\n",
"[1, '2020: Year of Crisis', 1820]\n",
"[1, 'Freedom in the Twenty-First Century', 1831]\n",
"[1, 'Chapter Review', 1841]\n",
"[1, 'Suggested Reading', 1845]\n",
"[1, 'The Declaration of Independence (1776)', 1909]\n",
"[1, 'The Constitution of The United States (1787)', 1917]\n",
"[1, 'Glossary', 1943]\n",
"[1, 'Credits', 2008]\n",
"[1, 'Index', 2016]\n"
]
}
],
"source": [
"import fitz\n",
"doc = fitz.open('../data/raw/textbook.pdf')\n",
"toc = doc.get_toc()\n",
"for item in toc:\n",
" print(item) # [level, title, pdf_page]"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "991dbad2",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Sections to process: 204\n",
"[1, 'Chapter 1: Old Worlds and New', 59]\n",
"[1, 'An Old World: North America', 63]\n",
"[1, 'An Old World: West Africa', 73]\n",
"[1, 'An Old World: Western Europe', 75]\n",
"[1, 'Contact', 80]\n",
"[1, 'The Spanish Empire', 88]\n",
"[1, 'The French and Dutch Empires', 108]\n",
"[1, 'Chapter Review', 120]\n",
"[1, 'Chapter 2: European Colonies and Native Nations, 1600⠍1660', 124]\n",
"[1, 'England and the Americas', 129]\n",
"[1, 'Early English Exploration and Colonization', 138]\n",
"[1, 'The Chesapeake', 142]\n",
"[1, 'Origins of American Slavery', 150]\n",
"[1, 'The New England Way', 157]\n",
"[1, 'New Englanders Divided', 169]\n",
"[1, 'Religion, Politics, and Freedom', 180]\n",
"[1, 'Chapter Review', 188]\n",
"[1, 'Chapter 3: Creating Anglo-America, 1660⠍1750', 193]\n",
"[1, 'Global Competition and the Expansion of England⠒s Empire', 197]\n",
"[1, 'Entrenchment of American Slavery', 206]\n",
"[1, 'Colonies in Crisis', 216]\n",
"[1, 'The Growth of Colonial America', 223]\n",
"[1, 'Social Classes in the British Colonies', 238]\n",
"[1, 'North America at Mid-Century', 246]\n",
"[1, 'Chapter Review', 249]\n",
"[1, 'Chapter 4: Slavery, Freedom, and the Struggle for Empire to 1763', 253]\n",
"[1, 'Slavery and Empire', 257]\n",
"[1, 'Slave Cultures and Slave Resistance', 274]\n",
"[1, 'An Empire of Freedom', 280]\n",
"[1, 'The Public Sphere', 285]\n",
"[1, 'The Great Awakening', 294]\n",
"[1, 'Imperial Rivalries', 298]\n",
"[1, 'Battle for the Continent', 306]\n",
"[1, 'Chapter Review', 320]\n",
"[1, 'Chapter 5: The American Revolution, 1763⠍1783', 325]\n",
"[1, 'The Crisis Begins', 329]\n",
"[1, 'The Road to Revolution', 339]\n",
"[1, 'The Coming of Independence', 345]\n",
"[1, 'Securing Independence', 359]\n",
"[1, 'Chapter Review', 375]\n",
"[1, 'Chapter 6: The Revolution Within', 381]\n",
"[1, 'Democratizing Freedom', 384]\n",
"[1, 'Toward Religious Toleration', 392]\n",
"[1, 'Defining Economic Freedom', 399]\n",
"[1, 'The Limits of Liberty', 404]\n",
"[1, 'Slavery and the Revolution', 410]\n",
"[1, 'Daughters of Liberty', 422]\n",
"[1, 'Chapter Review', 432]\n",
"[1, 'Chapter 7: Founding a Nation, 1783⠍1791', 435]\n",
"[1, 'America Under the Confederation', 439]\n",
"[1, 'A New Constitution', 450]\n",
"[1, 'The Ratification Debate and the Origin of the Bill of Rights', 460]\n",
"[1, '“We the Peopleâ€\\x9d', 472]\n",
"[1, 'Chapter Review', 486]\n",
"[1, 'Chapter 8: Securing the Republic, 1791⠍1815', 491]\n",
"[1, 'Politics in an Age of Passion', 494]\n",
"[1, 'The Adams Presidency', 508]\n",
"[1, 'Jefferson in Power', 522]\n",
"[1, 'The “Second War of Independenceâ€\\x9d', 531]\n",
"[1, 'Chapter Review', 542]\n",
"[1, 'Chapter 9: The Market Revolution, 1800⠍1840', 548]\n",
"[1, 'A New Economy', 552]\n",
"[1, 'The Rise of the West', 558]\n",
"[1, 'Market Society', 566]\n",
"[1, 'The Free Individual', 582]\n",
"[1, 'The Limits of Prosperity', 591]\n",
"[1, 'Chapter Review', 601]\n",
"[1, 'Chapter 10: Democracy in America, 1815⠍1840', 606]\n",
"[1, 'The Triumph of Democracy', 610]\n",
"[1, 'Nationalism and Its Discontents', 623]\n",
"[1, 'Nation, Section, and Party', 630]\n",
"[1, 'The Age of Jackson', 639]\n",
"[1, 'Indian Removal', 647]\n",
"[1, 'The Bank War and After', 657]\n",
"[1, 'Chapter Review', 664]\n",
"[1, 'Chapter 11: The Peculiar Institution', 669]\n",
"[1, 'The Old South', 672]\n",
"[1, 'Life Under Slavery', 690]\n",
"[1, 'Slave Culture', 704]\n",
"[1, 'Resistance to Slavery', 712]\n",
"[1, 'Chapter Review', 722]\n",
"[1, 'Chapter 12: An Age of Reform, 1820⠍1840', 725]\n",
"[1, 'The Reform Impulse', 728]\n",
"[1, 'The Crusade Against Slavery', 740]\n",
"[1, 'Black and White Abolitionism', 755]\n",
"[1, 'The Origins of Feminism', 761]\n",
"[1, 'Chapter Review', 775]\n",
"[1, 'Chapter 13: A House Divided, 1840⠍1861', 780]\n",
"[1, 'Fruits of Manifest Destiny', 783]\n",
"[1, 'A Dose of Arsenic', 803]\n",
"[1, 'The Rise of the Republican Party', 814]\n",
"[1, 'The Emergence of Lincoln', 821]\n",
"[1, 'The Impending Crisis', 837]\n",
"[1, 'Chapter Review', 844]\n",
"[1, 'Chapter 14: A New Birth of Freedom: The Civil War, 1861⠍1865', 849]\n",
"[1, 'The First Modern War', 853]\n",
"[1, 'The Coming of Emancipation', 864]\n",
"[1, 'The Second American Revolution', 876]\n",
"[1, 'The Confederate Nation', 891]\n",
"[1, 'Turning Points', 900]\n",
"[1, 'Rehearsals for Reconstruction and the End of the War', 904]\n",
"[1, 'Chapter Review', 912]\n",
"[1, 'Chapter 15: “What Is Freedom?â€\\x9d: Reconstruction', 917]\n",
"[1, 'The Meaning of Freedom', 921]\n",
"[1, 'The Making of Radical Reconstruction', 938]\n",
"[1, 'Radical Reconstruction in the South', 956]\n",
"[1, 'The Overthrow of Reconstruction', 963]\n",
"[1, 'Chapter Review', 972]\n",
"[1, 'Chapter 16: America⠒s Gilded Age, 1870⠍1890', 976]\n",
"[1, 'The Second Industrial Revolution', 980]\n",
"[1, 'Freedom in the Gilded Age', 992]\n",
"[1, 'Labor and the Republic', 999]\n",
"[1, 'The Transformation of the West', 1009]\n",
"[1, 'Politics in a Gilded Age', 1032]\n",
"[1, 'Chapter Review', 1039]\n",
"[1, 'Chapter 17: Freedom⠒s Boundaries, at Home and Abroad, 1890⠍1900', 1044]\n",
"[1, 'The Populist Challenge', 1048]\n",
"[1, 'The Segregated South', 1059]\n",
"[1, 'Redrawing the Boundaries', 1075]\n",
"[1, 'Becoming a World Power', 1082]\n",
"[1, 'Chapter Review', 1101]\n",
"[1, 'Chapter 18: The Progressive Era, 1900⠍1916', 1106]\n",
"[1, 'An Urban Age and a Consumer Society', 1111]\n",
"[1, 'Varieties of Progressivism', 1128]\n",
"[1, 'The Politics of Progressivism', 1144]\n",
"[1, 'The Progressive Presidents', 1158]\n",
"[1, 'Chapter Review', 1170]\n",
"[1, 'Chapter 19: Safe for Democracy: The United States and World War I', 1176]\n",
"[1, 'An Era of Intervention', 1181]\n",
"[1, 'America and the Great War', 1189]\n",
"[1, 'The War at Home', 1195]\n",
"[1, 'Who Is an American?', 1210]\n",
"[1, '1919', 1227]\n",
"[1, 'Chapter Review', 1239]\n",
"[1, 'Chapter 20: From Business Culture to Great Depression: The Twenties, 1920⠍1932', 1244]\n",
"[1, 'The Business of America', 1248]\n",
"[1, 'Business and Government', 1258]\n",
"[1, 'The Birth of Civil Liberties', 1267]\n",
"[1, 'The Culture Wars', 1273]\n",
"[1, 'The Great Depression', 1290]\n",
"[1, 'Chapter Review', 1298]\n",
"[1, 'Chapter 21: The New Deal, 1932⠍1940', 1303]\n",
"[1, 'The First New Deal', 1308]\n",
"[1, 'The Grassroots Revolt', 1321]\n",
"[1, 'The Second New Deal', 1328]\n",
"[1, 'A Reckoning With Liberty', 1333]\n",
"[1, 'The Limits of Change', 1343]\n",
"[1, 'A New Conception of America', 1353]\n",
"[1, 'Chapter Review', 1362]\n",
"[1, 'Chapter 22: Fighting for the Four Freedoms: World War II, 1941⠍1945', 1368]\n",
"[1, 'Fighting World War II', 1374]\n",
"[1, 'The Home Front', 1386]\n",
"[1, 'Visions of Postwar Freedom', 1398]\n",
"[1, 'The American Dilemma', 1403]\n",
"[1, 'The End of the War', 1424]\n",
"[1, 'Chapter Review', 1432]\n",
"[1, 'Chapter 23: The United States and the Cold War, 1945⠍1953', 1437]\n",
"[1, 'Origins of the Cold War', 1442]\n",
"[1, 'The Cold War and the Idea of Freedom', 1456]\n",
"[1, 'The Truman Presidency', 1463]\n",
"[1, 'The Anticommunist Crusade', 1471]\n",
"[1, 'Chapter Review', 1488]\n",
"[1, 'Chapter 24: An Affluent Society, 1953⠍1960', 1493]\n",
"[1, 'The Golden Age', 1497]\n",
"[1, 'The Eisenhower Era', 1519]\n",
"[1, 'The Freedom Movement', 1533]\n",
"[1, 'The Election of 1960', 1548]\n",
"[1, 'Chapter Review', 1552]\n",
"[1, 'Chapter 25: The Sixties, 1960⠍1968', 1557]\n",
"[1, 'The Civil Rights Revolution', 1561]\n",
"[1, 'The Kennedy Years', 1566]\n",
"[1, 'Lyndon Johnson⠒s Presidency', 1571]\n",
"[1, 'The Changing Black Movement', 1581]\n",
"[1, 'Vietnam and the New Left', 1586]\n",
"[1, 'The New Movements and the Rights Revolution', 1596]\n",
"[1, '1968', 1617]\n",
"[1, 'Chapter Review', 1622]\n",
"[1, 'Chapter 26: The Conservative Turn, 1969⠍1988', 1628]\n",
"[1, 'President Nixon', 1631]\n",
"[1, 'Grassroots Rights Movements', 1638]\n",
"[1, 'Foreign Policy and Watergate', 1643]\n",
"[1, 'The End of the Golden Age', 1654]\n",
"[1, 'The Rising Tide of Conservatism', 1667]\n",
"[1, 'The Reagan Revolution', 1677]\n",
"[1, 'Chapter Review', 1691]\n",
"[1, 'Chapter 27: A New World Order, 1989⠍2004', 1696]\n",
"[1, 'The Post⠍Cold War World', 1700]\n",
"[1, 'Globalization and Its Discontents', 1709]\n",
"[1, 'Culture Wars', 1720]\n",
"[1, 'Impeachment and the Election of 2000', 1743]\n",
"[1, 'The Attacks of September 11', 1747]\n",
"[1, 'The War on Terrorism', 1750]\n",
"[1, 'An American Empire?', 1754]\n",
"[1, 'The Aftermath of September 11 at Home', 1759]\n",
"[1, 'Chapter Review', 1764]\n",
"[1, 'Chapter 28: A Divided Nation', 1769]\n",
"[1, 'The Winds of Change', 1772]\n",
"[1, 'The Great Recession', 1780]\n",
"[1, 'Obama in Office', 1789]\n",
"[1, 'The Obama Presidency', 1798]\n",
"[1, 'President Trump', 1807]\n",
"[1, '2020: Year of Crisis', 1820]\n",
"[1, 'Freedom in the Twenty-First Century', 1831]\n",
"[1, 'Chapter Review', 1841]\n"
]
}
],
"source": [
"# Find where Chapter 1 starts and throw away everything before it\n",
"start_index = next(i for i, item in enumerate(toc) if 'Chapter 1' in item[1])\n",
"chapters_toc = toc[start_index:]\n",
"\n",
"# Also throw away back matter (Suggested Reading, Glossary, Index etc.)\n",
"end_titles = {'Suggested Reading', 'The Declaration of Independence (1776)', \n",
" 'The Constitution of The United States (1787)', 'Glossary', \n",
" 'Credits', 'Index'}\n",
"chapters_toc = [item for item in chapters_toc if item[1] not in end_titles]\n",
"\n",
"print(f\"Sections to process: {len(chapters_toc)}\")\n",
"for item in chapters_toc:\n",
" print(item)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "43e20197",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'title': 'Chapter 1: Old Worlds and New', 'chapter_num': 1, 'chapter_title': 'Chapter 1: Old Worlds and New', 'is_chapter_header': True, 'start_pdf': 58, 'end_pdf': 61}\n",
"{'title': 'An Old World: North America', 'chapter_num': 1, 'chapter_title': 'Chapter 1: Old Worlds and New', 'is_chapter_header': False, 'start_pdf': 62, 'end_pdf': 71}\n",
"{'title': 'An Old World: West Africa', 'chapter_num': 1, 'chapter_title': 'Chapter 1: Old Worlds and New', 'is_chapter_header': False, 'start_pdf': 72, 'end_pdf': 73}\n",
"{'title': 'An Old World: Western Europe', 'chapter_num': 1, 'chapter_title': 'Chapter 1: Old Worlds and New', 'is_chapter_header': False, 'start_pdf': 74, 'end_pdf': 78}\n",
"{'title': 'Contact', 'chapter_num': 1, 'chapter_title': 'Chapter 1: Old Worlds and New', 'is_chapter_header': False, 'start_pdf': 79, 'end_pdf': 86}\n",
"{'title': 'The Spanish Empire', 'chapter_num': 1, 'chapter_title': 'Chapter 1: Old Worlds and New', 'is_chapter_header': False, 'start_pdf': 87, 'end_pdf': 106}\n",
"{'title': 'The French and Dutch Empires', 'chapter_num': 1, 'chapter_title': 'Chapter 1: Old Worlds and New', 'is_chapter_header': False, 'start_pdf': 107, 'end_pdf': 118}\n",
"{'title': 'Chapter Review', 'chapter_num': 1, 'chapter_title': 'Chapter 1: Old Worlds and New', 'is_chapter_header': False, 'start_pdf': 119, 'end_pdf': 122}\n",
"{'title': 'Chapter 2: European Colonies and Native Nations, 1600⠍1660', 'chapter_num': 2, 'chapter_title': 'Chapter 2: European Colonies and Native Nations, 1600⠍1660', 'is_chapter_header': True, 'start_pdf': 123, 'end_pdf': 127}\n",
"{'title': 'England and the Americas', 'chapter_num': 2, 'chapter_title': 'Chapter 2: European Colonies and Native Nations, 1600⠍1660', 'is_chapter_header': False, 'start_pdf': 128, 'end_pdf': 136}\n",
"{'title': 'Early English Exploration and Colonization', 'chapter_num': 2, 'chapter_title': 'Chapter 2: European Colonies and Native Nations, 1600⠍1660', 'is_chapter_header': False, 'start_pdf': 137, 'end_pdf': 140}\n",
"{'title': 'The Chesapeake', 'chapter_num': 2, 'chapter_title': 'Chapter 2: European Colonies and Native Nations, 1600⠍1660', 'is_chapter_header': False, 'start_pdf': 141, 'end_pdf': 148}\n"
]
}
],
"source": [
"import re\n",
"\n",
"def parse_chapter_num(title):\n",
" match = re.match(r'Chapter (\\d+):', title)\n",
" return int(match.group(1)) if match else None\n",
"\n",
"structured = []\n",
"current_chapter_num = None\n",
"current_chapter_title = None\n",
"\n",
"for i, item in enumerate(chapters_toc):\n",
" title = item[1]\n",
" start_pdf = item[2] - 1 # 0-indexed\n",
" end_pdf = (chapters_toc[i + 1][2] - 2) if i + 1 < len(chapters_toc) else doc.page_count - 1\n",
"\n",
" chapter_num = parse_chapter_num(title)\n",
"\n",
" if chapter_num:\n",
" # This entry IS a chapter\n",
" current_chapter_num = chapter_num\n",
" current_chapter_title = title\n",
" is_chapter_header = True\n",
" else:\n",
" is_chapter_header = False\n",
"\n",
" structured.append({\n",
" \"title\": title,\n",
" \"chapter_num\": current_chapter_num,\n",
" \"chapter_title\": current_chapter_title,\n",
" \"is_chapter_header\": is_chapter_header,\n",
" \"start_pdf\": start_pdf,\n",
" \"end_pdf\": end_pdf,\n",
" })\n",
"\n",
"# Sanity check\n",
"for s in structured[:12]:\n",
" print(s)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "149bc714",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[1, 'Half-title Page', 2, {'kind': 1, 'xref': 63504, 'page': 1, 'to': Point(76.47846, 86.92822), 'zoom': 0.0}]\n",
"[1, 'Physical/Political Map of The United States', 5, {'kind': 1, 'xref': 63507, 'page': 4, 'to': Point(76.18479, 90.134159), 'zoom': 0.0}]\n",
"[1, 'Political Map of The World', 6, {'kind': 1, 'xref': 63509, 'page': 5, 'to': Point(76.18479, 90.134159), 'zoom': 0.0}]\n",
"[1, 'Title Page', 7, {'kind': 1, 'xref': 63511, 'page': 6, 'to': Point(76.47846, 86.92822), 'zoom': 0.0}]\n",
"[1, 'Copyright', 10, {'kind': 1, 'xref': 63513, 'page': 9, 'to': Point(76.5, 87.0), 'zoom': 0.0}]\n",
"[1, 'Dedication', 13, {'kind': 1, 'xref': 63515, 'page': 12, 'to': Point(76.5, 87.0), 'zoom': 0.0}]\n",
"[1, 'Contents', 14, {'kind': 1, 'xref': 63517, 'page': 13, 'to': Point(76.5, 91.5), 'zoom': 0.0}]\n",
"[1, 'List of Maps, Tables, and Figures', 22, {'kind': 1, 'xref': 63519, 'page': 21, 'to': Point(76.5, 91.5), 'zoom': 0.0}]\n",
"[1, 'About the Authors', 32, {'kind': 1, 'xref': 63521, 'page': 31, 'to': Point(76.47846, 91.40668), 'zoom': 0.0}]\n",
"[1, 'Preface', 34, {'kind': 1, 'xref': 63523, 'page': 33, 'to': Point(76.5, 91.5), 'zoom': 0.0}]\n",
"[1, 'Resources For Students And Instructors', 54, {'kind': 1, 'xref': 63525, 'page': 53, 'to': Point(76.18479, 90.134159), 'zoom': 0.0}]\n",
"[1, 'Chapter 1: Old Worlds and New', 59, {'kind': 1, 'xref': 63527, 'page': 58, 'to': Point(76.5, 91.5), 'zoom': 0.0}]\n",
"[1, 'An Old World: North America', 63, {'kind': 1, 'xref': 63529, 'page': 62, 'to': Point(76.18479, 90.134159), 'zoom': 0.0}]\n",
"[1, 'An Old World: West Africa', 73, {'kind': 1, 'xref': 63531, 'page': 72, 'to': Point(76.5, 91.5), 'zoom': 0.0}]\n",
"[1, 'An Old World: Western Europe', 75, {'kind': 1, 'xref': 63533, 'page': 74, 'to': Point(76.5, 91.5), 'zoom': 0.0}]\n"
]
}
],
"source": [
"toc_full = doc.get_toc(simple=False)\n",
"\n",
"for item in toc_full[:15]:\n",
" print(item)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "c2563864",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Generated → /home/keshav/code/apush-rag/config/page_map.yaml\n",
"Now open that file and fill in the real page numbers. Leave as null if unknown.\n"
]
}
],
"source": [
"import yaml\n",
"from pathlib import Path\n",
"\n",
"project_root = Path().resolve().parent\n",
"output_path = project_root / \"config\" / \"page_map.yaml\"\n",
"\n",
"page_map = {\"chapters\": {}}\n",
"\n",
"for section in structured:\n",
" ch_num = section[\"chapter_num\"]\n",
" ch_title = section[\"chapter_title\"]\n",
" title = section[\"title\"]\n",
"\n",
" # Initialize chapter entry if first time seeing it\n",
" if ch_num not in page_map[\"chapters\"]:\n",
" page_map[\"chapters\"][ch_num] = {\n",
" \"title\": ch_title,\n",
" \"real_page\": None, # ← you fill this in\n",
" \"sections\": {}\n",
" }\n",
"\n",
" # Add section with null page — you fill these in\n",
" if not section[\"is_chapter_header\"]:\n",
" page_map[\"chapters\"][ch_num][\"sections\"][title] = None\n",
"\n",
"with open(output_path, \"w\") as f:\n",
" yaml.dump(page_map, f, allow_unicode=True, sort_keys=False, default_flow_style=False)\n",
"\n",
"print(f\"Generated → {output_path}\")\n",
"print(\"Now open that file and fill in the real page numbers. Leave as null if unknown.\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.14.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}