parsed input from textbook pdf
This commit is contained in:
4
.gitignore
vendored
4
.gitignore
vendored
@@ -64,9 +64,7 @@ htmlcov/
|
||||
.dmypy.json
|
||||
pyrightconfig.json
|
||||
|
||||
# ── Jupyter ───────────────────────────────────────────────────────────────────
|
||||
.ipynb_checkpoints/
|
||||
*.ipynb
|
||||
|
||||
|
||||
# ── Logs ──────────────────────────────────────────────────────────────────────
|
||||
logs/
|
||||
|
||||
289
config/page_map.yaml
Normal file
289
config/page_map.yaml
Normal file
@@ -0,0 +1,289 @@
|
||||
chapters:
|
||||
1:
|
||||
title: 'Chapter 1: Old Worlds and New'
|
||||
real_page: null
|
||||
sections:
|
||||
'An Old World: North America': null
|
||||
'An Old World: West Africa': null
|
||||
'An Old World: Western Europe': null
|
||||
Contact: null
|
||||
The Spanish Empire: null
|
||||
The French and Dutch Empires: null
|
||||
Chapter Review: null
|
||||
2:
|
||||
title: 'Chapter 2: European Colonies and Native Nations, 1600–1660'
|
||||
real_page: null
|
||||
sections:
|
||||
England and the Americas: null
|
||||
Early English Exploration and Colonization: null
|
||||
The Chesapeake: null
|
||||
Origins of American Slavery: null
|
||||
The New England Way: null
|
||||
New Englanders Divided: null
|
||||
Religion, Politics, and Freedom: null
|
||||
Chapter Review: null
|
||||
3:
|
||||
title: 'Chapter 3: Creating Anglo-America, 1660–1750'
|
||||
real_page: null
|
||||
sections:
|
||||
Global Competition and the Expansion of England’s Empire: null
|
||||
Entrenchment of American Slavery: null
|
||||
Colonies in Crisis: null
|
||||
The Growth of Colonial America: null
|
||||
Social Classes in the British Colonies: null
|
||||
North America at Mid-Century: null
|
||||
Chapter Review: null
|
||||
4:
|
||||
title: 'Chapter 4: Slavery, Freedom, and the Struggle for Empire to 1763'
|
||||
real_page: null
|
||||
sections:
|
||||
Slavery and Empire: null
|
||||
Slave Cultures and Slave Resistance: null
|
||||
An Empire of Freedom: null
|
||||
The Public Sphere: null
|
||||
The Great Awakening: null
|
||||
Imperial Rivalries: null
|
||||
Battle for the Continent: null
|
||||
Chapter Review: null
|
||||
5:
|
||||
title: 'Chapter 5: The American Revolution, 1763–1783'
|
||||
real_page: null
|
||||
sections:
|
||||
The Crisis Begins: null
|
||||
The Road to Revolution: null
|
||||
The Coming of Independence: null
|
||||
Securing Independence: null
|
||||
Chapter Review: null
|
||||
6:
|
||||
title: 'Chapter 6: The Revolution Within'
|
||||
real_page: null
|
||||
sections:
|
||||
Democratizing Freedom: null
|
||||
Toward Religious Toleration: null
|
||||
Defining Economic Freedom: null
|
||||
The Limits of Liberty: null
|
||||
Slavery and the Revolution: null
|
||||
Daughters of Liberty: null
|
||||
Chapter Review: null
|
||||
7:
|
||||
title: 'Chapter 7: Founding a Nation, 1783–1791'
|
||||
real_page: null
|
||||
sections:
|
||||
America Under the Confederation: null
|
||||
A New Constitution: null
|
||||
The Ratification Debate and the Origin of the Bill of Rights: null
|
||||
"“We the Peopleâ€\x9D": null
|
||||
Chapter Review: null
|
||||
8:
|
||||
title: 'Chapter 8: Securing the Republic, 1791–1815'
|
||||
real_page: null
|
||||
sections:
|
||||
Politics in an Age of Passion: null
|
||||
The Adams Presidency: null
|
||||
Jefferson in Power: null
|
||||
"The “Second War of Independenceâ€\x9D": null
|
||||
Chapter Review: null
|
||||
9:
|
||||
title: 'Chapter 9: The Market Revolution, 1800–1840'
|
||||
real_page: null
|
||||
sections:
|
||||
A New Economy: null
|
||||
The Rise of the West: null
|
||||
Market Society: null
|
||||
The Free Individual: null
|
||||
The Limits of Prosperity: null
|
||||
Chapter Review: null
|
||||
10:
|
||||
title: 'Chapter 10: Democracy in America, 1815–1840'
|
||||
real_page: null
|
||||
sections:
|
||||
The Triumph of Democracy: null
|
||||
Nationalism and Its Discontents: null
|
||||
Nation, Section, and Party: null
|
||||
The Age of Jackson: null
|
||||
Indian Removal: null
|
||||
The Bank War and After: null
|
||||
Chapter Review: null
|
||||
11:
|
||||
title: 'Chapter 11: The Peculiar Institution'
|
||||
real_page: null
|
||||
sections:
|
||||
The Old South: null
|
||||
Life Under Slavery: null
|
||||
Slave Culture: null
|
||||
Resistance to Slavery: null
|
||||
Chapter Review: null
|
||||
12:
|
||||
title: 'Chapter 12: An Age of Reform, 1820–1840'
|
||||
real_page: null
|
||||
sections:
|
||||
The Reform Impulse: null
|
||||
The Crusade Against Slavery: null
|
||||
Black and White Abolitionism: null
|
||||
The Origins of Feminism: null
|
||||
Chapter Review: null
|
||||
13:
|
||||
title: 'Chapter 13: A House Divided, 1840–1861'
|
||||
real_page: null
|
||||
sections:
|
||||
Fruits of Manifest Destiny: null
|
||||
A Dose of Arsenic: null
|
||||
The Rise of the Republican Party: null
|
||||
The Emergence of Lincoln: null
|
||||
The Impending Crisis: null
|
||||
Chapter Review: null
|
||||
14:
|
||||
title: 'Chapter 14: A New Birth of Freedom: The Civil War, 1861–1865'
|
||||
real_page: null
|
||||
sections:
|
||||
The First Modern War: null
|
||||
The Coming of Emancipation: null
|
||||
The Second American Revolution: null
|
||||
The Confederate Nation: null
|
||||
Turning Points: null
|
||||
Rehearsals for Reconstruction and the End of the War: null
|
||||
Chapter Review: null
|
||||
15:
|
||||
title: "Chapter 15: “What Is Freedom?”: Reconstruction"
|
||||
real_page: null
|
||||
sections:
|
||||
The Meaning of Freedom: null
|
||||
The Making of Radical Reconstruction: null
|
||||
Radical Reconstruction in the South: null
|
||||
The Overthrow of Reconstruction: null
|
||||
Chapter Review: null
|
||||
16:
|
||||
title: 'Chapter 16: America’s Gilded Age, 1870–1890'
|
||||
real_page: null
|
||||
sections:
|
||||
The Second Industrial Revolution: null
|
||||
Freedom in the Gilded Age: null
|
||||
Labor and the Republic: null
|
||||
The Transformation of the West: null
|
||||
Politics in a Gilded Age: null
|
||||
Chapter Review: null
|
||||
17:
|
||||
title: 'Chapter 17: Freedom’s Boundaries, at Home and Abroad, 1890–1900'
|
||||
real_page: null
|
||||
sections:
|
||||
The Populist Challenge: null
|
||||
The Segregated South: null
|
||||
Redrawing the Boundaries: null
|
||||
Becoming a World Power: null
|
||||
Chapter Review: null
|
||||
18:
|
||||
title: 'Chapter 18: The Progressive Era, 1900–1916'
|
||||
real_page: null
|
||||
sections:
|
||||
An Urban Age and a Consumer Society: null
|
||||
Varieties of Progressivism: null
|
||||
The Politics of Progressivism: null
|
||||
The Progressive Presidents: null
|
||||
Chapter Review: null
|
||||
19:
|
||||
title: 'Chapter 19: Safe for Democracy: The United States and World War I'
|
||||
real_page: null
|
||||
sections:
|
||||
An Era of Intervention: null
|
||||
America and the Great War: null
|
||||
The War at Home: null
|
||||
Who Is an American?: null
|
||||
'1919': null
|
||||
Chapter Review: null
|
||||
20:
|
||||
title: 'Chapter 20: From Business Culture to Great Depression: The Twenties, 1920–1932'
|
||||
real_page: null
|
||||
sections:
|
||||
The Business of America: null
|
||||
Business and Government: null
|
||||
The Birth of Civil Liberties: null
|
||||
The Culture Wars: null
|
||||
The Great Depression: null
|
||||
Chapter Review: null
|
||||
21:
|
||||
title: 'Chapter 21: The New Deal, 1932–1940'
|
||||
real_page: null
|
||||
sections:
|
||||
The First New Deal: null
|
||||
The Grassroots Revolt: null
|
||||
The Second New Deal: null
|
||||
A Reckoning With Liberty: null
|
||||
The Limits of Change: null
|
||||
A New Conception of America: null
|
||||
Chapter Review: null
|
||||
22:
|
||||
title: 'Chapter 22: Fighting for the Four Freedoms: World War II, 1941–1945'
|
||||
real_page: null
|
||||
sections:
|
||||
Fighting World War II: null
|
||||
The Home Front: null
|
||||
Visions of Postwar Freedom: null
|
||||
The American Dilemma: null
|
||||
The End of the War: null
|
||||
Chapter Review: null
|
||||
23:
|
||||
title: 'Chapter 23: The United States and the Cold War, 1945–1953'
|
||||
real_page: null
|
||||
sections:
|
||||
Origins of the Cold War: null
|
||||
The Cold War and the Idea of Freedom: null
|
||||
The Truman Presidency: null
|
||||
The Anticommunist Crusade: null
|
||||
Chapter Review: null
|
||||
24:
|
||||
title: 'Chapter 24: An Affluent Society, 1953–1960'
|
||||
real_page: null
|
||||
sections:
|
||||
The Golden Age: null
|
||||
The Eisenhower Era: null
|
||||
The Freedom Movement: null
|
||||
The Election of 1960: null
|
||||
Chapter Review: null
|
||||
25:
|
||||
title: 'Chapter 25: The Sixties, 1960–1968'
|
||||
real_page: null
|
||||
sections:
|
||||
The Civil Rights Revolution: null
|
||||
The Kennedy Years: null
|
||||
Lyndon Johnson’s Presidency: null
|
||||
The Changing Black Movement: null
|
||||
Vietnam and the New Left: null
|
||||
The New Movements and the Rights Revolution: null
|
||||
'1968': null
|
||||
Chapter Review: null
|
||||
26:
|
||||
title: 'Chapter 26: The Conservative Turn, 1969–1988'
|
||||
real_page: null
|
||||
sections:
|
||||
President Nixon: null
|
||||
Grassroots Rights Movements: null
|
||||
Foreign Policy and Watergate: null
|
||||
The End of the Golden Age: null
|
||||
The Rising Tide of Conservatism: null
|
||||
The Reagan Revolution: null
|
||||
Chapter Review: null
|
||||
27:
|
||||
title: 'Chapter 27: A New World Order, 1989–2004'
|
||||
real_page: null
|
||||
sections:
|
||||
The Post–Cold War World: null
|
||||
Globalization and Its Discontents: null
|
||||
Culture Wars: null
|
||||
Impeachment and the Election of 2000: null
|
||||
The Attacks of September 11: null
|
||||
The War on Terrorism: null
|
||||
An American Empire?: null
|
||||
The Aftermath of September 11 at Home: null
|
||||
Chapter Review: null
|
||||
28:
|
||||
title: 'Chapter 28: A Divided Nation'
|
||||
real_page: null
|
||||
sections:
|
||||
The Winds of Change: null
|
||||
The Great Recession: null
|
||||
Obama in Office: null
|
||||
The Obama Presidency: null
|
||||
President Trump: null
|
||||
'2020: Year of Crisis': null
|
||||
Freedom in the Twenty-First Century: null
|
||||
Chapter Review: null
|
||||
667
notebooks/pdf_parse.ipynb
Normal file
667
notebooks/pdf_parse.ipynb
Normal file
@@ -0,0 +1,667 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "e91fd8c7",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Hello, World!\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(\"Hello, World!\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"id": "11896305",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"[1, 'Half-title Page', 2]\n",
|
||||
"[1, 'Physical/Political Map of The United States', 5]\n",
|
||||
"[1, 'Political Map of The World', 6]\n",
|
||||
"[1, 'Title Page', 7]\n",
|
||||
"[1, 'Copyright', 10]\n",
|
||||
"[1, 'Dedication', 13]\n",
|
||||
"[1, 'Contents', 14]\n",
|
||||
"[1, 'List of Maps, Tables, and Figures', 22]\n",
|
||||
"[1, 'About the Authors', 32]\n",
|
||||
"[1, 'Preface', 34]\n",
|
||||
"[1, 'Resources For Students And Instructors', 54]\n",
|
||||
"[1, 'Chapter 1: Old Worlds and New', 59]\n",
|
||||
"[1, 'An Old World: North America', 63]\n",
|
||||
"[1, 'An Old World: West Africa', 73]\n",
|
||||
"[1, 'An Old World: Western Europe', 75]\n",
|
||||
"[1, 'Contact', 80]\n",
|
||||
"[1, 'The Spanish Empire', 88]\n",
|
||||
"[1, 'The French and Dutch Empires', 108]\n",
|
||||
"[1, 'Chapter Review', 120]\n",
|
||||
"[1, 'Chapter 2: European Colonies and Native Nations, 1600⠍1660', 124]\n",
|
||||
"[1, 'England and the Americas', 129]\n",
|
||||
"[1, 'Early English Exploration and Colonization', 138]\n",
|
||||
"[1, 'The Chesapeake', 142]\n",
|
||||
"[1, 'Origins of American Slavery', 150]\n",
|
||||
"[1, 'The New England Way', 157]\n",
|
||||
"[1, 'New Englanders Divided', 169]\n",
|
||||
"[1, 'Religion, Politics, and Freedom', 180]\n",
|
||||
"[1, 'Chapter Review', 188]\n",
|
||||
"[1, 'Chapter 3: Creating Anglo-America, 1660⠍1750', 193]\n",
|
||||
"[1, 'Global Competition and the Expansion of England⠒s Empire', 197]\n",
|
||||
"[1, 'Entrenchment of American Slavery', 206]\n",
|
||||
"[1, 'Colonies in Crisis', 216]\n",
|
||||
"[1, 'The Growth of Colonial America', 223]\n",
|
||||
"[1, 'Social Classes in the British Colonies', 238]\n",
|
||||
"[1, 'North America at Mid-Century', 246]\n",
|
||||
"[1, 'Chapter Review', 249]\n",
|
||||
"[1, 'Chapter 4: Slavery, Freedom, and the Struggle for Empire to 1763', 253]\n",
|
||||
"[1, 'Slavery and Empire', 257]\n",
|
||||
"[1, 'Slave Cultures and Slave Resistance', 274]\n",
|
||||
"[1, 'An Empire of Freedom', 280]\n",
|
||||
"[1, 'The Public Sphere', 285]\n",
|
||||
"[1, 'The Great Awakening', 294]\n",
|
||||
"[1, 'Imperial Rivalries', 298]\n",
|
||||
"[1, 'Battle for the Continent', 306]\n",
|
||||
"[1, 'Chapter Review', 320]\n",
|
||||
"[1, 'Chapter 5: The American Revolution, 1763⠍1783', 325]\n",
|
||||
"[1, 'The Crisis Begins', 329]\n",
|
||||
"[1, 'The Road to Revolution', 339]\n",
|
||||
"[1, 'The Coming of Independence', 345]\n",
|
||||
"[1, 'Securing Independence', 359]\n",
|
||||
"[1, 'Chapter Review', 375]\n",
|
||||
"[1, 'Chapter 6: The Revolution Within', 381]\n",
|
||||
"[1, 'Democratizing Freedom', 384]\n",
|
||||
"[1, 'Toward Religious Toleration', 392]\n",
|
||||
"[1, 'Defining Economic Freedom', 399]\n",
|
||||
"[1, 'The Limits of Liberty', 404]\n",
|
||||
"[1, 'Slavery and the Revolution', 410]\n",
|
||||
"[1, 'Daughters of Liberty', 422]\n",
|
||||
"[1, 'Chapter Review', 432]\n",
|
||||
"[1, 'Chapter 7: Founding a Nation, 1783⠍1791', 435]\n",
|
||||
"[1, 'America Under the Confederation', 439]\n",
|
||||
"[1, 'A New Constitution', 450]\n",
|
||||
"[1, 'The Ratification Debate and the Origin of the Bill of Rights', 460]\n",
|
||||
"[1, '“We the Peopleâ€\\x9d', 472]\n",
|
||||
"[1, 'Chapter Review', 486]\n",
|
||||
"[1, 'Chapter 8: Securing the Republic, 1791⠍1815', 491]\n",
|
||||
"[1, 'Politics in an Age of Passion', 494]\n",
|
||||
"[1, 'The Adams Presidency', 508]\n",
|
||||
"[1, 'Jefferson in Power', 522]\n",
|
||||
"[1, 'The “Second War of Independenceâ€\\x9d', 531]\n",
|
||||
"[1, 'Chapter Review', 542]\n",
|
||||
"[1, 'Chapter 9: The Market Revolution, 1800⠍1840', 548]\n",
|
||||
"[1, 'A New Economy', 552]\n",
|
||||
"[1, 'The Rise of the West', 558]\n",
|
||||
"[1, 'Market Society', 566]\n",
|
||||
"[1, 'The Free Individual', 582]\n",
|
||||
"[1, 'The Limits of Prosperity', 591]\n",
|
||||
"[1, 'Chapter Review', 601]\n",
|
||||
"[1, 'Chapter 10: Democracy in America, 1815⠍1840', 606]\n",
|
||||
"[1, 'The Triumph of Democracy', 610]\n",
|
||||
"[1, 'Nationalism and Its Discontents', 623]\n",
|
||||
"[1, 'Nation, Section, and Party', 630]\n",
|
||||
"[1, 'The Age of Jackson', 639]\n",
|
||||
"[1, 'Indian Removal', 647]\n",
|
||||
"[1, 'The Bank War and After', 657]\n",
|
||||
"[1, 'Chapter Review', 664]\n",
|
||||
"[1, 'Chapter 11: The Peculiar Institution', 669]\n",
|
||||
"[1, 'The Old South', 672]\n",
|
||||
"[1, 'Life Under Slavery', 690]\n",
|
||||
"[1, 'Slave Culture', 704]\n",
|
||||
"[1, 'Resistance to Slavery', 712]\n",
|
||||
"[1, 'Chapter Review', 722]\n",
|
||||
"[1, 'Chapter 12: An Age of Reform, 1820⠍1840', 725]\n",
|
||||
"[1, 'The Reform Impulse', 728]\n",
|
||||
"[1, 'The Crusade Against Slavery', 740]\n",
|
||||
"[1, 'Black and White Abolitionism', 755]\n",
|
||||
"[1, 'The Origins of Feminism', 761]\n",
|
||||
"[1, 'Chapter Review', 775]\n",
|
||||
"[1, 'Chapter 13: A House Divided, 1840⠍1861', 780]\n",
|
||||
"[1, 'Fruits of Manifest Destiny', 783]\n",
|
||||
"[1, 'A Dose of Arsenic', 803]\n",
|
||||
"[1, 'The Rise of the Republican Party', 814]\n",
|
||||
"[1, 'The Emergence of Lincoln', 821]\n",
|
||||
"[1, 'The Impending Crisis', 837]\n",
|
||||
"[1, 'Chapter Review', 844]\n",
|
||||
"[1, 'Chapter 14: A New Birth of Freedom: The Civil War, 1861⠍1865', 849]\n",
|
||||
"[1, 'The First Modern War', 853]\n",
|
||||
"[1, 'The Coming of Emancipation', 864]\n",
|
||||
"[1, 'The Second American Revolution', 876]\n",
|
||||
"[1, 'The Confederate Nation', 891]\n",
|
||||
"[1, 'Turning Points', 900]\n",
|
||||
"[1, 'Rehearsals for Reconstruction and the End of the War', 904]\n",
|
||||
"[1, 'Chapter Review', 912]\n",
|
||||
"[1, 'Chapter 15: “What Is Freedom?â€\\x9d: Reconstruction', 917]\n",
|
||||
"[1, 'The Meaning of Freedom', 921]\n",
|
||||
"[1, 'The Making of Radical Reconstruction', 938]\n",
|
||||
"[1, 'Radical Reconstruction in the South', 956]\n",
|
||||
"[1, 'The Overthrow of Reconstruction', 963]\n",
|
||||
"[1, 'Chapter Review', 972]\n",
|
||||
"[1, 'Chapter 16: America⠒s Gilded Age, 1870⠍1890', 976]\n",
|
||||
"[1, 'The Second Industrial Revolution', 980]\n",
|
||||
"[1, 'Freedom in the Gilded Age', 992]\n",
|
||||
"[1, 'Labor and the Republic', 999]\n",
|
||||
"[1, 'The Transformation of the West', 1009]\n",
|
||||
"[1, 'Politics in a Gilded Age', 1032]\n",
|
||||
"[1, 'Chapter Review', 1039]\n",
|
||||
"[1, 'Chapter 17: Freedom⠒s Boundaries, at Home and Abroad, 1890⠍1900', 1044]\n",
|
||||
"[1, 'The Populist Challenge', 1048]\n",
|
||||
"[1, 'The Segregated South', 1059]\n",
|
||||
"[1, 'Redrawing the Boundaries', 1075]\n",
|
||||
"[1, 'Becoming a World Power', 1082]\n",
|
||||
"[1, 'Chapter Review', 1101]\n",
|
||||
"[1, 'Chapter 18: The Progressive Era, 1900⠍1916', 1106]\n",
|
||||
"[1, 'An Urban Age and a Consumer Society', 1111]\n",
|
||||
"[1, 'Varieties of Progressivism', 1128]\n",
|
||||
"[1, 'The Politics of Progressivism', 1144]\n",
|
||||
"[1, 'The Progressive Presidents', 1158]\n",
|
||||
"[1, 'Chapter Review', 1170]\n",
|
||||
"[1, 'Chapter 19: Safe for Democracy: The United States and World War I', 1176]\n",
|
||||
"[1, 'An Era of Intervention', 1181]\n",
|
||||
"[1, 'America and the Great War', 1189]\n",
|
||||
"[1, 'The War at Home', 1195]\n",
|
||||
"[1, 'Who Is an American?', 1210]\n",
|
||||
"[1, '1919', 1227]\n",
|
||||
"[1, 'Chapter Review', 1239]\n",
|
||||
"[1, 'Chapter 20: From Business Culture to Great Depression: The Twenties, 1920⠍1932', 1244]\n",
|
||||
"[1, 'The Business of America', 1248]\n",
|
||||
"[1, 'Business and Government', 1258]\n",
|
||||
"[1, 'The Birth of Civil Liberties', 1267]\n",
|
||||
"[1, 'The Culture Wars', 1273]\n",
|
||||
"[1, 'The Great Depression', 1290]\n",
|
||||
"[1, 'Chapter Review', 1298]\n",
|
||||
"[1, 'Chapter 21: The New Deal, 1932⠍1940', 1303]\n",
|
||||
"[1, 'The First New Deal', 1308]\n",
|
||||
"[1, 'The Grassroots Revolt', 1321]\n",
|
||||
"[1, 'The Second New Deal', 1328]\n",
|
||||
"[1, 'A Reckoning With Liberty', 1333]\n",
|
||||
"[1, 'The Limits of Change', 1343]\n",
|
||||
"[1, 'A New Conception of America', 1353]\n",
|
||||
"[1, 'Chapter Review', 1362]\n",
|
||||
"[1, 'Chapter 22: Fighting for the Four Freedoms: World War II, 1941⠍1945', 1368]\n",
|
||||
"[1, 'Fighting World War II', 1374]\n",
|
||||
"[1, 'The Home Front', 1386]\n",
|
||||
"[1, 'Visions of Postwar Freedom', 1398]\n",
|
||||
"[1, 'The American Dilemma', 1403]\n",
|
||||
"[1, 'The End of the War', 1424]\n",
|
||||
"[1, 'Chapter Review', 1432]\n",
|
||||
"[1, 'Chapter 23: The United States and the Cold War, 1945⠍1953', 1437]\n",
|
||||
"[1, 'Origins of the Cold War', 1442]\n",
|
||||
"[1, 'The Cold War and the Idea of Freedom', 1456]\n",
|
||||
"[1, 'The Truman Presidency', 1463]\n",
|
||||
"[1, 'The Anticommunist Crusade', 1471]\n",
|
||||
"[1, 'Chapter Review', 1488]\n",
|
||||
"[1, 'Chapter 24: An Affluent Society, 1953⠍1960', 1493]\n",
|
||||
"[1, 'The Golden Age', 1497]\n",
|
||||
"[1, 'The Eisenhower Era', 1519]\n",
|
||||
"[1, 'The Freedom Movement', 1533]\n",
|
||||
"[1, 'The Election of 1960', 1548]\n",
|
||||
"[1, 'Chapter Review', 1552]\n",
|
||||
"[1, 'Chapter 25: The Sixties, 1960⠍1968', 1557]\n",
|
||||
"[1, 'The Civil Rights Revolution', 1561]\n",
|
||||
"[1, 'The Kennedy Years', 1566]\n",
|
||||
"[1, 'Lyndon Johnson⠒s Presidency', 1571]\n",
|
||||
"[1, 'The Changing Black Movement', 1581]\n",
|
||||
"[1, 'Vietnam and the New Left', 1586]\n",
|
||||
"[1, 'The New Movements and the Rights Revolution', 1596]\n",
|
||||
"[1, '1968', 1617]\n",
|
||||
"[1, 'Chapter Review', 1622]\n",
|
||||
"[1, 'Chapter 26: The Conservative Turn, 1969⠍1988', 1628]\n",
|
||||
"[1, 'President Nixon', 1631]\n",
|
||||
"[1, 'Grassroots Rights Movements', 1638]\n",
|
||||
"[1, 'Foreign Policy and Watergate', 1643]\n",
|
||||
"[1, 'The End of the Golden Age', 1654]\n",
|
||||
"[1, 'The Rising Tide of Conservatism', 1667]\n",
|
||||
"[1, 'The Reagan Revolution', 1677]\n",
|
||||
"[1, 'Chapter Review', 1691]\n",
|
||||
"[1, 'Chapter 27: A New World Order, 1989⠍2004', 1696]\n",
|
||||
"[1, 'The Post⠍Cold War World', 1700]\n",
|
||||
"[1, 'Globalization and Its Discontents', 1709]\n",
|
||||
"[1, 'Culture Wars', 1720]\n",
|
||||
"[1, 'Impeachment and the Election of 2000', 1743]\n",
|
||||
"[1, 'The Attacks of September 11', 1747]\n",
|
||||
"[1, 'The War on Terrorism', 1750]\n",
|
||||
"[1, 'An American Empire?', 1754]\n",
|
||||
"[1, 'The Aftermath of September 11 at Home', 1759]\n",
|
||||
"[1, 'Chapter Review', 1764]\n",
|
||||
"[1, 'Chapter 28: A Divided Nation', 1769]\n",
|
||||
"[1, 'The Winds of Change', 1772]\n",
|
||||
"[1, 'The Great Recession', 1780]\n",
|
||||
"[1, 'Obama in Office', 1789]\n",
|
||||
"[1, 'The Obama Presidency', 1798]\n",
|
||||
"[1, 'President Trump', 1807]\n",
|
||||
"[1, '2020: Year of Crisis', 1820]\n",
|
||||
"[1, 'Freedom in the Twenty-First Century', 1831]\n",
|
||||
"[1, 'Chapter Review', 1841]\n",
|
||||
"[1, 'Suggested Reading', 1845]\n",
|
||||
"[1, 'The Declaration of Independence (1776)', 1909]\n",
|
||||
"[1, 'The Constitution of The United States (1787)', 1917]\n",
|
||||
"[1, 'Glossary', 1943]\n",
|
||||
"[1, 'Credits', 2008]\n",
|
||||
"[1, 'Index', 2016]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import fitz\n",
|
||||
"doc = fitz.open('../data/raw/textbook.pdf')\n",
|
||||
"toc = doc.get_toc()\n",
|
||||
"for item in toc:\n",
|
||||
" print(item) # [level, title, pdf_page]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"id": "991dbad2",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Sections to process: 204\n",
|
||||
"[1, 'Chapter 1: Old Worlds and New', 59]\n",
|
||||
"[1, 'An Old World: North America', 63]\n",
|
||||
"[1, 'An Old World: West Africa', 73]\n",
|
||||
"[1, 'An Old World: Western Europe', 75]\n",
|
||||
"[1, 'Contact', 80]\n",
|
||||
"[1, 'The Spanish Empire', 88]\n",
|
||||
"[1, 'The French and Dutch Empires', 108]\n",
|
||||
"[1, 'Chapter Review', 120]\n",
|
||||
"[1, 'Chapter 2: European Colonies and Native Nations, 1600⠍1660', 124]\n",
|
||||
"[1, 'England and the Americas', 129]\n",
|
||||
"[1, 'Early English Exploration and Colonization', 138]\n",
|
||||
"[1, 'The Chesapeake', 142]\n",
|
||||
"[1, 'Origins of American Slavery', 150]\n",
|
||||
"[1, 'The New England Way', 157]\n",
|
||||
"[1, 'New Englanders Divided', 169]\n",
|
||||
"[1, 'Religion, Politics, and Freedom', 180]\n",
|
||||
"[1, 'Chapter Review', 188]\n",
|
||||
"[1, 'Chapter 3: Creating Anglo-America, 1660⠍1750', 193]\n",
|
||||
"[1, 'Global Competition and the Expansion of England⠒s Empire', 197]\n",
|
||||
"[1, 'Entrenchment of American Slavery', 206]\n",
|
||||
"[1, 'Colonies in Crisis', 216]\n",
|
||||
"[1, 'The Growth of Colonial America', 223]\n",
|
||||
"[1, 'Social Classes in the British Colonies', 238]\n",
|
||||
"[1, 'North America at Mid-Century', 246]\n",
|
||||
"[1, 'Chapter Review', 249]\n",
|
||||
"[1, 'Chapter 4: Slavery, Freedom, and the Struggle for Empire to 1763', 253]\n",
|
||||
"[1, 'Slavery and Empire', 257]\n",
|
||||
"[1, 'Slave Cultures and Slave Resistance', 274]\n",
|
||||
"[1, 'An Empire of Freedom', 280]\n",
|
||||
"[1, 'The Public Sphere', 285]\n",
|
||||
"[1, 'The Great Awakening', 294]\n",
|
||||
"[1, 'Imperial Rivalries', 298]\n",
|
||||
"[1, 'Battle for the Continent', 306]\n",
|
||||
"[1, 'Chapter Review', 320]\n",
|
||||
"[1, 'Chapter 5: The American Revolution, 1763⠍1783', 325]\n",
|
||||
"[1, 'The Crisis Begins', 329]\n",
|
||||
"[1, 'The Road to Revolution', 339]\n",
|
||||
"[1, 'The Coming of Independence', 345]\n",
|
||||
"[1, 'Securing Independence', 359]\n",
|
||||
"[1, 'Chapter Review', 375]\n",
|
||||
"[1, 'Chapter 6: The Revolution Within', 381]\n",
|
||||
"[1, 'Democratizing Freedom', 384]\n",
|
||||
"[1, 'Toward Religious Toleration', 392]\n",
|
||||
"[1, 'Defining Economic Freedom', 399]\n",
|
||||
"[1, 'The Limits of Liberty', 404]\n",
|
||||
"[1, 'Slavery and the Revolution', 410]\n",
|
||||
"[1, 'Daughters of Liberty', 422]\n",
|
||||
"[1, 'Chapter Review', 432]\n",
|
||||
"[1, 'Chapter 7: Founding a Nation, 1783⠍1791', 435]\n",
|
||||
"[1, 'America Under the Confederation', 439]\n",
|
||||
"[1, 'A New Constitution', 450]\n",
|
||||
"[1, 'The Ratification Debate and the Origin of the Bill of Rights', 460]\n",
|
||||
"[1, '“We the Peopleâ€\\x9d', 472]\n",
|
||||
"[1, 'Chapter Review', 486]\n",
|
||||
"[1, 'Chapter 8: Securing the Republic, 1791⠍1815', 491]\n",
|
||||
"[1, 'Politics in an Age of Passion', 494]\n",
|
||||
"[1, 'The Adams Presidency', 508]\n",
|
||||
"[1, 'Jefferson in Power', 522]\n",
|
||||
"[1, 'The “Second War of Independenceâ€\\x9d', 531]\n",
|
||||
"[1, 'Chapter Review', 542]\n",
|
||||
"[1, 'Chapter 9: The Market Revolution, 1800⠍1840', 548]\n",
|
||||
"[1, 'A New Economy', 552]\n",
|
||||
"[1, 'The Rise of the West', 558]\n",
|
||||
"[1, 'Market Society', 566]\n",
|
||||
"[1, 'The Free Individual', 582]\n",
|
||||
"[1, 'The Limits of Prosperity', 591]\n",
|
||||
"[1, 'Chapter Review', 601]\n",
|
||||
"[1, 'Chapter 10: Democracy in America, 1815⠍1840', 606]\n",
|
||||
"[1, 'The Triumph of Democracy', 610]\n",
|
||||
"[1, 'Nationalism and Its Discontents', 623]\n",
|
||||
"[1, 'Nation, Section, and Party', 630]\n",
|
||||
"[1, 'The Age of Jackson', 639]\n",
|
||||
"[1, 'Indian Removal', 647]\n",
|
||||
"[1, 'The Bank War and After', 657]\n",
|
||||
"[1, 'Chapter Review', 664]\n",
|
||||
"[1, 'Chapter 11: The Peculiar Institution', 669]\n",
|
||||
"[1, 'The Old South', 672]\n",
|
||||
"[1, 'Life Under Slavery', 690]\n",
|
||||
"[1, 'Slave Culture', 704]\n",
|
||||
"[1, 'Resistance to Slavery', 712]\n",
|
||||
"[1, 'Chapter Review', 722]\n",
|
||||
"[1, 'Chapter 12: An Age of Reform, 1820⠍1840', 725]\n",
|
||||
"[1, 'The Reform Impulse', 728]\n",
|
||||
"[1, 'The Crusade Against Slavery', 740]\n",
|
||||
"[1, 'Black and White Abolitionism', 755]\n",
|
||||
"[1, 'The Origins of Feminism', 761]\n",
|
||||
"[1, 'Chapter Review', 775]\n",
|
||||
"[1, 'Chapter 13: A House Divided, 1840⠍1861', 780]\n",
|
||||
"[1, 'Fruits of Manifest Destiny', 783]\n",
|
||||
"[1, 'A Dose of Arsenic', 803]\n",
|
||||
"[1, 'The Rise of the Republican Party', 814]\n",
|
||||
"[1, 'The Emergence of Lincoln', 821]\n",
|
||||
"[1, 'The Impending Crisis', 837]\n",
|
||||
"[1, 'Chapter Review', 844]\n",
|
||||
"[1, 'Chapter 14: A New Birth of Freedom: The Civil War, 1861⠍1865', 849]\n",
|
||||
"[1, 'The First Modern War', 853]\n",
|
||||
"[1, 'The Coming of Emancipation', 864]\n",
|
||||
"[1, 'The Second American Revolution', 876]\n",
|
||||
"[1, 'The Confederate Nation', 891]\n",
|
||||
"[1, 'Turning Points', 900]\n",
|
||||
"[1, 'Rehearsals for Reconstruction and the End of the War', 904]\n",
|
||||
"[1, 'Chapter Review', 912]\n",
|
||||
"[1, 'Chapter 15: “What Is Freedom?â€\\x9d: Reconstruction', 917]\n",
|
||||
"[1, 'The Meaning of Freedom', 921]\n",
|
||||
"[1, 'The Making of Radical Reconstruction', 938]\n",
|
||||
"[1, 'Radical Reconstruction in the South', 956]\n",
|
||||
"[1, 'The Overthrow of Reconstruction', 963]\n",
|
||||
"[1, 'Chapter Review', 972]\n",
|
||||
"[1, 'Chapter 16: America⠒s Gilded Age, 1870⠍1890', 976]\n",
|
||||
"[1, 'The Second Industrial Revolution', 980]\n",
|
||||
"[1, 'Freedom in the Gilded Age', 992]\n",
|
||||
"[1, 'Labor and the Republic', 999]\n",
|
||||
"[1, 'The Transformation of the West', 1009]\n",
|
||||
"[1, 'Politics in a Gilded Age', 1032]\n",
|
||||
"[1, 'Chapter Review', 1039]\n",
|
||||
"[1, 'Chapter 17: Freedom⠒s Boundaries, at Home and Abroad, 1890⠍1900', 1044]\n",
|
||||
"[1, 'The Populist Challenge', 1048]\n",
|
||||
"[1, 'The Segregated South', 1059]\n",
|
||||
"[1, 'Redrawing the Boundaries', 1075]\n",
|
||||
"[1, 'Becoming a World Power', 1082]\n",
|
||||
"[1, 'Chapter Review', 1101]\n",
|
||||
"[1, 'Chapter 18: The Progressive Era, 1900⠍1916', 1106]\n",
|
||||
"[1, 'An Urban Age and a Consumer Society', 1111]\n",
|
||||
"[1, 'Varieties of Progressivism', 1128]\n",
|
||||
"[1, 'The Politics of Progressivism', 1144]\n",
|
||||
"[1, 'The Progressive Presidents', 1158]\n",
|
||||
"[1, 'Chapter Review', 1170]\n",
|
||||
"[1, 'Chapter 19: Safe for Democracy: The United States and World War I', 1176]\n",
|
||||
"[1, 'An Era of Intervention', 1181]\n",
|
||||
"[1, 'America and the Great War', 1189]\n",
|
||||
"[1, 'The War at Home', 1195]\n",
|
||||
"[1, 'Who Is an American?', 1210]\n",
|
||||
"[1, '1919', 1227]\n",
|
||||
"[1, 'Chapter Review', 1239]\n",
|
||||
"[1, 'Chapter 20: From Business Culture to Great Depression: The Twenties, 1920⠍1932', 1244]\n",
|
||||
"[1, 'The Business of America', 1248]\n",
|
||||
"[1, 'Business and Government', 1258]\n",
|
||||
"[1, 'The Birth of Civil Liberties', 1267]\n",
|
||||
"[1, 'The Culture Wars', 1273]\n",
|
||||
"[1, 'The Great Depression', 1290]\n",
|
||||
"[1, 'Chapter Review', 1298]\n",
|
||||
"[1, 'Chapter 21: The New Deal, 1932⠍1940', 1303]\n",
|
||||
"[1, 'The First New Deal', 1308]\n",
|
||||
"[1, 'The Grassroots Revolt', 1321]\n",
|
||||
"[1, 'The Second New Deal', 1328]\n",
|
||||
"[1, 'A Reckoning With Liberty', 1333]\n",
|
||||
"[1, 'The Limits of Change', 1343]\n",
|
||||
"[1, 'A New Conception of America', 1353]\n",
|
||||
"[1, 'Chapter Review', 1362]\n",
|
||||
"[1, 'Chapter 22: Fighting for the Four Freedoms: World War II, 1941⠍1945', 1368]\n",
|
||||
"[1, 'Fighting World War II', 1374]\n",
|
||||
"[1, 'The Home Front', 1386]\n",
|
||||
"[1, 'Visions of Postwar Freedom', 1398]\n",
|
||||
"[1, 'The American Dilemma', 1403]\n",
|
||||
"[1, 'The End of the War', 1424]\n",
|
||||
"[1, 'Chapter Review', 1432]\n",
|
||||
"[1, 'Chapter 23: The United States and the Cold War, 1945⠍1953', 1437]\n",
|
||||
"[1, 'Origins of the Cold War', 1442]\n",
|
||||
"[1, 'The Cold War and the Idea of Freedom', 1456]\n",
|
||||
"[1, 'The Truman Presidency', 1463]\n",
|
||||
"[1, 'The Anticommunist Crusade', 1471]\n",
|
||||
"[1, 'Chapter Review', 1488]\n",
|
||||
"[1, 'Chapter 24: An Affluent Society, 1953⠍1960', 1493]\n",
|
||||
"[1, 'The Golden Age', 1497]\n",
|
||||
"[1, 'The Eisenhower Era', 1519]\n",
|
||||
"[1, 'The Freedom Movement', 1533]\n",
|
||||
"[1, 'The Election of 1960', 1548]\n",
|
||||
"[1, 'Chapter Review', 1552]\n",
|
||||
"[1, 'Chapter 25: The Sixties, 1960⠍1968', 1557]\n",
|
||||
"[1, 'The Civil Rights Revolution', 1561]\n",
|
||||
"[1, 'The Kennedy Years', 1566]\n",
|
||||
"[1, 'Lyndon Johnson⠒s Presidency', 1571]\n",
|
||||
"[1, 'The Changing Black Movement', 1581]\n",
|
||||
"[1, 'Vietnam and the New Left', 1586]\n",
|
||||
"[1, 'The New Movements and the Rights Revolution', 1596]\n",
|
||||
"[1, '1968', 1617]\n",
|
||||
"[1, 'Chapter Review', 1622]\n",
|
||||
"[1, 'Chapter 26: The Conservative Turn, 1969⠍1988', 1628]\n",
|
||||
"[1, 'President Nixon', 1631]\n",
|
||||
"[1, 'Grassroots Rights Movements', 1638]\n",
|
||||
"[1, 'Foreign Policy and Watergate', 1643]\n",
|
||||
"[1, 'The End of the Golden Age', 1654]\n",
|
||||
"[1, 'The Rising Tide of Conservatism', 1667]\n",
|
||||
"[1, 'The Reagan Revolution', 1677]\n",
|
||||
"[1, 'Chapter Review', 1691]\n",
|
||||
"[1, 'Chapter 27: A New World Order, 1989⠍2004', 1696]\n",
|
||||
"[1, 'The Post⠍Cold War World', 1700]\n",
|
||||
"[1, 'Globalization and Its Discontents', 1709]\n",
|
||||
"[1, 'Culture Wars', 1720]\n",
|
||||
"[1, 'Impeachment and the Election of 2000', 1743]\n",
|
||||
"[1, 'The Attacks of September 11', 1747]\n",
|
||||
"[1, 'The War on Terrorism', 1750]\n",
|
||||
"[1, 'An American Empire?', 1754]\n",
|
||||
"[1, 'The Aftermath of September 11 at Home', 1759]\n",
|
||||
"[1, 'Chapter Review', 1764]\n",
|
||||
"[1, 'Chapter 28: A Divided Nation', 1769]\n",
|
||||
"[1, 'The Winds of Change', 1772]\n",
|
||||
"[1, 'The Great Recession', 1780]\n",
|
||||
"[1, 'Obama in Office', 1789]\n",
|
||||
"[1, 'The Obama Presidency', 1798]\n",
|
||||
"[1, 'President Trump', 1807]\n",
|
||||
"[1, '2020: Year of Crisis', 1820]\n",
|
||||
"[1, 'Freedom in the Twenty-First Century', 1831]\n",
|
||||
"[1, 'Chapter Review', 1841]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Find where Chapter 1 starts and throw away everything before it\n",
|
||||
"start_index = next(i for i, item in enumerate(toc) if 'Chapter 1' in item[1])\n",
|
||||
"chapters_toc = toc[start_index:]\n",
|
||||
"\n",
|
||||
"# Also throw away back matter (Suggested Reading, Glossary, Index etc.)\n",
|
||||
"end_titles = {'Suggested Reading', 'The Declaration of Independence (1776)', \n",
|
||||
" 'The Constitution of The United States (1787)', 'Glossary', \n",
|
||||
" 'Credits', 'Index'}\n",
|
||||
"chapters_toc = [item for item in chapters_toc if item[1] not in end_titles]\n",
|
||||
"\n",
|
||||
"print(f\"Sections to process: {len(chapters_toc)}\")\n",
|
||||
"for item in chapters_toc:\n",
|
||||
" print(item)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"id": "43e20197",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"{'title': 'Chapter 1: Old Worlds and New', 'chapter_num': 1, 'chapter_title': 'Chapter 1: Old Worlds and New', 'is_chapter_header': True, 'start_pdf': 58, 'end_pdf': 61}\n",
|
||||
"{'title': 'An Old World: North America', 'chapter_num': 1, 'chapter_title': 'Chapter 1: Old Worlds and New', 'is_chapter_header': False, 'start_pdf': 62, 'end_pdf': 71}\n",
|
||||
"{'title': 'An Old World: West Africa', 'chapter_num': 1, 'chapter_title': 'Chapter 1: Old Worlds and New', 'is_chapter_header': False, 'start_pdf': 72, 'end_pdf': 73}\n",
|
||||
"{'title': 'An Old World: Western Europe', 'chapter_num': 1, 'chapter_title': 'Chapter 1: Old Worlds and New', 'is_chapter_header': False, 'start_pdf': 74, 'end_pdf': 78}\n",
|
||||
"{'title': 'Contact', 'chapter_num': 1, 'chapter_title': 'Chapter 1: Old Worlds and New', 'is_chapter_header': False, 'start_pdf': 79, 'end_pdf': 86}\n",
|
||||
"{'title': 'The Spanish Empire', 'chapter_num': 1, 'chapter_title': 'Chapter 1: Old Worlds and New', 'is_chapter_header': False, 'start_pdf': 87, 'end_pdf': 106}\n",
|
||||
"{'title': 'The French and Dutch Empires', 'chapter_num': 1, 'chapter_title': 'Chapter 1: Old Worlds and New', 'is_chapter_header': False, 'start_pdf': 107, 'end_pdf': 118}\n",
|
||||
"{'title': 'Chapter Review', 'chapter_num': 1, 'chapter_title': 'Chapter 1: Old Worlds and New', 'is_chapter_header': False, 'start_pdf': 119, 'end_pdf': 122}\n",
|
||||
"{'title': 'Chapter 2: European Colonies and Native Nations, 1600⠍1660', 'chapter_num': 2, 'chapter_title': 'Chapter 2: European Colonies and Native Nations, 1600⠍1660', 'is_chapter_header': True, 'start_pdf': 123, 'end_pdf': 127}\n",
|
||||
"{'title': 'England and the Americas', 'chapter_num': 2, 'chapter_title': 'Chapter 2: European Colonies and Native Nations, 1600⠍1660', 'is_chapter_header': False, 'start_pdf': 128, 'end_pdf': 136}\n",
|
||||
"{'title': 'Early English Exploration and Colonization', 'chapter_num': 2, 'chapter_title': 'Chapter 2: European Colonies and Native Nations, 1600⠍1660', 'is_chapter_header': False, 'start_pdf': 137, 'end_pdf': 140}\n",
|
||||
"{'title': 'The Chesapeake', 'chapter_num': 2, 'chapter_title': 'Chapter 2: European Colonies and Native Nations, 1600⠍1660', 'is_chapter_header': False, 'start_pdf': 141, 'end_pdf': 148}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import re\n",
|
||||
"\n",
|
||||
"def parse_chapter_num(title):\n",
|
||||
" match = re.match(r'Chapter (\\d+):', title)\n",
|
||||
" return int(match.group(1)) if match else None\n",
|
||||
"\n",
|
||||
"structured = []\n",
|
||||
"current_chapter_num = None\n",
|
||||
"current_chapter_title = None\n",
|
||||
"\n",
|
||||
"for i, item in enumerate(chapters_toc):\n",
|
||||
" title = item[1]\n",
|
||||
" start_pdf = item[2] - 1 # 0-indexed\n",
|
||||
" end_pdf = (chapters_toc[i + 1][2] - 2) if i + 1 < len(chapters_toc) else doc.page_count - 1\n",
|
||||
"\n",
|
||||
" chapter_num = parse_chapter_num(title)\n",
|
||||
"\n",
|
||||
" if chapter_num:\n",
|
||||
" # This entry IS a chapter\n",
|
||||
" current_chapter_num = chapter_num\n",
|
||||
" current_chapter_title = title\n",
|
||||
" is_chapter_header = True\n",
|
||||
" else:\n",
|
||||
" is_chapter_header = False\n",
|
||||
"\n",
|
||||
" structured.append({\n",
|
||||
" \"title\": title,\n",
|
||||
" \"chapter_num\": current_chapter_num,\n",
|
||||
" \"chapter_title\": current_chapter_title,\n",
|
||||
" \"is_chapter_header\": is_chapter_header,\n",
|
||||
" \"start_pdf\": start_pdf,\n",
|
||||
" \"end_pdf\": end_pdf,\n",
|
||||
" })\n",
|
||||
"\n",
|
||||
"# Sanity check\n",
|
||||
"for s in structured[:12]:\n",
|
||||
" print(s)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"id": "149bc714",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"[1, 'Half-title Page', 2, {'kind': 1, 'xref': 63504, 'page': 1, 'to': Point(76.47846, 86.92822), 'zoom': 0.0}]\n",
|
||||
"[1, 'Physical/Political Map of The United States', 5, {'kind': 1, 'xref': 63507, 'page': 4, 'to': Point(76.18479, 90.134159), 'zoom': 0.0}]\n",
|
||||
"[1, 'Political Map of The World', 6, {'kind': 1, 'xref': 63509, 'page': 5, 'to': Point(76.18479, 90.134159), 'zoom': 0.0}]\n",
|
||||
"[1, 'Title Page', 7, {'kind': 1, 'xref': 63511, 'page': 6, 'to': Point(76.47846, 86.92822), 'zoom': 0.0}]\n",
|
||||
"[1, 'Copyright', 10, {'kind': 1, 'xref': 63513, 'page': 9, 'to': Point(76.5, 87.0), 'zoom': 0.0}]\n",
|
||||
"[1, 'Dedication', 13, {'kind': 1, 'xref': 63515, 'page': 12, 'to': Point(76.5, 87.0), 'zoom': 0.0}]\n",
|
||||
"[1, 'Contents', 14, {'kind': 1, 'xref': 63517, 'page': 13, 'to': Point(76.5, 91.5), 'zoom': 0.0}]\n",
|
||||
"[1, 'List of Maps, Tables, and Figures', 22, {'kind': 1, 'xref': 63519, 'page': 21, 'to': Point(76.5, 91.5), 'zoom': 0.0}]\n",
|
||||
"[1, 'About the Authors', 32, {'kind': 1, 'xref': 63521, 'page': 31, 'to': Point(76.47846, 91.40668), 'zoom': 0.0}]\n",
|
||||
"[1, 'Preface', 34, {'kind': 1, 'xref': 63523, 'page': 33, 'to': Point(76.5, 91.5), 'zoom': 0.0}]\n",
|
||||
"[1, 'Resources For Students And Instructors', 54, {'kind': 1, 'xref': 63525, 'page': 53, 'to': Point(76.18479, 90.134159), 'zoom': 0.0}]\n",
|
||||
"[1, 'Chapter 1: Old Worlds and New', 59, {'kind': 1, 'xref': 63527, 'page': 58, 'to': Point(76.5, 91.5), 'zoom': 0.0}]\n",
|
||||
"[1, 'An Old World: North America', 63, {'kind': 1, 'xref': 63529, 'page': 62, 'to': Point(76.18479, 90.134159), 'zoom': 0.0}]\n",
|
||||
"[1, 'An Old World: West Africa', 73, {'kind': 1, 'xref': 63531, 'page': 72, 'to': Point(76.5, 91.5), 'zoom': 0.0}]\n",
|
||||
"[1, 'An Old World: Western Europe', 75, {'kind': 1, 'xref': 63533, 'page': 74, 'to': Point(76.5, 91.5), 'zoom': 0.0}]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"toc_full = doc.get_toc(simple=False)\n",
|
||||
"\n",
|
||||
"for item in toc_full[:15]:\n",
|
||||
" print(item)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"id": "c2563864",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Generated → /home/keshav/code/apush-rag/config/page_map.yaml\n",
|
||||
"Now open that file and fill in the real page numbers. Leave as null if unknown.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import yaml\n",
|
||||
"from pathlib import Path\n",
|
||||
"\n",
|
||||
"project_root = Path().resolve().parent\n",
|
||||
"output_path = project_root / \"config\" / \"page_map.yaml\"\n",
|
||||
"\n",
|
||||
"page_map = {\"chapters\": {}}\n",
|
||||
"\n",
|
||||
"for section in structured:\n",
|
||||
" ch_num = section[\"chapter_num\"]\n",
|
||||
" ch_title = section[\"chapter_title\"]\n",
|
||||
" title = section[\"title\"]\n",
|
||||
"\n",
|
||||
" # Initialize chapter entry if first time seeing it\n",
|
||||
" if ch_num not in page_map[\"chapters\"]:\n",
|
||||
" page_map[\"chapters\"][ch_num] = {\n",
|
||||
" \"title\": ch_title,\n",
|
||||
" \"real_page\": None, # ← you fill this in\n",
|
||||
" \"sections\": {}\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" # Add section with null page — you fill these in\n",
|
||||
" if not section[\"is_chapter_header\"]:\n",
|
||||
" page_map[\"chapters\"][ch_num][\"sections\"][title] = None\n",
|
||||
"\n",
|
||||
"with open(output_path, \"w\") as f:\n",
|
||||
" yaml.dump(page_map, f, allow_unicode=True, sort_keys=False, default_flow_style=False)\n",
|
||||
"\n",
|
||||
"print(f\"Generated → {output_path}\")\n",
|
||||
"print(\"Now open that file and fill in the real page numbers. Leave as null if unknown.\")"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": ".venv",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.14.3"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
Reference in New Issue
Block a user