From 9cc57da50ee9d453c34f5217c37bca9f482c63c9 Mon Sep 17 00:00:00 2001 From: Keshav Anand Date: Tue, 10 Mar 2026 19:33:16 -0500 Subject: [PATCH] parsed input from textbook pdf --- .gitignore | 4 +- config/page_map.yaml | 289 +++++++++++++++++ notebooks/pdf_parse.ipynb | 667 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 957 insertions(+), 3 deletions(-) create mode 100644 config/page_map.yaml create mode 100644 notebooks/pdf_parse.ipynb diff --git a/.gitignore b/.gitignore index 74bfe97..42ead45 100644 --- a/.gitignore +++ b/.gitignore @@ -64,9 +64,7 @@ htmlcov/ .dmypy.json pyrightconfig.json -# ── Jupyter ─────────────────────────────────────────────────────────────────── -.ipynb_checkpoints/ -*.ipynb + # ── Logs ────────────────────────────────────────────────────────────────────── logs/ diff --git a/config/page_map.yaml b/config/page_map.yaml new file mode 100644 index 0000000..82a741d --- /dev/null +++ b/config/page_map.yaml @@ -0,0 +1,289 @@ +chapters: + 1: + title: 'Chapter 1: Old Worlds and New' + real_page: null + sections: + 'An Old World: North America': null + 'An Old World: West Africa': null + 'An Old World: Western Europe': null + Contact: null + The Spanish Empire: null + The French and Dutch Empires: null + Chapter Review: null + 2: + title: 'Chapter 2: European Colonies and Native Nations, 1600⠍1660' + real_page: null + sections: + England and the Americas: null + Early English Exploration and Colonization: null + The Chesapeake: null + Origins of American Slavery: null + The New England Way: null + New Englanders Divided: null + Religion, Politics, and Freedom: null + Chapter Review: null + 3: + title: 'Chapter 3: Creating Anglo-America, 1660⠍1750' + real_page: null + sections: + Global Competition and the Expansion of England⠒s Empire: null + Entrenchment of American Slavery: null + Colonies in Crisis: null + The Growth of Colonial America: null + Social Classes in the British Colonies: null + North America at Mid-Century: null +
Chapter Review: null + 4: + title: 'Chapter 4: Slavery, Freedom, and the Struggle for Empire to 1763' + real_page: null + sections: + Slavery and Empire: null + Slave Cultures and Slave Resistance: null + An Empire of Freedom: null + The Public Sphere: null + The Great Awakening: null + Imperial Rivalries: null + Battle for the Continent: null + Chapter Review: null + 5: + title: 'Chapter 5: The American Revolution, 1763⠍1783' + real_page: null + sections: + The Crisis Begins: null + The Road to Revolution: null + The Coming of Independence: null + Securing Independence: null + Chapter Review: null + 6: + title: 'Chapter 6: The Revolution Within' + real_page: null + sections: + Democratizing Freedom: null + Toward Religious Toleration: null + Defining Economic Freedom: null + The Limits of Liberty: null + Slavery and the Revolution: null + Daughters of Liberty: null + Chapter Review: null + 7: + title: 'Chapter 7: Founding a Nation, 1783⠍1791' + real_page: null + sections: + America Under the Confederation: null + A New Constitution: null + The Ratification Debate and the Origin of the Bill of Rights: null + "“We the Peopleâ€\x9D": null + Chapter Review: null + 8: + title: 'Chapter 8: Securing the Republic, 1791⠍1815' + real_page: null + sections: + Politics in an Age of Passion: null + The Adams Presidency: null + Jefferson in Power: null + "The “Second War of Independenceâ€\x9D": null + Chapter Review: null + 9: + title: 'Chapter 9: The Market Revolution, 1800⠍1840' + real_page: null + sections: + A New Economy: null + The Rise of the West: null + Market Society: null + The Free Individual: null + The Limits of Prosperity: null + Chapter Review: null + 10: + title: 'Chapter 10: Democracy in America, 1815⠍1840' + real_page: null + sections: + The Triumph of Democracy: null + Nationalism and Its Discontents: null + Nation, Section, and Party: null + The Age of Jackson: null + Indian Removal: null + The Bank War and After: null + Chapter Review: null + 11: + title: 
'Chapter 11: The Peculiar Institution' + real_page: null + sections: + The Old South: null + Life Under Slavery: null + Slave Culture: null + Resistance to Slavery: null + Chapter Review: null + 12: + title: 'Chapter 12: An Age of Reform, 1820⠍1840' + real_page: null + sections: + The Reform Impulse: null + The Crusade Against Slavery: null + Black and White Abolitionism: null + The Origins of Feminism: null + Chapter Review: null + 13: + title: 'Chapter 13: A House Divided, 1840⠍1861' + real_page: null + sections: + Fruits of Manifest Destiny: null + A Dose of Arsenic: null + The Rise of the Republican Party: null + The Emergence of Lincoln: null + The Impending Crisis: null + Chapter Review: null + 14: + title: 'Chapter 14: A New Birth of Freedom: The Civil War, 1861⠍1865' + real_page: null + sections: + The First Modern War: null + The Coming of Emancipation: null + The Second American Revolution: null + The Confederate Nation: null + Turning Points: null + Rehearsals for Reconstruction and the End of the War: null + Chapter Review: null + 15: + title: "Chapter 15: “What Is Freedom?â€\x9D: Reconstruction" + real_page: null + sections: + The Meaning of Freedom: null + The Making of Radical Reconstruction: null + Radical Reconstruction in the South: null + The Overthrow of Reconstruction: null + Chapter Review: null + 16: + title: 'Chapter 16: America⠒s Gilded Age, 1870⠍1890' + real_page: null + sections: + The Second Industrial Revolution: null + Freedom in the Gilded Age: null + Labor and the Republic: null + The Transformation of the West: null + Politics in a Gilded Age: null + Chapter Review: null + 17: + title: 'Chapter 17: Freedom⠒s Boundaries, at Home and Abroad, 1890⠍1900' + real_page: null + sections: + The Populist Challenge: null + The Segregated South: null + Redrawing the Boundaries: null + Becoming a World Power: null + Chapter Review: null + 18: + title: 'Chapter 18: The Progressive Era, 1900⠍1916' + real_page: null + sections: + An Urban Age and a 
Consumer Society: null + Varieties of Progressivism: null + The Politics of Progressivism: null + The Progressive Presidents: null + Chapter Review: null + 19: + title: 'Chapter 19: Safe for Democracy: The United States and World War I' + real_page: null + sections: + An Era of Intervention: null + America and the Great War: null + The War at Home: null + Who Is an American?: null + '1919': null + Chapter Review: null + 20: + title: 'Chapter 20: From Business Culture to Great Depression: The Twenties, 1920⠍1932' + real_page: null + sections: + The Business of America: null + Business and Government: null + The Birth of Civil Liberties: null + The Culture Wars: null + The Great Depression: null + Chapter Review: null + 21: + title: 'Chapter 21: The New Deal, 1932⠍1940' + real_page: null + sections: + The First New Deal: null + The Grassroots Revolt: null + The Second New Deal: null + A Reckoning With Liberty: null + The Limits of Change: null + A New Conception of America: null + Chapter Review: null + 22: + title: 'Chapter 22: Fighting for the Four Freedoms: World War II, 1941⠍1945' + real_page: null + sections: + Fighting World War II: null + The Home Front: null + Visions of Postwar Freedom: null + The American Dilemma: null + The End of the War: null + Chapter Review: null + 23: + title: 'Chapter 23: The United States and the Cold War, 1945⠍1953' + real_page: null + sections: + Origins of the Cold War: null + The Cold War and the Idea of Freedom: null + The Truman Presidency: null + The Anticommunist Crusade: null + Chapter Review: null + 24: + title: 'Chapter 24: An Affluent Society, 1953⠍1960' + real_page: null + sections: + The Golden Age: null + The Eisenhower Era: null + The Freedom Movement: null + The Election of 1960: null + Chapter Review: null + 25: + title: 'Chapter 25: The Sixties, 1960⠍1968' + real_page: null + sections: + The Civil Rights Revolution: null + The Kennedy Years: null + Lyndon Johnson⠒s Presidency: null + The Changing Black Movement: 
null + Vietnam and the New Left: null + The New Movements and the Rights Revolution: null + '1968': null + Chapter Review: null + 26: + title: 'Chapter 26: The Conservative Turn, 1969⠍1988' + real_page: null + sections: + President Nixon: null + Grassroots Rights Movements: null + Foreign Policy and Watergate: null + The End of the Golden Age: null + The Rising Tide of Conservatism: null + The Reagan Revolution: null + Chapter Review: null + 27: + title: 'Chapter 27: A New World Order, 1989⠍2004' + real_page: null + sections: + The Post⠍Cold War World: null + Globalization and Its Discontents: null + Culture Wars: null + Impeachment and the Election of 2000: null + The Attacks of September 11: null + The War on Terrorism: null + An American Empire?: null + The Aftermath of September 11 at Home: null + Chapter Review: null + 28: + title: 'Chapter 28: A Divided Nation' + real_page: null + sections: + The Winds of Change: null + The Great Recession: null + Obama in Office: null + The Obama Presidency: null + President Trump: null + '2020: Year of Crisis': null + Freedom in the Twenty-First Century: null + Chapter Review: null diff --git a/notebooks/pdf_parse.ipynb b/notebooks/pdf_parse.ipynb new file mode 100644 index 0000000..b22a860 --- /dev/null +++ b/notebooks/pdf_parse.ipynb @@ -0,0 +1,667 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "e91fd8c7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Hello, World!\n" + ] + } + ], + "source": [ + "print(\"Hello, World!\")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "11896305", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[1, 'Half-title Page', 2]\n", + "[1, 'Physical/Political Map of The United States', 5]\n", + "[1, 'Political Map of The World', 6]\n", + "[1, 'Title Page', 7]\n", + "[1, 'Copyright', 10]\n", + "[1, 'Dedication', 13]\n", + "[1, 'Contents', 
14]\n", + "[1, 'List of Maps, Tables, and Figures', 22]\n", + "[1, 'About the Authors', 32]\n", + "[1, 'Preface', 34]\n", + "[1, 'Resources For Students And Instructors', 54]\n", + "[1, 'Chapter 1: Old Worlds and New', 59]\n", + "[1, 'An Old World: North America', 63]\n", + "[1, 'An Old World: West Africa', 73]\n", + "[1, 'An Old World: Western Europe', 75]\n", + "[1, 'Contact', 80]\n", + "[1, 'The Spanish Empire', 88]\n", + "[1, 'The French and Dutch Empires', 108]\n", + "[1, 'Chapter Review', 120]\n", + "[1, 'Chapter 2: European Colonies and Native Nations, 1600⠍1660', 124]\n", + "[1, 'England and the Americas', 129]\n", + "[1, 'Early English Exploration and Colonization', 138]\n", + "[1, 'The Chesapeake', 142]\n", + "[1, 'Origins of American Slavery', 150]\n", + "[1, 'The New England Way', 157]\n", + "[1, 'New Englanders Divided', 169]\n", + "[1, 'Religion, Politics, and Freedom', 180]\n", + "[1, 'Chapter Review', 188]\n", + "[1, 'Chapter 3: Creating Anglo-America, 1660⠍1750', 193]\n", + "[1, 'Global Competition and the Expansion of England⠒s Empire', 197]\n", + "[1, 'Entrenchment of American Slavery', 206]\n", + "[1, 'Colonies in Crisis', 216]\n", + "[1, 'The Growth of Colonial America', 223]\n", + "[1, 'Social Classes in the British Colonies', 238]\n", + "[1, 'North America at Mid-Century', 246]\n", + "[1, 'Chapter Review', 249]\n", + "[1, 'Chapter 4: Slavery, Freedom, and the Struggle for Empire to 1763', 253]\n", + "[1, 'Slavery and Empire', 257]\n", + "[1, 'Slave Cultures and Slave Resistance', 274]\n", + "[1, 'An Empire of Freedom', 280]\n", + "[1, 'The Public Sphere', 285]\n", + "[1, 'The Great Awakening', 294]\n", + "[1, 'Imperial Rivalries', 298]\n", + "[1, 'Battle for the Continent', 306]\n", + "[1, 'Chapter Review', 320]\n", + "[1, 'Chapter 5: The American Revolution, 1763⠍1783', 325]\n", + "[1, 'The Crisis Begins', 329]\n", + "[1, 'The Road to Revolution', 339]\n", + "[1, 'The Coming of Independence', 345]\n", + "[1, 'Securing Independence', 359]\n", 
+ "[1, 'Chapter Review', 375]\n", + "[1, 'Chapter 6: The Revolution Within', 381]\n", + "[1, 'Democratizing Freedom', 384]\n", + "[1, 'Toward Religious Toleration', 392]\n", + "[1, 'Defining Economic Freedom', 399]\n", + "[1, 'The Limits of Liberty', 404]\n", + "[1, 'Slavery and the Revolution', 410]\n", + "[1, 'Daughters of Liberty', 422]\n", + "[1, 'Chapter Review', 432]\n", + "[1, 'Chapter 7: Founding a Nation, 1783⠍1791', 435]\n", + "[1, 'America Under the Confederation', 439]\n", + "[1, 'A New Constitution', 450]\n", + "[1, 'The Ratification Debate and the Origin of the Bill of Rights', 460]\n", + "[1, '“We the Peopleâ€\\x9d', 472]\n", + "[1, 'Chapter Review', 486]\n", + "[1, 'Chapter 8: Securing the Republic, 1791⠍1815', 491]\n", + "[1, 'Politics in an Age of Passion', 494]\n", + "[1, 'The Adams Presidency', 508]\n", + "[1, 'Jefferson in Power', 522]\n", + "[1, 'The “Second War of Independenceâ€\\x9d', 531]\n", + "[1, 'Chapter Review', 542]\n", + "[1, 'Chapter 9: The Market Revolution, 1800⠍1840', 548]\n", + "[1, 'A New Economy', 552]\n", + "[1, 'The Rise of the West', 558]\n", + "[1, 'Market Society', 566]\n", + "[1, 'The Free Individual', 582]\n", + "[1, 'The Limits of Prosperity', 591]\n", + "[1, 'Chapter Review', 601]\n", + "[1, 'Chapter 10: Democracy in America, 1815⠍1840', 606]\n", + "[1, 'The Triumph of Democracy', 610]\n", + "[1, 'Nationalism and Its Discontents', 623]\n", + "[1, 'Nation, Section, and Party', 630]\n", + "[1, 'The Age of Jackson', 639]\n", + "[1, 'Indian Removal', 647]\n", + "[1, 'The Bank War and After', 657]\n", + "[1, 'Chapter Review', 664]\n", + "[1, 'Chapter 11: The Peculiar Institution', 669]\n", + "[1, 'The Old South', 672]\n", + "[1, 'Life Under Slavery', 690]\n", + "[1, 'Slave Culture', 704]\n", + "[1, 'Resistance to Slavery', 712]\n", + "[1, 'Chapter Review', 722]\n", + "[1, 'Chapter 12: An Age of Reform, 1820⠍1840', 725]\n", + "[1, 'The Reform Impulse', 728]\n", + "[1, 'The Crusade Against Slavery', 740]\n", + "[1, 'Black 
and White Abolitionism', 755]\n", + "[1, 'The Origins of Feminism', 761]\n", + "[1, 'Chapter Review', 775]\n", + "[1, 'Chapter 13: A House Divided, 1840⠍1861', 780]\n", + "[1, 'Fruits of Manifest Destiny', 783]\n", + "[1, 'A Dose of Arsenic', 803]\n", + "[1, 'The Rise of the Republican Party', 814]\n", + "[1, 'The Emergence of Lincoln', 821]\n", + "[1, 'The Impending Crisis', 837]\n", + "[1, 'Chapter Review', 844]\n", + "[1, 'Chapter 14: A New Birth of Freedom: The Civil War, 1861⠍1865', 849]\n", + "[1, 'The First Modern War', 853]\n", + "[1, 'The Coming of Emancipation', 864]\n", + "[1, 'The Second American Revolution', 876]\n", + "[1, 'The Confederate Nation', 891]\n", + "[1, 'Turning Points', 900]\n", + "[1, 'Rehearsals for Reconstruction and the End of the War', 904]\n", + "[1, 'Chapter Review', 912]\n", + "[1, 'Chapter 15: “What Is Freedom?â€\\x9d: Reconstruction', 917]\n", + "[1, 'The Meaning of Freedom', 921]\n", + "[1, 'The Making of Radical Reconstruction', 938]\n", + "[1, 'Radical Reconstruction in the South', 956]\n", + "[1, 'The Overthrow of Reconstruction', 963]\n", + "[1, 'Chapter Review', 972]\n", + "[1, 'Chapter 16: America⠒s Gilded Age, 1870⠍1890', 976]\n", + "[1, 'The Second Industrial Revolution', 980]\n", + "[1, 'Freedom in the Gilded Age', 992]\n", + "[1, 'Labor and the Republic', 999]\n", + "[1, 'The Transformation of the West', 1009]\n", + "[1, 'Politics in a Gilded Age', 1032]\n", + "[1, 'Chapter Review', 1039]\n", + "[1, 'Chapter 17: Freedom⠒s Boundaries, at Home and Abroad, 1890⠍1900', 1044]\n", + "[1, 'The Populist Challenge', 1048]\n", + "[1, 'The Segregated South', 1059]\n", + "[1, 'Redrawing the Boundaries', 1075]\n", + "[1, 'Becoming a World Power', 1082]\n", + "[1, 'Chapter Review', 1101]\n", + "[1, 'Chapter 18: The Progressive Era, 1900⠍1916', 1106]\n", + "[1, 'An Urban Age and a Consumer Society', 1111]\n", + "[1, 'Varieties of Progressivism', 1128]\n", + "[1, 'The Politics of Progressivism', 1144]\n", + "[1, 'The Progressive 
Presidents', 1158]\n", + "[1, 'Chapter Review', 1170]\n", + "[1, 'Chapter 19: Safe for Democracy: The United States and World War I', 1176]\n", + "[1, 'An Era of Intervention', 1181]\n", + "[1, 'America and the Great War', 1189]\n", + "[1, 'The War at Home', 1195]\n", + "[1, 'Who Is an American?', 1210]\n", + "[1, '1919', 1227]\n", + "[1, 'Chapter Review', 1239]\n", + "[1, 'Chapter 20: From Business Culture to Great Depression: The Twenties, 1920⠍1932', 1244]\n", + "[1, 'The Business of America', 1248]\n", + "[1, 'Business and Government', 1258]\n", + "[1, 'The Birth of Civil Liberties', 1267]\n", + "[1, 'The Culture Wars', 1273]\n", + "[1, 'The Great Depression', 1290]\n", + "[1, 'Chapter Review', 1298]\n", + "[1, 'Chapter 21: The New Deal, 1932⠍1940', 1303]\n", + "[1, 'The First New Deal', 1308]\n", + "[1, 'The Grassroots Revolt', 1321]\n", + "[1, 'The Second New Deal', 1328]\n", + "[1, 'A Reckoning With Liberty', 1333]\n", + "[1, 'The Limits of Change', 1343]\n", + "[1, 'A New Conception of America', 1353]\n", + "[1, 'Chapter Review', 1362]\n", + "[1, 'Chapter 22: Fighting for the Four Freedoms: World War II, 1941⠍1945', 1368]\n", + "[1, 'Fighting World War II', 1374]\n", + "[1, 'The Home Front', 1386]\n", + "[1, 'Visions of Postwar Freedom', 1398]\n", + "[1, 'The American Dilemma', 1403]\n", + "[1, 'The End of the War', 1424]\n", + "[1, 'Chapter Review', 1432]\n", + "[1, 'Chapter 23: The United States and the Cold War, 1945⠍1953', 1437]\n", + "[1, 'Origins of the Cold War', 1442]\n", + "[1, 'The Cold War and the Idea of Freedom', 1456]\n", + "[1, 'The Truman Presidency', 1463]\n", + "[1, 'The Anticommunist Crusade', 1471]\n", + "[1, 'Chapter Review', 1488]\n", + "[1, 'Chapter 24: An Affluent Society, 1953⠍1960', 1493]\n", + "[1, 'The Golden Age', 1497]\n", + "[1, 'The Eisenhower Era', 1519]\n", + "[1, 'The Freedom Movement', 1533]\n", + "[1, 'The Election of 1960', 1548]\n", + "[1, 'Chapter Review', 1552]\n", + "[1, 'Chapter 25: The Sixties, 1960⠍1968', 
1557]\n", + "[1, 'The Civil Rights Revolution', 1561]\n", + "[1, 'The Kennedy Years', 1566]\n", + "[1, 'Lyndon Johnson⠒s Presidency', 1571]\n", + "[1, 'The Changing Black Movement', 1581]\n", + "[1, 'Vietnam and the New Left', 1586]\n", + "[1, 'The New Movements and the Rights Revolution', 1596]\n", + "[1, '1968', 1617]\n", + "[1, 'Chapter Review', 1622]\n", + "[1, 'Chapter 26: The Conservative Turn, 1969⠍1988', 1628]\n", + "[1, 'President Nixon', 1631]\n", + "[1, 'Grassroots Rights Movements', 1638]\n", + "[1, 'Foreign Policy and Watergate', 1643]\n", + "[1, 'The End of the Golden Age', 1654]\n", + "[1, 'The Rising Tide of Conservatism', 1667]\n", + "[1, 'The Reagan Revolution', 1677]\n", + "[1, 'Chapter Review', 1691]\n", + "[1, 'Chapter 27: A New World Order, 1989⠍2004', 1696]\n", + "[1, 'The Post⠍Cold War World', 1700]\n", + "[1, 'Globalization and Its Discontents', 1709]\n", + "[1, 'Culture Wars', 1720]\n", + "[1, 'Impeachment and the Election of 2000', 1743]\n", + "[1, 'The Attacks of September 11', 1747]\n", + "[1, 'The War on Terrorism', 1750]\n", + "[1, 'An American Empire?', 1754]\n", + "[1, 'The Aftermath of September 11 at Home', 1759]\n", + "[1, 'Chapter Review', 1764]\n", + "[1, 'Chapter 28: A Divided Nation', 1769]\n", + "[1, 'The Winds of Change', 1772]\n", + "[1, 'The Great Recession', 1780]\n", + "[1, 'Obama in Office', 1789]\n", + "[1, 'The Obama Presidency', 1798]\n", + "[1, 'President Trump', 1807]\n", + "[1, '2020: Year of Crisis', 1820]\n", + "[1, 'Freedom in the Twenty-First Century', 1831]\n", + "[1, 'Chapter Review', 1841]\n", + "[1, 'Suggested Reading', 1845]\n", + "[1, 'The Declaration of Independence (1776)', 1909]\n", + "[1, 'The Constitution of The United States (1787)', 1917]\n", + "[1, 'Glossary', 1943]\n", + "[1, 'Credits', 2008]\n", + "[1, 'Index', 2016]\n" + ] + } + ], + "source": [ + "import fitz\n", + "doc = fitz.open('../data/raw/textbook.pdf')\n", + "toc = doc.get_toc()\n", + "for item in toc:\n", + " print(item) # [level, 
title, pdf_page]" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "991dbad2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Sections to process: 204\n", + "[1, 'Chapter 1: Old Worlds and New', 59]\n", + "[1, 'An Old World: North America', 63]\n", + "[1, 'An Old World: West Africa', 73]\n", + "[1, 'An Old World: Western Europe', 75]\n", + "[1, 'Contact', 80]\n", + "[1, 'The Spanish Empire', 88]\n", + "[1, 'The French and Dutch Empires', 108]\n", + "[1, 'Chapter Review', 120]\n", + "[1, 'Chapter 2: European Colonies and Native Nations, 1600⠍1660', 124]\n", + "[1, 'England and the Americas', 129]\n", + "[1, 'Early English Exploration and Colonization', 138]\n", + "[1, 'The Chesapeake', 142]\n", + "[1, 'Origins of American Slavery', 150]\n", + "[1, 'The New England Way', 157]\n", + "[1, 'New Englanders Divided', 169]\n", + "[1, 'Religion, Politics, and Freedom', 180]\n", + "[1, 'Chapter Review', 188]\n", + "[1, 'Chapter 3: Creating Anglo-America, 1660⠍1750', 193]\n", + "[1, 'Global Competition and the Expansion of England⠒s Empire', 197]\n", + "[1, 'Entrenchment of American Slavery', 206]\n", + "[1, 'Colonies in Crisis', 216]\n", + "[1, 'The Growth of Colonial America', 223]\n", + "[1, 'Social Classes in the British Colonies', 238]\n", + "[1, 'North America at Mid-Century', 246]\n", + "[1, 'Chapter Review', 249]\n", + "[1, 'Chapter 4: Slavery, Freedom, and the Struggle for Empire to 1763', 253]\n", + "[1, 'Slavery and Empire', 257]\n", + "[1, 'Slave Cultures and Slave Resistance', 274]\n", + "[1, 'An Empire of Freedom', 280]\n", + "[1, 'The Public Sphere', 285]\n", + "[1, 'The Great Awakening', 294]\n", + "[1, 'Imperial Rivalries', 298]\n", + "[1, 'Battle for the Continent', 306]\n", + "[1, 'Chapter Review', 320]\n", + "[1, 'Chapter 5: The American Revolution, 1763⠍1783', 325]\n", + "[1, 'The Crisis Begins', 329]\n", + "[1, 'The Road to Revolution', 339]\n", + "[1, 'The Coming of Independence', 
345]\n", + "[1, 'Securing Independence', 359]\n", + "[1, 'Chapter Review', 375]\n", + "[1, 'Chapter 6: The Revolution Within', 381]\n", + "[1, 'Democratizing Freedom', 384]\n", + "[1, 'Toward Religious Toleration', 392]\n", + "[1, 'Defining Economic Freedom', 399]\n", + "[1, 'The Limits of Liberty', 404]\n", + "[1, 'Slavery and the Revolution', 410]\n", + "[1, 'Daughters of Liberty', 422]\n", + "[1, 'Chapter Review', 432]\n", + "[1, 'Chapter 7: Founding a Nation, 1783⠍1791', 435]\n", + "[1, 'America Under the Confederation', 439]\n", + "[1, 'A New Constitution', 450]\n", + "[1, 'The Ratification Debate and the Origin of the Bill of Rights', 460]\n", + "[1, '“We the Peopleâ€\\x9d', 472]\n", + "[1, 'Chapter Review', 486]\n", + "[1, 'Chapter 8: Securing the Republic, 1791⠍1815', 491]\n", + "[1, 'Politics in an Age of Passion', 494]\n", + "[1, 'The Adams Presidency', 508]\n", + "[1, 'Jefferson in Power', 522]\n", + "[1, 'The “Second War of Independenceâ€\\x9d', 531]\n", + "[1, 'Chapter Review', 542]\n", + "[1, 'Chapter 9: The Market Revolution, 1800⠍1840', 548]\n", + "[1, 'A New Economy', 552]\n", + "[1, 'The Rise of the West', 558]\n", + "[1, 'Market Society', 566]\n", + "[1, 'The Free Individual', 582]\n", + "[1, 'The Limits of Prosperity', 591]\n", + "[1, 'Chapter Review', 601]\n", + "[1, 'Chapter 10: Democracy in America, 1815⠍1840', 606]\n", + "[1, 'The Triumph of Democracy', 610]\n", + "[1, 'Nationalism and Its Discontents', 623]\n", + "[1, 'Nation, Section, and Party', 630]\n", + "[1, 'The Age of Jackson', 639]\n", + "[1, 'Indian Removal', 647]\n", + "[1, 'The Bank War and After', 657]\n", + "[1, 'Chapter Review', 664]\n", + "[1, 'Chapter 11: The Peculiar Institution', 669]\n", + "[1, 'The Old South', 672]\n", + "[1, 'Life Under Slavery', 690]\n", + "[1, 'Slave Culture', 704]\n", + "[1, 'Resistance to Slavery', 712]\n", + "[1, 'Chapter Review', 722]\n", + "[1, 'Chapter 12: An Age of Reform, 1820⠍1840', 725]\n", + "[1, 'The Reform Impulse', 728]\n", + "[1, 'The 
Crusade Against Slavery', 740]\n", + "[1, 'Black and White Abolitionism', 755]\n", + "[1, 'The Origins of Feminism', 761]\n", + "[1, 'Chapter Review', 775]\n", + "[1, 'Chapter 13: A House Divided, 1840⠍1861', 780]\n", + "[1, 'Fruits of Manifest Destiny', 783]\n", + "[1, 'A Dose of Arsenic', 803]\n", + "[1, 'The Rise of the Republican Party', 814]\n", + "[1, 'The Emergence of Lincoln', 821]\n", + "[1, 'The Impending Crisis', 837]\n", + "[1, 'Chapter Review', 844]\n", + "[1, 'Chapter 14: A New Birth of Freedom: The Civil War, 1861⠍1865', 849]\n", + "[1, 'The First Modern War', 853]\n", + "[1, 'The Coming of Emancipation', 864]\n", + "[1, 'The Second American Revolution', 876]\n", + "[1, 'The Confederate Nation', 891]\n", + "[1, 'Turning Points', 900]\n", + "[1, 'Rehearsals for Reconstruction and the End of the War', 904]\n", + "[1, 'Chapter Review', 912]\n", + "[1, 'Chapter 15: “What Is Freedom?â€\\x9d: Reconstruction', 917]\n", + "[1, 'The Meaning of Freedom', 921]\n", + "[1, 'The Making of Radical Reconstruction', 938]\n", + "[1, 'Radical Reconstruction in the South', 956]\n", + "[1, 'The Overthrow of Reconstruction', 963]\n", + "[1, 'Chapter Review', 972]\n", + "[1, 'Chapter 16: America⠒s Gilded Age, 1870⠍1890', 976]\n", + "[1, 'The Second Industrial Revolution', 980]\n", + "[1, 'Freedom in the Gilded Age', 992]\n", + "[1, 'Labor and the Republic', 999]\n", + "[1, 'The Transformation of the West', 1009]\n", + "[1, 'Politics in a Gilded Age', 1032]\n", + "[1, 'Chapter Review', 1039]\n", + "[1, 'Chapter 17: Freedom⠒s Boundaries, at Home and Abroad, 1890⠍1900', 1044]\n", + "[1, 'The Populist Challenge', 1048]\n", + "[1, 'The Segregated South', 1059]\n", + "[1, 'Redrawing the Boundaries', 1075]\n", + "[1, 'Becoming a World Power', 1082]\n", + "[1, 'Chapter Review', 1101]\n", + "[1, 'Chapter 18: The Progressive Era, 1900⠍1916', 1106]\n", + "[1, 'An Urban Age and a Consumer Society', 1111]\n", + "[1, 'Varieties of Progressivism', 1128]\n", + "[1, 'The Politics of 
Progressivism', 1144]\n", + "[1, 'The Progressive Presidents', 1158]\n", + "[1, 'Chapter Review', 1170]\n", + "[1, 'Chapter 19: Safe for Democracy: The United States and World War I', 1176]\n", + "[1, 'An Era of Intervention', 1181]\n", + "[1, 'America and the Great War', 1189]\n", + "[1, 'The War at Home', 1195]\n", + "[1, 'Who Is an American?', 1210]\n", + "[1, '1919', 1227]\n", + "[1, 'Chapter Review', 1239]\n", + "[1, 'Chapter 20: From Business Culture to Great Depression: The Twenties, 1920⠍1932', 1244]\n", + "[1, 'The Business of America', 1248]\n", + "[1, 'Business and Government', 1258]\n", + "[1, 'The Birth of Civil Liberties', 1267]\n", + "[1, 'The Culture Wars', 1273]\n", + "[1, 'The Great Depression', 1290]\n", + "[1, 'Chapter Review', 1298]\n", + "[1, 'Chapter 21: The New Deal, 1932⠍1940', 1303]\n", + "[1, 'The First New Deal', 1308]\n", + "[1, 'The Grassroots Revolt', 1321]\n", + "[1, 'The Second New Deal', 1328]\n", + "[1, 'A Reckoning With Liberty', 1333]\n", + "[1, 'The Limits of Change', 1343]\n", + "[1, 'A New Conception of America', 1353]\n", + "[1, 'Chapter Review', 1362]\n", + "[1, 'Chapter 22: Fighting for the Four Freedoms: World War II, 1941⠍1945', 1368]\n", + "[1, 'Fighting World War II', 1374]\n", + "[1, 'The Home Front', 1386]\n", + "[1, 'Visions of Postwar Freedom', 1398]\n", + "[1, 'The American Dilemma', 1403]\n", + "[1, 'The End of the War', 1424]\n", + "[1, 'Chapter Review', 1432]\n", + "[1, 'Chapter 23: The United States and the Cold War, 1945⠍1953', 1437]\n", + "[1, 'Origins of the Cold War', 1442]\n", + "[1, 'The Cold War and the Idea of Freedom', 1456]\n", + "[1, 'The Truman Presidency', 1463]\n", + "[1, 'The Anticommunist Crusade', 1471]\n", + "[1, 'Chapter Review', 1488]\n", + "[1, 'Chapter 24: An Affluent Society, 1953⠍1960', 1493]\n", + "[1, 'The Golden Age', 1497]\n", + "[1, 'The Eisenhower Era', 1519]\n", + "[1, 'The Freedom Movement', 1533]\n", + "[1, 'The Election of 1960', 1548]\n", + "[1, 'Chapter Review', 1552]\n", + 
"[1, 'Chapter 25: The Sixties, 1960⠍1968', 1557]\n", + "[1, 'The Civil Rights Revolution', 1561]\n", + "[1, 'The Kennedy Years', 1566]\n", + "[1, 'Lyndon Johnson⠒s Presidency', 1571]\n", + "[1, 'The Changing Black Movement', 1581]\n", + "[1, 'Vietnam and the New Left', 1586]\n", + "[1, 'The New Movements and the Rights Revolution', 1596]\n", + "[1, '1968', 1617]\n", + "[1, 'Chapter Review', 1622]\n", + "[1, 'Chapter 26: The Conservative Turn, 1969⠍1988', 1628]\n", + "[1, 'President Nixon', 1631]\n", + "[1, 'Grassroots Rights Movements', 1638]\n", + "[1, 'Foreign Policy and Watergate', 1643]\n", + "[1, 'The End of the Golden Age', 1654]\n", + "[1, 'The Rising Tide of Conservatism', 1667]\n", + "[1, 'The Reagan Revolution', 1677]\n", + "[1, 'Chapter Review', 1691]\n", + "[1, 'Chapter 27: A New World Order, 1989⠍2004', 1696]\n", + "[1, 'The Post⠍Cold War World', 1700]\n", + "[1, 'Globalization and Its Discontents', 1709]\n", + "[1, 'Culture Wars', 1720]\n", + "[1, 'Impeachment and the Election of 2000', 1743]\n", + "[1, 'The Attacks of September 11', 1747]\n", + "[1, 'The War on Terrorism', 1750]\n", + "[1, 'An American Empire?', 1754]\n", + "[1, 'The Aftermath of September 11 at Home', 1759]\n", + "[1, 'Chapter Review', 1764]\n", + "[1, 'Chapter 28: A Divided Nation', 1769]\n", + "[1, 'The Winds of Change', 1772]\n", + "[1, 'The Great Recession', 1780]\n", + "[1, 'Obama in Office', 1789]\n", + "[1, 'The Obama Presidency', 1798]\n", + "[1, 'President Trump', 1807]\n", + "[1, '2020: Year of Crisis', 1820]\n", + "[1, 'Freedom in the Twenty-First Century', 1831]\n", + "[1, 'Chapter Review', 1841]\n" + ] + } + ], + "source": [ + "# Find where Chapter 1 starts and throw away everything before it\n", + "start_index = next(i for i, item in enumerate(toc) if 'Chapter 1' in item[1])\n", + "chapters_toc = toc[start_index:]\n", + "\n", + "# Also throw away back matter (Suggested Reading, Glossary, Index etc.)\n", + "end_titles = {'Suggested Reading', 'The Declaration of 
Independence (1776)', \n", + " 'The Constitution of The United States (1787)', 'Glossary', \n", + " 'Credits', 'Index'}\n", + "chapters_toc = [item for item in chapters_toc if item[1] not in end_titles]\n", + "\n", + "print(f\"Sections to process: {len(chapters_toc)}\")\n", + "for item in chapters_toc:\n", + " print(item)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "43e20197", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'title': 'Chapter 1: Old Worlds and New', 'chapter_num': 1, 'chapter_title': 'Chapter 1: Old Worlds and New', 'is_chapter_header': True, 'start_pdf': 58, 'end_pdf': 61}\n", + "{'title': 'An Old World: North America', 'chapter_num': 1, 'chapter_title': 'Chapter 1: Old Worlds and New', 'is_chapter_header': False, 'start_pdf': 62, 'end_pdf': 71}\n", + "{'title': 'An Old World: West Africa', 'chapter_num': 1, 'chapter_title': 'Chapter 1: Old Worlds and New', 'is_chapter_header': False, 'start_pdf': 72, 'end_pdf': 73}\n", + "{'title': 'An Old World: Western Europe', 'chapter_num': 1, 'chapter_title': 'Chapter 1: Old Worlds and New', 'is_chapter_header': False, 'start_pdf': 74, 'end_pdf': 78}\n", + "{'title': 'Contact', 'chapter_num': 1, 'chapter_title': 'Chapter 1: Old Worlds and New', 'is_chapter_header': False, 'start_pdf': 79, 'end_pdf': 86}\n", + "{'title': 'The Spanish Empire', 'chapter_num': 1, 'chapter_title': 'Chapter 1: Old Worlds and New', 'is_chapter_header': False, 'start_pdf': 87, 'end_pdf': 106}\n", + "{'title': 'The French and Dutch Empires', 'chapter_num': 1, 'chapter_title': 'Chapter 1: Old Worlds and New', 'is_chapter_header': False, 'start_pdf': 107, 'end_pdf': 118}\n", + "{'title': 'Chapter Review', 'chapter_num': 1, 'chapter_title': 'Chapter 1: Old Worlds and New', 'is_chapter_header': False, 'start_pdf': 119, 'end_pdf': 122}\n", + "{'title': 'Chapter 2: European Colonies and Native Nations, 1600⠍1660', 'chapter_num': 2, 'chapter_title': 'Chapter 2: 
European Colonies and Native Nations, 1600⠍1660', 'is_chapter_header': True, 'start_pdf': 123, 'end_pdf': 127}\n", + "{'title': 'England and the Americas', 'chapter_num': 2, 'chapter_title': 'Chapter 2: European Colonies and Native Nations, 1600⠍1660', 'is_chapter_header': False, 'start_pdf': 128, 'end_pdf': 136}\n", + "{'title': 'Early English Exploration and Colonization', 'chapter_num': 2, 'chapter_title': 'Chapter 2: European Colonies and Native Nations, 1600⠍1660', 'is_chapter_header': False, 'start_pdf': 137, 'end_pdf': 140}\n", + "{'title': 'The Chesapeake', 'chapter_num': 2, 'chapter_title': 'Chapter 2: European Colonies and Native Nations, 1600⠍1660', 'is_chapter_header': False, 'start_pdf': 141, 'end_pdf': 148}\n" + ] + } + ], + "source": [ + "import re\n", + "\n", + "def parse_chapter_num(title):\n", + " match = re.match(r'Chapter (\\d+):', title)\n", + " return int(match.group(1)) if match else None\n", + "\n", + "structured = []\n", + "current_chapter_num = None\n", + "current_chapter_title = None\n", + "\n", + "for i, item in enumerate(chapters_toc):\n", + " title = item[1]\n", + " start_pdf = item[2] - 1 # 0-indexed\n", + " end_pdf = (chapters_toc[i + 1][2] - 2) if i + 1 < len(chapters_toc) else doc.page_count - 1\n", + "\n", + " chapter_num = parse_chapter_num(title)\n", + "\n", + " if chapter_num:\n", + " # This entry IS a chapter\n", + " current_chapter_num = chapter_num\n", + " current_chapter_title = title\n", + " is_chapter_header = True\n", + " else:\n", + " is_chapter_header = False\n", + "\n", + " structured.append({\n", + " \"title\": title,\n", + " \"chapter_num\": current_chapter_num,\n", + " \"chapter_title\": current_chapter_title,\n", + " \"is_chapter_header\": is_chapter_header,\n", + " \"start_pdf\": start_pdf,\n", + " \"end_pdf\": end_pdf,\n", + " })\n", + "\n", + "# Sanity check\n", + "for s in structured[:12]:\n", + " print(s)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "149bc714", + "metadata": {}, + 
"outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[1, 'Half-title Page', 2, {'kind': 1, 'xref': 63504, 'page': 1, 'to': Point(76.47846, 86.92822), 'zoom': 0.0}]\n", + "[1, 'Physical/Political Map of The United States', 5, {'kind': 1, 'xref': 63507, 'page': 4, 'to': Point(76.18479, 90.134159), 'zoom': 0.0}]\n", + "[1, 'Political Map of The World', 6, {'kind': 1, 'xref': 63509, 'page': 5, 'to': Point(76.18479, 90.134159), 'zoom': 0.0}]\n", + "[1, 'Title Page', 7, {'kind': 1, 'xref': 63511, 'page': 6, 'to': Point(76.47846, 86.92822), 'zoom': 0.0}]\n", + "[1, 'Copyright', 10, {'kind': 1, 'xref': 63513, 'page': 9, 'to': Point(76.5, 87.0), 'zoom': 0.0}]\n", + "[1, 'Dedication', 13, {'kind': 1, 'xref': 63515, 'page': 12, 'to': Point(76.5, 87.0), 'zoom': 0.0}]\n", + "[1, 'Contents', 14, {'kind': 1, 'xref': 63517, 'page': 13, 'to': Point(76.5, 91.5), 'zoom': 0.0}]\n", + "[1, 'List of Maps, Tables, and Figures', 22, {'kind': 1, 'xref': 63519, 'page': 21, 'to': Point(76.5, 91.5), 'zoom': 0.0}]\n", + "[1, 'About the Authors', 32, {'kind': 1, 'xref': 63521, 'page': 31, 'to': Point(76.47846, 91.40668), 'zoom': 0.0}]\n", + "[1, 'Preface', 34, {'kind': 1, 'xref': 63523, 'page': 33, 'to': Point(76.5, 91.5), 'zoom': 0.0}]\n", + "[1, 'Resources For Students And Instructors', 54, {'kind': 1, 'xref': 63525, 'page': 53, 'to': Point(76.18479, 90.134159), 'zoom': 0.0}]\n", + "[1, 'Chapter 1: Old Worlds and New', 59, {'kind': 1, 'xref': 63527, 'page': 58, 'to': Point(76.5, 91.5), 'zoom': 0.0}]\n", + "[1, 'An Old World: North America', 63, {'kind': 1, 'xref': 63529, 'page': 62, 'to': Point(76.18479, 90.134159), 'zoom': 0.0}]\n", + "[1, 'An Old World: West Africa', 73, {'kind': 1, 'xref': 63531, 'page': 72, 'to': Point(76.5, 91.5), 'zoom': 0.0}]\n", + "[1, 'An Old World: Western Europe', 75, {'kind': 1, 'xref': 63533, 'page': 74, 'to': Point(76.5, 91.5), 'zoom': 0.0}]\n" + ] + } + ], + "source": [ + "toc_full = doc.get_toc(simple=False)\n", + "\n", + "for item in 
toc_full[:15]:\n",
+ "    print(item)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "id": "c2563864",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Generated → /home/keshav/code/apush-rag/config/page_map.yaml\n",
+ "Now open that file and fill in the real page numbers. Leave as null if unknown.\n"
+ ]
+ }
+ ],
+ "source": [
+ "import yaml\n",
+ "from pathlib import Path\n",
+ "\n",
+ "# Writes a page_map.yaml skeleton (chapter -> sections, every page set to null) from the\n",
+ "# records in structured, so the printed page numbers can be filled in by hand afterwards.\n",
+ "# NOTE(review): the project root is taken as the parent of the current working directory,\n",
+ "# which assumes the kernel was started inside the notebooks/ folder - confirm.\n",
+ "project_root = Path().resolve().parent\n",
+ "output_path = project_root / \"config\" / \"page_map.yaml\"\n",
+ "# Create config/ when it is missing so the write below cannot fail on a fresh checkout.\n",
+ "output_path.parent.mkdir(parents=True, exist_ok=True)\n",
+ "\n",
+ "page_map = {\"chapters\": {}}\n",
+ "\n",
+ "for section in structured:\n",
+ "    ch_num = section[\"chapter_num\"]\n",
+ "    ch_title = section[\"chapter_title\"]\n",
+ "    title = section[\"title\"]\n",
+ "\n",
+ "    # Initialize chapter entry if first time seeing it\n",
+ "    if ch_num not in page_map[\"chapters\"]:\n",
+ "        page_map[\"chapters\"][ch_num] = {\n",
+ "            \"title\": ch_title,\n",
+ "            \"real_page\": None,  # ← you fill this in\n",
+ "            \"sections\": {}\n",
+ "        }\n",
+ "\n",
+ "    # Add section with null page — you fill these in\n",
+ "    if not section[\"is_chapter_header\"]:\n",
+ "        page_map[\"chapters\"][ch_num][\"sections\"][title] = None\n",
+ "\n",
+ "# Explicit UTF-8: allow_unicode=True writes non-ASCII titles verbatim, and the platform\n",
+ "# default encoding is not guaranteed to be able to represent them.\n",
+ "with open(output_path, \"w\", encoding=\"utf-8\") as f:\n",
+ "    yaml.dump(page_map, f, allow_unicode=True, sort_keys=False, default_flow_style=False)\n",
+ "\n",
+ "print(f\"Generated → {output_path}\")\n",
+ "print(\"Now open that file and fill in the real page numbers. Leave as null if unknown.\")"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": ".venv",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.14.3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}