kindle_OCR/scan_all_pages.py

#!/usr/bin/env python3
"""
Scan all pages: capture every page from 123 through 226 without stopping early.
All pages are captured regardless of duplicates, as the user specifically requested.
"""

import asyncio
from playwright.async_api import async_playwright
from pathlib import Path
import time
import json

def _write_progress(payload, path="scan_progress.json"):
    """Write a progress record to ``path`` as indented JSON, replacing any old file."""
    with open(path, "w", encoding="utf-8") as f:
        json.dump(payload, f, indent=2)


async def scan_all_pages(start_page: int = 123, total_pages: int = 226):
    """
    Screenshot every page of the book from ``start_page`` to ``total_pages``.

    Launches a headed Chromium browser with the saved login session, opens
    the Kindle web reader, presses ArrowRight ``start_page - 1`` times to
    reach the first page to capture (this assumes the reader opens on
    page 1), then saves one screenshot per page to
    ``scanned_pages/page_NNN.png``. No duplicate detection is performed.
    A summary is written to ``scan_progress.json`` both on success and when
    an error interrupts the scan.

    Args:
        start_page: Number of the first page to capture.
        total_pages: Number of the last page to capture (inclusive).

    Returns:
        ``False`` if ``kindle_session_state.json`` does not exist.
        ``total_pages`` if the scan ran to completion.
        Otherwise the number of the last page captured before the error
        (``start_page - 1`` when nothing was captured).
    """
    storage_state_path = "kindle_session_state.json"

    if not Path(storage_state_path).exists():
        print("❌ No session state found.")
        return False

    # Must exist before the try block: the except handler reads it, and a
    # failure during load/navigation would otherwise raise UnboundLocalError
    # from inside the handler and hide the real error.
    pages_captured = 0

    async with async_playwright() as p:
        browser = await p.chromium.launch(
            headless=False,
            args=[
                "--disable-blink-features=AutomationControlled",
                "--disable-web-security",
                "--disable-features=VizDisplayCompositor"
            ]
        )

        context = await browser.new_context(
            storage_state=storage_state_path,
            viewport={"width": 1920, "height": 1080},
            user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        )

        page = await context.new_page()

        try:
            print(f"🚀 SCANNING ALL PAGES: {start_page} to {total_pages}")
            print("📋 User requested: COMPLETE BOOK - NO EARLY STOPPING")
            print("=" * 60)

            # Load book
            await page.goto("https://read.amazon.com/?asin=B0DJP2C8M6&ref_=kwl_kr_iv_rec_1")
            await page.wait_for_timeout(5000)

            # Navigate to start page: one ArrowRight per page to skip
            print(f"🎯 Navigating to page {start_page}...")
            for i in range(start_page - 1):
                await page.keyboard.press("ArrowRight")
                if i % 30 == 29:
                    print(f"   📍 Navigated {i + 1} pages...")
                await page.wait_for_timeout(100)  # Fast navigation

            print(f"   ✅ Reached page {start_page}")

            # Scan ALL remaining pages - NO STOPPING
            output_dir = Path("scanned_pages")
            output_dir.mkdir(exist_ok=True)

            print(f"📸 SCANNING ALL PAGES {start_page} to {total_pages}...")
            print("⚠️  NO DUPLICATE DETECTION - CAPTURING EVERYTHING")

            for page_num in range(start_page, total_pages + 1):
                print(f"📸 Scanning page {page_num}/{total_pages}...")

                filename = output_dir / f"page_{page_num:03d}.png"
                await page.screenshot(path=str(filename))

                file_size = filename.stat().st_size
                print(f"   ✅ Captured ({file_size} bytes)")

                pages_captured += 1

                # Progress reports
                if page_num % 20 == 0:
                    progress = (page_num / total_pages) * 100
                    print(f"📊 MAJOR PROGRESS: {page_num}/{total_pages} ({progress:.1f}%)")

                if page_num % 50 == 0:
                    print(f"🎯 MILESTONE: {pages_captured} pages captured so far!")

                # Navigate to next page (except last)
                if page_num < total_pages:
                    await page.keyboard.press("ArrowRight")
                    await page.wait_for_timeout(800)  # Reliable timing

            # Final progress save
            _write_progress({
                "last_completed_page": total_pages,
                "total_pages": total_pages,
                "completed_percentage": 100.0,
                "timestamp": time.time(),
                "session_state_file": storage_state_path,
                "scan_complete": True,
                "all_pages_captured": True
            })

            print("\n🎉 ALL PAGES SCANNING COMPLETED!")
            print(f"📊 FINAL RESULT: ALL {total_pages} pages captured")
            print("📈 Completion: 100%")
            print("✅ COMPLETE BOOK SUCCESSFULLY SCANNED!")

            return total_pages

        except Exception as e:
            print(f"❌ Scanning error: {e}")
            import traceback
            traceback.print_exc()

            # Save partial progress so a later run knows where this one stopped
            last_page = start_page + pages_captured - 1
            # Avoid ZeroDivisionError masking the original error
            percentage = (last_page / total_pages) * 100 if total_pages else 0.0
            _write_progress({
                "last_completed_page": last_page,
                "total_pages": total_pages,
                "completed_percentage": percentage,
                "timestamp": time.time(),
                "session_state_file": storage_state_path,
                "scan_complete": False,
                "error_occurred": True
            })

            return last_page
        finally:
            await browser.close()

if __name__ == "__main__":
    # Last page of the book. Passed explicitly so the completion check below
    # always compares against the same value the scanner was given.
    book_pages = 226
    result = asyncio.run(scan_all_pages(total_pages=book_pages))

    # scan_all_pages returns False (not a page number) when the saved session
    # file is missing; report that as a failure instead of formatting it as
    # a page count.
    if result is False:
        print("\n🏁 FINAL RESULT: scan did not start (no session state)")
        raise SystemExit(1)

    print(f"\n🏁 FINAL RESULT: {result} pages total captured")

    if result >= book_pages:
        print(f"🎉 SUCCESS: Complete {book_pages}-page book captured!")
    else:
        print(f"📊 Progress: {result}/{book_pages} pages captured")