# kindle_OCR/scan_beginning.py

#!/usr/bin/env python3
"""
SCAN BEGINNING - Pages 1-64 to complete the book
"""

import asyncio
from playwright.async_api import async_playwright
from pathlib import Path
import time
import json

async def scan_beginning_pages(start_page: int = 1, end_page: int = 64):
    """
    Screenshot pages ``start_page``..``end_page`` of the book in the Kindle
    web reader, saving each one as ``scanned_pages/page_NNN.png``.

    The reader is rewound to the start of the book, advanced to
    ``start_page`` (one ArrowRight per page), and then one full-viewport
    screenshot is taken per page, pressing ArrowRight between captures.

    Args:
        start_page: Number of the first page to capture (1-based).
        end_page: Number of the last page to capture (inclusive).

    Returns:
        ``False`` if the saved session-state file does not exist; otherwise
        the number of pages captured. If an error interrupts the scan, the
        count of pages captured before the failure is returned.
    """
    storage_state_path = "kindle_session_state.json"

    if not Path(storage_state_path).exists():
        print("❌ No session state found.")
        return False

    # Bound before the ``try`` so the ``except`` handler can always report a
    # count, even when the failure happens before the first screenshot.
    pages_captured = 0
    total_pages = end_page - start_page + 1

    async with async_playwright() as p:
        # NOTE(review): --disable-web-security switches off same-origin
        # checks while a saved session is loaded — confirm it is required.
        browser = await p.chromium.launch(
            headless=False,
            args=[
                "--disable-blink-features=AutomationControlled",
                "--disable-web-security",
                "--disable-features=VizDisplayCompositor"
            ]
        )

        context = await browser.new_context(
            storage_state=storage_state_path,
            viewport={"width": 1920, "height": 1080},
            user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        )

        page = await context.new_page()

        try:
            print(f"🚀 SCANNING BEGINNING: Pages {start_page} to {end_page}")
            print("=" * 50)

            # Load book
            await page.goto("https://read.amazon.com/?asin=B0DJP2C8M6&ref_=kwl_kr_iv_rec_1")
            await page.wait_for_timeout(5000)

            # Navigate to actual first page (page 1)
            print("🎯 Navigating to first page...")

            await page.keyboard.press("Home")  # Go to beginning
            await page.wait_for_timeout(2000)

            # Extra ArrowLeft presses in case "Home" did not land on the
            # very first page.
            for _ in range(10):
                await page.keyboard.press("ArrowLeft")
                await page.wait_for_timeout(100)

            print("✅ At beginning of book")

            # The reader is now on page 1; step forward so that the first
            # screenshot really is ``start_page`` and file names match pages.
            if start_page > 1:
                print(f"⏭️ Skipping ahead to page {start_page}...")
                for _ in range(start_page - 1):
                    await page.keyboard.press("ArrowRight")
                    await page.wait_for_timeout(800)

            output_dir = Path("scanned_pages")
            output_dir.mkdir(exist_ok=True)

            print(f"📸 SCANNING PAGES {start_page} to {end_page}...")

            for page_num in range(start_page, end_page + 1):
                print(f"📸 Scanning page {page_num}/{end_page}...")

                filename = output_dir / f"page_{page_num:03d}.png"
                await page.screenshot(path=str(filename))

                file_size = filename.stat().st_size
                print(f"   ✅ Captured ({file_size} bytes)")

                pages_captured += 1

                # Progress report every tenth page number, measured against
                # the size of the requested range rather than ``end_page``.
                if page_num % 10 == 0:
                    progress = (pages_captured / total_pages) * 100
                    print(f"📊 PROGRESS: {pages_captured}/{total_pages} ({progress:.1f}%)")

                # Navigate to next page (except last)
                if page_num < end_page:
                    await page.keyboard.press("ArrowRight")
                    await page.wait_for_timeout(800)

            print("\n🎉 BEGINNING PAGES COMPLETED!")
            print(f"📊 RESULT: Pages {start_page}-{end_page} captured")
            print(f"✅ {pages_captured} pages successfully scanned!")

            # Return the count (not ``end_page``) so success and error paths
            # report the same quantity.
            return pages_captured

        except Exception as e:
            print(f"❌ Scanning error: {e}")
            import traceback
            traceback.print_exc()
            return pages_captured
        finally:
            await browser.close()

if __name__ == "__main__":
    # Script entry point: build the scan coroutine with its default page
    # range, run it to completion, and print whatever it returned.
    scan = scan_beginning_pages()
    result = asyncio.run(scan)
    print(f"\n🏁 RESULT: {result} beginning pages captured")