kindle_OCR/improved_chunked_scanner.py

#!/usr/bin/env python3
"""
IMPROVED CHUNKED SCANNER - Uses proven working navigation from successful scan
"""

import asyncio
import argparse
import re
from playwright.async_api import async_playwright
from pathlib import Path
import time
import json

async def improved_chunked_scanner(start_page=1, chunk_size=40, total_pages=226):
    """
    Improved chunked scanner using proven working navigation
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(
            headless=False,
            args=[
                "--disable-blink-features=AutomationControlled",
                "--disable-web-security",
                "--disable-features=VizDisplayCompositor"
            ]
        )
        context = await browser.new_context(
            viewport={"width": 1920, "height": 1080},
            user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        )

        await context.add_init_script("""
            Object.defineProperty(navigator, 'webdriver', {
                get: () => undefined,
            });
        """)

        page = await context.new_page()

        try:
            print(f"🎯 IMPROVED CHUNKED SCANNER - Pages {start_page} to {min(start_page + chunk_size - 1, total_pages)}")
            print("=" * 70)

            # STEP 1: LOGIN (simplified since CAPTCHA solved)
            print("🔐 Step 1: Logging in...")
            await page.goto("https://read.amazon.com/?asin=B0DJP2C8M6&ref_=kwl_kr_iv_rec_1")
            await page.wait_for_timeout(5000)

            if "signin" in page.url:
                email_field = await page.wait_for_selector("#ap_email", timeout=10000)
                await email_field.fill("ondrej.glaser@gmail.com")
                continue_btn = await page.wait_for_selector("#continue", timeout=5000)
                await continue_btn.click()
                await page.wait_for_timeout(3000)
                password_field = await page.wait_for_selector("#ap_password", timeout=10000)
                await password_field.fill("csjXgew3In")
                signin_btn = await page.wait_for_selector("#signInSubmit", timeout=5000)
                await signin_btn.click()
                await page.wait_for_timeout(5000)

            print("✅ Login completed")

            # STEP 2: WAIT FOR READER TO LOAD (using working selectors)
            print("📖 Step 2: Waiting for reader to load...")
            # Try multiple selectors that worked before
            reader_loaded = False
            selectors_to_try = ["ion-header", "[class*='reader']", "#reader-header"]

            for selector in selectors_to_try:
                try:
                    await page.wait_for_selector(selector, timeout=10000)
                    print(f"   ✅ Reader loaded: {selector}")
                    reader_loaded = True
                    break
                except:
                    continue

            if not reader_loaded:
                # Fallback - just wait and check for book content
                await page.wait_for_timeout(8000)
                print("   ✅ Using fallback detection")

            # STEP 3: NAVIGATION STRATEGY
            if start_page == 1:
                print("🎯 Step 3: Navigating to beginning...")
                # Use proven TOC method for first chunk
                try:
                    toc_button = await page.wait_for_selector("[aria-label='Table of Contents']", timeout=5000)
                    await toc_button.click()
                    await page.wait_for_timeout(2000)

                    cover_link = await page.wait_for_selector("text=Cover", timeout=5000)
                    await cover_link.click()
                    await page.wait_for_timeout(3000)

                    # Close TOC using proven method
                    for i in range(5):
                        await page.keyboard.press("Escape")
                        await page.wait_for_timeout(500)
                    await page.click("body", position={"x": 600, "y": 400})
                    await page.wait_for_timeout(2000)

                    print("   ✅ Navigated to book beginning")
                except Exception as e:
                    print(f"   ⚠️ TOC navigation failed: {e}")
            else:
                print(f"🎯 Step 3: Continuing from page {start_page}...")
                # For continuation, we assume we're already positioned correctly
                # from previous chunks or use a more conservative approach

            # STEP 4: SCANNING WITH PROVEN NAVIGATION
            output_dir = Path("scanned_pages")
            output_dir.mkdir(exist_ok=True)

            end_page = min(start_page + chunk_size - 1, total_pages)

            print(f"🚀 Step 4: Scanning pages {start_page} to {end_page}...")

            consecutive_identical = 0
            last_file_size = 0

            # Simple scanning loop like the working version
            for page_num in range(start_page, end_page + 1):
                print(f"📸 Scanning page {page_num}...")

                # Take screenshot
                filename = output_dir / f"page_{page_num:03d}.png"
                await page.screenshot(path=str(filename), full_page=False)

                # Check file size
                file_size = filename.stat().st_size
                if abs(file_size - last_file_size) < 5000:  # More lenient
                    consecutive_identical += 1
                    print(f"   ⚠️ Possible duplicate ({consecutive_identical}/7)")
                else:
                    consecutive_identical = 0
                    print(f"   ✅ New content ({file_size} bytes)")

                last_file_size = file_size

                # Stop if too many duplicates
                if consecutive_identical >= 7:
                    print("📖 Detected end of book")
                    break

                # Navigate to next page (except last)
                if page_num < end_page:
                    await page.keyboard.press("ArrowRight")
                    await page.wait_for_timeout(1000)  # Use proven timing

            # Save progress
            progress_file = Path("scan_progress.json")
            actual_end_page = page_num if consecutive_identical < 7 else page_num - consecutive_identical

            progress_data = {
                "last_completed_page": actual_end_page,
                "total_pages": total_pages,
                "chunk_size": chunk_size,
                "timestamp": time.time()
            }

            with open(progress_file, 'w') as f:
                json.dump(progress_data, f, indent=2)

            print(f"\n🎉 CHUNK COMPLETED!")
            print(f"📊 Actually scanned: {start_page} to {actual_end_page}")
            print(f"📁 Progress saved to: {progress_file}")

            return actual_end_page

        except Exception as e:
            print(f"❌ Error: {e}")
            import traceback
            traceback.print_exc()
            return start_page - 1
        finally:
            await browser.close()

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Improved Chunked Kindle Scanner")
    parser.add_argument("--start-page", type=int, default=65, help="Starting page")
    parser.add_argument("--chunk-size", type=int, default=30, help="Pages per chunk")
    parser.add_argument("--total-pages", type=int, default=226, help="Total pages")

    args = parser.parse_args()

    asyncio.run(improved_chunked_scanner(args.start_page, args.chunk_size, args.total_pages))