#!/usr/bin/env python3
"""
IMPROVED CHUNKED SCANNER - Uses proven working navigation from successful scan

Opens the Kindle web reader in Chromium via Playwright, screenshots a chunk
of consecutive pages into ``scanned_pages/`` and records how far it got in
``scan_progress.json``.

Credentials are never stored in this file. If the reader redirects to the
sign-in page, the e-mail and password are read from the ``KINDLE_EMAIL`` and
``KINDLE_PASSWORD`` environment variables.
"""

import argparse
import asyncio
import json
import os
import re
import time
import traceback
from pathlib import Path

from playwright.async_api import async_playwright

# Reader URL for the book being scanned.
READER_URL = "https://read.amazon.com/?asin=B0DJP2C8M6&ref_=kwl_kr_iv_rec_1"

# Environment variables that supply the sign-in credentials.
EMAIL_ENV_VAR = "KINDLE_EMAIL"
PASSWORD_ENV_VAR = "KINDLE_PASSWORD"

# Selectors tried in order to decide that the reader UI has loaded.
READER_SELECTORS = ("ion-header", "[class*='reader']", "#reader-header")

# Two consecutive screenshots whose sizes differ by less than this many bytes
# are counted as a possible duplicate page.
DUPLICATE_SIZE_TOLERANCE = 5000

# This many possible duplicates in a row is treated as the end of the book.
MAX_CONSECUTIVE_DUPLICATES = 7

OUTPUT_DIR = Path("scanned_pages")
PROGRESS_FILE = Path("scan_progress.json")


async def _login(page) -> None:
    """Open the reader URL and sign in if redirected to the sign-in page.

    Raises:
        RuntimeError: if sign-in is required but the credential environment
            variables are missing or empty.
    """
    print("šŸ” Step 1: Logging in...")
    await page.goto(READER_URL)
    await page.wait_for_timeout(5000)

    if "signin" in page.url:
        email = os.environ.get(EMAIL_ENV_VAR)
        password = os.environ.get(PASSWORD_ENV_VAR)
        if not email or not password:
            raise RuntimeError(
                f"Sign-in required: set the {EMAIL_ENV_VAR} and "
                f"{PASSWORD_ENV_VAR} environment variables"
            )

        email_field = await page.wait_for_selector("#ap_email", timeout=10000)
        await email_field.fill(email)

        continue_btn = await page.wait_for_selector("#continue", timeout=5000)
        await continue_btn.click()
        await page.wait_for_timeout(3000)

        password_field = await page.wait_for_selector("#ap_password", timeout=10000)
        await password_field.fill(password)

        signin_btn = await page.wait_for_selector("#signInSubmit", timeout=5000)
        await signin_btn.click()
        await page.wait_for_timeout(5000)

    print("āœ… Login completed")


async def _wait_for_reader(page) -> None:
    """Wait until one of the known reader selectors appears, else just wait."""
    print("šŸ“– Step 2: Waiting for reader to load...")

    for selector in READER_SELECTORS:
        try:
            await page.wait_for_selector(selector, timeout=10000)
        except Exception:
            # Not a bare ``except``: task cancellation and Ctrl-C must still
            # propagate instead of being treated as "selector not found".
            continue
        print(f" āœ… Reader loaded: {selector}")
        return

    # Fallback - none of the selectors matched; wait a fixed time and go on.
    await page.wait_for_timeout(8000)
    print(" āœ… Using fallback detection")


async def _navigate_to_beginning(page) -> None:
    """Jump to the cover through the table of contents (best effort).

    Any failure is reported and ignored, so scanning then starts from
    whatever page the reader is currently showing.
    """
    print("šŸŽÆ Step 3: Navigating to beginning...")
    try:
        toc_button = await page.wait_for_selector(
            "[aria-label='Table of Contents']", timeout=5000
        )
        await toc_button.click()
        await page.wait_for_timeout(2000)

        cover_link = await page.wait_for_selector("text=Cover", timeout=5000)
        await cover_link.click()
        await page.wait_for_timeout(3000)

        # Close the TOC: repeated Escape presses, then a click on the page.
        for _ in range(5):
            await page.keyboard.press("Escape")
            await page.wait_for_timeout(500)

        await page.click("body", position={"x": 600, "y": 400})
        await page.wait_for_timeout(2000)

        print(" āœ… Navigated to book beginning")
    except Exception as e:
        print(f" āš ļø TOC navigation failed: {e}")


async def _scan_pages(page, start_page: int, end_page: int) -> int:
    """Screenshot pages ``start_page``..``end_page`` and return the last good page.

    Callers must pass ``start_page <= end_page``. Scanning stops early once
    ``MAX_CONSECUTIVE_DUPLICATES`` screenshots in a row have nearly the same
    file size; in that case the trailing duplicates are not counted.
    """
    OUTPUT_DIR.mkdir(exist_ok=True)
    print(f"šŸš€ Step 4: Scanning pages {start_page} to {end_page}...")

    consecutive_identical = 0
    last_file_size = 0
    page_num = start_page

    for page_num in range(start_page, end_page + 1):
        print(f"šŸ“ø Scanning page {page_num}...")

        filename = OUTPUT_DIR / f"page_{page_num:03d}.png"
        await page.screenshot(path=str(filename), full_page=False)

        # Screenshot file size is used as a cheap "did the page change" signal.
        file_size = filename.stat().st_size
        if abs(file_size - last_file_size) < DUPLICATE_SIZE_TOLERANCE:
            consecutive_identical += 1
            print(
                f" āš ļø Possible duplicate "
                f"({consecutive_identical}/{MAX_CONSECUTIVE_DUPLICATES})"
            )
        else:
            consecutive_identical = 0
            print(f" āœ… New content ({file_size} bytes)")
        last_file_size = file_size

        if consecutive_identical >= MAX_CONSECUTIVE_DUPLICATES:
            print("šŸ“– Detected end of book")
            break

        # Turn the page, except after the last page of the chunk.
        if page_num < end_page:
            await page.keyboard.press("ArrowRight")
            await page.wait_for_timeout(1000)

    if consecutive_identical < MAX_CONSECUTIVE_DUPLICATES:
        return page_num
    return page_num - consecutive_identical


def _save_progress(last_completed_page: int, total_pages: int, chunk_size: int) -> None:
    """Write the chunk result to ``PROGRESS_FILE`` as JSON."""
    progress_data = {
        "last_completed_page": last_completed_page,
        "total_pages": total_pages,
        "chunk_size": chunk_size,
        "timestamp": time.time(),
    }
    with open(PROGRESS_FILE, "w", encoding="utf-8") as f:
        json.dump(progress_data, f, indent=2)


async def improved_chunked_scanner(start_page=1, chunk_size=40, total_pages=226):
    """
    Improved chunked scanner using proven working navigation

    Args:
        start_page: Number given to the first screenshot of this chunk. Only
            when it is 1 does the scanner navigate to the cover first; for
            any other value it starts from whatever page the reader shows.
        chunk_size: Maximum number of pages to scan in this run.
        total_pages: Upper bound on the page number to scan.

    Returns:
        The last page number considered successfully scanned, or
        ``start_page - 1`` if there was nothing to scan or an error occurred.
    """
    end_page = min(start_page + chunk_size - 1, total_pages)
    if end_page < start_page:
        # Empty range: previously this surfaced as an UnboundLocalError after
        # the browser had already been launched and logged in.
        print(f"Nothing to scan: start page {start_page}, end page {end_page}")
        return start_page - 1

    async with async_playwright() as p:
        browser = await p.chromium.launch(
            headless=False,
            args=[
                "--disable-blink-features=AutomationControlled",
                "--disable-web-security",
                "--disable-features=VizDisplayCompositor",
            ],
        )

        context = await browser.new_context(
            viewport={"width": 1920, "height": 1080},
            user_agent=(
                "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
            ),
        )

        await context.add_init_script("""
            Object.defineProperty(navigator, 'webdriver', {
                get: () => undefined,
            });
        """)

        page = await context.new_page()

        try:
            print(f"šŸŽÆ IMPROVED CHUNKED SCANNER - Pages {start_page} to {end_page}")
            print("=" * 70)

            # STEP 1: LOGIN
            await _login(page)

            # STEP 2: WAIT FOR READER TO LOAD
            await _wait_for_reader(page)

            # STEP 3: NAVIGATION STRATEGY
            if start_page == 1:
                await _navigate_to_beginning(page)
            else:
                # No repositioning is attempted for continuation chunks.
                print(f"šŸŽÆ Step 3: Continuing from page {start_page}...")

            # STEP 4: SCANNING
            actual_end_page = await _scan_pages(page, start_page, end_page)

            _save_progress(actual_end_page, total_pages, chunk_size)

            print("\nšŸŽ‰ CHUNK COMPLETED!")
            print(f"šŸ“Š Actually scanned: {start_page} to {actual_end_page}")
            print(f"šŸ“ Progress saved to: {PROGRESS_FILE}")

            return actual_end_page

        except Exception as e:
            print(f"āŒ Error: {e}")
            traceback.print_exc()
            return start_page - 1

        finally:
            await browser.close()


def main() -> None:
    """Parse command-line options and run one scanning chunk."""
    parser = argparse.ArgumentParser(description="Improved Chunked Kindle Scanner")
    parser.add_argument("--start-page", type=int, default=65, help="Starting page")
    parser.add_argument("--chunk-size", type=int, default=30, help="Pages per chunk")
    parser.add_argument("--total-pages", type=int, default=226, help="Total pages")

    args = parser.parse_args()

    asyncio.run(
        improved_chunked_scanner(args.start_page, args.chunk_size, args.total_pages)
    )


if __name__ == "__main__":
    main()