Files
kindle_OCR/scan_beginning.py
2025-09-25 08:41:21 +02:00

107 lines
3.5 KiB
Python

#!/usr/bin/env python3
"""
SCAN BEGINNING - Pages 1-64 to complete the book
"""
import asyncio
from playwright.async_api import async_playwright
from pathlib import Path
import time
import json
async def scan_beginning_pages(start_page=1, end_page=64):
    """Screenshot a range of pages of the book in the Kindle web reader.

    Restores a saved Playwright session from ``kindle_session_state.json``,
    opens the book, presses Home/ArrowLeft to get back to the first page,
    and then writes one screenshot per page to
    ``scanned_pages/page_NNN.png``, paging forward with ArrowRight.

    Args:
        start_page: Number of the first page to capture (1-based).
        end_page: Number of the last page to capture (inclusive).

    Returns:
        ``False`` when the session state file does not exist. Otherwise the
        number of pages actually captured; this is a partial count when an
        error interrupts the scan, and ``0`` for an empty range.
    """
    storage_state_path = "kindle_session_state.json"
    if not Path(storage_state_path).exists():
        print("❌ No session state found.")
        return False

    # Initialised before any browser work so the error handler below can
    # always report a count, even when the failure happens before the first
    # screenshot (previously this raised UnboundLocalError in that case).
    pages_captured = 0

    async with async_playwright() as p:
        browser = await p.chromium.launch(
            headless=False,
            args=[
                "--disable-blink-features=AutomationControlled",
                "--disable-web-security",
                "--disable-features=VizDisplayCompositor",
            ],
        )
        context = await browser.new_context(
            storage_state=storage_state_path,
            viewport={"width": 1920, "height": 1080},
            user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        )
        page = await context.new_page()
        try:
            print(f"🚀 SCANNING BEGINNING: Pages {start_page} to {end_page}")
            print("=" * 50)

            # Load book
            await page.goto("https://read.amazon.com/?asin=B0DJP2C8M6&ref_=kwl_kr_iv_rec_1")
            await page.wait_for_timeout(5000)

            # Navigate to actual first page (page 1)
            print("🎯 Navigating to first page...")
            await page.keyboard.press("Home")  # Go to beginning
            await page.wait_for_timeout(2000)
            # A few extra page-backs in case Home did not land on page 1.
            for _ in range(10):
                await page.keyboard.press("ArrowLeft")
                await page.wait_for_timeout(100)
            print("✅ At beginning of book")

            # The reader is on page 1 here. When a later start page is
            # requested, page forward first so that each screenshot is saved
            # under the number of the page it actually shows.
            if start_page > 1:
                print(f"⏩ Skipping ahead to page {start_page}...")
                for _ in range(start_page - 1):
                    await page.keyboard.press("ArrowRight")
                    await page.wait_for_timeout(800)

            output_dir = Path("scanned_pages")
            output_dir.mkdir(exist_ok=True)
            print(f"📸 SCANNING PAGES {start_page} to {end_page}...")

            for page_num in range(start_page, end_page + 1):
                print(f"📸 Scanning page {page_num}/{end_page}...")
                filename = output_dir / f"page_{page_num:03d}.png"
                await page.screenshot(path=str(filename))
                file_size = filename.stat().st_size
                print(f" ✅ Captured ({file_size} bytes)")
                pages_captured += 1

                # Progress reports
                if page_num % 10 == 0:
                    progress = (page_num / end_page) * 100
                    print(f"📊 PROGRESS: {page_num}/{end_page} ({progress:.1f}%)")

                # Navigate to next page (except last)
                if page_num < end_page:
                    await page.keyboard.press("ArrowRight")
                    await page.wait_for_timeout(800)

            print("\n🎉 BEGINNING PAGES COMPLETED!")
            print(f"📊 RESULT: Pages {start_page}-{end_page} captured")
            print(f"{pages_captured} pages successfully scanned!")
            # Return the real count (equal to end_page for the default
            # 1..end_page range) so success and error paths agree.
            return pages_captured
        except Exception as e:
            # Best-effort script: report the failure and hand back whatever
            # was captured so far instead of crashing.
            print(f"❌ Scanning error: {e}")
            import traceback
            traceback.print_exc()
            return pages_captured
        finally:
            await browser.close()
if __name__ == "__main__":
    # Run the scan coroutine to completion with its default page range and
    # echo whatever it returned.
    captured = asyncio.run(scan_beginning_pages())
    print(f"\n🏁 RESULT: {captured} beginning pages captured")