#!/usr/bin/env python3
"""
PDF to Text Converter and Splitter

Converts PDF files to text and splits them into chunks of N pages each.
"""

import os
import sys
from pathlib import Path
from typing import List, Tuple, Union

from pypdf import PdfReader

# Chunk size used when neither the caller nor the command line specifies one.
DEFAULT_PAGES_PER_CHUNK = 10

# Width, in characters, of the separator rules written into the output files.
_RULE_WIDTH = 80


def _render_chunk(reader: PdfReader, source_name: str,
                  start_page: int, end_page: int) -> str:
    """Return the text of pages ``start_page`` (inclusive, 0-based) up to
    ``end_page`` (exclusive), preceded by a chunk banner and with a header
    before every page."""
    heavy_rule = "=" * _RULE_WIDTH
    light_rule = "─" * _RULE_WIDTH

    parts = [
        f"{heavy_rule}\n",
        f"{source_name} - Pages {start_page + 1} to {end_page}\n",
        f"{heavy_rule}\n\n",
    ]
    for index in range(start_page, end_page):
        parts.append(f"\n{light_rule}\n")
        parts.append(f"Page {index + 1}\n")
        parts.append(f"{light_rule}\n\n")
        parts.append(reader.pages[index].extract_text())
        parts.append("\n")
    return "".join(parts)


def convert_and_split_pdf(pdf_path: Union[str, os.PathLike],
                          pages_per_chunk: int = DEFAULT_PAGES_PER_CHUNK) -> None:
    """
    Convert a PDF to text and split it into smaller text files.

    The files are written to a ``<pdf stem>_text`` directory created next to
    the PDF, one file per chunk, named
    ``<pdf stem>_pages_<first>-<last>.txt`` (1-based, zero-padded to 3 digits).
    Progress is reported on standard output. A missing PDF is reported and
    skipped rather than raised, so a batch run can continue.

    Args:
        pdf_path: Path to the PDF file
        pages_per_chunk: Number of pages per output file (default: 10)

    Raises:
        ValueError: If pages_per_chunk is smaller than 1.
    """
    # A chunk size of zero or less would never advance through the document
    # (the original loop spun forever), so reject it up front.
    if pages_per_chunk < 1:
        raise ValueError(
            f"pages_per_chunk must be at least 1, got {pages_per_chunk}"
        )

    pdf_path = Path(pdf_path)
    if not pdf_path.exists():
        print(f"Error: PDF file not found: {pdf_path}")
        return

    print(f"Processing: {pdf_path.name}")

    # Read the PDF
    reader = PdfReader(pdf_path)
    total_pages = len(reader.pages)
    print(f"Total pages: {total_pages}")

    # Create output directory
    output_dir = pdf_path.parent / f"{pdf_path.stem}_text"
    output_dir.mkdir(exist_ok=True)
    print(f"Output directory: {output_dir}")

    # Process pages in chunks; start_page is 0-based, end_page is exclusive,
    # which makes end_page equal to the 1-based number of the last page.
    files_created = 0
    for start_page in range(0, total_pages, pages_per_chunk):
        end_page = min(start_page + pages_per_chunk, total_pages)

        output_file = (
            output_dir
            / f"{pdf_path.stem}_pages_{start_page + 1:03d}-{end_page:03d}.txt"
        )
        output_file.write_text(
            _render_chunk(reader, pdf_path.name, start_page, end_page),
            encoding="utf-8",
        )
        print(f" Created: {output_file.name} (pages {start_page + 1}-{end_page})")
        files_created += 1

    print(f"\nCompleted! Created {files_created} text files in {output_dir}")
    print()


def _parse_args(argv: List[str]) -> Tuple[int, List[str]]:
    """Split command-line arguments into ``(pages_per_chunk, pdf_files)``.

    ``--pages N`` may appear anywhere and more than once (the last one wins);
    every other argument is taken as a PDF path.

    Raises:
        ValueError: If ``--pages`` has no value, or its value is not a whole
            number of at least 1. The message is suitable for showing to the
            user.
    """
    pages_per_chunk = DEFAULT_PAGES_PER_CHUNK
    pdf_files = []

    remaining = iter(argv)
    for arg in remaining:
        if arg != "--pages":
            pdf_files.append(arg)
            continue

        # Consume the option's value from the same iterator.
        value = next(remaining, None)
        if value is None:
            raise ValueError("--pages requires a value")
        try:
            pages_per_chunk = int(value)
        except ValueError:
            raise ValueError(
                f"--pages expects a whole number, got {value!r}"
            ) from None
        if pages_per_chunk < 1:
            raise ValueError(f"--pages must be at least 1, got {pages_per_chunk}")

    return pages_per_chunk, pdf_files


def main() -> None:
    """Command-line entry point: convert every PDF named in ``sys.argv``.

    Exits with status 1 after printing a message when no arguments are given,
    when ``--pages`` is malformed, or when no PDF file is specified.
    """
    if len(sys.argv) < 2:
        print("Usage: python pdf_converter.py <pdf_file1> [pdf_file2] ...")
        print("       python pdf_converter.py --pages N <pdf_file>")
        sys.exit(1)

    # Parse arguments
    try:
        pages_per_chunk, pdf_files = _parse_args(sys.argv[1:])
    except ValueError as err:
        print(f"Error: {err}")
        sys.exit(1)

    if not pdf_files:
        print("Error: No PDF files specified")
        sys.exit(1)

    print(f"PDF to Text Converter - {pages_per_chunk} pages per chunk\n")

    # Process each PDF file
    for pdf_file in pdf_files:
        convert_and_split_pdf(pdf_file, pages_per_chunk)


if __name__ == "__main__":
    main()