geutebruck/SOURCES/pdf_converter.py

#!/usr/bin/env python3
"""
PDF to Text Converter and Splitter
Converts PDF files to text and splits them into chunks of N pages each
"""

import os
import sys
from pathlib import Path
from pypdf import PdfReader


def convert_and_split_pdf(pdf_path, pages_per_chunk=10):
    """
    Convert a PDF to text and split it into smaller text files.

    The text files are written to a ``<pdf stem>_text`` directory created
    next to the PDF. Each file holds up to ``pages_per_chunk`` pages and is
    named ``<pdf stem>_pages_<first>-<last>.txt`` using 1-based page numbers.
    Progress is reported on standard output. A missing PDF is reported with
    an error message and the function returns without raising.

    Args:
        pdf_path: Path to the PDF file
        pages_per_chunk: Number of pages per output file (default: 10)

    Raises:
        ValueError: If pages_per_chunk is smaller than 1.
    """
    # A chunk size of zero or less would never advance through the document,
    # so reject it before doing any work.
    if pages_per_chunk < 1:
        raise ValueError(
            f"pages_per_chunk must be at least 1, got {pages_per_chunk}"
        )

    pdf_path = Path(pdf_path)

    if not pdf_path.exists():
        print(f"Error: PDF file not found: {pdf_path}")
        return

    print(f"Processing: {pdf_path.name}")

    # Read the PDF
    reader = PdfReader(pdf_path)
    total_pages = len(reader.pages)
    print(f"Total pages: {total_pages}")

    # Create output directory
    output_dir = pdf_path.parent / f"{pdf_path.stem}_text"
    output_dir.mkdir(exist_ok=True)
    print(f"Output directory: {output_dir}")

    heavy_rule = '=' * 80
    light_rule = '─' * 80
    chunk_count = 0

    # Process pages in chunks; start_page is 0-based, end_page is exclusive
    # (which also makes it the 1-based number of the chunk's last page).
    for start_page in range(0, total_pages, pages_per_chunk):
        end_page = min(start_page + pages_per_chunk, total_pages)

        # Chunk header
        chunk_text = [
            f"{heavy_rule}\n",
            f"{pdf_path.name} - Pages {start_page + 1} to {end_page}\n",
            f"{heavy_rule}\n\n",
        ]

        # Extract text from pages in this chunk
        for i in range(start_page, end_page):
            # Defensive: fall back to an empty string if no text comes back,
            # so the join below cannot fail on a non-string value.
            text = reader.pages[i].extract_text() or ""

            chunk_text.append(f"\n{light_rule}\n")
            chunk_text.append(f"Page {i + 1}\n")
            chunk_text.append(f"{light_rule}\n\n")
            chunk_text.append(text)
            chunk_text.append("\n")

        # Save the chunk to a file
        output_file = output_dir / f"{pdf_path.stem}_pages_{start_page + 1:03d}-{end_page:03d}.txt"
        output_file.write_text(''.join(chunk_text), encoding='utf-8')
        chunk_count += 1

        print(f"  Created: {output_file.name} (pages {start_page + 1}-{end_page})")

    print(f"\nCompleted! Created {chunk_count} text files in {output_dir}")
    print()


def main():
    """
    Command-line entry point.

    Reads PDF file names and an optional ``--pages N`` setting from
    sys.argv, then calls convert_and_split_pdf for every file with the
    chosen chunk size (default: 10). Prints a message and exits with
    status 1 when no arguments are given, when ``--pages`` has a missing
    or invalid value, or when no PDF files are specified.
    """
    if len(sys.argv) < 2:
        print("Usage: python pdf_converter.py <pdf_file1> [pdf_file2] ...")
        print("       python pdf_converter.py <pdf_file> --pages N")
        sys.exit(1)

    # Default pages per chunk
    pages_per_chunk = 10
    pdf_files = []

    # Parse arguments; the iterator lets --pages consume the value after it
    arg_iter = iter(sys.argv[1:])
    for arg in arg_iter:
        if arg != '--pages':
            pdf_files.append(arg)
            continue

        value = next(arg_iter, None)
        if value is None:
            print("Error: --pages requires a value")
            sys.exit(1)
        try:
            pages_per_chunk = int(value)
        except ValueError:
            print(f"Error: --pages expects an integer, got '{value}'")
            sys.exit(1)
        # Zero or negative chunk sizes cannot make progress through a PDF
        if pages_per_chunk < 1:
            print(f"Error: --pages must be at least 1, got {pages_per_chunk}")
            sys.exit(1)

    if not pdf_files:
        print("Error: No PDF files specified")
        sys.exit(1)

    print(f"PDF to Text Converter - {pages_per_chunk} pages per chunk\n")

    # Process each PDF file
    for pdf_file in pdf_files:
        convert_and_split_pdf(pdf_file, pages_per_chunk)


# Run the converter only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()