geutebruck/extract_pdfs.ps1

# PDF to Text Extraction Script
# Extracts text from all PDFs in SOURCES folder and saves as .txt files

# Script parameters. All are optional; the defaults are absolute paths under C:\DEV\COPILOT.
param(
    # Root folder that is searched recursively for *.pdf files.
    [string]$SourcePath = "C:\DEV\COPILOT\SOURCES",
    # Folder that receives the extracted .txt files (sub-folders mirror the source layout).
    [string]$OutputPath = "C:\DEV\COPILOT\SOURCES\EXTRACTED_TEXT",
    # NOTE(review): not referenced anywhere in the visible script - confirm whether it is still needed.
    [int]$PagesPerBatch = 50
)

# Banner: echo the effective source and output locations.
Write-Host "=== PDF Text Extraction Script ===" -ForegroundColor Cyan
Write-Host "Source: $SourcePath"
Write-Host "Output: $OutputPath"
Write-Host ""

# Create output directory
if (-not (Test-Path $OutputPath)) {
    New-Item -ItemType Directory -Path $OutputPath -Force | Out-Null
    Write-Host "Created output directory: $OutputPath" -ForegroundColor Green
}

# Find all PDF files.
# - The provider-level -Filter is only a fast pre-filter: it can also return
#   names whose extension merely starts with "pdf", and directories whose name
#   ends in ".pdf". The Where-Object stage keeps real *.pdf files only
#   (the comparison is case-insensitive, so ".PDF" is accepted too).
# - @( ... ) guarantees an array, so .Count is reliable for zero or one match.
$pdfFiles = @(
    Get-ChildItem -Path $SourcePath -Filter "*.pdf" -Recurse |
        Where-Object { (-not $_.PSIsContainer) -and ($_.Extension -eq '.pdf') }
)
Write-Host "Found $($pdfFiles.Count) PDF files" -ForegroundColor Yellow
Write-Host ""

# Extracts the text of a PDF using iTextSharp (if the assembly is available).
#
# Parameters:
#   $pdfPath    - path of the PDF to read.
#   $outputPath - path of the text file to write; only written after every
#                 page has been extracted.
#   $itextPath  - location of itextsharp.dll (optional; defaults to the
#                 package folder this script was written against).
# Returns:
#   $true  when the output file was written,
#   $false when the DLL is missing or any step fails (the reason is printed).
function Extract-PDF-iTextSharp {
    param(
        $pdfPath,
        $outputPath,
        $itextPath = "C:\DEV\COPILOT\packages\iTextSharp.5.5.13.3\lib\itextsharp.dll"
    )

    # Without the assembly there is nothing to try; the caller falls back to
    # the next extraction method.
    if (-not (Test-Path -LiteralPath $itextPath)) {
        return $false
    }

    $reader = $null
    try {
        Add-Type -Path $itextPath

        $reader = New-Object iTextSharp.text.pdf.PdfReader($pdfPath)
        $pageCount = $reader.NumberOfPages

        # StringBuilder avoids re-copying the whole text for every page.
        $builder = New-Object System.Text.StringBuilder

        for ($page = 1; $page -le $pageCount; $page++) {
            [void]$builder.Append([iTextSharp.text.pdf.parser.PdfTextExtractor]::GetTextFromPage($reader, $page))
            # The marker is appended after the text of the page it names.
            [void]$builder.Append("`r`n`r`n--- Page $page ---`r`n`r`n")

            if ($page % 10 -eq 0) {
                Write-Host "  Extracted page $page of $pageCount..." -ForegroundColor DarkGray
            }
        }

        [System.IO.File]::WriteAllText($outputPath, $builder.ToString())
        return $true
    }
    catch {
        Write-Host "  iTextSharp extraction failed: $($_.Exception.Message)" -ForegroundColor DarkGray
        return $false
    }
    finally {
        # Always release the reader, including when a page fails to parse.
        if ($null -ne $reader) {
            try { $reader.Close() } catch { }   # cleanup must not mask the result
        }
    }
}

# Extracts the text of a PDF by opening it in Microsoft Word through COM
# automation (fallback method).
#
# Parameters:
#   $pdfPath    - path of the PDF to open.
#   $outputPath - path of the text file to write.
# Returns:
#   $true  when the document text was written to $outputPath,
#   $false when Word is unavailable or any step fails (the reason is printed).
function Extract-PDF-Word {
    param($pdfPath, $outputPath)

    $word = $null
    $doc = $null
    try {
        $word = New-Object -ComObject Word.Application
        $word.Visible = $false
        # 0 = wdAlertsNone: a prompt on a hidden instance would block forever.
        $word.DisplayAlerts = 0

        # Open(FileName, ConfirmConversions, ReadOnly): no conversion prompt,
        # and read-only so closing never asks to save changes.
        $doc = $word.Documents.Open($pdfPath, $false, $true)
        $text = $doc.Content.Text
        [System.IO.File]::WriteAllText($outputPath, $text)
        return $true
    }
    catch {
        Write-Host "  Word extraction failed: $($_.Exception.Message)" -ForegroundColor DarkGray
        return $false
    }
    finally {
        # Tear down in reverse order on every path so a failed conversion does
        # not leave a hidden WINWORD process behind.
        if ($null -ne $doc) {
            try { $doc.Close() } catch { }
            [System.Runtime.InteropServices.Marshal]::ReleaseComObject($doc) | Out-Null
        }
        if ($null -ne $word) {
            try { $word.Quit() } catch { }
            [System.Runtime.InteropServices.Marshal]::ReleaseComObject($word) | Out-Null
        }
    }
}

# Extracts the text of a PDF by running a temporary Python script that uses
# the PyPDF2 package.
#
# Parameters:
#   $pdfPath    - path of the PDF to read (passed to Python as argv[1]).
#   $outputPath - path of the UTF-8 text file to write (argv[2]).
# Returns:
#   $true  when Python exits with code 0 and prints SUCCESS,
#   $false when Python or PyPDF2 is unavailable or extraction fails.
function Extract-PDF-Python {
    param($pdfPath, $outputPath)

    # Single-quoted here-string: the Python source is taken verbatim, with no
    # PowerShell variable or backtick expansion.
    $pythonScript = @'
import sys
try:
    import PyPDF2

    with open(sys.argv[1], 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ''
        for i, page in enumerate(reader.pages):
            # extract_text() may yield None for pages without a text layer
            text += (page.extract_text() or '')
            text += f'\n\n--- Page {i+1} ---\n\n'

        with open(sys.argv[2], 'w', encoding='utf-8') as output:
            output.write(text)
    print('SUCCESS')
except ImportError:
    print('PyPDF2 not installed')
    sys.exit(1)
except Exception as e:
    print(f'ERROR: {e}')
    sys.exit(1)
'@

    # Unique name per call so concurrent runs cannot overwrite or delete each
    # other's helper script.
    $tempScript = Join-Path ([System.IO.Path]::GetTempPath()) ("extract_pdf_{0}.py" -f [guid]::NewGuid().ToString('N'))

    try {
        $pythonScript | Out-File -FilePath $tempScript -Encoding UTF8

        $result = python $tempScript $pdfPath $outputPath 2>&1

        # Require both a clean exit code and the SUCCESS marker; -contains
        # copes with extra output lines such as warnings merged from stderr.
        return (($LASTEXITCODE -eq 0) -and ($result -contains 'SUCCESS'))
    }
    catch {
        # Python not available (or the helper script could not be written)
        return $false
    }
    finally {
        if (Test-Path -LiteralPath $tempScript) {
            Remove-Item -LiteralPath $tempScript -Force
        }
    }
}

# Process each PDF: mirror its relative path under the output folder, skip
# files that already have a substantial extraction, then try the extraction
# methods in order (iTextSharp, Python/PyPDF2, Word COM).
$successCount = 0
$failCount = 0

# Normalise the source root once. Resolving it and trimming trailing
# separators keeps the Substring() below correct when -SourcePath is relative,
# uses a PSDrive, or ends with "\" (e.g. "C:\DOCS\").
$sourceRoot = $SourcePath
$resolvedRoot = Resolve-Path -LiteralPath $SourcePath -ErrorAction SilentlyContinue
if ($resolvedRoot) {
    $sourceRoot = $resolvedRoot.ProviderPath
}
$sourceRoot = $sourceRoot.TrimEnd('\', '/')

# Absolute output root: the extractors write through .NET and an external
# process, which do not necessarily share PowerShell's current location.
$outputRoot = $ExecutionContext.SessionState.Path.GetUnresolvedProviderPathFromPSPath($OutputPath)

foreach ($pdf in $pdfFiles) {
    $relativePath = $pdf.FullName.Substring($sourceRoot.Length + 1)
    $outputFile = Join-Path $outputRoot ($relativePath -replace '\.pdf$', '.txt')
    $outputDir = Split-Path $outputFile -Parent

    Write-Host "Processing: $relativePath" -ForegroundColor White

    # Create subdirectory if needed
    if (-not (Test-Path -LiteralPath $outputDir)) {
        New-Item -ItemType Directory -Path $outputDir -Force | Out-Null
    }

    # Skip if already extracted. Outputs of 1000 bytes or less are redone
    # (presumably so that short failure placeholders get retried).
    # -LiteralPath: file names containing [ or ] must not be read as wildcards.
    if (Test-Path -LiteralPath $outputFile) {
        $fileInfo = Get-Item -LiteralPath $outputFile
        if ($fileInfo.Length -gt 1000) {
            Write-Host "  SKIPPED: Already extracted ($($fileInfo.Length) bytes)" -ForegroundColor DarkGray
            $successCount++
            continue
        }
    }

    # Try different extraction methods
    $success = $false

    Write-Host "  Trying iTextSharp..." -ForegroundColor DarkGray
    if (Extract-PDF-iTextSharp $pdf.FullName $outputFile) {
        Write-Host "  SUCCESS: Extracted with iTextSharp" -ForegroundColor Green
        $success = $true
    }

    if (-not $success) {
        Write-Host "  Trying Python/PyPDF2..." -ForegroundColor DarkGray
        if (Extract-PDF-Python $pdf.FullName $outputFile) {
            Write-Host "  SUCCESS: Extracted with Python" -ForegroundColor Green
            $success = $true
        }
    }

    if (-not $success) {
        Write-Host "  Trying Word COM..." -ForegroundColor DarkGray
        if (Extract-PDF-Word $pdf.FullName $outputFile) {
            Write-Host "  SUCCESS: Extracted with Word" -ForegroundColor Green
            $success = $true
        }
    }

    if ($success) {
        $successCount++
        $fileSize = (Get-Item -LiteralPath $outputFile).Length
        Write-Host "  Output: $outputFile ($fileSize bytes)" -ForegroundColor Cyan
    }
    else {
        $failCount++
        Write-Host "  FAILED: Could not extract text" -ForegroundColor Red
        # Create placeholder file
        "PDF text extraction failed for: $($pdf.FullName)" | Out-File -LiteralPath $outputFile
    }

    Write-Host ""
}

# Summary: the "Failed" line is red only when at least one file failed.
Write-Host "=== Extraction Complete ===" -ForegroundColor Cyan
Write-Host "Success: $successCount" -ForegroundColor Green
Write-Host "Failed:  $failCount" -ForegroundColor $(if ($failCount -gt 0) { 'Red' } else { 'Green' })
Write-Host ""
Write-Host "Output directory: $OutputPath" -ForegroundColor Yellow