<#
.SYNOPSIS
    PDF to Text Extraction Script.

.DESCRIPTION
    Extracts text from all PDFs found (recursively) under the source folder and
    saves each one as a .txt file under the output folder, mirroring the source
    folder's sub-directory layout.

    Three extraction back-ends are tried in order until one succeeds:
      1. iTextSharp  (a DLL at a fixed path under C:\DEV\COPILOT\packages)
      2. Python with the PyPDF2 module
      3. Microsoft Word COM automation

    An existing output file larger than 1000 bytes is treated as already
    extracted and is skipped (and counted as a success). If every back-end
    fails, a short placeholder file is written; because it is normally smaller
    than 1000 bytes, the PDF is retried on the next run.

.PARAMETER SourcePath
    Folder that is searched recursively for *.pdf files.

.PARAMETER OutputPath
    Folder that receives the .txt files. Created if it does not exist.

.PARAMETER PagesPerBatch
    Currently not used by the script; kept so existing invocations that pass
    it keep working.
#>
param(
    [string]$SourcePath = "C:\DEV\COPILOT\SOURCES",
    [string]$OutputPath = "C:\DEV\COPILOT\SOURCES\EXTRACTED_TEXT",
    [int]$PagesPerBatch = 50
)

Write-Host "=== PDF Text Extraction Script ===" -ForegroundColor Cyan
Write-Host "Source: $SourcePath"
Write-Host "Output: $OutputPath"
Write-Host ""

# Fail early with a clear message instead of letting Get-ChildItem error out
# and the script report "Found 0 PDF files".
if (-not (Test-Path -LiteralPath $SourcePath -PathType Container)) {
    Write-Host "Source folder not found: $SourcePath" -ForegroundColor Red
    exit 1
}

# Create output directory
if (-not (Test-Path -LiteralPath $OutputPath)) {
    New-Item -ItemType Directory -Path $OutputPath -Force | Out-Null
    Write-Host "Created output directory: $OutputPath" -ForegroundColor Green
}

# Resolve both roots to absolute file-system paths. The .NET file APIs used
# below resolve relative paths against the *process* working directory, which
# is not necessarily the current PowerShell location.
$sourceFull = (Resolve-Path -LiteralPath $SourcePath).ProviderPath
$outputRoot = (Resolve-Path -LiteralPath $OutputPath).ProviderPath

# Number of leading characters to strip from a PDF's full path to obtain its
# path relative to the source root. Trimming first makes this correct whether
# or not the root ends with a path separator.
$relativeStart = $sourceFull.TrimEnd('\', '/').Length + 1

# Output files at or below this size are considered incomplete and are redone.
$minExtractedBytes = 1000

# Find all PDF files. -Filter can also match longer extensions (e.g. ".pdfx")
# and directories, so the result is narrowed to real *.pdf files; @() keeps
# .Count reliable when there is exactly one match.
$pdfFiles = @(
    Get-ChildItem -LiteralPath $sourceFull -Filter "*.pdf" -Recurse |
        Where-Object { -not $_.PSIsContainer -and $_.Extension -eq '.pdf' }
)
Write-Host "Found $($pdfFiles.Count) PDF files" -ForegroundColor Yellow
Write-Host ""

<#
.SYNOPSIS
    Extracts text with iTextSharp (if the DLL is available).
.PARAMETER pdfPath
    Full path of the PDF to read.
.PARAMETER outputPath
    Full path of the text file to write.
.OUTPUTS
    $true when the text file was written; $false when the DLL is missing or
    any step failed.
#>
function Extract-PDF-iTextSharp {
    param($pdfPath, $outputPath)

    $itextPath = "C:\DEV\COPILOT\packages\iTextSharp.5.5.13.3\lib\itextsharp.dll"
    if (-not (Test-Path -LiteralPath $itextPath)) {
        return $false
    }

    $reader = $null
    try {
        Add-Type -Path $itextPath
        $reader = New-Object iTextSharp.text.pdf.PdfReader($pdfPath)
        $pageCount = $reader.NumberOfPages

        # StringBuilder avoids re-copying the whole text for every page.
        $buffer = New-Object System.Text.StringBuilder
        for ($page = 1; $page -le $pageCount; $page++) {
            $pageText = [iTextSharp.text.pdf.parser.PdfTextExtractor]::GetTextFromPage($reader, $page)
            # [void] keeps Append's return value out of the function output,
            # which callers evaluate as a boolean.
            [void]$buffer.Append($pageText)
            [void]$buffer.Append("`r`n`r`n--- Page $page ---`r`n`r`n")

            if ($page % 10 -eq 0) {
                Write-Host " Extracted page $page of $pageCount..." -ForegroundColor DarkGray
            }
        }

        [System.IO.File]::WriteAllText($outputPath, $buffer.ToString())
        return $true
    }
    catch {
        return $false
    }
    finally {
        # Always release the file handle, including when extraction throws.
        if ($reader) { $reader.Close() }
    }
}

<#
.SYNOPSIS
    Extracts text by opening the PDF in Word via COM (fallback method).
.PARAMETER pdfPath
    Full path of the PDF to read.
.PARAMETER outputPath
    Full path of the text file to write.
.OUTPUTS
    $true when the text file was written; $false if Word is unavailable or any
    step failed.
#>
function Extract-PDF-Word {
    param($pdfPath, $outputPath)

    $word = $null
    $doc = $null
    try {
        $word = New-Object -ComObject Word.Application
        $word.Visible = $false
        # 0 = wdAlertsNone: a prompt on an invisible Word instance would block
        # the script indefinitely.
        $word.DisplayAlerts = 0

        # Arguments: FileName, ConfirmConversions = false, ReadOnly = true.
        $doc = $word.Documents.Open($pdfPath, $false, $true)
        $text = $doc.Content.Text
        [System.IO.File]::WriteAllText($outputPath, $text)
        return $true
    }
    catch {
        return $false
    }
    finally {
        # Best-effort cleanup: errors are deliberately ignored here so that a
        # failure to close cannot mask the result or leave WINWORD running.
        if ($doc) {
            try { $doc.Close($false) } catch { }
            [System.Runtime.Interopservices.Marshal]::ReleaseComObject($doc) | Out-Null
        }
        if ($word) {
            try { $word.Quit() } catch { }
            [System.Runtime.Interopservices.Marshal]::ReleaseComObject($word) | Out-Null
        }
    }
}

<#
.SYNOPSIS
    Extracts text with Python/PyPDF2 by running a temporary Python script.
.PARAMETER pdfPath
    Full path of the PDF to read.
.PARAMETER outputPath
    Full path of the text file to write (UTF-8).
.OUTPUTS
    $true when Python exited with code 0 and printed SUCCESS; otherwise $false
    (Python missing, PyPDF2 missing, or extraction error).
#>
function Extract-PDF-Python {
    param($pdfPath, $outputPath)

    # Literal (single-quoted) here-string: nothing in the Python source is
    # subject to PowerShell variable or backtick expansion.
    $pythonScript = @'
import sys
try:
    import PyPDF2
    with open(sys.argv[1], 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ''
        for i, page in enumerate(reader.pages):
            text += page.extract_text() or ''
            text += f'\n\n--- Page {i+1} ---\n\n'
    with open(sys.argv[2], 'w', encoding='utf-8') as output:
        output.write(text)
    print('SUCCESS')
except ImportError:
    print('PyPDF2 not installed')
    sys.exit(1)
except Exception as e:
    print(f'ERROR: {e}')
    sys.exit(1)
'@

    # Unique name so concurrent runs do not overwrite each other's script.
    $tempName = "extract_pdf_temp_{0}.py" -f ([guid]::NewGuid().ToString('N'))
    $tempScript = Join-Path ([System.IO.Path]::GetTempPath()) $tempName

    try {
        [System.IO.File]::WriteAllText($tempScript, $pythonScript)

        $result = python $tempScript $pdfPath $outputPath 2>&1
        # Stderr lines arrive as error records; compare on their text form.
        $lines = @($result | ForEach-Object { "$_" })
        if ($LASTEXITCODE -eq 0 -and ($lines -contains 'SUCCESS')) {
            return $true
        }
        return $false
    }
    catch {
        # Python not available (or the temp script could not be written).
        return $false
    }
    finally {
        if (Test-Path -LiteralPath $tempScript) {
            Remove-Item -LiteralPath $tempScript -Force
        }
    }
}

# Process each PDF
$successCount = 0
$failCount = 0

foreach ($pdf in $pdfFiles) {
    $relativePath = $pdf.FullName.Substring($relativeStart)
    $outputFile = Join-Path $outputRoot ([System.IO.Path]::ChangeExtension($relativePath, '.txt'))
    $outputDir = [System.IO.Path]::GetDirectoryName($outputFile)

    Write-Host "Processing: $relativePath" -ForegroundColor White

    # Create subdirectory if needed (no-op when it already exists).
    [System.IO.Directory]::CreateDirectory($outputDir) | Out-Null

    # Skip if already extracted. -LiteralPath keeps names containing [ or ]
    # from being interpreted as wildcard patterns.
    if (Test-Path -LiteralPath $outputFile) {
        $fileInfo = Get-Item -LiteralPath $outputFile
        if ($fileInfo.Length -gt $minExtractedBytes) {
            Write-Host " SKIPPED: Already extracted ($($fileInfo.Length) bytes)" -ForegroundColor DarkGray
            $successCount++
            continue
        }
    }

    # Try different extraction methods
    $success = $false

    Write-Host " Trying iTextSharp..." -ForegroundColor DarkGray
    if (Extract-PDF-iTextSharp $pdf.FullName $outputFile) {
        Write-Host " SUCCESS: Extracted with iTextSharp" -ForegroundColor Green
        $success = $true
    }

    if (-not $success) {
        Write-Host " Trying Python/PyPDF2..." -ForegroundColor DarkGray
        if (Extract-PDF-Python $pdf.FullName $outputFile) {
            Write-Host " SUCCESS: Extracted with Python" -ForegroundColor Green
            $success = $true
        }
    }

    if (-not $success) {
        Write-Host " Trying Word COM..." -ForegroundColor DarkGray
        if (Extract-PDF-Word $pdf.FullName $outputFile) {
            Write-Host " SUCCESS: Extracted with Word" -ForegroundColor Green
            $success = $true
        }
    }

    if ($success) {
        $successCount++
        $fileSize = (Get-Item -LiteralPath $outputFile).Length
        Write-Host " Output: $outputFile ($fileSize bytes)" -ForegroundColor Cyan
    }
    else {
        $failCount++
        Write-Host " FAILED: Could not extract text" -ForegroundColor Red

        # Create placeholder file (written as UTF-8).
        $placeholder = "PDF text extraction failed for: $($pdf.FullName)" + [Environment]::NewLine
        [System.IO.File]::WriteAllText($outputFile, $placeholder)
    }

    Write-Host ""
}

Write-Host "=== Extraction Complete ===" -ForegroundColor Cyan
Write-Host "Success: $successCount" -ForegroundColor Green
Write-Host "Failed: $failCount" -ForegroundColor $(if ($failCount -gt 0) { 'Red' } else { 'Green' })
Write-Host ""
Write-Host "Output directory: $OutputPath" -ForegroundColor Yellow