diff --git a/config.conf b/config.conf index e69de29..544b05e 100644 --- a/config.conf +++ b/config.conf @@ -0,0 +1,11 @@ +{ + "watchFolder": "C:\\Users\\dell\\Downloads\\watchfolder", + "logFile": "C:\\Users\\dell\\Downloads\\fileflow\\FileFlow.log", + "libreOfficePath": "C:\\Program Files\\LibreOffice\\program\\soffice.exe", + "maxRetries": 5, + "retryDelaySec": 5, + "scanIntervalSec": 15, + "pdfToDocxOutputFolder": "C:\\Users\\dell\\Downloads\\converted_from_pdf", + "processedPdfFolder": "C:\\Users\\dell\\Downloads\\processed_pdfs", + "processedDocxFolder": "C:\\Users\\dell\\Downloads\\processed_docx" +} \ No newline at end of file diff --git a/config.json b/config.json index be97287..cb3e82c 100644 --- a/config.json +++ b/config.json @@ -1,8 +1,14 @@ { - "watchFolder": "C:\\Users\\mahaj\\OneDrive\\Documents\\WatchFolder", - "logFile": "C:\\Users\\mahaj\\Downloads\\fileflow\\engine\\FileFlow.log", + "watchFolder": "C:\\Users\\dell\\Downloads\\watchfolder", + "logFile": "C:\\Users\\dell\\Downloads\\fileflow\\FileFlow.log", "libreOfficePath": "C:\\Program Files\\LibreOffice\\program\\soffice.exe", + + "pdfToDocxOutputFolder": "C:\\Users\\dell\\Downloads\\converted_from_pdf", + "processedPdfFolder": "C:\\Users\\dell\\Downloads\\processed_pdfs", + "processedDocxFolder": "C:\\Users\\dell\\Downloads\\processed_docx", + "failedFolder": "C:\\Users\\dell\\Downloads\\watchfolder\\failed", + + "scanIntervalSec": 60, "maxRetries": 5, - "retryDelaySec": 5, - "scanIntervalSec": 15 -} \ No newline at end of file + "retryDelaySec": 5 +} diff --git a/engine/Start-FileFlow.ps1 b/engine/Start-FileFlow.ps1 index 4bc5f47..1472245 100644 --- a/engine/Start-FileFlow.ps1 +++ b/engine/Start-FileFlow.ps1 @@ -1,90 +1,224 @@ -<# -.SYNOPSIS - The definitive, stable version of FileFlow. This script is fully configurable - and 100% compatible with Windows PowerShell 5.1. -#> - -# --- Configuration --- -$configFile = Join-Path (Split-Path $MyInvocation.MyCommand.Path -Parent) "..\config.json" -if (-not (Test-Path $configFile)) { - Write-Host "FATAL ERROR: config.json not found. Make sure it is in the main 'fileflow' directory." +# ===================================================================== +# FileFlow – Automatic PDF <-> DOCX + PPTX + IMAGE converter with OCR +# ===================================================================== + +# ---------- 1. Load config ---------- +$configPath = Join-Path $PSScriptRoot "..\config.json" +if (-not (Test-Path $configPath)) { + Write-Host "Config file not found: $configPath" exit 1 } -$config = Get-Content -Raw -Path $configFile | ConvertFrom-Json -$watchFolder = $config.watchFolder -$logFile = $config.logFile -$libreOfficePath = $config.libreOfficePath -$maxRetries = $config.maxRetries -$retryDelaySec = $config.retryDelaySec -$scanIntervalSec = $config.scanIntervalSec +$config = Get-Content $configPath | ConvertFrom-Json -# --- Logging function --- -function Write-Log { - param([string]$Message) - # THE FIX IS HERE: We pass the $logFile path as a parameter to this function. - Add-Content -Path $logFile -Value "$(Get-Date -Format 'yyyy-MM-dd HH:mm:ss') - $Message" +$watchFolder = $config.watchFolder +$logFile = $config.logFile +$libreOfficePath = $config.libreOfficePath + +$pdfToDocxOutputFolder = $config.pdfToDocxOutputFolder +$processedPdfFolder = $config.processedPdfFolder +$processedDocxFolder = $config.processedDocxFolder +$failedFolder = $config.failedFolder + +$scanInterval = $config.scanIntervalSec + +# ---------- 2. Ensure folders exist ---------- +$folders = @( + $watchFolder, + $pdfToDocxOutputFolder, + $processedPdfFolder, + $processedDocxFolder, + $failedFolder, + (Split-Path $logFile -Parent) +) | Where-Object { $_ -and ($_ -ne "") } + +foreach ($folder in $folders) { + if (-not (Test-Path $folder)) { + New-Item -ItemType Directory -Path $folder -Force | Out-Null + } } -# --- Main Conversion Logic --- -function Process-File { +# ---------- 3. Logging helper ---------- +function Write-Log { param( - [string]$FilePath, - [string]$LogPath, - [string]$ConverterPath, - [int]$Retries, - [int]$Delay + [string]$Level, + [string]$Message ) + $ts = (Get-Date).ToString("yyyy-MM-dd HH:mm:ss") + $line = "$ts - [$Level] $Message" + $line | Out-File -FilePath $logFile -Append -Encoding utf8 + Write-Host $line +} + +Write-Log "INFO" "=======================================================" +Write-Log "INFO" "FileFlow Activated. Monitoring '$watchFolder'." +Write-Log "INFO" "LibreOffice path: $libreOfficePath" +Write-Log "INFO" "PDF->DOCX output: $pdfToDocxOutputFolder" +Write-Log "INFO" "Processed PDFs: $processedPdfFolder" +Write-Log "INFO" "Processed DOCX/PPTX PDFs: $processedDocxFolder" +Write-Log "INFO" "Failed folder: $failedFolder" + +# Path to helper Python script for PDF->DOCX +$pdf2docxScript = Join-Path $PSScriptRoot "pdf2docx_run.py" + +# ---------- 4. Main loop ---------- +while ($true) { + + $files = Get-ChildItem -Path $watchFolder -File -ErrorAction SilentlyContinue - $fileName = (Get-Item -LiteralPath $FilePath).Name - $processedFolder = Join-Path (Split-Path $FilePath -Parent) "processed" - $lockFilePath = "$FilePath.lock" + if (-not $files) { + Start-Sleep -Seconds $scanInterval + continue + } - if (Test-Path $lockFilePath) { return } - New-Item -Path $lockFilePath -ItemType File | Out-Null - # Call Write-Log with the correct log path variable from the parameters - Write-Log "INFO: Detected and locked '$fileName' for processing." + foreach ($f in $files) { + $full = $f.FullName + $ext = $f.Extension.ToLower() + $base = $f.BaseName - for ($i = 1; $i -le $Retries; $i++) { try { - if (-not (Test-Path $processedFolder)) { - New-Item -ItemType Directory -Path $processedFolder -ErrorAction Stop | Out-Null + + # ====================================================== + # A) PDF -> DOCX (with OCR) + # ====================================================== + if ($ext -eq ".pdf") { + + Write-Log "INFO" "Detected PDF '$($f.Name)' -> Running OCR." + + $ocrTemp = Join-Path $watchFolder "ocr_temp.pdf" + + # Clean temp if exists + if (Test-Path $ocrTemp) { Remove-Item -Force $ocrTemp -ErrorAction SilentlyContinue } + + # Run ocrmypdf exactly as we tested + Write-Log "INFO" "OCR Running..." + ocrmypdf --force-ocr --output-type pdf "$full" "$ocrTemp" + $ocrExit = $LASTEXITCODE + Write-Log "DEBUG" "OCR exit code: $ocrExit" + + if ($ocrExit -ne 0 -or -not (Test-Path $ocrTemp)) { + Write-Log "ERROR" "OCR FAILED for '$($f.Name)'; moving to FAILED." + Move-Item -Force "$full" (Join-Path $failedFolder $f.Name) + continue + } + + Write-Log "INFO" "OCR Success -> Converting PDF -> DOCX" + + $outDocx = Join-Path $pdfToDocxOutputFolder "$base.docx" + + # Run helper Python script + python "$pdf2docxScript" "$ocrTemp" "$outDocx" + $pdf2docxExit = $LASTEXITCODE + Write-Log "DEBUG" "pdf2docx exit code: $pdf2docxExit" + + if ($pdf2docxExit -eq 0 -and (Test-Path $outDocx)) { + Write-Log "INFO" "PDF->DOCX done: $outDocx" + # Move original PDF to processed_pdfs + Move-Item -Force "$full" (Join-Path $processedPdfFolder $f.Name) + } + else { + Write-Log "ERROR" "PDF->DOCX FAILED for '$($f.Name)'; moving to FAILED." + Move-Item -Force "$full" (Join-Path $failedFolder $f.Name) + } + + # Clean temp OCR file + if (Test-Path $ocrTemp) { Remove-Item -Force $ocrTemp -ErrorAction SilentlyContinue } } - $argumentList = "--headless --convert-to pdf `"$FilePath`" --outdir `"$processedFolder`"" - Start-Process -FilePath $ConverterPath -ArgumentList $argumentList -Wait -NoNewWindow - $destination = Join-Path $processedFolder $fileName - Move-Item -LiteralPath $FilePath -Destination $destination -Force - Write-Log "SUCCESS: Converted and moved '$fileName'." - Remove-Item $lockFilePath -Force - return - } - catch { - if ($i -lt $Retries) { - Write-Log "WARN: Attempt $i failed on '$fileName' (likely locked). Retrying in $Delay seconds..." - Start-Sleep -Seconds $Delay - } else { - Write-Log "ERROR: Failed to process '$fileName' after $Retries attempts. It remained locked." - Remove-Item $lockFilePath -Force + + # ====================================================== + # B) DOCX -> PDF (LibreOffice) + # ====================================================== + elseif ($ext -eq ".docx") { + + Write-Log "INFO" "Detected DOCX '$($f.Name)' -> Converting to PDF" + + $outPdf = Join-Path $processedDocxFolder ($base + ".pdf") + if (Test-Path $outPdf) { Remove-Item -Force $outPdf -ErrorAction SilentlyContinue } + + # Use EXACT command that worked manually: + & "$libreOfficePath" --headless --invisible --nologo --norestore ` + --convert-to pdf "$full" --outdir "$processedDocxFolder" + + # Give LibreOffice a moment to write the file + Start-Sleep -Seconds 3 + + if (Test-Path $outPdf) { + Write-Log "INFO" "DOCX→PDF SUCCESS: $outPdf" + # Move original DOCX out of watch folder (optional: to processed_docx) + Move-Item -Force "$full" (Join-Path $processedDocxFolder $f.Name) + } + else { + Write-Log "ERROR" "DOCX→PDF FAILED for '$($f.Name)' -> Moving to FAILED" + Move-Item -Force "$full" (Join-Path $failedFolder $f.Name) + } } - } - } -} -# --- Start Monitoring --- -Add-Content -Path $logFile -Value "=======================================================" -ErrorAction SilentlyContinue -Add-Content -Path $logFile -Value "$(Get-Date -Format 'yyyy-MM-dd HH:mm:ss') - FileFlow Activated. Monitoring '$watchFolder'." -Write-Host "FileFlow Activated. Monitoring for files. (Press Ctrl+C to stop)" + # ====================================================== + # C) PPTX -> PDF (LibreOffice) + # ====================================================== + elseif ($ext -eq ".pptx") { + + Write-Log "INFO" "Detected PPTX '$($f.Name)' -> Converting to PDF" + + $outPdf = Join-Path $processedDocxFolder ($base + ".pdf") + if (Test-Path $outPdf) { Remove-Item -Force $outPdf -ErrorAction SilentlyContinue } + + & "$libreOfficePath" --headless --invisible --nologo --norestore ` + --convert-to pdf "$full" --outdir "$processedDocxFolder" + + Start-Sleep -Seconds 3 + + if (Test-Path $outPdf) { + Write-Log "INFO" "PPTX→PDF SUCCESS: $outPdf" + Move-Item -Force "$full" (Join-Path $processedDocxFolder $f.Name) + } + else { + Write-Log "ERROR" "PPTX→PDF FAILED for '$($f.Name)' -> Moving to FAILED" + Move-Item -Force "$full" (Join-Path $failedFolder $f.Name) + } + } + + # ====================================================== + # D) IMAGE -> PDF (via ocrmypdf image support) + # ====================================================== + elseif ($ext -in @(".png", ".jpg", ".jpeg", ".tif", ".tiff")) { + + Write-Log "INFO" "Detected IMAGE '$($f.Name)' -> Converting to PDF" + + $outPdf = Join-Path $processedPdfFolder ($base + ".pdf") + if (Test-Path $outPdf) { Remove-Item -Force $outPdf -ErrorAction SilentlyContinue } + + # ocrmypdf can take image input directly + ocrmypdf --force-ocr --output-type pdf "$full" "$outPdf" + $imgExit = $LASTEXITCODE + Write-Log "DEBUG" "Image OCR/convert exit code: $imgExit" + + if ($imgExit -eq 0 -and (Test-Path $outPdf)) { + Write-Log "INFO" "IMAGE→PDF SUCCESS: $outPdf" + Move-Item -Force "$full" (Join-Path $processedPdfFolder $f.Name) + } + else { + Write-Log "ERROR" "Image→PDF FAILED for '$($f.Name)' -> Moving to FAILED" + Move-Item -Force "$full" (Join-Path $failedFolder $f.Name) + } + } + + # ====================================================== + # E) Unsupported types + # ====================================================== + else { + Write-Log "INFO" "Ignoring unsupported file type '$($f.Name)'" + } -# --- Main Loop --- -while ($true) { - $unlockedFiles = Get-ChildItem -Path $watchFolder -Filter "*.docx" | Where-Object { -not (Test-Path "$($_.FullName).lock") } - if ($unlockedFiles) { - Write-Log "INFO: Scan detected $($unlockedFiles.Count) new file(s)." - foreach ($file in $unlockedFiles) { - # We are already passing all the necessary variables here, so no change is needed. - Process-File -FilePath $file.FullName -LogPath $logFile -ConverterPath $libreOfficePath -Retries $maxRetries -Delay $retryDelaySec } - Write-Log "INFO: Batch complete." + catch { + Write-Log "ERROR" "Unexpected error for '$($f.Name)': $_" + try { + Move-Item -Force "$full" (Join-Path $failedFolder $f.Name) + } catch {} + } } - Start-Sleep -Seconds $scanIntervalSec -} \ No newline at end of file + + Write-Log "INFO" "Scan finished. Sleeping $scanInterval seconds." + Start-Sleep -Seconds $scanInterval +} diff --git a/engine/pdf2docx_run.py b/engine/pdf2docx_run.py new file mode 100644 index 0000000..645d919 --- /dev/null +++ b/engine/pdf2docx_run.py @@ -0,0 +1,16 @@ +# pdf2docx_run.py +# usage: python pdf2docx_run.py "input.pdf" "output.docx" +import sys +from pdf2docx import Converter + +if len(sys.argv) < 3: + print("Usage: pdf2docx_run.py ") + sys.exit(2) + +src = sys.argv[1] +dst = sys.argv[2] + +cv = Converter(src) +cv.convert(dst) +cv.close() +print("PDF->DOCX done:", dst)