Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions config.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
{
"watchFolder": "C:\\Users\\dell\\Downloads\\watchfolder",
"logFile": "C:\\Users\\dell\\Downloads\\fileflow\\FileFlow.log",
"libreOfficePath": "C:\\Program Files\\LibreOffice\\program\\soffice.exe",
"maxRetries": 5,
"retryDelaySec": 5,
"scanIntervalSec": 15,
"pdfToDocxOutputFolder": "C:\\Users\\dell\\Downloads\\converted_from_pdf",
"processedPdfFolder": "C:\\Users\\dell\\Downloads\\processed_pdfs",
"processedDocxFolder": "C:\\Users\\dell\\Downloads\\processed_docx"
}
16 changes: 11 additions & 5 deletions config.json
Original file line number Diff line number Diff line change
@@ -1,8 +1,14 @@
{
"watchFolder": "C:\\Users\\mahaj\\OneDrive\\Documents\\WatchFolder",
"logFile": "C:\\Users\\mahaj\\Downloads\\fileflow\\engine\\FileFlow.log",
"watchFolder": "C:\\Users\\dell\\Downloads\\watchfolder",
"logFile": "C:\\Users\\dell\\Downloads\\fileflow\\FileFlow.log",
"libreOfficePath": "C:\\Program Files\\LibreOffice\\program\\soffice.exe",

"pdfToDocxOutputFolder": "C:\\Users\\dell\\Downloads\\converted_from_pdf",
"processedPdfFolder": "C:\\Users\\dell\\Downloads\\processed_pdfs",
"processedDocxFolder": "C:\\Users\\dell\\Downloads\\processed_docx",
"failedFolder": "C:\\Users\\dell\\Downloads\\watchfolder\\failed",

"scanIntervalSec": 60,
"maxRetries": 5,
"retryDelaySec": 5,
"scanIntervalSec": 15
}
"retryDelaySec": 5
}
278 changes: 206 additions & 72 deletions engine/Start-FileFlow.ps1
Original file line number Diff line number Diff line change
@@ -1,90 +1,224 @@
<#
.SYNOPSIS
The definitive, stable version of FileFlow. This script is fully configurable
and 100% compatible with Windows PowerShell 5.1.
#>

# --- Configuration ---
$configFile = Join-Path (Split-Path $MyInvocation.MyCommand.Path -Parent) "..\config.json"
if (-not (Test-Path $configFile)) {
Write-Host "FATAL ERROR: config.json not found. Make sure it is in the main 'fileflow' directory."
# =====================================================================
# FileFlow – Automatic PDF <-> DOCX + PPTX + IMAGE converter with OCR
# =====================================================================

# ---------- 1. Load config ----------
$configPath = Join-Path $PSScriptRoot "..\config.json"
if (-not (Test-Path $configPath)) {
Write-Host "Config file not found: $configPath"
exit 1
}
$config = Get-Content -Raw -Path $configFile | ConvertFrom-Json

$watchFolder = $config.watchFolder
$logFile = $config.logFile
$libreOfficePath = $config.libreOfficePath
$maxRetries = $config.maxRetries
$retryDelaySec = $config.retryDelaySec
$scanIntervalSec = $config.scanIntervalSec
$config = Get-Content $configPath | ConvertFrom-Json

# --- Logging function ---
function Write-Log {
param([string]$Message)
# THE FIX IS HERE: We pass the $logFile path as a parameter to this function.
Add-Content -Path $logFile -Value "$(Get-Date -Format 'yyyy-MM-dd HH:mm:ss') - $Message"
$watchFolder = $config.watchFolder
$logFile = $config.logFile
$libreOfficePath = $config.libreOfficePath

$pdfToDocxOutputFolder = $config.pdfToDocxOutputFolder
$processedPdfFolder = $config.processedPdfFolder
$processedDocxFolder = $config.processedDocxFolder
$failedFolder = $config.failedFolder

$scanInterval = $config.scanIntervalSec

# ---------- 2. Ensure folders exist ----------
$folders = @(
$watchFolder,
$pdfToDocxOutputFolder,
$processedPdfFolder,
$processedDocxFolder,
$failedFolder,
(Split-Path $logFile -Parent)
) | Where-Object { $_ -and ($_ -ne "") }

foreach ($folder in $folders) {
if (-not (Test-Path $folder)) {
New-Item -ItemType Directory -Path $folder -Force | Out-Null
}
}

# --- Main Conversion Logic ---
function Process-File {
# ---------- 3. Logging helper ----------
function Write-Log {
param(
[string]$FilePath,
[string]$LogPath,
[string]$ConverterPath,
[int]$Retries,
[int]$Delay
[string]$Level,
[string]$Message
)
$ts = (Get-Date).ToString("yyyy-MM-dd HH:mm:ss")
$line = "$ts - [$Level] $Message"
$line | Out-File -FilePath $logFile -Append -Encoding utf8
Write-Host $line
}

Write-Log "INFO" "======================================================="
Write-Log "INFO" "FileFlow Activated. Monitoring '$watchFolder'."
Write-Log "INFO" "LibreOffice path: $libreOfficePath"
Write-Log "INFO" "PDF->DOCX output: $pdfToDocxOutputFolder"
Write-Log "INFO" "Processed PDFs: $processedPdfFolder"
Write-Log "INFO" "Processed DOCX/PPTX PDFs: $processedDocxFolder"
Write-Log "INFO" "Failed folder: $failedFolder"

# Path to helper Python script for PDF->DOCX
$pdf2docxScript = Join-Path $PSScriptRoot "pdf2docx_run.py"

# ---------- 4. Main loop ----------
while ($true) {

$files = Get-ChildItem -Path $watchFolder -File -ErrorAction SilentlyContinue

$fileName = (Get-Item -LiteralPath $FilePath).Name
$processedFolder = Join-Path (Split-Path $FilePath -Parent) "processed"
$lockFilePath = "$FilePath.lock"
if (-not $files) {
Start-Sleep -Seconds $scanInterval
continue
}

if (Test-Path $lockFilePath) { return }
New-Item -Path $lockFilePath -ItemType File | Out-Null
# Call Write-Log with the correct log path variable from the parameters
Write-Log "INFO: Detected and locked '$fileName' for processing."
foreach ($f in $files) {
$full = $f.FullName
$ext = $f.Extension.ToLower()
$base = $f.BaseName

for ($i = 1; $i -le $Retries; $i++) {
try {
if (-not (Test-Path $processedFolder)) {
New-Item -ItemType Directory -Path $processedFolder -ErrorAction Stop | Out-Null

# ======================================================
# A) PDF -> DOCX (with OCR)
# ======================================================
if ($ext -eq ".pdf") {

    # PDF branch: OCR the incoming PDF into a temporary file, convert that
    # temporary file to DOCX with the helper Python script, then move the
    # original PDF to the processed folder (success) or the failed folder.
    Write-Log "INFO" "Detected PDF '$($f.Name)' -> Running OCR."

    # The OCR output is a fixed-name scratch file inside the watch folder.
    # Because it lives in the watch folder, it must never be left behind:
    # the next scan would enumerate it as if it were a new input PDF.
    $ocrTemp = Join-Path $watchFolder "ocr_temp.pdf"

    # Clean temp if exists (leftover from an earlier interrupted run)
    if (Test-Path $ocrTemp) { Remove-Item -Force $ocrTemp -ErrorAction SilentlyContinue }

    # Run ocrmypdf exactly as we tested
    Write-Log "INFO" "OCR Running..."
    ocrmypdf --force-ocr --output-type pdf "$full" "$ocrTemp"
    $ocrExit = $LASTEXITCODE
    Write-Log "DEBUG" "OCR exit code: $ocrExit"

    if ($ocrExit -ne 0 -or -not (Test-Path $ocrTemp)) {
        Write-Log "ERROR" "OCR FAILED for '$($f.Name)'; moving to FAILED."
        # A failed OCR run may still have produced a partial output file.
        # Remove it here, because 'continue' below skips the cleanup at the
        # end of this branch.
        if (Test-Path $ocrTemp) { Remove-Item -Force $ocrTemp -ErrorAction SilentlyContinue }
        Move-Item -Force "$full" (Join-Path $failedFolder $f.Name)
        continue
    }

    Write-Log "INFO" "OCR Success -> Converting PDF -> DOCX"

    $outDocx = Join-Path $pdfToDocxOutputFolder "$base.docx"

    # Run helper Python script
    python "$pdf2docxScript" "$ocrTemp" "$outDocx"
    $pdf2docxExit = $LASTEXITCODE
    Write-Log "DEBUG" "pdf2docx exit code: $pdf2docxExit"

    # Success requires both a zero exit code and the DOCX actually existing.
    if ($pdf2docxExit -eq 0 -and (Test-Path $outDocx)) {
        Write-Log "INFO" "PDF->DOCX done: $outDocx"
        # Move original PDF to processed_pdfs
        Move-Item -Force "$full" (Join-Path $processedPdfFolder $f.Name)
    }
    else {
        Write-Log "ERROR" "PDF->DOCX FAILED for '$($f.Name)'; moving to FAILED."
        Move-Item -Force "$full" (Join-Path $failedFolder $f.Name)
    }

    # Clean temp OCR file
    if (Test-Path $ocrTemp) { Remove-Item -Force $ocrTemp -ErrorAction SilentlyContinue }
}
$argumentList = "--headless --convert-to pdf `"$FilePath`" --outdir `"$processedFolder`""
Start-Process -FilePath $ConverterPath -ArgumentList $argumentList -Wait -NoNewWindow
$destination = Join-Path $processedFolder $fileName
Move-Item -LiteralPath $FilePath -Destination $destination -Force
Write-Log "SUCCESS: Converted and moved '$fileName'."
Remove-Item $lockFilePath -Force
return
}
catch {
if ($i -lt $Retries) {
Write-Log "WARN: Attempt $i failed on '$fileName' (likely locked). Retrying in $Delay seconds..."
Start-Sleep -Seconds $Delay
} else {
Write-Log "ERROR: Failed to process '$fileName' after $Retries attempts. It remained locked."
Remove-Item $lockFilePath -Force

# ======================================================
# B) DOCX -> PDF (LibreOffice)
# ======================================================
elseif ($ext -eq ".docx") {

# DOCX branch: convert the document to PDF with LibreOffice, writing the PDF
# into $processedDocxFolder, then move the original DOCX to that same folder
# on success or to $failedFolder on failure.
Write-Log "INFO" "Detected DOCX '$($f.Name)' -> Converting to PDF"

# Expected output path: same base name as the input, with a .pdf extension.
$outPdf = Join-Path $processedDocxFolder ($base + ".pdf")
# Remove any existing PDF with that name before converting (presumably so a
# stale file from an earlier run cannot satisfy the existence check below).
if (Test-Path $outPdf) { Remove-Item -Force $outPdf -ErrorAction SilentlyContinue }

# Use EXACT command that worked manually:
& "$libreOfficePath" --headless --invisible --nologo --norestore `
--convert-to pdf "$full" --outdir "$processedDocxFolder"

# Give LibreOffice a moment to write the file
# NOTE(review): the wait is a fixed 3 seconds and no exit code is inspected;
# a conversion that takes longer will be reported as failed by the check below.
Start-Sleep -Seconds 3

# Success is decided solely by whether the expected PDF now exists.
if (Test-Path $outPdf) {
Write-Log "INFO" "DOCX→PDF SUCCESS: $outPdf"
# Move original DOCX out of watch folder (optional: to processed_docx)
Move-Item -Force "$full" (Join-Path $processedDocxFolder $f.Name)
}
else {
Write-Log "ERROR" "DOCX→PDF FAILED for '$($f.Name)' -> Moving to FAILED"
Move-Item -Force "$full" (Join-Path $failedFolder $f.Name)
}
}
}
}
}

# --- Start Monitoring ---
Add-Content -Path $logFile -Value "=======================================================" -ErrorAction SilentlyContinue
Add-Content -Path $logFile -Value "$(Get-Date -Format 'yyyy-MM-dd HH:mm:ss') - FileFlow Activated. Monitoring '$watchFolder'."
Write-Host "FileFlow Activated. Monitoring for files. (Press Ctrl+C to stop)"
# ======================================================
# C) PPTX -> PDF (LibreOffice)
# ======================================================
elseif ($ext -eq ".pptx") {

# PPTX branch: convert the presentation to PDF with LibreOffice. Both the
# generated PDF and, on success, the original PPTX go to $processedDocxFolder;
# on failure the original is moved to $failedFolder.
Write-Log "INFO" "Detected PPTX '$($f.Name)' -> Converting to PDF"

# Expected output path: same base name as the input, with a .pdf extension.
$outPdf = Join-Path $processedDocxFolder ($base + ".pdf")
# Remove any existing PDF with that name before converting (presumably so a
# stale file from an earlier run cannot satisfy the existence check below).
if (Test-Path $outPdf) { Remove-Item -Force $outPdf -ErrorAction SilentlyContinue }

# Same LibreOffice headless invocation as the DOCX branch.
& "$libreOfficePath" --headless --invisible --nologo --norestore `
--convert-to pdf "$full" --outdir "$processedDocxFolder"

# NOTE(review): fixed 3-second wait, no exit code inspected; a slow
# conversion will be reported as failed by the check below.
Start-Sleep -Seconds 3

# Success is decided solely by whether the expected PDF now exists.
if (Test-Path $outPdf) {
Write-Log "INFO" "PPTX→PDF SUCCESS: $outPdf"
Move-Item -Force "$full" (Join-Path $processedDocxFolder $f.Name)
}
else {
Write-Log "ERROR" "PPTX→PDF FAILED for '$($f.Name)' -> Moving to FAILED"
Move-Item -Force "$full" (Join-Path $failedFolder $f.Name)
}
}

# ======================================================
# D) IMAGE -> PDF (via ocrmypdf image support)
# ======================================================
elseif ($ext -in @(".png", ".jpg", ".jpeg", ".tif", ".tiff")) {

# Image branch: feed the image straight to ocrmypdf to produce a PDF in
# $processedPdfFolder, then move the original image to that same folder on
# success or to $failedFolder on failure.
# $ext was lower-cased when it was computed, so the list above only needs
# lower-case extensions.
Write-Log "INFO" "Detected IMAGE '$($f.Name)' -> Converting to PDF"

# Expected output path: same base name as the input, with a .pdf extension.
$outPdf = Join-Path $processedPdfFolder ($base + ".pdf")
# Remove any existing PDF with that name before converting.
if (Test-Path $outPdf) { Remove-Item -Force $outPdf -ErrorAction SilentlyContinue }

# ocrmypdf can take image input directly
ocrmypdf --force-ocr --output-type pdf "$full" "$outPdf"
# Capture the exit code immediately, before another native command overwrites it.
$imgExit = $LASTEXITCODE
Write-Log "DEBUG" "Image OCR/convert exit code: $imgExit"

# Success requires both a zero exit code and the output PDF actually existing.
if ($imgExit -eq 0 -and (Test-Path $outPdf)) {
Write-Log "INFO" "IMAGE→PDF SUCCESS: $outPdf"
Move-Item -Force "$full" (Join-Path $processedPdfFolder $f.Name)
}
else {
Write-Log "ERROR" "Image→PDF FAILED for '$($f.Name)' -> Moving to FAILED"
Move-Item -Force "$full" (Join-Path $failedFolder $f.Name)
}
}

# ======================================================
# E) Unsupported types
# ======================================================
else {
Write-Log "INFO" "Ignoring unsupported file type '$($f.Name)'"
}

# --- Main Loop ---
while ($true) {
$unlockedFiles = Get-ChildItem -Path $watchFolder -Filter "*.docx" | Where-Object { -not (Test-Path "$($_.FullName).lock") }
if ($unlockedFiles) {
Write-Log "INFO: Scan detected $($unlockedFiles.Count) new file(s)."
foreach ($file in $unlockedFiles) {
# We are already passing all the necessary variables here, so no change is needed.
Process-File -FilePath $file.FullName -LogPath $logFile -ConverterPath $libreOfficePath -Retries $maxRetries -Delay $retryDelaySec
}
Write-Log "INFO: Batch complete."
catch {
Write-Log "ERROR" "Unexpected error for '$($f.Name)': $_"
try {
Move-Item -Force "$full" (Join-Path $failedFolder $f.Name)
} catch {}
}
}
Start-Sleep -Seconds $scanIntervalSec
}

Write-Log "INFO" "Scan finished. Sleeping $scanInterval seconds."
Start-Sleep -Seconds $scanInterval
}
16 changes: 16 additions & 0 deletions engine/pdf2docx_run.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# pdf2docx_run.py
# usage: python pdf2docx_run.py "input.pdf" "output.docx"
"""Command-line helper that converts one PDF file to DOCX using pdf2docx.

Exit status: 0 on success, 2 when too few arguments are given. Any exception
raised by the conversion propagates, so the interpreter exits non-zero with a
traceback.
"""
import sys


def main(argv):
    """Convert argv[1] (input PDF) to argv[2] (output DOCX).

    Args:
        argv: Argument list in sys.argv layout (program name first).

    Returns:
        2 if fewer than two file arguments were supplied, otherwise 0.
    """
    if len(argv) < 3:
        print("Usage: pdf2docx_run.py <input.pdf> <output.docx>")
        return 2

    src = argv[1]
    dst = argv[2]

    # Imported here rather than at module level so the usage check above does
    # not depend on pdf2docx being importable.
    from pdf2docx import Converter

    cv = Converter(src)
    try:
        cv.convert(dst)
    finally:
        # Always release the converter, even when convert() raises.
        cv.close()

    print("PDF->DOCX done:", dst)
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))