├── .gitignore ├── Initialize-Environment.ps1 ├── README.md ├── Sort-Files.ps1 ├── datelib.psm1 ├── itextsharplib.psm1 └── tesseractlib.psm1 /.gitignore: -------------------------------------------------------------------------------- 1 | Input/ 2 | Output/ 3 | Lib/ 4 | Test/ 5 | *.zip 6 | *.jpg -------------------------------------------------------------------------------- /Initialize-Environment.ps1: --------------------------------------------------------------------------------
#
# Title:     Initialize-Environment.ps1
# Author:    Jourdan Templeton
# Email:     hello@jourdant.me
# Modified:  21/07/2015 04:35PM NZDT
#
# Purpose:   Creates the Input, Output and Lib folders next to this script,
#            then downloads the Tesseract, tesseract-ocr and iTextSharp NuGet
#            packages and extracts the matching files into the Lib folder.
#

#properties
#================================================================
$tesseract_url = "https://nuget.org/api/v2/package/Tesseract"
$tessdata_url = "https://nuget.org/api/v2/package/tesseract-ocr"
$itextsharp_url = "https://nuget.org/api/v2/package/iTextSharp"

$tesseract_zip_name = "$PSScriptRoot\tesseract.zip"
$tessdata_zip_name = "$PSScriptRoot\tessdata.zip"
$itextsharp_zip_name = "$PSScriptRoot\itextsharp.zip"

$input_dir_name = "$PSScriptRoot\Input"
$output_dir_name = "$PSScriptRoot\Output"
$lib_dir_name = "$PSScriptRoot\Lib"
#================================================================

<#
.SYNOPSIS
    Downloads a package archive (unless it is already on disk) and extracts the
    entries whose path matches a regular expression.
.PARAMETER Url
    Address the archive is downloaded from when ZipPath does not exist yet.
.PARAMETER ZipPath
    Local path of the archive. The archive is deleted once processing ends.
.PARAMETER Pattern
    Regular expression tested against each entry's full path inside the archive.
.PARAMETER Destination
    Directory the matching entries are extracted to (created on demand).
.PARAMETER ArchitectureSubfolders
    When set, entries whose path contains "content" are placed in a subfolder
    named after the text matched by Pattern (e.g. x86 / x64).
#>
Function Expand-PackageFile
{
    Param(
        [Parameter(Mandatory=$true)][string]$Url,
        [Parameter(Mandatory=$true)][string]$ZipPath,
        [Parameter(Mandatory=$true)][string]$Pattern,
        [Parameter(Mandatory=$true)][string]$Destination,
        [switch]$ArchitectureSubfolders
    )

    If (-not (Test-Path $ZipPath)) {
        Write-Output "Downloading: '$Url' To: '$ZipPath'"
        Invoke-WebRequest -Uri $Url -OutFile $ZipPath
    }

    #nothing to extract if the download did not produce a file
    If (-not (Test-Path $ZipPath)) { return }

    $zip = $null
    Try
    {
        $zip = [IO.Compression.ZipFile]::OpenRead($ZipPath)

        ForEach ($entry in $zip.Entries)
        {
            #directory entries have an empty Name and cannot be extracted to a file
            If ([string]::IsNullOrEmpty($entry.Name)) { continue }

            #use the -match operator (not Where-Object's -match parameter) so that
            #$Matches is guaranteed to describe the current entry
            If (-not ($entry.FullName -match $Pattern)) { continue }

            $dir = $Destination
            If ($ArchitectureSubfolders -and $entry.FullName.Contains("content")) { $dir = Join-Path $dir $Matches[0] }
            If (-not (Test-Path $dir)) { New-Item -ItemType Directory -Path $dir | Out-Null }

            [IO.Compression.ZipFileExtensions]::ExtractToFile($entry, (Join-Path $dir $entry.Name), $true)
        }
    }
    Finally
    {
        #always release the file handle before deleting the archive
        If ($zip -ne $null) { $zip.Dispose() }
        Remove-Item $ZipPath -Force
    }
}

#create dir structure
ForEach ($dir_name in @($input_dir_name, $output_dir_name, $lib_dir_name)) {
    If (-not (Test-Path $dir_name)) { New-Item -ItemType Directory -Path $dir_name | Out-Null }
}
$lib_dir = (Get-Item $lib_dir_name).FullName

#import assemblies into session
Add-Type -Assembly "System.IO.Compression.FileSystem"

#download and extract tesseract libraries
Expand-PackageFile -Url $tesseract_url -ZipPath $tesseract_zip_name -Pattern "(x86|x64)|net451/tesseract\.dll" -Destination $lib_dir -ArchitectureSubfolders

#download and extract tesseract data files
Expand-PackageFile -Url $tessdata_url -ZipPath $tessdata_zip_name -Pattern "eng" -Destination (Join-Path $lib_dir "tessdata")

#download and extract iTextSharp library
Expand-PackageFile -Url $itextsharp_url -ZipPath $itextsharp_zip_name -Pattern "itextsharp.dll" -Destination $lib_dir
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # powershell-paperless 2 | _Going paperless in a world filled with paper._ 3 | 4 | > **If you have just arrived here, check out my blog post for a better explanation: [http://blog.jourdant.me/powershell-and-tesseract-going-paperless-with-ocr/](http://blog.jourdant.me/powershell-and-tesseract-going-paperless-with-ocr/)** 5 | 6 | These scripts will allow you to harness the power of Tesseract and PowerShell to sort scanned copies of documents. Or use OCR on any image in general. 7 | 8 | 9 | ## Usage 10 | 1. Clone repo to your machine 11 | 2. Run Initialize-Environment.ps1 - this will download the necessary libraries and extract the files to the correct locations 12 | 3. Run Sort-Files.ps1 with your directories 13 | 4. High five! 14 | 15 |
16 |
17 |
18 | 19 | ###### Jourdan Templeton - 2015 20 | -------------------------------------------------------------------------------- /Sort-Files.ps1: --------------------------------------------------------------------------------
#
# Title:     Sort-Files.ps1
# Author:    Jourdan Templeton
# Email:     hello@jourdant.me
# Modified:  10/01/2015 21:52PM NZDT
#
# Purpose:   Copies every PDF found under InputDirectory to
#            OutputDirectory/<yyyy>/<MM-MMM>/<yyyy-MM-dd>_<name> using the date
#            returned by Find-Date for the file's text. Files for which no date
#            can be determined are copied to OutputDirectory/Unknown.
#

[CmdletBinding(SupportsShouldProcess=$true)]Param(
    [Parameter(Mandatory=$true)][string]$InputDirectory,
    [Parameter(Mandatory=$true)][string]$OutputDirectory
)

#check paths
If ((Test-Path $InputDirectory) -eq $False) { Throw "Input directory does not exist." } Else { $InputDirectory = (Get-Item $InputDirectory).FullName }
"Input Directory:" + $InputDirectory
If ((Test-Path $OutputDirectory) -eq $False) { Throw "Output directory does not exist." } Else { $OutputDirectory = (Get-Item $OutputDirectory).FullName }
"Output Directory: " + $OutputDirectory + " `r`n"

#import libraries (looked up next to this script, not in the caller's current directory)
Remove-Module *lib
Get-ChildItem -Path $PSScriptRoot -Filter "*lib.psm1" | % { Import-Module $_.FullName }

#get files - @() keeps .Count reliable when exactly one file is returned
$files = @(Get-ChildItem -Path $InputDirectory -Recurse -Filter *.pdf | Where-Object { !$_.PSIsContainer })
"Total files to process: " + $files.Count

#process files
If ($files.Count -lt 1) { "No files to process. Closing..."; return }
ForEach ($file in $files)
{
    "Processing: " + $file.FullName

    #ocr image with tesseract
    #NOTE(review): the query above only returns *.pdf, so this branch is currently never taken
    If ($file.Name.Split('.')[-1] -notmatch "pdf")
    {
        $image = New-Object System.Drawing.Bitmap($file.FullName)
        Try
        {
            $ocr = Get-TessTextFromImage -Image $image
        }
        Finally
        {
            #the bitmap holds a lock on the source file until it is disposed
            $image.Dispose()
        }
        "Confidence: " + ($ocr.Confidence * 100).ToString("##") + "%"

        $text = $ocr.Text
        $ocr = $null
    }
    #process with itextsharp
    Else
    {
        $text = Get-ItsTextFromPdf -Path $file.FullName
    }

    #work out the destination
    $path = ''
    $output = ''

    Try
    {
        #parse date and sort
        $date = Find-Date -InputText $text

        #'/' is quoted because an unquoted "/" in a .NET date format is replaced by the
        #current culture's date separator, which would produce the wrong folder layout
        $path = $OutputDirectory + "/" + $date.ToString("yyyy'/'MM-MMM")
        $output = $path + "/" + $date.ToString("yyyy-MM-dd_") + $file.Name.Replace(" ", "")
    }
    Catch
    {
        #best effort: anything that cannot be dated is kept, just unsorted
        $path = $OutputDirectory + "/Unknown"
        $output = $path + "/" + $file.Name
    }

    #create output dir
    If ((Test-Path $path) -eq $false) { mkdir $path | Out-Null }
    "Copying to: '" + $output + "'"

    #-LiteralPath so that [ and ] in file names are not treated as wildcards
    Copy-Item -LiteralPath $file.FullName -Destination $output -Force

    $text = $null

    "`r`n"
}
-------------------------------------------------------------------------------- /datelib.psm1: -------------------------------------------------------------------------------- 1 | # 2 | # Title: datelib.psm1 3 | # Author: Jourdan Templeton 4 | # Email: hello@jourdant.me 5 | # Modified: 10/01/2015 21:49PM NZDT 6 | # 7 | 8 | ###regex breakdown 9 | $r_day = '(?
<#
.SYNOPSIS
    Returns the first plausible date found in a block of text.
.DESCRIPTION
    The text is upper-cased and tested against every expression in $script:patterns.
    For each expression the first match that is longer than five characters, parses
    as a DateTime and has a year between $script:min_year and $script:max_year is
    kept; the date kept for the earliest pattern is returned.
    If nothing is found and the text contains spaces, the search is repeated once
    more with all spaces removed.
    NOTE(review): $script:patterns, $script:min_year and $script:max_year are
    defined elsewhere in this module.
.PARAMETER InputText
    The text to search.
.OUTPUTS
    System.DateTime. Throws "No date could be found." when no date can be extracted.
#>
Function Find-Date([Parameter(Mandatory=$true)][string]$InputText)
{
    If ($InputText -eq $null) { throw "InputText cannot be null" }

    $text = $InputText.ToUpper()
    Write-Verbose $text

    #capture all dates within the text
    $dates = @()
    ForEach ($regex in $script:patterns)
    {
        Write-Verbose $regex

        #a local name is used here on purpose: $matches is an automatic variable
        $found = ([regex]$regex).Matches($text)
        If ($found.Count -gt 0)
        {
            Write-Verbose ($found -join ' ')

            #select all parsable dates between set range (clean out as many false positives and mistakes as possible)
            #@() guarantees an array so that .Count and [0] behave for zero or one result
            $parsed = @($found |
                Where-Object { $_.Value.Length -gt 5 } |
                ForEach-Object { Try { [DateTime]::Parse($_.Value) } Catch {} } |
                Where-Object { $_.Year -ge $script:min_year -and $_.Year -le $script:max_year })

            Write-Verbose ("Total Matches: " + $parsed.Count)
            If ($parsed.Count -gt 0) { $dates += $parsed[0] }
        }
    }

    #final logic
    If ($dates.Count -gt 0)
    {
        #optional: custom date selection logic eg:
        #If ($InputText -match "statement|bank|account") { $dates = $dates | Sort -Descending }

        return $dates[0]
    }

    If ($InputText.Contains(' ') -eq $false) { throw "No date could be found." }

    #recurse with text minus spaces; an all-space input would otherwise fail
    #mandatory parameter binding instead of reporting that no date exists
    $compact = $InputText.Replace(' ', '')
    If ($compact.Length -eq 0) { throw "No date could be found." }
    return Find-Date -InputText $compact
}
-------------------------------------------------------------------------------- /itextsharplib.psm1: --------------------------------------------------------------------------------
#
# Title:     itextsharplib.psm1
# Author:    Jourdan Templeton
# Email:     hello@jourdant.me
# Modified:  10/01/2015 21:49PM NZDT
#

Add-Type -Path "$PSScriptRoot\Lib\itextsharp.dll"

<#
.SYNOPSIS
    This cmdlet loads a PDF file and returns the text content of its first page.
.DESCRIPTION
    This cmdlet loads a PDF file and returns the text content of its first page.
    NOTE: this only applies to documents that have text fields embedded. This does
    not apply to text contained in images of the PDF.
.PARAMETER Path
    The path to the PDF file to be processed.
.EXAMPLE
    Get-ItsTextFromPdf -Path "C:\temp\test.pdf"
.EXAMPLE
    $text = Get-ChildItem "C:\Temp" -Filter *.pdf | Get-ItsTextFromPdf
#>
Function Get-ItsTextFromPdf()
{
    Param(
        #ValueFromPipelineByPropertyName lets the FullName alias bind piped file objects
        [Parameter(Mandatory=$true, ValueFromPipeline=$true, ValueFromPipelineByPropertyName=$true)][Alias("FullName")][String]$Path
    )
    Process {
        #.NET resolves relative paths against the process directory, not the
        #PowerShell location, so hand the reader an absolute path
        $fullPath = (Resolve-Path -LiteralPath $Path -ErrorAction Stop).ProviderPath

        #construct reader object and prepare for reading
        $reader = New-Object iTextSharp.text.pdf.PdfReader($fullPath)
        Try
        {
            #read pdf (first page only)
            [iTextSharp.text.pdf.parser.PdfTextExtractor]::GetTextFromPage($reader, 1)
        }
        Finally
        {
            #clean up references even when extraction fails
            $reader.Dispose()
        }
    }
}
-------------------------------------------------------------------------------- /tesseractlib.psm1: --------------------------------------------------------------------------------
#
# Title:     tesseractlib.psm1
# Author:    Jourdan Templeton
# Email:     hello@jourdant.me
# Modified:  04/01/2015 08:30PM NZDT
#

Add-Type -AssemblyName "System.Drawing"
Add-Type -Path "$PSScriptRoot\Lib\Tesseract.dll"

#tessdata is located relative to this module, like the assembly above, so that
#importing the module does not depend on the caller's current directory
$tesseract = New-Object Tesseract.TesseractEngine((Get-Item "$PSScriptRoot\Lib\tessdata").FullName, "eng", [Tesseract.EngineMode]::Default, $null)

<#
.SYNOPSIS
    This cmdlet loads either a file path or image and returns the text contained with the confidence.
.DESCRIPTION
    This cmdlet loads either a file path or image and returns the text contained with the confidence.
    You can pipe in either System.Drawing.Image file or a child-item object.
.PARAMETER Image
    The image file already loaded into memory. The caller remains responsible for disposing it.
.PARAMETER Path
    The path to the image to be processed (alias: FullName).
.OUTPUTS
    PSObject with the properties Text and Confidence.
.EXAMPLE
    $image = New-Object System.Drawing.Bitmap("c:\test.jpg")
    Get-TessTextFromImage -Image $image
.EXAMPLE
    New-Object System.Drawing.Bitmap("C:\test.jpg") | Get-TessTextFromImage
.EXAMPLE
    Get-TessTextFromImage -Path "c:\test.jpg"
#>
Function Get-TessTextFromImage()
{
    Param(
        [Parameter(Mandatory=$true, ValueFromPipeline=$true, ParameterSetName="ImageObject")][System.Drawing.Image]$Image,
        [Parameter(Mandatory=$true, ValueFromPipeline=$true, ValueFromPipelineByPropertyName=$true, ParameterSetName="FilePath")][Alias("FullName")][String]$Path
    )
    Process {
        #load image if path is a param; only an image loaded here is disposed here
        $ownsImage = $PsCmdlet.ParameterSetName -eq "FilePath"
        If ($ownsImage) { $Image = New-Object System.Drawing.Bitmap((Get-Item $Path).FullName) }

        $pix = $null
        $page = $null
        Try
        {
            #perform OCR on image
            $pix = [Tesseract.PixConverter]::ToPix($Image)
            $page = $tesseract.Process($pix)

            #build return object
            New-Object PSObject -Property @{"Text"= $page.GetText();
                                            "Confidence"= $page.GetMeanConfidence()}
        }
        Finally
        {
            #clean up references - the page first, then the pix it was created from
            If ($page -ne $null) { $page.Dispose() }
            If ($pix -ne $null) { $pix.Dispose() }
            If ($ownsImage -and $Image -ne $null) { $Image.Dispose() }
        }
    }
}
--------------------------------------------------------------------------------