├── .gitignore
├── Initialize-Environment.ps1
├── README.md
├── Sort-Files.ps1
├── datelib.psm1
├── itextsharplib.psm1
└── tesseractlib.psm1
/.gitignore:
--------------------------------------------------------------------------------
1 | Input/
2 | Output/
3 | Lib/
4 | Test/
5 | *.zip
6 | *.jpg
--------------------------------------------------------------------------------
/Initialize-Environment.ps1:
--------------------------------------------------------------------------------
1 | #
2 | # Title: Initialize-Environment.ps1
3 | # Author: Jourdan Templeton
4 | # Email: hello@jourdant.me
5 | # Modified: 21/07/2015 04:35PM NZDT
6 | #
7 |
8 | #properties
9 | #================================================================
10 | $tesseract_url = "https://nuget.org/api/v2/package/Tesseract"
11 | $tessdata_url = "https://nuget.org/api/v2/package/tesseract-ocr"
12 | $itextsharp_url = "https://nuget.org/api/v2/package/iTextSharp"
13 |
14 | $tesseract_zip_name = "$PSScriptRoot\tesseract.zip"
15 | $tessdata_zip_name = "$PSScriptRoot\tessdata.zip"
16 | $itextsharp_zip_name = "$PSScriptRoot\itextsharp.zip"
17 |
18 | $input_dir_name = "$PSScriptRoot\Input"
19 | $output_dir_name = "$PSScriptRoot\Output"
20 | $lib_dir_name = "$PSScriptRoot\Lib"
21 | #================================================================
22 |
23 | #helpers
24 | #================================================================
25 | # Downloads $Url to $ZipPath unless a file already exists at $ZipPath.
26 | Function Save-Package([string]$Url, [string]$ZipPath)
27 | {
28 |     If (Test-Path $ZipPath) { return }
29 |     Write-Output "Downloading: '$Url' To: '$ZipPath'"
30 |     Invoke-WebRequest -Uri $Url -OutFile $ZipPath
31 | }
32 |
33 | # Extracts every entry of $ZipPath whose full name matches $Pattern into
34 | # $Destination (flattened to the entry's file name), then deletes the zip.
35 | # With -UseMatchAsSubdirectory, entries whose path contains "content" are
36 | # placed in a sub-directory named after the matched text (e.g. x86 / x64).
37 | Function Expand-PackageEntries([string]$ZipPath, [string]$Pattern, [string]$Destination, [switch]$UseMatchAsSubdirectory)
38 | {
39 |     If (-not (Test-Path $ZipPath)) { return }
40 |
41 |     $zip = [IO.Compression.ZipFile]::OpenRead($ZipPath)
42 |     Try
43 |     {
44 |         ForEach ($entry in $zip.Entries)
45 |         {
46 |             #match explicitly (case-insensitive, like -match) instead of relying on $matches from an earlier pipeline stage
47 |             $match = [regex]::Match($entry.FullName, $Pattern, [Text.RegularExpressions.RegexOptions]::IgnoreCase)
48 |             If (-not $match.Success) { continue }
49 |             #an entry without a file name is a directory marker and cannot be extracted to a file
50 |             If ([string]::IsNullOrEmpty($entry.Name)) { continue }
51 |
52 |             $dir = $Destination
53 |             If ($UseMatchAsSubdirectory -and $entry.FullName.Contains("content")) { $dir += "\" + $match.Value }
54 |             If (-not (Test-Path $dir)) { mkdir $dir | Out-Null }
55 |
56 |             [IO.Compression.ZipFileExtensions]::ExtractToFile($entry, ($dir + "\" + $entry.Name), $true)
57 |         }
58 |     }
59 |     Finally
60 |     {
61 |         #release the archive even if an extraction fails, so the zip is not left open
62 |         $zip.Dispose()
63 |     }
64 |
65 |     Remove-Item $ZipPath -Force
66 | }
67 | #================================================================
68 |
69 | #create dir structure
70 | ForEach ($dir_name in @($input_dir_name, $output_dir_name, $lib_dir_name))
71 | {
72 |     If (-not (Test-Path $dir_name)) { mkdir $dir_name | Out-Null }
73 | }
74 |
75 | #import assemblies into session
76 | Add-Type -Assembly "System.IO.Compression.FileSystem"
77 |
78 | #make sure TLS 1.2 is enabled for the downloads; older Windows PowerShell sessions may not offer it by default
79 | [Net.ServicePointManager]::SecurityProtocol = [Net.ServicePointManager]::SecurityProtocol -bor [Net.SecurityProtocolType]::Tls12
80 |
81 | $lib_dir = (Get-Item $lib_dir_name).FullName
82 |
83 | #download and extract tesseract libraries
84 | Save-Package -Url $tesseract_url -ZipPath $tesseract_zip_name
85 | Expand-PackageEntries -ZipPath $tesseract_zip_name -Pattern "(x86|x64)|net451/tesseract\.dll" -Destination $lib_dir -UseMatchAsSubdirectory
86 |
87 | #download and extract tesseract data files
88 | Save-Package -Url $tessdata_url -ZipPath $tessdata_zip_name
89 | Expand-PackageEntries -ZipPath $tessdata_zip_name -Pattern "eng" -Destination "$lib_dir\tessdata"
90 |
91 | #download and extract iTextSharp library
92 | Save-Package -Url $itextsharp_url -ZipPath $itextsharp_zip_name
93 | Expand-PackageEntries -ZipPath $itextsharp_zip_name -Pattern "itextsharp.dll" -Destination $lib_dir
94 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # powershell-paperless
2 | _Going paperless in a world filled with paper._
3 |
4 | > **If you have just arrived here, check out my blog post for a better explanation: [http://blog.jourdant.me/powershell-and-tesseract-going-paperless-with-ocr/](http://blog.jourdant.me/powershell-and-tesseract-going-paperless-with-ocr/)**
5 |
6 | These scripts will allow you to harness the power of Tesseract and PowerShell to sort scanned copies of documents. Or use OCR on any image in general.
7 |
8 |
9 | ## Usage
10 | 1. Clone repo to your machine
11 | 2. Run Initialize-Environment.ps1 - this will download the necessary libraries and extract the files to the correct locations
12 | 3. Run Sort-Files.ps1 with your input and output directories
13 | 4. High five!
14 |
15 |
16 |
17 |
18 |
19 | ###### Jourdan Templeton - 2015
20 |
--------------------------------------------------------------------------------
/Sort-Files.ps1:
--------------------------------------------------------------------------------
1 | #
2 | # Title: Sort-Files.ps1
3 | # Author: Jourdan Templeton
4 | # Email: hello@jourdant.me
5 | # Modified: 10/01/2015 21:52PM NZDT
6 | #
7 |
8 | [CmdletBinding(SupportsShouldProcess=$true)]Param(
9 |     [Parameter(Mandatory=$true)][string]$InputDirectory,
10 |     [Parameter(Mandatory=$true)][string]$OutputDirectory
11 | )
12 |
13 | #check paths and resolve them to full paths
14 | If ((Test-Path $InputDirectory) -eq $False) { Throw "Input directory does not exist." } Else { $InputDirectory = (Get-Item $InputDirectory).FullName }
15 | "Input Directory:" + $InputDirectory
16 | If ((Test-Path $OutputDirectory) -eq $False) { Throw "Output directory does not exist." } Else { $OutputDirectory = (Get-Item $OutputDirectory).FullName }
17 | "Output Directory: " + $OutputDirectory + " `r`n"
18 |
19 | #import libraries
20 | #look for the *lib.psm1 modules next to this script rather than in the caller's current directory
21 | Remove-Module *lib -ErrorAction SilentlyContinue
22 | Get-ChildItem -Path $PSScriptRoot -Filter "*lib.psm1" | % { Import-Module $_.FullName }
23 |
24 | #get files
25 | #@() keeps .Count reliable when zero or one file is returned
26 | $files = @(Get-ChildItem -Path $InputDirectory -Recurse -Filter *.pdf | Where-Object { !$_.PSIsContainer })
27 | "Total files to process: " + $files.Count
28 |
29 | #process files
30 | If ($files.Count -lt 1) { "No files to process. Closing..."; return }
31 | ForEach ($file in $files)
32 | {
33 |     "Processing: " + $file.FullName
34 |     If ($file.Name.Split('.')[-1] -notmatch "pdf")
35 |     {
36 |         #ocr image with tesseract
37 |         #NOTE(review): the file filter above only selects *.pdf, so this branch is not expected to run today
38 |         $image = New-Object System.Drawing.Bitmap($file.FullName)
39 |         Try { $ocr = Get-TessTextFromImage -Image $image }
40 |         Finally { $image.Dispose() } #the bitmap is created here, so it is released here
41 |         "Confidence: " + ($ocr.Confidence * 100).ToString("##") + "%"
42 |
43 |         $text = $ocr.Text
44 |         $ocr = $null
45 |     }
46 |     Else
47 |     {
48 |         #process with itextsharp
49 |         $text = Get-ItsTextFromPdf -Path $file.FullName
50 |     }
51 |
52 |     #work out the destination: <output>/yyyy/MM-MMM/yyyy-MM-dd_<name>, or <output>/Unknown/<name> without a date
53 |     $path = $OutputDirectory
54 |     $date = ''
55 |     $output = ''
56 |
57 |     Try
58 |     {
59 |         #parse date and sort
60 |         $date = Find-Date -InputText $text
61 |         #the slash is quoted because a bare "/" in a .NET date format means the culture's date separator, not a literal slash
62 |         $path += "/" + $date.ToString("yyyy'/'MM-MMM")
63 |         $output = $path + "/" + $date.ToString("yyyy-MM-dd_") + $file.Name.Replace(" ", "")
64 |     }
65 |     Catch
66 |     {
67 |         #start again from the output root so a partially built dated path is never reused
68 |         $path = $OutputDirectory + "/Unknown"
69 |         $output = $path + "/" + $file.Name
70 |     }
71 |
72 |     #create output dir
73 |     If ((Test-Path $path) -eq $false) { mkdir $path | Out-Null }
74 |     "Copying to: '" + $output + "'"
75 |     #-LiteralPath so characters such as [ and ] in the source file name are not treated as wildcards
76 |     Copy-Item -LiteralPath $file.FullName -Destination $output -Force
77 |
78 |     $text = $null
79 |     "`r`n"
80 | }
81 |
--------------------------------------------------------------------------------
/datelib.psm1:
--------------------------------------------------------------------------------
1 | #
2 | # Title: datelib.psm1
3 | # Author: Jourdan Templeton
4 | # Email: hello@jourdant.me
5 | # Modified: 10/01/2015 21:49PM NZDT
6 | #
7 |
8 | ###regex breakdown
9 | $r_day = '(?
46 | <#
47 | .SYNOPSIS
48 | Finds the first plausible date in a block of text.
49 | .DESCRIPTION
50 | Upper-cases the text and runs every regex in $script:patterns over it. For each pattern, matches longer than
51 | five characters are parsed with [DateTime]::Parse and kept only when their year lies between $script:min_year
52 | and $script:max_year; the first survivor of each pattern is collected and the first collected date is returned.
53 | If nothing is found and the text contains spaces, the search is repeated on the text with the spaces removed.
54 | .PARAMETER InputText
55 | The text to search.
56 | .OUTPUTS
57 | System.DateTime. Throws "No date could be found." when no pattern yields a date.
58 | #>
59 | Function Find-Date([Parameter(Mandatory=$true)][string]$InputText)
60 | {
61 |     If ($null -eq $InputText) { throw '$InputText cannot be null' }
62 |
63 |     $text = $InputText.ToUpper()
64 |     Write-Verbose $text
65 |
66 |     #capture all dates within the text
67 |     $dates = @()
68 |     ForEach ($regex in $script:patterns)
69 |     {
70 |         Write-Verbose $regex
71 |         #use a local name here: $matches is the automatic variable written by the -match operator
72 |         $found = ([regex]$regex).Matches($text)
73 |         If ($found.Count -gt 0)
74 |         {
75 |             Write-Verbose (($found | % { $_.Value }) -join ', ')
76 |             #select all parsable dates between set range (clean out as many false positives and mistakes as possible)
77 |             #@() keeps the result an array even when only one date survives
78 |             $parsed = @($found |
79 |                 Where-Object { $_.Value.Length -gt 5 } |
80 |                 % { Try { [DateTime]::Parse($_.Value) } Catch {} } |
81 |                 Where-Object { $_.Year -ge $script:min_year -and $_.Year -le $script:max_year })
82 |             Write-Verbose ("Matches==Null: " + ($parsed.Count -eq 0) + ", Total Matches: " + $parsed.Count)
83 |             If ($parsed.Count -gt 0) { $dates += $parsed[0] }
84 |         }
85 |     }
86 |
87 |     #final logic
88 |     If ($dates.Count -gt 0)
89 |     {
90 |         #optional: custom date selection logic eg:
91 |         #If ($InputText -match "statement|bank|account") { $dates = $dates | Sort -Descending }
92 |
93 |         return $dates[0]
94 |     }
95 |
96 |     #nothing found: text without spaces has already had its last attempt
97 |     If ($InputText.Contains(' ') -eq $false) { throw "No date could be found." }
98 |
99 |     #recurse with text minus spaces
100 |     return Find-Date -InputText $InputText.Replace(' ', '')
101 | }
--------------------------------------------------------------------------------
/itextsharplib.psm1:
--------------------------------------------------------------------------------
1 | #
2 | # Title: itextsharplib.psm1
3 | # Author: Jourdan Templeton
4 | # Email: hello@jourdant.me
5 | # Modified: 10/01/2015 21:49PM NZDT
6 | #
7 |
8 | Add-Type -Path "$PSScriptRoot\Lib\itextsharp.dll"
9 |
10 | <#
11 | .SYNOPSIS
12 | Loads a PDF file and returns the text content of its first page.
13 | .DESCRIPTION
14 | Opens the PDF with iTextSharp and returns the text extracted from page 1 only.
15 | NOTE: this only applies to documents that have text fields embedded. This does not apply to text contained in images of the PDF.
16 | .PARAMETER Path
17 | The path to the PDF file to be processed. Also binds from the FullName property of piped file objects.
18 | .EXAMPLE
19 | Get-ItsTextFromPdf -Path "C:\temp\test.pdf"
20 | .EXAMPLE
21 | $text = Get-ChildItem "C:\Temp" -Filter *.pdf | Get-ItsTextFromPdf
22 | #>
23 | Function Get-ItsTextFromPdf()
24 | {
25 |     Param(
26 |         #ValueFromPipelineByPropertyName is what lets the FullName alias bind when file objects are piped in
27 |         [Parameter(Mandatory=$true, ValueFromPipeline=$true, ValueFromPipelineByPropertyName=$true)][Alias("FullName")][String]$Path
28 |     )
29 |     Process {
30 |         #construct reader object and prepare for reading
31 |         $reader = New-Object iTextSharp.text.pdf.PdfReader($Path)
32 |         Try
33 |         {
34 |             #read pdf (first page only)
35 |             return [iTextSharp.text.pdf.parser.PdfTextExtractor]::GetTextFromPage($reader, 1)
36 |         }
37 |         Finally
38 |         {
39 |             #clean up references, also when extraction fails
40 |             $reader.Dispose()
41 |         }
42 |     }
43 | }
--------------------------------------------------------------------------------
/tesseractlib.psm1:
--------------------------------------------------------------------------------
1 | #
2 | # Title: tesseractlib.psm1
3 | # Author: Jourdan Templeton
4 | # Email: hello@jourdant.me
5 | # Modified: 04/01/2015 08:30PM NZDT
6 | #
7 |
8 | Add-Type -AssemblyName "System.Drawing"
9 | Add-Type -Path "$PSScriptRoot\Lib\Tesseract.dll"
10 | #resolve tessdata relative to this module, like the DLL above, so loading does not depend on the current directory
11 | $tesseract = New-Object Tesseract.TesseractEngine((Get-Item "$PSScriptRoot\Lib\tessdata").FullName, "eng", [Tesseract.EngineMode]::Default, $null)
12 |
13 | <#
14 | .SYNOPSIS
15 | This cmdlet loads either a file path or image and returns the text contained with the confidence.
16 | .DESCRIPTION
17 | This cmdlet loads either a file path or image and returns the text contained with the confidence.
18 | You can pipe in either System.Drawing.Image file or a child-item object.
19 | The result is an object with two properties: Text and Confidence.
20 | .PARAMETER Image
21 | The image file already loaded into memory. The caller keeps ownership; this cmdlet does not dispose it.
22 | .PARAMETER Path
23 | The path to the image to be processed. Also binds from the FullName property of piped file objects.
24 | .EXAMPLE
25 | $image = New-Object System.Drawing.Bitmap("c:\test.jpg")
26 | Get-TessTextFromImage -Image $image
27 | .EXAMPLE
28 | New-Object System.Drawing.Bitmap("C:\test.jpg") | Get-TessTextFromImage
29 | .EXAMPLE
30 | Get-TessTextFromImage -Path "c:\test.jpg"
31 | .EXAMPLE
32 | Get-ChildItem "C:\Temp" -Filter *.jpg | Get-TessTextFromImage
33 | #>
34 | Function Get-TessTextFromImage()
35 | {
36 |     Param(
37 |         [Parameter(Mandatory=$true, ValueFromPipeline=$true, ParameterSetName="ImageObject")][System.Drawing.Image]$Image,
38 |         #ValueFromPipelineByPropertyName is what lets the FullName alias bind when file objects are piped in
39 |         [Parameter(Mandatory=$true, ValueFromPipeline=$true, ValueFromPipelineByPropertyName=$true, ParameterSetName="FilePath")][Alias("FullName")][String]$Path
40 |     )
41 |     Process {
42 |         #load image if path is a param; only an image loaded here is disposed here
43 |         $ownsImage = $PsCmdlet.ParameterSetName -eq "FilePath"
44 |         If ($ownsImage) { $Image = New-Object System.Drawing.Bitmap((Get-Item $Path).FullName) }
45 |
46 |         $pix = $null
47 |         $page = $null
48 |         Try
49 |         {
50 |             #perform OCR on image
51 |             $pix = [Tesseract.PixConverter]::ToPix($Image)
52 |             $page = $script:tesseract.Process($pix)
53 |
54 |             #build return object
55 |             return New-Object PSObject -Property @{"Text"= $page.GetText();
56 |                                                    "Confidence"= $page.GetMeanConfidence()}
57 |         }
58 |         Finally
59 |         {
60 |             #clean up references even when OCR fails: page first, then the pix it was created from
61 |             If ($null -ne $page) { $page.Dispose() }
62 |             If ($null -ne $pix) { $pix.Dispose() }
63 |             If ($ownsImage -and $null -ne $Image) { $Image.Dispose() }
64 |         }
65 |     }
66 | }
--------------------------------------------------------------------------------