├── dict ├── user.dic ├── en_US.dic ├── vi_VN.aff └── en_US.aff ├── tessdata ├── configs │ ├── quiet │ ├── digits │ ├── pdf │ ├── tsv │ ├── unlv │ ├── hocr │ ├── bazaar │ ├── txt │ ├── tess_configvars │ └── tess_configs ├── pdf.ttf ├── eng.traineddata ├── osd.traineddata └── vie.traineddata ├── .tx ├── update-l10n.sh └── config ├── Hunspellx64.dll ├── Hunspellx86.dll ├── Icons └── fatcow │ ├── find.png │ ├── ocr.png │ ├── open.png │ ├── save.png │ ├── scan.png │ ├── tools.png │ ├── cancel.png │ ├── gear_in.png │ ├── zoom_in.png │ ├── ocr_small.png │ ├── rotate_ccw.png │ ├── rotate_cw.png │ ├── zoom_fit.png │ ├── zoom_out.png │ ├── draw_eraser.png │ ├── image_paste.png │ ├── zoom_actual.png │ ├── pilcrow_delete.png │ ├── inline_spellcheck.png │ ├── document_page_next.png │ └── document_page_previous.png ├── Library ├── Tesseract.dll ├── Tesseract.pdb ├── VietKeyInput.dll ├── MultiComboBox.dll ├── MultiComboBox.pdb └── ICSharpCode.SharpZipLib.dll ├── Resources └── VietOCR.ico ├── samples ├── vietsample.tif ├── vietsample1.pdf ├── vietsample1.tif ├── vietsample2.png └── vietsample2.tif ├── x64 ├── libleptonica1860.dll └── libtesseract551.dll ├── x86 ├── libleptonica1860.dll └── libtesseract551.dll ├── packages.config ├── Properties ├── Settings.settings ├── Settings.Designer.cs └── AssemblyInfo.cs ├── .project ├── VietOCR.csproj.user ├── Data ├── san.DangAmbigs.txt ├── slk.DangAmbigs.txt ├── ces.DangAmbigs.txt ├── eng.DangAmbigs.txt ├── vie.DangAmbigs.txt └── ISO639-1.xml ├── App.config ├── .gitignore ├── App.xaml ├── Test ├── ConsoleAppTests.cs ├── Properties │ └── AssemblyInfo.cs ├── Utilities │ └── PdfUtilitiesTests.cs ├── obj │ └── Release │ │ └── DesignTimeResolveAssemblyReferencesInput.cache ├── app.config ├── packages.config └── .vs │ └── VietOCRTests.csproj.dtbcache.json ├── ProcessingOptions.cs ├── Utilities ├── VietUtilities.cs ├── FixedSizeStack.cs ├── ImageConverter.cs ├── Utilities.cs ├── BreakIterator.cs ├── Watcher.cs ├── FileExtractor.cs └── 
FormLocalizer.cs ├── StatusForm.xaml ├── HtmlHelpDialog.xaml ├── Postprocessing ├── IPostProcessor.cs ├── EngPP.cs ├── ProcessorFactory.cs ├── TextUtilities.cs ├── Processor.cs └── ViePP.cs ├── Controls ├── MoveThumb.cs ├── ResizeThumb.cs └── RedUnderlineAdorner.cs ├── StatusForm.xaml.cs ├── SplitPdfArgs.cs ├── SliderDialog.xaml ├── ChangeCaseDialog.xaml.cs ├── README.md ├── HtmlHelpDialog.xaml.cs ├── ChangeCaseDialog.xaml ├── DownloadDialog.xaml ├── WIA ├── WiaOperationException.cs └── WiaScannerAdapter.cs ├── VietOCR.sln ├── App.xaml.cs ├── SliderDialog.xaml.cs ├── DataSource.cs ├── ImageInfoDialog.xaml.cs ├── GuiWithInputMethod.cs ├── OCRHelper.cs ├── GuiWithUILanguage.cs ├── GuiWithOEM.cs ├── ConsoleApp.cs ├── BulkDialog.xaml ├── SplitPdfDialog.xaml ├── ImageInfoDialog.xaml ├── GuiWithRegistry.cs ├── GuiWithBatch.cs ├── GuiWithScan.cs ├── SplitPdfDialog.xaml.cs ├── FindReplaceDialog.xaml.cs ├── GuiWithPSM.cs ├── DownloadDialog.vi.resx ├── GuiWithThumbnail.cs ├── FindReplaceDialog.xaml └── readme_cs_ja.html /dict/user.dic: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tessdata/configs/quiet: -------------------------------------------------------------------------------- 1 | debug_file /dev/null 2 | -------------------------------------------------------------------------------- /tessdata/configs/digits: -------------------------------------------------------------------------------- 1 | tessedit_char_whitelist 0123456789-. 
2 | -------------------------------------------------------------------------------- /.tx/update-l10n.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | tx pull -a -f --minimum-perc 75 4 | -------------------------------------------------------------------------------- /tessdata/configs/pdf: -------------------------------------------------------------------------------- 1 | tessedit_create_pdf 1 2 | tessedit_pageseg_mode 1 3 | -------------------------------------------------------------------------------- /tessdata/configs/tsv: -------------------------------------------------------------------------------- 1 | tessedit_create_tsv 1 2 | tessedit_pageseg_mode 1 3 | -------------------------------------------------------------------------------- /tessdata/configs/unlv: -------------------------------------------------------------------------------- 1 | tessedit_write_unlv 1 2 | tessedit_pageseg_mode 6 3 | -------------------------------------------------------------------------------- /dict/en_US.dic: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nguyenq/VietOCRwpf/HEAD/dict/en_US.dic -------------------------------------------------------------------------------- /Hunspellx64.dll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nguyenq/VietOCRwpf/HEAD/Hunspellx64.dll -------------------------------------------------------------------------------- /Hunspellx86.dll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nguyenq/VietOCRwpf/HEAD/Hunspellx86.dll -------------------------------------------------------------------------------- /tessdata/pdf.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nguyenq/VietOCRwpf/HEAD/tessdata/pdf.ttf 
-------------------------------------------------------------------------------- /Icons/fatcow/find.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nguyenq/VietOCRwpf/HEAD/Icons/fatcow/find.png -------------------------------------------------------------------------------- /Icons/fatcow/ocr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nguyenq/VietOCRwpf/HEAD/Icons/fatcow/ocr.png -------------------------------------------------------------------------------- /Icons/fatcow/open.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nguyenq/VietOCRwpf/HEAD/Icons/fatcow/open.png -------------------------------------------------------------------------------- /Icons/fatcow/save.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nguyenq/VietOCRwpf/HEAD/Icons/fatcow/save.png -------------------------------------------------------------------------------- /Icons/fatcow/scan.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nguyenq/VietOCRwpf/HEAD/Icons/fatcow/scan.png -------------------------------------------------------------------------------- /Icons/fatcow/tools.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nguyenq/VietOCRwpf/HEAD/Icons/fatcow/tools.png -------------------------------------------------------------------------------- /Library/Tesseract.dll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nguyenq/VietOCRwpf/HEAD/Library/Tesseract.dll -------------------------------------------------------------------------------- /Library/Tesseract.pdb: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/nguyenq/VietOCRwpf/HEAD/Library/Tesseract.pdb -------------------------------------------------------------------------------- /Resources/VietOCR.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nguyenq/VietOCRwpf/HEAD/Resources/VietOCR.ico -------------------------------------------------------------------------------- /samples/vietsample.tif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nguyenq/VietOCRwpf/HEAD/samples/vietsample.tif -------------------------------------------------------------------------------- /tessdata/configs/hocr: -------------------------------------------------------------------------------- 1 | tessedit_create_hocr 1 2 | tessedit_pageseg_mode 1 3 | hocr_font_info 0 4 | -------------------------------------------------------------------------------- /Icons/fatcow/cancel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nguyenq/VietOCRwpf/HEAD/Icons/fatcow/cancel.png -------------------------------------------------------------------------------- /Icons/fatcow/gear_in.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nguyenq/VietOCRwpf/HEAD/Icons/fatcow/gear_in.png -------------------------------------------------------------------------------- /Icons/fatcow/zoom_in.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nguyenq/VietOCRwpf/HEAD/Icons/fatcow/zoom_in.png -------------------------------------------------------------------------------- /Library/VietKeyInput.dll: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/nguyenq/VietOCRwpf/HEAD/Library/VietKeyInput.dll -------------------------------------------------------------------------------- /samples/vietsample1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nguyenq/VietOCRwpf/HEAD/samples/vietsample1.pdf -------------------------------------------------------------------------------- /samples/vietsample1.tif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nguyenq/VietOCRwpf/HEAD/samples/vietsample1.tif -------------------------------------------------------------------------------- /samples/vietsample2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nguyenq/VietOCRwpf/HEAD/samples/vietsample2.png -------------------------------------------------------------------------------- /samples/vietsample2.tif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nguyenq/VietOCRwpf/HEAD/samples/vietsample2.tif -------------------------------------------------------------------------------- /tessdata/eng.traineddata: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nguyenq/VietOCRwpf/HEAD/tessdata/eng.traineddata -------------------------------------------------------------------------------- /tessdata/osd.traineddata: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nguyenq/VietOCRwpf/HEAD/tessdata/osd.traineddata -------------------------------------------------------------------------------- /tessdata/vie.traineddata: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nguyenq/VietOCRwpf/HEAD/tessdata/vie.traineddata 
-------------------------------------------------------------------------------- /x64/libleptonica1860.dll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nguyenq/VietOCRwpf/HEAD/x64/libleptonica1860.dll -------------------------------------------------------------------------------- /x64/libtesseract551.dll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nguyenq/VietOCRwpf/HEAD/x64/libtesseract551.dll -------------------------------------------------------------------------------- /x86/libleptonica1860.dll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nguyenq/VietOCRwpf/HEAD/x86/libleptonica1860.dll -------------------------------------------------------------------------------- /x86/libtesseract551.dll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nguyenq/VietOCRwpf/HEAD/x86/libtesseract551.dll -------------------------------------------------------------------------------- /Icons/fatcow/ocr_small.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nguyenq/VietOCRwpf/HEAD/Icons/fatcow/ocr_small.png -------------------------------------------------------------------------------- /Icons/fatcow/rotate_ccw.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nguyenq/VietOCRwpf/HEAD/Icons/fatcow/rotate_ccw.png -------------------------------------------------------------------------------- /Icons/fatcow/rotate_cw.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nguyenq/VietOCRwpf/HEAD/Icons/fatcow/rotate_cw.png -------------------------------------------------------------------------------- 
/Icons/fatcow/zoom_fit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nguyenq/VietOCRwpf/HEAD/Icons/fatcow/zoom_fit.png -------------------------------------------------------------------------------- /Icons/fatcow/zoom_out.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nguyenq/VietOCRwpf/HEAD/Icons/fatcow/zoom_out.png -------------------------------------------------------------------------------- /Library/MultiComboBox.dll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nguyenq/VietOCRwpf/HEAD/Library/MultiComboBox.dll -------------------------------------------------------------------------------- /Library/MultiComboBox.pdb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nguyenq/VietOCRwpf/HEAD/Library/MultiComboBox.pdb -------------------------------------------------------------------------------- /Icons/fatcow/draw_eraser.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nguyenq/VietOCRwpf/HEAD/Icons/fatcow/draw_eraser.png -------------------------------------------------------------------------------- /Icons/fatcow/image_paste.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nguyenq/VietOCRwpf/HEAD/Icons/fatcow/image_paste.png -------------------------------------------------------------------------------- /Icons/fatcow/zoom_actual.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nguyenq/VietOCRwpf/HEAD/Icons/fatcow/zoom_actual.png -------------------------------------------------------------------------------- /Icons/fatcow/pilcrow_delete.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/nguyenq/VietOCRwpf/HEAD/Icons/fatcow/pilcrow_delete.png -------------------------------------------------------------------------------- /Icons/fatcow/inline_spellcheck.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nguyenq/VietOCRwpf/HEAD/Icons/fatcow/inline_spellcheck.png -------------------------------------------------------------------------------- /Icons/fatcow/document_page_next.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nguyenq/VietOCRwpf/HEAD/Icons/fatcow/document_page_next.png -------------------------------------------------------------------------------- /Library/ICSharpCode.SharpZipLib.dll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nguyenq/VietOCRwpf/HEAD/Library/ICSharpCode.SharpZipLib.dll -------------------------------------------------------------------------------- /Icons/fatcow/document_page_previous.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nguyenq/VietOCRwpf/HEAD/Icons/fatcow/document_page_previous.png -------------------------------------------------------------------------------- /tessdata/configs/bazaar: -------------------------------------------------------------------------------- 1 | load_system_dawg F 2 | load_freq_dawg F 3 | user_words_suffix user-words 4 | user_patterns_suffix user-patterns 5 | -------------------------------------------------------------------------------- /tessdata/configs/txt: -------------------------------------------------------------------------------- 1 | # This config file should be used with other cofig files which creates renderers. 
2 | # usage example: tesseract eurotext.tif eurotext txt hocr pdf 3 | tessedit_create_txt 1 4 | -------------------------------------------------------------------------------- /packages.config: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | -------------------------------------------------------------------------------- /Properties/Settings.settings: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | -------------------------------------------------------------------------------- /.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | VietOCRwpf 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | -------------------------------------------------------------------------------- /VietOCR.csproj.user: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | ProjectFiles 5 | 6 | -------------------------------------------------------------------------------- /Data/san.DangAmbigs.txt: -------------------------------------------------------------------------------- 1 | # Plain replaces. Character "0" in the first column indicates a plain replace. Delimiter is (\t) character. 2 | 0 अा आ 3 | 0 श्ा श 4 | 0 ण्ा ण 5 | 0 हाा हा 6 | 7 | # Regular expression replaces. Character "1" in the first column indicates a regex replace. Delimiter is (\t) character. -------------------------------------------------------------------------------- /Data/slk.DangAmbigs.txt: -------------------------------------------------------------------------------- 1 | # Plain replaces. Character "0" in the first column indicates a plain replace. Delimiter is (\t) character. 2 | 0 iii m 3 | 0 l< k 4 | 0 lx h 5 | 0 xn m 6 | 0 - — 7 | 0 l' ľ 8 | 0 d' ď 9 | 0 t' ť 10 | 11 | # Regular expression replaces. Character "1" in the first column indicates a regex replace. Delimiter is (\t) character. 
-------------------------------------------------------------------------------- /App.config: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # This .gitignore file was automatically created by Microsoft(R) Visual Studio. 3 | ################################################################################ 4 | 5 | /bin 6 | /obj 7 | *.suo 8 | /Setup/Debug 9 | /Setup/Release 10 | /.vs 11 | /packages 12 | /TestResults 13 | /Test/bin 14 | /Test/obj 15 | -------------------------------------------------------------------------------- /App.xaml: -------------------------------------------------------------------------------- 1 | 6 | 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /tessdata/configs/tess_configvars: -------------------------------------------------------------------------------- 1 | # This text file contains non-init control parameters, specifically non-init variables, to modify Tesseract engine's behaviour. 2 | # https://code.google.com/p/tesseract-ocr/wiki/ControlParams 3 | # http://www.sk-spell.sk.cx/tesseract-ocr-parameters-in-302-version 4 | # Each key/value pair, delimited by spaces, takes one line. Comment lines start with # sign. 5 | # Ex: 6 | #tessedit_char_whitelist 0123456789-. 
-------------------------------------------------------------------------------- /Test/ConsoleAppTests.cs: -------------------------------------------------------------------------------- 1 | using Microsoft.VisualStudio.TestTools.UnitTesting; 2 | using System.IO; 3 | using System.Security.AccessControl; 4 | using VietOCR; 5 | 6 | namespace UnitTest 7 | { 8 | [TestClass] 9 | public sealed class ConsoleAppTests 10 | { 11 | [TestMethod] 12 | public void MainTest() 13 | { 14 | string[] args = { "samples/vietsample.tif", "out", "-l", "vie", "pdf_textonly" }; 15 | ConsoleApp.Main(args); 16 | Assert.IsTrue(File.Exists("out.pdf")); 17 | } 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /Test/Properties/AssemblyInfo.cs: -------------------------------------------------------------------------------- 1 | using System.Reflection; 2 | using System.Runtime.CompilerServices; 3 | using System.Runtime.InteropServices; 4 | 5 | [assembly: AssemblyTitle("UnitTest")] 6 | [assembly: AssemblyDescription("")] 7 | [assembly: AssemblyConfiguration("")] 8 | [assembly: AssemblyCompany("")] 9 | [assembly: AssemblyProduct("UnitTest")] 10 | [assembly: AssemblyCopyright("Copyright © 2025")] 11 | [assembly: AssemblyTrademark("")] 12 | [assembly: AssemblyCulture("")] 13 | 14 | [assembly: ComVisible(false)] 15 | 16 | [assembly: Guid("61e5e263-898a-4bb0-b099-b7194bc58529")] 17 | 18 | // [assembly: AssemblyVersion("1.0.*")] 19 | [assembly: AssemblyVersion("1.0.0.0")] 20 | [assembly: AssemblyFileVersion("1.0.0.0")] 21 | -------------------------------------------------------------------------------- /ProcessingOptions.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Linq; 4 | using System.Text; 5 | using System.Threading.Tasks; 6 | 7 | namespace VietOCR 8 | { 9 | public class ProcessingOptions 10 | { 11 | public bool Deskew { get; set; } 12 | 
public bool PostProcessing { get; set; } 13 | public bool RemoveLines { get; set; } 14 | public bool RemoveLineBreaks { get; set; } 15 | public bool CorrectLetterCases { get; set; } 16 | public bool RemoveHyphens { get; set; } 17 | public bool ReplaceHyphens { get; set; } 18 | public bool DangAmbigsEnabled { get; set; } 19 | public string DangAmbigsPath { get; set; } 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /Utilities/VietUtilities.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | 3 | using System.Text; 4 | using System.Text.RegularExpressions; 5 | 6 | namespace Net.SourceForge.Vietpad.Utilities 7 | { 8 | class VietUtilities 9 | { 10 | //private static readonly ILog logger = LogFactory.CreateLogger(System.Reflection.MethodBase.GetCurrentMethod().DeclaringType); 11 | 12 | /** 13 | * Strips accents off words. 14 | */ 15 | public static string StripDiacritics(string accented) 16 | { 17 | Regex regex = new Regex("\\p{IsCombiningDiacriticalMarks}+"); 18 | 19 | string strFormD = accented.Normalize(NormalizationForm.FormD); 20 | return regex.Replace(strFormD, string.Empty).Replace('\u0111', 'd').Replace('\u0110', 'D'); 21 | } 22 | } 23 | } -------------------------------------------------------------------------------- /StatusForm.xaml: -------------------------------------------------------------------------------- 1 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /Data/ces.DangAmbigs.txt: -------------------------------------------------------------------------------- 1 | # Plain replaces. Character "0" in the first column indicates a plain replace. Delimiter is (\t) character. 2 | 0 iii m 3 | 0 l< k 4 | 0 lx h 5 | 0 xn m 6 | 0 () o 7 | 0 a' á 8 | 0 e' é 9 | 0 y' ý 10 | 0 d' ď 11 | 0 t' ť 12 | 0 7“ ?“ 13 | 0 7" ?" 14 | 0 f“ .“ 15 | 0 f" ." 16 | 0 & a 17 | 18 | # Regular expression replaces. 
Character "1" in the first column indicates a regex replace. Delimiter is (\t) character. 19 | # Correct common errors caused by OCR 20 | # 1 to l 21 | 1 (?<=\p{L}*)1(?=\p{L}+) l 22 | 1 (?<=\p{L}+)1(?=\p{L}*) l 23 | # ] to l 24 | 1 (?<=\p{L}*)\](?=\p{L}+) l 25 | 1 (?<=\p{L}+)\](?=\p{L}*) l 26 | # | to l 27 | 1 (?<=\p{L}*)\|(?=\p{L}+) l 28 | 1 (?<=\p{L}+)\|(?=\p{L}*) l 29 | # 0 to o 30 | 1 (?<=\p{L}*)0(?=\p{L}+) o 31 | 1 (?<=\p{L}+)0(?=\p{L}*) o 32 | -------------------------------------------------------------------------------- /Data/eng.DangAmbigs.txt: -------------------------------------------------------------------------------- 1 | # Plain replaces. Character "0" in the first column indicates a plain replace. Delimiter is (\t) character. 2 | 0 iii m 3 | 0 l< k 4 | 0 lx h 5 | 0 xn m 6 | 7 | # Regular expression replaces. Character "1" in the first column indicates a regex replace. Delimiter is (\t) character. 8 | # Correct common errors caused by OCR 9 | # 11 to n 10 | 1 \b11(?=\p{L}+\b) n 11 | # 1 to l 12 | 1 \b1(?=\p{L}+\b) l 13 | 1 (?<=\p{L}*)1(?=\p{L}+) l 14 | 1 (?<=\p{L}+)1(?=\p{L}*) l 15 | # ] to l 16 | 1 (?<=\p{L}*)\](?=\p{L}+) l 17 | 1 (?<=\p{L}+)\](?=\p{L}*) l 18 | # | to l 19 | 1 (?<=\p{L}*)\|(?=\p{L}+) l 20 | 1 (?<=\p{L}+)\|(?=\p{L}*) l 21 | # I to l 22 | 1 \bI(?![mn]+\b) l 23 | # 0 to o 24 | 1 (?<=\b\p{L}*)0(?=\p{L}*\b) o 25 | 1 (?<=\p{L}*)0(?=\p{L}+) o 26 | 1 (?<=\p{L}+)0(?=\p{L}*) o 27 | -------------------------------------------------------------------------------- /tessdata/configs/tess_configs: -------------------------------------------------------------------------------- 1 | # This text file contains init-only control parameters to modify Tesseract engine's behaviour. 2 | # https://code.google.com/p/tesseract-ocr/wiki/ControlParams 3 | # http://www.sk-spell.sk.cx/tesseract-ocr-parameters-in-302-version 4 | # Each key/value pair, delimited by spaces, takes one line. Comment lines start with # sign. 
5 | # Ex: 6 | #debug_file /dev/null 7 | #debug_file tesseract.log 8 | # May be beneficial to disable system dictionaries for Vietnamese to improve accuracies 9 | #load_system_dawg F 10 | #load_freq_dawg F 11 | #user_words_suffix user-words 12 | #user_patterns_suffix user-patterns 13 | #language_model_penalty_non_dict_word 0.15 14 | #language_model_penalty_non_freq_dict_word 0.1 15 | #tessedit_char_whitelist 0123456789-. 16 | #tessedit_create_hocr 1 17 | #tessedit_pageseg_mode 1 -------------------------------------------------------------------------------- /HtmlHelpDialog.xaml: -------------------------------------------------------------------------------- 1 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /Postprocessing/IPostProcessor.cs: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright @ 2008 Quan Nguyen 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | namespace VietOCR.NET.Postprocessing 18 | { 19 | public interface IPostProcessor 20 | { 21 | string PostProcess(string text); 22 | } 23 | } -------------------------------------------------------------------------------- /Data/vie.DangAmbigs.txt: -------------------------------------------------------------------------------- 1 | # Plain replaces. Character "0" in the first column indicates a plain replace. 
Delimiter is (\t) character. 2 | 0 tmg úng 3 | 0 rl n 4 | 0 rr n 5 | 0 êĩ- ết 6 | 0 âỳ ấy 7 | 0 oĩ ơi 8 | 0 ôỈ ỡi 9 | 0 u1I ưn 10 | 0 q1I qu 11 | 0 II u 12 | 0 êf ết 13 | 0 fâ rầ 14 | 0 fê iề 15 | 0 xiê về 16 | 0 ýê yề 17 | 0 lc k 18 | 0 vđi với 19 | 0 cũa của 20 | 0 phãi phải 21 | 0 —- — 22 | 0 -— — 23 | 0 –– — 24 | 0 Ð Đ 25 | 0 âÍ ấ 26 | 27 | # Regular expression replaces. Character "1" in the first column indicates a regex replace. Delimiter is (\t) character. 28 | 1 (?i)(?<=đ)ă\b ã 29 | 1 (?i)(?<=[ch])ă\b ả 30 | 1 (?i)ă(?![cmnpt]) à 31 | 1 (?i)ẵ(?=[cpt]) ắ 32 | 1 (?<=\b[Tt])m rư 33 | 1 (?i)\bl(?=[rh]) t 34 | 1 (u|ll|r)(?=[gh]) n 35 | 1 (iii|ln|rn) m 36 | 1 (?i)(?<=[qrgsv])ll u 37 | 1 (?i)(?<=[cnpt])ll h 38 | 1 (?i)[oe](?=h) c 39 | 1 \Bđ ớ -------------------------------------------------------------------------------- /Controls/MoveThumb.cs: -------------------------------------------------------------------------------- 1 | using System.Windows.Controls; 2 | using System.Windows.Controls.Primitives; 3 | 4 | namespace VietOCR.Controls 5 | { 6 | public class MoveThumb : Thumb 7 | { 8 | public MoveThumb() 9 | { 10 | DragDelta += new DragDeltaEventHandler(this.MoveThumb_DragDelta); 11 | } 12 | 13 | private void MoveThumb_DragDelta(object sender, DragDeltaEventArgs e) 14 | { 15 | Control designerItem = this.DataContext as Control; 16 | 17 | if (designerItem != null) 18 | { 19 | double left = Canvas.GetLeft(designerItem); 20 | double top = Canvas.GetTop(designerItem); 21 | 22 | Canvas.SetLeft(designerItem, left + e.HorizontalChange); 23 | Canvas.SetTop(designerItem, top + e.VerticalChange); 24 | } 25 | } 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /Test/Utilities/PdfUtilitiesTests.cs: -------------------------------------------------------------------------------- 1 | using Microsoft.VisualStudio.TestTools.UnitTesting; 2 | using VietOCR.NET.Utilities; 3 | using System; 4 | using System.Collections.Generic; 5 | 
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace UnitTest
{
    /// <summary>
    /// MSTest cases for <c>PdfUtilities</c>.
    /// </summary>
    [TestClass()]
    public class PdfUtilitiesTests
    {
        // Placeholder: marked [Ignore] and has no body.
        [TestMethod()]
        [Ignore]
        public void ConvertPdf2TiffTest()
        {
        }

        /// <summary>
        /// The deployed sample PDF is expected to report two pages.
        /// </summary>
        [TestMethod()]
        [DeploymentItem("samples/vietsample1.pdf", "samples")]
        public void GetPdfPageCountTest()
        {
            const int expectedPageCount = 2;
            string samplePdf = "samples/vietsample1.pdf";

            int actualPageCount = PdfUtilities.GetPdfPageCount(samplePdf);

            Assert.AreEqual(expectedPageCount, actualPageCount);
        }
    }
}
--------------------------------------------------------------------------------
/Test/obj/Release/DesignTimeResolveAssemblyReferencesInput.cache:
--------------------------------------------------------------------------------
 1 | .winmd.dll.exe4D:\Projects\github\VietOCRwpf\samples\vietsample.tif-D:\Projects\github\VietOCRwpf\packages.configSC:\Program Files (x86)\Reference Assemblies\Microsoft\Framework\.NETFramework\v4.8\Full{CandidateAssemblyFiles}PC:\Program Files\Common Files\microsoft shared\VSTT\17.0\UITestExtensionPackages{HintPathFromItem}{TargetFrameworkDirectory}B{Registry:Software\Microsoft\.NETFramework,v4.8,AssemblyFoldersEx} {RawFileName}/D:\Projects\github\VietOCRwpf\Test\bin\Release\B{Registry:Software\Microsoft\.NETFramework,v4.8,AssemblyFoldersEx}XD:\Projects\github\VietOCRwpf\Test\obj\Release\DesignTimeResolveAssemblyReferences.cacheSC:\Program Files (x86)\Reference Assemblies\Microsoft\Framework\.NETFramework\v4.8\[C:\Program Files (x86)\Reference Assemblies\Microsoft\Framework\.NETFramework\v4.8\Facades\.NETFramework,Version=v4.8.NET Framework 4.8v4.8msil 2 | v4.0.30319
--------------------------------------------------------------------------------
/Postprocessing/EngPP.cs:
--------------------------------------------------------------------------------
/**
 * Copyright @ 2008 Quan Nguyen
 *
 * Licensed under the Apache License, Version 2.0
(the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
using System;
using System.Collections.Generic;
using System.Text;

namespace VietOCR.NET.Postprocessing
{
    /// <summary>
    /// Post-processor for English text. It applies no corrections.
    /// </summary>
    class EngPP : IPostProcessor
    {
        /// <summary>
        /// Returns <paramref name="text"/> unchanged; English needs no special processing.
        /// </summary>
        public string PostProcess(string text) => text;
    }
}
--------------------------------------------------------------------------------
/StatusForm.xaml.cs:
--------------------------------------------------------------------------------
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Windows;
using System.Windows.Controls;
using System.Windows.Data;
using System.Windows.Documents;
using System.Windows.Input;
using System.Windows.Media;
using System.Windows.Media.Imaging;
using System.Windows.Shapes;

namespace VietOCR
{
    /// <summary>
    /// Interaction logic for StatusForm.xaml
    /// </summary>
    public partial class StatusForm : Window
    {
        public StatusForm()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Gives callers access to the window's text box.
        /// </summary>
        public TextBox TextBox => textBox;

        /// <summary>
        /// Runs the base closing logic, then cancels the close and hides the
        /// window instead of letting it be closed.
        /// </summary>
        protected override void OnClosing(CancelEventArgs e)
        {
            base.OnClosing(e);

            // Veto the close request; the window is only made invisible.
            e.Cancel = true;
            Visibility = Visibility.Hidden;
        }
    }
}
--------------------------------------------------------------------------------
/Properties/Settings.Designer.cs:
--------------------------------------------------------------------------------
//------------------------------------------------------------------------------
// <auto-generated>
//     This code was generated by a tool.
//     Runtime Version:4.0.30319.42000
//
//     Changes to this file may cause incorrect behavior and will be lost if
//     the code is regenerated.
// </auto-generated>
//------------------------------------------------------------------------------

namespace VietOCR.Properties {


    [global::System.Runtime.CompilerServices.CompilerGeneratedAttribute()]
    [global::System.CodeDom.Compiler.GeneratedCodeAttribute("Microsoft.VisualStudio.Editors.SettingsDesigner.SettingsSingleFileGenerator", "16.3.0.0")]
    internal sealed partial class Settings : global::System.Configuration.ApplicationSettingsBase {

        private static Settings defaultInstance = ((Settings)(global::System.Configuration.ApplicationSettingsBase.Synchronized(new Settings())));

        public static Settings Default {
            get {
                return defaultInstance;
            }
        }
    }
}
--------------------------------------------------------------------------------
/Test/app.config:
--------------------------------------------------------------------------------
 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 |
--------------------------------------------------------------------------------
/SplitPdfArgs.cs:
--------------------------------------------------------------------------------
using System;
using System.Collections.Generic;
using System.Text;

namespace VietOCR
{
    /// <summary>
    /// Plain read/write data holder: input and output file names plus page
    /// selection values, all kept as the raw values assigned by the caller.
    /// </summary>
    class SplitPdfArgs
    {
        /// <summary>Input file name.</summary>
        public string InputFilename { get; set; }

        /// <summary>Output file name.</summary>
        public string OutputFilename { get; set; }

        /// <summary>First page of the range, stored as text.</summary>
        public string FromPage { get; set; }

        /// <summary>Last page of the range, stored as text.</summary>
        public string ToPage { get; set; }

        /// <summary>Number of pages, stored as text.</summary>
        public string NumOfPages { get; set; }

        // NOTE(review): presumably selects page-range mode over number-of-pages mode; confirm against the caller.
        public bool Pages { get; set; }
    }
}
--------------------------------------------------------------------------------
/Utilities/FixedSizeStack.cs:
--------------------------------------------------------------------------------
/**
 * Copyright @ 2013 Quan Nguyen
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
15 | */ 16 | using System; 17 | using System.Collections.Generic; 18 | using System.Text; 19 | 20 | namespace VietOCR.NET.Utilities 21 | { 22 | public class FixedSizeStack : LinkedList 23 | { 24 | private int limit; 25 | 26 | public FixedSizeStack(int limit) 27 | : base() 28 | { 29 | this.limit = limit; 30 | } 31 | 32 | public T Pop() 33 | { 34 | T obj = base.First.Value; 35 | base.RemoveFirst(); 36 | return obj; 37 | } 38 | 39 | public void Push(T obj) 40 | { 41 | base.AddFirst(obj); 42 | if (this.Count > limit) 43 | { 44 | base.RemoveLast(); 45 | } 46 | } 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /SliderDialog.xaml: -------------------------------------------------------------------------------- 1 | 10 | 11 | 12 | 55 | -------------------------------------------------------------------------------- /SplitPdfDialog.xaml: -------------------------------------------------------------------------------- 1 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 |