├── .gitignore ├── LICENSE ├── README.md ├── langtests ├── README.md ├── counttestset.sh ├── deva_setup.sh ├── deva_test.sh ├── frk_setup.sh ├── frk_test.sh ├── reports │ ├── Devanagari-2019-03-09.summary │ ├── Devanagari.summary │ ├── Fraktur-2019-03-09.summary │ ├── Fraktur.summary │ ├── frk-2019-03-09.summary │ ├── frk.summary │ ├── san-2019-03-09.summary │ └── san.summary ├── runlangtests.sh └── runtestset.sh ├── testdata ├── README.md ├── ara.unicharset ├── ara.wordlist ├── chi_sim.unicharset ├── chi_tra.unicharset ├── deu.Arial_Unicode_MS.exp0.lstmf ├── deu │ ├── deu.traineddata │ └── deu.unicharset ├── eng.Arial.exp0.lstmf ├── eng.Arial_Unicode_MS.exp0.lstmf ├── eng.params_model ├── eng.unicharset ├── eng │ ├── eng.traineddata │ └── eng.unicharset ├── fra.Arial_Unicode_MS.exp0.lstmf ├── fra │ ├── fra.traineddata │ └── fra.unicharset ├── jpn.unicharset ├── kan.Arial_Unicode_MS.exp0.lstmf ├── kan.unicharset ├── kan │ ├── kan.traineddata │ └── kan.unicharset ├── kor.Arial_Unicode_MS.exp0.lstmf ├── kor.unicharset ├── kor │ ├── kor.traineddata │ └── kor.unicharset ├── mar.unicharset ├── por.unicharset ├── scanftest.txt └── trivial.unicharset ├── testing ├── 12.tif ├── 136.tif ├── 256.tif ├── 324.tif ├── 410.tif ├── 432.tif ├── 433.tif ├── 540.tif ├── 692.tif ├── 779.tif ├── 793.tif ├── 8071_093.3B.tif ├── 8071_093.3B.txt ├── 8071_093.3B.uzn ├── 808.tif ├── 8087_054.3B.tif ├── 8087_054.3B.txt ├── 8087_054.3B.uzn ├── 8087_054.3G.tif ├── 815.tif ├── HelloGoogle.tif ├── README.md ├── arabic.tif ├── basicapitest.cpp ├── deslant.tif ├── devatest-rotated-270.png ├── devatest.png ├── eng.Arial.exp0.tr ├── eng.unicharset ├── eng.wordlist.clean.freq ├── eng.xheights ├── eng_beam.unicharset ├── eurotext.tif ├── eurotext.txt ├── hebrew-nikud-genesis-1-2.png ├── hebrew.png ├── hebtypo.jpg ├── hin_beam.unicharset ├── phototest-rotated-180.png ├── phototest-rotated-L.png ├── phototest-rotated-R.png ├── phototest.gold.txt ├── phototest.tif ├── phototest.txt ├── phototest_2.tif 
├── phototestrot.tif ├── raaj.tif ├── segmodeimg.odt ├── segmodeimg.tif ├── trainingital.box ├── trainingital.tif ├── trainingitalline.box ├── trainingitalline.tif ├── trainingtimes.box ├── trainingtimes.tif ├── trainingtimesline.box ├── trainingtimesline.tif └── viet.tif └── unlvtests ├── README.md ├── counttestset.sh ├── reorgdata.sh ├── reports ├── 1995.bus.3B.sum ├── 1995.doe3.3B.sum ├── 1995.mag.3B.sum ├── 1995.news.3B.sum ├── 2.03.summary ├── 2.04.summary ├── 4_best_int_spa.summary ├── 4_best_spa.summary ├── 4_fast_eng.summary └── 4_fast_spa.summary ├── runalltests.sh ├── runalltests_spa.sh └── runtestset.sh /.gitignore: -------------------------------------------------------------------------------- 1 | # Prerequisites 2 | *.d 3 | 4 | # Compiled Object files 5 | *.slo 6 | *.lo 7 | *.o 8 | *.obj 9 | 10 | # Precompiled Headers 11 | *.gch 12 | *.pch 13 | 14 | # Compiled Dynamic libraries 15 | *.so 16 | *.dylib 17 | *.dll 18 | 19 | # Fortran module files 20 | *.mod 21 | *.smod 22 | 23 | # Compiled Static libraries 24 | *.lai 25 | *.la 26 | *.a 27 | *.lib 28 | 29 | # Executables 30 | *.exe 31 | *.out 32 | *.app 33 | 34 | langtests/results/* 35 | 36 | # Fonts 37 | /testing/*.ttf 38 | /testing/.uuid 39 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 
14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # tesseract-ocr/test 2 | Repository for binaries (images, tessdata) required for testing Tesseract. 3 | 4 | This repository should be included as a submodule in tesseract-ocr/tesseract. 5 | -------------------------------------------------------------------------------- /langtests/README.md: -------------------------------------------------------------------------------- 1 | # Language tests. 2 | The scripts in this directory make it possible to test Accuracy of Tesseract for different languages. 3 | ## Setup 4 | ### Step 1: If not already installed, download the modified ISRI toolkit, 5 | make and install the tools in /usr/local/bin. 6 | ``` 7 | git clone https://github.com/Shreeshrii/ocr-evaluation-tools.git 8 | cd ~/ocr-evaluation-tools 9 | sudo make install 10 | ``` 11 | ### Step 2: If not already built, Build tesseract. 12 | Use binaries from the tesseract/src/api and tesseract/src/training directory. 
13 | ### Step 3 14 | Download images and corresponding ground truth text for the language to be tested. 15 | Each testset should have only one kind of image (e.g. tif, png, jpg). 16 | The ground truth text files should have the same base filename with txt extension. 17 | As needed, modify the filenames and create the `pages` file for each testset. 18 | 19 | Instructions for testing Fraktur and Sanskrit languages are given below as an example. 20 | 21 | ## Testing for Fraktur - frk and script/Fraktur 22 | ### Download the images and groundtruth, modify to required format. 23 | ``` 24 | bash frk_setup.sh 25 | ``` 26 | ### Run tests for Fraktur - frk and script/Fraktur 27 | ``` 28 | bash frk_test.sh 29 | ``` 30 | ## Testing for Sanskrit - san and script/Devanagari 31 | ### Download the images and groundtruth, modify to required format. 32 | ``` 33 | bash deva_setup.sh 34 | ``` 35 | ### Run tests 36 | ``` 37 | bash deva_test.sh 38 | ``` 39 | 40 | ### Notes from Nick White regarding wordacc 41 | 42 | If you just want to remove all lines which have 100% recognition, 43 | you can add an 'awk' command like this: 44 | 45 | ocrevalutf8 wordacc ground.txt ocr.txt | awk '$3 != 100 {print $0}' \ 46 | > results.txt 47 | 48 | or if you've already got a results file you want to change, you can do this: 49 | 50 | awk '$3 != 100 {print $0}' results.txt > newresults.txt 51 | 52 | If you only want the last sections where things are broken down by 53 | word, you can add a sed command, like this: 54 | 55 | ocrevalutf8 wordacc ground.txt ocr.txt | sed '/^ Count Missed %Right $/,$ 56 | !d' | awk '$3 != 100 {print $0}' > results.txt 57 | -------------------------------------------------------------------------------- /langtests/counttestset.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # File: counttestset.sh 3 | # Description: Script to count the errors on a single UNLV set. 
# Author:      Ray Smith
# Created:     Wed Jun 13 11:58:01 PDT 2007
#
# (C) Copyright 2007, Google Inc.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Usage: counttestset.sh pagesfile langcode
#   pagesfile  path of the form <imdir>/pages; each line holds a page base
#              name, optionally followed by a sub-directory of <imdir>.
#   langcode   selects <langcode>.stopwords inside the results directory.
#
# For every listed page the ground truth <page>.txt is compared with
# langtests/results/<setname>/<page>.txt using ocrevalutf8, and the per-page
# reports are summed by accsum / wordaccsum into
# langtests/results/<setname>.characc and langtests/results/<setname>.wordacc.

if [ $# -ne 2 ]
then
  echo "Usage:$0 pagesfile langcode"
  exit 1
fi

pages=$1
langcode=$2

imdir=${pages%/pages}
setname=${imdir##*/}
resdir=langtests/results/$setname
mkdir -p langtests/reports
echo "Counting on set $setname in directory $imdir to $resdir"

# Arrays keep every report path as a single word, even if it contains spaces.
accfiles=()
wafiles=()

# -r keeps backslashes literal; the "|| [ -n ... ]" part still handles a last
# line that has no trailing newline.
while read -r page dir || [ -n "$page" ]
do
  # Ignore blank lines instead of comparing a file named ".txt".
  [ -n "$page" ] || continue
  if [ "$dir" ]
  then
    srcdir="$imdir/$dir"
  else
    srcdir="$imdir"
  fi
  echo "$srcdir/$page"
  # Count character errors.
  ocrevalutf8 accuracy "$srcdir/$page.txt" "$resdir/$page.txt" > "$resdir/$page.acc"
  accfiles+=("$resdir/$page.acc")
  # Count word errors.
  ocrevalutf8 wordacc -S"$resdir/$langcode.stopwords" "$srcdir/$page.txt" "$resdir/$page.txt" > "$resdir/$page.wa"
  wafiles+=("$resdir/$page.wa")
done <"$pages"

accsum "${accfiles[@]}" >"langtests/results/$setname.characc"
wordaccsum "${wafiles[@]}" >"langtests/results/$setname.wordacc"

--------------------------------------------------------------------------------
/langtests/deva_setup.sh:
--------------------------------------------------------------------------------
#!/bin/bash
####
# Get the images for testing
#
# Every cd is checked: the commands that follow it delete, download or
# overwrite files relative to the current directory.
rm -rf ~/lang-deva-downloads
mkdir ~/lang-deva-downloads
cd ~/lang-deva-downloads || exit 1
git clone https://github.com/Shreeshrii/imagessan.git --depth 1

###
# Copy and rename files as needed for the evaluation script
#
mkdir -p ~/lang-files
rm -rf ~/lang-files/san-*
for testset in oldstyle shreelipi fontsamples
do
  mkdir -p ~/lang-files/"san-$testset"
  cp ~/lang-deva-downloads/imagessan/"$testset"/*.* ~/lang-files/"san-$testset"/
  cd ~/lang-files/"san-$testset" || exit 1
  # Ground truth must share the image's base name: drop the first "-gt".
  for f in *-gt.txt
  do
    [ -e "$f" ] || continue
    mv "$f" "${f/-gt/}"
  done
  # One page base name per line. Only the trailing ".png" is removed; the old
  # sed 's/.png//g' also deleted any "<char>png" found inside a name.
  for img in *.png
  do
    [ -e "$img" ] || continue
    printf '%s\n' "${img%.png}"
  done >pages
done

###
# Copy Devanagari stopwords
mkdir -p ~/lang-stopwords
cp ~/lang-deva-downloads/imagessan/stopwords.txt ~/lang-stopwords/san.stopwords.txt
cp ~/lang-deva-downloads/imagessan/stopwords.txt ~/lang-stopwords/Devanagari.stopwords.txt

###
# Get the traineddata for testing
cd ~/tesseract || exit 1
mkdir -p tessdata_best/script tessdata_fast/script tessdata/script
#
# Same two models from each of the three tessdata repositories.
for repo in tessdata_best tessdata_fast tessdata
do
  wget -O "$repo/san.traineddata" "https://github.com/tesseract-ocr/$repo/raw/master/san.traineddata"
  wget -O "$repo/script/Devanagari.traineddata" "https://github.com/tesseract-ocr/$repo/raw/master/script/Devanagari.traineddata"
done

cd ~/tesseract/test/langtests
--------------------------------------------------------------------------------
/langtests/deva_test.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# run ./langtests/runlangtests.sh with the root data dir, testname, tessdata-dir, language code and image extension

# All paths below are relative to this directory; stop rather than run
# rm -rf from wherever the script happened to be started.
cd ~/tesseract/test || exit 1

rm -rf ./langtests/results/*san*
rm -rf ./langtests/results/*Devanagari*
#rm -rf ./langtests/reports/*san-$(date +%F)*
#rm -rf ./langtests/reports/*Devanagari-$(date +%F)*

# run_set TESTNAME TESSDATA_DIR LANG
# Runs one png test pass, then removes that language's intermediate results.
run_set() {
  ./langtests/runlangtests.sh ~/lang-files "$1" "$2" "$3" png
  rm -rf ./langtests/results/*"$3"*
}

# Run the tests
run_set 4_fast_Devanagari ../tessdata_fast/script Devanagari
run_set 4_best_int_Devanagari ../tessdata/script Devanagari
run_set 4_best_Devanagari ../tessdata_best/script Devanagari
run_set 4_fast_san ../tessdata_fast san
run_set 4_best_int_san ../tessdata san
run_set 4_best_san ../tessdata_best san

### It takes a while to run.

# Absolute path, so the cleanup does not depend on another cd succeeding.
rm -rf ~/tesseract/test/langtests/times.txt
--------------------------------------------------------------------------------
/langtests/frk_setup.sh:
--------------------------------------------------------------------------------
#!/bin/bash
#
# Download the traineddata needed for the Fraktur tests into the three
# tessdata directories under ~/tesseract.
cd ~/tesseract || exit 1

mkdir -p tessdata/script tessdata_best/script tessdata_fast/script

for repo in tessdata tessdata_best tessdata_fast
do
  for model in frk eng osd
  do
    wget -O "$repo/$model.traineddata" "https://github.com/tesseract-ocr/$repo/raw/master/$model.traineddata"
  done
  wget -O "$repo/script/Fraktur.traineddata" "https://github.com/tesseract-ocr/$repo/raw/master/script/Fraktur.traineddata"
done

#
# Fetch the two archives: page images (abbyy11r8-vs-tesseract4.zip) and
# ground truth text (fraktur-gt.zip).
mkdir -p ~/lang-downloads
cd ~/lang-downloads || exit 1
wget -O frk-jbarth-ubhd.zip http://digi.ub.uni-heidelberg.de/diglitData/v/abbyy11r8-vs-tesseract4.zip
wget -O frk-stweil-gt.zip https://digi.bib.uni-mannheim.de/~stweil/fraktur-gt.zip

#
# Unpack both archives and collect images plus ground truth in one test set.
mkdir -p ~/lang-files
cd ~/lang-files || exit 1
unzip ~/lang-downloads/frk-jbarth-ubhd.zip -d frk
unzip ~/lang-downloads/frk-stweil-gt.zip -d frk
mkdir -p ./frk-ligatures
cp ./frk/abbyy-vs-tesseract/*.tif ./frk-ligatures/
cp ./frk/gt/*.txt ./frk-ligatures/

cd ./frk-ligatures/ || exit 1
# One page base name per line. Only the trailing ".tif" is removed; the old
# sed 's/.tif//g' also deleted any "<char>tif" found inside a name.
for img in *.tif
do
  [ -e "$img" ] || continue
  printf '%s\n' "${img%.tif}"
done >pages

#
## mkdir -p ~/lang-stopwords
## cd ~/lang-stopwords
## wget -O frk.stopwords.txt https://raw.githubusercontent.com/stopwords-iso/stopwords-de/master/stopwords-de.txt
## cat frk.stopwords.txt | tr '\n' ' ' > tmp
## echo "\n" > tmpend
## cat tmp tmpend > frk.stopwords.txt
## cp frk.stopwords.txt Fraktur.stopwords.txt
## rm tmp*
## echo "Check ~/lang-stopwords/frk.stopwords.txt as wordacc uses a space delimited stopwords file, not line delimited."
## echo "Also remove duplicate letters because of .
##   and ,"
#
# NOTE(review): the last cd before this point is "cd ./frk-ligatures/", so
# this glob is resolved inside the test-set directory. Presumably the
# langtests results directory was intended - confirm before changing.
rm -rf ./results/*frk*

cd ~/tesseract/test/langtests
--------------------------------------------------------------------------------
/langtests/frk_test.sh:
--------------------------------------------------------------------------------
#!/bin/bash
#
# run langtests/runlangtests.sh with the root ISRI data dir, testname, tessdata-dir, language-code, image-type:

# All paths below are relative to this directory; stop rather than run
# rm -rf from wherever the script happened to be started.
cd ~/tesseract/test || exit 1

rm -rf ./langtests/results/*frk*
rm -rf ./langtests/results/*Fraktur*
rm -rf ./langtests/reports/*frk-$(date +%F)*
rm -rf ./langtests/reports/*Fraktur-$(date +%F)*

# run_set TESTNAME TESSDATA_DIR LANG
# Runs one tif test pass, then removes that language's intermediate results.
run_set() {
  ./langtests/runlangtests.sh ~/lang-files "$1" "$2" "$3" tif
  rm -rf ./langtests/results/*"$3"*
}

run_set 4_fast_Fraktur ../tessdata_fast/script Fraktur
run_set 4_best_int_Fraktur ../tessdata/script Fraktur
run_set 4_best_Fraktur ../tessdata_best/script Fraktur
run_set 4_fast_frk ../tessdata_fast frk
run_set 4_best_int_frk ../tessdata frk
run_set 4_best_frk ../tessdata_best frk

### It takes a while to run.

# Extra runs with contributed frk models. Each model is downloaded to the
# same path, tested under its own test name, and then replaced by the next.
mkdir -p ~/tesseract/tessdata_contrib
cd ~/tesseract/test || exit 1

# run_contrib TESTNAME URL
# Downloads URL as the contrib frk model and runs the tif test set with it.
# The run is skipped when the download fails, because the file at the target
# path is then not the model named by TESTNAME.
run_contrib() {
  if wget -O ~/tesseract/tessdata_contrib/frk.traineddata "$2"
  then
    ./langtests/runlangtests.sh ~/lang-files "$1" ~/tesseract/tessdata_contrib frk tif
  else
    echo "Download failed, skipping $1: $2" >&2
  fi
  rm -rf ./langtests/results/*frk*
}

run_contrib 4_shreetest_frk https://github.com/Shreeshrii/tessdata_shreetest/raw/master/frk.traineddata

run_contrib 4_frk-plus-Fraktur-3000 https://github.com/Shreeshrii/tessdata_fraktur/raw/master/frk-plus-Fraktur-3000.traineddata

run_contrib 4_frk-plus-Fraktur-52500 https://github.com/Shreeshrii/tessdata_fraktur/raw/master/frk-plus-Fraktur-52500.traineddata

# Absolute path, so the cleanup does not depend on another cd succeeding.
rm -rf ~/tesseract/test/langtests/times.txt
--------------------------------------------------------------------------------
/langtests/reports/Devanagari-2019-03-09.summary:
--------------------------------------------------------------------------------
1 | 4_fast_Devanagari san-fontsamples 13168 94.39% 4090 79.57% 4090 79.57 3441.32s
2 | 4_fast_Devanagari san-oldstyle 2883 58.68% 543 37.30% 543 37.30 76.82s
3 | 4_fast_Devanagari san-shreelipi 750 94.58% 279 83.31% 279 83.31 204.21s
4 | 4_best_int_Devanagari san-fontsamples 13056 94.44% 3887 80.59% 3887 80.59 11369.66s
5 | 4_best_int_Devanagari san-oldstyle 2812 59.70% 523 39.61% 523 39.61 255.96s
6 | 4_best_int_Devanagari san-shreelipi 829 94.01% 314 81.22% 314 81.22 647.42s
7 | 4_best_Devanagari san-fontsamples 12827 94.54% 3834 80.85% 3834 80.85 8842.58s
8 | 4_best_Devanagari san-oldstyle 2796 59.93% 523 39.61% 523 39.61 198.19s
9 | 4_best_Devanagari san-shreelipi 830 94.01% 311 81.40% 311 81.40 505.17s
10 |
-------------------------------------------------------------------------------- /langtests/reports/Devanagari.summary: -------------------------------------------------------------------------------- 1 | 4_fast_Devanagari san-fontsamples 13168 94.39% 4090 79.57% 4090 79.57 3441.32s 2 | 4_fast_Devanagari san-oldstyle 2883 58.68% 543 37.30% 543 37.30 76.82s 3 | 4_fast_Devanagari san-shreelipi 750 94.58% 279 83.31% 279 83.31 204.21s 4 | 4_best_int_Devanagari san-fontsamples 13056 94.44% 3887 80.59% 3887 80.59 11369.66s 5 | 4_best_int_Devanagari san-oldstyle 2812 59.70% 523 39.61% 523 39.61 255.96s 6 | 4_best_int_Devanagari san-shreelipi 829 94.01% 314 81.22% 314 81.22 647.42s 7 | 4_best_Devanagari san-fontsamples 12827 94.54% 3834 80.85% 3834 80.85 8842.58s 8 | 4_best_Devanagari san-oldstyle 2796 59.93% 523 39.61% 523 39.61 198.19s 9 | 4_best_Devanagari san-shreelipi 830 94.01% 311 81.40% 311 81.40 505.17s 10 | -------------------------------------------------------------------------------- /langtests/reports/Fraktur-2019-03-09.summary: -------------------------------------------------------------------------------- 1 | 4_fast_Fraktur frk-ligatures 265 92.16% 116 78.32% 82 72.48 197.97s 2 | 4_best_int_Fraktur frk-ligatures 265 92.16% 116 78.32% 82 72.48 198.37s 3 | 4_best_Fraktur frk-ligatures 193 94.29% 113 78.88% 81 72.82 155.81s 4 | -------------------------------------------------------------------------------- /langtests/reports/Fraktur.summary: -------------------------------------------------------------------------------- 1 | 4_fast_Fraktur frk-ligatures 265 92.16% 116 78.32% 82 72.48 197.97s 2 | 4_best_int_Fraktur frk-ligatures 265 92.16% 116 78.32% 82 72.48 198.37s 3 | 4_best_Fraktur frk-ligatures 193 94.29% 113 78.88% 81 72.82 155.81s 4 | -------------------------------------------------------------------------------- /langtests/reports/frk-2019-03-09.summary: -------------------------------------------------------------------------------- 1 | 4_fast_frk 
frk-ligatures 244 92.78% 109 79.63% 80 73.15 193.91s 2 | 4_best_int_frk frk-ligatures 244 92.78% 109 79.63% 80 73.15 194.35s 3 | 4_best_frk frk-ligatures 178 94.73% 100 81.31% 74 75.17 147.59s 4 | 4_shreetest_frk frk-ligatures 128 96.21% 84 84.30% 61 79.53 159.41s 5 | 4_frk-plus-Fraktur-3000 frk-ligatures 134 96.03% 60 88.79% 46 84.56 148.53s 6 | 4_frk-plus-Fraktur-52500 frk-ligatures 126 96.27% 47 91.21% 37 87.58 132.38s 7 | -------------------------------------------------------------------------------- /langtests/reports/frk.summary: -------------------------------------------------------------------------------- 1 | 4_fast_frk frk-ligatures 244 92.78% 109 79.63% 80 73.15 193.91s 2 | 4_best_int_frk frk-ligatures 244 92.78% 109 79.63% 80 73.15 194.35s 3 | 4_best_frk frk-ligatures 178 94.73% 100 81.31% 74 75.17 147.59s 4 | 4_shreetest_frk frk-ligatures 128 96.21% 84 84.30% 61 79.53 159.41s 5 | 4_frk-plus-Fraktur-3000 frk-ligatures 134 96.03% 60 88.79% 46 84.56 148.53s 6 | 4_frk-plus-Fraktur-52500 frk-ligatures 126 96.27% 47 91.21% 37 87.58 132.38s 7 | -------------------------------------------------------------------------------- /langtests/reports/san-2019-03-09.summary: -------------------------------------------------------------------------------- 1 | 4_fast_san san-fontsamples 18385 92.17% 5389 73.08% 5389 73.08 3534.54s 2 | 4_fast_san san-oldstyle 3121 55.27% 602 30.48% 602 30.48 76.76s 3 | 4_fast_san san-shreelipi 1163 91.60% 417 75.06% 417 75.06 220.01s 4 | 4_best_int_san san-fontsamples 18385 92.17% 5389 73.08% 5389 73.08 3536.37s 5 | 4_best_int_san san-oldstyle 3121 55.27% 602 30.48% 602 30.48 76.86s 6 | 4_best_int_san san-shreelipi 1163 91.60% 417 75.06% 417 75.06 220.27s 7 | 4_best_san san-fontsamples 18458 92.14% 5408 72.99% 5408 72.99 2803.86s 8 | 4_best_san san-oldstyle 3121 55.27% 598 30.95% 598 30.95 58.98s 9 | 4_best_san san-shreelipi 1168 91.56% 414 75.24% 414 75.24 171.57s 10 | 
-------------------------------------------------------------------------------- /langtests/reports/san.summary: -------------------------------------------------------------------------------- 1 | 4_fast_san san-fontsamples 18385 92.17% 5389 73.08% 5389 73.08 3534.54s 2 | 4_fast_san san-oldstyle 3121 55.27% 602 30.48% 602 30.48 76.76s 3 | 4_fast_san san-shreelipi 1163 91.60% 417 75.06% 417 75.06 220.01s 4 | 4_best_int_san san-fontsamples 18385 92.17% 5389 73.08% 5389 73.08 3536.37s 5 | 4_best_int_san san-oldstyle 3121 55.27% 602 30.48% 602 30.48 76.86s 6 | 4_best_int_san san-shreelipi 1163 91.60% 417 75.06% 417 75.06 220.27s 7 | 4_best_san san-fontsamples 18458 92.14% 5408 72.99% 5408 72.99 2803.86s 8 | 4_best_san san-oldstyle 3121 55.27% 598 30.95% 598 30.95 58.98s 9 | 4_best_san san-shreelipi 1168 91.56% 414 75.24% 414 75.24 171.57s 10 | -------------------------------------------------------------------------------- /langtests/runlangtests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ############################################################################## 3 | # File: runlangtests.sh 4 | # Description: Script to run a set of accuracy test sets for any language. 5 | # based on runalltests.sh by Ray Smith 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
##############################################################################
# Runs every accuracy test set configured for one language/script model and
# appends one summary line per test set to the report files.
#
# Arguments:
#   $1 - directory holding the test sets; each set needs a "<set>/pages" file
#   $2 - version id used to label the summary lines
#   $3 - tessdata directory, passed through to runtestset.sh
#   $4 - language code: frk, Fraktur, san or Devanagari
#   $5 - image file extension, passed through to runtestset.sh
# Outputs:
#   ./langtests/results/<vid>.<set>.sum    one line per run of a test set
#   ./langtests/results/<lang>.Header      column titles
#   ./langtests/reports/<lang>.summary and <lang>-<date>.summary (appended)
#   The header followed by the dated summary is printed on stdout.
# Exit status: 1 on wrong usage or on a language without test sets.
if [ $# -ne 5 ]
then
  echo "Usage:$0 unlv-data-dir version-id tessdata-dir langcode imgext"
  exit 1
fi

imdir="$1"
vid="$2"
tessdata="$3"
lang="$4"
imgext="$5"

# timesum FILE: print the sum of the second column of FILE (the per-page
# times written by runtestset.sh) with two decimals.
timesum() {
  awk '{ total += $2 } END { printf("%.2f\n", total) }' "$1"
}

# accline FILE N COLS: print character columns COLS of line N of FILE with
# all blanks removed.
accline() {
  head -n "$2" "$1" | tail -n 1 | cut -c"$3" | tr -d '[:blank:]'
}

# totalline FILE COLS: print character columns COLS of the second (or only)
# line of FILE containing "Total", with all blanks removed.
totalline() {
  grep Total "$1" | head -n 2 | tail -n 1 | cut -c"$2" | tr -d '[:blank:]'
}

# Directory of this script, so the sibling scripts can be found.
bindir=${0%/*}
if [ "$bindir" = "$0" ]
then
  bindir="."
fi

rdir=./langtests/reports
# Set before the loop: the summary step below needs it even when no test set
# could be processed.
resultsdir=./langtests/results

case "$lang" in
  frk|Fraktur)
    testsets="frk-ligatures"
    ;;
  san|Devanagari)
    ### testsets="san-fontsamples san-oldstyle san-shreelipi san-alphabetsamples"
    testsets="san-fontsamples san-oldstyle san-shreelipi"
    ;;
  *)
    # Without this guard the loop is skipped and the summary step reads
    # nonexistent files.
    echo "$0: no test sets are defined for language '$lang'" >&2
    exit 1
    ;;
esac

mkdir -p "$resultsdir" "$rdir"

# Column titles; kept in a separate file and only printed at the end, never
# appended to the summary files.
# NOTE(review): separators are reproduced as the single spaces shown in this
# listing - confirm against the checked-in reports whether they were tabs.
echo "RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWErrors Accuracy TimeTaken" >"$resultsdir/$lang.Header"

# $testsets is a blank-separated list, so it is expanded unquoted on purpose.
for testset in $testsets
do
  resdir="$resultsdir/$testset"
  mkdir -p "$resdir"
  # A missing stop-word list is not fatal; cp reports it and the run goes on.
  cp ~/lang-stopwords/"$lang".stopwords.txt "$resdir/$lang.stopwords"

  pages="$imdir/$testset/pages"
  if [ ! -r "$pages" ]
  then
    echo "$0: skipping $testset, cannot read $pages" >&2
    continue
  fi

  # Run tesseract on all the pages.
  bash "$bindir/runtestset.sh" "$pages" "$tessdata" "$lang" "$imgext"
  # Count the errors on all the pages.
  "$bindir/counttestset.sh" "$pages" "$lang"

  # Get the new character, word and non-stopword errors and accuracy from
  # fixed positions of the accuracy reports.
  cherrs=$(accline "$resultsdir/$testset.characc" 4 1-9)
  chacc=$(accline "$resultsdir/$testset.characc" 5 1-9)
  wderrs=$(accline "$resultsdir/$testset.wordacc" 4 1-9)
  wdacc=$(accline "$resultsdir/$testset.wordacc" 5 1-9)
  nswderrs=$(totalline "$resultsdir/$testset.wordacc" 10-17)
  nswdacc=$(totalline "$resultsdir/$testset.wordacc" 19-26)

  if [ -r "$resultsdir/$testset.times" ]
  then
    total_time=$(timesum "$resultsdir/$testset.times")
  else
    total_time='0.0'
  fi

  sumfile="$resultsdir/$vid.$testset.sum"
  echo "$vid $testset $cherrs $chacc $wderrs $wdacc $nswderrs $nswdacc ${total_time}s" >>"$sumfile"
done

# Evaluate the date once so both uses name the same file even across midnight.
today=$(date +%F)

# NOTE(review): the glob collects every <vid>.*.sum in the results directory,
# including lines from earlier runs with the same version id.
cat "$resultsdir/$vid".*.sum >>"$rdir/$lang-$today.summary"
cat "$resultsdir/$vid".*.sum >>"$rdir/$lang.summary"

cat "$resultsdir/$lang.Header" "$rdir/$lang-$today.summary"
# 108 | -------------------------------------------------------------------------------- /langtests/runtestset.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # File: runtestset.sh 3 | # Description: Script to run tesseract on a single UNLV set. 4 | # Author: Ray Smith 5 | # Created: Wed Jun 13 10:13:01 PDT 2007 6 | # 7 | # (C) Copyright 2007, Google Inc. 8 | # Licensed under the Apache License, Version 2.0 (the "License"); 9 | # you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Runs tesseract on every page named in a pages file and records the time
# reported for each page.
#
# Arguments:
#   $1 - pages file "<set-dir>/pages"; one page per line, optionally followed
#        by a sub-directory of <set-dir> that holds the image
#   $2 - tessdata directory (--tessdata-dir)
#   $3 - language code (-l)
#   $4 - image file extension, without the dot
# Outputs:
#   ./langtests/results/<set>/<page>.*   tesseract output per page
#   ./langtests/results/<set>.times      one "<page> <time>" line per page
#   times.txt in the current directory is used as scratch file.

if [ $# -ne 4 ]
then
  echo "Usage:$0 pagesfile tessdata-dir langcode imgext"
  exit 1
fi

pages=$1
tessdata=$2
langcode=$3
imgext=$4

tessdir=..
# Held as an array so every argument stays one word. "time" reaches the shell
# through an expansion, so the external utility on PATH is run rather than the
# bash keyword; -f/-o are options of that utility, and it writes the %U value
# to times.txt.
tess=(time -f %U -o times.txt "$tessdir/src/api/tesseract")

imdir=${pages%/pages}
setname=${imdir##*/}

# Extra tesseract arguments; none by default.
config=()
resdir=./langtests/results/$setname
timesfile=./langtests/results/$setname.times

printf '\nTesting %s and %s on set %s in directory %s to %s\n\n' \
  "$tessdata" "$langcode" "$setname" "$imdir" "$resdir"
mkdir -p "$resdir"
rm -f "$timesfile"

# "|| [ -n "$page" ]" keeps a last line that has no trailing newline.
while read -r page dir || [ -n "$page" ]
do
  # Ignore blank lines instead of running tesseract on an empty page name.
  [ -n "$page" ] || continue

  # A pages file may be a list of files with subdirs or maybe just
  # a plain list of files so accommodate both.
  if [ "$dir" ]
  then
    srcdir="$imdir/$dir"
  else
    srcdir="$imdir"
  fi
  echo "$srcdir/$page"

  # Never pick up the time of a previous page if no new one gets written.
  rm -f times.txt
  # stdin comes from /dev/null so the command cannot consume the pages list
  # that feeds this loop.
  "${tess[@]}" "$srcdir/$page.$imgext" "$resdir/$page" \
    --tessdata-dir "$tessdata" --oem 1 -l "$langcode" --psm 6 \
    "${config[@]}" </dev/null 2>&1 \
    | grep -v "OCR Engine" | grep -v "Page 1"

  if [ -r times.txt ]
  then
    read -r t <times.txt
    # "<page> <time>": runlangtests.sh sums the second column of this file.
    echo "$page $t" >>"$timesfile"
    # ESC M moves the cursor up one line before the page and time are printed.
    printf '\033M%s %s\n' "$page" "$t"
    # Interrupted by the user (SIGINT): stop the whole set quietly.
    if [ "$t" = "Command terminated by signal 2" ]
    then
      exit 0
    fi
  fi
done <"$pages"
# 63 | -------------------------------------------------------------------------------- /testdata/README.md: -------------------------------------------------------------------------------- 1 | ## testdata 2 | 3 | This repo has files required by Google's unittests for Tesseract.
4 | 5 | The unicharset files were provided by Ray Smith for unicharcompress_test. 6 | These seem to be old format unicharsets, having ligatures as unichars. 7 | 8 | A number of testdata files required for unittests were not made available 9 | with the test sources. An attempt has been made to recreate similar files. 10 | 11 | The files for lstm related tests have been created as follows: 12 | 13 | 14 | ``` 15 | src/training/tesstrain.sh \ 16 | --fonts_dir ~/.fonts \ 17 | --lang eng \ 18 | --linedata_only \ 19 | --noextract_font_properties \ 20 | --workspace_dir ~/tmp \ 21 | --exposures "0" \ 22 | --langdata_dir ../langdata_lstm \ 23 | --tessdata_dir ../tessdata \ 24 | --output_dir ~/tesseract/test/testdata \ 25 | --fontlist "Arial" "Arial Unicode MS" \ 26 | --training_text ~/langdata_lstm/eng/eng.training_text \ 27 | --maxpages 20 \ 28 | --xsize 800 29 | 30 | rm ~/tesseract/test/testdata/eng.training_files.txt 31 | rm ~/tesseract/test/testdata/eng/eng.charset_size*.txt 32 | 33 | src/training/tesstrain.sh \ 34 | --fonts_dir ~/.fonts \ 35 | --lang kor \ 36 | --linedata_only \ 37 | --noextract_font_properties \ 38 | --langdata_dir ../langdata_lstm \ 39 | --tessdata_dir ../tessdata \ 40 | --output_dir ~/tesseract/test/testdata \ 41 | --fontlist "Arial Unicode MS" \ 42 | --maxpages 15 \ 43 | --xsize 800 44 | 45 | rm ~/tesseract/test/testdata/kor.training_files.txt 46 | rm ~/tesseract/test/testdata/kor/kor.charset_size*.txt 47 | 48 | src/training/tesstrain.sh \ 49 | --fonts_dir ~/.fonts \ 50 | --lang kan \ 51 | --linedata_only \ 52 | --noextract_font_properties \ 53 | --langdata_dir ../langdata_lstm \ 54 | --tessdata_dir ../tessdata \ 55 | --output_dir ~/tesseract/test/testdata \ 56 | --fontlist "Arial Unicode MS" \ 57 | --maxpages 10 --xsize 800 58 | 59 | rm ~/tesseract/test/testdata/kan.training_files.txt 60 | rm ~/tesseract/test/testdata/kan/kan.charset_size*.txt 61 | 62 | src/training/tesstrain.sh \ 63 | --fonts_dir ~/.fonts \ 64 | --lang deu \ 65 | --linedata_only \ 
66 | --noextract_font_properties \ 67 | --langdata_dir ../langdata_lstm \ 68 | --tessdata_dir ../tessdata \ 69 | --output_dir ~/tesseract/test/testdata \ 70 | --fontlist "Arial Unicode MS" \ 71 | --maxpages 10 --xsize 800 72 | 73 | rm ~/tesseract/test/testdata/deu.training_files.txt 74 | rm ~/tesseract/test/testdata/deu/deu.charset_size*.txt 75 | 76 | src/training/tesstrain.sh \ 77 | --fonts_dir ~/.fonts \ 78 | --lang fra \ 79 | --linedata_only \ 80 | --noextract_font_properties \ 81 | --langdata_dir ../langdata_lstm \ 82 | --tessdata_dir ../tessdata \ 83 | --output_dir ~/tesseract/test/testdata \ 84 | --fontlist "Arial Unicode MS" \ 85 | --maxpages 10 --xsize 800 86 | 87 | rm ~/tesseract/test/testdata/fra.training_files.txt 88 | rm ~/tesseract/test/testdata/fra/fra.charset_size*.txt 89 | 90 | ``` 91 | -------------------------------------------------------------------------------- /testdata/ara.unicharset: -------------------------------------------------------------------------------- 1 | 221 2 | NULL 0 Common 0 3 | Joined 7 0,255,0,255,0,0,0,0,0,0 Latin 1 0 1 Joined # Joined [4a 6f 69 6e 65 64 ]a 4 | |Broken|0|1 f 0,255,0,255,0,0,0,0,0,0 Common 2 10 2 |Broken|0|1 # Broken 5 | لا 1 0,255,0,255,0,0,0,0,0,0 Arabic 3 13 3 لا # لا [644 627 ]x 6 | ح 1 0,255,0,255,0,0,0,0,0,0 Arabic 4 13 4 ح # ح [62d ]x 7 | ت 1 0,255,0,255,0,0,0,0,0,0 Arabic 5 13 5 ت # ت [62a ]x 8 | ك 1 0,255,0,255,0,0,0,0,0,0 Arabic 6 13 6 ك # ك [643 ]x 9 | ا 1 0,255,0,255,0,0,0,0,0,0 Arabic 7 13 7 ا # ا [627 ]x 10 | ه 1 0,255,0,255,0,0,0,0,0,0 Arabic 8 13 8 ه # ه [647 ]x 11 | م 1 0,255,0,255,0,0,0,0,0,0 Arabic 9 13 9 م # م [645 ]x 12 | ي 1 0,255,0,255,0,0,0,0,0,0 Arabic 10 13 10 ي # ي [64a ]x 13 | ن 1 0,255,0,255,0,0,0,0,0,0 Arabic 11 13 11 ن # ن [646 ]x 14 | أ 1 0,255,0,255,0,0,0,0,0,0 Arabic 12 13 12 أ # أ [623 ]x 15 | ط 1 0,255,0,255,0,0,0,0,0,0 Arabic 13 13 13 ط # ط [637 ]x 16 | ل 1 0,255,0,255,0,0,0,0,0,0 Arabic 14 13 14 ل # ل [644 ]x 17 | ب 1 0,255,0,255,0,0,0,0,0,0 Arabic 15 13 15 ب # ب [628 
]x 18 | س 1 0,255,0,255,0,0,0,0,0,0 Arabic 16 13 16 س # س [633 ]x 19 | ق 1 0,255,0,255,0,0,0,0,0,0 Arabic 17 13 17 ق # ق [642 ]x 20 | ر 1 0,255,0,255,0,0,0,0,0,0 Arabic 18 13 18 ر # ر [631 ]x 21 | ف 1 0,255,0,255,0,0,0,0,0,0 Arabic 19 13 19 ف # ف [641 ]x 22 | ع 1 0,255,0,255,0,0,0,0,0,0 Arabic 20 13 20 ع # ع [639 ]x 23 | ة 1 0,255,0,255,0,0,0,0,0,0 Arabic 21 13 21 ة # ة [629 ]x 24 | « 10 0,255,0,255,0,0,0,0,0,0 Common 22 10 135 « # « [ab ]p 25 | . 10 0,255,0,255,0,0,0,0,0,0 Common 23 6 23 . # . [2e ]p 26 | و 1 0,255,0,255,0,0,0,0,0,0 Arabic 24 13 24 و # و [648 ]x 27 | ش 1 0,255,0,255,0,0,0,0,0,0 Arabic 25 13 25 ش # ش [634 ]x 28 | د 1 0,255,0,255,0,0,0,0,0,0 Arabic 26 13 26 د # د [62f ]x 29 | خ 1 0,255,0,255,0,0,0,0,0,0 Arabic 27 13 27 خ # خ [62e ]x 30 | ص 1 0,255,0,255,0,0,0,0,0,0 Arabic 28 13 28 ص # ص [635 ]x 31 | ض 1 0,255,0,255,0,0,0,0,0,0 Arabic 29 13 29 ض # ض [636 ]x 32 | ـُ 1 0,255,0,255,0,0,0,0,0,0 Common 30 13 30 ـُ # ـُ [640 64f ]x 33 | ج 1 0,255,0,255,0,0,0,0,0,0 Arabic 31 13 31 ج # ج [62c ]x 34 | , 10 0,255,0,255,0,0,0,0,0,0 Common 32 6 32 , # , [2c ]p 35 | إ 1 0,255,0,255,0,0,0,0,0,0 Arabic 33 13 33 إ # إ [625 ]x 36 | 1 8 0,255,0,255,0,0,0,0,0,0 Common 34 2 34 1 # 1 [31 ]0 37 | 9 8 0,255,0,255,0,0,0,0,0,0 Common 35 2 35 9 # 9 [39 ]0 38 | 6 8 0,255,0,255,0,0,0,0,0,0 Common 36 2 36 6 # 6 [36 ]0 39 | - 10 0,255,0,255,0,0,0,0,0,0 Common 37 3 37 - # - [2d ]p 40 | 4 8 0,255,0,255,0,0,0,0,0,0 Common 38 2 38 4 # 4 [34 ]0 41 | / 10 0,255,0,255,0,0,0,0,0,0 Common 39 6 39 / # / [2f ]p 42 | : 10 0,255,0,255,0,0,0,0,0,0 Common 40 6 40 : # : [3a ]p 43 | ( 10 0,255,0,255,0,0,0,0,0,0 Common 41 10 42 ( # ( [28 ]p 44 | ) 10 0,255,0,255,0,0,0,0,0,0 Common 42 10 41 ) # ) [29 ]p 45 | وّ 1 0,255,0,255,0,0,0,0,0,0 Arabic 43 13 43 وّ # وّ [648 651 ]x 46 | غ 1 0,255,0,255,0,0,0,0,0,0 Arabic 44 13 44 غ # غ [63a ]x 47 | ث 1 0,255,0,255,0,0,0,0,0,0 Arabic 45 13 45 ث # ث [62b ]x 48 | ٢ 8 0,255,0,255,0,0,0,0,0,0 Arabic 46 5 46 ٢ # ٢ [662 ]0 49 | ٠ 8 0,255,0,255,0,0,0,0,0,0 Arabic 47 
5 47 ٠ # ٠ [660 ]0 50 | " 10 0,255,0,255,0,0,0,0,0,0 Common 48 10 48 " # " [22 ]p 51 | ء 1 0,255,0,255,0,0,0,0,0,0 Arabic 49 13 49 ء # ء [621 ]x 52 | ذ 1 0,255,0,255,0,0,0,0,0,0 Arabic 50 13 50 ذ # ذ [630 ]x 53 | 3 8 0,255,0,255,0,0,0,0,0,0 Common 51 2 51 3 # 3 [33 ]0 54 | رُ 1 0,255,0,255,0,0,0,0,0,0 Arabic 52 13 52 رُ # رُ [631 64f ]x 55 | 2 8 0,255,0,255,0,0,0,0,0,0 Common 53 2 53 2 # 2 [32 ]0 56 | مُ 1 0,255,0,255,0,0,0,0,0,0 Arabic 54 13 54 مُ # مُ [645 64f ]x 57 | ظ 1 0,255,0,255,0,0,0,0,0,0 Arabic 55 13 55 ظ # ظ [638 ]x 58 | 5 8 0,255,0,255,0,0,0,0,0,0 Common 56 2 56 5 # 5 [35 ]0 59 | 8 8 0,255,0,255,0,0,0,0,0,0 Common 57 2 57 8 # 8 [38 ]0 60 | بَ 1 0,255,0,255,0,0,0,0,0,0 Arabic 58 13 58 بَ # بَ [628 64e ]x 61 | لَ 1 0,255,0,255,0,0,0,0,0,0 Arabic 59 13 59 لَ # لَ [644 64e ]x 62 | غَ 1 0,255,0,255,0,0,0,0,0,0 Arabic 60 13 60 غَ # غَ [63a 64e ]x 63 | 0 8 0,255,0,255,0,0,0,0,0,0 Common 61 2 61 0 # 0 [30 ]0 64 | ةٍ 1 0,255,0,255,0,0,0,0,0,0 Arabic 62 13 62 ةٍ # ةٍ [629 64d ]x 65 | ةً 1 0,255,0,255,0,0,0,0,0,0 Arabic 63 13 63 ةً # ةً [629 64b ]x 66 | ى 1 0,255,0,255,0,0,0,0,0,0 Arabic 64 13 64 ى # ى [649 ]x 67 | آ 1 0,255,0,255,0,0,0,0,0,0 Arabic 65 13 65 آ # آ [622 ]x 68 | ز 1 0,255,0,255,0,0,0,0,0,0 Arabic 66 13 66 ز # ز [632 ]x 69 | 7 8 0,255,0,255,0,0,0,0,0,0 Common 67 2 67 7 # 7 [37 ]0 70 | ؤ 1 0,255,0,255,0,0,0,0,0,0 Arabic 68 13 68 ؤ # ؤ [624 ]x 71 | ئ 1 0,255,0,255,0,0,0,0,0,0 Arabic 69 13 69 ئ # ئ [626 ]x 72 | ـ 1 0,255,0,255,0,0,0,0,0,0 Common 70 13 70 ـ # ـ [640 ]x 73 | ' 10 0,255,0,255,0,0,0,0,0,0 Common 71 10 71 ' # ' [27 ]p 74 | نّ 1 0,255,0,255,0,0,0,0,0,0 Arabic 72 13 72 نّ # نّ [646 651 ]x 75 | رّ 1 0,255,0,255,0,0,0,0,0,0 Arabic 73 13 73 رّ # رّ [631 651 ]x 76 | اً 1 0,255,0,255,0,0,0,0,0,0 Arabic 74 13 74 اً # اً [627 64b ]x 77 | دّ 1 0,255,0,255,0,0,0,0,0,0 Arabic 75 13 75 دّ # دّ [62f 651 ]x 78 | رِ 1 0,255,0,255,0,0,0,0,0,0 Arabic 76 13 76 رِ # رِ [631 650 ]x 79 | = 0 0,255,0,255,0,0,0,0,0,0 Common 77 10 77 = # = [3d ] 80 | تْ 1 
0,255,0,255,0,0,0,0,0,0 Arabic 78 13 78 تْ # تْ [62a 652 ]x 81 | قَ 1 0,255,0,255,0,0,0,0,0,0 Arabic 79 13 79 قَ # قَ [642 64e ]x 82 | هُ 1 0,255,0,255,0,0,0,0,0,0 Arabic 80 13 80 هُ # هُ [647 64f ]x 83 | كَ 1 0,255,0,255,0,0,0,0,0,0 Arabic 81 13 81 كَ # كَ [643 64e ]x 84 | رَ 1 0,255,0,255,0,0,0,0,0,0 Arabic 82 13 82 رَ # رَ [631 64e ]x 85 | لِ 1 0,255,0,255,0,0,0,0,0,0 Arabic 83 13 83 لِ # لِ [644 650 ]x 86 | دْ 1 0,255,0,255,0,0,0,0,0,0 Arabic 84 13 84 دْ # دْ [62f 652 ]x 87 | تُ 1 0,255,0,255,0,0,0,0,0,0 Arabic 85 13 85 تُ # تُ [62a 64f ]x 88 | قُ 1 0,255,0,255,0,0,0,0,0,0 Arabic 86 13 86 قُ # قُ [642 64f ]x 89 | لَّ 1 0,255,0,255,0,0,0,0,0,0 Arabic 87 13 87 لَّ # لَّ [644 64e 651 ]x 90 | ةٌ 1 0,255,0,255,0,0,0,0,0,0 Arabic 88 13 88 ةٌ # ةٌ [629 64c ]x 91 | بُ 1 0,255,0,255,0,0,0,0,0,0 Arabic 89 13 89 بُ # بُ [628 64f ]x 92 | دِ 1 0,255,0,255,0,0,0,0,0,0 Arabic 90 13 90 دِ # دِ [62f 650 ]x 93 | يَ 1 0,255,0,255,0,0,0,0,0,0 Arabic 91 13 91 يَ # يَ [64a 64e ]x 94 | ] 10 0,255,0,255,0,0,0,0,0,0 Common 92 10 165 ] # ] [5d ]p 95 | * 10 0,255,0,255,0,0,0,0,0,0 Common 93 10 93 * # * [2a ]p 96 | % 10 0,255,0,255,0,0,0,0,0,0 Common 94 4 94 % # % [25 ]p 97 | ؛ 10 0,255,0,255,0,0,0,0,0,0 Common 95 13 95 ؛ # ؛ [61b ]p 98 | | 0 0,255,0,255,0,0,0,0,0,0 Common 96 10 96 | # | [7c ] 99 | ‏ 0 0,255,0,255,0,0,0,0,0,0 Common 97 1 97 ‏ # ‏ [200f ] 100 | لاً 1 0,255,0,255,0,0,0,0,0,0 Arabic 98 13 98 لاً # لاً [644 627 64b ]x 101 | شَ 1 0,255,0,255,0,0,0,0,0,0 Arabic 99 13 99 شَ # شَ [634 64e ]x 102 | نُ 1 0,255,0,255,0,0,0,0,0,0 Arabic 100 13 100 نُ # نُ [646 64f ]x 103 | رْ 1 0,255,0,255,0,0,0,0,0,0 Arabic 101 13 101 رْ # رْ [631 652 ]x 104 | مً 1 0,255,0,255,0,0,0,0,0,0 Arabic 102 13 102 مً # مً [645 64b ]x 105 | مْ 1 0,255,0,255,0,0,0,0,0,0 Arabic 103 13 103 مْ # مْ [645 652 ]x 106 | جْ 1 0,255,0,255,0,0,0,0,0,0 Arabic 104 13 104 جْ # جْ [62c 652 ]x 107 | حْ 1 0,255,0,255,0,0,0,0,0,0 Arabic 105 13 105 حْ # حْ [62d 652 ]x 108 | تَ 1 0,255,0,255,0,0,0,0,0,0 Arabic 106 13 106 تَ # 
تَ [62a 64e ]x 109 | لأ 1 0,255,0,255,0,0,0,0,0,0 Arabic 107 13 107 لأ # لأ [644 623 ]x 110 | مَ 1 0,255,0,255,0,0,0,0,0,0 Arabic 108 13 108 مَ # مَ [645 64e ]x 111 | رً 1 0,255,0,255,0,0,0,0,0,0 Arabic 109 13 109 رً # رً [631 64b ]x 112 | مَّ 1 0,255,0,255,0,0,0,0,0,0 Arabic 110 13 110 مَّ # مَّ [645 64e 651 ]x 113 | عَ 1 0,255,0,255,0,0,0,0,0,0 Arabic 111 13 111 عَ # عَ [639 64e ]x 114 | فِ 1 0,255,0,255,0,0,0,0,0,0 Arabic 112 13 112 فِ # فِ [641 650 ]x 115 | بْ 1 0,255,0,255,0,0,0,0,0,0 Arabic 113 13 113 بْ # بْ [628 652 ]x 116 | يَّ 1 0,255,0,255,0,0,0,0,0,0 Arabic 114 13 114 يَّ # يَّ [64a 64e 651 ]x 117 | وَ 1 0,255,0,255,0,0,0,0,0,0 Arabic 115 13 115 وَ # وَ [648 64e ]x 118 | عْ 1 0,255,0,255,0,0,0,0,0,0 Arabic 116 13 116 عْ # عْ [639 652 ]x 119 | حُ 1 0,255,0,255,0,0,0,0,0,0 Arabic 117 13 117 حُ # حُ [62d 64f ]x 120 | لُ 1 0,255,0,255,0,0,0,0,0,0 Arabic 118 13 118 لُ # لُ [644 64f ]x 121 | هْ 1 0,255,0,255,0,0,0,0,0,0 Arabic 119 13 119 هْ # هْ [647 652 ]x 122 | يُ 1 0,255,0,255,0,0,0,0,0,0 Arabic 120 13 120 يُ # يُ [64a 64f ]x 123 | فُ 1 0,255,0,255,0,0,0,0,0,0 Arabic 121 13 121 فُ # فُ [641 64f ]x 124 | بِ 1 0,255,0,255,0,0,0,0,0,0 Arabic 122 13 122 بِ # بِ [628 650 ]x 125 | مِ 1 0,255,0,255,0,0,0,0,0,0 Arabic 123 13 123 مِ # مِ [645 650 ]x 126 | حَ 1 0,255,0,255,0,0,0,0,0,0 Arabic 124 13 124 حَ # حَ [62d 64e ]x 127 | نَّ 1 0,255,0,255,0,0,0,0,0,0 Arabic 125 13 125 نَّ # نَّ [646 64e 651 ]x 128 | عُ 1 0,255,0,255,0,0,0,0,0,0 Arabic 126 13 126 عُ # عُ [639 64f ]x 129 | دُ 1 0,255,0,255,0,0,0,0,0,0 Arabic 127 13 127 دُ # دُ [62f 64f ]x 130 | هِ 1 0,255,0,255,0,0,0,0,0,0 Arabic 128 13 128 هِ # هِ [647 650 ]x 131 | كْ 1 0,255,0,255,0,0,0,0,0,0 Arabic 129 13 129 كْ # كْ [643 652 ]x 132 | لْ 1 0,255,0,255,0,0,0,0,0,0 Arabic 130 13 130 لْ # لْ [644 652 ]x 133 | كِ 1 0,255,0,255,0,0,0,0,0,0 Arabic 131 13 131 كِ # كِ [643 650 ]x 134 | ةُ 1 0,255,0,255,0,0,0,0,0,0 Arabic 132 13 132 ةُ # ةُ [629 64f ]x 135 | عِ 1 0,255,0,255,0,0,0,0,0,0 Arabic 133 13 133 عِ # عِ [639 
650 ]x 136 | تِ 1 0,255,0,255,0,0,0,0,0,0 Arabic 134 13 134 تِ # تِ [62a 650 ]x 137 | » 10 0,255,0,255,0,0,0,0,0,0 Common 135 10 22 » # » [bb ]p 138 | رَّ 1 0,255,0,255,0,0,0,0,0,0 Arabic 136 13 136 رَّ # رَّ [631 64e 651 ]x 139 | نَ 1 0,255,0,255,0,0,0,0,0,0 Arabic 137 13 137 نَ # نَ [646 64e ]x 140 | بً 1 0,255,0,255,0,0,0,0,0,0 Arabic 138 13 138 بً # بً [628 64b ]x 141 | كّ 1 0,255,0,255,0,0,0,0,0,0 Arabic 139 13 139 كّ # كّ [643 651 ]x 142 | ةِ 1 0,255,0,255,0,0,0,0,0,0 Arabic 140 13 140 ةِ # ةِ [629 650 ]x 143 | “ 10 0,255,0,255,0,0,0,0,0,0 Common 141 10 141 " # “ [201c ]p 144 | خَ 1 0,255,0,255,0,0,0,0,0,0 Arabic 142 13 142 خَ # خَ [62e 64e ]x 145 | كُ 1 0,255,0,255,0,0,0,0,0,0 Arabic 143 13 143 كُ # كُ [643 64f ]x 146 | سّ 1 0,255,0,255,0,0,0,0,0,0 Arabic 144 13 144 سّ # سّ [633 651 ]x 147 | ‬ 0 0,255,0,255,0,0,0,0,0,0 Common 145 16 145 ‬ # ‬ [202c ] 148 | ٪ 10 0,255,0,255,0,0,0,0,0,0 Arabic 146 4 146 ٪ # ٪ [66a ]p 149 | ° 0 0,255,0,255,0,0,0,0,0,0 Common 147 4 147 ° # ° [b0 ] 150 | مّ 1 0,255,0,255,0,0,0,0,0,0 Arabic 148 13 148 مّ # مّ [645 651 ]x 151 | يّ 1 0,255,0,255,0,0,0,0,0,0 Arabic 149 13 149 يّ # يّ [64a 651 ]x 152 | لّ 1 0,255,0,255,0,0,0,0,0,0 Arabic 150 13 150 لّ # لّ [644 651 ]x 153 | … 10 0,255,0,255,0,0,0,0,0,0 Common 151 10 151 ... # … [2026 ]p 154 | ـَ 1 0,255,0,255,0,0,0,0,0,0 Common 152 13 152 ـَ # ـَ [640 64e ]x 155 | > 0 0,255,0,255,0,0,0,0,0,0 Common 153 10 169 > # > [3e ] 156 | ; 10 0,255,0,255,0,0,0,0,0,0 Common 154 10 154 ; # ; [3b ]p 157 | ! 10 0,255,0,255,0,0,0,0,0,0 Common 155 10 155 ! # ! [21 ]p 158 | تّ 1 0,255,0,255,0,0,0,0,0,0 Arabic 156 13 156 تّ # تّ [62a 651 ]x 159 | $ 0 0,255,0,255,0,0,0,0,0,0 Common 157 4 157 $ # $ [24 ] 160 | § 10 0,255,0,255,0,0,0,0,0,0 Common 158 10 158 § # § [a7 ]p 161 | ? 10 0,255,0,255,0,0,0,0,0,0 Common 159 10 159 ? # ? 
[3f ]p 162 | صّ 1 0,255,0,255,0,0,0,0,0,0 Arabic 160 13 160 صّ # صّ [635 651 ]x 163 | نِ 1 0,255,0,255,0,0,0,0,0,0 Arabic 161 13 161 نِ # نِ [646 650 ]x 164 | ^ 0 0,255,0,255,0,0,0,0,0,0 Common 162 10 162 ^ # ^ [5e ] 165 | { 10 0,255,0,255,0,0,0,0,0,0 Common 163 10 164 { # { [7b ]p 166 | } 10 0,255,0,255,0,0,0,0,0,0 Common 164 10 163 } # } [7d ]p 167 | [ 10 0,255,0,255,0,0,0,0,0,0 Common 165 10 92 [ # [ [5b ]p 168 | ذَ 1 0,255,0,255,0,0,0,0,0,0 Arabic 166 13 166 ذَ # ذَ [630 64e ]x 169 | سِ 1 0,255,0,255,0,0,0,0,0,0 Arabic 167 13 167 سِ # سِ [633 650 ]x 170 | # 10 0,255,0,255,0,0,0,0,0,0 Common 168 4 168 # # # [23 ]p 171 | < 0 0,255,0,255,0,0,0,0,0,0 Common 169 10 153 < # < [3c ] 172 | _ 10 0,255,0,255,0,0,0,0,0,0 Common 170 10 170 _ # _ [5f ]p 173 | فَ 1 0,255,0,255,0,0,0,0,0,0 Arabic 171 13 171 فَ # فَ [641 64e ]x 174 | أَ 1 0,255,0,255,0,0,0,0,0,0 Arabic 172 13 172 أَ # أَ [623 64e ]x 175 | سْ 1 0,255,0,255,0,0,0,0,0,0 Arabic 173 13 173 سْ # سْ [633 652 ]x 176 | جَ 1 0,255,0,255,0,0,0,0,0,0 Arabic 174 13 174 جَ # جَ [62c 64e ]x 177 | سَ 1 0,255,0,255,0,0,0,0,0,0 Arabic 175 13 175 سَ # سَ [633 64e ]x 178 | ةَ 1 0,255,0,255,0,0,0,0,0,0 Arabic 176 13 176 ةَ # ةَ [629 64e ]x 179 | هَ 1 0,255,0,255,0,0,0,0,0,0 Arabic 177 13 177 هَ # هَ [647 64e ]x 180 | طَ 1 0,255,0,255,0,0,0,0,0,0 Arabic 178 13 178 طَ # طَ [637 64e ]x 181 | لاَ 1 0,255,0,255,0,0,0,0,0,0 Arabic 179 13 179 لاَ # لاَ [644 627 64e ]x 182 | وْ 1 0,255,0,255,0,0,0,0,0,0 Arabic 180 13 180 وْ # وْ [648 652 ]x 183 | صَ 1 0,255,0,255,0,0,0,0,0,0 Arabic 181 13 181 صَ # صَ [635 64e ]x 184 | يْ 1 0,255,0,255,0,0,0,0,0,0 Arabic 182 13 182 يْ # يْ [64a 652 ]x 185 | دَ 1 0,255,0,255,0,0,0,0,0,0 Arabic 183 13 183 دَ # دَ [62f 64e ]x 186 | تَّ 1 0,255,0,255,0,0,0,0,0,0 Arabic 184 13 184 تَّ # تَّ [62a 64e 651 ]x 187 | إِ 1 0,255,0,255,0,0,0,0,0,0 Arabic 185 13 185 إِ # إِ [625 650 ]x 188 | نْ 1 0,255,0,255,0,0,0,0,0,0 Arabic 186 13 186 نْ # نْ [646 652 ]x 189 | اْ 1 0,255,0,255,0,0,0,0,0,0 Arabic 187 13 187 اْ # اْ 
[627 652 ]x 190 | سُ 1 0,255,0,255,0,0,0,0,0,0 Arabic 188 13 188 سُ # سُ [633 64f ]x 191 | جُ 1 0,255,0,255,0,0,0,0,0,0 Arabic 189 13 189 جُ # جُ [62c 64f ]x 192 | فْ 1 0,255,0,255,0,0,0,0,0,0 Arabic 190 13 190 فْ # فْ [641 652 ]x 193 | حِ 1 0,255,0,255,0,0,0,0,0,0 Arabic 191 13 191 حِ # حِ [62d 650 ]x 194 | أُ 1 0,255,0,255,0,0,0,0,0,0 Arabic 192 13 192 أُ # أُ [623 64f ]x 195 | ئِ 1 0,255,0,255,0,0,0,0,0,0 Arabic 193 13 193 ئِ # ئِ [626 650 ]x 196 | رٍ 1 0,255,0,255,0,0,0,0,0,0 Arabic 194 13 194 رٍ # رٍ [631 64d ]x 197 | يً 1 0,255,0,255,0,0,0,0,0,0 Arabic 195 13 195 يً # يً [64a 64b ]x 198 | بّ 1 0,255,0,255,0,0,0,0,0,0 Arabic 196 13 196 بّ # بّ [628 651 ]x 199 | دً 1 0,255,0,255,0,0,0,0,0,0 Arabic 197 13 197 دً # دً [62f 64b ]x 200 | \ 10 0,255,0,255,0,0,0,0,0,0 Common 198 10 198 \ # \ [5c ]p 201 | ؟ 10 0,255,0,255,0,0,0,0,0,0 Common 199 13 199 ؟ # ؟ [61f ]p 202 | قِ 1 0,255,0,255,0,0,0,0,0,0 Arabic 200 13 200 قِ # قِ [642 650 ]x 203 | ضً 1 0,255,0,255,0,0,0,0,0,0 Arabic 201 13 201 ضً # ضً [636 64b ]x 204 | لٍ 1 0,255,0,255,0,0,0,0,0,0 Arabic 202 13 202 لٍ # لٍ [644 64d ]x 205 | ١ 8 0,255,0,255,0,0,0,0,0,0 Arabic 203 5 203 ١ # ١ [661 ]0 206 | + 0 0,255,0,255,0,0,0,0,0,0 Common 204 3 204 + # + [2b ] 207 | قْ 1 0,255,0,255,0,0,0,0,0,0 Arabic 205 13 205 قْ # قْ [642 652 ]x 208 | ءً 1 0,255,0,255,0,0,0,0,0,0 Arabic 206 13 206 ءً # ءً [621 64b ]x 209 | ضَ 1 0,255,0,255,0,0,0,0,0,0 Arabic 207 13 207 ضَ # ضَ [636 64e ]x 210 | ذِ 1 0,255,0,255,0,0,0,0,0,0 Arabic 208 13 208 ذِ # ذِ [630 650 ]x 211 | ” 10 0,255,0,255,0,0,0,0,0,0 Common 209 10 209 " # ” [201d ]p 212 | اَ 1 0,255,0,255,0,0,0,0,0,0 Arabic 210 13 210 اَ # اَ [627 64e ]x 213 | ـِ 1 0,255,0,255,0,0,0,0,0,0 Common 211 13 211 ـِ # ـِ [640 650 ]x 214 | × 0 0,255,0,255,0,0,0,0,0,0 Common 212 10 212 × # × [d7 ] 215 | لَا 1 0,255,0,255,0,0,0,0,0,0 Arabic 213 13 213 لَا # لَا [644 64e 627 ]x 216 | & 10 0,255,0,255,0,0,0,0,0,0 Common 214 10 214 & # & [26 ]p 217 | ~ 0 0,255,0,255,0,0,0,0,0,0 Common 215 10 215 ~ # ~ [7e 
] 218 | @ 10 0,255,0,255,0,0,0,0,0,0 Common 216 10 216 @ # @ [40 ]p 219 | ® 0 0,255,0,255,0,0,0,0,0,0 Common 217 10 217 ® # ® [ae ] 220 | وٍ 1 0,255,0,255,0,0,0,0,0,0 Arabic 218 13 218 وٍ # وٍ [648 64d ]x 221 | ₪ 0 0,255,0,255,0,0,0,0,0,0 Common 219 4 219 ₪ # ₪ [20aa ] 222 | © 0 0,255,0,255,0,0,0,0,0,0 Common 220 10 220 © # © [a9 ] 223 | -------------------------------------------------------------------------------- /testdata/deu.Arial_Unicode_MS.exp0.lstmf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tesseract-ocr/test/232ff181c66516116ec0e84c4963f70de15050fd/testdata/deu.Arial_Unicode_MS.exp0.lstmf -------------------------------------------------------------------------------- /testdata/deu/deu.traineddata: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tesseract-ocr/test/232ff181c66516116ec0e84c4963f70de15050fd/testdata/deu/deu.traineddata -------------------------------------------------------------------------------- /testdata/deu/deu.unicharset: -------------------------------------------------------------------------------- 1 | 118 2 | NULL 0 NULL 0 3 | Joined 7 0,69,188,255,486,1218,0,30,486,1188 Latin 6 0 78 Joined # Joined [4a 6f 69 6e 65 64 ]a 4 | |Broken|0|1 f 0,69,186,255,892,2138,0,80,892,2058 Common 83 10 83 |Broken|0|1 # Broken 5 | M 5 57,68,216,255,99,301,0,35,117,286 Latin 8 0 3 M # M [4d ]A 6 | i 3 59,69,216,255,11,141,0,54,27,173 Latin 24 0 4 i # i [69 ]a 7 | r 3 59,68,186,202,58,173,0,40,69,180 Latin 33 0 5 r # r [72 ]a 8 | j 3 0,47,216,255,36,145,0,49,50,173 Latin 78 0 6 j # j [6a ]a 9 | a 3 58,65,186,200,85,164,0,26,97,185 Latin 35 0 7 a # a [61 ]a 10 | m 3 56,68,189,202,108,280,0,25,117,306 Latin 3 0 8 m # m [6d ]a 11 | G 5 58,64,219,255,91,230,0,30,106,230 Latin 21 0 9 G # G [47 ]A 12 | t 3 58,66,206,254,57,167,0,47,59,180 Latin 51 0 10 t # t [74 ]a 13 | e 3 58,64,189,200,87,154,0,32,98,188 Latin 28 0 
11 e # e [65 ]a 14 | n 3 59,68,188,202,87,187,0,25,101,208 Latin 29 0 12 n # n [6e ]a 15 | F 5 57,68,216,255,68,210,0,31,77,209 Latin 49 0 13 F # F [46 ]A 16 | o 3 58,66,188,200,87,151,0,32,98,185 Latin 27 0 14 o # o [6f ]a 17 | u 3 57,65,187,202,85,184,0,39,100,208 Latin 42 0 15 u # u [75 ]a 18 | P 5 57,68,216,255,87,225,0,32,97,230 Latin 76 0 16 P # P [50 ]A 19 | C 5 58,65,219,255,87,192,0,32,107,209 Latin 71 0 17 C # C [43 ]A 20 | - 10 105,161,122,175,49,176,0,43,56,215 Common 18 3 18 - # - [2d ]p 21 | s 3 58,65,192,200,78,147,0,30,91,173 Latin 38 0 19 s # s [73 ]a 22 | ] 10 8,64,216,255,39,129,0,44,55,173 Common 20 10 54 ] # ] [5d ]p 23 | g 3 0,43,188,212,88,176,0,32,100,210 Latin 9 0 21 g # g [67 ]a 24 | fi 3 0,69,216,255,82,408,0,42,82,366 Latin 13 0 49 fi # fi [66 69 ]a 25 | W 5 54,68,216,255,106,314,0,41,117,318 Latin 70 0 23 W # W [57 ]A 26 | I 5 59,68,216,255,10,155,0,50,29,173 Latin 4 0 24 I # I [49 ]A 27 | L 5 59,68,216,255,64,193,0,31,74,206 Latin 60 0 25 L # L [4c ]A 28 | K 5 57,68,216,255,92,225,0,37,103,216 Latin 65 0 26 K # K [4b ]A 29 | O 5 57,64,219,255,91,209,0,34,106,233 Latin 14 0 27 O # O [4f ]A 30 | E 5 59,68,216,255,68,210,0,31,80,219 Latin 11 0 28 E # E [45 ]A 31 | N 5 59,68,216,255,87,262,0,27,104,249 Latin 12 0 29 N # N [4e ]A 32 | . 10 26,67,73,112,13,51,0,67,30,173 Common 30 6 30 . # . 
[2e ]p 33 | d 3 57,65,216,255,88,174,0,28,100,200 Latin 36 0 31 d # d [64 ]a 34 | H 5 59,68,216,255,91,258,0,27,107,244 Latin 52 0 32 H # H [48 ]A 35 | R 5 57,68,216,255,88,227,0,27,104,232 Latin 5 0 33 R # R [52 ]A 36 | V 5 59,68,216,255,103,207,0,41,101,245 Latin 74 0 34 V # V [56 ]A 37 | A 5 52,68,216,255,100,216,0,17,98,231 Latin 7 0 35 A # A [41 ]A 38 | D 5 59,68,216,255,93,230,0,27,107,236 Latin 31 0 36 D # D [44 ]A 39 | Ä 5 64,68,232,255,100,240,0,27,98,248 Latin 95 0 37 Ä # Ä [c4 ]A 40 | S 5 57,64,219,255,87,174,0,30,100,200 Latin 19 0 38 S # S [53 ]A 41 | y 3 0,47,187,202,87,199,0,25,87,230 Latin 59 0 39 y # y [79 ]a 42 | ı 3 62,69,174,241,11,141,0,122,48,293 Latin 24 0 40 ı # ı [131 ]a 43 | Z 5 64,68,216,255,72,218,0,30,77,236 Latin 61 0 41 Z # Z [5a ]A 44 | U 5 58,64,216,255,91,214,0,39,106,220 Latin 15 0 42 U # U [55 ]A 45 | 2 8 30,69,194,255,80,160,0,27,97,173 Common 43 2 43 2 # 2 [32 ]0 46 | , 10 14,46,79,115,17,78,0,58,30,173 Common 44 6 44 , # , [2c ]p 47 | 7 8 12,68,196,255,72,160,0,60,75,173 Common 45 2 45 7 # 7 [37 ]0 48 | < 0 29,102,173,255,69,184,0,50,90,256 Common 46 10 85 < # < [3c ] 49 | = 0 74,139,144,199,90,186,0,32,103,224 Common 47 10 47 = # = [3d ] 50 | “ 10 141,233,216,255,56,133,0,172,66,298 Common 48 10 48 " # “ [201c ]p 51 | f 3 0,68,216,255,54,175,0,42,55,193 Latin 13 0 49 f # f [66 ]a 52 | ü 3 0,65,219,255,85,220,0,39,100,225 Latin 77 0 50 ü # ü [fc ]a 53 | T 5 59,68,216,255,85,227,0,47,88,236 Latin 10 0 51 T # T [54 ]A 54 | h 3 59,68,216,255,87,187,0,25,101,208 Latin 32 0 52 h # h [68 ]a 55 | « 10 26,133,148,235,63,279,0,35,71,281 Common 53 10 104 « # « [ab ]p 56 | [ 10 8,64,216,255,39,136,0,80,55,173 Common 54 10 20 [ # [ [5b ]p 57 | ß 3 0,68,209,255,93,555,0,76,104,565 Latin 55 0 55 ß # ß [df ]a 58 | X 5 59,68,216,255,94,275,0,25,93,256 Latin 58 0 56 X # X [58 ]A 59 | b 3 58,64,216,255,87,180,0,25,100,200 Latin 68 0 57 b # b [62 ]a 60 | x 3 59,68,187,201,85,189,0,25,84,218 Latin 56 0 58 x # x [78 ]a 61 | Y 5 
59,68,216,255,91,205,0,47,91,223 Latin 39 0 59 Y # Y [59 ]A 62 | l 3 59,68,216,255,11,147,0,56,27,173 Latin 25 0 60 l # l [6c ]a 63 | z 3 46,68,186,199,65,151,0,32,68,173 Latin 41 0 61 z # z [7a ]a 64 | : 10 58,85,141,221,11,69,0,67,38,173 Common 62 6 62 : # : [3a ]p 65 | ( 10 0,64,216,255,42,118,0,97,61,173 Common 63 10 64 ( # ( [28 ]p 66 | ) 10 0,64,216,255,42,119,0,53,61,173 Common 64 10 63 ) # ) [29 ]p 67 | k 3 57,68,216,255,85,177,0,35,93,198 Latin 26 0 65 k # k [6b ]a 68 | 1 8 49,69,192,255,45,128,0,66,74,173 Common 66 2 66 1 # 1 [31 ]0 69 | ‚ 10 14,48,74,112,17,78,0,58,32,173 Common 67 10 67 ‚ # ‚ [201a ]p 70 | B 5 62,68,216,255,91,227,0,27,106,227 Latin 57 0 68 B # B [42 ]A 71 | ‘ 10 141,233,210,255,17,64,0,216,30,298 Common 69 10 69 ' # ‘ [2018 ]p 72 | w 3 59,68,187,195,108,235,0,32,117,286 Latin 23 0 70 w # w [77 ]a 73 | c 3 58,64,192,200,80,153,0,36,88,178 Latin 17 0 71 c # c [63 ]a 74 | € 0 32,68,209,255,97,238,0,49,103,293 Common 72 4 72 € # € [20ac ] 75 | Ö 5 0,64,232,255,91,302,0,34,106,314 Latin 84 0 73 Ö # Ö [d6 ]A 76 | v 3 59,68,187,197,84,173,0,32,84,218 Latin 34 0 74 v # v [76 ]a 77 | ' 10 148,225,216,255,11,51,0,97,36,173 Common 75 10 75 ' # ' [27 ]p 78 | p 3 0,47,192,226,87,180,0,25,100,200 Latin 16 0 76 p # p [70 ]a 79 | Ü 5 58,64,232,255,91,291,0,39,106,299 Latin 50 0 77 Ü # Ü [dc ]A 80 | J 5 0,64,216,255,39,242,0,30,62,234 Latin 6 0 78 J # J [4a ]A 81 | 0 8 58,66,187,255,88,164,0,45,103,180 Common 79 2 79 0 # 0 [30 ]0 82 | Q 5 7,64,219,255,91,205,0,30,106,227 Latin 111 0 80 Q # Q [51 ]A 83 | 4 8 0,68,198,255,93,161,0,41,96,173 Common 81 2 81 4 # 4 [34 ]0 84 | \ 10 0,67,219,255,28,250,0,71,62,261 Common 82 10 82 \ # \ [5c ]p 85 | | 0 0,67,216,255,8,73,0,80,50,173 Common 83 10 83 | # | [7c ] 86 | ö 3 58,66,219,255,87,248,0,32,98,256 Latin 73 0 84 ö # ö [f6 ]a 87 | > 0 29,102,173,255,78,184,0,50,90,256 Common 85 10 46 > # > [3e ] 88 | 5 8 12,66,199,255,82,160,0,36,103,173 Common 86 2 86 5 # 5 [35 ]0 89 | 9 8 0,66,200,255,89,156,0,39,104,173 
Common 87 2 87 9 # 9 [39 ]0 90 | ° 0 66,247,209,255,22,399,0,98,66,409 Common 88 4 88 ° # ° [b0 ] 91 | ! 10 41,67,216,255,11,87,0,71,50,173 Common 89 10 89 ! # ! [21 ]p 92 | £ 0 0,135,219,255,64,201,0,55,61,298 Common 90 4 90 £ # £ [a3 ] 93 | 8 8 57,66,219,255,88,162,0,41,103,174 Common 91 2 91 8 # 8 [38 ]0 94 | + 0 54,102,171,253,90,176,0,37,103,213 Common 92 3 92 + # + [2b ] 95 | ’ 10 141,233,212,255,17,78,0,109,30,298 Common 93 10 93 ' # ’ [2019 ]p 96 | 6 8 58,66,219,255,87,156,0,54,104,173 Common 94 2 94 6 # 6 [36 ]0 97 | ä 3 0,64,219,255,85,294,0,26,97,307 Latin 37 0 95 ä # ä [e4 ]a 98 | ? 10 40,67,219,255,59,144,0,65,77,188 Common 96 10 96 ? # ? [3f ]p 99 | 3 8 0,66,196,255,84,158,0,32,103,173 Common 97 2 97 3 # 3 [33 ]0 100 | # 10 37,84,200,255,99,221,0,41,109,266 Common 98 4 98 # # # [23 ]p 101 | / 10 0,65,219,255,59,228,0,36,62,238 Common 99 6 99 / # / [2f ]p 102 | © 0 28,125,209,255,118,232,0,32,119,257 Common 100 10 100 © # © [a9 ] 103 | % 10 27,67,205,255,105,257,0,49,117,288 Common 101 4 101 % # % [25 ]p 104 | ” 10 141,233,216,255,59,141,0,87,66,298 Common 102 10 102 " # ” [201d ]p 105 | } 10 0,44,216,255,54,148,0,56,59,173 Common 103 10 114 } # } [7d ]p 106 | » 10 0,133,146,235,63,284,0,32,71,294 Common 104 10 53 » # » [bb ]p 107 | ® 0 28,163,209,255,83,223,0,48,92,257 Common 105 10 105 ® # ® [ae ] 108 | & 10 53,64,194,255,108,232,0,47,112,239 Common 106 10 106 & # & [26 ]p 109 | $ 0 24,63,229,255,85,174,0,36,106,174 Common 107 4 107 $ # $ [24 ] 110 | ; 10 14,56,131,221,17,93,0,58,38,173 Common 108 10 108 ; # ; [3b ]p 111 | @ 10 0,65,211,255,99,286,0,39,117,291 Common 109 10 109 @ # @ [40 ]p 112 | — 10 110,155,132,167,126,297,0,23,136,298 Common 110 10 110 - # — [2014 ]p 113 | q 3 0,47,192,202,88,196,0,30,100,200 Latin 80 0 111 q # q [71 ]a 114 | § 0 9,66,219,255,82,207,0,86,93,293 Common 112 10 112 § # § [a7 ] 115 | „ 10 3,48,72,232,58,330,0,36,66,337 Common 113 10 113 „ # „ [201e ]p 116 | { 10 0,44,216,255,54,148,0,71,59,173 Common 114 10 103 { # { 
[7b ]p 117 | _ 10 0,50,0,64,73,248,0,29,75,259 Common 115 10 115 _ # _ [5f ]p 118 | * 10 78,183,188,255,49,134,0,60,53,173 Common 116 10 116 * # * [2a ]p 119 | " 10 151,225,216,255,52,115,0,71,71,173 Common 117 10 117 " # " [22 ]p 120 | -------------------------------------------------------------------------------- /testdata/eng.Arial.exp0.lstmf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tesseract-ocr/test/232ff181c66516116ec0e84c4963f70de15050fd/testdata/eng.Arial.exp0.lstmf -------------------------------------------------------------------------------- /testdata/eng.Arial_Unicode_MS.exp0.lstmf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tesseract-ocr/test/232ff181c66516116ec0e84c4963f70de15050fd/testdata/eng.Arial_Unicode_MS.exp0.lstmf -------------------------------------------------------------------------------- /testdata/eng.params_model: -------------------------------------------------------------------------------- 1 | # This file is used for params_model_test. 2 | # See src/ccstruct/params_training_featdef.h for the list of keys which 3 | # must occur here. The test accepts any float value for each key. 
4 | PTRAIN_DIGITS_SHORT 0.0 5 | PTRAIN_DIGITS_MED 1.1 6 | PTRAIN_DIGITS_LONG 2.2 7 | # Number or pattern (NUMBER_PERM, USER_PATTERN_PERM) 8 | PTRAIN_NUM_SHORT 3.3 9 | PTRAIN_NUM_MED 4.4 10 | PTRAIN_NUM_LONG 5.5 11 | # Document word (DOC_DAWG_PERM) 12 | PTRAIN_DOC_SHORT 6 13 | PTRAIN_DOC_MED 7 14 | PTRAIN_DOC_LONG 8 15 | # Word (SYSTEM_DAWG_PERM, USER_DAWG_PERM, COMPOUND_PERM) 16 | PTRAIN_DICT_SHORT 9 17 | PTRAIN_DICT_MED 10 18 | PTRAIN_DICT_LONG 11 19 | # Frequent word (FREQ_DAWG_PERM) 20 | PTRAIN_FREQ_SHORT 12 21 | PTRAIN_FREQ_MED 13 22 | PTRAIN_FREQ_LONG 14 23 | PTRAIN_SHAPE_COST_PER_CHAR 15 24 | PTRAIN_NGRAM_COST_PER_CHAR 16 25 | PTRAIN_NUM_BAD_PUNC 17 26 | PTRAIN_NUM_BAD_CASE 18 27 | PTRAIN_XHEIGHT_CONSISTENCY 19 28 | PTRAIN_NUM_BAD_CHAR_TYPE 20 29 | PTRAIN_NUM_BAD_SPACING 21 30 | PTRAIN_NUM_BAD_FONT 22 31 | PTRAIN_RATING_PER_CHAR 23 32 | -------------------------------------------------------------------------------- /testdata/eng.unicharset: -------------------------------------------------------------------------------- 1 | 111 2 | NULL 0 NULL 0 3 | I 5 59,68,216,255,10,155,0,50,29,173 Latin 7 0 1 I # I [49 ]A 4 | ' 10 148,225,216,255,11,51,0,97,36,173 Common 2 10 2 ' # ' [27 ]p 5 | v 3 59,68,187,197,84,173,0,32,84,218 Latin 61 0 3 v # v [76 ]a 6 | e 3 58,64,189,200,87,154,0,32,98,188 Latin 88 0 4 e # e [65 ]a 7 | J 5 0,64,216,255,39,242,0,30,62,234 Latin 79 0 5 J # J [4a ]A 8 | o 3 58,66,188,200,87,151,0,32,98,185 Latin 83 0 6 o # o [6f ]a 9 | i 3 59,69,216,255,11,141,0,54,27,173 Latin 1 0 7 i # i [69 ]a 10 | n 3 59,68,188,202,87,187,0,25,101,208 Latin 45 0 8 n # n [6e ]a 11 | | 0 0,67,216,255,8,73,0,80,50,173 Common 9 10 9 | # | [7c ] 12 | - 10 105,161,122,175,49,176,0,43,56,215 Common 10 3 10 - # - [2d ]p 13 | S 5 57,64,219,255,87,174,0,30,100,200 Latin 26 0 11 S # S [53 ]A 14 | z 3 46,68,186,199,65,151,0,32,68,173 Latin 95 0 12 z # z [7a ]a 15 | : 10 58,85,141,221,11,69,0,67,38,173 Common 13 6 13 : # : [3a ]p 16 | # 10 37,84,200,255,99,221,0,41,109,266 
Common 14 4 14 # # # [23 ]p 17 | 6 8 58,66,219,255,87,156,0,54,104,173 Common 15 2 15 6 # 6 [36 ]0 18 | % 10 27,67,205,255,105,257,0,49,117,288 Common 16 4 16 % # % [25 ]p 19 | 5 8 12,66,199,255,82,160,0,36,103,173 Common 17 2 17 5 # 5 [35 ]0 20 | 0 8 58,66,187,255,88,164,0,45,103,180 Common 18 2 18 0 # 0 [30 ]0 21 | @ 10 0,65,211,255,99,286,0,39,117,291 Common 19 10 19 @ # @ [40 ]p 22 | p 3 0,47,192,226,87,180,0,25,100,200 Latin 68 0 20 p # p [70 ]a 23 | a 3 58,65,186,200,85,164,0,26,97,185 Latin 67 0 21 a # a [61 ]a 24 | r 3 59,68,186,202,58,173,0,40,69,180 Latin 40 0 22 r # r [72 ]a 25 | m 3 56,68,189,202,108,280,0,25,117,306 Latin 38 0 23 m # m [6d ]a 26 | F 5 57,68,216,255,68,210,0,31,77,209 Latin 29 0 24 F # F [46 ]A 27 | u 3 57,65,187,202,85,184,0,39,100,208 Latin 85 0 25 u # u [75 ]a 28 | s 3 58,65,192,200,78,147,0,30,91,173 Latin 11 0 26 s # s [73 ]a 29 | B 5 62,68,216,255,91,227,0,27,106,227 Latin 46 0 27 B # B [42 ]A 30 | » 10 0,133,146,235,63,284,0,32,71,294 Common 28 10 49 » # » [bb ]p 31 | f 3 0,68,216,255,54,175,0,42,55,193 Latin 24 0 29 f # f [66 ]a 32 | d 3 57,65,216,255,88,174,0,28,100,200 Latin 59 0 30 d # d [64 ]a 33 | c 3 58,64,192,200,80,153,0,36,88,178 Latin 33 0 31 c # c [63 ]a 34 | h 3 59,68,216,255,87,187,0,25,101,208 Latin 55 0 32 h # h [68 ]a 35 | C 5 58,65,219,255,87,192,0,32,107,209 Latin 31 0 33 C # C [43 ]A 36 | t 3 58,66,206,254,57,167,0,47,59,180 Latin 37 0 34 t # t [74 ]a 37 | L 5 59,68,216,255,64,193,0,31,74,206 Latin 41 0 35 L # L [4c ]A 38 | ? 10 40,67,219,255,59,144,0,65,77,188 Common 36 10 36 ? # ? 
[3f ]p 39 | T 5 59,68,216,255,85,227,0,47,88,236 Latin 34 0 37 T # T [54 ]A 40 | M 5 57,68,216,255,99,301,0,35,117,286 Latin 23 0 38 M # M [4d ]A 41 | y 3 0,47,187,202,87,199,0,25,87,230 Latin 100 0 39 y # y [79 ]a 42 | R 5 57,68,216,255,88,227,0,27,104,232 Latin 22 0 40 R # R [52 ]A 43 | l 3 59,68,216,255,11,147,0,56,27,173 Latin 35 0 41 l # l [6c ]a 44 | ~ 0 91,229,135,255,73,174,0,41,0,200 Common 42 10 42 ~ # ~ [7e ] 45 | < 0 29,102,173,255,69,184,0,50,90,256 Common 43 10 76 < # < [3c ] 46 | ® 0 28,163,209,255,83,223,0,48,92,257 Common 44 10 44 ® # ® [ae ] 47 | N 5 59,68,216,255,87,262,0,27,104,249 Latin 8 0 45 N # N [4e ]A 48 | b 3 58,64,216,255,87,180,0,25,100,200 Latin 27 0 46 b # b [62 ]a 49 | k 3 57,68,216,255,85,177,0,35,93,198 Latin 101 0 47 k # k [6b ]a 50 | [ 10 8,64,216,255,39,136,0,80,55,173 Common 48 10 70 [ # [ [5b ]p 51 | « 10 26,133,148,235,63,279,0,35,71,281 Common 49 10 28 « # « [ab ]p 52 | 1 8 49,69,192,255,45,128,0,66,74,173 Common 50 2 50 1 # 1 [31 ]0 53 | , 10 14,46,79,115,17,78,0,58,30,173 Common 51 6 51 , # , [2c ]p 54 | . 10 26,67,73,112,13,51,0,67,30,173 Common 52 6 52 . # . 
[2e ]p 55 | ” 10 141,233,216,255,59,141,0,87,66,298 Common 53 10 53 " # ” [201d ]p 56 | g 3 0,43,188,212,88,176,0,32,100,210 Latin 93 0 54 g # g [67 ]a 57 | H 5 59,68,216,255,91,258,0,27,107,244 Latin 32 0 55 H # H [48 ]A 58 | $ 0 24,63,229,255,85,174,0,36,106,174 Common 56 4 56 $ # $ [24 ] 59 | ( 10 0,64,216,255,42,118,0,97,61,173 Common 57 10 94 ( # ( [28 ]p 60 | + 0 54,102,171,253,90,176,0,37,103,213 Common 58 3 58 + # + [2b ] 61 | D 5 59,68,216,255,93,230,0,27,107,236 Latin 30 0 59 D # D [44 ]A 62 | w 3 59,68,187,195,108,235,0,32,117,286 Latin 103 0 60 w # w [77 ]a 63 | V 5 59,68,216,255,103,207,0,41,101,245 Latin 3 0 61 V # V [56 ]A 64 | £ 0 0,135,219,255,64,201,0,55,61,298 Common 62 4 62 £ # £ [a3 ] 65 | 4 8 0,68,198,255,93,161,0,41,96,173 Common 63 2 63 4 # 4 [34 ]0 66 | 9 8 0,66,200,255,89,156,0,39,104,173 Common 64 2 64 9 # 9 [39 ]0 67 | Q 5 7,64,219,255,91,205,0,30,106,227 Latin 96 0 65 Q # Q [51 ]A 68 | & 10 53,64,194,255,108,232,0,47,112,239 Common 66 10 66 & # & [26 ]p 69 | A 5 52,68,216,255,100,216,0,17,98,231 Latin 21 0 67 A # A [41 ]A 70 | P 5 57,68,216,255,87,225,0,32,97,230 Latin 20 0 68 P # P [50 ]A 71 | ¢ 0 14,158,190,255,56,144,0,72,61,270 Common 69 4 69 ¢ # ¢ [a2 ] 72 | ] 10 8,64,216,255,39,129,0,44,55,173 Common 70 10 48 ] # ] [5d ]p 73 | 3 8 0,66,196,255,84,158,0,32,103,173 Common 71 2 71 3 # 3 [33 ]0 74 | 2 8 30,69,194,255,80,160,0,27,97,173 Common 72 2 72 2 # 2 [32 ]0 75 | © 0 28,125,209,255,118,232,0,32,119,257 Common 73 10 73 © # © [a9 ] 76 | 8 8 57,66,219,255,88,162,0,41,103,174 Common 74 2 74 8 # 8 [38 ]0 77 | / 10 0,65,219,255,59,228,0,36,62,238 Common 75 6 75 / # / [2f ]p 78 | > 0 29,102,173,255,78,184,0,50,90,256 Common 76 10 43 > # > [3e ] 79 | X 5 59,68,216,255,94,275,0,25,93,256 Latin 86 0 77 X # X [58 ]A 80 | é 3 0,64,222,255,87,384,0,32,98,391 Latin 78 0 78 é # é [e9 ]a 81 | j 3 0,47,216,255,36,145,0,49,50,173 Latin 5 0 79 j # j [6a ]a 82 | ; 10 14,56,131,221,17,93,0,58,38,173 Common 80 10 80 ; # ; [3b ]p 83 | 7 8 
12,68,196,255,72,160,0,60,75,173 Common 81 2 81 7 # 7 [37 ]0 84 | € 0 32,68,209,255,97,238,0,49,103,293 Common 82 4 82 € # € [20ac ] 85 | O 5 57,64,219,255,91,209,0,34,106,233 Latin 6 0 83 O # O [4f ]A 86 | ¥ 0 59,75,209,255,91,238,0,52,91,270 Common 84 4 84 ¥ # ¥ [a5 ] 87 | U 5 58,64,216,255,91,214,0,39,106,220 Latin 25 0 85 U # U [55 ]A 88 | x 3 59,68,187,201,85,189,0,25,84,218 Latin 77 0 86 x # x [78 ]a 89 | } 10 0,44,216,255,54,148,0,56,59,173 Common 87 10 97 } # } [7d ]p 90 | E 5 59,68,216,255,68,210,0,31,80,219 Latin 4 0 88 E # E [45 ]A 91 | § 0 9,66,219,255,82,207,0,86,93,293 Common 89 10 89 § # § [a7 ] 92 | = 0 74,139,144,199,90,186,0,32,103,224 Common 90 10 90 = # = [3d ] 93 | ! 10 41,67,216,255,11,87,0,71,50,173 Common 91 10 91 ! # ! [21 ]p 94 | ’ 10 141,233,212,255,17,78,0,109,30,298 Common 92 10 92 ' # ’ [2019 ]p 95 | G 5 58,64,219,255,91,230,0,30,106,230 Latin 54 0 93 G # G [47 ]A 96 | ) 10 0,64,216,255,42,119,0,53,61,173 Common 94 10 57 ) # ) [29 ]p 97 | Z 5 64,68,216,255,72,218,0,30,77,236 Latin 12 0 95 Z # Z [5a ]A 98 | q 3 0,47,192,202,88,196,0,30,100,200 Latin 65 0 96 q # q [71 ]a 99 | { 10 0,44,216,255,54,148,0,71,59,173 Common 97 10 87 { # { [7b ]p 100 | “ 10 141,233,216,255,56,133,0,172,66,298 Common 98 10 98 " # “ [201c ]p 101 | — 10 110,155,132,167,126,297,0,23,136,298 Common 99 10 99 - # — [2014 ]p 102 | Y 5 59,68,216,255,91,205,0,47,91,223 Latin 39 0 100 Y # Y [59 ]A 103 | K 5 57,68,216,255,92,225,0,37,103,216 Latin 47 0 101 K # K [4b ]A 104 | * 10 78,183,188,255,49,134,0,60,53,173 Common 102 10 102 * # * [2a ]p 105 | W 5 54,68,216,255,106,314,0,41,117,318 Latin 60 0 103 W # W [57 ]A 106 | " 10 151,225,216,255,52,115,0,71,71,173 Common 104 10 104 " # " [22 ]p 107 | \ 10 0,67,219,255,28,250,0,71,62,261 Common 105 10 105 \ # \ [5c ]p 108 | ° 0 66,247,209,255,22,399,0,98,66,409 Common 106 4 106 ° # ° [b0 ] 109 | fi 3 0,71,216,255,87,202,0,28,105,199 Latin 107 0 107 fi # fi [fb01 ]a 110 | ‘ 10 141,233,210,255,17,64,0,216,30,298 Common 108 10 
108 ' # ‘ [2018 ]p 111 | _ 10 0,50,0,64,73,248,0,29,75,259 Common 109 10 109 _ # _ [5f ]p 112 | fl 3 0,71,216,255,87,219,0,28,105,236 Latin 110 0 110 fl # fl [fb02 ]a 113 | -------------------------------------------------------------------------------- /testdata/eng/eng.traineddata: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tesseract-ocr/test/232ff181c66516116ec0e84c4963f70de15050fd/testdata/eng/eng.traineddata -------------------------------------------------------------------------------- /testdata/eng/eng.unicharset: -------------------------------------------------------------------------------- 1 | 112 2 | NULL 0 NULL 0 3 | Joined 7 0,69,188,255,486,1218,0,30,486,1188 Latin 26 0 98 Joined # Joined [4a 6f 69 6e 65 64 ]a 4 | |Broken|0|1 f 0,69,186,255,892,2138,0,80,892,2058 Common 84 10 84 |Broken|0|1 # Broken 5 | I 5 59,68,216,255,10,155,0,50,29,173 Latin 11 0 3 I # I [49 ]A 6 | n 3 59,68,188,202,87,187,0,25,101,208 Latin 22 0 4 n # n [6e ]a 7 | f 3 0,68,216,255,54,175,0,42,55,193 Latin 46 0 5 f # f [66 ]a 8 | o 3 58,66,188,200,87,151,0,32,98,185 Latin 19 0 6 o # o [6f ]a 9 | r 3 59,68,186,202,58,173,0,40,69,180 Latin 44 0 7 r # r [72 ]a 10 | m 3 56,68,189,202,108,280,0,25,117,306 Latin 34 0 8 m # m [6d ]a 11 | a 3 58,65,186,200,85,164,0,26,97,185 Latin 23 0 9 a # a [61 ]a 12 | t 3 58,66,206,254,57,167,0,47,59,180 Latin 21 0 10 t # t [74 ]a 13 | i 3 59,69,216,255,11,141,0,54,27,173 Latin 3 0 11 i # i [69 ]a 14 | G 5 58,64,219,255,91,230,0,30,106,230 Latin 61 0 12 G # G [47 ]A 15 | u 3 57,65,187,202,85,184,0,39,100,208 Latin 58 0 13 u # u [75 ]a 16 | p 3 0,47,192,226,87,180,0,25,100,200 Latin 20 0 14 p # p [70 ]a 17 | s 3 58,65,192,200,78,147,0,30,91,173 Latin 41 0 15 s # s [73 ]a 18 | b 3 58,64,216,255,87,180,0,25,100,200 Latin 35 0 16 b # b [62 ]a 19 | l 3 59,68,216,255,11,147,0,56,27,173 Latin 24 0 17 l # l [6c ]a 20 | c 3 58,64,192,200,80,153,0,36,88,178 Latin 32 0 18 c # c [63 ]a 21 | 
O 5 57,64,219,255,91,209,0,34,106,233 Latin 6 0 19 O # O [4f ]A 22 | P 5 57,68,216,255,87,225,0,32,97,230 Latin 14 0 20 P # P [50 ]A 23 | T 5 59,68,216,255,85,227,0,47,88,236 Latin 10 0 21 T # T [54 ]A 24 | N 5 59,68,216,255,87,262,0,27,104,249 Latin 4 0 22 N # N [4e ]A 25 | A 5 52,68,216,255,100,216,0,17,98,231 Latin 9 0 23 A # A [41 ]A 26 | L 5 59,68,216,255,64,193,0,31,74,206 Latin 17 0 24 L # L [4c ]A 27 | , 10 14,46,79,115,17,78,0,58,30,173 Common 25 6 25 , # , [2c ]p 28 | j 3 0,47,216,255,36,145,0,49,50,173 Latin 98 0 26 j # j [6a ]a 29 | d 3 57,65,216,255,88,174,0,28,100,200 Latin 55 0 27 d # d [64 ]a 30 | e 3 58,64,189,200,87,154,0,32,98,188 Latin 37 0 28 e # e [65 ]a 31 | z 3 46,68,186,199,65,151,0,32,68,173 Latin 91 0 29 z # z [7a ]a 32 | H 5 59,68,216,255,91,258,0,27,107,244 Latin 50 0 30 H # H [48 ]A 33 | v 3 59,68,187,197,84,173,0,32,84,218 Latin 66 0 31 v # v [76 ]a 34 | C 5 58,65,219,255,87,192,0,32,107,209 Latin 18 0 32 C # C [43 ]A 35 | x 3 59,68,187,201,85,189,0,25,84,218 Latin 38 0 33 x # x [78 ]a 36 | M 5 57,68,216,255,99,301,0,35,117,286 Latin 8 0 34 M # M [4d ]A 37 | B 5 62,68,216,255,91,227,0,27,106,227 Latin 16 0 35 B # B [42 ]A 38 | y 3 0,47,187,202,87,199,0,25,87,230 Latin 42 0 36 y # y [79 ]a 39 | E 5 59,68,216,255,68,210,0,31,80,219 Latin 28 0 37 E # E [45 ]A 40 | X 5 59,68,216,255,94,275,0,25,93,256 Latin 33 0 38 X # X [58 ]A 41 | . 10 26,67,73,112,13,51,0,67,30,173 Common 39 6 39 . # . 
[2e ]p 42 | ¥ 0 59,75,209,255,91,238,0,52,91,270 Common 40 4 40 ¥ # ¥ [a5 ] 43 | S 5 57,64,219,255,87,174,0,30,100,200 Latin 15 0 41 S # S [53 ]A 44 | Y 5 59,68,216,255,91,205,0,47,91,223 Latin 36 0 42 Y # Y [59 ]A 45 | W 5 54,68,216,255,106,314,0,41,117,318 Latin 49 0 43 W # W [57 ]A 46 | R 5 57,68,216,255,88,227,0,27,104,232 Latin 7 0 44 R # R [52 ]A 47 | K 5 57,68,216,255,92,225,0,37,103,216 Latin 47 0 45 K # K [4b ]A 48 | F 5 57,68,216,255,68,210,0,31,77,209 Latin 5 0 46 F # F [46 ]A 49 | k 3 57,68,216,255,85,177,0,35,93,198 Latin 45 0 47 k # k [6b ]a 50 | ) 10 0,64,216,255,42,119,0,53,61,173 Common 48 10 56 ) # ) [29 ]p 51 | w 3 59,68,187,195,108,235,0,32,117,286 Latin 43 0 49 w # w [77 ]a 52 | h 3 59,68,216,255,87,187,0,25,101,208 Latin 30 0 50 h # h [68 ]a 53 | 1 8 49,69,192,255,45,128,0,66,74,173 Common 51 2 51 1 # 1 [31 ]0 54 | 6 8 58,66,219,255,87,156,0,54,104,173 Common 52 2 52 6 # 6 [36 ]0 55 | 7 8 12,68,196,255,72,160,0,60,75,173 Common 53 2 53 7 # 7 [37 ]0 56 | 9 8 0,66,200,255,89,156,0,39,104,173 Common 54 2 54 9 # 9 [39 ]0 57 | D 5 59,68,216,255,93,230,0,27,107,236 Latin 27 0 55 D # D [44 ]A 58 | ( 10 0,64,216,255,42,118,0,97,61,173 Common 56 10 48 ( # ( [28 ]p 59 | 0 8 58,66,187,255,88,164,0,45,103,180 Common 57 2 57 0 # 0 [30 ]0 60 | U 5 58,64,216,255,91,214,0,39,106,220 Latin 13 0 58 U # U [55 ]A 61 | \ 10 0,67,219,255,28,250,0,71,62,261 Common 59 10 59 \ # \ [5c ]p 62 | : 10 58,85,141,221,11,69,0,67,38,173 Common 60 6 60 : # : [3a ]p 63 | g 3 0,43,188,212,88,176,0,32,100,210 Latin 12 0 61 g # g [67 ]a 64 | 3 8 0,66,196,255,84,158,0,32,103,173 Common 62 2 62 3 # 3 [33 ]0 65 | § 0 9,66,219,255,82,207,0,86,93,293 Common 63 10 63 § # § [a7 ] 66 | 4 8 0,68,198,255,93,161,0,41,96,173 Common 64 2 64 4 # 4 [34 ]0 67 | ¢ 0 14,158,190,255,56,144,0,72,61,270 Common 65 4 65 ¢ # ¢ [a2 ] 68 | V 5 59,68,216,255,103,207,0,41,101,245 Latin 31 0 66 V # V [56 ]A 69 | é 3 0,64,222,255,87,384,0,32,98,391 Latin 67 0 67 é # é [e9 ]a 70 | q 3 
0,47,192,202,88,196,0,30,100,200 Latin 69 0 68 q # q [71 ]a 71 | Q 5 7,64,219,255,91,205,0,30,106,227 Latin 68 0 69 Q # Q [51 ]A 72 | ' 10 148,225,216,255,11,51,0,97,36,173 Common 70 10 70 ' # ' [27 ]p 73 | - 10 105,161,122,175,49,176,0,43,56,215 Common 71 3 71 - # - [2d ]p 74 | ’ 10 141,233,212,255,17,78,0,109,30,298 Common 72 10 72 ' # ’ [2019 ]p 75 | * 10 78,183,188,255,49,134,0,60,53,173 Common 73 10 73 * # * [2a ]p 76 | ® 0 28,163,209,255,83,223,0,48,92,257 Common 74 10 74 ® # ® [ae ] 77 | ! 10 41,67,216,255,11,87,0,71,50,173 Common 75 10 75 ! # ! [21 ]p 78 | _ 10 0,50,0,64,73,248,0,29,75,259 Common 76 10 76 _ # _ [5f ]p 79 | < 0 29,102,173,255,69,184,0,50,90,256 Common 77 10 78 < # < [3c ] 80 | > 0 29,102,173,255,78,184,0,50,90,256 Common 78 10 77 > # > [3e ] 81 | © 0 28,125,209,255,118,232,0,32,119,257 Common 79 10 79 © # © [a9 ] 82 | 2 8 30,69,194,255,80,160,0,27,97,173 Common 80 2 80 2 # 2 [32 ]0 83 | } 10 0,44,216,255,54,148,0,56,59,173 Common 81 10 109 } # } [7d ]p 84 | « 10 26,133,148,235,63,279,0,35,71,281 Common 82 10 86 « # « [ab ]p 85 | ” 10 141,233,216,255,59,141,0,87,66,298 Common 83 10 83 " # ” [201d ]p 86 | | 0 0,67,216,255,8,73,0,80,50,173 Common 84 10 84 | # | [7c ] 87 | # 10 37,84,200,255,99,221,0,41,109,266 Common 85 4 85 # # # [23 ]p 88 | » 10 0,133,146,235,63,284,0,32,71,294 Common 86 10 82 » # » [bb ]p 89 | ? 10 40,67,219,255,59,144,0,65,77,188 Common 87 10 87 ? # ? 
[3f ]p 90 | £ 0 0,135,219,255,64,201,0,55,61,298 Common 88 4 88 £ # £ [a3 ] 91 | ] 10 8,64,216,255,39,129,0,44,55,173 Common 89 10 96 ] # ] [5d ]p 92 | & 10 53,64,194,255,108,232,0,47,112,239 Common 90 10 90 & # & [26 ]p 93 | Z 5 64,68,216,255,72,218,0,30,77,236 Latin 29 0 91 Z # Z [5a ]A 94 | — 10 110,155,132,167,126,297,0,23,136,298 Common 92 10 92 - # — [2014 ]p 95 | $ 0 24,63,229,255,85,174,0,36,106,174 Common 93 4 93 $ # $ [24 ] 96 | 8 8 57,66,219,255,88,162,0,41,103,174 Common 94 2 94 8 # 8 [38 ]0 97 | 5 8 12,66,199,255,82,160,0,36,103,173 Common 95 2 95 5 # 5 [35 ]0 98 | [ 10 8,64,216,255,39,136,0,80,55,173 Common 96 10 89 [ # [ [5b ]p 99 | ~ 0 91,229,135,255,73,174,0,41,0,200 Common 97 10 97 ~ # ~ [7e ] 100 | J 5 0,64,216,255,39,242,0,30,62,234 Latin 26 0 98 J # J [4a ]A 101 | % 10 27,67,205,255,105,257,0,49,117,288 Common 99 4 99 % # % [25 ]p 102 | " 10 151,225,216,255,52,115,0,71,71,173 Common 100 10 100 " # " [22 ]p 103 | @ 10 0,65,211,255,99,286,0,39,117,291 Common 101 10 101 @ # @ [40 ]p 104 | ° 0 66,247,209,255,22,399,0,98,66,409 Common 102 4 102 ° # ° [b0 ] 105 | ‘ 10 141,233,210,255,17,64,0,216,30,298 Common 103 10 103 ' # ‘ [2018 ]p 106 | = 0 74,139,144,199,90,186,0,32,103,224 Common 104 10 104 = # = [3d ] 107 | “ 10 141,233,216,255,56,133,0,172,66,298 Common 105 10 105 " # “ [201c ]p 108 | / 10 0,65,219,255,59,228,0,36,62,238 Common 106 6 106 / # / [2f ]p 109 | ; 10 14,56,131,221,17,93,0,58,38,173 Common 107 10 107 ; # ; [3b ]p 110 | + 0 54,102,171,253,90,176,0,37,103,213 Common 108 3 108 + # + [2b ] 111 | { 10 0,44,216,255,54,148,0,71,59,173 Common 109 10 81 { # { [7b ]p 112 | € 0 32,68,209,255,97,238,0,49,103,293 Common 110 4 110 € # € [20ac ] 113 | ™ 0 63,201,209,255,101,273,0,59,104,293 Common 111 10 111 TM # ™ [2122 ] 114 | -------------------------------------------------------------------------------- /testdata/fra.Arial_Unicode_MS.exp0.lstmf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/tesseract-ocr/test/232ff181c66516116ec0e84c4963f70de15050fd/testdata/fra.Arial_Unicode_MS.exp0.lstmf -------------------------------------------------------------------------------- /testdata/fra/fra.traineddata: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tesseract-ocr/test/232ff181c66516116ec0e84c4963f70de15050fd/testdata/fra/fra.traineddata -------------------------------------------------------------------------------- /testdata/fra/fra.unicharset: -------------------------------------------------------------------------------- 1 | 140 2 | NULL 0 NULL 0 3 | Joined 7 0,69,188,255,486,1218,0,30,486,1188 Latin 120 0 106 Joined # Joined [4a 6f 69 6e 65 64 ]a 4 | |Broken|0|1 f 0,69,186,255,892,2138,0,80,892,2058 Common 109 10 109 |Broken|0|1 # Broken 5 | T 5 59,68,216,255,85,227,0,47,88,236 Latin 23 0 3 T # T [54 ]A 6 | o 3 58,66,188,200,87,151,0,32,98,185 Latin 14 0 4 o # o [6f ]a 7 | u 3 57,65,187,202,85,184,0,39,100,208 Latin 41 0 5 u # u [75 ]a 8 | s 3 58,65,192,200,78,147,0,30,91,173 Latin 15 0 6 s # s [73 ]a 9 | L 5 59,68,216,255,64,193,0,31,74,206 Latin 20 0 7 L # L [4c ]A 10 | a 3 58,65,186,200,85,164,0,26,97,185 Latin 33 0 8 a # a [61 ]a 11 | b 3 58,64,216,255,87,180,0,25,100,200 Latin 57 0 9 b # b [62 ]a 12 | è 3 0,64,222,255,87,279,0,32,98,289 Latin 16 0 10 è # è [e8 ]a 13 | g 3 0,43,188,212,88,176,0,32,100,210 Latin 59 0 11 g # g [67 ]a 14 | e 3 58,64,189,200,87,154,0,32,98,188 Latin 18 0 12 e # e [65 ]a 15 | P 5 57,68,216,255,87,225,0,32,97,230 Latin 52 0 13 P # P [50 ]A 16 | O 5 57,64,219,255,91,209,0,34,106,233 Latin 4 0 14 O # O [4f ]A 17 | S 5 57,64,219,255,87,174,0,30,100,200 Latin 6 0 15 S # S [53 ]A 18 | È 5 0,68,232,255,68,220,0,31,80,225 Latin 10 0 16 È # È [c8 ]A 19 | D 5 59,68,216,255,93,230,0,27,107,236 Latin 32 0 17 D # D [44 ]A 20 | E 5 59,68,216,255,68,210,0,31,80,219 Latin 12 0 18 E # E [45 ]A 21 | r 3 59,68,186,202,58,173,0,40,69,180 
Latin 45 0 19 r # r [72 ]a 22 | l 3 59,68,216,255,11,147,0,56,27,173 Latin 7 0 20 l # l [6c ]a 23 | m 3 56,68,189,202,108,280,0,25,117,306 Latin 46 0 21 m # m [6d ]a 24 | n 3 59,68,188,202,87,187,0,25,101,208 Latin 39 0 22 n # n [6e ]a 25 | t 3 58,66,206,254,57,167,0,47,59,180 Latin 3 0 23 t # t [74 ]a 26 | * 10 78,183,188,255,49,134,0,60,53,173 Common 24 10 24 * # * [2a ]p 27 | c 3 58,64,192,200,80,153,0,36,88,178 Latin 37 0 25 c # c [63 ]a 28 | h 3 59,68,216,255,87,187,0,25,101,208 Latin 44 0 26 h # h [68 ]a 29 | ô 3 58,66,222,255,87,192,0,32,98,202 Latin 85 0 27 ô # ô [f4 ]a 30 | , 10 14,46,79,115,17,78,0,58,30,173 Common 28 6 28 , # , [2c ]p 31 | € 0 32,68,209,255,97,238,0,49,103,293 Common 29 4 29 € # € [20ac ] 32 | i 3 59,69,216,255,11,141,0,54,27,173 Latin 38 0 30 i # i [69 ]a 33 | v 3 59,68,187,197,84,173,0,32,84,218 Latin 53 0 31 v # v [76 ]a 34 | d 3 57,65,216,255,88,174,0,28,100,200 Latin 17 0 32 d # d [64 ]a 35 | A 5 52,68,216,255,100,216,0,17,98,231 Latin 8 0 33 A # A [41 ]A 36 | Û 5 0,64,232,255,91,350,0,39,106,360 Latin 123 0 34 Û # Û [db ]A 37 | Q 5 7,64,219,255,91,205,0,30,106,227 Latin 55 0 35 Q # Q [51 ]A 38 | É 5 59,68,232,255,68,314,0,31,80,325 Latin 50 0 36 É # É [c9 ]A 39 | C 5 58,65,219,255,87,192,0,32,107,209 Latin 25 0 37 C # C [43 ]A 40 | I 5 59,68,216,255,10,155,0,50,29,173 Latin 30 0 38 I # I [49 ]A 41 | N 5 59,68,216,255,87,262,0,27,104,249 Latin 22 0 39 N # N [4e ]A 42 | F 5 57,68,216,255,68,210,0,31,77,209 Latin 43 0 40 F # F [46 ]A 43 | U 5 58,64,216,255,91,214,0,39,106,220 Latin 5 0 41 U # U [55 ]A 44 | - 10 105,161,122,175,49,176,0,43,56,215 Common 42 3 42 - # - [2d ]p 45 | f 3 0,68,216,255,54,175,0,42,55,193 Latin 40 0 43 f # f [66 ]a 46 | H 5 59,68,216,255,91,258,0,27,107,244 Latin 26 0 44 H # H [48 ]A 47 | R 5 57,68,216,255,88,227,0,27,104,232 Latin 19 0 45 R # R [52 ]A 48 | M 5 57,68,216,255,99,301,0,35,117,286 Latin 21 0 46 M # M [4d ]A 49 | Æ 5 0,68,209,255,106,324,0,32,117,325 Latin 113 0 47 Æ # Æ [c6 ]A 50 | X 5 
59,68,216,255,94,275,0,25,93,256 Latin 49 0 48 X # X [58 ]A 51 | x 3 59,68,187,201,85,189,0,25,84,218 Latin 48 0 49 x # x [78 ]a 52 | é 3 0,64,222,255,87,384,0,32,98,391 Latin 36 0 50 é # é [e9 ]a 53 | ' 10 148,225,216,255,11,51,0,97,36,173 Common 51 10 51 ' # ' [27 ]p 54 | p 3 0,47,192,226,87,180,0,25,100,200 Latin 13 0 52 p # p [70 ]a 55 | V 5 59,68,216,255,103,207,0,41,101,245 Latin 31 0 53 V # V [56 ]A 56 | ç 3 0,31,182,232,80,299,0,36,96,309 Latin 58 0 54 ç # ç [e7 ]a 57 | q 3 0,47,192,202,88,196,0,30,100,200 Latin 35 0 55 q # q [71 ]a 58 | . 10 26,67,73,112,13,51,0,67,30,173 Common 56 6 56 . # . [2e ]p 59 | B 5 62,68,216,255,91,227,0,27,106,227 Latin 9 0 57 B # B [42 ]A 60 | Ç 5 0,64,217,255,87,286,0,32,107,296 Latin 54 0 58 Ç # Ç [c7 ]A 61 | G 5 58,64,219,255,91,230,0,30,106,230 Latin 11 0 59 G # G [47 ]A 62 | ® 0 28,163,209,255,83,223,0,48,92,257 Common 60 10 60 ® # ® [ae ] 63 | y 3 0,47,187,202,87,199,0,25,87,230 Latin 76 0 61 y # y [79 ]a 64 | À 5 64,68,232,255,100,209,0,27,98,245 Latin 112 0 62 À # À [c0 ]A 65 | : 10 58,85,141,221,11,69,0,67,38,173 Common 63 6 63 : # : [3a ]p 66 | Ë 5 59,68,232,255,68,253,0,31,80,263 Latin 99 0 64 Ë # Ë [cb ]A 67 | ( 10 0,64,216,255,42,118,0,97,61,173 Common 65 10 97 ( # ( [28 ]p 68 | Ê 5 0,68,232,255,72,266,0,31,80,276 Latin 124 0 66 Ê # Ê [ca ]A 69 | ° 0 66,247,209,255,22,399,0,98,66,409 Common 67 4 67 ° # ° [b0 ] 70 | 1 8 49,69,192,255,45,128,0,66,74,173 Common 68 2 68 1 # 1 [31 ]0 71 | ù 3 57,66,222,255,85,253,0,39,100,263 Latin 108 0 69 ù # ù [f9 ]a 72 | î 3 62,69,222,255,58,189,0,42,47,199 Latin 115 0 70 î # î [ee ]a 73 | } 10 0,44,216,255,54,148,0,56,59,173 Common 71 10 104 } # } [7d ]p 74 | Z 5 64,68,216,255,72,218,0,30,77,236 Latin 107 0 72 Z # Z [5a ]A 75 |  5 64,68,232,255,100,248,0,25,98,258 Latin 126 0 73  #  [c2 ]A 76 | k 3 57,68,216,255,85,177,0,35,93,198 Latin 83 0 74 k # k [6b ]a 77 | ] 10 8,64,216,255,39,129,0,44,55,173 Common 75 10 88 ] # ] [5d ]p 78 | Y 5 59,68,216,255,91,205,0,47,91,223 Latin 61 0 76 
Y # Y [59 ]A 79 | 3 8 0,66,196,255,84,158,0,32,103,173 Common 77 2 77 3 # 3 [33 ]0 80 | 0 8 58,66,187,255,88,164,0,45,103,180 Common 78 2 78 0 # 0 [30 ]0 81 | / 10 0,65,219,255,59,228,0,36,62,238 Common 79 6 79 / # / [2f ]p 82 | 8 8 57,66,219,255,88,162,0,41,103,174 Common 80 2 80 8 # 8 [38 ]0 83 | 2 8 30,69,194,255,80,160,0,27,97,173 Common 81 2 81 2 # 2 [32 ]0 84 | 6 8 58,66,219,255,87,156,0,54,104,173 Common 82 2 82 6 # 6 [36 ]0 85 | K 5 57,68,216,255,92,225,0,37,103,216 Latin 74 0 83 K # K [4b ]A 86 | \ 10 0,67,219,255,28,250,0,71,62,261 Common 84 10 84 \ # \ [5c ]p 87 | Ô 5 0,64,232,255,91,250,0,34,106,258 Latin 27 0 85 Ô # Ô [d4 ]A 88 | ; 10 14,56,131,221,17,93,0,58,38,173 Common 86 10 86 ; # ; [3b ]p 89 | ‘ 10 141,233,210,255,17,64,0,216,30,298 Common 87 10 87 ' # ‘ [2018 ]p 90 | [ 10 8,64,216,255,39,136,0,80,55,173 Common 88 10 75 [ # [ [5b ]p 91 | 5 8 12,66,199,255,82,160,0,36,103,173 Common 89 2 89 5 # 5 [35 ]0 92 | W 5 54,68,216,255,106,314,0,41,117,318 Latin 100 0 90 W # W [57 ]A 93 | & 10 53,64,194,255,108,232,0,47,112,239 Common 91 10 91 & # & [26 ]p 94 | ’ 10 141,233,212,255,17,78,0,109,30,298 Common 92 10 92 ' # ’ [2019 ]p 95 | œ 3 53,69,174,209,104,237,0,42,117,293 Latin 133 0 93 œ # œ [153 ]a 96 | 9 8 0,66,200,255,89,156,0,39,104,173 Common 94 2 94 9 # 9 [39 ]0 97 | 7 8 12,68,196,255,72,160,0,60,75,173 Common 95 2 95 7 # 7 [37 ]0 98 | 4 8 0,68,198,255,93,161,0,41,96,173 Common 96 2 96 4 # 4 [34 ]0 99 | ) 10 0,64,216,255,42,119,0,53,61,173 Common 97 10 65 ) # ) [29 ]p 100 | ñ 3 61,68,214,255,87,407,0,25,101,414 Latin 98 0 98 ñ # ñ [f1 ]a 101 | ë 3 0,64,219,255,87,468,0,32,98,481 Latin 64 0 99 ë # ë [eb ]a 102 | w 3 59,68,187,195,108,235,0,32,117,286 Latin 90 0 100 w # w [77 ]a 103 | = 0 74,139,144,199,90,186,0,32,103,224 Common 101 10 101 = # = [3d ] 104 | " 10 151,225,216,255,52,115,0,71,71,173 Common 102 10 102 " # " [22 ]p 105 | # 10 37,84,200,255,99,221,0,41,109,266 Common 103 4 103 # # # [23 ]p 106 | { 10 0,44,216,255,54,148,0,71,59,173 Common 
104 10 71 { # { [7b ]p 107 | « 10 26,133,148,235,63,279,0,35,71,281 Common 105 10 125 « # « [ab ]p 108 | J 5 0,64,216,255,39,242,0,30,62,234 Latin 120 0 106 J # J [4a ]A 109 | z 3 46,68,186,199,65,151,0,32,68,173 Latin 72 0 107 z # z [7a ]a 110 | Ù 5 0,64,232,255,91,350,0,39,106,358 Latin 69 0 108 Ù # Ù [d9 ]A 111 | | 0 0,67,216,255,8,73,0,80,50,173 Common 109 10 109 | # | [7c ] 112 | — 10 110,155,132,167,126,297,0,23,136,298 Common 110 10 110 - # — [2014 ]p 113 | + 0 54,102,171,253,90,176,0,37,103,213 Common 111 3 111 + # + [2b ] 114 | à 3 0,64,222,255,85,407,0,26,97,407 Latin 62 0 112 à # à [e0 ]a 115 | æ 3 0,66,174,232,102,419,0,43,117,427 Latin 47 0 113 æ # æ [e6 ]a 116 | % 10 27,67,205,255,105,257,0,49,117,288 Common 114 4 114 % # % [25 ]p 117 | Î 5 3,68,247,255,63,263,0,36,55,271 Latin 70 0 115 Î # Î [ce ]A 118 | _ 10 0,50,0,64,73,248,0,29,75,259 Common 116 10 116 _ # _ [5f ]p 119 | ” 10 141,233,216,255,59,141,0,87,66,298 Common 117 10 117 " # ” [201d ]p 120 | ÿ 3 0,47,219,254,87,199,0,22,87,230 Latin 137 0 118 ÿ # ÿ [ff ]a 121 | © 0 28,125,209,255,118,232,0,32,119,257 Common 119 10 119 © # © [a9 ] 122 | j 3 0,47,216,255,36,145,0,49,50,173 Latin 106 0 120 j # j [6a ]a 123 | ? 10 40,67,219,255,59,144,0,65,77,188 Common 121 10 121 ? # ? [3f ]p 124 | Ï 5 0,68,232,255,42,437,0,36,55,445 Latin 128 0 122 Ï # Ï [cf ]A 125 | û 3 57,65,222,255,87,286,0,39,100,296 Latin 34 0 123 û # û [fb ]a 126 | ê 3 0,64,222,255,87,409,0,32,98,407 Latin 66 0 124 ê # ê [ea ]a 127 | » 10 0,133,146,235,63,284,0,32,71,294 Common 125 10 105 » # » [bb ]p 128 | â 3 0,64,222,255,85,256,0,26,97,256 Latin 73 0 126 â # â [e2 ]a 129 | Ü 5 58,64,232,255,91,291,0,39,106,299 Latin 131 0 127 Ü # Ü [dc ]A 130 | ï 3 0,69,217,255,42,284,0,42,47,291 Latin 122 0 128 ï # ï [ef ]a 131 | “ 10 141,233,216,255,56,133,0,172,66,298 Common 129 10 129 " # “ [201c ]p 132 | ! 10 41,67,216,255,11,87,0,71,50,173 Common 130 10 130 ! # ! 
[21 ]p 133 | ü 3 0,65,219,255,85,220,0,39,100,225 Latin 127 0 131 ü # ü [fc ]a 134 | $ 0 24,63,229,255,85,174,0,36,106,174 Common 132 4 132 $ # $ [24 ] 135 | Œ 5 51,68,209,255,104,304,0,32,117,320 Latin 93 0 133 Œ # Œ [152 ]A 136 | ä 3 0,64,219,255,85,294,0,26,97,307 Latin 134 0 134 ä # ä [e4 ]a 137 | @ 10 0,65,211,255,99,286,0,39,117,291 Common 135 10 135 @ # @ [40 ]p 138 | < 0 29,102,173,255,69,184,0,50,90,256 Common 136 10 138 < # < [3c ] 139 | Ÿ 5 61,68,245,255,91,201,0,47,91,192 Latin 118 0 137 Ÿ # Ÿ [178 ]A 140 | > 0 29,102,173,255,78,184,0,50,90,256 Common 138 10 136 > # > [3e ] 141 | £ 0 0,135,219,255,64,201,0,55,61,298 Common 139 4 139 £ # £ [a3 ] 142 | -------------------------------------------------------------------------------- /testdata/kan.Arial_Unicode_MS.exp0.lstmf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tesseract-ocr/test/232ff181c66516116ec0e84c4963f70de15050fd/testdata/kan.Arial_Unicode_MS.exp0.lstmf -------------------------------------------------------------------------------- /testdata/kan/kan.traineddata: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tesseract-ocr/test/232ff181c66516116ec0e84c4963f70de15050fd/testdata/kan/kan.traineddata -------------------------------------------------------------------------------- /testdata/kan/kan.unicharset: -------------------------------------------------------------------------------- 1 | 137 2 | NULL 0 NULL 0 3 | Joined 7 0,69,188,255,486,1218,0,30,486,1188 Latin 1 0 1 Joined # Joined [4a 6f 69 6e 65 64 ]a 4 | |Broken|0|1 f 0,69,186,255,892,2138,0,80,892,2058 Common 98 10 98 |Broken|0|1 # Broken 5 | ಬ 1 64,64,188,192,132,151,0,6,144,161 Kannada 3 0 3 ಬ # ಬ [cac ]x 6 | ಳ 1 63,64,208,225,114,145,0,13,132,136 Kannada 4 0 4 ಳ # ಳ [cb3 ]x 7 | ಸ 1 51,64,211,225,109,153,0,13,133,160 Kannada 5 0 5 ಸ # ಸ [cb8 ]x 8 | ಿ 0 44,50,255,255,210,218,0,0,1,90 Kannada 6 0 6 
ಿ # ಿ [cbf ] 9 | ಕ 1 63,64,210,225,90,151,0,11,106,134 Kannada 7 0 7 ಕ # ಕ [c95 ]x 10 | ೊ 0 44,50,255,255,370,438,0,0,154,206 Kannada 8 0 8 ೊ # ೊ [cca ] 11 | ್ಳ 1 44,64,208,255,222,260,0,0,222,260 Kannada 9 17 9 ್ಳ # ್ಳ [ccd cb3 ]x 12 | ಲ 1 63,65,188,188,138,146,0,11,150,165 Kannada 10 0 10 ಲ # ಲ [cb2 ]x 13 | ಾ 0 44,50,255,255,304,358,0,13,87,141 Kannada 11 0 11 ಾ # ಾ [cbe ] 14 | ಗ 1 58,67,206,225,108,139,0,10,119,133 Kannada 12 0 12 ಗ # ಗ [c97 ]x 15 | ದ 1 63,64,208,225,137,166,0,11,150,163 Kannada 13 0 13 ದ # ದ [ca6 ]x 16 | £ 0 0,135,219,255,64,201,0,55,61,298 Common 14 4 14 £ # £ [a3 ] 17 | ೧ 8 64,65,185,192,108,126,0,6,109,136 Kannada 15 0 15 ೧ # ೧ [ce7 ]0 18 | ೦ 8 63,65,186,188,98,116,0,11,100,136 Kannada 16 0 16 ೦ # ೦ [ce6 ]0 19 | ವ 1 64,64,208,225,136,165,0,11,158,162 Kannada 17 0 17 ವ # ವ [cb5 ]x 20 | ರ 1 64,65,208,225,103,133,0,11,116,131 Kannada 18 0 18 ರ # ರ [cb0 ]x 21 | ೋ 0 44,50,255,255,464,556,0,0,240,312 Kannada 19 0 19 ೋ # ೋ [ccb ] 22 | ಧ 1 19,27,208,225,137,166,0,11,150,163 Kannada 20 0 20 ಧ # ಧ [ca7 ]x 23 | ೇ 0 44,50,255,255,308,382,0,0,90,148 Kannada 21 0 21 ೇ # ೇ [cc7 ] 24 | ೆ 0 44,50,255,255,218,273,0,0,1,49 Kannada 22 0 22 ೆ # ೆ [cc6 ] 25 | ? 10 61,64,216,225,77,109,0,53,101,136 Common 23 10 23 ? # ? 
[3f ]p 26 | ಶ 1 63,64,210,225,112,141,0,11,134,146 Kannada 24 0 24 ಶ # ಶ [cb6 ]x 27 | ಮ 1 64,64,208,225,212,224,0,11,232,240 Kannada 25 0 25 ಮ # ಮ [cae ]x 28 | ್‌ 0 0,255,0,255,0,0,0,0,0,0 Kannada 26 17 26 ್‌ # ್‌ [ccd 200c ] 29 | ” 10 141,233,216,255,59,141,0,87,66,298 Common 27 10 27 " # ” [201d ]p 30 | ಪ 1 63,64,210,225,138,165,0,11,154,167 Kannada 28 0 28 ಪ # ಪ [caa ]x 31 | ್ರ 1 44,65,208,255,206,255,0,0,206,255 Kannada 29 17 29 ್ರ # ್ರ [ccd cb0 ]x 32 | ್ಪ 1 44,64,210,255,244,291,0,0,244,291 Kannada 30 17 30 ್ಪ # ್ಪ [ccd caa ]x 33 | ್ಧ 1 19,50,208,255,240,287,0,0,240,287 Kannada 31 17 31 ್ಧ # ್ಧ [ccd ca7 ]x 34 | ್ಫ 1 19,50,210,255,244,291,0,0,244,291 Kannada 32 17 32 ್ಫ # ್ಫ [ccd cab ]x 35 | 1 8 58,67,210,241,62,142,0,39,109,150 Common 33 2 33 1 # 1 [31 ]0 36 | 3 8 58,65,210,241,84,187,0,25,109,150 Common 34 2 34 3 # 3 [33 ]0 37 | ಇ 1 39,67,180,192,139,145,0,16,146,166 Kannada 35 0 35 ಇ # ಇ [c87 ]x 38 | ು 0 44,50,255,255,278,294,0,0,69,82 Kannada 36 0 36 ು # ು [cc1 ] 39 | ಖ 1 63,64,200,210,158,179,0,13,181,214 Kannada 37 0 37 ಖ # ಖ [c96 ]x 40 | ಷ 1 35,56,210,225,144,165,0,11,157,167 Kannada 38 0 38 ಷ # ಷ [cb7 ]x 41 | ೀ 0 44,50,255,255,309,400,0,0,85,103 Kannada 39 0 39 ೀ # ೀ [cc0 ] 42 | ಜ 1 64,64,192,205,132,154,0,6,147,163 Kannada 40 0 40 ಜ # ಜ [c9c ]x 43 | ನ 1 56,61,210,225,116,144,0,10,131,141 Kannada 41 0 41 ನ # ನ [ca8 ]x 44 | ತ 1 63,64,208,225,108,138,0,13,119,134 Kannada 42 0 42 ತ # ತ [ca4 ]x 45 | ್ನ 1 44,61,210,255,221,265,0,0,221,265 Kannada 43 17 43 ್ನ # ್ನ [ccd ca8 ]x 46 | ಡ 1 63,64,208,225,136,163,0,11,147,165 Kannada 44 0 44 ಡ # ಡ [ca1 ]x 47 | ಒ 1 63,64,186,192,128,146,0,6,146,166 Kannada 45 0 45 ಒ # ಒ [c92 ]x 48 | ್ತ 1 44,64,208,255,209,258,0,0,209,258 Kannada 46 17 46 ್ತ # ್ತ [ccd ca4 ]x 49 | . 10 64,67,96,108,32,47,0,23,55,69 Common 47 6 47 . # . 
[2e ]p 50 | ್ಯ 1 44,64,210,255,352,398,0,0,352,398 Kannada 48 17 48 ್ಯ # ್ಯ [ccd caf ]x 51 | ್ಮ 1 44,64,208,255,322,364,0,0,322,364 Kannada 49 17 49 ್ಮ # ್ಮ [ccd cae ]x 52 | ಎ 1 64,64,192,192,134,149,0,11,147,166 Kannada 50 0 50 ಎ # ಎ [c8e ]x 53 | ಹ 1 63,64,210,225,147,176,0,10,160,174 Kannada 51 0 51 ಹ # ಹ [cb9 ]x 54 | ಅ 1 64,65,192,192,141,158,0,11,164,175 Kannada 52 0 52 ಅ # ಅ [c85 ]x 55 | ಥ 1 19,27,208,225,137,166,0,11,150,163 Kannada 53 0 53 ಥ # ಥ [ca5 ]x 56 | ್ಲ 1 44,65,188,255,240,289,0,0,240,289 Kannada 54 17 54 ್ಲ # ್ಲ [ccd cb2 ]x 57 | ಂ 0 44,50,255,255,321,336,0,11,108,116 Kannada 55 0 55 ಂ # ಂ [c82 ] 58 | ೃ 0 0,0,255,255,257,302,0,0,38,80 Kannada 56 0 56 ೃ # ೃ [cc3 ] 59 | ್ಟ 1 44,64,196,255,245,296,0,0,245,296 Kannada 57 17 57 ್ಟ # ್ಟ [ccd c9f ]x 60 | ಆ 1 63,64,188,192,143,165,0,11,160,175 Kannada 58 0 58 ಆ # ಆ [c86 ]x 61 | ಫ 1 19,25,210,225,138,165,0,11,154,167 Kannada 59 0 59 ಫ # ಫ [cab ]x 62 | ಭ 1 19,27,210,225,132,171,0,6,144,163 Kannada 60 0 60 ಭ # ಭ [cad ]x 63 | ್ಕ 1 44,64,210,255,196,258,0,0,196,258 Kannada 61 17 61 ್ಕ # ್ಕ [ccd c95 ]x 64 | , 10 29,38,96,106,33,54,0,28,65,69 Common 62 6 62 , # , [2c ]p 65 | ಟ 1 64,64,196,200,139,155,0,11,155,172 Kannada 63 0 63 ಟ # ಟ [c9f ]x 66 | ಘ 1 19,29,206,225,163,206,0,8,173,203 Kannada 64 0 64 ಘ # ಘ [c98 ]x 67 | ೫ 8 63,64,188,198,117,153,0,8,134,148 Kannada 65 0 65 ೫ # ೫ [ceb ]0 68 | ೪ 8 64,65,188,195,113,129,0,13,114,136 Kannada 66 0 66 ೪ # ೪ [cea ]0 69 | ' 10 148,193,215,255,33,59,0,28,54,69 Common 67 10 67 ' # ' [27 ]p 70 | ಈ 1 61,64,215,225,157,205,0,10,170,215 Kannada 68 0 68 ಈ # ಈ [c88 ]x 71 | ಣ 1 63,64,185,188,141,166,0,11,152,175 Kannada 69 0 69 ಣ # ಣ [ca3 ]x 72 | ್ವ 1 44,64,208,255,248,286,0,0,248,286 Kannada 70 17 70 ್ವ # ್ವ [ccd cb5 ]x 73 | ಯ 1 63,64,210,225,240,256,0,16,262,274 Kannada 71 0 71 ಯ # ಯ [caf ]x 74 | ಚ 1 64,64,205,225,148,171,0,6,160,170 Kannada 72 0 72 ಚ # ಚ [c9a ]x 75 | ್ಚ 1 44,64,205,255,250,294,0,0,250,294 Kannada 73 17 73 ್ಚ # ್ಚ [ccd c9a ]x 76 | “ 10 
141,233,216,255,56,133,0,172,66,298 Common 74 10 74 " # “ [201c ]p 77 | ! 10 64,67,227,241,32,73,0,40,59,79 Common 75 10 75 ! # ! [21 ]p 78 | ೈ 0 0,0,255,255,257,334,0,0,35,83 Kannada 76 0 76 ೈ # ೈ [cc8 ] 79 | ್ಸ 1 44,64,211,255,223,284,0,0,223,284 Kannada 77 17 77 ್ಸ # ್ಸ [ccd cb8 ]x 80 | ್ಬ 1 44,64,188,255,234,285,0,0,234,285 Kannada 78 17 78 ್ಬ # ್ಬ [ccd cac ]x 81 | ೩ 8 64,65,186,188,98,114,0,11,113,136 Kannada 79 0 79 ೩ # ೩ [ce9 ]0 82 | ಛ 1 19,25,208,225,132,165,0,11,146,160 Kannada 80 0 80 ಛ # ಛ [c9b ]x 83 | ್ಞ 1 37,65,181,255,291,349,0,0,291,349 Kannada 81 17 81 ್ಞ # ್ಞ [ccd c9e ]x 84 | ಐ 1 63,64,186,192,138,160,0,10,160,179 Kannada 82 0 82 ಐ # ಐ [c90 ]x 85 | ್ಷ 1 35,56,210,255,247,291,0,0,247,291 Kannada 83 17 83 ್ಷ # ್ಷ [ccd cb7 ]x 86 | ್ಠ 1 44,65,208,255,206,255,0,0,206,255 Kannada 84 17 84 ್ಠ # ್ಠ [ccd ca0 ]x 87 | ೂ 0 44,50,255,255,368,389,0,0,154,174 Kannada 85 0 85 ೂ # ೂ [cc2 ] 88 | ಔ 1 63,64,229,234,136,166,0,6,147,166 Kannada 86 0 86 ಔ # ಔ [c94 ]x 89 | 4 8 59,67,209,241,101,187,0,15,109,150 Common 87 2 87 4 # 4 [34 ]0 90 | ್ಗ 1 44,67,206,255,209,257,0,0,209,257 Kannada 88 17 88 ್ಗ # ್ಗ [ccd c97 ]x 91 | ಃ 0 44,50,255,255,277,298,0,13,65,69 Kannada 89 0 89 ಃ # ಃ [c83 ] 92 | ೨ 8 64,67,183,188,117,131,0,8,121,136 Kannada 90 0 90 ೨ # ೨ [ce8 ]0 93 | ೯ 8 63,64,183,185,111,141,0,10,121,136 Kannada 91 0 91 ೯ # ೯ [cef ]0 94 | ` 0 168,244,231,255,28,80,0,82,48,173 Common 92 10 92 ' # ` [60 ] 95 | » 10 0,133,146,235,63,284,0,32,71,294 Common 93 10 126 » # » [bb ]p 96 | ( 10 0,20,213,239,53,117,0,8,85,98 Common 94 10 112 ( # ( [28 ]p 97 | ್ದ 1 44,64,208,255,240,287,0,0,240,287 Kannada 95 17 95 ್ದ # ್ದ [ccd ca6 ]x 98 | ಏ 1 64,65,203,205,134,148,0,11,147,167 Kannada 96 0 96 ಏ # ಏ [c8f ]x 99 | ಠ 1 64,65,208,225,103,133,0,11,116,131 Kannada 97 0 97 ಠ # ಠ [ca0 ]x 100 | | 0 0,67,216,255,8,73,0,80,50,173 Common 98 10 98 | # | [7c ] 101 | ಓ 1 63,64,225,229,123,146,0,6,146,166 Kannada 99 0 99 ಓ # ಓ [c93 ]x 102 | $ 0 24,63,229,255,85,174,0,36,106,174 Common 100 4 100 $ # $ 
[24 ] 103 | 2 8 61,67,210,241,97,180,0,18,109,150 Common 101 2 101 2 # 2 [32 ]0 104 | 5 8 58,65,210,241,89,196,0,21,109,150 Common 102 2 102 5 # 5 [35 ]0 105 | 0 8 58,64,210,242,97,175,0,18,109,150 Common 103 2 103 0 # 0 [30 ]0 106 | ್ಣ 1 44,64,185,255,242,299,0,0,242,299 Kannada 104 17 104 ್ಣ # ್ಣ [ccd ca3 ]x 107 | : 10 64,82,166,182,25,68,0,18,58,70 Common 105 6 105 : # : [3a ]p 108 | ಢ 1 19,25,208,225,136,165,0,11,150,165 Kannada 106 0 106 ಢ # ಢ [ca2 ]x 109 | ಉ 1 63,64,188,192,187,210,0,13,200,232 Kannada 107 0 107 ಉ # ಉ [c89 ]x 110 | ್ಥ 1 19,50,208,255,240,287,0,0,240,287 Kannada 108 17 108 ್ಥ # ್ಥ [ccd ca5 ]x 111 | ಊ 1 63,64,186,192,241,274,0,13,254,298 Kannada 109 0 109 ಊ # ಊ [c8a ]x 112 | - 10 105,161,122,175,49,176,0,43,56,215 Common 110 3 110 - # - [2d ]p 113 | ್ಜ 1 44,64,192,255,237,287,0,0,237,287 Kannada 111 17 111 ್ಜ # ್ಜ [ccd c9c ]x 114 | ) 10 0,20,213,239,53,114,0,26,85,121 Common 112 10 94 ) # ) [29 ]p 115 | / 10 25,51,208,222,55,138,0,8,67,116 Common 113 6 113 / # / [2f ]p 116 | ೭ 8 64,67,186,187,90,116,0,8,101,136 Kannada 114 0 114 ೭ # ೭ [ced ]0 117 | ್ಛ 1 19,50,208,255,236,284,0,0,236,284 Kannada 115 17 115 ್ಛ # ್ಛ [ccd c9b ]x 118 | 8 8 58,64,212,242,84,196,0,25,109,150 Common 116 2 116 8 # 8 [38 ]0 119 | । 10 45,64,208,230,17,52,23,94,66,155 Common 117 0 117 । # । [964 ]p 120 | 9 8 56,63,210,242,94,162,0,26,109,150 Common 118 2 118 9 # 9 [39 ]0 121 | ; 10 29,37,163,185,30,69,0,28,58,71 Common 119 10 119 ; # ; [3b ]p 122 | 6 8 58,64,210,241,94,173,0,20,109,150 Common 120 2 120 6 # 6 [36 ]0 123 | _ 10 0,50,0,64,73,248,0,29,75,259 Common 121 10 121 _ # _ [5f ]p 124 | ] 10 8,64,216,255,39,129,0,44,55,173 Common 122 10 136 ] # ] [5d ]p 125 | ್ಭ 1 19,50,210,255,234,287,0,0,234,287 Kannada 123 17 123 ್ಭ # ್ಭ [ccd cad ]x 126 | ೮ 8 63,64,186,192,112,155,0,10,132,143 Kannada 124 0 124 ೮ # ೮ [cee ]0 127 | ೬ 8 64,67,186,193,91,112,0,10,105,136 Kannada 125 0 125 ೬ # ೬ [cec ]0 128 | « 10 26,133,148,235,63,279,0,35,71,281 Common 126 10 93 « # « [ab ]p 129 | * 
10 65,131,180,251,80,126,3,29,107,146 Common 127 10 127 * # * [2a ]p 130 | % 10 58,65,213,255,148,193,3,22,170,213 Common 128 4 128 % # % [25 ]p 131 | ೌ 0 44,50,255,255,320,343,0,0,106,122 Kannada 129 17 129 ೌ # ೌ [ccc ] 132 | ್ಡ 1 44,64,208,255,237,289,0,0,237,289 Kannada 130 17 130 ್ಡ # ್ಡ [ccd ca1 ]x 133 | 7 8 58,65,210,241,92,172,0,32,109,150 Common 131 2 131 7 # 7 [37 ]0 134 | " 10 154,193,220,255,33,63,0,3,49,69 Common 132 10 132 " # " [22 ]p 135 | & 10 53,64,194,255,108,232,0,47,112,239 Common 133 10 133 & # & [26 ]p 136 | ್ಶ 1 44,64,210,255,224,270,0,0,224,270 Kannada 134 17 134 ್ಶ # ್ಶ [ccd cb6 ]x 137 | ॥ 10 49,64,208,230,61,94,23,64,106,155 Common 135 0 135 ॥ # ॥ [965 ]p 138 | [ 10 8,64,216,255,39,136,0,80,55,173 Common 136 10 122 [ # [ [5b ]p 139 | -------------------------------------------------------------------------------- /testdata/kor.Arial_Unicode_MS.exp0.lstmf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tesseract-ocr/test/232ff181c66516116ec0e84c4963f70de15050fd/testdata/kor.Arial_Unicode_MS.exp0.lstmf -------------------------------------------------------------------------------- /testdata/kor/kor.traineddata: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tesseract-ocr/test/232ff181c66516116ec0e84c4963f70de15050fd/testdata/kor/kor.traineddata -------------------------------------------------------------------------------- /testdata/por.unicharset: -------------------------------------------------------------------------------- 1 | 226 2 | NULL 0 NULL 0 3 | Joined 7 0,69,188,255,486,1218,0,30,486,1188 Latin 22 0 64 Joined # Joined [4a 6f 69 6e 65 64 ]a 4 | |Broken|0|1 f 0,69,186,255,892,2138,0,80,892,2058 Common 101 10 101 |Broken|0|1 # Broken 5 | T 5 59,68,216,255,85,227,0,47,88,236 Latin 46 0 3 T # T [54 ]A 6 | u 3 57,65,187,202,85,184,0,39,100,208 Latin 47 0 4 u # u [75 ]a 7 | d 3 
57,65,216,255,88,174,0,28,100,200 Latin 23 0 5 d # d [64 ]a 8 | o 3 58,66,188,200,87,151,0,32,98,185 Latin 28 0 6 o # o [6f ]a 9 | S 5 57,64,219,255,87,174,0,30,100,200 Latin 15 0 7 S # S [53 ]A 10 | C 5 58,65,219,255,87,192,0,32,107,209 Latin 31 0 8 C # C [43 ]A 11 | R 5 57,68,216,255,88,227,0,27,104,232 Latin 20 0 9 R # R [52 ]A 12 | A 5 52,68,216,255,100,216,0,17,98,231 Latin 14 0 10 A # A [41 ]A 13 | P 5 57,68,216,255,87,225,0,32,97,230 Latin 37 0 11 P # P [50 ]A 14 | g 3 0,43,188,212,88,176,0,32,100,210 Latin 76 0 12 g # g [67 ]a 15 | i 3 59,69,216,255,11,141,0,54,27,173 Latin 27 0 13 i # i [69 ]a 16 | a 3 58,65,186,200,85,164,0,26,97,185 Latin 10 0 14 a # a [61 ]a 17 | s 3 58,65,192,200,78,147,0,30,91,173 Latin 7 0 15 s # s [73 ]a 18 | 2 8 30,69,194,255,80,160,0,27,97,173 Common 16 2 16 2 # 2 [32 ]0 19 | 0 8 58,66,187,255,88,164,0,45,103,180 Common 17 2 17 0 # 0 [30 ]0 20 | 8 8 57,66,219,255,88,162,0,41,103,174 Common 18 2 18 8 # 8 [38 ]0 21 | M 5 57,68,216,255,99,301,0,35,117,286 Latin 35 0 19 M # M [4d ]A 22 | r 3 59,68,186,202,58,173,0,40,69,180 Latin 9 0 20 r # r [72 ]a 23 | y 3 0,47,187,202,87,199,0,25,87,230 Latin 109 0 21 y # y [79 ]a 24 | j 3 0,47,216,255,36,145,0,49,50,173 Latin 64 0 22 j # j [6a ]a 25 | D 5 59,68,216,255,93,230,0,27,107,236 Latin 5 0 23 D # D [44 ]A 26 | w 3 59,68,187,195,108,235,0,32,117,286 Latin 108 0 24 w # w [77 ]a 27 | n 3 59,68,188,202,87,187,0,25,101,208 Latin 49 0 25 n # n [6e ]a 28 | É 5 59,68,232,255,68,314,0,31,80,325 Latin 59 0 26 É # É [c9 ]A 29 | I 5 59,68,216,255,10,155,0,50,29,173 Latin 13 0 27 I # I [49 ]A 30 | O 5 57,64,219,255,91,209,0,34,106,233 Latin 6 0 28 O # O [4f ]A 31 | “ 10 141,233,216,255,56,133,0,172,66,298 Common 29 10 29 " # “ [201c ]p 32 | l 3 59,68,216,255,11,147,0,56,27,173 Latin 48 0 30 l # l [6c ]a 33 | c 3 58,64,192,200,80,153,0,36,88,178 Latin 8 0 31 c # c [63 ]a 34 | e 3 58,64,189,200,87,154,0,32,98,188 Latin 33 0 32 e # e [65 ]a 35 | E 5 59,68,216,255,68,210,0,31,80,219 Latin 32 0 33 E # E 
[45 ]A 36 | â 3 0,64,222,255,85,256,0,26,97,256 Latin 34 0 34 â # â [e2 ]a 37 | m 3 56,68,189,202,108,280,0,25,117,306 Latin 19 0 35 m # m [6d ]a 38 | h 3 59,68,216,255,87,187,0,25,101,208 Latin 97 0 36 h # h [68 ]a 39 | p 3 0,47,192,226,87,180,0,25,100,200 Latin 11 0 37 p # p [70 ]a 40 | B 5 62,68,216,255,91,227,0,27,106,227 Latin 45 0 38 B # B [42 ]A 41 | . 10 26,67,73,112,13,51,0,67,30,173 Common 39 6 39 . # . [2e ]p 42 | £ 0 0,135,219,255,64,201,0,55,61,298 Common 40 4 40 £ # £ [a3 ] 43 | ó 3 0,64,222,255,87,192,0,32,98,197 Latin 73 0 41 ó # ó [f3 ]a 44 | 1 8 49,69,192,255,45,128,0,66,74,173 Common 42 2 42 1 # 1 [31 ]0 45 | 6 8 58,66,219,255,87,156,0,54,104,173 Common 43 2 43 6 # 6 [36 ]0 46 | 4 8 0,68,198,255,93,161,0,41,96,173 Common 44 2 44 4 # 4 [34 ]0 47 | b 3 58,64,216,255,87,180,0,25,100,200 Latin 38 0 45 b # b [62 ]a 48 | t 3 58,66,206,254,57,167,0,47,59,180 Latin 3 0 46 t # t [74 ]a 49 | U 5 58,64,216,255,91,214,0,39,106,220 Latin 4 0 47 U # U [55 ]A 50 | L 5 59,68,216,255,64,193,0,31,74,206 Latin 30 0 48 L # L [4c ]A 51 | N 5 59,68,216,255,87,262,0,27,104,249 Latin 25 0 49 N # N [4e ]A 52 | V 5 59,68,216,255,103,207,0,41,101,245 Latin 56 0 50 V # V [56 ]A 53 | ” 10 141,233,216,255,59,141,0,87,66,298 Common 51 10 51 " # ” [201d ]p 54 | ) 10 0,64,216,255,42,119,0,53,61,173 Common 52 10 86 ) # ) [29 ]p 55 | , 10 14,46,79,115,17,78,0,58,30,173 Common 53 6 53 , # , [2c ]p 56 | k 3 57,68,216,255,85,177,0,35,93,198 Latin 57 0 54 k # k [6b ]a 57 | : 10 58,85,141,221,11,69,0,67,38,173 Common 55 6 55 : # : [3a ]p 58 | v 3 59,68,187,197,84,173,0,32,84,218 Latin 50 0 56 v # v [76 ]a 59 | K 5 57,68,216,255,92,225,0,37,103,216 Latin 54 0 57 K # K [4b ]A 60 | í 3 62,69,222,255,40,279,0,54,47,286 Latin 92 0 58 í # í [ed ]a 61 | é 3 0,64,222,255,87,384,0,32,98,391 Latin 26 0 59 é # é [e9 ]a 62 | ê 3 0,64,222,255,87,409,0,32,98,407 Latin 99 0 60 ê # ê [ea ]a 63 | Ç 5 0,64,217,255,87,286,0,32,107,296 Latin 82 0 61 Ç # Ç [c7 ]A 64 | @ 10 0,65,211,255,99,286,0,39,117,291 
Common 62 10 62 @ # @ [40 ]p 65 | Ú 5 0,64,232,255,91,294,0,39,106,291 Latin 107 0 63 Ú # Ú [da ]A 66 | J 5 0,64,216,255,39,242,0,30,62,234 Latin 22 0 64 J # J [4a ]A 67 | € 0 32,68,209,255,97,238,0,49,103,293 Common 65 4 65 € # € [20ac ] 68 | 9 8 0,66,200,255,89,156,0,39,104,173 Common 66 2 66 9 # 9 [39 ]0 69 | 5 8 12,66,199,255,82,160,0,36,103,173 Common 67 2 67 5 # 5 [35 ]0 70 | & 10 53,64,194,255,108,232,0,47,112,239 Common 68 10 68 & # & [26 ]p 71 | x 3 59,68,187,201,85,189,0,25,84,218 Latin 89 0 69 x # x [78 ]a 72 | / 10 0,65,219,255,59,228,0,36,62,238 Common 70 6 70 / # / [2f ]p 73 | ² 0 3,192,209,255,50,248,0,105,0,293 Common 71 2 71 2 # ² [b2 ] 74 | F 5 57,68,216,255,68,210,0,31,77,209 Latin 84 0 72 F # F [46 ]A 75 | Ó 5 0,64,232,255,91,276,0,34,106,286 Latin 41 0 73 Ó # Ó [d3 ]A 76 | 3 8 0,66,196,255,84,158,0,32,103,173 Common 74 2 74 3 # 3 [33 ]0 77 | z 3 46,68,186,199,65,151,0,32,68,173 Latin 112 0 75 z # z [7a ]a 78 | G 5 58,64,219,255,91,230,0,30,106,230 Latin 12 0 76 G # G [47 ]A 79 | á 3 0,64,222,255,85,414,0,26,97,412 Latin 100 0 77 á # á [e1 ]a 80 | - 10 105,161,122,175,49,176,0,43,56,215 Common 78 3 78 - # - [2d ]p 81 | ? 10 40,67,219,255,59,144,0,65,77,188 Common 79 10 79 ? # ? [3f ]p 82 | ! 10 41,67,216,255,11,87,0,71,50,173 Common 80 10 80 ! # ! 
[21 ]p 83 | q 3 0,47,192,202,88,196,0,30,100,200 Latin 116 0 81 q # q [71 ]a 84 | ç 3 0,31,182,232,80,299,0,36,96,309 Latin 61 0 82 ç # ç [e7 ]a 85 | ã 3 0,64,224,255,85,279,0,26,97,289 Latin 96 0 83 ã # ã [e3 ]a 86 | f 3 0,68,216,255,54,175,0,42,55,193 Latin 72 0 84 f # f [66 ]a 87 | + 0 54,102,171,253,90,176,0,37,103,213 Common 85 3 85 + # + [2b ] 88 | ( 10 0,64,216,255,42,118,0,97,61,173 Common 86 10 52 ( # ( [28 ]p 89 | ' 10 148,225,216,255,11,51,0,97,36,173 Common 87 10 87 ' # ' [27 ]p 90 | ; 10 14,56,131,221,17,93,0,58,38,173 Common 88 10 88 ; # ; [3b ]p 91 | X 5 59,68,216,255,94,275,0,25,93,256 Latin 69 0 89 X # X [58 ]A 92 | * 10 78,183,188,255,49,134,0,60,53,173 Common 90 10 90 * # * [2a ]p 93 | º 3 64,187,188,255,51,189,0,81,64,293 Latin 91 0 91 o # º [ba ]a 94 | Í 5 64,68,232,255,35,197,0,48,55,207 Latin 58 0 92 Í # Í [cd ]A 95 | ³ 0 0,192,209,255,48,268,0,99,0,293 Common 93 2 93 3 # ³ [b3 ] 96 | › 10 64,101,142,215,32,100,0,84,37,173 Common 94 10 94 › # › [203a ]p 97 | ª 3 64,187,207,255,51,286,0,71,62,296 Latin 95 0 95 a # ª [aa ]a 98 | à 5 6,68,232,255,100,204,0,25,98,245 Latin 83 0 96 à # à [c3 ]A 99 | H 5 59,68,216,255,91,258,0,27,107,244 Latin 36 0 97 H # H [48 ]A 100 | # 10 37,84,200,255,99,221,0,41,109,266 Common 98 4 98 # # # [23 ]p 101 | Ê 5 0,68,232,255,72,266,0,31,80,276 Latin 60 0 99 Ê # Ê [ca ]A 102 | Á 5 64,68,232,255,100,203,0,29,98,245 Latin 77 0 100 Á # Á [c1 ]A 103 | | 0 0,67,216,255,8,73,0,80,50,173 Common 101 10 101 | # | [7c ] 104 | " 10 151,225,216,255,52,115,0,71,71,173 Common 102 10 102 " # " [22 ]p 105 | > 0 29,102,173,255,78,184,0,50,90,256 Common 103 10 111 > # > [3e ] 106 | à 3 0,64,222,255,85,407,0,26,97,407 Latin 104 0 104 à # à [e0 ]a 107 | õ 3 58,66,224,255,87,194,0,32,98,204 Latin 105 0 105 õ # õ [f5 ]a 108 | « 10 26,133,148,235,63,279,0,35,71,281 Common 106 10 126 « # « [ab ]p 109 | ú 3 0,65,222,255,85,212,0,39,100,212 Latin 63 0 107 ú # ú [fa ]a 110 | W 5 54,68,216,255,106,314,0,41,117,318 Latin 24 0 108 W # W [57 ]A 
111 | Y 5 59,68,216,255,91,205,0,47,91,223 Latin 21 0 109 Y # Y [59 ]A 112 | 7 8 12,68,196,255,72,160,0,60,75,173 Common 110 2 110 7 # 7 [37 ]0 113 | < 0 29,102,173,255,69,184,0,50,90,256 Common 111 10 103 < # < [3c ] 114 | Z 5 64,68,216,255,72,218,0,30,77,236 Latin 75 0 112 Z # Z [5a ]A 115 | ¹ 0 64,192,209,255,24,279,1,119,0,293 Common 113 2 113 1 # ¹ [b9 ] 116 | ü 3 0,65,219,255,85,220,0,39,100,225 Latin 114 0 114 ü # ü [fc ]a 117 | _ 10 0,50,0,64,73,248,0,29,75,259 Common 115 10 115 _ # _ [5f ]p 118 | Q 5 7,64,219,255,91,205,0,30,106,227 Latin 81 0 116 Q # Q [51 ]A 119 | … 10 60,143,79,232,101,332,0,45,107,337 Common 117 10 117 ... # … [2026 ]p 120 | ¡ 10 0,66,185,255,11,176,0,125,49,293 Common 118 10 118 ¡ # ¡ [a1 ]p 121 | $ 0 24,63,229,255,85,174,0,36,106,174 Common 119 4 119 $ # $ [24 ] 122 | © 0 28,125,209,255,118,232,0,32,119,257 Common 120 10 120 © # © [a9 ] 123 | [ 10 8,64,216,255,39,136,0,80,55,173 Common 121 10 123 [ # [ [5b ]p 124 | % 10 27,67,205,255,105,257,0,49,117,288 Common 122 4 122 % # % [25 ]p 125 | ] 10 8,64,216,255,39,129,0,44,55,173 Common 123 10 121 ] # ] [5d ]p 126 | = 0 74,139,144,199,90,186,0,32,103,224 Common 124 10 124 = # = [3d ] 127 | ₂ 0 10,67,113,172,50,118,0,105,77,293 Common 125 2 125 2 # ₂ [2082 ] 128 | » 10 0,133,146,235,63,284,0,32,71,294 Common 126 10 106 » # » [bb ]p 129 | ⁴ 0 115,163,227,255,63,131,0,101,77,293 Common 127 2 127 4 # ⁴ [2074 ] 130 | ô 3 58,66,222,255,87,192,0,32,98,202 Latin 128 0 128 ô # ô [f4 ]a 131 | ° 0 66,247,209,255,22,399,0,98,66,409 Common 129 4 129 ° # ° [b0 ] 132 | ₄ 0 12,67,115,170,62,131,0,97,77,293 Common 130 2 130 4 # ₄ [2084 ] 133 | ₃ 0 8,67,113,172,52,106,0,103,77,293 Common 131 2 131 3 # ₃ [2083 ] 134 | ₁ 0 10,67,113,172,36,78,0,108,77,293 Common 132 2 132 1 # ₁ [2081 ] 135 | fl 3 0,68,216,255,82,408,0,42,82,366 Latin 72 0 84 fl # fl [66 6c ]a 136 | fi 3 0,69,216,255,82,408,0,42,82,366 Latin 72 0 84 fi # fi [66 69 ]a 137 | ... 10 26,67,73,112,90,586,0,67,90,519 Common 39 6 39 ... # ... 
[2e 2e 2e ]p 138 | ff 3 0,68,216,255,110,428,0,42,110,386 Latin 72 0 84 ff # ff [66 66 ]a 139 | ⁸ 0 124,151,229,255,56,102,0,53,75,173 Common 137 2 137 8 # ⁸ [2078 ] 140 | ⁶ 0 124,151,229,255,56,99,0,56,77,173 Common 138 2 138 6 # ⁶ [2076 ] 141 | ⁹ 0 126,153,230,255,56,104,0,57,77,173 Common 139 2 139 9 # ⁹ [2079 ] 142 | ⁵ 0 124,153,227,255,50,104,0,51,75,173 Common 140 2 140 5 # ⁵ [2075 ] 143 | ⁷ 0 128,153,227,255,52,106,0,58,77,173 Common 141 2 141 7 # ⁷ [2077 ] 144 | ⁰ 0 124,151,229,255,56,102,0,53,77,173 Common 142 2 142 0 # ⁰ [2070 ] 145 | ₆ 0 10,65,118,172,56,99,0,56,77,173 Common 143 2 143 6 # ₆ [2086 ] 146 | ₉ 0 10,65,118,172,56,104,0,57,77,173 Common 144 2 144 9 # ₉ [2089 ] 147 | ₀ 0 8,65,119,172,56,102,0,53,77,173 Common 145 2 145 0 # ₀ [2080 ] 148 | ₅ 0 8,65,113,170,50,98,0,51,77,173 Common 146 2 146 5 # ₅ [2085 ] 149 | ₈ 0 8,65,118,172,59,102,0,53,75,173 Common 147 2 147 8 # ₈ [2088 ] 150 | ffi 3 0,69,216,255,137,601,0,42,137,559 Latin 72 0 84 ffi # ffi [66 66 69 ]a 151 | ₇ 0 10,67,115,170,52,92,0,60,77,173 Common 149 2 149 7 # ₇ [2087 ] 152 | Th 7 59,68,216,255,189,491,0,47,189,444 Latin 46 0 3 Th # Th [54 68 ]a 153 | ft 3 0,68,206,255,114,415,0,42,114,373 Latin 72 0 84 ft # ft [66 74 ]a 154 | ffl 3 0,68,216,255,137,601,0,42,137,559 Latin 72 0 84 ffl # ffl [66 66 6c ]a 155 | NJ 5 0,68,216,255,166,510,0,27,166,483 Latin 25 0 49 NJ # NJ [4e 4a ]A 156 | ij 3 0,69,216,255,77,400,0,54,77,346 Latin 27 0 13 ij # ij [69 6a ]a 157 | tt 3 58,66,206,254,118,407,0,47,118,360 Latin 3 0 46 tt # tt [74 74 ]a 158 | ti 3 58,69,206,255,86,400,0,47,86,353 Latin 3 0 46 ti # ti [74 69 ]a 159 | it 3 58,69,206,255,86,407,0,54,86,353 Latin 27 0 13 it # it [69 74 ]a 160 | sc 3 58,65,192,200,179,381,0,30,179,351 Latin 7 0 15 sc # sc [73 63 ]a 161 | rt 3 58,68,186,254,128,400,0,40,128,360 Latin 9 0 20 rt # rt [72 74 ]a 162 | es 3 58,65,189,200,189,393,0,32,189,361 Latin 33 0 32 es # es [65 73 ]a 163 | ee 3 58,64,189,200,196,408,0,32,196,376 Latin 33 0 32 ee # ee [65 65 ]a 164 | 
th 3 58,68,206,255,160,435,0,47,160,388 Latin 3 0 46 th # th [74 68 ]a 165 | st 3 58,66,192,254,150,383,0,30,150,353 Latin 7 0 15 st # st [73 74 ]a 166 | ch 3 58,68,192,255,189,422,0,36,189,386 Latin 8 0 31 ch # ch [63 68 ]a 167 | et 3 58,66,189,254,157,400,0,32,157,368 Latin 33 0 32 et # et [65 74 ]a 168 | sh 3 58,68,192,255,192,411,0,30,192,381 Latin 7 0 15 sh # sh [73 68 ]a 169 | il 3 59,69,216,255,54,400,0,54,54,346 Latin 27 0 13 il # il [69 6c ]a 170 | ot 3 58,66,188,254,157,397,0,32,157,365 Latin 28 0 6 ot # ot [6f 74 ]a 171 | ge 3 0,64,188,212,198,430,0,32,198,398 Latin 76 0 12 ge # ge [67 65 ]a 172 | sp 3 0,65,192,226,191,403,0,30,191,373 Latin 7 0 15 sp # sp [73 70 ]a 173 | di 3 57,69,216,255,127,401,0,28,127,373 Latin 23 0 5 di # di [64 69 ]a 174 | fü 3 0,68,216,255,155,460,0,42,155,418 Latin 72 0 84 fü # fü [66 fc ]a 175 | ss 3 58,65,192,200,182,376,0,30,182,346 Latin 7 0 15 ss # ss [73 73 ]a 176 | pp 3 0,47,192,226,200,425,0,25,200,400 Latin 11 0 37 pp # pp [70 70 ]a 177 | pt 3 0,66,192,254,159,405,0,25,159,380 Latin 11 0 37 pt # pt [70 74 ]a 178 | sl 3 58,68,192,255,118,376,0,30,118,346 Latin 7 0 15 sl # sl [73 6c ]a 179 | sf 3 0,68,192,255,146,396,0,30,146,366 Latin 7 0 15 sf # sf [73 66 ]a 180 | cc 3 58,64,192,200,176,392,0,36,176,356 Latin 8 0 31 cc # cc [63 63 ]a 181 | ll 3 59,68,216,255,54,402,0,56,54,346 Latin 48 0 30 ll # ll [6c 6c ]a 182 | ct 3 58,66,192,254,147,394,0,36,147,358 Latin 8 0 31 ct # ct [63 74 ]a 183 | rr 3 59,68,186,202,138,400,0,40,138,360 Latin 9 0 20 rr # rr [72 72 ]a 184 | aa 3 58,65,186,200,194,396,0,26,194,370 Latin 10 0 14 aa # aa [61 61 ]a 185 | fu 3 0,68,187,255,155,443,0,42,155,401 Latin 72 0 84 fu # fu [66 75 ]a 186 | ii 3 59,69,216,255,54,400,0,54,54,346 Latin 27 0 13 ii # ii [69 69 ]a 187 | ph 3 0,68,192,255,201,433,0,25,201,408 Latin 11 0 37 ph # ph [70 68 ]a 188 | gy 3 0,47,187,212,187,472,0,32,187,440 Latin 76 0 12 gy # gy [67 79 ]a 189 | fr 3 0,68,186,255,124,415,0,42,124,373 Latin 72 0 84 fr # fr [66 72 ]a 190 | 
dt 3 57,66,206,255,159,408,0,28,159,380 Latin 23 0 5 dt # dt [64 74 ]a 191 | cti 3 58,69,192,255,174,567,0,36,174,531 Latin 8 0 31 cti # cti [63 74 69 ]a 192 | oo 3 58,66,188,200,196,402,0,32,196,370 Latin 28 0 6 oo # oo [6f 6f ]a 193 | sti 3 58,69,192,255,177,556,0,30,177,526 Latin 7 0 15 sti # sti [73 74 69 ]a 194 | sk 3 57,68,192,255,184,401,0,30,184,371 Latin 7 0 15 sk # sk [73 6b ]a 195 | cs 3 58,65,192,200,179,387,0,36,179,351 Latin 8 0 31 cs # cs [63 73 ]a 196 | ooo 3 58,66,188,200,294,587,0,32,294,555 Latin 28 0 6 ooo # ooo [6f 6f 6f ]a 197 | ty 3 0,66,187,254,146,457,0,47,146,410 Latin 3 0 46 ty # ty [74 79 ]a 198 | tz 3 46,68,186,254,127,400,0,47,127,353 Latin 3 0 46 tz # tz [74 7a ]a 199 | fk 3 0,68,216,255,148,433,0,42,148,391 Latin 72 0 84 fk # fk [66 6b ]a 200 | ck 3 57,68,192,255,181,412,0,36,181,376 Latin 8 0 31 ck # ck [63 6b ]a 201 | gg 3 0,43,188,212,200,452,0,32,200,420 Latin 76 0 12 gg # gg [67 67 ]a 202 | °C 5 58,247,209,255,173,716,0,98,173,618 Common 129 4 129 °C # °C [b0 43 ]A 203 | !? 10 40,67,216,255,127,432,0,71,127,361 Common 80 10 80 !? # !? [21 3f ]p 204 | !! 10 41,67,216,255,100,417,0,71,100,346 Common 80 10 80 !! # !! 
[21 21 ]p 205 | Qu 7 7,65,187,255,206,465,0,30,206,435 Latin 81 0 116 Qu # Qu [51 75 ]a 206 | ry 3 0,68,186,202,156,450,0,40,156,410 Latin 9 0 20 ry # ry [72 79 ]a 207 | gj 3 0,47,188,255,150,415,0,32,150,383 Latin 76 0 12 gj # gj [67 6a ]a 208 | bt 3 58,66,206,255,159,405,0,25,159,380 Latin 38 0 45 bt # bt [62 74 ]a 209 | sch 3 58,68,192,255,280,589,0,30,280,559 Latin 7 0 15 sch # sch [73 63 68 ]a 210 | SS 5 57,64,219,255,200,430,0,30,200,400 Latin 15 0 7 SS # SS [53 53 ]A 211 | AND 5 52,68,216,255,309,733,0,17,309,716 Latin 14 0 10 AND # AND [41 4e 44 ]A 212 | ET 5 59,68,216,255,168,486,0,31,168,455 Latin 32 0 33 ET # ET [45 54 ]A 213 | UND 5 58,68,216,255,317,744,0,39,317,705 Latin 4 0 47 UND # UND [55 4e 44 ]A 214 | fb 3 0,68,216,255,155,435,0,42,155,393 Latin 72 0 84 fb # fb [66 62 ]a 215 | fj 3 0,68,216,255,105,408,0,42,105,366 Latin 72 0 84 fj # fj [66 6a ]a 216 | nj 3 0,68,188,255,151,406,0,25,151,381 Latin 49 0 25 nj # nj [6e 6a ]a 217 | ffb 3 0,68,216,255,210,628,0,42,210,586 Latin 72 0 84 ffb # ffb [66 66 62 ]a 218 | fh 3 0,68,216,255,156,443,0,42,156,401 Latin 72 0 84 fh # fh [66 68 ]a 219 | or 3 58,68,186,202,167,397,0,32,167,365 Latin 28 0 6 or # or [6f 72 ]a 220 | on 3 58,68,188,202,199,425,0,32,199,393 Latin 28 0 6 on # on [6f 6e ]a 221 | of 3 0,68,188,255,153,410,0,32,153,378 Latin 28 0 6 of # of [6f 66 ]a 222 | om 3 56,68,188,202,215,523,0,32,215,491 Latin 28 0 6 om # om [6f 6d ]a 223 | op 3 0,66,188,226,198,417,0,32,198,385 Latin 28 0 6 op # op [6f 70 ]a 224 | ou 3 57,66,187,202,198,425,0,32,198,393 Latin 28 0 6 ou # ou [6f 75 ]a 225 | fft 3 0,68,206,255,169,608,0,42,169,566 Latin 72 0 84 fft # fft [66 66 74 ]a 226 | sb 3 58,65,192,255,191,403,0,30,191,373 Latin 7 0 15 sb # sb [73 62 ]a 227 | the 3 58,68,189,255,258,623,0,47,258,576 Latin 3 0 46 the # the [74 68 65 ]a 228 | -------------------------------------------------------------------------------- /testdata/scanftest.txt: 
-------------------------------------------------------------------------------- 1 | 42.5 17 0.001000 -0.001000 2 | 0 1 123 -123 0x100 3 | abcdefghijklmnopqrstuvwxyz 4 | abcdefghijklmnopqrstuvwxyz 5 | MF 25 6.25e-2 0.5e5 -1e+4 6 | 42 MF 25 6.25e-2 0.5 7 | 24 8 | -------------------------------------------------------------------------------- /testdata/trivial.unicharset: -------------------------------------------------------------------------------- 1 | 4 2 | NULL 0 NULL 0 3 | i 3 59,69,216,255,11,141,0,54,27,173 Latin 1 0 1 i # i [69 ]a 4 | f 3 0,68,216,255,54,175,0,42,55,193 Latin 2 0 2 f # f [66 ]a 5 | fi 3 0,71,216,255,87,202,0,28,105,199 Latin 3 0 3 fi # fi [fb01 ]a 6 | -------------------------------------------------------------------------------- /testing/12.tif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tesseract-ocr/test/232ff181c66516116ec0e84c4963f70de15050fd/testing/12.tif -------------------------------------------------------------------------------- /testing/136.tif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tesseract-ocr/test/232ff181c66516116ec0e84c4963f70de15050fd/testing/136.tif -------------------------------------------------------------------------------- /testing/256.tif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tesseract-ocr/test/232ff181c66516116ec0e84c4963f70de15050fd/testing/256.tif -------------------------------------------------------------------------------- /testing/324.tif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tesseract-ocr/test/232ff181c66516116ec0e84c4963f70de15050fd/testing/324.tif -------------------------------------------------------------------------------- /testing/410.tif: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/tesseract-ocr/test/232ff181c66516116ec0e84c4963f70de15050fd/testing/410.tif -------------------------------------------------------------------------------- /testing/432.tif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tesseract-ocr/test/232ff181c66516116ec0e84c4963f70de15050fd/testing/432.tif -------------------------------------------------------------------------------- /testing/433.tif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tesseract-ocr/test/232ff181c66516116ec0e84c4963f70de15050fd/testing/433.tif -------------------------------------------------------------------------------- /testing/540.tif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tesseract-ocr/test/232ff181c66516116ec0e84c4963f70de15050fd/testing/540.tif -------------------------------------------------------------------------------- /testing/692.tif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tesseract-ocr/test/232ff181c66516116ec0e84c4963f70de15050fd/testing/692.tif -------------------------------------------------------------------------------- /testing/779.tif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tesseract-ocr/test/232ff181c66516116ec0e84c4963f70de15050fd/testing/779.tif -------------------------------------------------------------------------------- /testing/793.tif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tesseract-ocr/test/232ff181c66516116ec0e84c4963f70de15050fd/testing/793.tif 
-------------------------------------------------------------------------------- /testing/8071_093.3B.tif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tesseract-ocr/test/232ff181c66516116ec0e84c4963f70de15050fd/testing/8071_093.3B.tif -------------------------------------------------------------------------------- /testing/8071_093.3B.txt: -------------------------------------------------------------------------------- 1 | SHE MOPED. She sighed. She 2 | wandered around the house 3 | with the look of an injured 4 | puppy, sometimes bursting into 5 | tears. When her worried parents 6 | questioned her, she would answer, 7 | "It's nothing." How could she ex- 8 | plain she was desperately in love for 9 | the first time, and the object of her 10 | adoration didn't even know she 11 | existed? Finally her father sat 12 | down, took her hand and tenderly 13 | questioned her until he learned the 14 | 15 | 16 | cause of her sadness. Gently, sim- 17 | ply, he talked with her of life and 18 | love. He comforted her. She was six 19 | years old. 20 | "I've never forgotten that mo- 21 | ment," says Kathleen Kilpatrick, a 22 | special assistant in the office of the 23 | U.S. Secretary of the Interior. 24 | "How often I've thought back and 25 | wondered that he didn't just laugh 26 | at me. Instead, he treated me with 27 | dignity and concern for how deeply 28 | I felt." 29 | Robert Kilpatrick, now retired 30 | board chairman of CIGNA Corp., 31 | never took a course in parenting. 32 | He was simply showing love for 33 | his daughter, giving her his time 34 | and trying to see the world 35 | through her eyes. It would be hard 36 | to come up with a better formula 37 | for fatherhood. 
38 | "Fathers bring a unique pres- 39 | ence, a special strength to raising 40 | 41 | 42 | children," says Ray Guarendi, a 43 | clinical psychologist whose book 44 | Back to the Family examines the 45 | long-term experience of 100 46 | ~~ successful American families. 47 | 48 | 49 | Guarendi's book shows that tradi- 50 | tional values, rooted in the bedrock 51 | of mutual trust, truth and uncondi- 52 | tional love, are still the keys to 53 | successful child-rearing. And in 54 | this setting, fathers bring special 55 | gifts to parenting. 56 | Sometimes fatherly instincts 57 | come easily; sometimes they have to 58 | be cultivated. But their payoff in 59 | lasting, positive effects on growing 60 | children is enormous. Here, culled 61 | from real-life experience, is what 62 | kids need most from a dad: 63 | Someone who shows his love for 64 | them. Kenneth Meade, pastor of the 65 | Church of Christ at Manor Woods, 66 | in Rockville, Md., says one of the 67 | most frequently expressed desires 68 | in his family counseling comes 69 | from children saying, "I wish Dad 70 | would tell me or show he really 71 | loves me." Time after time, clergy- 72 | men, counselors and psychologists 73 | encounter variations of this theme-~ 74 | children, often grown men, longing 75 | for more expressed affection from 76 | their fathers. "I never doubted my 77 | father's love," writes author Walt 78 | Harrington, "but even today I can't 79 | recall a time he hugged or kissed 80 | me or said he loved me." 81 | Such overt expressions give 82 | needed assurance and encourage- 83 | ment. Motivational expert Zig Zig- 84 | lar had this confirmed in reverse 85 | 86 | 87 | one day when he was trying to 88 | assemble a tricycle he had just 89 | bought for his then-four-year-old 90 | son, Tom. Ziglar became more ex- 91 | asperated by the minute because 92 | the proverbial bolt A was not fit- 93 | ting into nut B. 
He was about to 94 | give up when Tom, who was look- 95 | ing on, suddenly blurted, "I sure do 96 | love you, Dad!" Needless to say, 97 | Ziglar finished putting together 98 | that tricycle. 99 | Guarendi found that "in strong 100 | families, fathers who had problems 101 | expressing affection made special 102 | efforts to show it." Some who 103 | couldn't say it wrote it in a letter, 104 | card or note on the back of one of 105 | their children's drawings. One fa- 106 | ther, adept at composing music, 107 | writes songs to his kids to tell them 108 | he loves them. 109 | Fathers might also consider a 110 | technique used by a mother in 111 | Utah. Knowing her children were 112 | uncomfortable with public displays 113 | of affection, she worked out a secret 114 | code, known only to her kids. 115 | When one of them was about to 116 | participate in a sport or other school 117 | activity, she would squeeze the 118 | child's shoulder, meaning "I love 119 | you." 120 | An important "surrogate" way 121 | for a father to convey affection to a 122 | child is through the affection he 123 | 124 | 125 | Listen to the voices 126 | of real experience 127 | 128 | 129 | What Kids Need Most in a Dad 130 | 131 | BY RALPH KINNEY BENNETT 132 | 133 | 134 | PHOTO: ~ PAUL BARTON/THE STOCK MARKET 135 | 136 | 137 | 93 138 | 139 | 140 | -------------------------------------------------------------------------------- /testing/8071_093.3B.uzn: -------------------------------------------------------------------------------- 1 | 144 121 672 654 Text 2 | 832 148 667 1110 Text 3 | 874 1255 618 237 Text 4 | 1723 139 667 1680 Text 5 | 2403 148 685 1680 Text 6 | 886 1647 519 177 Text 7 | 886 1831 2013 226 Text 8 | 895 2074 474 60 Caption 9 | 2983 2092 69 55 Header/Footer 10 | -------------------------------------------------------------------------------- /testing/808.tif: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/tesseract-ocr/test/232ff181c66516116ec0e84c4963f70de15050fd/testing/808.tif -------------------------------------------------------------------------------- /testing/8087_054.3B.tif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tesseract-ocr/test/232ff181c66516116ec0e84c4963f70de15050fd/testing/8087_054.3B.tif -------------------------------------------------------------------------------- /testing/8087_054.3B.txt: -------------------------------------------------------------------------------- 1 | d a t e l i n e 2 | 3 | 4 | "This is Greater Serbia," yet they didn't take the city. 5 | On returning to the region in March I visited the 6 | Dubrovnik I had heard but never seen. 7 | As I drove along the Dalmatian coast past gnarled 8 | olive trees and black-robed peasant women tending their 9 | gardens, the city suddenly appeared in all its solidness, 10 | fortress walls squatting at the foot of steep cliffs, the 11 | Adriatic Sea sparkling below. Dubrovnik has weath- 12 | ered time and war for so long 13 | 14 | 15 | that historians remain igno- 16 | rant of its origins. They be- 17 | lieve it may have grown out of 18 | the Greco-Roman city of Epi- 19 | daurus, first mentioned in 47 20 | B.C., and legends place a set- 21 | tlement here a thousand years 22 | earlier. 23 | Today the city's quiet con- 24 | fidence has been shaken by 25 | the reality of modern war- 26 | fare. "Even Napoleon, whose 27 | army blew off the nose of 28 | the Sphinx, did not touch 29 | Dubrovnik," said Milo, a big, 30 | mustachioed Croatian and an 31 | unemployed tour guide. "The 32 | Saracens, Ottomans, Austri- 33 | ans . . . none of them harmed 34 | our city. But the Serbs, our 35 | neighbors, tried to destroy us, 36 | and we won't forget." 
He was 37 | drunkenly attacking the Serbs 38 | when I entered the Troubador 39 | Caf~~, housed in a crowded bar 40 | in a 14th-century stone build- 41 | ing just off Placa, Dubrovnik's 42 | main boulevard. 43 | 44 | 45 | You hear a lot about revenge in Croa- 46 | tia-~people want to get even, and they 47 | want their land back. (The Serbs now con- 48 | trol a third of Croatia, a slender shard of a 49 | state with Dubrovnik near its southern 50 | tip.) This doesn't augur well for peace or 51 | for the return of tourists, which Dubrovnik 52 | 53 | 54 | needs. Tourism accounted for 80 percent of the city's 55 | prewar economy. 56 | Vinko Milutinovitch turned his battered face from 57 | the wheel of his beat-up Volvo taxi and said, "There 58 | were one hundred and twenty taxis in Dubrovnik. Now 59 | there are only seven, and business is still lousy. You're 60 | the first tourist I've driven in two years." Like 70 percent 61 | of Dubrovnik's inhabitants, he depends on Red Cross 62 | food parcels to survive. Once a month, any person in 63 | 64 | 65 | need receives flour, sugar, margarine, oil, cans of beef, 66 | mackerel oil and, sometimes, clothing. 67 | One question I asked every official I met was, "If you 68 | were an American, would you bring your family here 69 | on vacation?" Most answered, "No, not yet." It's just a 70 | matter of time though. The city is rebuilding rapidly, 71 | repairing red-tiled roofs, clearing debris. As of last 72 | March, only one hotel was open, the classic Argentina, 73 | 74 | 75 | but several thousand beds 76 | have become available in 77 | other hotels and private 78 | homes. A few travel compa- 79 | nies are returning: Austin, 80 | Texas-~based Sterling Cruises 81 | and Tours scheduled several 82 | Dalmatian coast trips with 83 | stops in Dubrovnik for the fall. 
84 | During a walk I saw trucks 85 | delivering 250,000 roof tiles 86 | from France; Caritas, the 87 | Catholic charity, transporting 88 | cement to fix a church; a Bel- 89 | gian company unloading Se- 90 | curit unbreakable glass. As a 91 | way of shaking a fist at the 92 | Serbs, the city historian Pro- 93 | fessor Ivo Dabeli~~ was putting 94 | the last touches on a tempo- 95 | rary exhibition of shells, rock- 96 | ets, grenades and bombs 97 | hurled at Dubrovnik by Serb 98 | gunners and warplanes. It took 99 | him 10 days to turn the fa- 100 | mous Nautika restaurant into 101 | a war museum. On the terrace, 102 | with its splendid view over the 103 | 104 | 105 | city's western walls and the Adriatic, he'd 106 | placed a collection of phosphorous shells 107 | and cluster bombs ("two hundred and 108 | forty-seven bomblets inside"). 109 | Are there still risks? One official said 110 | several grenades are hurled into the 111 | Dubrovnik region every few days. So far 112 | they've exploded harmlessly in fields, but 113 | 114 | 115 | those responsible for security take seriously a threat by 116 | General Ratko Mladi~~, commander of all Serb forces 117 | in Bosnia, who said he'd "make sure Dubrovnik has a 118 | nice tourist season." His nearest army base is only 15 119 | miles away in Trebinje, capital of the self-styled South 120 | Herzegovina Serb Republic. Serb fighters there are re- 121 | inforced by Russian mercenaries, according to Croa- 122 | tian officers, as well as the notorious White Eagles, 123 | fanatic Serb gunmen accused of raping, torturing and 124 | 125 | 126 | THE TERRA-COTTA 127 | ROOFS OF 128 | DUBROVNIK, 129 | DAMAGED DURING 130 | THE SHELLING, 131 | ARE MOSTLY 132 | RESTORED NOW. 
133 | 134 | 135 | 54 ~ T R A V E L & L E I S U R E S E P T E M B E R 1 9 9 3 136 | -------------------------------------------------------------------------------- /testing/8087_054.3B.uzn: -------------------------------------------------------------------------------- 1 | 1101 379 346 60 Text 2 | 262 574 999 453 Text 3 | 261 1026 568 1338 Text 4 | 277 2364 733 343 Text 5 | 268 2704 993 456 Text 6 | 1288 571 997 412 Text 7 | 1728 981 559 1347 Text 8 | 1531 2326 742 385 Text 9 | 1296 2710 987 439 Text 10 | 1053 2290 447 348 Caption 11 | 210 3187 859 61 Header/Footer 12 | -------------------------------------------------------------------------------- /testing/8087_054.3G.tif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tesseract-ocr/test/232ff181c66516116ec0e84c4963f70de15050fd/testing/8087_054.3G.tif -------------------------------------------------------------------------------- /testing/815.tif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tesseract-ocr/test/232ff181c66516116ec0e84c4963f70de15050fd/testing/815.tif -------------------------------------------------------------------------------- /testing/HelloGoogle.tif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tesseract-ocr/test/232ff181c66516116ec0e84c4963f70de15050fd/testing/HelloGoogle.tif -------------------------------------------------------------------------------- /testing/README.md: -------------------------------------------------------------------------------- 1 | ## testing 2 | 3 | #### Images and ground truth text required for testing Tesseract. 4 | 5 | A large number of test images are those provided by Google as part of 6 | original release of Tesseract for testing and as part of unittests 7 | for Tesseract 4.0.0. 
Original source of various examples used for testing 8 | were documented earlier in [FILES](https://github.com/tesseract-ocr/test/blob/master/testing/FILES). 9 | That information is now moved here. 10 | 11 | - hebrew.png - Sample from Hebrew OCR with Nikud project by Adi Oz and Vered Shani 12 | project URL - http://www.cs.bgu.ac.il/~nlpproj/hocr/ 13 | direct link to image - http://www.cs.bgu.ac.il/~nlpproj/hocr/images/image00.png 14 | 15 | - hebtypo.jpg - Sample from OCR and Hebrew on the Web project at Universiteit van Amsterdam 16 | project URL - http://cf.uba.uva.nl/en/collections/rosenthaliana/menasseh/hebtypo.html 17 | direct link to image - http://cf.uba.uva.nl/en/collections/rosenthaliana/menasseh/gif/hebtypo.jpg 18 | 19 | - DuTillet1004Pg2LG.jpg - Sample from Hebrew Matthew Project with parallel texts in Hebrew & Greek 20 | as well as English page/chapter labels with Arabic numerals - test with -l heb+grc+eng 21 | project URL - http://www.torahresource.com/Dutillet.html 22 | direct link to image - http://www.torahresource.com/DuTillet/DuTillet1004Pg2LG.jpg 23 | 24 | - hebrew-nikud-genesis-1-2.png - Genesis 1-2 Hebrew example from OCR forum 25 | forum post - https://community.logos.com/forums/p/16124/277997.aspx 26 | direct link to image - https://community.logos.com/cfs-filesystemfile.ashx/__key/CommunityServer.Discussions.Components.Files/77/4578.Gen.png 27 | 28 | 29 | 30 | 31 | 32 | #### Fonts required for unittests 33 | 34 | The following fonts should be copied to test/testing folder for stringrenderer_test to run. 
35 | 36 | - Arab 37 | https://packages.ubuntu.com/trusty/fonts-arabeyes - GNU General Public License 38 | 39 | - Lohit Hindi 40 | https://releases.pagure.org/lohit/lohit-hindi-ttf-2.4.3.tar.gz - GNU General Public License, version 2 41 | 42 | - UNBatang 43 | https://packages.ubuntu.com/trusty/fonts-unfonts-core - GPL-2 44 | 45 | - Verdana 46 | https://packages.ubuntu.com/trusty/ttf-mscorefonts-installer - /usr/share/doc/ttf-mscorefonts-installer/copyright 47 | -------------------------------------------------------------------------------- /testing/arabic.tif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tesseract-ocr/test/232ff181c66516116ec0e84c4963f70de15050fd/testing/arabic.tif -------------------------------------------------------------------------------- /testing/basicapitest.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | int main() 6 | { 7 | tesseract::TessBaseAPI *api = new tesseract::TessBaseAPI(); 8 | // Initialize tesseract-ocr with English 9 | if (api->Init("../../tessdata", "eng")) { 10 | fprintf(stderr, "Could not initialize tesseract.\n"); 11 | exit(1); 12 | } 13 | 14 | // Open input image with leptonica library 15 | Pix *image = pixRead("../test/testing/phototest.tif"); 16 | api->SetImage(image); 17 | // Get OCR result 18 | char *outText = api->GetUTF8Text(); 19 | printf("OCR output:\n%s", outText); 20 | 21 | // Destroy used object and release memory 22 | api->End(); 23 | delete api; 24 | delete [] outText; 25 | pixDestroy(&image); 26 | 27 | return 0; 28 | } 29 | -------------------------------------------------------------------------------- /testing/deslant.tif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tesseract-ocr/test/232ff181c66516116ec0e84c4963f70de15050fd/testing/deslant.tif 
-------------------------------------------------------------------------------- /testing/devatest-rotated-270.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tesseract-ocr/test/232ff181c66516116ec0e84c4963f70de15050fd/testing/devatest-rotated-270.png -------------------------------------------------------------------------------- /testing/devatest.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tesseract-ocr/test/232ff181c66516116ec0e84c4963f70de15050fd/testing/devatest.png -------------------------------------------------------------------------------- /testing/eng.unicharset: -------------------------------------------------------------------------------- 1 | 111 2 | NULL 0 NULL 0 3 | Joined 7 0,69,188,255,486,1218,0,30,486,1188 Latin 105 0 75 Joined # Joined [4a 6f 69 6e 65 64 ]a 4 | |Broken|0|1 f 0,69,186,255,892,2138,0,80,892,2058 Common 47 10 47 |Broken|0|1 # Broken 5 | d 3 57,65,216,255,88,174,0,28,100,200 Latin 26 0 3 d # d [64 ]a 6 | i 3 59,69,216,255,11,141,0,54,27,173 Latin 54 0 4 i # i [69 ]a 7 | f 3 0,68,216,255,54,175,0,42,55,193 Latin 52 0 5 f # f [66 ]a 8 | e 3 58,64,189,200,87,154,0,32,98,188 Latin 57 0 6 e # e [65 ]a 9 | r 3 59,68,186,202,58,173,0,40,69,180 Latin 53 0 7 r # r [72 ]a 10 | n 3 59,68,188,202,87,187,0,25,101,208 Latin 10 0 8 n # n [6e ]a 11 | t 3 58,66,206,254,57,167,0,47,59,180 Latin 21 0 9 t # t [74 ]a 12 | N 5 59,68,216,255,87,262,0,27,104,249 Latin 8 0 10 N # N [4e ]A 13 | w 3 59,68,187,195,108,235,0,32,117,286 Latin 74 0 11 w # w [77 ]a 14 | A 5 52,68,216,255,100,216,0,17,98,231 Latin 17 0 12 A # A [41 ]A 15 | c 3 58,64,192,200,80,153,0,36,88,178 Latin 27 0 13 c # c [63 ]a 16 | l 3 59,68,216,255,11,147,0,56,27,173 Latin 68 0 14 l # l [6c ]a 17 | s 3 58,65,192,200,78,147,0,30,91,173 Latin 23 0 15 s # s [73 ]a 18 | p 3 0,47,192,226,87,180,0,25,100,200 Latin 40 0 16 p # p [70 ]a 19 | a 3 
58,65,186,200,85,164,0,26,97,185 Latin 12 0 17 a # a [61 ]a 20 | g 3 0,43,188,212,88,176,0,32,100,210 Latin 91 0 18 g # g [67 ]a 21 | 2 8 30,69,194,255,80,160,0,27,97,173 Common 19 2 19 2 # 2 [32 ]0 22 | 3 8 0,66,196,255,84,158,0,32,103,173 Common 20 2 20 3 # 3 [33 ]0 23 | T 5 59,68,216,255,85,227,0,47,88,236 Latin 9 0 21 T # T [54 ]A 24 | o 3 58,66,188,200,87,151,0,32,98,185 Latin 42 0 22 o # o [6f ]a 25 | S 5 57,64,219,255,87,174,0,30,100,200 Latin 15 0 23 S # S [53 ]A 26 | v 3 59,68,187,197,84,173,0,32,84,218 Latin 96 0 24 v # v [76 ]a 27 | ~ 0 91,229,135,255,73,174,0,41,0,200 Common 25 10 25 ~ # ~ [7e ] 28 | D 5 59,68,216,255,93,230,0,27,107,236 Latin 3 0 26 D # D [44 ]A 29 | C 5 58,65,219,255,87,192,0,32,107,209 Latin 13 0 27 C # C [43 ]A 30 | h 3 59,68,216,255,87,187,0,25,101,208 Latin 41 0 28 h # h [68 ]a 31 | ' 10 148,225,216,255,11,51,0,97,36,173 Common 29 10 29 ' # ' [27 ]p 32 | 7 8 12,68,196,255,72,160,0,60,75,173 Common 30 2 30 7 # 7 [37 ]0 33 | « 10 26,133,148,235,63,279,0,35,71,281 Common 31 10 72 « # « [ab ]p 34 | : 10 58,85,141,221,11,69,0,67,38,173 Common 32 6 32 : # : [3a ]p 35 | # 10 37,84,200,255,99,221,0,41,109,266 Common 33 4 33 # # # [23 ]p 36 | 1 8 49,69,192,255,45,128,0,66,74,173 Common 34 2 34 1 # 1 [31 ]0 37 | Z 5 64,68,216,255,72,218,0,30,77,236 Latin 98 0 35 Z # Z [5a ]A 38 | _ 10 0,50,0,64,73,248,0,29,75,259 Common 36 10 36 _ # _ [5f ]p 39 | M 5 57,68,216,255,99,301,0,35,117,286 Latin 39 0 37 M # M [4d ]A 40 | u 3 57,65,187,202,85,184,0,39,100,208 Latin 48 0 38 u # u [75 ]a 41 | m 3 56,68,189,202,108,280,0,25,117,306 Latin 37 0 39 m # m [6d ]a 42 | P 5 57,68,216,255,87,225,0,32,97,230 Latin 16 0 40 P # P [50 ]A 43 | H 5 59,68,216,255,91,258,0,27,107,244 Latin 28 0 41 H # H [48 ]A 44 | O 5 57,64,219,255,91,209,0,34,106,233 Latin 22 0 42 O # O [4f ]A 45 | ( 10 0,64,216,255,42,118,0,97,61,173 Common 43 10 44 ( # ( [28 ]p 46 | ) 10 0,64,216,255,42,119,0,53,61,173 Common 44 10 43 ) # ) [29 ]p 47 | q 3 0,47,192,202,88,196,0,30,100,200 Latin 
89 0 45 q # q [71 ]a 48 | y 3 0,47,187,202,87,199,0,25,87,230 Latin 83 0 46 y # y [79 ]a 49 | | 0 0,67,216,255,8,73,0,80,50,173 Common 47 10 47 | # | [7c ] 50 | U 5 58,64,216,255,91,214,0,39,106,220 Latin 38 0 48 U # U [55 ]A 51 | 0 8 58,66,187,255,88,164,0,45,103,180 Common 49 2 49 0 # 0 [30 ]0 52 | % 10 27,67,205,255,105,257,0,49,117,288 Common 50 4 50 % # % [25 ]p 53 | x 3 59,68,187,201,85,189,0,25,84,218 Latin 106 0 51 x # x [78 ]a 54 | F 5 57,68,216,255,68,210,0,31,77,209 Latin 5 0 52 F # F [46 ]A 55 | R 5 57,68,216,255,88,227,0,27,104,232 Latin 7 0 53 R # R [52 ]A 56 | I 5 59,68,216,255,10,155,0,50,29,173 Latin 4 0 54 I # I [49 ]A 57 | , 10 14,46,79,115,17,78,0,58,30,173 Common 55 6 55 , # , [2c ]p 58 | ! 10 41,67,216,255,11,87,0,71,50,173 Common 56 10 56 ! # ! [21 ]p 59 | E 5 59,68,216,255,68,210,0,31,80,219 Latin 6 0 57 E # E [45 ]A 60 | b 3 58,64,216,255,87,180,0,25,100,200 Latin 64 0 58 b # b [62 ]a 61 | \ 10 0,67,219,255,28,250,0,71,62,261 Common 59 10 59 \ # \ [5c ]p 62 | 8 8 57,66,219,255,88,162,0,41,103,174 Common 60 2 60 8 # 8 [38 ]0 63 | ? 10 40,67,219,255,59,144,0,65,77,188 Common 61 10 61 ? # ? [3f ]p 64 | & 10 53,64,194,255,108,232,0,47,112,239 Common 62 10 62 & # & [26 ]p 65 | ; 10 14,56,131,221,17,93,0,58,38,173 Common 63 10 63 ; # ; [3b ]p 66 | B 5 62,68,216,255,91,227,0,27,106,227 Latin 58 0 64 B # B [42 ]A 67 | k 3 57,68,216,255,85,177,0,35,93,198 Latin 94 0 65 k # k [6b ]a 68 | - 10 105,161,122,175,49,176,0,43,56,215 Common 66 3 66 - # - [2d ]p 69 | > 0 29,102,173,255,78,184,0,50,90,256 Common 67 10 102 > # > [3e ] 70 | L 5 59,68,216,255,64,193,0,31,74,206 Latin 14 0 68 L # L [4c ]A 71 | . 10 26,67,73,112,13,51,0,67,30,173 Common 69 6 69 . # . 
[2e ]p 72 | — 10 110,155,132,167,126,297,0,23,136,298 Common 70 10 70 - # — [2014 ]p 73 | 4 8 0,68,198,255,93,161,0,41,96,173 Common 71 2 71 4 # 4 [34 ]0 74 | » 10 0,133,146,235,63,284,0,32,71,294 Common 72 10 31 » # » [bb ]p 75 | € 0 32,68,209,255,97,238,0,49,103,293 Common 73 4 73 € # € [20ac ] 76 | W 5 54,68,216,255,106,314,0,41,117,318 Latin 11 0 74 W # W [57 ]A 77 | J 5 0,64,216,255,39,242,0,30,62,234 Latin 105 0 75 J # J [4a ]A 78 | é 3 0,64,222,255,87,384,0,32,98,391 Latin 76 0 76 é # é [e9 ]a 79 | 9 8 0,66,200,255,89,156,0,39,104,173 Common 77 2 77 9 # 9 [39 ]0 80 | ® 0 28,163,209,255,83,223,0,48,92,257 Common 78 10 78 ® # ® [ae ] 81 | $ 0 24,63,229,255,85,174,0,36,106,174 Common 79 4 79 $ # $ [24 ] 82 | 5 8 12,66,199,255,82,160,0,36,103,173 Common 80 2 80 5 # 5 [35 ]0 83 | } 10 0,44,216,255,54,148,0,56,59,173 Common 81 10 86 } # } [7d ]p 84 | [ 10 8,64,216,255,39,136,0,80,55,173 Common 82 10 107 [ # [ [5b ]p 85 | Y 5 59,68,216,255,91,205,0,47,91,223 Latin 46 0 83 Y # Y [59 ]A 86 | § 0 9,66,219,255,82,207,0,86,93,293 Common 84 10 84 § # § [a7 ] 87 | " 10 151,225,216,255,52,115,0,71,71,173 Common 85 10 85 " # " [22 ]p 88 | { 10 0,44,216,255,54,148,0,71,59,173 Common 86 10 81 { # { [7b ]p 89 | ¢ 0 14,158,190,255,56,144,0,72,61,270 Common 87 4 87 ¢ # ¢ [a2 ] 90 | / 10 0,65,219,255,59,228,0,36,62,238 Common 88 6 88 / # / [2f ]p 91 | Q 5 7,64,219,255,91,205,0,30,106,227 Latin 45 0 89 Q # Q [51 ]A 92 | 6 8 58,66,219,255,87,156,0,54,104,173 Common 90 2 90 6 # 6 [36 ]0 93 | G 5 58,64,219,255,91,230,0,30,106,230 Latin 18 0 91 G # G [47 ]A 94 | ” 10 141,233,216,255,59,141,0,87,66,298 Common 92 10 92 " # ” [201d ]p 95 | ° 0 66,247,209,255,22,399,0,98,66,409 Common 93 4 93 ° # ° [b0 ] 96 | K 5 57,68,216,255,92,225,0,37,103,216 Latin 65 0 94 K # K [4b ]A 97 | ¥ 0 59,75,209,255,91,238,0,52,91,270 Common 95 4 95 ¥ # ¥ [a5 ] 98 | V 5 59,68,216,255,103,207,0,41,101,245 Latin 24 0 96 V # V [56 ]A 99 | © 0 28,125,209,255,118,232,0,32,119,257 Common 97 10 97 © # © [a9 ] 100 | 
z 3 46,68,186,199,65,151,0,32,68,173 Latin 35 0 98 z # z [7a ]a 101 | + 0 54,102,171,253,90,176,0,37,103,213 Common 99 3 99 + # + [2b ] 102 | = 0 74,139,144,199,90,186,0,32,103,224 Common 100 10 100 = # = [3d ] 103 | £ 0 0,135,219,255,64,201,0,55,61,298 Common 101 4 101 £ # £ [a3 ] 104 | < 0 29,102,173,255,69,184,0,50,90,256 Common 102 10 67 < # < [3c ] 105 | ’ 10 141,233,212,255,17,78,0,109,30,298 Common 103 10 103 ' # ’ [2019 ]p 106 | ‘ 10 141,233,210,255,17,64,0,216,30,298 Common 104 10 104 ' # ‘ [2018 ]p 107 | j 3 0,47,216,255,36,145,0,49,50,173 Latin 75 0 105 j # j [6a ]a 108 | X 5 59,68,216,255,94,275,0,25,93,256 Latin 51 0 106 X # X [58 ]A 109 | ] 10 8,64,216,255,39,129,0,44,55,173 Common 107 10 82 ] # ] [5d ]p 110 | * 10 78,183,188,255,49,134,0,60,53,173 Common 108 10 108 * # * [2a ]p 111 | “ 10 141,233,216,255,56,133,0,172,66,298 Common 109 10 109 " # “ [201c ]p 112 | @ 10 0,65,211,255,99,286,0,39,117,291 Common 110 10 110 @ # @ [40 ]p 113 | -------------------------------------------------------------------------------- /testing/eng.wordlist.clean.freq: -------------------------------------------------------------------------------- 1 | I 2 | In 3 | It 4 | other 5 | on 6 | only 7 | one 8 | our 9 | out 10 | of 11 | or 12 | it 13 | its 14 | in 15 | is 16 | if 17 | no 18 | not 19 | new 20 | | 21 | - 22 | Search 23 | Show 24 | : 25 | a 26 | about 27 | also 28 | all 29 | an 30 | and 31 | any 32 | are 33 | as 34 | at 35 | more 36 | may 37 | my 38 | For 39 | up 40 | Business 41 | » 42 | first 43 | from 44 | for 45 | do 46 | can 47 | have 48 | has 49 | had 50 | his 51 | he 52 | Contact 53 | the 54 | their 55 | they 56 | that 57 | than 58 | this 59 | to 60 | Links 61 | This 62 | The 63 | To 64 | More 65 | My 66 | you 67 | your 68 | like 69 | New 70 | News 71 | be 72 | been 73 | but 74 | by 75 | . 
76 | Home 77 | Help 78 | which 79 | when 80 | who 81 | would 82 | with 83 | will 84 | we 85 | were 86 | was 87 | View 88 | & 89 | A 90 | About 91 | All 92 | Add 93 | Page 94 | / 95 | > 96 | Us 97 | You 98 | Your 99 | -------------------------------------------------------------------------------- /testing/eng.xheights: -------------------------------------------------------------------------------- 1 | AR_PL_UKai_CN 57 2 | AR_PL_UKai_Patched 57 3 | AR_PL_UMing_CN_Light 57 4 | AR_PL_UMing_Patched_Light 57 5 | Aboriginal_Sans 66 6 | Aboriginal_Serif 63 7 | Andale_Mono 70 8 | Arial 70 9 | BPG_Chveulebrivi 71 10 | BPG_Courier 75 11 | BPG_Elite 72 12 | BPG_Glaho 75 13 | BPG_Rioni 75 14 | BPG_Unicode_Standard 75 15 | Baekmuk_Batang 58 16 | Baekmuk_Batang_Patched 58 17 | Baekmuk_Dotum 66 18 | Baekmuk_Gulim 70 19 | Century_Schoolbook_L 64 20 | Comic_Sans_MS 72 21 | Courier_New 59 22 | DejaVu_Sans_Mono 75 23 | DejaVu_Sans_Ultra-Light 75 24 | DejaVu_Serif_Semi-Condensed 71 25 | DejaVu_Serif 71 26 | East_Syriac_Adiabene 51 27 | Estrangelo_Antioch 51 28 | Estrangelo_Midyat 51 29 | Estrangelo_Nisibin 51 30 | Estrangelo_Quenneshrin 51 31 | Estrangelo_Talada 51 32 | Estrangelo_TurAbdin 51 33 | FreeMono 60 34 | FreeSans 74 35 | FreeSerif 64 36 | Garuda 71 37 | GentiumAlt 65 38 | Georgia 66 39 | Khmer_OS_Battambang 70 40 | Khmer_OS_Content 70 41 | Khmer_OS_Muol_Light 70 42 | Khmer_OS_Muol 70 43 | Khmer_OS_Siemreap 75 44 | Khmer_OS_System 70 45 | Khmer_OS 70 46 | Kochi_Gothic 71 47 | Kochi_Mincho 62 48 | Lucida_Bright 72 49 | Lucida_Sans_Typewriter 73 50 | Lucida_Sans 71 51 | Monapo 71 52 | Padauk_Bold 58 53 | Padauk 58 54 | Sazanami_Gothic 72 55 | Sazanami_Mincho 62 56 | Serto_Batnan_Bold 51 57 | Serto_Batnan 52 58 | Serto_Jerusalem_Bold 52 59 | Serto_Jerusalem_Italic 52 60 | Serto_Jerusalem 52 61 | Serto_Kharput 51 62 | Serto_Malankara 51 63 | Serto_Mardin_Bold 52 64 | Serto_Mardin 52 65 | Serto_Urhoy 51 66 | TSCu_Comic 50 67 | TakaoExGothic 69 68 | TakaoExMincho 66 69 | 
TakaoGothic 63 70 | TakaoMincho 65 71 | TakaoPGothic 68 72 | TakaoPMincho 66 73 | Times_New_Roman 64 74 | Trebuchet_MS 72 75 | URW_Bookman_L 66 76 | UmePlus_P_Gothic 71 77 | VL_Gothic 73 78 | VL_PGothic 73 79 | Verdana 75 80 | Wyld 58 81 | Aboriginal_Sans_Italic 66 82 | Aboriginal_Serif_Italic 65 83 | Arial_Italic 70 84 | Century_Schoolbook_L_Italic 65 85 | Courier_New_Italic 59 86 | DejaVu_Sans_Mono_Oblique 75 87 | DejaVu_Serif_Oblique 71 88 | FreeMono_Italic 59 89 | FreeSans_Oblique 74 90 | FreeSerif_Italic 61 91 | Garuda_Oblique 71 92 | GentiumAlt_Italic 63 93 | Georgia_Italic 68 94 | Lucida_Sans_Typewriter_Oblique 73 95 | Lucida_Sans_Oblique 71 96 | Serto_Jerusalem_Italic 52 97 | Times_New_Roman_Italic 60 98 | Trebuchet_MS_Italic 72 99 | URW_Bookman_L_Italic 67 100 | Verdana_Italic 75 101 | Wyld_Italic 59 102 | Aboriginal_Serif_Bold 63 103 | Arial_Black 71 104 | Arial_Bold 71 105 | BPG_Chveulebrivi_Bold 72 106 | BPG_Courier_Bold 75 107 | BPG_Elite_Bold 73 108 | BPG_Glaho_Bold 75 109 | BPG_Rioni_Bold 75 110 | Baekmuk_Headline 67 111 | Century_Schoolbook_L_Bold 65 112 | Comic_Sans_MS_Bold 72 113 | Courier_New_Bold 59 114 | DejaVu_Sans_Mono_Bold 75 115 | DejaVu_Serif_Bold 71 116 | DejaVu_Serif_Bold_Semi-Condensed 71 117 | FreeMono_Bold 60 118 | FreeSans_Bold 76 119 | FreeSerif_Bold 65 120 | Garuda_Bold 70 121 | Georgia_Bold 66 122 | Impact_Condensed 88 123 | Lucida_Sans_Semi-Bold 73 124 | Lucida_Sans_Typewriter_Bold 73 125 | Serto_Batnan_Bold 51 126 | Serto_Jerusalem_Bold 52 127 | Serto_Mardin_Bold 52 128 | Serto_Urhoy_Bold 51 129 | Times_New_Roman_Bold 64 130 | Trebuchet_MS_Bold 72 131 | URW_Bookman_L_Bold 68 132 | Verdana_Bold 75 133 | Aboriginal_Sans_Bold_Italic 66 134 | Aboriginal_Serif_Bold_Italic 65 135 | Arial_Bold_Italic 71 136 | Century_Schoolbook_L_Bold_Italic 66 137 | Courier_New_Bold_Italic 59 138 | DejaVu_Sans_Mono_Bold_Oblique 75 139 | DejaVu_Serif_Bold_Oblique 71 140 | FreeMono_Bold_Italic 60 141 | FreeSans_Bold_Oblique 74 142 | 
FreeSerif_Bold_Italic 64 143 | Garuda_Bold_Oblique 71 144 | Georgia_Bold_Italic 68 145 | Lucida_Sans_Typewriter_Bold_Oblique 73 146 | Times_New_Roman_Bold_Italic 64 147 | Trebuchet_MS_Bold_Italic 72 148 | URW_Bookman_L_Bold_Italic 70 149 | Verdana_Bold_Italic 75 150 | -------------------------------------------------------------------------------- /testing/eng_beam.unicharset: -------------------------------------------------------------------------------- 1 | 113 2 | NULL 0 Common 0 3 | Joined 7 0,255,0,255,0,0,0,0,0,0 Latin 1 0 1 Joined # Joined [4a 6f 69 6e 65 64 ]a 4 | |Broken|0|1 f 0,255,0,255,0,0,0,0,0,0 Common 2 10 2 |Broken|0|1 # Broken 5 | G 5 0,255,0,255,0,0,0,0,0,0 Latin 62 0 3 G # G [47 ]A 6 | r 3 0,255,0,255,0,0,0,0,0,0 Latin 69 0 4 r # r [72 ]a 7 | a 3 0,255,0,255,0,0,0,0,0,0 Latin 47 0 5 a # a [61 ]a 8 | s 3 0,255,0,255,0,0,0,0,0,0 Latin 38 0 6 s # s [73 ]a 9 | l 3 0,255,0,255,0,0,0,0,0,0 Latin 29 0 7 l # l [6c ]a 10 | n 3 0,255,0,255,0,0,0,0,0,0 Latin 56 0 8 n # n [6e ]a 11 | d 3 0,255,0,255,0,0,0,0,0,0 Latin 39 0 9 d # d [64 ]a 12 | . 10 0,255,0,255,0,0,0,0,0,0 Common 10 6 10 . # . 
[2e ]p 13 | B 5 0,255,0,255,0,0,0,0,0,0 Latin 52 0 11 B # B [42 ]A 14 | C 5 0,255,0,255,0,0,0,0,0,0 Latin 35 0 12 C # C [43 ]A 15 | O 5 0,255,0,255,0,0,0,0,0,0 Latin 28 0 13 O # O [4f ]A 16 | W 5 0,255,0,255,0,0,0,0,0,0 Latin 61 0 14 W # W [57 ]A 17 | Y 5 0,255,0,255,0,0,0,0,0,0 Latin 48 0 15 Y # Y [59 ]A 18 | , 10 0,255,0,255,0,0,0,0,0,0 Common 16 6 16 , # , [2c ]p 19 | ( 10 0,255,0,255,0,0,0,0,0,0 Common 17 10 22 ( # ( [28 ]p 20 | u 3 0,255,0,255,0,0,0,0,0,0 Latin 82 0 18 u # u [75 ]a 21 | z 3 0,255,0,255,0,0,0,0,0,0 Latin 90 0 19 z # z [7a ]a 22 | i 3 0,255,0,255,0,0,0,0,0,0 Latin 57 0 20 i # i [69 ]a 23 | e 3 0,255,0,255,0,0,0,0,0,0 Latin 44 0 21 e # e [65 ]a 24 | ) 10 0,255,0,255,0,0,0,0,0,0 Common 22 10 17 ) # ) [29 ]p 25 | 1 8 0,255,0,255,0,0,0,0,0,0 Common 23 2 23 1 # 1 [31 ]0 26 | 9 8 0,255,0,255,0,0,0,0,0,0 Common 24 2 24 9 # 9 [39 ]0 27 | 2 8 0,255,0,255,0,0,0,0,0,0 Common 25 2 25 2 # 2 [32 ]0 28 | - 10 0,255,0,255,0,0,0,0,0,0 Common 26 3 26 - # - [2d ]p 29 | 6 8 0,255,0,255,0,0,0,0,0,0 Common 27 2 27 6 # 6 [36 ]0 30 | o 3 0,255,0,255,0,0,0,0,0,0 Latin 13 0 28 o # o [6f ]a 31 | L 5 0,255,0,255,0,0,0,0,0,0 Latin 7 0 29 L # L [4c ]A 32 | P 5 0,255,0,255,0,0,0,0,0,0 Latin 60 0 30 P # P [50 ]A 33 | ' 10 0,255,0,255,0,0,0,0,0,0 Common 31 10 31 ' # ' [27 ]p 34 | t 3 0,255,0,255,0,0,0,0,0,0 Latin 58 0 32 t # t [74 ]a 35 | m 3 0,255,0,255,0,0,0,0,0,0 Latin 42 0 33 m # m [6d ]a 36 | K 5 0,255,0,255,0,0,0,0,0,0 Latin 36 0 34 K # K [4b ]A 37 | c 3 0,255,0,255,0,0,0,0,0,0 Latin 12 0 35 c # c [63 ]a 38 | k 3 0,255,0,255,0,0,0,0,0,0 Latin 34 0 36 k # k [6b ]a 39 | V 5 0,255,0,255,0,0,0,0,0,0 Latin 72 0 37 V # V [56 ]A 40 | S 5 0,255,0,255,0,0,0,0,0,0 Latin 6 0 38 S # S [53 ]A 41 | D 5 0,255,0,255,0,0,0,0,0,0 Latin 9 0 39 D # D [44 ]A 42 | J 5 0,255,0,255,0,0,0,0,0,0 Latin 77 0 40 J # J [4a ]A 43 | h 3 0,255,0,255,0,0,0,0,0,0 Latin 63 0 41 h # h [68 ]a 44 | M 5 0,255,0,255,0,0,0,0,0,0 Latin 33 0 42 M # M [4d ]A 45 | x 3 0,255,0,255,0,0,0,0,0,0 Latin 91 0 43 x # x [78 
]a 46 | E 5 0,255,0,255,0,0,0,0,0,0 Latin 21 0 44 E # E [45 ]A 47 | q 3 0,255,0,255,0,0,0,0,0,0 Latin 88 0 45 q # q [71 ]a 48 | ; 10 0,255,0,255,0,0,0,0,0,0 Common 46 10 46 ; # ; [3b ]p 49 | A 5 0,255,0,255,0,0,0,0,0,0 Latin 5 0 47 A # A [41 ]A 50 | y 3 0,255,0,255,0,0,0,0,0,0 Latin 15 0 48 y # y [79 ]a 51 | f 3 0,255,0,255,0,0,0,0,0,0 Latin 65 0 49 f # f [66 ]a 52 | 5 8 0,255,0,255,0,0,0,0,0,0 Common 50 2 50 5 # 5 [35 ]0 53 | 7 8 0,255,0,255,0,0,0,0,0,0 Common 51 2 51 7 # 7 [37 ]0 54 | b 3 0,255,0,255,0,0,0,0,0,0 Latin 11 0 52 b # b [62 ]a 55 | 4 8 0,255,0,255,0,0,0,0,0,0 Common 53 2 53 4 # 4 [34 ]0 56 | 0 8 0,255,0,255,0,0,0,0,0,0 Common 54 2 54 0 # 0 [30 ]0 57 | 3 8 0,255,0,255,0,0,0,0,0,0 Common 55 2 55 3 # 3 [33 ]0 58 | N 5 0,255,0,255,0,0,0,0,0,0 Latin 8 0 56 N # N [4e ]A 59 | I 5 0,255,0,255,0,0,0,0,0,0 Latin 20 0 57 I # I [49 ]A 60 | T 5 0,255,0,255,0,0,0,0,0,0 Latin 32 0 58 T # T [54 ]A 61 | / 10 0,255,0,255,0,0,0,0,0,0 Common 59 6 59 / # / [2f ]p 62 | p 3 0,255,0,255,0,0,0,0,0,0 Latin 30 0 60 p # p [70 ]a 63 | w 3 0,255,0,255,0,0,0,0,0,0 Latin 14 0 61 w # w [77 ]a 64 | g 3 0,255,0,255,0,0,0,0,0,0 Latin 3 0 62 g # g [67 ]a 65 | H 5 0,255,0,255,0,0,0,0,0,0 Latin 41 0 63 H # H [48 ]A 66 | “ 10 0,255,0,255,0,0,0,0,0,0 Common 64 10 64 " # “ [201c ]p 67 | F 5 0,255,0,255,0,0,0,0,0,0 Latin 49 0 65 F # F [46 ]A 68 | ” 10 0,255,0,255,0,0,0,0,0,0 Common 66 10 66 " # ” [201d ]p 69 | " 10 0,255,0,255,0,0,0,0,0,0 Common 67 10 67 " # " [22 ]p 70 | ’ 10 0,255,0,255,0,0,0,0,0,0 Common 68 10 68 ' # ’ [2019 ]p 71 | R 5 0,255,0,255,0,0,0,0,0,0 Latin 4 0 69 R # R [52 ]A 72 | — 10 0,255,0,255,0,0,0,0,0,0 Common 70 10 70 - # — [2014 ]p 73 | 8 8 0,255,0,255,0,0,0,0,0,0 Common 71 2 71 8 # 8 [38 ]0 74 | v 3 0,255,0,255,0,0,0,0,0,0 Latin 37 0 72 v # v [76 ]a 75 | ? 10 0,255,0,255,0,0,0,0,0,0 Common 73 10 73 ? # ? 
[3f ]p 76 | é 3 0,255,0,255,0,0,0,0,0,0 Latin 74 0 74 é # é [e9 ]a 77 | % 10 0,255,0,255,0,0,0,0,0,0 Common 75 4 75 % # % [25 ]p 78 | : 10 0,255,0,255,0,0,0,0,0,0 Common 76 6 76 : # : [3a ]p 79 | j 3 0,255,0,255,0,0,0,0,0,0 Latin 40 0 77 j # j [6a ]a 80 | \ 10 0,255,0,255,0,0,0,0,0,0 Common 78 10 78 \ # \ [5c ]p 81 | { 10 0,255,0,255,0,0,0,0,0,0 Common 79 10 80 { # { [7b ]p 82 | } 10 0,255,0,255,0,0,0,0,0,0 Common 80 10 79 } # } [7d ]p 83 | | 0 0,255,0,255,0,0,0,0,0,0 Common 81 10 81 | # | [7c ] 84 | U 5 0,255,0,255,0,0,0,0,0,0 Latin 18 0 82 U # U [55 ]A 85 | $ 0 0,255,0,255,0,0,0,0,0,0 Common 83 4 83 $ # $ [24 ] 86 | ° 0 0,255,0,255,0,0,0,0,0,0 Common 84 4 84 ° # ° [b0 ] 87 | * 10 0,255,0,255,0,0,0,0,0,0 Common 85 10 85 * # * [2a ]p 88 | ! 10 0,255,0,255,0,0,0,0,0,0 Common 86 10 86 ! # ! [21 ]p 89 | ] 10 0,255,0,255,0,0,0,0,0,0 Common 87 10 92 ] # ] [5d ]p 90 | Q 5 0,255,0,255,0,0,0,0,0,0 Latin 45 0 88 Q # Q [51 ]A 91 | ‘ 10 0,255,0,255,0,0,0,0,0,0 Common 89 10 89 ' # ‘ [2018 ]p 92 | Z 5 0,255,0,255,0,0,0,0,0,0 Latin 19 0 90 Z # Z [5a ]A 93 | X 5 0,255,0,255,0,0,0,0,0,0 Latin 43 0 91 X # X [58 ]A 94 | [ 10 0,255,0,255,0,0,0,0,0,0 Common 92 10 87 [ # [ [5b ]p 95 | = 0 0,255,0,255,0,0,0,0,0,0 Common 93 10 93 = # = [3d ] 96 | + 0 0,255,0,255,0,0,0,0,0,0 Common 94 3 94 + # + [2b ] 97 | § 10 0,255,0,255,0,0,0,0,0,0 Common 95 10 95 § # § [a7 ]p 98 | _ 10 0,255,0,255,0,0,0,0,0,0 Common 96 10 96 _ # _ [5f ]p 99 | £ 0 0,255,0,255,0,0,0,0,0,0 Common 97 4 97 £ # £ [a3 ] 100 | & 10 0,255,0,255,0,0,0,0,0,0 Common 98 10 98 & # & [26 ]p 101 | # 10 0,255,0,255,0,0,0,0,0,0 Common 99 4 99 # # # [23 ]p 102 | > 0 0,255,0,255,0,0,0,0,0,0 Common 100 10 101 > # > [3e ] 103 | < 0 0,255,0,255,0,0,0,0,0,0 Common 101 10 100 < # < [3c ] 104 | ~ 0 0,255,0,255,0,0,0,0,0,0 Common 102 10 102 ~ # ~ [7e ] 105 | € 0 0,255,0,255,0,0,0,0,0,0 Common 103 4 103 € # € [20ac ] 106 | @ 10 0,255,0,255,0,0,0,0,0,0 Common 104 10 104 @ # @ [40 ]p 107 | ¢ 0 0,255,0,255,0,0,0,0,0,0 Common 105 4 105 ¢ # ¢ [a2 ] 
108 | » 10 0,255,0,255,0,0,0,0,0,0 Common 106 10 107 » # » [bb ]p 109 | « 10 0,255,0,255,0,0,0,0,0,0 Common 107 10 106 « # « [ab ]p 110 | fl 3 0,255,0,255,0,0,0,0,0,0 Latin 108 0 108 fl # fl [fb02 ]a 111 | fi 3 0,255,0,255,0,0,0,0,0,0 Latin 109 0 109 fi # fi [fb01 ]a 112 | ® 0 0,255,0,255,0,0,0,0,0,0 Common 110 10 110 ® # ® [ae ] 113 | © 0 0,255,0,255,0,0,0,0,0,0 Common 111 10 111 © # © [a9 ] 114 | ¥ 0 0,255,0,255,0,0,0,0,0,0 Common 112 4 112 ¥ # ¥ [a5 ] 115 | -------------------------------------------------------------------------------- /testing/eurotext.tif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tesseract-ocr/test/232ff181c66516116ec0e84c4963f70de15050fd/testing/eurotext.tif -------------------------------------------------------------------------------- /testing/eurotext.txt: -------------------------------------------------------------------------------- 1 | The (quick) [brown] {fox} jumps! 2 | Over the $43,456.78 #90 dog 3 | & duck/goose, as 12.5% of E-mail 4 | from aspammer@website.com is spam. 5 | Der „schnelle” braune Fuchs springt 6 | über den faulen Hund. Le renard brun 7 | «rapide» saute par-dessus le chien 8 | paresseux. La volpe marrone rapida 9 | salta sopra il cane pigro. El zorro 10 | marrón rápido salta sobre el perro 11 | perezoso. A raposa marrom rápida 12 | salta sobre o cão preguiçoso. 
13 | -------------------------------------------------------------------------------- /testing/hebrew-nikud-genesis-1-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tesseract-ocr/test/232ff181c66516116ec0e84c4963f70de15050fd/testing/hebrew-nikud-genesis-1-2.png -------------------------------------------------------------------------------- /testing/hebrew.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tesseract-ocr/test/232ff181c66516116ec0e84c4963f70de15050fd/testing/hebrew.png -------------------------------------------------------------------------------- /testing/hebtypo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tesseract-ocr/test/232ff181c66516116ec0e84c4963f70de15050fd/testing/hebtypo.jpg -------------------------------------------------------------------------------- /testing/hin_beam.unicharset: -------------------------------------------------------------------------------- 1 | 123 2 | NULL 0 Common 0 3 | Joined 7 0,255,0,255,0,0,0,0,0,0 Latin 1 0 1 Joined # Joined [4a 6f 69 6e 65 64 ]a 4 | |Broken|0|1 f 0,255,0,255,0,0,0,0,0,0 Common 2 10 2 |Broken|0|1 # Broken 5 | / 10 0,255,0,255,0,0,0,0,0,0 Common 3 6 3 / # / [2f ]p 6 | ( 10 0,255,0,255,0,0,0,0,0,0 Common 4 10 31 ( # ( [28 ]p 7 | व 1 0,255,0,255,0,0,0,0,0,0 Devanagari 5 0 5 व # व [935 ]x 8 | ि 0 0,255,0,255,0,0,0,0,0,0 Devanagari 6 0 6 ि # ि [93f ] 9 | श 1 0,255,0,255,0,0,0,0,0,0 Devanagari 7 0 7 श # श [936 ]x 10 | ल 1 0,255,0,255,0,0,0,0,0,0 Devanagari 8 0 8 ल # ल [932 ]x 11 | य 1 0,255,0,255,0,0,0,0,0,0 Devanagari 9 0 9 य # य [92f ]x 12 | े 0 0,255,0,255,0,0,0,0,0,0 Devanagari 10 17 10 े # े [947 ] 13 | # 10 0,255,0,255,0,0,0,0,0,0 Common 11 4 11 # # # [23 ]p 14 | ! 10 0,255,0,255,0,0,0,0,0,0 Common 12 10 12 ! # ! 
[21 ]p 15 | १ 8 0,255,0,255,0,0,0,0,0,0 Devanagari 13 0 13 १ # १ [967 ]0 16 | ४ 8 0,255,0,255,0,0,0,0,0,0 Devanagari 14 0 14 ४ # ४ [96a ]0 17 | . 10 0,255,0,255,0,0,0,0,0,0 Common 15 6 15 . # . [2e ]p 18 | अ 1 0,255,0,255,0,0,0,0,0,0 Devanagari 16 0 16 अ # अ [905 ]x 19 | र 1 0,255,0,255,0,0,0,0,0,0 Devanagari 17 0 17 र # र [930 ]x 20 | ब 1 0,255,0,255,0,0,0,0,0,0 Devanagari 18 0 18 ब # ब [92c ]x 21 | ी 0 0,255,0,255,0,0,0,0,0,0 Devanagari 19 0 19 ी # ी [940 ] 22 | , 10 0,255,0,255,0,0,0,0,0,0 Common 20 6 20 , # , [2c ]p 23 | म 1 0,255,0,255,0,0,0,0,0,0 Devanagari 21 0 21 म # म [92e ]x 24 | क 1 0,255,0,255,0,0,0,0,0,0 Devanagari 22 0 22 क # क [915 ]x 25 | ध 1 0,255,0,255,0,0,0,0,0,0 Devanagari 23 0 23 ध # ध [927 ]x 26 | ो 0 0,255,0,255,0,0,0,0,0,0 Devanagari 24 0 24 ो # ो [94b ] 27 | ा 0 0,255,0,255,0,0,0,0,0,0 Devanagari 25 0 25 ा # ा [93e ] 28 | आ 1 0,255,0,255,0,0,0,0,0,0 Devanagari 26 0 26 आ # आ [906 ]x 29 | ई 1 0,255,0,255,0,0,0,0,0,0 Devanagari 27 0 27 ई # ई [908 ]x 30 | ए 1 0,255,0,255,0,0,0,0,0,0 Devanagari 28 0 28 ए # ए [90f ]x 31 | स 1 0,255,0,255,0,0,0,0,0,0 Devanagari 29 0 29 स # स [938 ]x 32 | ज 1 0,255,0,255,0,0,0,0,0,0 Devanagari 30 0 30 ज # ज [91c ]x 33 | ) 10 0,255,0,255,0,0,0,0,0,0 Common 31 10 4 ) # ) [29 ]p 34 | च 1 0,255,0,255,0,0,0,0,0,0 Devanagari 32 0 32 च # च [91a ]x 35 | ग 1 0,255,0,255,0,0,0,0,0,0 Devanagari 33 0 33 ग # ग [917 ]x 36 | ् 0 0,255,0,255,0,0,0,0,0,0 Devanagari 34 17 34 ् # ् [94d ] 37 | न 1 0,255,0,255,0,0,0,0,0,0 Devanagari 35 0 35 न # न [928 ]x 38 | ु 0 0,255,0,255,0,0,0,0,0,0 Devanagari 36 17 36 ु # ु [941 ] 39 | 6 8 0,255,0,255,0,0,0,0,0,0 Common 37 2 37 6 # 6 [36 ]0 40 | ख 1 0,255,0,255,0,0,0,0,0,0 Devanagari 38 0 38 ख # ख [916 ]x 41 | प 1 0,255,0,255,0,0,0,0,0,0 Devanagari 39 0 39 प # प [92a ]x 42 | द 1 0,255,0,255,0,0,0,0,0,0 Devanagari 40 0 40 द # द [926 ]x 43 | ं 0 0,255,0,255,0,0,0,0,0,0 Devanagari 41 17 41 ं # ं [902 ] 44 | ू 0 0,255,0,255,0,0,0,0,0,0 Devanagari 42 17 42 ू # ू [942 ] 45 | त 1 0,255,0,255,0,0,0,0,0,0 
Devanagari 43 0 43 त # त [924 ]x 46 | - 10 0,255,0,255,0,0,0,0,0,0 Common 44 3 44 - # - [2d ]p 47 | ट 1 0,255,0,255,0,0,0,0,0,0 Devanagari 45 0 45 ट # ट [91f ]x 48 | ; 10 0,255,0,255,0,0,0,0,0,0 Common 46 10 46 ; # ; [3b ]p 49 | थ 1 0,255,0,255,0,0,0,0,0,0 Devanagari 47 0 47 थ # थ [925 ]x 50 | भ 1 0,255,0,255,0,0,0,0,0,0 Devanagari 48 0 48 भ # भ [92d ]x 51 | उ 1 0,255,0,255,0,0,0,0,0,0 Devanagari 49 0 49 उ # उ [909 ]x 52 | ृ 0 0,255,0,255,0,0,0,0,0,0 Devanagari 50 17 50 ृ # ृ [943 ] 53 | 1 8 0,255,0,255,0,0,0,0,0,0 Common 51 2 51 1 # 1 [31 ]0 54 | 0 8 0,255,0,255,0,0,0,0,0,0 Common 52 2 52 0 # 0 [30 ]0 55 | " 10 0,255,0,255,0,0,0,0,0,0 Common 53 10 53 " # " [22 ]p 56 | झ 1 0,255,0,255,0,0,0,0,0,0 Devanagari 54 0 54 झ # झ [91d ]x 57 | ह 1 0,255,0,255,0,0,0,0,0,0 Devanagari 55 0 55 ह # ह [939 ]x 58 | ' 10 0,255,0,255,0,0,0,0,0,0 Common 56 10 56 ' # ' [27 ]p 59 | ड 1 0,255,0,255,0,0,0,0,0,0 Devanagari 57 0 57 ड # ड [921 ]x 60 | ष 1 0,255,0,255,0,0,0,0,0,0 Devanagari 58 0 58 ष # ष [937 ]x 61 | ण 1 0,255,0,255,0,0,0,0,0,0 Devanagari 59 0 59 ण # ण [923 ]x 62 | ॉ 0 0,255,0,255,0,0,0,0,0,0 Devanagari 60 0 60 ॉ # ॉ [949 ] 63 | 8 8 0,255,0,255,0,0,0,0,0,0 Common 61 2 61 8 # 8 [38 ]0 64 | 9 8 0,255,0,255,0,0,0,0,0,0 Common 62 2 62 9 # 9 [39 ]0 65 | 2 8 0,255,0,255,0,0,0,0,0,0 Common 63 2 63 2 # 2 [32 ]0 66 | > 0 0,255,0,255,0,0,0,0,0,0 Common 64 10 105 > # > [3e ] 67 | । 10 0,255,0,255,0,0,0,0,0,0 Common 65 0 65 । # । [964 ]p 68 | इ 1 0,255,0,255,0,0,0,0,0,0 Devanagari 66 0 66 इ # इ [907 ]x 69 | फ 1 0,255,0,255,0,0,0,0,0,0 Devanagari 67 0 67 फ # फ [92b ]x 70 | ै 0 0,255,0,255,0,0,0,0,0,0 Devanagari 68 17 68 ै # ै [948 ] 71 | ़ 0 0,255,0,255,0,0,0,0,0,0 Devanagari 69 17 69 ़ # ़ [93c ] 72 | ौ 0 0,255,0,255,0,0,0,0,0,0 Devanagari 70 0 70 ौ # ौ [94c ] 73 | ओ 1 0,255,0,255,0,0,0,0,0,0 Devanagari 71 0 71 ओ # ओ [913 ]x 74 | ठ 1 0,255,0,255,0,0,0,0,0,0 Devanagari 72 0 72 ठ # ठ [920 ]x 75 | 7 8 0,255,0,255,0,0,0,0,0,0 Common 73 2 73 7 # 7 [37 ]0 76 | : 10 0,255,0,255,0,0,0,0,0,0 
Common 74 6 74 : # : [3a ]p 77 | घ 1 0,255,0,255,0,0,0,0,0,0 Devanagari 75 0 75 घ # घ [918 ]x 78 | % 10 0,255,0,255,0,0,0,0,0,0 Common 76 4 76 % # % [25 ]p 79 | छ 1 0,255,0,255,0,0,0,0,0,0 Devanagari 77 0 77 छ # छ [91b ]x 80 | ” 10 0,255,0,255,0,0,0,0,0,0 Common 78 10 78 " # ” [201d ]p 81 | 3 8 0,255,0,255,0,0,0,0,0,0 Common 79 2 79 3 # 3 [33 ]0 82 | + 0 0,255,0,255,0,0,0,0,0,0 Common 80 3 80 + # + [2b ] 83 | “ 10 0,255,0,255,0,0,0,0,0,0 Common 81 10 81 " # “ [201c ]p 84 | २ 8 0,255,0,255,0,0,0,0,0,0 Devanagari 82 0 82 २ # २ [968 ]0 85 | ९ 8 0,255,0,255,0,0,0,0,0,0 Devanagari 83 0 83 ९ # ९ [96f ]0 86 | 5 8 0,255,0,255,0,0,0,0,0,0 Common 84 2 84 5 # 5 [35 ]0 87 | 4 8 0,255,0,255,0,0,0,0,0,0 Common 85 2 85 4 # 4 [34 ]0 88 | ऊ 1 0,255,0,255,0,0,0,0,0,0 Devanagari 86 0 86 ऊ # ऊ [90a ]x 89 | | 0 0,255,0,255,0,0,0,0,0,0 Common 87 10 87 | # | [7c ] 90 | ँ 0 0,255,0,255,0,0,0,0,0,0 Devanagari 88 17 88 ँ # ँ [901 ] 91 | ] 10 0,255,0,255,0,0,0,0,0,0 Common 89 10 99 ] # ] [5d ]p 92 | ऑ 1 0,255,0,255,0,0,0,0,0,0 Devanagari 90 0 90 ऑ # ऑ [911 ]x 93 | ० 8 0,255,0,255,0,0,0,0,0,0 Devanagari 91 0 91 ० # ० [966 ]0 94 | औ 1 0,255,0,255,0,0,0,0,0,0 Devanagari 92 0 92 औ # औ [914 ]x 95 | ६ 8 0,255,0,255,0,0,0,0,0,0 Devanagari 93 0 93 ६ # ६ [96c ]0 96 | ञ 1 0,255,0,255,0,0,0,0,0,0 Devanagari 94 0 94 ञ # ञ [91e ]x 97 | ्‌ 0 0,255,0,255,0,0,0,0,0,0 Devanagari 95 17 95 ्‌ # ्‌ [94d 200c ] 98 | ढ 1 0,255,0,255,0,0,0,0,0,0 Devanagari 96 0 96 ढ # ढ [922 ]x 99 | स्‍ 1 0,255,0,255,0,0,0,0,0,0 Devanagari 97 0 97 स्‍ # स्‍ [938 94d 200d ]x 100 | ः 0 0,255,0,255,0,0,0,0,0,0 Devanagari 98 0 98 ः # ः [903 ] 101 | [ 10 0,255,0,255,0,0,0,0,0,0 Common 99 10 89 [ # [ [5b ]p 102 | ? 10 0,255,0,255,0,0,0,0,0,0 Common 100 10 100 ? # ? 
[3f ]p 103 | ७ 8 0,255,0,255,0,0,0,0,0,0 Devanagari 101 0 101 ७ # ७ [96d ]0 104 | * 10 0,255,0,255,0,0,0,0,0,0 Common 102 10 102 * # * [2a ]p 105 | ८ 8 0,255,0,255,0,0,0,0,0,0 Devanagari 103 0 103 ८ # ८ [96e ]0 106 | ३ 8 0,255,0,255,0,0,0,0,0,0 Devanagari 104 0 104 ३ # ३ [969 ]0 107 | < 0 0,255,0,255,0,0,0,0,0,0 Common 105 10 64 < # < [3c ] 108 | _ 10 0,255,0,255,0,0,0,0,0,0 Common 106 10 106 _ # _ [5f ]p 109 | ५ 8 0,255,0,255,0,0,0,0,0,0 Devanagari 107 0 107 ५ # ५ [96b ]0 110 | & 10 0,255,0,255,0,0,0,0,0,0 Common 108 10 108 & # & [26 ]p 111 | ऐ 1 0,255,0,255,0,0,0,0,0,0 Devanagari 109 0 109 ऐ # ऐ [910 ]x 112 | क्‍ 1 0,255,0,255,0,0,0,0,0,0 Devanagari 110 0 110 क्‍ # क्‍ [915 94d 200d ]x 113 | $ 0 0,255,0,255,0,0,0,0,0,0 Common 111 4 111 $ # $ [24 ] 114 | ॥ 10 0,255,0,255,0,0,0,0,0,0 Common 112 0 112 ॥ # ॥ [965 ]p 115 | न्‍ 1 0,255,0,255,0,0,0,0,0,0 Devanagari 113 0 113 न्‍ # न्‍ [928 94d 200d ]x 116 | ल्‍ 1 0,255,0,255,0,0,0,0,0,0 Devanagari 114 0 114 ल्‍ # ल्‍ [932 94d 200d ]x 117 | ऋ 1 0,255,0,255,0,0,0,0,0,0 Devanagari 115 0 115 ऋ # ऋ [90b ]x 118 | ॐ 1 0,255,0,255,0,0,0,0,0,0 Devanagari 116 0 116 ॐ # ॐ [950 ]x 119 | » 10 0,255,0,255,0,0,0,0,0,0 Common 117 10 118 » # » [bb ]p 120 | « 10 0,255,0,255,0,0,0,0,0,0 Common 118 10 117 « # « [ab ]p 121 | £ 0 0,255,0,255,0,0,0,0,0,0 Common 119 4 119 £ # £ [a3 ] 122 | € 0 0,255,0,255,0,0,0,0,0,0 Common 120 4 120 € # € [20ac ] 123 | ॒ 0 0,255,0,255,0,0,0,0,0,0 Inherited 121 17 121 ॒ # ॒ [952 ] 124 | ॑ 0 0,255,0,255,0,0,0,0,0,0 Inherited 122 17 122 ॑ # ॑ [951 ] 125 | -------------------------------------------------------------------------------- /testing/phototest-rotated-180.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tesseract-ocr/test/232ff181c66516116ec0e84c4963f70de15050fd/testing/phototest-rotated-180.png -------------------------------------------------------------------------------- /testing/phototest-rotated-L.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/tesseract-ocr/test/232ff181c66516116ec0e84c4963f70de15050fd/testing/phototest-rotated-L.png -------------------------------------------------------------------------------- /testing/phototest-rotated-R.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tesseract-ocr/test/232ff181c66516116ec0e84c4963f70de15050fd/testing/phototest-rotated-R.png -------------------------------------------------------------------------------- /testing/phototest.gold.txt: -------------------------------------------------------------------------------- 1 | This is a lot of 12 point text to test the 2 | ocr code and see if it works on all types 3 | of file format. 4 | 5 | The quick brown dog jumped over the 6 | lazy fox. The quick brown dog jumped 7 | over the lazy fox. The quick brown dog 8 | jumped over the lazy fox. The quick 9 | brown dog jumped over the lazy fox. 10 | -------------------------------------------------------------------------------- /testing/phototest.tif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tesseract-ocr/test/232ff181c66516116ec0e84c4963f70de15050fd/testing/phototest.tif -------------------------------------------------------------------------------- /testing/phototest.txt: -------------------------------------------------------------------------------- 1 | This is a lot of 12 point text to test the 2 | ocr code and see if it works on all types 3 | of file format. 4 | 5 | The quick brown dog jumped over the 6 | lazy fox. The quick brown dog jumped 7 | over the lazy fox. The quick brown dog 8 | jumped over the lazy fox. The quick 9 | brown dog jumped over the lazy fox. 
10 | -------------------------------------------------------------------------------- /testing/phototest_2.tif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tesseract-ocr/test/232ff181c66516116ec0e84c4963f70de15050fd/testing/phototest_2.tif -------------------------------------------------------------------------------- /testing/phototestrot.tif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tesseract-ocr/test/232ff181c66516116ec0e84c4963f70de15050fd/testing/phototestrot.tif -------------------------------------------------------------------------------- /testing/raaj.tif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tesseract-ocr/test/232ff181c66516116ec0e84c4963f70de15050fd/testing/raaj.tif -------------------------------------------------------------------------------- /testing/segmodeimg.odt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tesseract-ocr/test/232ff181c66516116ec0e84c4963f70de15050fd/testing/segmodeimg.odt -------------------------------------------------------------------------------- /testing/segmodeimg.tif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tesseract-ocr/test/232ff181c66516116ec0e84c4963f70de15050fd/testing/segmodeimg.tif -------------------------------------------------------------------------------- /testing/trainingital.box: -------------------------------------------------------------------------------- 1 | T 131 4681 156 4709 0 2 | o 149 4681 167 4700 0 3 | s 190 4680 206 4699 0 4 | i 207 4680 216 4706 0 5 | m 218 4680 246 4699 0 6 | p 243 4670 268 4698 0 7 | l 270 4679 282 4708 0 8 | e 282 4679 298 4698 0 9 | b 321 4678 339 4707 0 10 | u 343 4678 362 4697 0 11 | r 363 4678 379 4697 0 12 
| n 379 4678 398 4697 0 13 | r 421 4677 437 4696 0 14 | u 438 4677 457 4696 0 15 | n 458 4676 477 4695 0 16 | n 479 4676 498 4695 0 17 | i 501 4676 510 4702 0 18 | n 512 4676 531 4695 0 19 | g 531 4666 555 4694 0 20 | o 576 4675 594 4694 0 21 | f 589 4666 616 4704 0 22 | g 637 4665 661 4693 0 23 | o 661 4674 679 4693 0 24 | o 682 4673 700 4692 0 25 | d 703 4673 726 4702 0 26 | s 723 4673 739 4692 0 27 | l 760 4672 772 4701 0 28 | a 772 4672 792 4691 0 29 | t 793 4672 805 4697 0 30 | e 805 4672 821 4691 0 31 | l 824 4671 836 4700 0 32 | y 831 4662 854 4690 0 33 | . 853 4671 857 4675 0 34 | -------------------------------------------------------------------------------- /testing/trainingital.tif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tesseract-ocr/test/232ff181c66516116ec0e84c4963f70de15050fd/testing/trainingital.tif -------------------------------------------------------------------------------- /testing/trainingitalline.box: -------------------------------------------------------------------------------- 1 | WordStr 131 4662 857 4709 0 #T o s i m p l e b u r n r u n n i n g o f g o o d s l a t e l y . 
2 | -------------------------------------------------------------------------------- /testing/trainingitalline.tif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tesseract-ocr/test/232ff181c66516116ec0e84c4963f70de15050fd/testing/trainingitalline.tif -------------------------------------------------------------------------------- /testing/trainingtimes.box: -------------------------------------------------------------------------------- 1 | T 131 4681 155 4709 0 2 | o 154 4680 172 4701 0 3 | s 194 4679 207 4700 0 4 | i 209 4680 219 4709 0 5 | m 221 4680 253 4700 0 6 | p 253 4670 273 4699 0 7 | l 275 4679 285 4708 0 8 | e 287 4678 303 4699 0 9 | b 325 4677 345 4707 0 10 | u 346 4677 367 4697 0 11 | r 368 4678 382 4698 0 12 | n 383 4677 404 4697 0 13 | r 425 4677 439 4697 0 14 | u 439 4676 460 4696 0 15 | n 461 4676 482 4696 0 16 | n 483 4676 504 4696 0 17 | i 505 4676 515 4705 0 18 | n 517 4676 538 4696 0 19 | g 539 4666 559 4695 0 20 | o 580 4674 598 4695 0 21 | f 599 4675 616 4704 0 22 | g 637 4665 657 4694 0 23 | o 658 4673 676 4694 0 24 | o 678 4672 696 4693 0 25 | d 698 4672 718 4702 0 26 | s 720 4672 733 4693 0 27 | l 755 4672 765 4701 0 28 | a 767 4671 785 4692 0 29 | t 785 4671 797 4697 0 30 | e 798 4671 814 4692 0 31 | l 816 4672 826 4701 0 32 | y 826 4662 846 4690 0 33 | . 846 4670 850 4674 0 34 | -------------------------------------------------------------------------------- /testing/trainingtimes.tif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tesseract-ocr/test/232ff181c66516116ec0e84c4963f70de15050fd/testing/trainingtimes.tif -------------------------------------------------------------------------------- /testing/trainingtimesline.box: -------------------------------------------------------------------------------- 1 | WordStr 131 4670 850 4709 0 #T o s i m p l e b u r n r u n n i n g o f g o o d s l a t e l y . 
2 | -------------------------------------------------------------------------------- /testing/trainingtimesline.tif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tesseract-ocr/test/232ff181c66516116ec0e84c4963f70de15050fd/testing/trainingtimesline.tif -------------------------------------------------------------------------------- /testing/viet.tif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tesseract-ocr/test/232ff181c66516116ec0e84c4963f70de15050fd/testing/viet.tif -------------------------------------------------------------------------------- /unlvtests/README.md: -------------------------------------------------------------------------------- 1 | ## How to run UNLV tests. 2 | 3 | The scripts in this directory make it possible to duplicate the tests 4 | published in the Fourth Annual Test of OCR Accuracy. 5 | See http://www.expervision.com/wp-content/uploads/2012/12/1995.The_Fourth_Annual_Test_of_OCR_Accuracy.pdf 6 | but first you have to get the tools and data used by UNLV: 7 | 8 | ### Step 1: to download the images go to 9 | https://sourceforge.net/projects/isri-ocr-evaluation-tools-alt/files/ 10 | and get doe3.3B.tar.gz, bus.3B.tar.gz, mag.3B.tar.gz and news.3B.tar.gz 11 | spn.3B.tar.gz is incorrect in this repo, so get it from code.google 12 | 13 | ``` 14 | mkdir -p ~/isri-downloads 15 | cd ~/isri-downloads 16 | curl -L https://sourceforge.net/projects/isri-ocr-evaluation-tools-alt/files/bus.3B.tar.gz > bus.3B.tar.gz 17 | curl -L https://sourceforge.net/projects/isri-ocr-evaluation-tools-alt/files/doe3.3B.tar.gz > doe3.3B.tar.gz 18 | curl -L https://sourceforge.net/projects/isri-ocr-evaluation-tools-alt/files/mag.3B.tar.gz > mag.3B.tar.gz 19 | curl -L https://sourceforge.net/projects/isri-ocr-evaluation-tools-alt/files/news.3B.tar.gz > news.3B.tar.gz 20 | curl -L 
https://storage.googleapis.com/google-code-archive-downloads/v2/code.google.com/isri-ocr-evaluation-tools/spn.3B.tar.gz > spn.3B.tar.gz 21 | ``` 22 | 23 | ### Step 2: extract the files. 24 | It doesn't really matter where 25 | in your filesystem you put them, but they must go under a common 26 | root so you have directories doe3.3B, bus.3B, mag.3B and news.3B in, for example, 27 | ~/ISRI-OCRtk. 28 | 29 | ``` 30 | mkdir -p ~/ISRI-OCRtk 31 | cd ~/ISRI-OCRtk 32 | tar xzvf ~/isri-downloads/bus.3B.tar.gz 33 | tar xzvf ~/isri-downloads/doe3.3B.tar.gz 34 | tar xzvf ~/isri-downloads/mag.3B.tar.gz 35 | tar xzvf ~/isri-downloads/news.3B.tar.gz 36 | tar xzvf ~/isri-downloads/spn.3B.tar.gz 37 | mkdir -p stopwords 38 | cd stopwords 39 | wget -O spa.stopwords.txt https://raw.githubusercontent.com/stopwords-iso/stopwords-es/master/stopwords-es.txt 40 | ``` 41 | Edit ~/ISRI-OCRtk/stopwords/spa.stopwords.txt 42 | wordacc uses a space delimited stopwords file, not line delimited. 43 | s/\n/ /g 44 | ``` 45 | perl -pi -e 's/\n/ /' ~/ISRI-OCRtk/stopwords/spa.stopwords.txt 46 | ``` 47 | 48 | Edit ~/ISRI-OCRtk/spn.3B/pages 49 | Delete the line containing the following imagename as it [crashes tesseract](https://github.com/tesseract-ocr/tesseract/issues/1647#issuecomment-395954717). 50 | 51 | 7733_005.3B 3 52 | 53 | ### Step 3: Download the modified ISRI toolkit, make and install the tools: 54 | These will be installed in /usr/local/bin. 55 | 56 | ``` 57 | git clone https://github.com/Shreeshrii/ocr-evaluation-tools.git 58 | cd ocr-evaluation-tools 59 | sudo make install 60 | ``` 61 | 62 | ### Step 4: cd back to your main tesseract-ocr dir and Build tesseract. 63 | 64 | ### Step 5: run test/unlvtests/runalltests.sh with the root ISRI data dir, testname, tessdata-dir: 65 | 66 | ``` 67 | test/unlvtests/runalltests.sh ~/ISRI-OCRtk 4_fast_eng ../tessdata_fast 68 | ``` 69 | and go to the gym, have lunch etc. It takes a while to run.
70 | 71 | ### Step 6: There should be a RELEASE.summary file, e.g. 72 | *unlvtests/reports/4_fast_eng.summary*, that contains the final summarized accuracy 73 | report and comparison with the 1995 results. 74 | 75 | ### Step 7: run the test for Spanish. 76 | 77 | ``` 78 | test/unlvtests/runalltests_spa.sh ~/ISRI-OCRtk 4_fast_spa ../tessdata_fast 79 | ``` 80 | 81 | #### Notes from Nick White regarding wordacc 82 | 83 | If you just want to remove all lines which have 100% recognition, 84 | you can add an 'awk' command like this: 85 | 86 | ocrevalutf8 wordacc ground.txt ocr.txt | awk '$3 != 100 {print $0}' > 87 | results.txt 88 | 89 | or if you've already got a results file you want to change, you can do this: 90 | 91 | awk '$3 != 100 {print $0}' results.txt > newresults.txt 92 | 93 | If you only want the last sections where things are broken down by 94 | word, you can add a sed command, like this: 95 | 96 | ocrevalutf8 wordacc ground.txt ocr.txt | sed '/^ Count Missed %Right $/,$ 97 | !d' | awk '$3 != 100 {print $0}' > results.txt 98 | -------------------------------------------------------------------------------- /unlvtests/counttestset.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # File: counttestset.sh 3 | # Description: Script to count the errors on a single UNLV set. 4 | # Author: Ray Smith 5 | # Created: Wed Jun 13 11:58:01 PDT 2007 6 | # 7 | # (C) Copyright 2007, Google Inc. 8 | # Licensed under the Apache License, Version 2.0 (the "License"); 9 | # you may not use this file except in compliance with the License. 10 | # You may obtain a copy of the License at 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | if [ $# -ne 2 ] 19 | then 20 | echo "Usage:$0 pagesfile langcode" 21 | exit 1 22 | fi 23 | if [ ! -d src/api ] 24 | then 25 | echo "Run $0 from the tesseract-ocr root directory!" 26 | exit 1 27 | fi 28 | 29 | pages=$1 30 | langcode=$2 31 | 32 | imdir=${pages%/pages} 33 | setname=${imdir##*/} 34 | resdir=unlvtests/results/$setname 35 | mkdir -p unlvtests/reports 36 | echo "Counting on set $setname in directory $imdir to $resdir" 37 | accfiles="" 38 | wafiles="" 39 | while read page dir 40 | do 41 | if [ "$dir" ] 42 | then 43 | srcdir="$imdir/$dir" 44 | else 45 | srcdir="$imdir" 46 | fi 47 | #echo "$srcdir/$page.tif" 48 | # Convert groundtruth and recognized text to UTF-8 to correctly treat accented letters. 49 | iconv -f ISO8859-1 -t UTF-8 "$srcdir/$page.txt" >"$srcdir/$page.text" 50 | iconv -f ISO8859-1 -t UTF-8 "$resdir/$page.unlv" >"$resdir/$page.text" 51 | # Count character errors. 52 | ocrevalutf8 accuracy "$srcdir/$page.text" "$resdir/$page.text" > "$resdir/$page.acc" 53 | accfiles="$accfiles $resdir/$page.acc" 54 | # Count word errors. 
55 | #langcode should be either eng or spa; any value other than eng is scored with the Spanish stopwords list copied from ~/ISRI-OCRtk/stopwords 56 | if [ "$langcode" = "eng" ] 57 | then 58 | ocrevalutf8 wordacc "$srcdir/$page.text" "$resdir/$page.text" > "$resdir/$page.wa" 59 | else 60 | cp ~/ISRI-OCRtk/stopwords/spa.stopwords.txt "$resdir/spa.stopwords" 61 | ocrevalutf8 wordacc -S"$resdir/spa.stopwords" "$srcdir/$page.text" "$resdir/$page.text" > "$resdir/$page.wa" 62 | fi 63 | wafiles="$wafiles $resdir/$page.wa" 64 | done <"$pages" 65 | 66 | accsum $accfiles >"unlvtests/results/$setname.characc" 67 | wordaccsum $wafiles >"unlvtests/results/$setname.wordacc" 68 | 69 | -------------------------------------------------------------------------------- /unlvtests/reorgdata.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Licensed under the Apache License, Version 2.0 (the "License"); 3 | # you may not use this file except in compliance with the License. 4 | # You may obtain a copy of the License at 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # Unless required by applicable law or agreed to in writing, software 7 | # distributed under the License is distributed on an "AS IS" BASIS, 8 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 9 | # See the License for the specific language governing permissions and 10 | # limitations under the License. 11 | 12 | if [ $# -ne 1 ] 13 | then 14 | echo "Usage:$0 scantype" 15 | echo "UNLV data comes in several scan types:" 16 | echo "3B=300 dpi binary" 17 | echo "3A=adaptive thresholded 300 dpi" 18 | echo "3G=300 dpi grey" 19 | echo "4B=400dpi binary" 20 | echo "2B=200dpi binary" 21 | echo "For now we only use 3B" 22 | exit 1 23 | fi 24 | ext=$1 25 | 26 | #There are several test sets without meaningful names, so rename 27 | #them with something a bit more meaningful. 28 | #Each s is oldname/newname 29 | for s in 3/doe3 B/bus M/mag N/news L/legal R/rep S/spn Z/zset 30 | do 31 | old=${s%/*} 32 | #if this set was downloaded then process it.
33 | if [ -r "$old/PAGES" ] 34 | then 35 | new=${s#*/}.$ext 36 | mkdir -p "$new" 37 | echo "Set $old -> $new" 38 | #The pages file had - instead of _ so fix it and add the extension. 39 | for page in $(cat $old/PAGES) 40 | do 41 | echo "${page%-*}_${page#*-}.$ext" 42 | done >"$new/pages" 43 | for f in $(cat "$new/pages") 44 | do 45 | #Put a tif extension on the tif files. 46 | cp "$old/${old}_B/$f" "$new/$f.tif" 47 | #Put a uzn extension on the zone files. 48 | cp "$old/${old}_B/${f}Z" "$new/$f.uzn" 49 | #Cat all the truth files together and put into a single txt file. 50 | cat "$old/${old}_GT/${f%.$ext}".Z* >"$new/$f.txt" 51 | done 52 | fi 53 | done 54 | -------------------------------------------------------------------------------- /unlvtests/reports/1995.bus.3B.sum: -------------------------------------------------------------------------------- 1 | 1995 bus.3B 5959 98.14% 0.00% 1631 96.83% 0.00% 1293 95.73% 0.00% 2 | -------------------------------------------------------------------------------- /unlvtests/reports/1995.doe3.3B.sum: -------------------------------------------------------------------------------- 1 | 1995 doe3.3B 36349 97.52% 0.00% 7826 96.34% 0.00% 7042 94.87% 0.00% 2 | -------------------------------------------------------------------------------- /unlvtests/reports/1995.mag.3B.sum: -------------------------------------------------------------------------------- 1 | 1995 mag.3B 15043 97.74% 0.00% 4566 96.01% 0.00% 3379 94.99% 0.00% 2 | -------------------------------------------------------------------------------- /unlvtests/reports/1995.news.3B.sum: -------------------------------------------------------------------------------- 1 | 1995 news.3B 6432 98.69% 0.00% 1946 97.68% 0.00% 1502 96.94% 0.00% 2 | -------------------------------------------------------------------------------- /unlvtests/reports/2.03.summary: -------------------------------------------------------------------------------- 1 | 1995 bus.3B 5959 98.14% 0.00% 1631 96.83% 
0.00% 1293 95.73% 0.00% 2 | 1995 doe3.3B 36349 97.52% 0.00% 7826 96.34% 0.00% 7042 94.87% 0.00% 3 | 1995 mag.3B 15043 97.74% 0.00% 4566 96.01% 0.00% 3379 94.99% 0.00% 4 | 1995 news.3B 6432 98.69% 0.00% 1946 97.68% 0.00% 1502 96.94% 0.00% 5 | 2.03 bus.3B 6422 97.99% 7.77% 1750 96.60% 7.30% 1361 95.51 5.26% 6 | 2.03 doe3.3B 29520 97.98% -18.79% 7966 96.27% 1.79% 6764 95.07 -3.95% 7 | 2.03 mag.3B 14568 97.81% -3.16% 4288 96.25% -6.09% 3054 95.47 -9.62% 8 | 2.03 news.3B 7655 98.44% 19.01% 1730 97.94% -11.10% 1208 97.54 -19.57% 9 | 2.03 Total 58165 - -8.81% 15734 - -1.47% 12387 - -6.27% 10 | -------------------------------------------------------------------------------- /unlvtests/reports/2.04.summary: -------------------------------------------------------------------------------- 1 | 1995 bus.3B 5959 98.14% 0.00% 1631 96.83% 0.00% 1293 95.73% 0.00% 2 | 1995 doe3.3B 36349 97.52% 0.00% 7826 96.34% 0.00% 7042 94.87% 0.00% 3 | 1995 mag.3B 15043 97.74% 0.00% 4566 96.01% 0.00% 3379 94.99% 0.00% 4 | 1995 news.3B 6432 98.69% 0.00% 1946 97.68% 0.00% 1502 96.94% 0.00% 5 | 2.04 bus.3B 6422 97.99% 7.77% 1750 96.60% 7.30% 1361 95.51 5.26% 6 | 2.04 doe3.3B 29514 97.98% -18.80% 7963 96.27% 1.75% 6762 95.07 -3.98% 7 | 2.04 mag.3B 14568 97.81% -3.16% 4289 96.25% -6.07% 3053 95.47 -9.65% 8 | 2.04 news.3B 7655 98.44% 19.01% 1730 97.94% -11.10% 1208 97.54 -19.57% 9 | 2.04 Total 58159 - -8.82% 15732 - -1.48% 12384 - -6.30% 10 | -------------------------------------------------------------------------------- /unlvtests/reports/4_best_int_spa.summary: -------------------------------------------------------------------------------- 1 | RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWordErrors Accuracy TimeTaken 2 | 4_best_int_spa spn.3B 2846 99.18% 937 98.39% 739 97.54 6478.02s 3 | -------------------------------------------------------------------------------- /unlvtests/reports/4_best_spa.summary: 
-------------------------------------------------------------------------------- 1 | RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWordErrors Accuracy TimeTaken 2 | 4_best_spa spn.3B 2823 99.19% 924 98.41% 729 97.57 7233.76s 3 | -------------------------------------------------------------------------------- /unlvtests/reports/4_fast_eng.summary: -------------------------------------------------------------------------------- 1 | 1995 bus.3B 5959 98.14% 0.00% 1631 96.83% 0.00% 1293 95.73% 0.00% 2 | 1995 doe3.3B 36349 97.52% 0.00% 7826 96.34% 0.00% 7042 94.87% 0.00% 3 | 1995 mag.3B 15043 97.74% 0.00% 4566 96.01% 0.00% 3379 94.99% 0.00% 4 | 1995 news.3B 6432 98.69% 0.00% 1946 97.68% 0.00% 1502 96.94% 0.00% 5 | 4_fast_eng bus.3B 6124 98.11% 2.77% 1138 97.88% -30.23% 963 97.05 -25.52% 3935.26s 6 | 4_fast_eng doe3.3B 30029 97.96% -17.39% 13781 94.45% 76.09% 13178 92.38 87.13% 18847.36s 7 | 4_fast_eng mag.3B 10934 98.37% -27.32% 3343 97.15% -26.78% 2813 96.06 -16.75% 6867.14s 8 | 4_fast_eng news.3B 5734 98.84% -10.85% 1322 98.45% -32.07% 1040 97.94 -30.76% 5527.38s 9 | 4_fast_eng Total 52821 - -17.19% 19584 - 22.64% 17994 - 36.15% 10 | -------------------------------------------------------------------------------- /unlvtests/reports/4_fast_spa.summary: -------------------------------------------------------------------------------- 1 | RELEASE TestSet CharErrors Accuracy WordErrors Accuracy NonStopWordErrors Accuracy TimeTaken 2 | 4_fast_spa spn.3B 2841 99.18% 879 98.49% 742 97.53 3838.82s 3 | -------------------------------------------------------------------------------- /unlvtests/runalltests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # File: runalltests.sh 3 | # Description: Script to run a set of UNLV test sets for English. 4 | # Author: Ray Smith 5 | # 6 | # (C) Copyright 2007, Google Inc. 
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

#######################################
# deltapc NEW OLD
# Prints the percent change from OLD to NEW with two decimals, no newline.
# The values are passed to awk with -v instead of being pasted into the
# program text, so an empty value counts as 0 rather than breaking the awk
# syntax. When OLD is 0 the change is undefined: prints "0.00" if NEW is
# also 0, otherwise "inf" (previously awk aborted with a division by zero).
#######################################
deltapc() {
  awk -v cur="$1" -v prev="$2" 'BEGIN {
    if (prev + 0 == 0) {
      printf("%s", (cur + 0 == 0) ? "0.00" : "inf");
    } else {
      printf("%.2f", 100.0 * (cur - prev) / prev);
    }
  }'
}

#######################################
# timesum FILE
# Prints the sum of the second column of FILE with two decimals.
#######################################
timesum() {
  awk ' BEGIN {
    total = 0.0;
  }
  {
    total += $2;
  }
  END {
    printf("%.2f\n", total);
  }' "$1"
}

#######################################
# line_field FILE LINE COLS
# Prints character columns COLS of line LINE of FILE with blanks removed.
# (If FILE is shorter than LINE lines, its last line is used.)
#######################################
line_field() {
  head -n "$2" "$1" | tail -n 1 | cut -c"$3" | tr -d '[:blank:]'
}

#######################################
# second_total_field FILE COLS
# Prints character columns COLS of the second line of FILE containing
# "Total" (the first one if there is only one), with blanks removed.
#######################################
second_total_field() {
  grep Total "$1" | head -n 2 | tail -n 1 | cut -c"$2" | tr -d '[:blank:]'
}

#######################################
# main unlv-data-dir version-id tessdata-dir
# Runs every English UNLV test set found under unlv-data-dir, writes one
# <version-id>.<set>.sum row per set plus a total row, concatenates them
# with the 1995 baseline rows into <version-id>.summary and prints it.
# Returns 1 on a usage or environment error.
#######################################
main() {
  if [ $# -ne 3 ]; then
    echo "Usage:$0 unlv-data-dir version-id tessdata-dir"
    return 1
  fi
  if [ ! -d src/api ]; then
    echo "Run $0 from the tesseract-ocr root directory!"
    return 1
  fi
  if [ ! -r tesseract ] && [ ! -r tesseract.exe ]; then
    echo "Please build tesseract before running $0"
    return 1
  fi

  local imdir=$1
  local vid=$2
  local tessdata=$3
  # Directory this script was started from; the helper scripts live there.
  local bindir=${0%/*}
  if [ "$bindir" = "$0" ]; then
    bindir="./"
  fi
  local rdir=unlvtests/reports
  local resdir=unlvtests/results

  local testsets="bus.3B doe3.3B mag.3B news.3B"
  #testsets="bus.3B"

  local totalerrs=0 totalwerrs=0 totalnswerrs=0
  local totalolderrs=0 totaloldwerrs=0 totaloldnswerrs=0
  local testset olderrs oldwerrs oldnswerrs
  local cherrs chacc wderrs wdacc nswderrs nswdacc
  local chdelta wdelta nswdelta sumfile total_time

  for testset in $testsets; do
    # Sets that were not unpacked under the data dir are silently skipped.
    [ -r "$imdir/$testset/pages" ] || continue

    # Run tesseract on all the pages.
    "$bindir/runtestset.sh" "$imdir/$testset/pages" "$tessdata" "eng"
    # Count the errors on all the pages.
    "$bindir/counttestset.sh" "$imdir/$testset/pages" "eng"

    # Get the old character word and nonstop word errors.
    # cut -f splits on tabs, so the baseline rows are tab-separated.
    olderrs=$(cut -f3 "$rdir/1995.$testset.sum")
    oldwerrs=$(cut -f6 "$rdir/1995.$testset.sum")
    oldnswerrs=$(cut -f9 "$rdir/1995.$testset.sum")

    # Get the new character word and nonstop word errors and accuracy.
    cherrs=$(line_field "$resdir/$testset.characc" 4 1-9)
    chacc=$(line_field "$resdir/$testset.characc" 5 1-9)
    wderrs=$(line_field "$resdir/$testset.wordacc" 4 1-9)
    wdacc=$(line_field "$resdir/$testset.wordacc" 5 1-9)
    nswderrs=$(second_total_field "$resdir/$testset.wordacc" 10-17)
    nswdacc=$(second_total_field "$resdir/$testset.wordacc" 19-26)

    # Compute the percent change.
    chdelta=$(deltapc "$cherrs" "$olderrs")
    wdelta=$(deltapc "$wderrs" "$oldwerrs")
    nswdelta=$(deltapc "$nswderrs" "$oldnswerrs")

    sumfile=$rdir/$vid.$testset.sum
    if [ -r "$resdir/$testset.times" ]; then
      total_time=$(timesum "$resdir/$testset.times")
      if [ -r "$resdir/prev/$testset.times" ]; then
        # Per-page time difference against the previous run, smallest first.
        paste "$resdir/prev/$testset.times" "$resdir/$testset.times" \
          | awk '{ printf("%s %.2f\n", $1, $4-$2); }' \
          | sort -k2n >"$resdir/$testset.timedelta"
      fi
    else
      total_time='0.0'
    fi

    # Tab-separated so the row has the same shape as the 1995.*.sum rows
    # that are read back with cut -f above.
    printf '%s\t%s\t%s\t%s\t%s%%\t%s\t%s\t%s%%\t%s\t%s\t%s%%\t%ss\n' \
      "$vid" "$testset" "$cherrs" "$chacc" "$chdelta" "$wderrs" "$wdacc" \
      "$wdelta" "$nswderrs" "$nswdacc" "$nswdelta" "$total_time" >"$sumfile"

    # Sum totals over all the testsets. A count that could not be read is
    # treated as 0 instead of aborting the arithmetic.
    totalerrs=$((totalerrs + ${cherrs:-0}))
    totalwerrs=$((totalwerrs + ${wderrs:-0}))
    totalnswerrs=$((totalnswerrs + ${nswderrs:-0}))
    totalolderrs=$((totalolderrs + ${olderrs:-0}))
    totaloldwerrs=$((totaloldwerrs + ${oldwerrs:-0}))
    totaloldnswerrs=$((totaloldnswerrs + ${oldnswerrs:-0}))
  done

  # Compute grand total percent change.
  chdelta=$(deltapc "$totalerrs" "$totalolderrs")
  wdelta=$(deltapc "$totalwerrs" "$totaloldwerrs")
  nswdelta=$(deltapc "$totalnswerrs" "$totaloldnswerrs")
  printf '%s\tTotal\t%s\t-\t%s%%\t%s\t-\t%s%%\t%s\t-\t%s%%\n' \
    "$vid" "$totalerrs" "$chdelta" "$totalwerrs" "$wdelta" \
    "$totalnswerrs" "$nswdelta" >"$rdir/$vid.total.sum"

  # Baseline rows first, then this version's rows (including the total).
  cat "$rdir"/1995.*.sum "$rdir/$vid".*.sum >"$rdir/$vid.summary"

  # Only the combined summary stays in the reports directory.
  mv "$rdir/$vid".*.sum "$resdir/"
  cat "$rdir/$vid.summary"
}

main "$@"

--------------------------------------------------------------------------------
/unlvtests/runalltests_spa.sh:
--------------------------------------------------------------------------------
#!/bin/bash
##############################################################################
# File: runalltests_spa.sh
# Description: Script to run a set of UNLV test sets for Spanish.
# based on runalltests.sh by Ray Smith
# Author: Shree Devi Kumar
#
# (C) Copyright 2007, Google Inc.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
##############################################################################

#######################################
# timesum FILE
# Prints the sum of the second column of FILE with two decimals.
#######################################
timesum() {
  awk ' BEGIN {
    total = 0.0;
  }
  {
    total += $2;
  }
  END {
    printf("%.2f\n", total);
  }' "$1"
}

#######################################
# line_field FILE LINE COLS
# Prints character columns COLS of line LINE of FILE with blanks removed.
# (If FILE is shorter than LINE lines, its last line is used.)
#######################################
line_field() {
  head -n "$2" "$1" | tail -n 1 | cut -c"$3" | tr -d '[:blank:]'
}

#######################################
# second_total_field FILE COLS
# Prints character columns COLS of the second line of FILE containing
# "Total" (the first one if there is only one), with blanks removed.
#######################################
second_total_field() {
  grep Total "$1" | head -n 2 | tail -n 1 | cut -c"$2" | tr -d '[:blank:]'
}

#######################################
# main unlv-data-dir version-id tessdata-dir
# Runs the Spanish UNLV test set found under unlv-data-dir, writes a
# <version-id>.<set>.sum file (header row plus one result row), gathers the
# .sum files into <version-id>.summary and prints it.
# Returns 1 on a usage or environment error.
#######################################
main() {
  if [ $# -ne 3 ]; then
    echo "Usage:$0 unlv-data-dir version-id tessdata-dir"
    return 1
  fi
  if [ ! -d src/api ]; then
    echo "Run $0 from the tesseract-ocr root directory!"
    return 1
  fi
  if [ ! -r tesseract ] && [ ! -r tesseract.exe ]; then
    echo "Please build tesseract before running $0"
    return 1
  fi

  local imdir=$1
  local vid=$2
  local tessdata=$3
  # The former lang=$4 assignment was removed: exactly three arguments are
  # accepted, so $4 was always empty, and the language is fixed to "spa".

  # Directory this script was started from; the helper scripts live there.
  local bindir=${0%/*}
  if [ "$bindir" = "$0" ]; then
    bindir="./"
  fi
  local rdir=unlvtests/reports
  local resdir=unlvtests/results

  local testsets="spn.3B"

  local testset cherrs chacc wderrs wdacc nswderrs nswdacc sumfile total_time
  for testset in $testsets; do
    # Sets that were not unpacked under the data dir are silently skipped.
    [ -r "$imdir/$testset/pages" ] || continue

    # Run tesseract on all the pages.
    "$bindir/runtestset.sh" "$imdir/$testset/pages" "$tessdata" "spa"
    # Count the errors on all the pages.
    "$bindir/counttestset.sh" "$imdir/$testset/pages" "spa"

    # Get the new character word and nonstop word errors and accuracy.
    cherrs=$(line_field "$resdir/$testset.characc" 4 1-9)
    chacc=$(line_field "$resdir/$testset.characc" 5 1-9)
    wderrs=$(line_field "$resdir/$testset.wordacc" 4 1-9)
    wdacc=$(line_field "$resdir/$testset.wordacc" 5 1-9)
    nswderrs=$(second_total_field "$resdir/$testset.wordacc" 10-17)
    nswdacc=$(second_total_field "$resdir/$testset.wordacc" 19-26)

    sumfile=$rdir/$vid.$testset.sum
    if [ -r "$resdir/$testset.times" ]; then
      total_time=$(timesum "$resdir/$testset.times")
      if [ -r "$resdir/prev/$testset.times" ]; then
        # Per-page time difference against the previous run, smallest first.
        paste "$resdir/prev/$testset.times" "$resdir/$testset.times" \
          | awk '{ printf("%s %.2f\n", $1, $4-$2); }' \
          | sort -k2n >"$resdir/$testset.timedelta"
      fi
    else
      total_time='0.0'
    fi

    # NOTE(review): columns are written tab-separated, presumably matching
    # the tab-delimited report rows that runalltests.sh reads with cut -f;
    # the whitespace of the original echo strings was not recoverable --
    # confirm against the checked-in *_spa.summary reports.
    printf 'RELEASE\tTestSet\tCharErrors\tAccuracy\tWordErrors\tAccuracy\tNonStopWordErrors\tAccuracy\tTimeTaken\n' \
      >"$sumfile"
    printf '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%ss\n' \
      "$vid" "$testset" "$cherrs" "$chacc" "$wderrs" "$wdacc" \
      "$nswderrs" "$nswdacc" "$total_time" >>"$sumfile"
  done

  cat "$rdir/$vid".*.sum >"$rdir/$vid.summary"

  # Only the combined summary stays in the reports directory.
  mv "$rdir/$vid".*.sum "$resdir/"
  cat "$rdir/$vid.summary"
}

main "$@"

--------------------------------------------------------------------------------
/unlvtests/runtestset.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# File: runtestset.sh
# Description: Script to run tesseract on a single UNLV set.
# Author: Ray Smith
#
# (C) Copyright 2007, Google Inc.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Usage: runtestset.sh pagesfile tessdata-dir lang [-zoning]
#
# Runs tesseract on every page listed in pagesfile (one "page [subdir]" entry
# per line), writing the OCR output under unlvtests/results/<set> (or
# unlvtests/results/zoning.<set> with -zoning) and, when timing is available,
# one "page time" line per page to unlvtests/results/<set>.times.
# Must be run from the tesseract-ocr root directory.

if [ $# -ne 3 ] && [ $# -ne 4 ]; then
  echo "Usage:$0 pagesfile tessdata-dir lang [-zoning]"
  exit 1
fi
if [ ! -d src/api ]; then
  echo "Run $0 from the tesseract-ocr root directory!"
  exit 1
fi

# The command is kept as an array so it expands to exact words without
# relying on word-splitting of a string.
if [ -r tesseract ]; then
  # Because "time" comes from an expansion it is looked up as an external
  # command, not the shell keyword; -f %U -o times.txt are options of that
  # command (user CPU time, written to times.txt).
  tess=(time -f %U -o times.txt ./tesseract)
  #tess=(time -f %U -o times.txt tesseract)
elif [ -r tesseract.exe ]; then
  # No timing wrapper on this path, so no times.txt is produced.
  tess=(./tesseract.exe)
else
  echo "Please build tesseract before running $0"
  exit 1
fi

pages=$1
tessdata=$2
lang=$3
# The set directory is the pages file's directory; its last path component
# names the set.
imdir=${pages%/pages}
setname=${imdir##*/}
if [ $# -eq 4 ] && [ "$4" = "-zoning" ]; then
  config=unlv.auto
  resdir=unlvtests/results/zoning.$setname
else
  config=unlv
  resdir=unlvtests/results/$setname
fi
echo -e "Testing on set $setname in directory $imdir to $resdir\n"
mkdir -p "$resdir"
# The per-page loop appends to this file, so start from a clean slate.
rm -f "unlvtests/results/$setname.times"

while read -r page dir || [ -n "$page" ]; do
  # Ignore blank lines in the pages file.
  [ -n "$page" ] || continue
  # A pages file may be a list of files with subdirs or maybe just
  # a plain list of files so accommodate both.
  if [ "$dir" ]; then
    srcdir="$imdir/$dir"
  else
    srcdir="$imdir"
  fi
  # echo "$srcdir/$page.tif"

  # Drop any timing left over from an earlier page or run so that a stale
  # value is never attributed to this page.
  rm -f times.txt
  # stdin is redirected so the command cannot consume the page list that
  # feeds this loop.
  "${tess[@]}" "$srcdir/$page.tif" "$resdir/$page" \
    --tessdata-dir "$tessdata" --oem 1 -l "$lang" --psm 6 "$config" \
    2>&1 </dev/null \
    | grep -v "OCR Engine" | grep -v "Page 1"

  if [ -r times.txt ]; then
    # First line of times.txt: the measured time, or a termination notice.
    # NOTE(review): the redirect/append pair below was reconstructed from a
    # damaged line ("read t >...times"); the "page time" layout matches the
    # $1/$2 columns that the run-all scripts' timesum and paste|awk read.
    read -r t <times.txt
    echo "$page $t" >>"unlvtests/results/$setname.times"
    # ESC M moves the cursor up one line before printing the progress entry.
    echo -e "\033M$page $t"
    # Stop the whole run when the page was interrupted (signal 2 = SIGINT).
    if [ "$t" = "Command terminated by signal 2" ]; then
      exit 0
    fi
  fi
done <"$pages"

--------------------------------------------------------------------------------