├── VERSION ├── tessdata ├── tessconfigs │ ├── nobatch │ ├── batch.nochop │ ├── batch │ ├── Makefile.am │ ├── matdemo │ ├── segdemo │ └── msdemo ├── configs │ ├── alto │ ├── pdf │ ├── quiet │ ├── tsv │ ├── logfile │ ├── api_config │ ├── get.images │ ├── lstmbox │ ├── makebox │ ├── wordstrbox │ ├── digits │ ├── hocr │ ├── unlv │ ├── inter │ ├── rebox │ ├── linebox │ ├── kannada │ ├── lstmdebug │ ├── bazaar │ ├── bigram │ ├── txt │ ├── ambigs.train │ ├── lstm.train │ ├── box.train │ ├── Makefile.am │ ├── box.train.stderr │ └── strokewidth ├── eng.user-patterns ├── eng.user-words ├── pdf.ttf └── Makefile.am ├── java ├── com │ ├── Makefile.am │ └── google │ │ ├── Makefile.am │ │ └── scrollview │ │ ├── Makefile.am │ │ ├── events │ │ ├── Makefile.am │ │ └── SVEventType.java │ │ └── ui │ │ ├── Makefile.am │ │ ├── SVSubMenuItem.java │ │ ├── SVEmptyMenuItem.java │ │ ├── SVAbstractMenuItem.java │ │ └── SVCheckboxMenuItem.java └── Manifest.txt ├── docker-compose.yml ├── android ├── Makefile.am ├── AndroidManifest.xml └── jni │ ├── Application.mk │ └── Android.mk ├── unittest ├── README.md ├── cleanapi_test.cc ├── lstm_squashed_test.cc ├── log.h ├── cycletimer.h ├── unichar_test.cc ├── lstm_recode_test.cc ├── stats_test.cc └── fileio_test.cc ├── .gitmodules ├── .clang-format ├── src ├── opencl │ └── Makefile.am ├── viewer │ └── Makefile.am ├── cutil │ ├── Makefile.am │ ├── emalloc.h │ ├── cutil_class.cpp │ ├── cutil_class.h │ ├── structures.cpp │ ├── emalloc.cpp │ └── structures.h ├── dict │ ├── Makefile.am │ ├── dawg_cache.h │ └── stopper.h ├── training │ ├── validate_khmer.h │ ├── icuerrorcode.cpp │ ├── tlog.cpp │ ├── tessopt.h │ ├── validate_grapheme.h │ ├── tlog.h │ ├── validate_indic.h │ ├── set_unicharset_properties.cpp │ ├── validate_myanmar.h │ └── tessopt.cpp ├── vs2010 │ └── tesseract │ │ └── resource.h ├── ccmain │ ├── mutableiterator.cpp │ ├── tessvars.cpp │ ├── tessvars.h │ ├── werdit.h │ ├── docqual.h │ ├── output.h │ ├── fixspace.h │ ├── reject.h │ ├── control.h 
│ ├── pagewalk.cpp │ └── Makefile.am ├── arch │ ├── dotproduct.h │ ├── dotproduct.cpp │ ├── dotproductsse.h │ ├── dotproductavx.h │ ├── Makefile.am │ └── dotproductavx.cpp ├── ccutil │ ├── tprintf.h │ ├── universalambigs.h │ ├── basedir.h │ ├── Makefile.am │ ├── globaloc.h │ ├── fileerr.h │ ├── unicodes.h │ ├── host.h │ ├── ccutil.cpp │ ├── bits16.h │ ├── lsterr.h │ ├── basedir.cpp │ └── unicodes.cpp ├── wordrec │ ├── drawfx.h │ ├── Makefile.am │ ├── chop.h │ └── findseam.h ├── api │ ├── tess_version.h.in │ └── apitypes.h ├── classify │ ├── float2int.h │ ├── mf.h │ ├── normmatch.h │ ├── normfeat.h │ ├── cutoffs.h │ ├── blobclass.h │ ├── outfeat.h │ ├── mfx.h │ └── clusttool.h ├── ccstruct │ ├── crakedge.h │ ├── publictypes.cpp │ ├── Makefile.am │ ├── ccstruct.cpp │ ├── blread.h │ ├── params_training_featdef.cpp │ ├── debugpixa.h │ ├── ccstruct.h │ └── polyaprx.h ├── lstm │ ├── Makefile.am │ └── functions.cpp └── textord │ ├── tordmain.h │ ├── drawedg.h │ ├── gap_map.h │ ├── blobgrid.h │ ├── blobgrid.cpp │ ├── Makefile.am │ └── edgloop.h ├── .lgtm.yml ├── tesseract.pc.cmake ├── cmake ├── templates │ ├── TesseractConfig-version.cmake.in │ └── TesseractConfig.cmake.in ├── BuildFunctions.cmake └── SourceGroups.cmake ├── tesseract.pc.in ├── Dockerfile ├── AUTHORS ├── doc ├── cntraining.1.asc ├── ambiguous_words.1.asc ├── Makefile.am ├── dawg2wordlist.1.asc ├── generate_manpages.sh ├── merge_unicharsets.1.asc ├── set_unicharset_properties.1.asc ├── wordlist2dawg.1.asc ├── classifier_tester.1.asc ├── mftraining.1.asc ├── lstmeval.1.asc ├── shapeclustering.1.asc └── unicharset_extractor.1.asc ├── .github └── ISSUE_TEMPLATE.md ├── appveyor.yml ├── m4 └── ax_split_version.m4 ├── snap └── snapcraft.yaml ├── .travis.yml └── .gitignore /VERSION: -------------------------------------------------------------------------------- 1 | 4.1.0-rc1 2 | -------------------------------------------------------------------------------- /tessdata/tessconfigs/nobatch: 
-------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /java/com/Makefile.am: -------------------------------------------------------------------------------- 1 | SUBDIRS = google 2 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | tesseract: 2 | build: . 3 | -------------------------------------------------------------------------------- /tessdata/configs/alto: -------------------------------------------------------------------------------- 1 | tessedit_create_alto 1 2 | -------------------------------------------------------------------------------- /tessdata/configs/pdf: -------------------------------------------------------------------------------- 1 | tessedit_create_pdf 1 2 | -------------------------------------------------------------------------------- /tessdata/configs/quiet: -------------------------------------------------------------------------------- 1 | debug_file /dev/null 2 | -------------------------------------------------------------------------------- /tessdata/configs/tsv: -------------------------------------------------------------------------------- 1 | tessedit_create_tsv 1 2 | -------------------------------------------------------------------------------- /java/com/google/Makefile.am: -------------------------------------------------------------------------------- 1 | SUBDIRS = scrollview 2 | -------------------------------------------------------------------------------- /tessdata/configs/logfile: -------------------------------------------------------------------------------- 1 | debug_file tesseract.log 2 | -------------------------------------------------------------------------------- /tessdata/configs/api_config: 
-------------------------------------------------------------------------------- 1 | tessedit_zero_rejection T 2 | -------------------------------------------------------------------------------- /tessdata/configs/get.images: -------------------------------------------------------------------------------- 1 | tessedit_write_images T 2 | -------------------------------------------------------------------------------- /tessdata/configs/lstmbox: -------------------------------------------------------------------------------- 1 | tessedit_create_lstmbox 1 2 | -------------------------------------------------------------------------------- /tessdata/configs/makebox: -------------------------------------------------------------------------------- 1 | tessedit_create_boxfile 1 2 | -------------------------------------------------------------------------------- /tessdata/configs/wordstrbox: -------------------------------------------------------------------------------- 1 | tessedit_create_wordstrbox 1 2 | -------------------------------------------------------------------------------- /tessdata/configs/digits: -------------------------------------------------------------------------------- 1 | tessedit_char_whitelist 0123456789-. 
2 | -------------------------------------------------------------------------------- /tessdata/eng.user-patterns: -------------------------------------------------------------------------------- 1 | 1-\d\d\d-GOOG-411 2 | www.\n\\\*.com 3 | -------------------------------------------------------------------------------- /tessdata/configs/hocr: -------------------------------------------------------------------------------- 1 | tessedit_create_hocr 1 2 | hocr_font_info 0 3 | -------------------------------------------------------------------------------- /tessdata/eng.user-words: -------------------------------------------------------------------------------- 1 | the 2 | quick 3 | brown 4 | fox 5 | jumped 6 | -------------------------------------------------------------------------------- /tessdata/configs/unlv: -------------------------------------------------------------------------------- 1 | tessedit_write_unlv 1 2 | unlv_tilde_crunching T 3 | -------------------------------------------------------------------------------- /tessdata/tessconfigs/batch.nochop: -------------------------------------------------------------------------------- 1 | chop_enable 0 2 | wordrec_enable_assoc 0 3 | -------------------------------------------------------------------------------- /tessdata/tessconfigs/batch: -------------------------------------------------------------------------------- 1 | # No content needed as all defaults are correct. 
2 | -------------------------------------------------------------------------------- /tessdata/pdf.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/0101011/tesseract/master/tessdata/pdf.ttf -------------------------------------------------------------------------------- /android/Makefile.am: -------------------------------------------------------------------------------- 1 | EXTRA_DIST = AndroidManifest.xml jni/Android.mk jni/Application.mk 2 | -------------------------------------------------------------------------------- /tessdata/configs/inter: -------------------------------------------------------------------------------- 1 | interactive_display_mode T 2 | tessedit_display_outwords T 3 | -------------------------------------------------------------------------------- /tessdata/configs/rebox: -------------------------------------------------------------------------------- 1 | tessedit_resegment_from_boxes 1 2 | tessedit_make_boxes_from_boxes 1 3 | -------------------------------------------------------------------------------- /tessdata/configs/linebox: -------------------------------------------------------------------------------- 1 | tessedit_resegment_from_line_boxes 1 2 | tessedit_make_boxes_from_boxes 1 3 | -------------------------------------------------------------------------------- /java/com/google/scrollview/Makefile.am: -------------------------------------------------------------------------------- 1 | SUBDIRS = events ui 2 | 3 | EXTRA_DIST = \ 4 | ScrollView.java 5 | -------------------------------------------------------------------------------- /tessdata/configs/kannada: -------------------------------------------------------------------------------- 1 | textord_skewsmooth_offset 8 2 | textord_skewsmooth_offset2 8 3 | textord_merge_desc 0.5 4 | textord_no_rejects 1 5 | -------------------------------------------------------------------------------- 
/tessdata/configs/lstmdebug: -------------------------------------------------------------------------------- 1 | stopper_debug_level 1 2 | classify_debug_level 1 3 | segsearch_debug_level 1 4 | language_model_debug_level 3 5 | -------------------------------------------------------------------------------- /java/com/google/scrollview/events/Makefile.am: -------------------------------------------------------------------------------- 1 | SUBDIRS = 2 | 3 | EXTRA_DIST = \ 4 | SVEvent.java SVEventHandler.java \ 5 | SVEventType.java 6 | -------------------------------------------------------------------------------- /tessdata/configs/bazaar: -------------------------------------------------------------------------------- 1 | load_system_dawg F 2 | load_freq_dawg F 3 | user_words_suffix user-words 4 | user_patterns_suffix user-patterns 5 | -------------------------------------------------------------------------------- /android/AndroidManifest.xml: -------------------------------------------------------------------------------- 1 | 5 | -------------------------------------------------------------------------------- /java/Manifest.txt: -------------------------------------------------------------------------------- 1 | Main-Class: com/google/scrollview/ScrollView 2 | Class-Path: ScrollView.jar piccolo2d-core-3.0.jar piccolo2d-extras-3.0.jar jaxb-api-2.3.1.jar 3 | -------------------------------------------------------------------------------- /tessdata/configs/bigram: -------------------------------------------------------------------------------- 1 | load_bigram_dawg True 2 | tessedit_enable_bigram_correction True 3 | tessedit_bigram_debug 3 4 | save_raw_choices True 5 | save_alt_choices True 6 | -------------------------------------------------------------------------------- /tessdata/configs/txt: -------------------------------------------------------------------------------- 1 | # This config file should be used with other config files which create renderers. 
2 | # usage example: tesseract eurotext.tif eurotext txt hocr pdf 3 | tessedit_create_txt 1 4 | -------------------------------------------------------------------------------- /tessdata/configs/ambigs.train: -------------------------------------------------------------------------------- 1 | tessedit_ambigs_training 1 2 | load_freq_dawg 0 3 | load_punc_dawg 0 4 | load_system_dawg 0 5 | load_number_dawg 0 6 | ambigs_debug_level 3 7 | load_fixed_length_dawgs 0 8 | -------------------------------------------------------------------------------- /tessdata/tessconfigs/Makefile.am: -------------------------------------------------------------------------------- 1 | datadir = @datadir@/tessdata/tessconfigs 2 | data_DATA = batch batch.nochop nobatch matdemo segdemo msdemo 3 | EXTRA_DIST = batch batch.nochop nobatch matdemo segdemo msdemo 4 | -------------------------------------------------------------------------------- /unittest/README.md: -------------------------------------------------------------------------------- 1 | Unit Testing for Tesseract 2 | ---------- 3 | 4 | To run the tests, do the following in tesseract folder 5 | 6 | ``` 7 | autoreconf -fiv 8 | git submodule update --init 9 | export TESSDATA_PREFIX=/prefix/to/path/to/tessdata 10 | make check 11 | ``` 12 | -------------------------------------------------------------------------------- /java/com/google/scrollview/ui/Makefile.am: -------------------------------------------------------------------------------- 1 | SUBDIRS = 2 | 3 | EXTRA_DIST = \ 4 | SVAbstractMenuItem.java \ 5 | SVCheckboxMenuItem.java SVEmptyMenuItem.java \ 6 | SVImageHandler.java SVMenuBar.java \ 7 | SVMenuItem.java SVPopupMenu.java SVSubMenuItem.java SVWindow.java 8 | -------------------------------------------------------------------------------- /tessdata/tessconfigs/matdemo: -------------------------------------------------------------------------------- 1 | ################################################# 2 | # Adaptive Matcher 
Using PreAdapted Templates 3 | ################################################# 4 | 5 | classify_enable_adaptive_debugger 1 6 | matcher_debug_flags 6 7 | matcher_debug_level 1 8 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "abseil"] 2 | path = abseil 3 | url = https://github.com/abseil/abseil-cpp.git 4 | [submodule "googletest"] 5 | path = googletest 6 | url = https://github.com/google/googletest.git 7 | [submodule "test"] 8 | path = test 9 | url = https://github.com/tesseract-ocr/test 10 | -------------------------------------------------------------------------------- /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | BasedOnStyle: Google 3 | # Only merge empty functions. 4 | AllowShortFunctionsOnASingleLine: Empty 5 | # Do not allow short if statements. 6 | AllowShortIfStatementsOnASingleLine: false 7 | # Enforce always the same pointer alignment. 
8 | DerivePointerAlignment: false 9 | IndentPPDirectives: AfterHash 10 | -------------------------------------------------------------------------------- /src/opencl/Makefile.am: -------------------------------------------------------------------------------- 1 | AM_CPPFLAGS += $(OPENCL_CFLAGS) \ 2 | -I$(top_srcdir)/src/ccutil \ 3 | -I$(top_srcdir)/src/ccstruct \ 4 | -I$(top_srcdir)/src/ccmain 5 | 6 | noinst_HEADERS = \ 7 | openclwrapper.h oclkernels.h 8 | 9 | noinst_LTLIBRARIES = libtesseract_opencl.la 10 | 11 | libtesseract_opencl_la_SOURCES = \ 12 | openclwrapper.cpp 13 | -------------------------------------------------------------------------------- /tessdata/configs/lstm.train: -------------------------------------------------------------------------------- 1 | disable_character_fragments T 2 | file_type .bl 3 | textord_fast_pitch_test T 4 | tessedit_zero_rejection T 5 | tessedit_minimal_rejection F 6 | tessedit_write_rep_codes F 7 | il1_adaption_test 1 8 | edges_children_fix F 9 | edges_childarea 0.65 10 | edges_boxarea 0.9 11 | tessedit_train_line_recognizer T 12 | textord_no_rejects T 13 | -------------------------------------------------------------------------------- /tessdata/tessconfigs/segdemo: -------------------------------------------------------------------------------- 1 | ################################################# 2 | # Adaptive Matcher Using PreAdapted Templates 3 | ################################################# 4 | 5 | wordrec_display_splits 0 6 | wordrec_display_all_words 1 7 | wordrec_display_all_blobs 1 8 | wordrec_display_segmentations 2 9 | classify_debug_level 1 10 | stopper_debug_level 1 11 | -------------------------------------------------------------------------------- /tessdata/configs/box.train: -------------------------------------------------------------------------------- 1 | disable_character_fragments T 2 | file_type .bl 3 | textord_fast_pitch_test T 4 | tessedit_zero_rejection T 5 | tessedit_minimal_rejection F 6 | 
tessedit_write_rep_codes F 7 | il1_adaption_test 1 8 | edges_children_fix F 9 | edges_childarea 0.65 10 | edges_boxarea 0.9 11 | tessedit_resegment_from_boxes T 12 | tessedit_train_from_boxes T 13 | textord_no_rejects T 14 | -------------------------------------------------------------------------------- /tessdata/configs/Makefile.am: -------------------------------------------------------------------------------- 1 | datadir = @datadir@/tessdata/configs 2 | data_DATA = inter makebox box.train unlv ambigs.train lstm.train lstmdebug 3 | data_DATA += api_config kannada box.train.stderr quiet logfile digits get.images 4 | data_DATA += lstmbox wordstrbox 5 | # Configurations for OCR output. 6 | data_DATA += alto hocr pdf tsv txt 7 | data_DATA += linebox rebox strokewidth bigram 8 | EXTRA_DIST = $(data_DATA) 9 | -------------------------------------------------------------------------------- /tessdata/configs/box.train.stderr: -------------------------------------------------------------------------------- 1 | file_type .bl 2 | #tessedit_use_nn F 3 | textord_fast_pitch_test T 4 | tessedit_zero_rejection T 5 | tessedit_minimal_rejection F 6 | tessedit_write_rep_codes F 7 | il1_adaption_test 1 8 | edges_children_fix F 9 | edges_childarea 0.65 10 | edges_boxarea 0.9 11 | tessedit_resegment_from_boxes T 12 | tessedit_train_from_boxes T 13 | #textord_repeat_extraction F 14 | textord_no_rejects T 15 | -------------------------------------------------------------------------------- /.lgtm.yml: -------------------------------------------------------------------------------- 1 | extraction: 2 | cpp: 3 | prepare: 4 | packages: 5 | - libpango1.0-dev 6 | configure: 7 | command: 8 | - ./autogen.sh 9 | - mkdir _lgtm_build_dir 10 | - cd _lgtm_build_dir 11 | - ../configure 12 | index: 13 | build_command: 14 | - cd _lgtm_build_dir 15 | - make training 16 | python: 17 | python_setup: 18 | version: 3 19 | -------------------------------------------------------------------------------- 
/tessdata/configs/strokewidth: -------------------------------------------------------------------------------- 1 | textord_show_blobs 0 2 | textord_debug_tabfind 3 3 | textord_tabfind_show_partitions 1 4 | textord_tabfind_show_initial_partitions 1 5 | textord_tabfind_show_columns 1 6 | textord_tabfind_show_blocks 1 7 | textord_tabfind_show_initialtabs 1 8 | textord_tabfind_show_finaltabs 1 9 | textord_tabfind_show_strokewidths 1 10 | textord_tabfind_show_vlines 0 11 | textord_tabfind_show_images 1 12 | tessedit_dump_pageseg_images 0 13 | -------------------------------------------------------------------------------- /tesseract.pc.cmake: -------------------------------------------------------------------------------- 1 | prefix=@CMAKE_INSTALL_PREFIX@ 2 | exec_prefix=${prefix}/bin 3 | libdir=${prefix}/lib 4 | includedir=${prefix}/include 5 | 6 | Name: @tesseract_NAME@ 7 | Description: An OCR Engine that was developed at HP Labs between 1985 and 1995... and now at Google. 8 | URL: https://github.com/tesseract-ocr/tesseract 9 | Version: @tesseract_VERSION@ 10 | Libs: -L${libdir} -l@tesseract_OUTPUT_NAME@ 11 | Libs.private: 12 | Cflags: -I${includedir} 13 | -------------------------------------------------------------------------------- /src/viewer/Makefile.am: -------------------------------------------------------------------------------- 1 | AM_CPPFLAGS += -I$(top_srcdir)/src/ccutil 2 | 3 | if VISIBILITY 4 | AM_CPPFLAGS += -DTESS_EXPORTS \ 5 | -fvisibility=hidden -fvisibility-inlines-hidden 6 | endif 7 | 8 | noinst_HEADERS = \ 9 | scrollview.h svmnode.h svutil.h 10 | 11 | noinst_LTLIBRARIES = libtesseract_viewer.la 12 | 13 | libtesseract_viewer_la_SOURCES = \ 14 | scrollview.cpp svmnode.cpp svutil.cpp 15 | 16 | # TODO: Add rule to generate svpaint from svpaint.cpp. 
17 | -------------------------------------------------------------------------------- /tessdata/tessconfigs/msdemo: -------------------------------------------------------------------------------- 1 | ################################################# 2 | # Adaptive Matcher Using PreAdapted Templates 3 | ################################################# 4 | 5 | classify_enable_adaptive_debugger 1 6 | matcher_debug_flags 6 7 | matcher_debug_level 1 8 | 9 | wordrec_display_splits 0 10 | wordrec_display_all_words 1 11 | wordrec_display_all_blobs 1 12 | wordrec_display_segmentations 2 13 | classify_debug_level 1 14 | -------------------------------------------------------------------------------- /cmake/templates/TesseractConfig-version.cmake.in: -------------------------------------------------------------------------------- 1 | set(Tesseract_VERSION @VERSION_PLAIN@) 2 | set(PACKAGE_VERSION ${Tesseract_VERSION}) 3 | 4 | set(PACKAGE_VERSION_EXACT False) 5 | set(PACKAGE_VERSION_COMPATIBLE False) 6 | 7 | if(PACKAGE_FIND_VERSION VERSION_EQUAL PACKAGE_VERSION) 8 | set(PACKAGE_VERSION_EXACT True) 9 | set(PACKAGE_VERSION_COMPATIBLE True) 10 | endif() 11 | 12 | if(PACKAGE_FIND_VERSION VERSION_LESS PACKAGE_VERSION) 13 | set(PACKAGE_VERSION_COMPATIBLE True) 14 | endif() 15 | -------------------------------------------------------------------------------- /tesseract.pc.in: -------------------------------------------------------------------------------- 1 | prefix=@prefix@ 2 | exec_prefix=@exec_prefix@ 3 | bindir=@bindir@ 4 | datarootdir = @datarootdir@ 5 | datadir=@datadir@ 6 | libdir=@libdir@ 7 | includedir=@includedir@ 8 | 9 | Name: @PACKAGE_NAME@ 10 | Description: An OCR Engine that was developed at HP Labs between 1985 and 1995... and now at Google. 
11 | URL: https://github.com/tesseract-ocr/tesseract 12 | Version: @VERSION@ 13 | Requires.private: lept 14 | Libs: -L${libdir} -ltesseract 15 | Libs.private: -lpthread @OPENCL_LDFLAGS@ 16 | Cflags: -I${includedir} 17 | -------------------------------------------------------------------------------- /src/cutil/Makefile.am: -------------------------------------------------------------------------------- 1 | AM_CPPFLAGS += \ 2 | -I$(top_srcdir)/src/ccutil \ 3 | -I$(top_srcdir)/src/viewer 4 | 5 | if VISIBILITY 6 | AM_CPPFLAGS += -DTESS_EXPORTS \ 7 | -fvisibility=hidden -fvisibility-inlines-hidden 8 | endif 9 | 10 | noinst_HEADERS = \ 11 | bitvec.h callcpp.h cutil_class.h \ 12 | emalloc.h \ 13 | oldlist.h structures.h 14 | 15 | noinst_LTLIBRARIES = libtesseract_cutil.la 16 | 17 | libtesseract_cutil_la_SOURCES = \ 18 | bitvec.cpp callcpp.cpp cutil_class.cpp \ 19 | emalloc.cpp \ 20 | oldlist.cpp structures.cpp 21 | -------------------------------------------------------------------------------- /android/jni/Application.mk: -------------------------------------------------------------------------------- 1 | # Include common.mk for building google3 native code. 2 | DEPOT_PATH := $(firstword $(subst /google3, ,$(abspath $(call my-dir)))) 3 | ifneq ($(wildcard $(DEPOT_PATH)/google3/mobile/build/common.mk),) 4 | include $(DEPOT_PATH)/google3/mobile/build/common.mk 5 | else 6 | include $(DEPOT_PATH)/READONLY/google3/mobile/build/common.mk 7 | endif 8 | 9 | # Specify the hash namespace that we're using, based on the APP_STL we're using. 
10 | APP_CFLAGS += -Werror -DHASH_NAMESPACE=__gnu_cxx 11 | APP_PLATFORM := android-16 12 | APP_STL := gnustl_static 13 | NDK_TOOLCHAIN_VERSION := clang 14 | -------------------------------------------------------------------------------- /src/dict/Makefile.am: -------------------------------------------------------------------------------- 1 | AM_CPPFLAGS += \ 2 | -I$(top_srcdir)/src/cutil \ 3 | -I$(top_srcdir)/src/ccutil \ 4 | -I$(top_srcdir)/src/ccstruct \ 5 | -I$(top_srcdir)/src/viewer 6 | 7 | if VISIBILITY 8 | AM_CPPFLAGS += -DTESS_EXPORTS \ 9 | -fvisibility=hidden -fvisibility-inlines-hidden 10 | endif 11 | 12 | noinst_HEADERS = \ 13 | dawg.h dawg_cache.h dict.h matchdefs.h \ 14 | stopper.h trie.h 15 | 16 | noinst_LTLIBRARIES = libtesseract_dict.la 17 | 18 | libtesseract_dict_la_SOURCES = \ 19 | context.cpp \ 20 | dawg.cpp dawg_cache.cpp dict.cpp hyphen.cpp \ 21 | permdawg.cpp stopper.cpp trie.cpp 22 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Dockerfile for local Travis build test 2 | 3 | FROM ubuntu 4 | LABEL maintainer="Ian Blenke " 5 | 6 | RUN apt-get update 7 | RUN DEBIAN_FRONTEND=noninteractive apt-get install -y cmake curl git ruby bundler wget unzip \ 8 | && apt-get clean \ 9 | && rm -rf /var/lib/apt/lists/* 10 | RUN gem install bundler travis --no-ri --no-rdoc 11 | RUN git clone --depth 1 https://github.com/travis-ci/travis-build ~/.travis/travis-build 12 | RUN bundle install --gemfile ~/.travis/travis-build/Gemfile 13 | 14 | ADD . 
/tesseract 15 | WORKDIR /tesseract 16 | 17 | RUN travis compile | sed -e "s/--branch\\\=\\\'\\\'/--branch=master/g" | bash 18 | -------------------------------------------------------------------------------- /AUTHORS: -------------------------------------------------------------------------------- 1 | Ray Smith (lead developer) 2 | Ahmad Abdulkader 3 | Rika Antonova 4 | Nicholas Beato 5 | Jeff Breidenbach 6 | Samuel Charron 7 | Phil Cheatle 8 | Simon Crouch 9 | David Eger 10 | Sheelagh Huddleston 11 | Dan Johnson 12 | Rajesh Katikam 13 | Thomas Kielbus 14 | Dar-Shyang Lee 15 | Zongyi (Joe) Liu 16 | Robert Moss 17 | Chris Newton 18 | Michael Reimer 19 | Marius Renn 20 | Raquel Romano 21 | Christy Russon 22 | Shobhit Saxena 23 | Mark Seaman 24 | Faisal Shafait 25 | Hiroshi Takenaka 26 | Ranjith Unnikrishnan 27 | Joern Wanke 28 | Ping Ping Xiu 29 | Andrew Ziem 30 | Oscar Zuniga 31 | 32 | Community Contributors: 33 | Zdenko Podobný (Maintainer) 34 | Jim Regan (Maintainer) 35 | James R Barlow 36 | Amit Dovev 37 | Martin Ettl 38 | Shree Devi Kumar 39 | Noah Metzger 40 | Tom Morris 41 | Tobias Müller 42 | Egor Pugin 43 | Sundar M. Vaidya 44 | Stefan Weil 45 | -------------------------------------------------------------------------------- /doc/cntraining.1.asc: -------------------------------------------------------------------------------- 1 | CNTRAINING(1) 2 | ============= 3 | 4 | NAME 5 | ---- 6 | cntraining - character normalization training for Tesseract 7 | 8 | SYNOPSIS 9 | -------- 10 | *cntraining* [-D 'dir'] 'FILE'... 11 | 12 | DESCRIPTION 13 | ----------- 14 | cntraining takes a list of .tr files, from which it generates the 15 | *normproto* data file (the character normalization sensitivity 16 | prototypes). 17 | 18 | OPTIONS 19 | -------- 20 | -D 'dir':: 21 | Directory to write output files to. 
22 | 23 | SEE ALSO 24 | -------- 25 | tesseract(1), shapeclustering(1), mftraining(1) 26 | 27 | 28 | 29 | COPYING 30 | ------- 31 | Copyright (c) Hewlett-Packard Company, 1988 32 | Licensed under the Apache License, Version 2.0 33 | 34 | AUTHOR 35 | ------ 36 | The Tesseract OCR engine was written by Ray Smith and his research groups 37 | at Hewlett Packard (1985-1995) and Google (2006-present). 38 | -------------------------------------------------------------------------------- /doc/ambiguous_words.1.asc: -------------------------------------------------------------------------------- 1 | AMBIGUOUS_WORDS(1) 2 | ================== 3 | :doctype: manpage 4 | 5 | NAME 6 | ---- 7 | ambiguous_words - generate sets of words Tesseract is likely to find ambiguous 8 | 9 | SYNOPSIS 10 | -------- 11 | *ambiguous_words* [-l lang] 'TESSDATADIR' 'WORDLIST' 'AMBIGUOUSFILE' 12 | 13 | DESCRIPTION 14 | ----------- 15 | ambiguous_words(1) runs Tesseract in a special mode, and for each word 16 | in word list, produces a set of words which Tesseract thinks might be 17 | ambiguous with it. 'TESSDATADIR' must be set to the absolute path of 18 | a directory containing 'tessdata/lang.traineddata'. 19 | 20 | SEE ALSO 21 | -------- 22 | tesseract(1) 23 | 24 | COPYING 25 | ------- 26 | Copyright \(C) 2012 Google, Inc. 27 | Licensed under the Apache License, Version 2.0 28 | 29 | AUTHOR 30 | ------ 31 | The Tesseract OCR engine was written by Ray Smith and his research groups 32 | at Hewlett Packard (1985-1995) and Google (2006-present). 33 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | Before you submit an issue, please review [the guidelines for this repository](https://github.com/tesseract-ocr/tesseract/blob/master/CONTRIBUTING.md). 2 | 3 | Please report an issue only for a BUG, not for asking questions. 
4 | 5 | Note that it will be much easier for us to fix the issue if a test case that 6 | reproduces the problem is provided. Ideally this test case should not have any 7 | external dependencies. Provide a copy of the image or link to files for the test case. 8 | 9 | Please delete this text and fill in the template below. 10 | 11 | ------------------------ 12 | 13 | ### Environment 14 | 15 | * **Tesseract Version**: 16 | * **Commit Number**: 17 | * **Platform**: 18 | 19 | ### Current Behavior: 20 | 21 | ### Expected Behavior: 22 | 23 | ### Suggested Fix: 24 | -------------------------------------------------------------------------------- /src/training/validate_khmer.h: -------------------------------------------------------------------------------- 1 | #ifndef TESSERACT_TRAINING_VALIDATE_KHMER_H_ 2 | #define TESSERACT_TRAINING_VALIDATE_KHMER_H_ 3 | 4 | #include "validator.h" 5 | 6 | namespace tesseract { 7 | 8 | // Subclass of Validator that validates and segments Khmer. 9 | class ValidateKhmer : public Validator { 10 | public: 11 | ValidateKhmer(ViramaScript script, bool report_errors) 12 | : Validator(script, report_errors) {} 13 | ~ValidateKhmer() {} 14 | 15 | protected: 16 | // Returns whether codes matches the pattern for a Khmer Grapheme. 17 | // Consumes the next Grapheme in codes_[codes_used_++...] and copies it to 18 | // parts_ and output_. Returns true if a valid Grapheme was consumed, 19 | // otherwise does not increment codes_used_. 20 | bool ConsumeGraphemeIfValid() override; 21 | // Returns the CharClass corresponding to the given Unicode ch. 
22 | CharClass UnicodeToCharClass(char32 ch) const override; 23 | }; 24 | 25 | } // namespace tesseract 26 | 27 | #endif // TESSERACT_TRAINING_VALIDATE_KHMER_H_ 28 | -------------------------------------------------------------------------------- /src/vs2010/tesseract/resource.h: -------------------------------------------------------------------------------- 1 | //{{NO_DEPENDENCIES}} 2 | // Microsoft Visual C++ generated include file. 3 | // Used by tesseract.rc 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // Unless required by applicable law or agreed to in writing, software 9 | // distributed under the License is distributed on an "AS IS" BASIS, 10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | // See the License for the specific language governing permissions and 12 | // limitations under the License. 13 | 14 | // Next default values for new objects 15 | // 16 | #ifdef APSTUDIO_INVOKED 17 | #ifndef APSTUDIO_READONLY_SYMBOLS 18 | #define _APS_NEXT_RESOURCE_VALUE 101 19 | #define _APS_NEXT_COMMAND_VALUE 40001 20 | #define _APS_NEXT_CONTROL_VALUE 1001 21 | #define _APS_NEXT_SYMED_VALUE 101 22 | #endif 23 | #endif 24 | -------------------------------------------------------------------------------- /src/ccmain/mutableiterator.cpp: -------------------------------------------------------------------------------- 1 | /////////////////////////////////////////////////////////////////////// 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // http://www.apache.org/licenses/LICENSE-2.0 7 | // Unless required by applicable law or agreed to in writing, software 8 | // distributed under the License is distributed on an "AS IS" BASIS, 9 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | // See the License for the specific language governing permissions and 11 | // limitations under the License. 12 | // 13 | /////////////////////////////////////////////////////////////////////// 14 | 15 | #include "mutableiterator.h" 16 | 17 | namespace tesseract { 18 | 19 | // Destructor. 20 | // It is defined here, so the compiler can create a single vtable 21 | // instead of weak vtables in every compilation unit. 22 | MutableIterator::~MutableIterator() = default; 23 | 24 | } // namespace tesseract. 25 | -------------------------------------------------------------------------------- /appveyor.yml: -------------------------------------------------------------------------------- 1 | environment: 2 | matrix: 3 | # not working for some reason now 4 | #- APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015 5 | #platform: Win32 6 | - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2017 7 | platform: Win32 8 | - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2017 9 | platform: Win64 10 | #- APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2019 Preview 11 | #platform: Win64 12 | 13 | configuration: 14 | - Release 15 | 16 | cache: 17 | - c:/Users/appveyor/.sw -> appveyor.yml 18 | 19 | before_build: 20 | - curl -fsS -L -o dl.zip https://github.com/SoftwareNetwork/binaries/raw/master/sw-master-windows-client.zip 21 | - 7z x dl.zip 22 | - set PATH=%PATH%;%cd% 23 | 24 | build_script: 25 | - sw -show-output -platform %platform% build 26 | 27 | after_build: 28 | - 7z a tesseract.zip %APPVEYOR_BUILD_FOLDER%\.sw\windows_*_msvc_*_shared_Release\*.exe %APPVEYOR_BUILD_FOLDER%\.sw\windows_*_msvc_*_shared_Release\*.dll 29 | 30 | artifacts: 31 | - path: tesseract.zip 32 | name: 
tesseract-$(APPVEYOR_BUILD_VERSION) 33 | 34 | -------------------------------------------------------------------------------- /src/cutil/emalloc.h: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | ** Filename: emalloc.h 3 | ** Purpose: Definition of memory allocation routines. 4 | ** Author: Dan Johnson 5 | ** 6 | ** (c) Copyright Hewlett-Packard Company, 1988. 7 | ** Licensed under the Apache License, Version 2.0 (the "License"); 8 | ** you may not use this file except in compliance with the License. 9 | ** You may obtain a copy of the License at 10 | ** http://www.apache.org/licenses/LICENSE-2.0 11 | ** Unless required by applicable law or agreed to in writing, software 12 | ** distributed under the License is distributed on an "AS IS" BASIS, 13 | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | ** See the License for the specific language governing permissions and 15 | ** limitations under the License. 16 | ******************************************************************************/ 17 | 18 | #ifndef EMALLOC_H 19 | #define EMALLOC_H 20 | 21 | void *Emalloc(int Size); 22 | 23 | void *Erealloc(void *ptr, int size); 24 | 25 | void Efree(void *ptr); 26 | 27 | #endif 28 | -------------------------------------------------------------------------------- /src/training/icuerrorcode.cpp: -------------------------------------------------------------------------------- 1 | /////////////////////////////////////////////////////////////////////// 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // http://www.apache.org/licenses/LICENSE-2.0 7 | // Unless required by applicable law or agreed to in writing, software 8 | // distributed under the License is distributed on an "AS IS" BASIS, 9 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | // See the License for the specific language governing permissions and 11 | // limitations under the License. 12 | // 13 | /////////////////////////////////////////////////////////////////////// 14 | 15 | #include "icuerrorcode.h" 16 | 17 | namespace tesseract { 18 | 19 | // Destructor. 20 | // It is defined here, so the compiler can create a single vtable 21 | // instead of weak vtables in every compilation unit. 22 | IcuErrorCode::~IcuErrorCode() { 23 | if (isFailure()) { 24 | handleFailure(); 25 | } 26 | } 27 | 28 | } // namespace tesseract. 29 | -------------------------------------------------------------------------------- /src/ccmain/tessvars.cpp: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | * File: tessvars.cpp (Formerly tessvars.c) 3 | * Description: Variables and other globals for tessedit. 4 | * Author: Ray Smith 5 | * Created: Mon Apr 13 13:13:23 BST 1992 6 | * 7 | * (C) Copyright 1992, Hewlett-Packard Ltd. 8 | ** Licensed under the Apache License, Version 2.0 (the "License"); 9 | ** you may not use this file except in compliance with the License. 10 | ** You may obtain a copy of the License at 11 | ** http://www.apache.org/licenses/LICENSE-2.0 12 | ** Unless required by applicable law or agreed to in writing, software 13 | ** distributed under the License is distributed on an "AS IS" BASIS, 14 | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | ** See the License for the specific language governing permissions and 16 | ** limitations under the License. 
17 | * 18 | **********************************************************************/ 19 | 20 | #include <cstdio> 21 | 22 | #include "tessvars.h" 23 | 24 | FILE *debug_fp = stderr; // write debug stuff here 25 | -------------------------------------------------------------------------------- /src/training/tlog.cpp: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | * File: tlog.cpp 3 | * Description: Variant of printf with logging level controllable by a 4 | * commandline flag. 5 | * Author: Ranjith Unnikrishnan 6 | * Created: Wed Nov 20 2013 7 | * 8 | * (C) Copyright 2013, Google Inc. 9 | ** Licensed under the Apache License, Version 2.0 (the "License"); 10 | ** you may not use this file except in compliance with the License. 11 | ** You may obtain a copy of the License at 12 | ** http://www.apache.org/licenses/LICENSE-2.0 13 | ** Unless required by applicable law or agreed to in writing, software 14 | ** distributed under the License is distributed on an "AS IS" BASIS, 15 | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | ** See the License for the specific language governing permissions and 17 | ** limitations under the License. 18 | * 19 | **********************************************************************/ 20 | 21 | #include "tlog.h" 22 | 23 | INT_PARAM_FLAG(tlog_level, 0, "Minimum logging level for tlog() output"); 24 | -------------------------------------------------------------------------------- /cmake/BuildFunctions.cmake: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 
3 | # You may obtain a copy of the License at 4 | # http://www.apache.org/licenses/LICENSE-2.0 5 | # Unless required by applicable law or agreed to in writing, software 6 | # distributed under the License is distributed on an "AS IS" BASIS, 7 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 8 | # See the License for the specific language governing permissions and 9 | # limitations under the License. 10 | ################################################################################ 11 | # 12 | # macros and functions 13 | # 14 | ################################################################################ 15 | 16 | ######################################## 17 | # FUNCTION project_group 18 | ######################################## 19 | function(project_group target name) 20 | set_target_properties(${target} PROPERTIES FOLDER ${name}) 21 | endfunction(project_group) 22 | 23 | ################################################################################ 24 | -------------------------------------------------------------------------------- /src/ccmain/tessvars.h: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | * File: tessvars.h (Formerly tessvars.h) 3 | * Description: Variables and other globals for tessedit. 4 | * Author: Ray Smith 5 | * Created: Mon Apr 13 13:13:23 BST 1992 6 | * 7 | * (C) Copyright 1992, Hewlett-Packard Ltd. 8 | ** Licensed under the Apache License, Version 2.0 (the "License"); 9 | ** you may not use this file except in compliance with the License. 10 | ** You may obtain a copy of the License at 11 | ** http://www.apache.org/licenses/LICENSE-2.0 12 | ** Unless required by applicable law or agreed to in writing, software 13 | ** distributed under the License is distributed on an "AS IS" BASIS, 14 | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
15 | ** See the License for the specific language governing permissions and 16 | ** limitations under the License. 17 | * 18 | **********************************************************************/ 19 | 20 | #ifndef TESSVARS_H 21 | #define TESSVARS_H 22 | 23 | #include <cstdio> 24 | 25 | extern FILE *debug_fp; // write debug stuff here 26 | 27 | #endif 28 | -------------------------------------------------------------------------------- /doc/Makefile.am: -------------------------------------------------------------------------------- 1 | # doc/Makefile.am 2 | 3 | if ASCIIDOC 4 | 5 | man_MANS = \ 6 | combine_lang_model.1 \ 7 | combine_tessdata.1 \ 8 | dawg2wordlist.1 \ 9 | lstmeval.1 \ 10 | lstmtraining.1 \ 11 | merge_unicharsets.1 \ 12 | set_unicharset_properties.1 \ 13 | tesseract.1 \ 14 | text2image.1 \ 15 | unicharambigs.5 \ 16 | unicharset_extractor.1 \ 17 | wordlist2dawg.1 18 | 19 | if !DISABLED_LEGACY_ENGINE 20 | man_MANS += \ 21 | ambiguous_words.1 \ 22 | classifier_tester.1 \ 23 | cntraining.1 \ 24 | mftraining.1 \ 25 | shapeclustering.1 \ 26 | unicharset.5 27 | endif 28 | 29 | man_xslt = http://docbook.sourceforge.net/release/xsl/current/manpages/docbook.xsl 30 | 31 | EXTRA_DIST = $(man_MANS) Doxyfile 32 | 33 | .PHONY: html 34 | 35 | html: ${man_MANS:%=%.html} 36 | pdf: ${man_MANS:%=%.pdf} 37 | 38 | SUFFIXES = .asc .html .pdf 39 | 40 | .asc: 41 | -asciidoc -b docbook -d manpage -o - $< | \ 42 | xsltproc --nonet $(man_xslt) - 43 | 44 | .asc.html: 45 | asciidoc -b html5 -o $@ $< 46 | 47 | .asc.pdf: 48 | asciidoc -b docbook -d manpage -o $*.dbk $< 49 | docbook2pdf $*.dbk 50 | 51 | MAINTAINERCLEANFILES = $(man_MANS) Doxyfile 52 | 53 | endif 54 | -------------------------------------------------------------------------------- /doc/dawg2wordlist.1.asc: -------------------------------------------------------------------------------- 1 | DAWG2WORDLIST(1) 2 | ================ 3 | :doctype: manpage 4 | 5 | NAME 6 | ---- 7 | dawg2wordlist - convert a Tesseract DAWG to a 
wordlist 8 | 9 | SYNOPSIS 10 | -------- 11 | *dawg2wordlist* 'UNICHARSET' 'DAWG' 'WORDLIST' 12 | 13 | DESCRIPTION 14 | ----------- 15 | dawg2wordlist(1) converts a Tesseract Directed Acyclic Word 16 | Graph (DAWG) to a list of words using a unicharset as key. 17 | 18 | OPTIONS 19 | ------- 20 | 'UNICHARSET' 21 | The unicharset of the language. This is the unicharset 22 | generated by mftraining(1). 23 | 24 | 'DAWG' 25 | The input DAWG, created by wordlist2dawg(1) 26 | 27 | 'WORDLIST' 28 | Plain text (output) file in UTF-8, one word per line 29 | 30 | SEE ALSO 31 | -------- 32 | tesseract(1), mftraining(1), wordlist2dawg(1), unicharset(5), 33 | combine_tessdata(1) 34 | 35 | 36 | 37 | COPYING 38 | ------- 39 | Copyright \(C) 2012 Google, Inc. 40 | Licensed under the Apache License, Version 2.0 41 | 42 | AUTHOR 43 | ------ 44 | The Tesseract OCR engine was written by Ray Smith and his research groups 45 | at Hewlett Packard (1985-1995) and Google (2006-present). 46 | -------------------------------------------------------------------------------- /src/cutil/cutil_class.cpp: -------------------------------------------------------------------------------- 1 | /////////////////////////////////////////////////////////////////////// 2 | // File: cutil_class.cpp 3 | // Description: cutil class. 4 | // Author: Samuel Charron 5 | // 6 | // (C) Copyright 2006, Google Inc. 7 | // Licensed under the Apache License, Version 2.0 (the "License"); 8 | // you may not use this file except in compliance with the License. 9 | // You may obtain a copy of the License at 10 | // http://www.apache.org/licenses/LICENSE-2.0 11 | // Unless required by applicable law or agreed to in writing, software 12 | // distributed under the License is distributed on an "AS IS" BASIS, 13 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | // See the License for the specific language governing permissions and 15 | // limitations under the License. 
16 | // 17 | /////////////////////////////////////////////////////////////////////// 18 | 19 | #include "cutil_class.h" 20 | 21 | namespace tesseract { 22 | 23 | // Destructor. 24 | // It is defined here, so the compiler can create a single vtable 25 | // instead of weak vtables in every compilation unit. 26 | CUtil::~CUtil() = default; 27 | 28 | } // namespace tesseract 29 | -------------------------------------------------------------------------------- /src/arch/dotproduct.h: -------------------------------------------------------------------------------- 1 | /////////////////////////////////////////////////////////////////////// 2 | // File: dotproduct.h 3 | // Description: Native dot product function. 4 | // 5 | // (C) Copyright 2018, Google Inc. 6 | // Licensed under the Apache License, Version 2.0 (the "License"); 7 | // you may not use this file except in compliance with the License. 8 | // You may obtain a copy of the License at 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | /////////////////////////////////////////////////////////////////////// 16 | 17 | #ifndef TESSERACT_ARCH_DOTPRODUCT_H_ 18 | #define TESSERACT_ARCH_DOTPRODUCT_H_ 19 | 20 | namespace tesseract { 21 | 22 | // Computes and returns the dot product of the n-vectors u and v. 23 | double DotProductNative(const double* u, const double* v, int n); 24 | 25 | } // namespace tesseract. 
26 | 27 | #endif // TESSERACT_ARCH_DOTPRODUCT_H_ 28 | -------------------------------------------------------------------------------- /src/arch/dotproduct.cpp: -------------------------------------------------------------------------------- 1 | /////////////////////////////////////////////////////////////////////// 2 | // File: dotproduct.cpp 3 | // Description: Native dot product function. 4 | // 5 | // (C) Copyright 2018, Google Inc. 6 | // Licensed under the Apache License, Version 2.0 (the "License"); 7 | // you may not use this file except in compliance with the License. 8 | // You may obtain a copy of the License at 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | /////////////////////////////////////////////////////////////////////// 16 | 17 | #include "dotproduct.h" 18 | 19 | namespace tesseract { 20 | 21 | // Computes and returns the dot product of the two n-vectors u and v. 22 | double DotProductNative(const double* u, const double* v, int n) { 23 | double total = 0.0; 24 | for (int k = 0; k < n; ++k) total += u[k] * v[k]; 25 | return total; 26 | } 27 | 28 | } // namespace tesseract 29 | -------------------------------------------------------------------------------- /src/ccmain/werdit.h: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | * File: werdit.h 3 | * Description: An iterator for passing over all the words in a document. 4 | * Author: Ray Smith 5 | * Created: Mon Apr 27 08:51:22 BST 1992 6 | * 7 | * (C) Copyright 1992, Hewlett-Packard Ltd. 
8 | ** Licensed under the Apache License, Version 2.0 (the "License"); 9 | ** you may not use this file except in compliance with the License. 10 | ** You may obtain a copy of the License at 11 | ** http://www.apache.org/licenses/LICENSE-2.0 12 | ** Unless required by applicable law or agreed to in writing, software 13 | ** distributed under the License is distributed on an "AS IS" BASIS, 14 | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | ** See the License for the specific language governing permissions and 16 | ** limitations under the License. 17 | * 18 | **********************************************************************/ 19 | 20 | #ifndef WERDIT_H 21 | #define WERDIT_H 22 | 23 | #include "rect.h" // for TBOX 24 | class PAGE_RES; 25 | class PAGE_RES_IT; 26 | 27 | PAGE_RES_IT* make_pseudo_word(PAGE_RES* page_res, const TBOX& selection_box); 28 | 29 | #endif 30 | -------------------------------------------------------------------------------- /unittest/cleanapi_test.cc: -------------------------------------------------------------------------------- 1 | // (C) Copyright 2017, Google Inc. 2 | // Licensed under the Apache License, Version 2.0 (the "License"); 3 | // you may not use this file except in compliance with the License. 4 | // You may obtain a copy of the License at 5 | // http://www.apache.org/licenses/LICENSE-2.0 6 | // Unless required by applicable law or agreed to in writing, software 7 | // distributed under the License is distributed on an "AS IS" BASIS, 8 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 9 | // See the License for the specific language governing permissions and 10 | // limitations under the License. 11 | 12 | #include "baseapi.h" 13 | 14 | // Dummy enum in the global namespace that checks for collision with awkward 15 | // names. 16 | // If this test fails to compile, clean up the includes in baseapi.h! 
17 | // They are not supposed to drag in definitions of any of the tesseract 18 | // types included in this enum! 19 | enum NameTester { ABORT, OKAY, LOG, BLOB, ELIST, TBOX, TPOINT, WORD }; 20 | 21 | #define ERRCODE_H // avoid redefinition of ABORT in errcode.h 22 | #include "include_gunit.h" 23 | 24 | namespace { 25 | 26 | // Verifies that the global namespace is clean. 27 | TEST(CleanNamespaceTess, DummyTest) { tesseract::TessBaseAPI api; } 28 | 29 | } // namespace. 30 | -------------------------------------------------------------------------------- /src/ccutil/tprintf.h: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | * File: tprintf.h 3 | * Description: Trace version of printf - portable between UX and NT 4 | * Author: Phil Cheatle 5 | * Created: Wed Jun 28 15:01:15 BST 1995 6 | * 7 | * (C) Copyright 1995, Hewlett-Packard Ltd. 8 | ** Licensed under the Apache License, Version 2.0 (the "License"); 9 | ** you may not use this file except in compliance with the License. 10 | ** You may obtain a copy of the License at 11 | ** http://www.apache.org/licenses/LICENSE-2.0 12 | ** Unless required by applicable law or agreed to in writing, software 13 | ** distributed under the License is distributed on an "AS IS" BASIS, 14 | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | ** See the License for the specific language governing permissions and 16 | ** limitations under the License. 17 | * 18 | **********************************************************************/ 19 | 20 | #ifndef TESSERACT_CCUTIL_TPRINTF_H 21 | #define TESSERACT_CCUTIL_TPRINTF_H 22 | 23 | #include "platform.h" // for TESS_API 24 | 25 | // Main logging function. 
26 | extern TESS_API void tprintf( // Trace printf 27 | const char *format, ...); // Message 28 | 29 | #endif // define TESSERACT_CCUTIL_TPRINTF_H 30 | -------------------------------------------------------------------------------- /src/arch/dotproductsse.h: -------------------------------------------------------------------------------- 1 | /////////////////////////////////////////////////////////////////////// 2 | // File: dotproductsse.h 3 | // Description: Architecture-specific dot-product function. 4 | // Author: Ray Smith 5 | // 6 | // (C) Copyright 2015, Google Inc. 7 | // Licensed under the Apache License, Version 2.0 (the "License"); 8 | // you may not use this file except in compliance with the License. 9 | // You may obtain a copy of the License at 10 | // http://www.apache.org/licenses/LICENSE-2.0 11 | // Unless required by applicable law or agreed to in writing, software 12 | // distributed under the License is distributed on an "AS IS" BASIS, 13 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | // See the License for the specific language governing permissions and 15 | // limitations under the License. 16 | /////////////////////////////////////////////////////////////////////// 17 | 18 | #ifndef TESSERACT_ARCH_DOTPRODUCTSSE_H_ 19 | #define TESSERACT_ARCH_DOTPRODUCTSSE_H_ 20 | 21 | namespace tesseract { 22 | 23 | // Computes and returns the dot product of the n-vectors u and v. 24 | // Uses Intel SSE intrinsics to access the SIMD instruction set. 25 | double DotProductSSE(const double* u, const double* v, int n); 26 | 27 | } // namespace tesseract. 
28 | 29 | #endif // TESSERACT_ARCH_DOTPRODUCTSSE_H_ 30 | -------------------------------------------------------------------------------- /src/cutil/cutil_class.h: -------------------------------------------------------------------------------- 1 | /////////////////////////////////////////////////////////////////////// 2 | // File: cutil_class.h 3 | // Description: cutil class. 4 | // Author: Samuel Charron 5 | // 6 | // (C) Copyright 2006, Google Inc. 7 | // Licensed under the Apache License, Version 2.0 (the "License"); 8 | // you may not use this file except in compliance with the License. 9 | // You may obtain a copy of the License at 10 | // http://www.apache.org/licenses/LICENSE-2.0 11 | // Unless required by applicable law or agreed to in writing, software 12 | // distributed under the License is distributed on an "AS IS" BASIS, 13 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | // See the License for the specific language governing permissions and 15 | // limitations under the License. 16 | // 17 | /////////////////////////////////////////////////////////////////////// 18 | 19 | #ifndef TESSERACT_CUTIL_CUTIL_CLASS_H_ 20 | #define TESSERACT_CUTIL_CUTIL_CLASS_H_ 21 | 22 | #include "ccutil.h" 23 | #include "strngs.h" 24 | 25 | namespace tesseract { 26 | 27 | class CUtil : public CCUtil { 28 | public: 29 | CUtil() = default; 30 | ~CUtil() override; 31 | void read_variables(const char *filename, bool global_only); 32 | }; 33 | 34 | } // namespace tesseract 35 | 36 | #endif // TESSERACT_CUTIL_CUTIL_CLASS_H_ 37 | -------------------------------------------------------------------------------- /src/ccutil/universalambigs.h: -------------------------------------------------------------------------------- 1 | /////////////////////////////////////////////////////////////////////// 2 | // File: universalambigs.h 3 | // Description: Data for a universal ambigs file that is useful for 4 | // any language. 
5 | // Author: Ray Smith 6 | // Created: Mon Mar 18 11:26:00 PDT 2013 7 | // 8 | // (C) Copyright 2013, Google Inc. 9 | // Licensed under the Apache License, Version 2.0 (the "License"); 10 | // you may not use this file except in compliance with the License. 11 | // You may obtain a copy of the License at 12 | // http://www.apache.org/licenses/LICENSE-2.0 13 | // Unless required by applicable law or agreed to in writing, software 14 | // distributed under the License is distributed on an "AS IS" BASIS, 15 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | // See the License for the specific language governing permissions and 17 | // limitations under the License. 18 | // 19 | /////////////////////////////////////////////////////////////////////// 20 | 21 | #ifndef TESSERACT_CCUTIL_UNIVERSALAMBIGS_H_ 22 | #define TESSERACT_CCUTIL_UNIVERSALAMBIGS_H_ 23 | 24 | namespace tesseract { 25 | 26 | extern const char kUniversalAmbigsFile[]; 27 | extern const int ksizeofUniversalAmbigsFile; 28 | 29 | } // namespace tesseract 30 | 31 | #endif // TESSERACT_CCUTIL_UNIVERSALAMBIGS_H_ 32 | -------------------------------------------------------------------------------- /src/wordrec/drawfx.h: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | * File: drawfx.h (Formerly drawfx.h) 3 | * Description: Draw things to do with feature extraction. 4 | * Author: Ray Smith 5 | * 6 | * (C) Copyright 1992, Hewlett-Packard Ltd. 7 | ** Licensed under the Apache License, Version 2.0 (the "License"); 8 | ** you may not use this file except in compliance with the License. 
9 | ** You may obtain a copy of the License at 10 | ** http://www.apache.org/licenses/LICENSE-2.0 11 | ** Unless required by applicable law or agreed to in writing, software 12 | ** distributed under the License is distributed on an "AS IS" BASIS, 13 | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | ** See the License for the specific language governing permissions and 15 | ** limitations under the License. 16 | * 17 | **********************************************************************/ 18 | 19 | #ifndef DRAWFX_H 20 | #define DRAWFX_H 21 | 22 | #include "params.h" 23 | #include "scrollview.h" 24 | 25 | extern STRING_VAR_H (fx_debugfile, DEBUG_WIN_NAME, "Name of debugfile"); 26 | extern ScrollView* fx_win; 27 | extern FILE *fx_debug; 28 | void create_fx_win(); //make features win 29 | void clear_fx_win(); //clear features win 30 | void create_fxdebug_win(); //make gradients win 31 | #endif 32 | -------------------------------------------------------------------------------- /src/api/tess_version.h.in: -------------------------------------------------------------------------------- 1 | /////////////////////////////////////////////////////////////////////// 2 | // File: tess_version.h 3 | // Description: Version information 4 | // 5 | // (C) Copyright 2018, Google Inc. 6 | // Licensed under the Apache License, Version 2.0 (the "License"); 7 | // you may not use this file except in compliance with the License. 8 | // You may obtain a copy of the License at 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 
15 | // 16 | /////////////////////////////////////////////////////////////////////// 17 | 18 | #ifndef TESSERACT_API_VERSION_H_ 19 | #define TESSERACT_API_VERSION_H_ 20 | 21 | #define TESSERACT_MAJOR_VERSION @GENERIC_MAJOR_VERSION@ 22 | #define TESSERACT_MINOR_VERSION @GENERIC_MINOR_VERSION@ 23 | #define TESSERACT_MICRO_VERSION @GENERIC_MICRO_VERSION@ 24 | #define TESSERACT_VERSION \ 25 | (TESSERACT_MAJOR_VERSION << 16 | \ 26 | TESSERACT_MINOR_VERSION << 8 | \ 27 | TESSERACT_MICRO_VERSION) 28 | #define TESSERACT_VERSION_STR "@PACKAGE_VERSION@" 29 | 30 | #endif // TESSERACT_API_VERSION_H_ 31 | -------------------------------------------------------------------------------- /src/arch/dotproductavx.h: -------------------------------------------------------------------------------- 1 | /////////////////////////////////////////////////////////////////////// 2 | // File: dotproductavx.h 3 | // Description: Architecture-specific dot-product function. 4 | // Author: Ray Smith 5 | // Created: Wed Jul 22 10:51:05 PDT 2015 6 | // 7 | // (C) Copyright 2015, Google Inc. 8 | // Licensed under the Apache License, Version 2.0 (the "License"); 9 | // you may not use this file except in compliance with the License. 10 | // You may obtain a copy of the License at 11 | // http://www.apache.org/licenses/LICENSE-2.0 12 | // Unless required by applicable law or agreed to in writing, software 13 | // distributed under the License is distributed on an "AS IS" BASIS, 14 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | // See the License for the specific language governing permissions and 16 | // limitations under the License. 17 | /////////////////////////////////////////////////////////////////////// 18 | 19 | #ifndef TESSERACT_ARCH_DOTPRODUCTAVX_H_ 20 | #define TESSERACT_ARCH_DOTPRODUCTAVX_H_ 21 | 22 | namespace tesseract { 23 | 24 | // Computes and returns the dot product of the n-vectors u and v. 
25 | // Uses Intel AVX intrinsics to access the SIMD instruction set. 26 | double DotProductAVX(const double* u, const double* v, int n); 27 | 28 | } // namespace tesseract. 29 | 30 | #endif // TESSERACT_ARCH_DOTPRODUCTAVX_H_ 31 | -------------------------------------------------------------------------------- /src/ccmain/docqual.h: -------------------------------------------------------------------------------- 1 | /****************************************************************** 2 | * File: docqual.h (Formerly docqual.h) 3 | * Description: Document Quality Metrics 4 | * Author: Phil Cheatle 5 | * Created: Mon May 9 11:27:28 BST 1994 6 | * 7 | * (C) Copyright 1994, Hewlett-Packard Ltd. 8 | ** Licensed under the Apache License, Version 2.0 (the "License"); 9 | ** you may not use this file except in compliance with the License. 10 | ** You may obtain a copy of the License at 11 | ** http://www.apache.org/licenses/LICENSE-2.0 12 | ** Unless required by applicable law or agreed to in writing, software 13 | ** distributed under the License is distributed on an "AS IS" BASIS, 14 | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | ** See the License for the specific language governing permissions and 16 | ** limitations under the License. 
17 | * 18 | **********************************************************************/ 19 | 20 | #ifndef DOCQUAL_H 21 | #define DOCQUAL_H 22 | 23 | #include <cstdint> // for int16_t 24 | 25 | class PAGE_RES_IT; 26 | class ROW; 27 | class WERD_RES; 28 | 29 | enum GARBAGE_LEVEL 30 | { 31 | G_NEVER_CRUNCH, 32 | G_OK, 33 | G_DODGY, 34 | G_TERRIBLE 35 | }; 36 | 37 | int16_t word_blob_quality(WERD_RES *word, ROW *row); 38 | void reject_whole_page(PAGE_RES_IT &page_res_it); 39 | 40 | #endif 41 | -------------------------------------------------------------------------------- /src/classify/float2int.h: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | ** Filename: float2int.h 3 | ** Purpose: Routines for converting float features to int features 4 | ** Author: Dan Johnson 5 | ** 6 | ** (c) Copyright Hewlett-Packard Company, 1988. 7 | ** Licensed under the Apache License, Version 2.0 (the "License"); 8 | ** you may not use this file except in compliance with the License. 9 | ** You may obtain a copy of the License at 10 | ** http://www.apache.org/licenses/LICENSE-2.0 11 | ** Unless required by applicable law or agreed to in writing, software 12 | ** distributed under the License is distributed on an "AS IS" BASIS, 13 | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | ** See the License for the specific language governing permissions and 15 | ** limitations under the License. 
16 | ******************************************************************************/ 17 | 18 | #ifndef FLOAT2INT_H 19 | #define FLOAT2INT_H 20 | 21 | /*----------------------------------------------------------------------------- 22 | Include Files and Type Defines 23 | -----------------------------------------------------------------------------*/ 24 | #include "intmatcher.h" 25 | #include "ocrfeatures.h" 26 | 27 | #define INT_FEAT_RANGE 256 28 | #define BASELINE_Y_SHIFT (0.25) 29 | 30 | #endif 31 | -------------------------------------------------------------------------------- /src/training/tessopt.h: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | * File: tessopt.h 3 | * Description: Re-implementation of the unix code. 4 | * Author: Ray Smith 5 | * 6 | * (C) Copyright 1995, Hewlett-Packard Co. 7 | ** Licensed under the Apache License, Version 2.0 (the "License"); 8 | ** you may not use this file except in compliance with the License. 9 | ** You may obtain a copy of the License at 10 | ** http://www.apache.org/licenses/LICENSE-2.0 11 | ** Unless required by applicable law or agreed to in writing, software 12 | ** distributed under the License is distributed on an "AS IS" BASIS, 13 | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | ** See the License for the specific language governing permissions and 15 | ** limitations under the License. 
16 | * 17 | **********************************************************************/ 18 | 19 | #ifndef TESSERACT_TRAINING_TESSOPT_H_ 20 | #define TESSERACT_TRAINING_TESSOPT_H_ 21 | 22 | #include <cstdint> // for int32_t 23 | 24 | extern int tessoptind; 25 | extern char *tessoptarg; 26 | 27 | int tessopt ( //parse args 28 | int32_t argc, //arg count 29 | char *argv[], //args 30 | const char *arglist //string of arg chars 31 | ); 32 | 33 | #endif // TESSERACT_TRAINING_TESSOPT_H_ 34 | -------------------------------------------------------------------------------- /src/ccmain/output.h: -------------------------------------------------------------------------------- 1 | /****************************************************************** 2 | * File: output.h (Formerly output.h) 3 | * Description: Output pass 4 | * Author: Phil Cheatle 5 | * Created: Thu Aug 4 10:56:08 BST 1994 6 | * 7 | * (C) Copyright 1994, Hewlett-Packard Ltd. 8 | ** Licensed under the Apache License, Version 2.0 (the "License"); 9 | ** you may not use this file except in compliance with the License. 10 | ** You may obtain a copy of the License at 11 | ** http://www.apache.org/licenses/LICENSE-2.0 12 | ** Unless required by applicable law or agreed to in writing, software 13 | ** distributed under the License is distributed on an "AS IS" BASIS, 14 | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | ** See the License for the specific language governing permissions and 16 | ** limitations under the License. 
17 | * 18 | **********************************************************************/ 19 | 20 | #ifndef OUTPUT_H 21 | #define OUTPUT_H 22 | 23 | class BLOCK; 24 | class WERD; 25 | 26 | /** test line ends */ 27 | char determine_newline_type(WERD *word, ///< word to do 28 | BLOCK *block, ///< current block 29 | WERD *next_word, ///< next word 30 | BLOCK *next_block ///< block of next word 31 | ); 32 | 33 | #endif 34 | -------------------------------------------------------------------------------- /src/ccutil/basedir.h: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | * File: basedir.h (Formerly getpath.h) 3 | * Description: Header file for getpath.c. Provides relocatability of data. 4 | * Author: Ray Smith 5 | * Created: Mon Jul 09 09:13:03 BST 1990 6 | * 7 | * (C) Copyright 1990, Hewlett-Packard Ltd. 8 | ** Licensed under the Apache License, Version 2.0 (the "License"); 9 | ** you may not use this file except in compliance with the License. 10 | ** You may obtain a copy of the License at 11 | ** http://www.apache.org/licenses/LICENSE-2.0 12 | ** Unless required by applicable law or agreed to in writing, software 13 | ** distributed under the License is distributed on an "AS IS" BASIS, 14 | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | ** See the License for the specific language governing permissions and 16 | ** limitations under the License. 17 | * 18 | **********************************************************************/ 19 | 20 | #ifndef BASEDIR_H 21 | #define BASEDIR_H 22 | 23 | #include "platform.h" 24 | #include "strngs.h" 25 | 26 | // Returns the given code_path truncated to the last slash. 27 | // Useful for getting to the directory of argv[0], but does not search 28 | // any paths. 
29 | TESS_API void truncate_path(const char *code_path, STRING* trunc_path); 30 | 31 | #endif 32 | -------------------------------------------------------------------------------- /src/ccmain/fixspace.h: -------------------------------------------------------------------------------- 1 | /****************************************************************** 2 | * File: fixspace.h (Formerly fixspace.h) 3 | * Description: Implements a pass over the page res, exploring the alternative 4 | * spacing possibilities, trying to use context to improve the 5 | * word spacing 6 | * Author: Phil Cheatle 7 | * Created: Thu Oct 21 11:38:43 BST 1993 8 | * 9 | * (C) Copyright 1993, Hewlett-Packard Ltd. 10 | ** Licensed under the Apache License, Version 2.0 (the "License"); 11 | ** you may not use this file except in compliance with the License. 12 | ** You may obtain a copy of the License at 13 | ** http://www.apache.org/licenses/LICENSE-2.0 14 | ** Unless required by applicable law or agreed to in writing, software 15 | ** distributed under the License is distributed on an "AS IS" BASIS, 16 | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | ** See the License for the specific language governing permissions and 18 | ** limitations under the License. 19 | * 20 | **********************************************************************/ 21 | 22 | #ifndef FIXSPACE_H 23 | #define FIXSPACE_H 24 | 25 | class WERD_RES; 26 | class WERD_RES_LIST; 27 | 28 | void initialise_search(WERD_RES_LIST &src_list, WERD_RES_LIST &new_list); 29 | void transform_to_next_perm(WERD_RES_LIST &words); 30 | void fixspace_dbg(WERD_RES *word); 31 | 32 | #endif 33 | -------------------------------------------------------------------------------- /doc/generate_manpages.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # File: generate_manpages.sh 4 | # Description: Converts .asc files into man pages, etc. for Tesseract. 
5 | # Author: eger@google.com (David Eger) 6 | # Created: 9 Feb 2012 7 | # 8 | # (C) Copyright 2012 Google Inc. 9 | # Licensed under the Apache License, Version 2.0 (the "License"); 10 | # you may not use this file except in compliance with the License. 11 | # You may obtain a copy of the License at 12 | # http://www.apache.org/licenses/LICENSE-2.0 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | man_xslt=http://docbook.sourceforge.net/release/xsl/current/manpages/docbook.xsl 20 | # Locate the required tools; command -v prints nothing when a tool is missing. 21 | asciidoc=$(command -v asciidoc) 22 | xsltproc=$(command -v xsltproc) 23 | if [[ -z "${asciidoc}" ]] || [[ -z "${xsltproc}" ]]; then 24 | echo "Please make sure asciidoc and xsltproc are installed." 25 | exit 1 26 | else 27 | for src in *.asc; do 28 | # With no .asc files the glob stays literal; skip that non-existent name. 29 | [[ -e "${src}" ]] || continue 30 | # Strip only the trailing ".asc" suffix to get the page name. 31 | pagename=${src%.asc} 32 | # Build the HTML page, then the DocBook XML, then the man page from the XML. 33 | ("${asciidoc}" -d manpage "${src}" && 34 | "${asciidoc}" -d manpage -b docbook "${src}" && 35 | "${xsltproc}" --nonet "${man_xslt}" "${pagename}.xml") || 36 | echo "Error generating ${pagename}" 37 | done 38 | fi 39 | # Best effort: a failed page is reported above but does not fail the script. 40 | exit 0 41 | -------------------------------------------------------------------------------- /m4/ax_split_version.m4: -------------------------------------------------------------------------------- 1 | # =========================================================================== 2 | # https://www.gnu.org/software/autoconf-archive/ax_split_version.html 3 | # =========================================================================== 4 | # 5 | # SYNOPSIS 6 | # 7 | # AX_SPLIT_VERSION 8 | # 9 | # DESCRIPTION 10 | # 11 | # Splits a version number in the format MAJOR.MINOR.POINT into its 12 | # separate components. 13 | # 14 | # Sets the variables. 
15 | # 16 | # LICENSE 17 | # 18 | # Copyright (c) 2008 Tom Howard 19 | # 20 | # Copying and distribution of this file, with or without modification, are 21 | # permitted in any medium without royalty provided the copyright notice 22 | # and this notice are preserved. This file is offered as-is, without any 23 | # warranty. 24 | 25 | #serial 10 26 | 27 | AC_DEFUN([AX_SPLIT_VERSION],[ 28 | AC_REQUIRE([AC_PROG_SED]) 29 | AX_MAJOR_VERSION=`echo "$VERSION" | $SED 's/\([[^.]][[^.]]*\).*/\1/'` 30 | AX_MINOR_VERSION=`echo "$VERSION" | $SED 's/[[^.]][[^.]]*.\([[^.]][[^.]]*\).*/\1/'` 31 | AX_POINT_VERSION=`echo "$VERSION" | $SED 's/[[^.]][[^.]]*.[[^.]][[^.]]*.\(.*\)/\1/'` 32 | AC_MSG_CHECKING([Major version]) 33 | AC_MSG_RESULT([$AX_MAJOR_VERSION]) 34 | AC_MSG_CHECKING([Minor version]) 35 | AC_MSG_RESULT([$AX_MINOR_VERSION]) 36 | AC_MSG_CHECKING([Point version]) 37 | AC_MSG_RESULT([$AX_POINT_VERSION]) 38 | ]) 39 | -------------------------------------------------------------------------------- /src/ccutil/Makefile.am: -------------------------------------------------------------------------------- 1 | AM_CXXFLAGS = 2 | AM_CPPFLAGS = 3 | 4 | if !NO_TESSDATA_PREFIX 5 | AM_CXXFLAGS += -DTESSDATA_PREFIX=@datadir@ 6 | endif 7 | 8 | if VISIBILITY 9 | AM_CXXFLAGS += -fvisibility=hidden -fvisibility-inlines-hidden 10 | AM_CPPFLAGS += -DTESS_EXPORTS 11 | endif 12 | 13 | pkginclude_HEADERS = \ 14 | genericvector.h helpers.h host.h \ 15 | ocrclass.h platform.h serialis.h strngs.h \ 16 | tesscallback.h unichar.h 17 | 18 | noinst_HEADERS = \ 19 | ambigs.h basedir.h bits16.h bitvector.h ccutil.h clst.h doubleptr.h elst2.h \ 20 | elst.h errcode.h fileerr.h fileio.h \ 21 | genericheap.h globaloc.h \ 22 | indexmapbidi.h kdpair.h lsterr.h \ 23 | object_cache.h params.h qrsequence.h sorthelper.h \ 24 | scanutils.h tessdatamanager.h tprintf.h \ 25 | unicharcompress.h unicharmap.h unicharset.h unicity_table.h unicodes.h \ 26 | universalambigs.h 27 | 28 | noinst_LTLIBRARIES = 
libtesseract_ccutil.la 29 | 30 | libtesseract_ccutil_la_SOURCES = \ 31 | ambigs.cpp basedir.cpp bitvector.cpp \ 32 | ccutil.cpp clst.cpp \ 33 | elst2.cpp elst.cpp errcode.cpp \ 34 | fileio.cpp \ 35 | globaloc.cpp indexmapbidi.cpp \ 36 | mainblk.cpp \ 37 | serialis.cpp strngs.cpp scanutils.cpp \ 38 | tessdatamanager.cpp tprintf.cpp \ 39 | unichar.cpp unicharcompress.cpp unicharmap.cpp unicharset.cpp unicodes.cpp \ 40 | params.cpp universalambigs.cpp 41 | 42 | AM_CPPFLAGS += $(libarchive_CFLAGS) 43 | -------------------------------------------------------------------------------- /unittest/lstm_squashed_test.cc: -------------------------------------------------------------------------------- 1 | // (C) Copyright 2017, Google Inc. 2 | // Licensed under the Apache License, Version 2.0 (the "License"); 3 | // you may not use this file except in compliance with the License. 4 | // You may obtain a copy of the License at 5 | // http://www.apache.org/licenses/LICENSE-2.0 6 | // Unless required by applicable law or agreed to in writing, software 7 | // distributed under the License is distributed on an "AS IS" BASIS, 8 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 9 | // See the License for the specific language governing permissions and 10 | // limitations under the License. 11 | 12 | #include "lstm_test.h" 13 | 14 | namespace tesseract { 15 | 16 | // Tests that a Squashed network learns correctly. 17 | // Almost as fast as the 2d-lstm. 18 | TEST_F(LSTMTrainerTest, TestSquashed) { 19 | // A 2-layer LSTM with a Squashed feature-extracting LSTM on the bottom, and 20 | // a small convolution/maxpool below that. 21 | // Match training conditions to those typically used with this spec: 22 | // recoding on, adam on. 
23 | SetupTrainerEng("[1,32,0,1 Ct3,3,16 Mp3,3 Lfys48 Lbx96 O1c1]", 24 | "SQU-2-layer-lstm", /*recode*/ true, /*adam*/ true); 25 | double lstm_2d_err = TrainIterations(kTrainerIterations * 3 / 2); 26 | EXPECT_LT(lstm_2d_err, 80); 27 | LOG(INFO) << "********** < 80 ************\n" ; 28 | TestIntMode(kTrainerIterations); 29 | } 30 | 31 | } // namespace tesseract. 32 | -------------------------------------------------------------------------------- /src/training/validate_grapheme.h: -------------------------------------------------------------------------------- 1 | #ifndef TESSERACT_TRAINING_VALIDATE_GRAPHEME_H_ 2 | #define TESSERACT_TRAINING_VALIDATE_GRAPHEME_H_ 3 | 4 | #include "validator.h" 5 | 6 | namespace tesseract { 7 | 8 | // Subclass of Validator that validates and segments generic unicode into 9 | // grapheme clusters, including Latin with diacritics. 10 | class ValidateGrapheme : public Validator { 11 | public: 12 | ValidateGrapheme(ViramaScript script, bool report_errors) 13 | : Validator(script, report_errors) {} 14 | ~ValidateGrapheme() {} 15 | 16 | protected: 17 | // Consumes the next Grapheme in codes_[codes_used_++...] and copies it to 18 | // parts_ and output_. Returns true if a valid Grapheme was consumed, 19 | // otherwise does not increment codes_used_. 20 | bool ConsumeGraphemeIfValid() override; 21 | // Returns the CharClass corresponding to the given Unicode ch. 22 | CharClass UnicodeToCharClass(char32 ch) const override; 23 | 24 | private: 25 | // Helper returns true if the sequence prev_ch,ch is invalid. 26 | bool IsBadlyFormed(char32 prev_ch, char32 ch); 27 | // Helper returns true if the sequence prev_ch,ch is an invalid Indic vowel. 28 | static bool IsBadlyFormedIndicVowel(char32 prev_ch, char32 ch); 29 | // Helper returns true if the sequence prev_ch,ch is invalid Thai. 
30 | static bool IsBadlyFormedThai(char32 prev_ch, char32 ch); 31 | }; 32 | 33 | } // namespace tesseract 34 | 35 | #endif // TESSERACT_TRAINING_VALIDATE_GRAPHEME_H_ 36 | -------------------------------------------------------------------------------- /src/ccutil/globaloc.h: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | * File: globaloc.h (Formerly error.h) 3 | * Description: Header file for generic error handler class 4 | * Author: Ray Smith 5 | * 6 | * (C) Copyright 1990, Hewlett-Packard Ltd. 7 | ** Licensed under the Apache License, Version 2.0 (the "License"); 8 | ** you may not use this file except in compliance with the License. 9 | ** You may obtain a copy of the License at 10 | ** http://www.apache.org/licenses/LICENSE-2.0 11 | ** Unless required by applicable law or agreed to in writing, software 12 | ** distributed under the License is distributed on an "AS IS" BASIS, 13 | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | ** See the License for the specific language governing permissions and 15 | ** limitations under the License. 16 | * 17 | **********************************************************************/ 18 | 19 | #ifndef GLOBALOC_H 20 | #define GLOBALOC_H 21 | 22 | // Saves a clone of the given pix, and notes its resolution in thread-specific 23 | // data, so that the image can be written prior to a crash. 
24 | struct Pix; 25 | void SavePixForCrash(int resolution, Pix* pix); 26 | 27 | void signal_exit(int signal_code); 28 | 29 | void err_exit(); 30 | 31 | void set_global_loc_code(int loc_code); 32 | 33 | void set_global_subloc_code(int loc_code); 34 | 35 | void set_global_subsubloc_code(int loc_code); 36 | 37 | #endif 38 | -------------------------------------------------------------------------------- /src/ccmain/reject.h: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | * File: reject.h 3 | * Description: Rejection functions used in tessedit 4 | * Author: Phil Cheatle 5 | * Created: Wed Sep 23 16:50:21 BST 1992 6 | * 7 | * (C) Copyright 1992, Hewlett-Packard Ltd. 8 | ** Licensed under the Apache License, Version 2.0 (the "License"); 9 | ** you may not use this file except in compliance with the License. 10 | ** You may obtain a copy of the License at 11 | ** http://www.apache.org/licenses/LICENSE-2.0 12 | ** Unless required by applicable law or agreed to in writing, software 13 | ** distributed under the License is distributed on an "AS IS" BASIS, 14 | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | ** See the License for the specific language governing permissions and 16 | ** limitations under the License. 
17 | * 18 | **********************************************************************/ 19 | 20 | #ifndef REJECT_H 21 | #define REJECT_H 22 | 23 | class WERD_CHOICE; 24 | class WERD_RES; 25 | 26 | void reject_blanks(WERD_RES *word); 27 | void reject_poor_matches(WERD_RES *word); 28 | float compute_reject_threshold(WERD_CHOICE* word); 29 | bool word_contains_non_1_digit(const char* word, const char* word_lengths); 30 | void dont_allow_1Il(WERD_RES *word); 31 | void flip_hyphens(WERD_RES *word); 32 | void flip_0O(WERD_RES *word); 33 | bool non_0_digit(const char* str, int length); 34 | 35 | #endif 36 | -------------------------------------------------------------------------------- /doc/merge_unicharsets.1.asc: -------------------------------------------------------------------------------- 1 | MERGE_UNICHARSETS(1) 2 | ==================== 3 | :doctype: manpage 4 | 5 | NAME 6 | ---- 7 | merge_unicharsets - Simple tool to merge two or more unicharsets. 8 | 9 | SYNOPSIS 10 | -------- 11 | *merge_unicharsets* 'unicharset-in-1' ... 'unicharset-in-n' 'unicharset-out' 12 | 13 | DESCRIPTION 14 | ----------- 15 | merge_unicharsets(1) is a simple tool to merge two or more unicharsets. 16 | It could be used to create a combined unicharset for a script-level engine, 17 | like the new Latin or Devanagari. 18 | 19 | IN/OUT ARGUMENTS 20 | ---------------- 21 | 'unicharset-in-1':: 22 | (Input) The name of the first unicharset file to be merged. 23 | 24 | 'unicharset-in-n':: 25 | (Input) The name of the nth unicharset file to be merged. 26 | 27 | 'unicharset-out':: 28 | (Output) The name of the merged unicharset file. 29 | 30 | HISTORY 31 | ------- 32 | merge_unicharsets(1) was first made available for tesseract 4.00.00alpha. 33 | 34 | RESOURCES 35 | --------- 36 | Main web site: <https://github.com/tesseract-ocr> + 37 | Information on training tesseract LSTM: <https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract-4.00> 38 | 39 | SEE ALSO 40 | -------- 41 | tesseract(1) 42 | 43 | COPYING 44 | ------- 45 | Copyright \(C) 2012 Google, Inc. 
46 | Licensed under the Apache License, Version 2.0 47 | 48 | AUTHOR 49 | ------ 50 | The Tesseract OCR engine was written by Ray Smith and his research groups 51 | at Hewlett Packard (1985-1995) and Google (2006-present). 52 | -------------------------------------------------------------------------------- /src/api/apitypes.h: -------------------------------------------------------------------------------- 1 | /////////////////////////////////////////////////////////////////////// 2 | // File: apitypes.h 3 | // Description: Types used in both the API and internally 4 | // Author: Ray Smith 5 | // Created: Wed Mar 03 09:22:53 PST 2010 6 | // 7 | // (C) Copyright 2010, Google Inc. 8 | // Licensed under the Apache License, Version 2.0 (the "License"); 9 | // you may not use this file except in compliance with the License. 10 | // You may obtain a copy of the License at 11 | // http://www.apache.org/licenses/LICENSE-2.0 12 | // Unless required by applicable law or agreed to in writing, software 13 | // distributed under the License is distributed on an "AS IS" BASIS, 14 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | // See the License for the specific language governing permissions and 16 | // limitations under the License. 17 | // 18 | /////////////////////////////////////////////////////////////////////// 19 | 20 | #ifndef TESSERACT_API_APITYPES_H_ 21 | #define TESSERACT_API_APITYPES_H_ 22 | 23 | #include "publictypes.h" 24 | 25 | // The types used by the API and Page/ResultIterator can be found in: 26 | // ccstruct/publictypes.h 27 | // ccmain/resultiterator.h 28 | // ccmain/pageiterator.h 29 | // API interfaces and API users should be sure to include this file, rather 30 | // than the lower-level one, and lower-level code should be sure to include 31 | // only the lower-level file. 
32 | 33 | #endif // TESSERACT_API_APITYPES_H_ 34 | -------------------------------------------------------------------------------- /src/ccstruct/crakedge.h: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | * File: crakedge.h (Formerly: crkedge.h) 3 | * Description: Structures for the Crack following edge detector. 4 | * Author: Ray Smith 5 | * Created: Fri Mar 22 16:06:38 GMT 1991 6 | * 7 | * (C) Copyright 1991, Hewlett-Packard Ltd. 8 | ** Licensed under the Apache License, Version 2.0 (the "License"); 9 | ** you may not use this file except in compliance with the License. 10 | ** You may obtain a copy of the License at 11 | ** http://www.apache.org/licenses/LICENSE-2.0 12 | ** Unless required by applicable law or agreed to in writing, software 13 | ** distributed under the License is distributed on an "AS IS" BASIS, 14 | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | ** See the License for the specific language governing permissions and 16 | ** limitations under the License. 
17 | * 18 | **********************************************************************/ 19 | 20 | #ifndef CRAKEDGE_H 21 | #define CRAKEDGE_H 22 | 23 | #include "points.h" 24 | #include "mod128.h" 25 | 26 | class CRACKEDGE { 27 | public: 28 | CRACKEDGE() = default; 29 | 30 | ICOORD pos; /*position of crack */ 31 | int8_t stepx; //edge step 32 | int8_t stepy; 33 | int8_t stepdir; //chaincode 34 | CRACKEDGE *prev; /*previous point */ 35 | CRACKEDGE *next; /*next point */ 36 | }; 37 | #endif 38 | -------------------------------------------------------------------------------- /src/lstm/Makefile.am: -------------------------------------------------------------------------------- 1 | AM_CPPFLAGS += \ 2 | -I$(top_srcdir)/src/ccutil \ 3 | -I$(top_srcdir)/src/cutil \ 4 | -I$(top_srcdir)/src/ccstruct \ 5 | -I$(top_srcdir)/src/arch \ 6 | -I$(top_srcdir)/src/viewer \ 7 | -I$(top_srcdir)/src/classify \ 8 | -I$(top_srcdir)/src/dict \ 9 | -I$(top_srcdir)/src/lstm 10 | 11 | AM_CXXFLAGS = $(OPENMP_CXXFLAGS) 12 | 13 | if !NO_TESSDATA_PREFIX 14 | AM_CXXFLAGS += -DTESSDATA_PREFIX=@datadir@ 15 | endif 16 | 17 | if VISIBILITY 18 | AM_CXXFLAGS += -fvisibility=hidden -fvisibility-inlines-hidden 19 | AM_CPPFLAGS += -DTESS_EXPORTS 20 | endif 21 | 22 | pkginclude_HEADERS = 23 | 24 | noinst_HEADERS = convolve.h ctc.h 25 | noinst_HEADERS += fullyconnected.h functions.h input.h 26 | noinst_HEADERS += lstm.h lstmrecognizer.h lstmtrainer.h maxpool.h 27 | noinst_HEADERS += network.h networkbuilder.h networkio.h networkscratch.h 28 | noinst_HEADERS += parallel.h plumbing.h recodebeam.h reconfig.h reversed.h 29 | noinst_HEADERS += series.h static_shape.h stridemap.h 30 | noinst_HEADERS += tfnetwork.h weightmatrix.h 31 | 32 | noinst_LTLIBRARIES = libtesseract_lstm.la 33 | 34 | libtesseract_lstm_la_SOURCES = \ 35 | convolve.cpp ctc.cpp fullyconnected.cpp functions.cpp input.cpp \ 36 | lstm.cpp lstmrecognizer.cpp lstmtrainer.cpp maxpool.cpp \ 37 | networkbuilder.cpp network.cpp networkio.cpp \ 38 | 
parallel.cpp plumbing.cpp recodebeam.cpp reconfig.cpp reversed.cpp \ 39 | series.cpp stridemap.cpp tfnetwork.cpp weightmatrix.cpp 40 | -------------------------------------------------------------------------------- /snap/snapcraft.yaml: -------------------------------------------------------------------------------- 1 | name: tesseract 2 | version: git 3 | summary: open source optical character recognition engine 4 | description: | 5 | Tesseract has unicode (UTF-8) support, and can recognize more than 100 6 | languages "out of the box". It can be trained to recognize other languages. 7 | Tesseract supports various output formats: plain-text, hocr(html), pdf. 8 | 9 | If you want to access the files under /media/* or /run/media/* you'll have 10 | to connect the snap to the `core` snap's `removable-media` interface: 11 | 12 | $ sudo snap connect tesseract:removable-media 13 | 14 | grade: stable # must be 'stable' to release into candidate/stable channels 15 | confinement: strict 16 | 17 | apps: 18 | tesseract: 19 | command: > 20 | env 21 | TESSDATA_PREFIX=$SNAP_USER_COMMON 22 | tesseract 23 | plugs: 24 | - home 25 | - removable-media 26 | 27 | parts: 28 | tesseract: 29 | source: . 
30 | plugin: autotools 31 | build-packages: 32 | - pkg-config 33 | - libpng12-dev 34 | - libjpeg8-dev 35 | - libtiff5-dev 36 | - zlib1g-dev 37 | - libicu-dev 38 | - libpango1.0-dev 39 | - libcairo2-dev 40 | stage-packages: 41 | - libgomp1 42 | after: [leptonica] 43 | leptonica: 44 | source: https://github.com/DanBloomberg/leptonica/archive/1.74.2.tar.gz 45 | plugin: autotools 46 | stage-packages: 47 | - libjbig0 48 | - libjpeg-turbo8 49 | - libopenjp2-7 50 | - libtiff5 51 | -------------------------------------------------------------------------------- /src/ccstruct/publictypes.cpp: -------------------------------------------------------------------------------- 1 | /////////////////////////////////////////////////////////////////////// 2 | // File: publictypes.cpp 3 | // Description: Types used in both the API and internally 4 | // Author: Ray Smith 5 | // Created: Wed Mar 03 11:17:09 PST 2010 6 | // 7 | // (C) Copyright 2010, Google Inc. 8 | // Licensed under the Apache License, Version 2.0 (the "License"); 9 | // you may not use this file except in compliance with the License. 10 | // You may obtain a copy of the License at 11 | // http://www.apache.org/licenses/LICENSE-2.0 12 | // Unless required by applicable law or agreed to in writing, software 13 | // distributed under the License is distributed on an "AS IS" BASIS, 14 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | // See the License for the specific language governing permissions and 16 | // limitations under the License. 17 | // 18 | /////////////////////////////////////////////////////////////////////// 19 | 20 | #include "publictypes.h" 21 | 22 | /** String name for each block type. Keep in sync with PolyBlockType. 
*/ 23 | const char* kPolyBlockNames[] = { 24 | "Unknown", 25 | "Flowing Text", 26 | "Heading Text", 27 | "Pullout Text", 28 | "Equation", 29 | "Inline Equation", 30 | "Table", 31 | "Vertical Text", 32 | "Caption Text", 33 | "Flowing Image", 34 | "Heading Image", 35 | "Pullout Image", 36 | "Horizontal Line", 37 | "Vertical Line", 38 | "Noise", 39 | "" // End marker for testing that sizes match. 40 | }; 41 | -------------------------------------------------------------------------------- /tessdata/Makefile.am: -------------------------------------------------------------------------------- 1 | datadir = @datadir@/tessdata 2 | 3 | data_DATA = pdf.ttf 4 | EXTRA_DIST = $(data_DATA) 5 | 6 | SUBDIRS = configs tessconfigs 7 | 8 | langdata = bul.traineddata mlt.traineddata chr.traineddata \ 9 | slk.traineddata dan-frak.traineddata eng.traineddata \ 10 | ces.traineddata afr.traineddata swa.traineddata \ 11 | kan.traineddata bel.traineddata ind.traineddata \ 12 | lit.traineddata nld.traineddata osd.traineddata \ 13 | mkd.traineddata est.traineddata fra.traineddata \ 14 | hin.traineddata lat_lid.traineddata nor.traineddata \ 15 | por.traineddata ron.traineddata swe.traineddata \ 16 | pol.traineddata ara.traineddata tel.traineddata \ 17 | ell.traineddata mal.traineddata vie.traineddata \ 18 | heb.traineddata deu.traineddata eus.traineddata \ 19 | ita_old.traineddata rus.traineddata sqi.traineddata \ 20 | spa.traineddata glg.traineddata slk-frak.traineddata \ 21 | equ.traineddata hrv.traineddata frk.traineddata \ 22 | cat.traineddata lav.traineddata ukr.traineddata \ 23 | enm.traineddata dan.traineddata fin.traineddata \ 24 | ben.traineddata srp.traineddata tha.traineddata \ 25 | hun.traineddata tgl.traineddata frm.traineddata \ 26 | slv.traineddata chi_sim.traineddata tam.traineddata \ 27 | tur.traineddata epo.traineddata msa.traineddata \ 28 | kor.traineddata isl.traineddata jpn.traineddata \ 29 | chi_tra.traineddata ita.traineddata spa_old.traineddata \ 30 | 
deu-frak.traineddata aze.traineddata 31 | 32 | uninstall-local: 33 | cd $(DESTDIR)$(datadir); \ 34 | rm -f $(langdata) 35 | -------------------------------------------------------------------------------- /src/wordrec/Makefile.am: -------------------------------------------------------------------------------- 1 | AM_CPPFLAGS += \ 2 | -I$(top_srcdir)/src/ccstruct \ 3 | -I$(top_srcdir)/src/ccutil \ 4 | -I$(top_srcdir)/src/cutil \ 5 | -I$(top_srcdir)/src/classify \ 6 | -I$(top_srcdir)/src/dict \ 7 | -I$(top_srcdir)/src/viewer 8 | 9 | if DISABLED_LEGACY_ENGINE 10 | AM_CPPFLAGS += -DDISABLED_LEGACY_ENGINE 11 | endif 12 | 13 | if VISIBILITY 14 | AM_CPPFLAGS += -DTESS_EXPORTS \ 15 | -fvisibility=hidden -fvisibility-inlines-hidden 16 | endif 17 | 18 | noinst_HEADERS = \ 19 | wordrec.h 20 | 21 | if !DISABLED_LEGACY_ENGINE 22 | noinst_HEADERS += \ 23 | associate.h \ 24 | chop.h \ 25 | drawfx.h \ 26 | findseam.h \ 27 | language_model.h \ 28 | lm_consistency.h \ 29 | lm_pain_points.h \ 30 | lm_state.h \ 31 | measure.h \ 32 | outlines.h \ 33 | params_model.h \ 34 | plotedges.h \ 35 | render.h 36 | endif 37 | 38 | noinst_LTLIBRARIES = libtesseract_wordrec.la 39 | 40 | libtesseract_wordrec_la_SOURCES = \ 41 | tface.cpp \ 42 | wordrec.cpp 43 | 44 | if !DISABLED_LEGACY_ENGINE 45 | libtesseract_wordrec_la_SOURCES += \ 46 | associate.cpp \ 47 | chop.cpp \ 48 | chopper.cpp \ 49 | drawfx.cpp \ 50 | findseam.cpp \ 51 | gradechop.cpp \ 52 | language_model.cpp \ 53 | lm_consistency.cpp \ 54 | lm_pain_points.cpp \ 55 | lm_state.cpp \ 56 | outlines.cpp \ 57 | params_model.cpp \ 58 | pieces.cpp \ 59 | plotedges.cpp \ 60 | render.cpp \ 61 | segsearch.cpp \ 62 | wordclass.cpp 63 | endif 64 | -------------------------------------------------------------------------------- /src/cutil/structures.cpp: -------------------------------------------------------------------------------- 1 | /* -*-C-*- 2 | ******************************************************************************** 3 | * 4 | * File: 
structures.cpp (Formerly structures.c) 5 | * Description: Allocate all the different types of structures. 6 | * Author: Mark Seaman, OCR Technology 7 | * 8 | * (c) Copyright 1990, Hewlett-Packard Company. 9 | ** Licensed under the Apache License, Version 2.0 (the "License"); 10 | ** you may not use this file except in compliance with the License. 11 | ** You may obtain a copy of the License at 12 | ** http://www.apache.org/licenses/LICENSE-2.0 13 | ** Unless required by applicable law or agreed to in writing, software 14 | ** distributed under the License is distributed on an "AS IS" BASIS, 15 | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | ** See the License for the specific language governing permissions and 17 | ** limitations under the License. 18 | * 19 | *********************************************************************************/ 20 | /*---------------------------------------------------------------------- 21 | I n c l u d e s 22 | ----------------------------------------------------------------------*/ 23 | #include "structures.h" 24 | 25 | #include 26 | 27 | 28 | /*---------------------------------------------------------------------- 29 | F u n c t i o n s 30 | ----------------------------------------------------------------------*/ 31 | makestructure(new_cell, free_cell, list_rec) 32 | -------------------------------------------------------------------------------- /src/ccmain/control.h: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | * File: control.h (Formerly control.h) 3 | * Description: Module-independent matcher controller. 4 | * Author: Ray Smith 5 | * Created: Thu Apr 23 11:09:58 BST 1992 6 | * 7 | * (C) Copyright 1992, Hewlett-Packard Ltd. 8 | ** Licensed under the Apache License, Version 2.0 (the "License"); 9 | ** you may not use this file except in compliance with the License. 
10 | ** You may obtain a copy of the License at 11 | ** http://www.apache.org/licenses/LICENSE-2.0 12 | ** Unless required by applicable law or agreed to in writing, software 13 | ** distributed under the License is distributed on an "AS IS" BASIS, 14 | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | ** See the License for the specific language governing permissions and 16 | ** limitations under the License. 17 | * 18 | **********************************************************************/ 19 | 20 | /** 21 | * @file control.h 22 | * Module-independent matcher controller. 23 | */ 24 | 25 | #ifndef CONTROL_H 26 | #define CONTROL_H 27 | 28 | enum ACCEPTABLE_WERD_TYPE 29 | { 30 | AC_UNACCEPTABLE, ///< Unacceptable word 31 | AC_LOWER_CASE, ///< ALL lower case 32 | AC_UPPER_CASE, ///< ALL upper case 33 | AC_INITIAL_CAP, ///< ALL but initial lc 34 | AC_LC_ABBREV, ///< a.b.c. 35 | AC_UC_ABBREV ///< A.B.C. 36 | }; 37 | 38 | #endif 39 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # Travis CI configuration for Tesseract 2 | 3 | language: cpp 4 | 5 | dist: trusty 6 | 7 | env: 8 | - LEPT_VER=1.77.0 9 | 10 | notifications: 11 | email: false 12 | 13 | sudo: false 14 | 15 | os: 16 | - linux 17 | - osx 18 | 19 | addons: 20 | apt: 21 | sources: 22 | #- ubuntu-toolchain-r-test 23 | packages: 24 | - libarchive-dev 25 | #- g++-6 26 | 27 | #matrix: 28 | #include: 29 | #- os: osx 30 | #install: 31 | #script: brew install tesseract --HEAD 32 | #cache: 33 | #directories: 34 | #- $HOME/Library/Caches/Homebrew 35 | #allow_failures: 36 | #- script: brew install tesseract --HEAD 37 | 38 | cache: 39 | directories: 40 | - leptonica-$LEPT_VER 41 | 42 | before_install: 43 | - if [[ $TRAVIS_OS_NAME == linux ]]; then LINUX=true; fi 44 | - if [[ $TRAVIS_OS_NAME == osx ]]; then OSX=true; fi 45 | 46 | install: 47 | #- if [[ $LINUX && 
"$CXX" = "g++" ]]; then export CXX="g++-6" CC="gcc-6"; fi 48 | - if test ! -d leptonica-$LEPT_VER/src; then curl -Ls https://github.com/DanBloomberg/leptonica/archive/$LEPT_VER.tar.gz | tar -xz; fi 49 | - if test ! -d leptonica-$LEPT_VER/usr; then cmake -Hleptonica-$LEPT_VER -Bleptonica-$LEPT_VER/build -DCMAKE_INSTALL_PREFIX=leptonica-$LEPT_VER/usr; fi 50 | - if test ! -e leptonica-$LEPT_VER/usr/lib/libleptonica.so; then make -C leptonica-$LEPT_VER/build install; fi 51 | 52 | script: 53 | - mkdir build 54 | - cd build 55 | - cmake .. -DLeptonica_DIR=leptonica-$LEPT_VER/build -DCPPAN_BUILD=OFF 56 | - make 57 | -------------------------------------------------------------------------------- /src/ccutil/fileerr.h: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | * File: fileerr.h (Formerly filerr.h) 3 | * Description: Errors for file utilities. 4 | * Author: Ray Smith 5 | * Created: Tue Aug 14 15:45:16 BST 1990 6 | * 7 | * (C) Copyright 1990, Hewlett-Packard Ltd. 8 | ** Licensed under the Apache License, Version 2.0 (the "License"); 9 | ** you may not use this file except in compliance with the License. 10 | ** You may obtain a copy of the License at 11 | ** http://www.apache.org/licenses/LICENSE-2.0 12 | ** Unless required by applicable law or agreed to in writing, software 13 | ** distributed under the License is distributed on an "AS IS" BASIS, 14 | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | ** See the License for the specific language governing permissions and 16 | ** limitations under the License. 
17 | * 18 | **********************************************************************/ 19 | 20 | #ifndef FILEERR_H 21 | #define FILEERR_H 22 | 23 | #include "errcode.h" 24 | 25 | const ERRCODE CANTOPENFILE = "Can't open file"; 26 | const ERRCODE CANTCREATEFILE = "Can't create file"; 27 | const ERRCODE CANTMAKEPIPE = "Can't create pipe"; 28 | const ERRCODE CANTCONNECTPIPE = "Can't reconnect pipes to stdin/stdout"; 29 | const ERRCODE READFAILED = "Read of file failed"; 30 | const ERRCODE WRITEFAILED = "Write of file failed"; 31 | const ERRCODE SELECTFAILED = "Select failed"; 32 | 33 | const ERRCODE EXECFAILED = "Could not exec new process"; 34 | #endif 35 | -------------------------------------------------------------------------------- /src/ccstruct/Makefile.am: -------------------------------------------------------------------------------- 1 | AM_CPPFLAGS += \ 2 | -I$(top_srcdir)/src/ccutil \ 3 | -I$(top_srcdir)/src/cutil \ 4 | -I$(top_srcdir)/src/viewer \ 5 | -I$(top_srcdir)/src/opencl 6 | AM_CPPFLAGS += $(OPENCL_CPPFLAGS) 7 | 8 | if VISIBILITY 9 | AM_CPPFLAGS += -DTESS_EXPORTS \ 10 | -fvisibility=hidden -fvisibility-inlines-hidden 11 | endif 12 | 13 | pkginclude_HEADERS = publictypes.h 14 | noinst_HEADERS = \ 15 | blamer.h blobbox.h blobs.h blread.h boxread.h boxword.h \ 16 | ccstruct.h coutln.h crakedge.h \ 17 | debugpixa.h detlinefit.h dppoint.h fontinfo.h \ 18 | imagedata.h \ 19 | linlsq.h matrix.h mod128.h normalis.h \ 20 | ocrblock.h ocrpara.h ocrrow.h otsuthr.h \ 21 | pageres.h params_training_featdef.h \ 22 | pdblock.h points.h polyaprx.h polyblk.h \ 23 | quadlsq.h quadratc.h quspline.h ratngs.h rect.h rejctmap.h \ 24 | seam.h split.h statistc.h stepblob.h vecfuncs.h werd.h 25 | 26 | noinst_LTLIBRARIES = libtesseract_ccstruct.la 27 | 28 | libtesseract_ccstruct_la_SOURCES = \ 29 | blamer.cpp blobbox.cpp blobs.cpp blread.cpp boxread.cpp boxword.cpp ccstruct.cpp coutln.cpp \ 30 | detlinefit.cpp dppoint.cpp fontinfo.cpp \ 31 | imagedata.cpp \ 32 | linlsq.cpp 
matrix.cpp mod128.cpp normalis.cpp \ 33 | ocrblock.cpp ocrpara.cpp ocrrow.cpp otsuthr.cpp \ 34 | pageres.cpp pdblock.cpp points.cpp polyaprx.cpp polyblk.cpp \ 35 | params_training_featdef.cpp publictypes.cpp \ 36 | quadlsq.cpp quspline.cpp ratngs.cpp rect.cpp rejctmap.cpp \ 37 | seam.cpp split.cpp statistc.cpp stepblob.cpp \ 38 | vecfuncs.cpp werd.cpp 39 | -------------------------------------------------------------------------------- /src/ccstruct/ccstruct.cpp: -------------------------------------------------------------------------------- 1 | /////////////////////////////////////////////////////////////////////// 2 | // File: ccstruct.cpp 3 | // Description: ccstruct class. 4 | // Author: Samuel Charron 5 | // 6 | // (C) Copyright 2006, Google Inc. 7 | // Licensed under the Apache License, Version 2.0 (the "License"); 8 | // you may not use this file except in compliance with the License. 9 | // You may obtain a copy of the License at 10 | // http://www.apache.org/licenses/LICENSE-2.0 11 | // Unless required by applicable law or agreed to in writing, software 12 | // distributed under the License is distributed on an "AS IS" BASIS, 13 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | // See the License for the specific language governing permissions and 15 | // limitations under the License. 16 | // 17 | /////////////////////////////////////////////////////////////////////// 18 | 19 | #include "ccstruct.h" 20 | 21 | namespace tesseract { 22 | 23 | // APPROXIMATIONS of the fractions of the character cell taken by 24 | // the descenders, ascenders, and x-height. 25 | const double CCStruct::kDescenderFraction = 0.25; 26 | const double CCStruct::kXHeightFraction = 0.5; 27 | const double CCStruct::kAscenderFraction = 0.25; 28 | const double CCStruct::kXHeightCapRatio = CCStruct::kXHeightFraction / 29 | (CCStruct::kXHeightFraction + CCStruct::kAscenderFraction); 30 | 31 | // Destructor. 
32 | // It is defined here, so the compiler can create a single vtable 33 | // instead of weak vtables in every compilation unit. 34 | CCStruct::~CCStruct() = default; 35 | 36 | } 37 | -------------------------------------------------------------------------------- /src/ccstruct/blread.h: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | * File: blread.h (Formerly pdread.h) 3 | * Description: Friend function of BLOCK to read the uscan pd file. 4 | * Author: Ray Smith 5 | * Created: Mon Mar 18 14:39:00 GMT 1991 6 | * 7 | * (C) Copyright 1991, Hewlett-Packard Ltd. 8 | ** Licensed under the Apache License, Version 2.0 (the "License"); 9 | ** you may not use this file except in compliance with the License. 10 | ** You may obtain a copy of the License at 11 | ** http://www.apache.org/licenses/LICENSE-2.0 12 | ** Unless required by applicable law or agreed to in writing, software 13 | ** distributed under the License is distributed on an "AS IS" BASIS, 14 | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | ** See the License for the specific language governing permissions and 16 | ** limitations under the License. 
17 | * 18 | **********************************************************************/ 19 | 20 | #ifndef BLREAD_H 21 | #define BLREAD_H 22 | 23 | #include <cstdint> // for int32_t 24 | #include "strngs.h" // for STRING 25 | 26 | class BLOCK_LIST; 27 | 28 | bool read_unlv_file( //print list of sides 29 | STRING name, //basename of file 30 | int32_t xsize, //image size 31 | int32_t ysize, //image size 32 | BLOCK_LIST *blocks //output list 33 | ); 34 | void FullPageBlock(int width, int height, BLOCK_LIST *blocks); 35 | 36 | #endif 37 | -------------------------------------------------------------------------------- /java/com/google/scrollview/events/SVEventType.java: -------------------------------------------------------------------------------- 1 | // Copyright 2007 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); You may not 4 | // use this file except in compliance with the License. You may obtain a copy of 5 | // the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by 6 | // applicable law or agreed to in writing, software distributed under the 7 | // License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 8 | // OF ANY KIND, either express or implied. See the License for the specific 9 | // language governing permissions and limitations under the License. 10 | 11 | package com.google.scrollview.events; 12 | 13 | /** 14 | * These are the defined events which can happen in ScrollView and be 15 | * transferred to the client. They are same events as on the client side part of 16 | * ScrollView (defined in ScrollView.h). 17 | * 18 | * @author wanke@google.com 19 | */ 20 | public enum SVEventType { 21 | SVET_DESTROY, // Window has been destroyed by user. 22 | SVET_EXIT, // User has destroyed the last window by clicking on the 'X' 23 | SVET_CLICK, // Any button pressed that is not a popup trigger. 24 | SVET_SELECTION, // Left button selection. 
25 | SVET_INPUT, // Any kind of input 26 | SVET_MOUSE, // The mouse has moved with a button pressed. 27 | SVET_MOTION, // The mouse has moved with no button pressed. 28 | SVET_HOVER, // The mouse has stayed still for a second. 29 | SVET_POPUP, // A command selected through a popup menu 30 | SVET_MENU; // A command selected through the menubar 31 | } 32 | -------------------------------------------------------------------------------- /java/com/google/scrollview/ui/SVSubMenuItem.java: -------------------------------------------------------------------------------- 1 | // Copyright 2007 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); You may not 4 | // use this file except in compliance with the License. You may obtain a copy of 5 | // the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by 6 | // applicable law or agreed to in writing, software distributed under the 7 | // License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 8 | // OF ANY KIND, either express or implied. See the License for the specific 9 | // language governing permissions and limitations under the License. 10 | 11 | package com.google.scrollview.ui; 12 | 13 | /** 14 | * A MenuListItem is any sort of menu entry. This can either be within a popup 15 | * menu or within a menubar. It can either be a submenu (only name and 16 | * command-id) or a name with an associated value and possibly description. They 17 | * can also have new entries added (if they are submenus). 18 | * 19 | * @author wanke@google.com 20 | */ 21 | 22 | import javax.swing.JMenu; 23 | 24 | /** Constructs a new submenu which can hold other entries. */ 25 | class SVSubMenuItem extends SVAbstractMenuItem { 26 | public SVSubMenuItem(String name, JMenu jli) { 27 | super(-1, name, jli); 28 | } 29 | /** Adds a child entry to the submenu. 
*/ 30 | @Override 31 | public void add(SVAbstractMenuItem mli) { 32 | mi.add(mli.mi); 33 | } 34 | /** Adds a child menu to the submenu (or root node). */ 35 | @Override 36 | public void add(JMenu jli) { 37 | mi.add(jli); 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /src/ccstruct/params_training_featdef.cpp: -------------------------------------------------------------------------------- 1 | /////////////////////////////////////////////////////////////////////// 2 | // File: params_training_featdef.cpp 3 | // Description: Utility functions for params training features. 4 | // Author: David Eger 5 | // Created: Mon Jun 11 11:26:42 PDT 2012 6 | // 7 | // (C) Copyright 2012, Google Inc. 8 | // Licensed under the Apache License, Version 2.0 (the "License"); 9 | // you may not use this file except in compliance with the License. 10 | // You may obtain a copy of the License at 11 | // http://www.apache.org/licenses/LICENSE-2.0 12 | // Unless required by applicable law or agreed to in writing, software 13 | // distributed under the License is distributed on an "AS IS" BASIS, 14 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | // See the License for the specific language governing permissions and 16 | // limitations under the License. 
17 | // 18 | /////////////////////////////////////////////////////////////////////// 19 | 20 | #include <cstring> // for strcmp 21 | 22 | #include "params_training_featdef.h" 23 | 24 | namespace tesseract { 25 | // Returns the index of name in kParamsTrainingFeatureTypeName, or -1 if name is null or not found. 26 | int ParamsTrainingFeatureByName(const char *name) { 27 | if (name == nullptr) 28 | return -1; 29 | int array_size = sizeof(kParamsTrainingFeatureTypeName) / 30 | sizeof(kParamsTrainingFeatureTypeName[0]); 31 | for (int i = 0; i < array_size; i++) { 32 | if (kParamsTrainingFeatureTypeName[i] == nullptr) 33 | continue; 34 | if (strcmp(name, kParamsTrainingFeatureTypeName[i]) == 0) 35 | return i; 36 | } 37 | return -1; 38 | } 39 | 40 | } // namespace tesseract 41 | -------------------------------------------------------------------------------- /src/textord/tordmain.h: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | * File: tordmain.h (Formerly textordp.h) 3 | * Description: C++ top level textord code. 4 | * Author: Ray Smith 5 | * 6 | * (C) Copyright 1992, Hewlett-Packard Ltd. 7 | ** Licensed under the Apache License, Version 2.0 (the "License"); 8 | ** you may not use this file except in compliance with the License. 9 | ** You may obtain a copy of the License at 10 | ** http://www.apache.org/licenses/LICENSE-2.0 11 | ** Unless required by applicable law or agreed to in writing, software 12 | ** distributed under the License is distributed on an "AS IS" BASIS, 13 | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | ** See the License for the specific language governing permissions and 15 | ** limitations under the License. 
16 | * 17 | **********************************************************************/ 18 | 19 | #ifndef TORDMAIN_H 20 | #define TORDMAIN_H 21 | 22 | #include 23 | #include "params.h" 24 | #include "ocrblock.h" 25 | #include "blobs.h" 26 | #include "blobbox.h" 27 | 28 | struct Pix; 29 | namespace tesseract { 30 | class Tesseract; 31 | 32 | void SetBlobStrokeWidth(Pix* pix, BLOBNBOX* blob); 33 | void assign_blobs_to_blocks2(Pix* pix, BLOCK_LIST *blocks, 34 | TO_BLOCK_LIST *port_blocks); 35 | } // namespace tesseract 36 | 37 | void tweak_row_baseline(ROW *row, 38 | double blshift_maxshift, 39 | double blshift_xfraction); 40 | 41 | #endif 42 | -------------------------------------------------------------------------------- /cmake/SourceGroups.cmake: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # http://www.apache.org/licenses/LICENSE-2.0 5 | # Unless required by applicable law or agreed to in writing, software 6 | # distributed under the License is distributed on an "AS IS" BASIS, 7 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 8 | # See the License for the specific language governing permissions and 9 | # limitations under the License. 
10 | #include(SourceGroups) 11 | 12 | set(SSRC ${CMAKE_SOURCE_DIR}) 13 | set(BSRC ${CMAKE_BINARY_DIR}) 14 | 15 | set(_CPP ".*\\.cpp") 16 | set(CPP "${_CPP}$") 17 | 18 | set(_H ".*\\.h") 19 | set(H "${_H}$") 20 | 21 | set(H_CPP "(${H}|${CPP})") 22 | 23 | source_group("Resource files" ".*\\.(rc|ico)") 24 | 25 | source_group("api" "${SSRC}/api/${H_CPP}") 26 | source_group("arch" "${SSRC}/arch/${H_CPP}") 27 | source_group("ccmain" "${SSRC}/ccmain/${H_CPP}") 28 | source_group("ccstruct" "${SSRC}/ccstruct/${H_CPP}") 29 | source_group("ccutil" "${SSRC}/ccutil/${H_CPP}") 30 | source_group("classify" "${SSRC}/classify/${H_CPP}") 31 | source_group("cutil" "${SSRC}/cutil/${H_CPP}") 32 | source_group("dict" "${SSRC}/dict/${H_CPP}") 33 | source_group("lstm" "${SSRC}/lstm/${H_CPP}") 34 | source_group("opencl" "${SSRC}/opencl/${H_CPP}") 35 | source_group("textord" "${SSRC}/textord/${H_CPP}") 36 | source_group("viewer" "${SSRC}/viewer/${H_CPP}") 37 | source_group("wordrec" "${SSRC}/wordrec/${H_CPP}") 38 | -------------------------------------------------------------------------------- /src/wordrec/chop.h: -------------------------------------------------------------------------------- 1 | /* -*-C-*- 2 | ******************************************************************************** 3 | * 4 | * File: chop.h 5 | * Author: Mark Seaman, SW Productivity 6 | * 7 | * (c) Copyright 1987, Hewlett-Packard Company. 8 | ** Licensed under the Apache License, Version 2.0 (the "License"); 9 | ** you may not use this file except in compliance with the License. 10 | ** You may obtain a copy of the License at 11 | ** http://www.apache.org/licenses/LICENSE-2.0 12 | ** Unless required by applicable law or agreed to in writing, software 13 | ** distributed under the License is distributed on an "AS IS" BASIS, 14 | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
15 | ** See the License for the specific language governing permissions and 16 | ** limitations under the License. 17 | * 18 | *******************************************************************************/ 19 | 20 | #ifndef CHOP_H 21 | #define CHOP_H 22 | 23 | /*---------------------------------------------------------------------- 24 | I n c l u d e s 25 | ----------------------------------------------------------------------*/ 26 | #include "genericheap.h" 27 | #include "kdpair.h" 28 | #include "seam.h" 29 | 30 | /*---------------------------------------------------------------------- 31 | T y p e s 32 | ---------------------------------------------------------------------*/ 33 | #define MAX_NUM_POINTS 50 34 | // The PointPair elements do NOT own the EDGEPTs. 35 | using PointPair = tesseract::KDPairInc<float, EDGEPT*>; 36 | using PointHeap = tesseract::GenericHeap<PointPair>; 37 | 38 | #endif 39 | -------------------------------------------------------------------------------- /src/wordrec/findseam.h: -------------------------------------------------------------------------------- 1 | /* -*-C-*- 2 | ******************************************************************************** 3 | * 4 | * File: findseam.h 5 | * Description: 6 | * Author: Mark Seaman, SW Productivity 7 | * 8 | * (c) Copyright 1987, Hewlett-Packard Company. 9 | ** Licensed under the Apache License, Version 2.0 (the "License"); 10 | ** you may not use this file except in compliance with the License. 11 | ** You may obtain a copy of the License at 12 | ** http://www.apache.org/licenses/LICENSE-2.0 13 | ** Unless required by applicable law or agreed to in writing, software 14 | ** distributed under the License is distributed on an "AS IS" BASIS, 15 | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | ** See the License for the specific language governing permissions and 17 | ** limitations under the License. 
18 | * 19 | *********************************************************************************/ 20 | 21 | #ifndef FINDSEAM_H 22 | #define FINDSEAM_H 23 | 24 | /*---------------------------------------------------------------------- 25 | I n c l u d e s 26 | ----------------------------------------------------------------------*/ 27 | #include "seam.h" 28 | #include "genericheap.h" 29 | #include "kdpair.h" 30 | #include "chop.h" 31 | 32 | // The SeamPair elements own their SEAMs and delete them upon destruction. 33 | using SeamPair = tesseract::KDPtrPairInc<float, SEAM>; 34 | using SeamQueue = tesseract::GenericHeap<SeamPair>; 35 | 36 | using SeamDecPair = tesseract::KDPtrPairDec<float, SEAM>; 37 | using SeamPile = tesseract::GenericHeap<SeamDecPair>; 38 | 39 | #endif 40 | -------------------------------------------------------------------------------- /doc/set_unicharset_properties.1.asc: -------------------------------------------------------------------------------- 1 | SET_UNICHARSET_PROPERTIES(1) 2 | ============================ 3 | :doctype: manpage 4 | 5 | NAME 6 | ---- 7 | set_unicharset_properties - set properties about the unichars 8 | 9 | SYNOPSIS 10 | -------- 11 | *set_unicharset_properties* --U 'input_unicharsetfile' --script_dir '/path/to/langdata' --O 'output_unicharsetfile' 12 | 13 | DESCRIPTION 14 | ----------- 15 | set_unicharset_properties(1) reads a unicharset file, puts the result in a UNICHARSET object, fills it with properties about the unichars it contains and writes the result back to another unicharset file. 16 | 17 | OPTIONS 18 | ------- 19 | 20 | '--script_dir /path/to/langdata':: 21 | (Input) Specify the location of directory for universal script unicharsets and font xheights (type:string default:) 22 | 23 | '--U unicharsetfile':: 24 | (Input) Specify the location of the unicharset to load as input. 25 | 26 | '--O unicharsetfile':: 27 | (Output) Specify the location of the unicharset to be written with updated properties. 
28 | 29 | HISTORY 30 | ------- 31 | set_unicharset_properties(1) was first made available for tesseract version 3.03. 32 | 33 | RESOURCES 34 | --------- 35 | Main web site: <https://github.com/tesseract-ocr/tesseract> + 36 | Information on training: <https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract> 37 | 38 | SEE ALSO 39 | -------- 40 | tesseract(1) 41 | 42 | COPYING 43 | ------- 44 | Copyright \(C) 2012 Google, Inc. 45 | Licensed under the Apache License, Version 2.0 46 | 47 | AUTHOR 48 | ------ 49 | The Tesseract OCR engine was written by Ray Smith and his research groups 50 | at Hewlett Packard (1985-1995) and Google (2006-present). 51 | -------------------------------------------------------------------------------- /src/textord/drawedg.h: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | * File: drawedg.h (Formerly drawedge.h) 3 | * Description: Collection of functions to draw things to do with edge 4 | *detection. 5 | * Author: Ray Smith 6 | * Created: Thu Jun 06 13:29:20 BST 1991 7 | * 8 | * (C) Copyright 1991, Hewlett-Packard Ltd. 9 | ** Licensed under the Apache License, Version 2.0 (the "License"); 10 | ** you may not use this file except in compliance with the License. 11 | ** You may obtain a copy of the License at 12 | ** http://www.apache.org/licenses/LICENSE-2.0 13 | ** Unless required by applicable law or agreed to in writing, software 14 | ** distributed under the License is distributed on an "AS IS" BASIS, 15 | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | ** See the License for the specific language governing permissions and 17 | ** limitations under the License. 
18 | * 19 | **********************************************************************/ 20 | 21 | #ifndef DRAWEDG_H 22 | #define DRAWEDG_H 23 | #ifndef GRAPHICS_DISABLED 24 | 25 | #include "scrollview.h" 26 | #include "crakedge.h" 27 | 28 | ScrollView* create_edges_window( //make window 29 | ICOORD page_tr //size of image 30 | ); 31 | void draw_raw_edge( //draw the cracks 32 | ScrollView* fd, //window to draw in 33 | CRACKEDGE *start, //start of loop 34 | ScrollView::Color colour //colour to draw in 35 | ); 36 | #endif 37 | #endif 38 | -------------------------------------------------------------------------------- /src/lstm/functions.cpp: -------------------------------------------------------------------------------- 1 | /////////////////////////////////////////////////////////////////////// 2 | // File: functions.cpp 3 | // Description: Static initialize-on-first-use non-linearity functions. 4 | // Author: Ray Smith 5 | // 6 | // (C) Copyright 2014, Google Inc. 7 | // Licensed under the Apache License, Version 2.0 (the "License"); 8 | // you may not use this file except in compliance with the License. 9 | // You may obtain a copy of the License at 10 | // http://www.apache.org/licenses/LICENSE-2.0 11 | // Unless required by applicable law or agreed to in writing, software 12 | // distributed under the License is distributed on an "AS IS" BASIS, 13 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | // See the License for the specific language governing permissions and 15 | // limitations under the License. 
16 | /////////////////////////////////////////////////////////////////////// 17 | 18 | #include <cmath> // for exp, tanh 19 | #include "functions.h" 20 | 21 | namespace tesseract { 22 | 23 | #if __cplusplus < 201402 || defined(__clang__) // C++11 24 | 25 | double TanhTable[kTableSize]; 26 | double LogisticTable[kTableSize]; 27 | 28 | class TableInit { 29 | TableInit() { 30 | for (int i = 0; i < kTableSize; i++) { 31 | TanhTable[i] = tanh(i / kScaleFactor); 32 | LogisticTable[i] = 1 / (1 + exp(-i / kScaleFactor)); 33 | } 34 | } 35 | static TableInit tableInit; 36 | }; 37 | 38 | TableInit TableInit::tableInit; 39 | 40 | #else // C++14 or newer 41 | 42 | constexpr LUTTempl<kTableSize, LUTFuncTanh> TanhTable; 43 | constexpr LUTTempl<kTableSize, LUTFuncLog> LogisticTable; 44 | 45 | #endif 46 | 47 | } // namespace tesseract. 48 | -------------------------------------------------------------------------------- /src/ccutil/unicodes.h: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | * File: unicodes.h 3 | * Description: Unicode related machinery 4 | * Author: David Eger 5 | * Created: Wed Jun 15 16:37:50 PST 2011 6 | * 7 | * (C) Copyright 2011, Google, Inc. 8 | ** Licensed under the Apache License, Version 2.0 (the "License"); 9 | ** you may not use this file except in compliance with the License. 10 | ** You may obtain a copy of the License at 11 | ** http://www.apache.org/licenses/LICENSE-2.0 12 | ** Unless required by applicable law or agreed to in writing, software 13 | ** distributed under the License is distributed on an "AS IS" BASIS, 14 | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | ** See the License for the specific language governing permissions and 16 | ** limitations under the License. 
17 | * 18 | **********************************************************************/ 19 | 20 | #ifndef TESSERACT_CCUTIL_UNICODES_H_ 21 | #define TESSERACT_CCUTIL_UNICODES_H_ 22 | 23 | namespace tesseract { 24 | 25 | extern const char *kUTF8LineSeparator; 26 | extern const char *kUTF8ParagraphSeparator; 27 | extern const char *kLRM; ///< Left-to-Right Mark 28 | extern const char *kRLM; ///< Right-to-Left Mark 29 | extern const char *kRLE; ///< Right-to-Left Embedding 30 | extern const char *kPDF; ///< Pop Directional Formatting 31 | 32 | /// The following are confusable internal word punctuation symbols 33 | /// which we normalize to the first variant when matching in dawgs. 34 | extern const char *kHyphenLikeUTF8[]; 35 | extern const char *kApostropheLikeUTF8[]; 36 | 37 | } // namespace 38 | 39 | #endif // TESSERACT_CCUTIL_UNICODES_H_ 40 | -------------------------------------------------------------------------------- /src/arch/Makefile.am: -------------------------------------------------------------------------------- 1 | AM_CPPFLAGS += -I$(top_srcdir)/src/ccstruct -I$(top_srcdir)/src/ccutil -I$(top_srcdir)/src/viewer 2 | 3 | AM_CXXFLAGS = 4 | 5 | if VISIBILITY 6 | AM_CXXFLAGS += -fvisibility=hidden -fvisibility-inlines-hidden 7 | AM_CPPFLAGS += -DTESS_EXPORTS 8 | endif 9 | 10 | pkginclude_HEADERS = 11 | 12 | noinst_HEADERS = dotproduct.h dotproductavx.h dotproductsse.h 13 | noinst_HEADERS += intsimdmatrix.h 14 | noinst_HEADERS += simddetect.h 15 | 16 | noinst_LTLIBRARIES = libtesseract_native.la 17 | if AVX_OPT 18 | noinst_LTLIBRARIES += libtesseract_avx.la 19 | endif 20 | if AVX2_OPT 21 | noinst_LTLIBRARIES += libtesseract_avx2.la 22 | endif 23 | if SSE41_OPT 24 | noinst_LTLIBRARIES += libtesseract_sse.la 25 | endif 26 | noinst_LTLIBRARIES += libtesseract_arch.la 27 | 28 | libtesseract_arch_la_CPPFLAGS = $(AM_CPPFLAGS) 29 | if AVX_OPT 30 | libtesseract_arch_la_CPPFLAGS += -DAVX 31 | libtesseract_avx_la_CXXFLAGS = -mavx 32 | endif 33 | if AVX2_OPT 34 | 
libtesseract_arch_la_CPPFLAGS += -DAVX2 35 | libtesseract_avx2_la_CXXFLAGS = -mavx2 36 | endif 37 | if SSE41_OPT 38 | libtesseract_arch_la_CPPFLAGS += -DSSE4_1 39 | libtesseract_sse_la_CXXFLAGS = -msse4.1 40 | endif 41 | 42 | libtesseract_native_la_CXXFLAGS = -O3 -ffast-math 43 | if MARCH_NATIVE_OPT 44 | libtesseract_native_la_CXXFLAGS += -march=native -mtune=native 45 | endif 46 | libtesseract_native_la_SOURCES = dotproduct.cpp 47 | 48 | libtesseract_arch_la_SOURCES = intsimdmatrix.cpp simddetect.cpp 49 | 50 | if AVX_OPT 51 | libtesseract_avx_la_SOURCES = dotproductavx.cpp 52 | endif 53 | 54 | if AVX2_OPT 55 | libtesseract_avx2_la_SOURCES = intsimdmatrixavx2.cpp 56 | endif 57 | 58 | if SSE41_OPT 59 | libtesseract_sse_la_SOURCES = dotproductsse.cpp intsimdmatrixsse.cpp 60 | endif 61 | -------------------------------------------------------------------------------- /src/ccutil/host.h: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | ** Filename: host.h 3 | ** Purpose: This is the system independent typedefs and defines 4 | ** Author: MN, JG, MD 5 | ** 6 | ** (c) Copyright Hewlett-Packard Company, 1988-1996. 7 | ** Licensed under the Apache License, Version 2.0 (the "License"); 8 | ** you may not use this file except in compliance with the License. 9 | ** You may obtain a copy of the License at 10 | ** http://www.apache.org/licenses/LICENSE-2.0 11 | ** Unless required by applicable law or agreed to in writing, software 12 | ** distributed under the License is distributed on an "AS IS" BASIS, 13 | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | ** See the License for the specific language governing permissions and 15 | ** limitations under the License. 
16 | */ 17 | 18 | #ifndef TESSERACT_CCUTIL_HOST_H_ 19 | #define TESSERACT_CCUTIL_HOST_H_ 20 | 21 | #include 22 | #include "platform.h" 23 | /* _WIN32 */ 24 | #ifdef _WIN32 25 | #include <windows.h> 26 | #undef min 27 | #undef max 28 | #endif 29 | 30 | #include <cinttypes> // PRId32, ... 31 | #include <cstdint> // int32_t, ... 32 | 33 | #if defined(_WIN32) 34 | 35 | /* MinGW defines the standard PRI... macros, but MSVS doesn't. */ 36 | 37 | #if !defined(PRId32) 38 | #define PRId32 "d" 39 | #endif 40 | 41 | #if !defined(PRId64) 42 | #define PRId64 "I64d" 43 | #endif 44 | 45 | #endif /* _WIN32 */ 46 | 47 | // Return true if x is within tolerance of y 48 | template<class T> bool NearlyEqual(T x, T y, T tolerance) { 49 | T diff = x - y; 50 | return diff <= tolerance && -diff <= tolerance; 51 | } 52 | 53 | #endif // TESSERACT_CCUTIL_HOST_H_ 54 | -------------------------------------------------------------------------------- /src/ccstruct/debugpixa.h: -------------------------------------------------------------------------------- 1 | #ifndef TESSERACT_CCSTRUCT_DEBUGPIXA_H_ 2 | #define TESSERACT_CCSTRUCT_DEBUGPIXA_H_ 3 | 4 | #include "allheaders.h" 5 | 6 | namespace tesseract { 7 | 8 | // Class to hold a Pixa collection of debug images with captions and save them 9 | // to a PDF file. 10 | class DebugPixa { 11 | public: 12 | // TODO(rays) add another constructor with size control. 13 | DebugPixa() { 14 | pixa_ = pixaCreate(0); 15 | fonts_ = bmfCreate(nullptr, 14); 16 | } 17 | // Destroys the image collection and the caption font. Nothing is written 18 | // here: call WritePDF() beforehand to save the collected images. 19 | ~DebugPixa() { 20 | pixaDestroy(&pixa_); 21 | bmfDestroy(&fonts_); 22 | } 23 | 24 | // Adds the given pix to the set of pages for the PDF file, with the given 25 | // caption added using L_ADD_BELOW (i.e. below the image, not on top). 26 | void AddPix(const Pix* pix, const char* caption) { 27 | int depth = pixGetDepth(const_cast<Pix*>(pix)); 28 | int color = depth < 8 ? 1 : (depth > 8 ? 
0x00ff0000 : 0x80); 29 | Pix* pix_debug = pixAddSingleTextblock( 30 | const_cast<Pix*>(pix), fonts_, caption, color, L_ADD_BELOW, nullptr); 31 | pixaAddPix(pixa_, pix_debug, L_INSERT); 32 | } 33 | 34 | // If any images have been added, writes them all to a PDF at filename 35 | // immediately and then clears the collection; otherwise does nothing. 36 | void WritePDF(const char* filename) { 37 | if (pixaGetCount(pixa_) > 0) { 38 | pixaConvertToPdf(pixa_, 300, 1.0f, 0, 0, "AllDebugImages", filename); 39 | pixaClear(pixa_); 40 | } 41 | } 42 | 43 | private: 44 | // The collection of images to put in the PDF. 45 | Pixa* pixa_; 46 | // The fonts used to draw text captions. 47 | L_Bmf* fonts_; 48 | }; 49 | 50 | } // namespace tesseract 51 | 52 | #endif // TESSERACT_CCSTRUCT_DEBUGPIXA_H_ 53 | -------------------------------------------------------------------------------- /src/ccstruct/ccstruct.h: -------------------------------------------------------------------------------- 1 | /////////////////////////////////////////////////////////////////////// 2 | // File: ccstruct.h 3 | // Description: ccstruct class. 4 | // Author: Samuel Charron 5 | // 6 | // (C) Copyright 2006, Google Inc. 7 | // Licensed under the Apache License, Version 2.0 (the "License"); 8 | // you may not use this file except in compliance with the License. 9 | // You may obtain a copy of the License at 10 | // http://www.apache.org/licenses/LICENSE-2.0 11 | // Unless required by applicable law or agreed to in writing, software 12 | // distributed under the License is distributed on an "AS IS" BASIS, 13 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | // See the License for the specific language governing permissions and 15 | // limitations under the License. 
namespace tesseract {
// CCStruct derives from CUtil and adds a small set of class-wide constants
// describing the vertical proportions of a character cell.
class CCStruct : public CUtil {
 public:
  CCStruct() = default;
  // Declared here, defined out of line; overrides the base class destructor.
  ~CCStruct() override;

  // Globally accessible constants.
  // APPROXIMATIONS of the fractions of the character cell taken by
  // the descenders, ascenders, and x-height.
  // Only the declarations live in this header; the values in the trailing
  // comments document the intended definitions, which are not visible here.
  static const double kDescenderFraction;  // = 0.25;
  static const double kXHeightFraction;    // = 0.5;
  static const double kAscenderFraction;   // = 0.25;
  // Derived value giving the x-height as a fraction of cap-height.
  static const double kXHeightCapRatio;    // = XHeight/(XHeight + Ascender).
};
}  // namespace tesseract
/**----------------------------------------------------------------------------
          Include Files and Type Defines
----------------------------------------------------------------------------**/
#include "ocrfeatures.h"
#include "blobs.h"

// Names for the slots of a MicroFeature array. The enumerators take the
// values 0..5 in declaration order, so MFCount (the last one) equals the
// number of real parameters.
typedef enum {
  MFXPosition, MFYPosition,
  MFLength, MFDirection, MFBulge1, MFBulge2,
  MFCount  // For array sizes.
} MF_PARAM_NAME;

// A micro-feature is a fixed-size array of MFCount floats, indexed by
// MF_PARAM_NAME.
typedef float MicroFeature[MFCount];
/*----------------------------------------------------------------------------
          Private Function Prototypes
-----------------------------------------------------------------------------*/
// Declared here, defined elsewhere: takes a blob and a denormalization and
// returns a FEATURE_SET. Presumably the set holds the blob's micro-features;
// confirm against the definition.
FEATURE_SET ExtractMicros(TBLOB* Blob, const DENORM& cn_denorm);
/**----------------------------------------------------------------------------
          Include Files and Type Defines
----------------------------------------------------------------------------**/
#include "matchdefs.h"
#include "ocrfeatures.h"
#include "params.h"

/**----------------------------------------------------------------------------
          Variables
----------------------------------------------------------------------------**/

/* control knobs used to control the normalization adjustment process */
// Both knobs are declared through the double_VAR_H macro. The macro is not
// defined in this header (presumably it comes from params.h); its arguments
// appear to be the parameter name, its default value and a help string.
// TODO confirm against the macro definition.
extern double_VAR_H(classify_norm_adj_midpoint, 32.0,
                    "Norm adjust midpoint ...");
extern double_VAR_H(classify_norm_adj_curl, 2.0, "Norm adjust curl ...");
// GAPMAP is built from a TO_BLOCK and answers whether a horizontal gap
// (left..right) should be treated as a table gap. Only the interface and the
// data members are visible here; the constructor and table_gap() are defined
// elsewhere.
//
// NOTE(review): the class owns a raw array (`map`, released with delete[] in
// the destructor) but declares no copy constructor or copy assignment, so
// copying a GAPMAP would free the array twice. Verify no caller copies it.
class GAPMAP
{
  public:
    GAPMAP(                       //constructor
           TO_BLOCK *block);

    ~GAPMAP () {                  //destructor
      delete[] map;               // delete[] on a null pointer is a no-op
    }

    bool table_gap(               //Is gap a table?
                   int16_t left,      //From here
                   int16_t right);    //To here

  private:
    int16_t total_rows;           //in block
    int16_t min_left;             //Left extreme
    int16_t max_right;            //Right extreme
    int16_t bucket_size;          // half an x ht
    int16_t *map;                 //empty counts
    // The original comment reads "map[0..max_map] defined"; no member is
    // called max_map, so it presumably refers to this field (map_max).
    int16_t map_max;              //map[0..max_map] defined
    bool any_tabs;                // undocumented in the original; set and read elsewhere
};
/**----------------------------------------------------------------------------
          Include Files and Type Defines
----------------------------------------------------------------------------**/
#include "matchdefs.h"

// An array of MAX_NUM_CLASSES 16-bit unsigned cutoffs. MAX_NUM_CLASSES is not
// defined in this header (presumably it comes from matchdefs.h). The file
// name suggests one cutoff per class id; confirm with the code that fills it.
typedef uint16_t CLASS_CUTOFF_ARRAY[MAX_NUM_CLASSES];

/**----------------------------------------------------------------------------
          Public Function Prototypes
----------------------------------------------------------------------------**/

// Everything below is commented out: the old K&R-compatibility _ARGS helper
// and the ReadNewCutoffs prototype it wrapped. No function is declared here.
/*
#if defined(__STDC__) || defined(__cplusplus)
# define _ARGS(s) s
#else
# define _ARGS(s) ()
#endif*/

/* cutoffs.c
void ReadNewCutoffs
  _ARGS((char *Filename,
  CLASS_CUTOFF_ARRAY Cutoffs));
#undef _ARGS
*/
15 | // See the License for the specific language governing permissions and 16 | // limitations under the License. 17 | // 18 | /////////////////////////////////////////////////////////////////////// 19 | 20 | 21 | #ifndef TESSERACT_TEXTORD_BLOBGRID_H_ 22 | #define TESSERACT_TEXTORD_BLOBGRID_H_ 23 | 24 | #include "bbgrid.h" 25 | #include "blobbox.h" 26 | 27 | CLISTIZEH(BLOBNBOX) 28 | 29 | namespace tesseract { 30 | 31 | using BlobGridSearch = GridSearch; 32 | 33 | class BlobGrid : public BBGrid { 34 | public: 35 | BlobGrid(int gridsize, const ICOORD& bleft, const ICOORD& tright); 36 | ~BlobGrid() override; 37 | 38 | // Inserts all the blobs from the given list, with x and y spreading, 39 | // without removing from the source list, so ownership remains with the 40 | // source list. 41 | void InsertBlobList(BLOBNBOX_LIST* blobs); 42 | }; 43 | 44 | } // namespace tesseract. 45 | 46 | #endif // TESSERACT_TEXTORD_BLOBGRID_H_ 47 | -------------------------------------------------------------------------------- /src/ccmain/pagewalk.cpp: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | * File: pagewalk.cpp (Formerly walkers.c) 3 | * Description: Block list processors 4 | * Author: Phil Cheatle 5 | * Created: Thu Oct 10 16:25:24 BST 1991 6 | * 7 | * (C) Copyright 1991, Hewlett-Packard Ltd. 8 | ** Licensed under the Apache License, Version 2.0 (the "License"); 9 | ** you may not use this file except in compliance with the License. 10 | ** You may obtain a copy of the License at 11 | ** http://www.apache.org/licenses/LICENSE-2.0 12 | ** Unless required by applicable law or agreed to in writing, software 13 | ** distributed under the License is distributed on an "AS IS" BASIS, 14 | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
15 | ** See the License for the specific language governing permissions and 16 | ** limitations under the License. 17 | * 18 | **********************************************************************/ 19 | 20 | #include "pageres.h" 21 | #include "tesseractclass.h" 22 | 23 | namespace tesseract { 24 | /** 25 | * @name process_selected_words() 26 | * 27 | * Walk the current block list applying the specified word processor function 28 | * to each word that overlaps the selection_box. 29 | */ 30 | void Tesseract::process_selected_words( 31 | PAGE_RES* page_res, // blocks to check 32 | TBOX& selection_box, 33 | bool (tesseract::Tesseract::* word_processor)(PAGE_RES_IT* pr_it)) { 34 | for (PAGE_RES_IT page_res_it(page_res); page_res_it.word() != nullptr; 35 | page_res_it.forward()) { 36 | WERD* word = page_res_it.word()->word; 37 | if (word->bounding_box().overlap(selection_box)) { 38 | if (!(this->*word_processor)(&page_res_it)) 39 | return; 40 | } 41 | } 42 | } 43 | } // namespace tesseract 44 | -------------------------------------------------------------------------------- /doc/wordlist2dawg.1.asc: -------------------------------------------------------------------------------- 1 | WORDLIST2DAWG(1) 2 | ================ 3 | :doctype: manpage 4 | 5 | NAME 6 | ---- 7 | wordlist2dawg - convert a wordlist to a DAWG for Tesseract 8 | 9 | SYNOPSIS 10 | -------- 11 | *wordlist2dawg* 'WORDLIST' 'DAWG' 'lang.unicharset' 12 | 13 | *wordlist2dawg* -t 'WORDLIST' 'DAWG' 'lang.unicharset' 14 | 15 | *wordlist2dawg* -r 1 'WORDLIST' 'DAWG' 'lang.unicharset' 16 | 17 | *wordlist2dawg* -r 2 'WORDLIST' 'DAWG' 'lang.unicharset' 18 | 19 | *wordlist2dawg* -l 'WORDLIST' 'DAWG' 'lang.unicharset' 20 | 21 | DESCRIPTION 22 | ----------- 23 | wordlist2dawg(1) converts a wordlist to a Directed Acyclic Word Graph 24 | (DAWG) for use with Tesseract. A DAWG is a compressed, space and time 25 | efficient representation of a word list. 
26 | 27 | OPTIONS 28 | ------- 29 | -t 30 | Verify that a given dawg file is equivalent to a given wordlist. 31 | 32 | -r 1 33 | Reverse a word if it contains an RTL character. 34 | 35 | -r 2 36 | Reverse all words. 37 | 38 | -l <short> <long> 39 | Produce a file with several dawgs in it, one each for words 40 | of length <short>, <short+1>,... <long> 41 | 42 | ARGUMENTS 43 | --------- 44 | 45 | 'WORDLIST' 46 | A plain text file in UTF-8, one word per line. 47 | 48 | 'DAWG' 49 | The output DAWG to write. 50 | 51 | 'lang.unicharset' 52 | The unicharset of the language. This is the unicharset 53 | generated by mftraining(1). 54 | 55 | SEE ALSO 56 | -------- 57 | tesseract(1), combine_tessdata(1), dawg2wordlist(1) 58 | 59 | 60 | 61 | COPYING 62 | ------- 63 | Copyright \(C) 2006 Google, Inc. 64 | Licensed under the Apache License, Version 2.0 65 | 66 | AUTHOR 67 | ------ 68 | The Tesseract OCR engine was written by Ray Smith and his research groups 69 | at Hewlett Packard (1985-1995) and Google (2006-present). 70 | -------------------------------------------------------------------------------- /unittest/log.h: -------------------------------------------------------------------------------- 1 | /////////////////////////////////////////////////////////////////////// 2 | // File: log.h 3 | // Description: Include for custom log message for unittest for tesseract. 4 | // based on 5 | // https://stackoverflow.com/questions/16491675/how-to-send-custom-message-in-google-c-testing-framework 6 | // 7 | // Licensed under the Apache License, Version 2.0 (the "License"); 8 | // you may not use this file except in compliance with the License. 9 | // You may obtain a copy of the License at 10 | // http://www.apache.org/licenses/LICENSE-2.0 11 | // Unless required by applicable law or agreed to in writing, software 12 | // distributed under the License is distributed on an "AS IS" BASIS, 13 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
// Severity levels understood by LOG().
enum LogLevel {
  INFO, ERROR
};

// Writes the tag for the given level ("[INFO] " or "[ERROR] ") to std::cout
// and returns std::cout so the caller can stream the message after it.
// Any other value writes no tag and still returns std::cout.
static inline std::ostream& LOG(enum LogLevel level)
{
  const char* tag = nullptr;
  if (level == INFO) {
    tag = "[INFO] ";
  } else if (level == ERROR) {
    tag = "[ERROR] ";
  }
  if (tag != nullptr) {
    std::cout << tag;
  }
  return std::cout;
}

// https://github.com/google/ion/blob/master/ion/base/logging.h
// Returns std::cout when condition is true; otherwise returns a shared
// stream constructed without a buffer, so anything streamed into it is
// discarded.
static inline std::ostream& QCHECK(bool condition)
{
  static std::ostream discard(nullptr);
  return condition ? std::cout : discard;
}
// Variant of tprintf guarded by the numeric logging level parameter
// FLAGS_tlog_level (default 0). Code using ParseCommandLineFlags() can control
// its value using the --tlog_level commandline argument. Otherwise it must be
// specified in a config file like other params.
//
// tlog(level, fmt, ...) forwards fmt and the remaining arguments to tprintf
// when FLAGS_tlog_level >= level, and does nothing otherwise. `level` is
// evaluated exactly once.
//
// The macro argument is parenthesised so that an expression such as
// `verbose ? 1 : 2` is compared as a whole; without the parentheses the
// comparison would bind tighter than the conditional operator.
//
// NOTE(review): the expansion is a plain brace block rather than
// do { } while (0), so `if (c) tlog(...); else ...` does not compile. The
// shape is kept unchanged because the call sites are not visible from here.
#define tlog(level, ...) {                                    \
  if (FLAGS_tlog_level >= (level)) {                          \
    tprintf(__VA_ARGS__);                                     \
  }                                                           \
}

// True when messages of the given level would be printed by tlog().
#define TLOG_IS_ON(level) (FLAGS_tlog_level >= (level))
12 | 13 | #ifndef TESSERACT_UNITTEST_CYCLETIMER_H 14 | #define TESSERACT_UNITTEST_CYCLETIMER_H 15 | 16 | #include "absl/time/clock.h" // for GetCurrentTimeNanos 17 | 18 | // See https://github.com/google/or-tools/blob/master/ortools/base/timer.h 19 | class CycleTimer { 20 | public: 21 | CycleTimer() { 22 | Reset(); 23 | } 24 | 25 | void Reset() { 26 | running_ = false; 27 | sum_ = 0; 28 | start_ = 0; 29 | } 30 | 31 | // When Start() is called multiple times, only the most recent is used. 32 | void Start() { 33 | running_ = true; 34 | start_ = absl::GetCurrentTimeNanos(); 35 | } 36 | 37 | void Restart() { 38 | sum_ = 0; 39 | Start(); 40 | } 41 | 42 | void Stop() { 43 | if (running_) { 44 | sum_ += absl::GetCurrentTimeNanos() - start_; 45 | running_ = false; 46 | } 47 | } 48 | int64_t GetInMs() const { return GetNanos() / 1000000; } 49 | 50 | protected: 51 | int64_t GetNanos() const { 52 | return running_ ? absl::GetCurrentTimeNanos() - start_ + sum_ : sum_; 53 | } 54 | 55 | private: 56 | bool running_; 57 | int64_t start_; 58 | int64_t sum_; 59 | }; 60 | 61 | #endif // TESSERACT_UNITTEST_CYCLETIMER_H 62 | -------------------------------------------------------------------------------- /doc/classifier_tester.1.asc: -------------------------------------------------------------------------------- 1 | CLASSIFIER_TESTER(1) 2 | ==================== 3 | 4 | NAME 5 | ---- 6 | classifier_tester - for *legacy tesseract* engine. 7 | 8 | SYNOPSIS 9 | -------- 10 | *classifier_tester* -U 'unicharset_file' -F 'font_properties_file' -X 'xheights_file' -classifier 'x' -lang 'lang' [-output_trainer trainer] *.tr 11 | 12 | DESCRIPTION 13 | ----------- 14 | classifier_tester(1) runs Tesseract in a special mode. 15 | It takes a list of .tr files and tests a character classifier 16 | on data as formatted for training, 17 | but it doesn't have to be the same as the training data. 
18 | 19 | IN/OUT ARGUMENTS 20 | ---------------- 21 | 22 | a list of .tr files 23 | 24 | OPTIONS 25 | ------- 26 | -l 'lang':: 27 | (Input) three character language code; default value 'eng'. 28 | 29 | -classifier 'x':: 30 | (Input) One of "pruner", "full". 31 | 32 | 33 | -U 'unicharset':: 34 | (Input) The unicharset for the language. 35 | 36 | -F 'font_properties_file':: 37 | (Input) font properties file, each line is of the following form, where each field other than the font name is 0 or 1: 38 | 39 | *font_name* *italic* *bold* *fixed_pitch* *serif* *fraktur* 40 | 41 | -X 'xheights_file':: 42 | (Input) x heights file, each line is of the following form, where xheight is calculated as the pixel x height of a character drawn at 32pt on 300 dpi. [ That is, if base x height + ascenders + descenders = 133, how much is x height? ] 43 | 44 | *font_name* *xheight* 45 | 46 | -output_trainer 'trainer':: 47 | (Output, Optional) Filename for output trainer. 48 | 49 | SEE ALSO 50 | -------- 51 | tesseract(1) 52 | 53 | COPYING 54 | ------- 55 | Copyright \(C) 2012 Google, Inc. 56 | Licensed under the Apache License, Version 2.0 57 | 58 | AUTHOR 59 | ------ 60 | The Tesseract OCR engine was written by Ray Smith and his research groups 61 | at Hewlett Packard (1985-1995) and Google (2006-present). 62 | -------------------------------------------------------------------------------- /doc/mftraining.1.asc: -------------------------------------------------------------------------------- 1 | MFTRAINING(1) 2 | ============= 3 | :doctype: manpage 4 | 5 | NAME 6 | ---- 7 | mftraining - feature training for Tesseract 8 | 9 | SYNOPSIS 10 | -------- 11 | mftraining -U 'unicharset' -O 'lang.unicharset' 'FILE'... 12 | 13 | DESCRIPTION 14 | ----------- 15 | mftraining takes a list of .tr files, from which it generates the 16 | files *inttemp* (the shape prototypes), *shapetable*, and *pffmtable* 17 | (the number of expected features for each character). 
(A fourth file 18 | called Microfeat is also written by this program, but it is not used.) 19 | 20 | OPTIONS 21 | ------- 22 | -U 'FILE':: 23 | (Input) The unicharset generated by unicharset_extractor(1) 24 | 25 | -F 'font_properties_file':: 26 | (Input) font properties file, each line is of the following form, where each field other than the font name is 0 or 1: 27 | 28 | *font_name* *italic* *bold* *fixed_pitch* *serif* *fraktur* 29 | 30 | -X 'xheights_file':: 31 | (Input) x heights file, each line is of the following form, where xheight is calculated as the pixel x height of a character drawn at 32pt on 300 dpi. [ That is, if base x height + ascenders + descenders = 133, how much is x height? ] 32 | 33 | *font_name* *xheight* 34 | 35 | -D 'dir':: 36 | Directory to write output files to. 37 | 38 | -O 'FILE':: 39 | (Output) The output unicharset that will be given to combine_tessdata(1) 40 | 41 | SEE ALSO 42 | -------- 43 | tesseract(1), cntraining(1), unicharset_extractor(1), combine_tessdata(1), 44 | shapeclustering(1), unicharset(5) 45 | 46 | 47 | 48 | COPYING 49 | ------- 50 | Copyright \(C) Hewlett-Packard Company, 1988 51 | Licensed under the Apache License, Version 2.0 52 | 53 | AUTHOR 54 | ------ 55 | The Tesseract OCR engine was written by Ray Smith and his research groups 56 | at Hewlett Packard (1985-1995) and Google (2006-present). 57 | -------------------------------------------------------------------------------- /src/training/validate_indic.h: -------------------------------------------------------------------------------- 1 | #ifndef TESSERACT_TRAINING_VALIDATE_INDIC_H_ 2 | #define TESSERACT_TRAINING_VALIDATE_INDIC_H_ 3 | 4 | #include "validator.h" 5 | 6 | namespace tesseract { 7 | 8 | // Subclass of Validator that validates and segments Indic scripts in the 9 | // unicode range 0x900-0xdff (Devanagari-Sinhala). 
// Validator subclass for Indic scripts; overrides the grapheme-consuming and
// character-classification hooks of Validator.
class ValidateIndic : public Validator {
 public:
  // Passes script and report_errors straight through to the Validator base;
  // this class adds no state of its own.
  ValidateIndic(ViramaScript script, bool report_errors)
      : Validator(script, report_errors) {}
  // Nothing to release: the body is empty.
  ~ValidateIndic() {}

 protected:
  // Returns whether codes matches the pattern for an Indic Grapheme.
  // Consumes the next Grapheme in codes_[codes_used_++...] and copies it to
  // parts_ and output_. Returns true if a valid Grapheme was consumed,
  // otherwise does not increment codes_used_.
  bool ConsumeGraphemeIfValid() override;
  // Returns the CharClass corresponding to the given Unicode ch.
  Validator::CharClass UnicodeToCharClass(char32 ch) const override;

 private:
  // The helpers below are declared here and defined elsewhere; their
  // descriptions state the intended contract.

  // Helper consumes/copies a virama and any associated post-virama joiners.
  bool ConsumeViramaIfValid(IndicPair joiner, bool post_matra);
  // Helper consumes/copies a series of consonants separated by viramas while
  // valid, but not any vowel or other modifiers.
  bool ConsumeConsonantHeadIfValid();
  // Helper consumes/copies a tail part of a consonant, comprising optional
  // matra/piece, vowel modifier, vedic mark, terminating virama.
  bool ConsumeConsonantTailIfValid();
  // Helper consumes/copies a vowel and optional modifiers.
  bool ConsumeVowelIfValid();

  // Some special unicodes used only for Indic processing.
  static const char32 kYayana = 0xdba;  // Sinhala Ya
  static const char32 kRayana = 0xdbb;  // Sinhala Ra
};
8 | 9 | SYNOPSIS 10 | -------- 11 | *lstmeval* --model 'lang.lstm|langtrain_checkpoint|pluscharsN.NNN_NN.checkpoint' [--traineddata lang/lang.traineddata] --eval_listfile 'lang.eval_files.txt' [--verbosity N] [--max_image_MB NNNN] 12 | 13 | DESCRIPTION 14 | ----------- 15 | lstmeval(1) evaluates LSTM-based networks. Either a recognition model or a training checkpoint can be given as input for evaluation along with a list of lstmf files. If evaluating a training checkpoint, '--traineddata' should also be specified. 16 | 17 | OPTIONS 18 | ------- 19 | '--model FILE':: 20 | Name of model file (training or recognition) (type:string default:) 21 | 22 | '--traineddata FILE':: 23 | If model is a training checkpoint, then traineddata must be the traineddata file that was given to the trainer (type:string default:) 24 | 25 | '--eval_listfile FILE':: 26 | File listing sample files in lstmf training format. (type:string default:) 27 | 28 | '--max_image_MB INT':: 29 | Max memory to use for images. (type:int default:2000) 30 | 31 | '--verbosity INT':: 32 | Amount of diagnostic information to output (0-2). (type:int default:1) 33 | 34 | HISTORY 35 | ------- 36 | lstmeval(1) was first made available for tesseract 4.00.00alpha. 37 | 38 | RESOURCES 39 | --------- 40 | Main web site: <https://github.com/tesseract-ocr> + 41 | Information on training tesseract LSTM: <https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract-4.00> 42 | 43 | SEE ALSO 44 | -------- 45 | tesseract(1) 46 | 47 | COPYING 48 | ------- 49 | Copyright \(C) 2012 Google, Inc. 50 | Licensed under the Apache License, Version 2.0 51 | 52 | AUTHOR 53 | ------ 54 | The Tesseract OCR engine was written by Ray Smith and his research groups 55 | at Hewlett Packard (1985-1995) and Google (2006-present). 
56 | -------------------------------------------------------------------------------- /src/cutil/emalloc.cpp: -------------------------------------------------------------------------------- 1 | /************************************************************************** 2 | * Filename: emalloc.cpp 3 | * Purpose: Routines for trapping memory allocation errors. 4 | * Author: Dan Johnson 5 | ** 6 | ** (c) Copyright Hewlett-Packard Company, 1988. 7 | ** Licensed under the Apache License, Version 2.0 (the "License"); 8 | ** you may not use this file except in compliance with the License. 9 | ** You may obtain a copy of the License at 10 | ** http://www.apache.org/licenses/LICENSE-2.0 11 | ** Unless required by applicable law or agreed to in writing, software 12 | ** distributed under the License is distributed on an "AS IS" BASIS, 13 | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | ** See the License for the specific language governing permissions and 15 | ** limitations under the License. 16 | ******************************************************************************/ 17 | 18 | #include "emalloc.h" 19 | #include 20 | #include "errcode.h" // for ASSERT_HOST 21 | 22 | /** 23 | * This routine attempts to allocate the specified number of 24 | * bytes. If the memory can be allocated, a pointer to the 25 | * memory is returned. If the memory cannot be allocated, or 26 | * if the allocation request is negative or zero, 27 | * an error is trapped. 28 | * @param Size number of bytes of memory to be allocated 29 | * @return Pointer to allocated memory. 
30 | */ 31 | void *Emalloc(int Size) { 32 | ASSERT_HOST(Size > 0); 33 | void* Buffer = malloc(Size); 34 | ASSERT_HOST(Buffer != nullptr); 35 | return Buffer; 36 | } 37 | 38 | void *Erealloc(void *ptr, int size) { 39 | ASSERT_HOST(size > 0 || (size == 0 && ptr != nullptr)); 40 | void* Buffer = realloc(ptr, size); 41 | ASSERT_HOST(Buffer != nullptr || size == 0); 42 | return Buffer; 43 | } 44 | 45 | void Efree(void *ptr) { 46 | ASSERT_HOST(ptr != nullptr); 47 | free(ptr); 48 | } 49 | -------------------------------------------------------------------------------- /src/dict/dawg_cache.h: -------------------------------------------------------------------------------- 1 | /////////////////////////////////////////////////////////////////////// 2 | // File: dawg_cache.h 3 | // Description: A class that knows about loading and caching dawgs. 4 | // Author: David Eger 5 | // Created: Fri Jan 27 12:08:00 PST 2012 6 | // 7 | // (C) Copyright 2012, Google Inc. 8 | // Licensed under the Apache License, Version 2.0 (the "License"); 9 | // you may not use this file except in compliance with the License. 10 | // You may obtain a copy of the License at 11 | // http://www.apache.org/licenses/LICENSE-2.0 12 | // Unless required by applicable law or agreed to in writing, software 13 | // distributed under the License is distributed on an "AS IS" BASIS, 14 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | // See the License for the specific language governing permissions and 16 | // limitations under the License. 
17 | // 18 | /////////////////////////////////////////////////////////////////////// 19 | 20 | #ifndef TESSERACT_DICT_DAWG_CACHE_H_ 21 | #define TESSERACT_DICT_DAWG_CACHE_H_ 22 | 23 | #include "dawg.h" 24 | #include "object_cache.h" 25 | #include "strngs.h" 26 | #include "tessdatamanager.h" 27 | 28 | namespace tesseract { 29 | 30 | class DawgCache { 31 | public: 32 | Dawg *GetSquishedDawg(const STRING &lang, TessdataType tessdata_dawg_type, 33 | int debug_level, TessdataManager *data_file); 34 | 35 | // If we manage the given dawg, decrement its count, 36 | // and possibly delete it if the count reaches zero. 37 | // If dawg is unknown to us, return false. 38 | bool FreeDawg(Dawg *dawg) { 39 | return dawgs_.Free(dawg); 40 | } 41 | 42 | // Free up any currently unused dawgs. 43 | void DeleteUnusedDawgs() { 44 | dawgs_.DeleteUnusedObjects(); 45 | } 46 | 47 | private: 48 | ObjectCache<Dawg> dawgs_; // cache of the Dawg objects handled by FreeDawg/DeleteUnusedDawgs 49 | }; 50 | 51 | } // namespace tesseract 52 | 53 | #endif // TESSERACT_DICT_DAWG_CACHE_H_ 54 | -------------------------------------------------------------------------------- /doc/shapeclustering.1.asc: -------------------------------------------------------------------------------- 1 | SHAPECLUSTERING(1) 2 | ================== 3 | :doctype: manpage 4 | 5 | NAME 6 | ---- 7 | shapeclustering - shape clustering training for Tesseract 8 | 9 | SYNOPSIS 10 | -------- 11 | shapeclustering -D 'output_dir' 12 | -U 'unicharset' -O 'mfunicharset' 13 | -F 'font_props' -X 'xheights' 14 | 'FILE'... 15 | 16 | DESCRIPTION 17 | ----------- 18 | shapeclustering(1) takes extracted feature .tr files (generated by 19 | tesseract(1) run in a special mode from box files) and produces a 20 | file *shapetable* and an enhanced unicharset. This program is still 21 | experimental, and is not required (yet) for training Tesseract. 22 | 23 | OPTIONS 24 | ------- 25 | -U 'FILE':: 26 | The unicharset generated by unicharset_extractor(1). 27 | 28 | -D 'dir':: 29 | Directory to write output files to.
30 | 31 | -F 'font_properties_file':: 32 | (Input) font properties file, where each line is of the following form, where each field other than the font name is 0 or 1: 33 | 34 | 'font_name' 'italic' 'bold' 'fixed_pitch' 'serif' 'fraktur' 35 | 36 | -X 'xheights_file':: 37 | (Input) x heights file, each line is of the following form, where xheight is calculated as the pixel x height of a character drawn at 32pt on 300 dpi. [ That is, if base x height + ascenders + descenders = 133, how much is x height? ] 38 | 39 | 'font_name' 'xheight' 40 | 41 | -O 'FILE':: 42 | The output unicharset that will be given to combine_tessdata(1). 43 | 44 | SEE ALSO 45 | -------- 46 | tesseract(1), cntraining(1), unicharset_extractor(1), combine_tessdata(1), 47 | unicharset(5) 48 | 49 | 50 | 51 | COPYING 52 | ------- 53 | Copyright \(C) Google, 2011 54 | Licensed under the Apache License, Version 2.0 55 | 56 | AUTHOR 57 | ------ 58 | The Tesseract OCR engine was written by Ray Smith and his research groups 59 | at Hewlett Packard (1985-1995) and Google (2006-present). 60 | -------------------------------------------------------------------------------- /src/textord/blobgrid.cpp: -------------------------------------------------------------------------------- 1 | /////////////////////////////////////////////////////////////////////// 2 | // File: blobgrid.cpp 3 | // Description: BBGrid of BLOBNBOX with useful BLOBNBOX-specific methods. 4 | // Copyright 2011 Google Inc. All Rights Reserved. 5 | // Author: rays@google.com (Ray Smith) 6 | // Created: Sat Jun 11 10:30:01 PST 2011 7 | // 8 | // Licensed under the Apache License, Version 2.0 (the "License"); 9 | // you may not use this file except in compliance with the License. 
10 | // You may obtain a copy of the License at 11 | // http://www.apache.org/licenses/LICENSE-2.0 12 | // Unless required by applicable law or agreed to in writing, software 13 | // distributed under the License is distributed on an "AS IS" BASIS, 14 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | // See the License for the specific language governing permissions and 16 | // limitations under the License. 17 | // 18 | /////////////////////////////////////////////////////////////////////// 19 | 20 | #include "blobgrid.h" 21 | 22 | namespace tesseract { 23 | 24 | BlobGrid::BlobGrid(int gridsize, const ICOORD& bleft, const ICOORD& tright) 25 | : BBGrid(gridsize, bleft, tright) { 26 | } 27 | 28 | // Destructor. 29 | // It is defined here, so the compiler can create a single vtable 30 | // instead of weak vtables in every compilation unit. 31 | BlobGrid::~BlobGrid() = default; 32 | 33 | // Inserts all the blobs from the given list, with x and y spreading, 34 | // without removing from the source list, so ownership remains with the 35 | // source list. 36 | void BlobGrid::InsertBlobList(BLOBNBOX_LIST* blobs) { 37 | BLOBNBOX_IT blob_it(blobs); 38 | for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { 39 | BLOBNBOX* blob = blob_it.data(); 40 | if (!blob->joined_to_prev()) 41 | InsertBBox(true, true, blob); 42 | } 43 | } 44 | 45 | } // namespace tesseract. 46 | -------------------------------------------------------------------------------- /src/ccstruct/polyaprx.h: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | * File: polyaprx.h (Formerly polygon.h) 3 | * Description: Code for polygonal approximation from old edgeprog. 4 | * Author: Ray Smith 5 | * Created: Thu Nov 25 11:42:04 GMT 1993 6 | * 7 | * (C) Copyright 1993, Hewlett-Packard Ltd. 
8 | ** Licensed under the Apache License, Version 2.0 (the "License"); 9 | ** you may not use this file except in compliance with the License. 10 | ** You may obtain a copy of the License at 11 | ** http://www.apache.org/licenses/LICENSE-2.0 12 | ** Unless required by applicable law or agreed to in writing, software 13 | ** distributed under the License is distributed on an "AS IS" BASIS, 14 | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | ** See the License for the specific language governing permissions and 16 | ** limitations under the License. 17 | * 18 | **********************************************************************/ 19 | 20 | #ifndef POLYAPRX_H 21 | #define POLYAPRX_H 22 | 23 | class C_OUTLINE; 24 | struct EDGEPT; 25 | struct TESSLINE; 26 | 27 | // convert a chain-coded input to the old OUTLINE approximation 28 | TESSLINE* ApproximateOutline(bool allow_detailed_fx, C_OUTLINE *c_outline); 29 | EDGEPT *edgesteps_to_edgepts ( //convert outline 30 | C_OUTLINE * c_outline, //input 31 | EDGEPT edgepts[] //output is array 32 | ); 33 | void fix2( //polygonal approx 34 | EDGEPT *start, /*loop to approximate */ 35 | int area); 36 | EDGEPT *poly2( //second poly 37 | EDGEPT *startpt, /*start of loop */ 38 | int area /*area of blob box */ 39 | ); 40 | void cutline( //recursive refine 41 | EDGEPT *first, /*ends of line */ 42 | EDGEPT *last, 43 | int area /*area of object */ 44 | ); 45 | #endif 46 | -------------------------------------------------------------------------------- /src/ccutil/ccutil.cpp: -------------------------------------------------------------------------------- 1 | // Copyright 2008 Google Inc. All Rights Reserved. 2 | // Author: scharron@google.com (Samuel Charron) 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at 6 | // http://www.apache.org/licenses/LICENSE-2.0 7 | // Unless required by applicable law or agreed to in writing, software 8 | // distributed under the License is distributed on an "AS IS" BASIS, 9 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | // See the License for the specific language governing permissions and 11 | // limitations under the License. 12 | 13 | #include "ccutil.h" 14 | 15 | namespace tesseract { 16 | CCUtil::CCUtil() : 17 | params_(), 18 | INT_INIT_MEMBER(ambigs_debug_level, 0, "Debug level for unichar ambiguities", 19 | &params_), 20 | BOOL_MEMBER(use_definite_ambigs_for_classifier, false, "Use definite" 21 | " ambiguities when running character classifier", &params_), 22 | BOOL_MEMBER(use_ambigs_for_adaption, false, "Use ambigs for deciding" 23 | " whether to adapt to a character", &params_) { 24 | } 25 | 26 | // Destructor. 27 | // It is defined here, so the compiler can create a single vtable 28 | // instead of weak vtables in every compilation unit.
29 | CCUtil::~CCUtil() = default; 30 | 31 | CCUtilMutex::CCUtilMutex() { 32 | #ifdef _WIN32 33 | mutex_ = CreateMutex(0, FALSE, 0); 34 | #else 35 | pthread_mutex_init(&mutex_, nullptr); 36 | #endif 37 | } 38 | 39 | void CCUtilMutex::Lock() { 40 | #ifdef _WIN32 41 | WaitForSingleObject(mutex_, INFINITE); 42 | #else 43 | pthread_mutex_lock(&mutex_); 44 | #endif 45 | } 46 | 47 | void CCUtilMutex::Unlock() { 48 | #ifdef _WIN32 49 | ReleaseMutex(mutex_); 50 | #else 51 | pthread_mutex_unlock(&mutex_); 52 | #endif 53 | } 54 | 55 | CCUtilMutex tprintfMutex; // should remain global 56 | } // namespace tesseract 57 | -------------------------------------------------------------------------------- /src/classify/blobclass.h: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | ** Filename: blobclass.h 3 | ** Purpose: Interface to high level classification and training. 4 | ** Author: Dan Johnson 5 | ** 6 | ** (c) Copyright Hewlett-Packard Company, 1988. 7 | ** Licensed under the Apache License, Version 2.0 (the "License"); 8 | ** you may not use this file except in compliance with the License. 9 | ** You may obtain a copy of the License at 10 | ** http://www.apache.org/licenses/LICENSE-2.0 11 | ** Unless required by applicable law or agreed to in writing, software 12 | ** distributed under the License is distributed on an "AS IS" BASIS, 13 | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | ** See the License for the specific language governing permissions and 15 | ** limitations under the License. 
16 | ******************************************************************************/ 17 | 18 | #ifndef BLOBCLASS_H 19 | #define BLOBCLASS_H 20 | 21 | /**---------------------------------------------------------------------------- 22 | Include Files and Type Defines 23 | ----------------------------------------------------------------------------**/ 24 | #include "strngs.h" 25 | 26 | /**---------------------------------------------------------------------------- 27 | Public Function Prototypes 28 | ----------------------------------------------------------------------------**/ 29 | namespace tesseract { 30 | // Finds the name of the training font and returns it in fontname, by cutting 31 | // it out based on the expectation that the filename is of the form: 32 | // /path/to/dir/[lang].[fontname].exp[num] 33 | // The [lang], [fontname] and [num] fields should not have '.' characters. 34 | // If the global parameter classify_font_name is set, its value is used instead. 35 | void ExtractFontName(const STRING& filename, STRING* fontname); 36 | 37 | } // namespace tesseract. 
38 | 39 | #endif 40 | -------------------------------------------------------------------------------- /android/jni/Android.mk: -------------------------------------------------------------------------------- 1 | LOCAL_PATH := $(call my-dir) 2 | include $(CLEAR_VARS) 3 | 4 | LOCAL_MODULE := tesseract-$(APP_ABI) 5 | 6 | LOCAL_STATIC_LIBRARIES := \ 7 | base \ 8 | leptonica-$(APP_ABI) 9 | 10 | LOCAL_C_INCLUDES := $(APP_C_INCLUDES) 11 | 12 | LOCAL_C_INCLUDES += \ 13 | $(LOCAL_PATH)/../../api \ 14 | $(LOCAL_PATH)/../../ccmain\ 15 | $(LOCAL_PATH)/../../ccstruct\ 16 | $(LOCAL_PATH)/../../ccutil\ 17 | $(LOCAL_PATH)/../../classify\ 18 | $(LOCAL_PATH)/../../cutil\ 19 | $(LOCAL_PATH)/../../dict\ 20 | $(LOCAL_PATH)/../../image\ 21 | $(LOCAL_PATH)/../../textord\ 22 | $(LOCAL_PATH)/../../third_party\ 23 | $(LOCAL_PATH)/../../wordrec\ 24 | $(LOCAL_PATH)/../../opencl\ 25 | $(LOCAL_PATH)/../../viewer\ 26 | $(LOCAL_PATH)/../../../leptonica/include 27 | 28 | $(info local c includes=$(LOCAL_C_INCLUDES)) 29 | $(info local path=$(LOCAL_PATH)) 30 | LOCAL_SRC_FILES := $(wildcard $(LOCAL_PATH)/../../api/*.cpp $(LOCAL_PATH)/../../ccmain/*.cpp $(LOCAL_PATH)/../../ccstruct/*.cpp $(LOCAL_PATH)/../../ccutil/*.cpp $(LOCAL_PATH)/../../classify/*.cpp $(LOCAL_PATH)/../../cutil/*.cpp $(LOCAL_PATH)/../../dict/*.cpp $(LOCAL_PATH)/../../image/*.cpp $(LOCAL_PATH)/../../textord/*.cpp $(LOCAL_PATH)/../../viewer/*.cpp $(LOCAL_PATH)/../../wordrec/*.cpp) 31 | 32 | EXPLICIT_SRC_EXCLUDES := \ 33 | $(LOCAL_PATH)/../../api/altorenderer.cpp \ 34 | $(LOCAL_PATH)/../../api/hocrrenderer.cpp \ 35 | $(LOCAL_PATH)/../../api/pdfrenderer.cpp \ 36 | $(LOCAL_PATH)/../../api/tesseractmain.cpp \ 37 | 38 | LOCAL_SRC_FILES := $(filter-out $(EXPLICIT_SRC_EXCLUDES), $(LOCAL_SRC_FILES)) 39 | 40 | LOCAL_SRC_FILES := $(LOCAL_SRC_FILES:$(LOCAL_PATH)/%=%) 41 | 42 | $(info local src files = $(LOCAL_SRC_FILES)) 43 | 44 | LOCAL_LDLIBS := -ldl -llog -ljnigraphics 45 | LOCAL_CFLAGS := -DANDROID_BUILD -DGRAPHICS_DISABLED 46 | 47 | include 
$(BUILD_SHARED_LIBRARY) 48 | 49 | $(call import-module,base/port) 50 | $(call import-module,mobile/util/hash) 51 | $(call import-module,third_party/leptonica/android/jni) 52 | -------------------------------------------------------------------------------- /src/classify/outfeat.h: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | ** Filename: outfeat.h 3 | ** Purpose: Definition of outline features. 4 | ** Author: Dan Johnson 5 | ** 6 | ** (c) Copyright Hewlett-Packard Company, 1988. 7 | ** Licensed under the Apache License, Version 2.0 (the "License"); 8 | ** you may not use this file except in compliance with the License. 9 | ** You may obtain a copy of the License at 10 | ** http://www.apache.org/licenses/LICENSE-2.0 11 | ** Unless required by applicable law or agreed to in writing, software 12 | ** distributed under the License is distributed on an "AS IS" BASIS, 13 | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | ** See the License for the specific language governing permissions and 15 | ** limitations under the License. 
16 | ******************************************************************************/ 17 | 18 | #ifndef OUTFEAT_H 19 | #define OUTFEAT_H 20 | 21 | /**---------------------------------------------------------------------------- 22 | Include Files and Type Defines 23 | ----------------------------------------------------------------------------**/ 24 | #include "ocrfeatures.h" 25 | #include "fpoint.h" 26 | #include "mfoutline.h" 27 | 28 | typedef enum { 29 | OutlineFeatX, 30 | OutlineFeatY, 31 | OutlineFeatLength, 32 | OutlineFeatDir 33 | } OUTLINE_FEAT_PARAM_NAME; 34 | 35 | #define MAX_OUTLINE_FEATURES (100) 36 | 37 | /*--------------------------------------------------------------------------- 38 | Public Function Prototypes 39 | ----------------------------------------------------------------------------*/ 40 | void AddOutlineFeatureToSet(FPOINT *Start, 41 | FPOINT *End, 42 | FEATURE_SET FeatureSet); 43 | 44 | void ConvertToOutlineFeatures(MFOUTLINE Outline, FEATURE_SET FeatureSet); 45 | 46 | void NormalizeOutlineX(FEATURE_SET FeatureSet); 47 | 48 | #endif 49 | -------------------------------------------------------------------------------- /src/ccutil/bits16.h: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | * File: bits16.h (Formerly bits8.h) 3 | * Description: Code for 16 bit field class. 4 | * Author: Phil Cheatle 5 | * 6 | * (C) Copyright 1991, Hewlett-Packard Ltd. 7 | ** Licensed under the Apache License, Version 2.0 (the "License"); 8 | ** you may not use this file except in compliance with the License. 9 | ** You may obtain a copy of the License at 10 | ** http://www.apache.org/licenses/LICENSE-2.0 11 | ** Unless required by applicable law or agreed to in writing, software 12 | ** distributed under the License is distributed on an "AS IS" BASIS, 13 | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | ** See the License for the specific language governing permissions and 15 | ** limitations under the License. 16 | * 17 | **********************************************************************/ 18 | 19 | #ifndef BITS16_H 20 | #define BITS16_H 21 | 22 | #include <cstdint> // for uint8_t, uint16_t 23 | #include "platform.h" // for DLLSYM 24 | 25 | class DLLSYM BITS16 { 26 | public: 27 | uint16_t val; 28 | 29 | BITS16() { val = 0; } // constructor: all bits cleared 30 | 31 | BITS16(uint16_t init) { val = init; } 32 | 33 | void turn_on_bit( // set specified bit 34 | uint8_t bit_num) { // bit to set 0..15 35 | val = val | 01 << bit_num; 36 | } 37 | 38 | void turn_off_bit( // clear specified bit 39 | uint8_t bit_num) { // bit to clear 0..15 40 | val = val & ~(01 << bit_num); 41 | } 42 | 43 | void set_bit( // set or clear specified bit 44 | uint8_t bit_num, // bit to change 0..15 45 | bool value) { // new value of the bit 46 | if (value) 47 | val = val | 01 << bit_num; 48 | else 49 | val = val & ~(01 << bit_num); 50 | } 51 | 52 | bool bit( // access bit 53 | uint8_t bit_num) const { // bit to access 0..15 54 | return (val >> bit_num) & 01; 55 | } 56 | }; 57 | 58 | #endif 59 | -------------------------------------------------------------------------------- /src/dict/stopper.h: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | ** Filename: stopper.h 3 | ** Purpose: Stopping criteria for word classifier. 4 | ** Author: Dan Johnson 5 | ** History: Wed May 1 09:42:57 1991, DSJ, Created. 6 | ** 7 | ** (c) Copyright Hewlett-Packard Company, 1988. 8 | ** Licensed under the Apache License, Version 2.0 (the "License"); 9 | ** you may not use this file except in compliance with the License.
10 | ** You may obtain a copy of the License at 11 | ** http://www.apache.org/licenses/LICENSE-2.0 12 | ** Unless required by applicable law or agreed to in writing, software 13 | ** distributed under the License is distributed on an "AS IS" BASIS, 14 | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | ** See the License for the specific language governing permissions and 16 | ** limitations under the License. 17 | ******************************************************************************/ 18 | #ifndef STOPPER_H 19 | #define STOPPER_H 20 | 21 | /*---------------------------------------------------------------------------- 22 | Include Files and Type Defines 23 | ----------------------------------------------------------------------------*/ 24 | 25 | #include "genericvector.h" 26 | #include "params.h" 27 | #include "ratngs.h" 28 | #include "unichar.h" 29 | 30 | class WERD_CHOICE; 31 | 32 | using BLOB_WIDTH = uint8_t; 33 | 34 | struct DANGERR_INFO { 35 | DANGERR_INFO() : 36 | begin(-1), end(-1), dangerous(false), correct_is_ngram(false), 37 | leftmost(INVALID_UNICHAR_ID) {} 38 | DANGERR_INFO(int b, int e, bool d, bool n, UNICHAR_ID l) : 39 | begin(b), end(e), dangerous(d), correct_is_ngram(n), leftmost(l) {} 40 | int begin; 41 | int end; 42 | bool dangerous; 43 | bool correct_is_ngram; 44 | UNICHAR_ID leftmost; // in the replacement, what's the leftmost character? 45 | }; 46 | 47 | using DANGERR = GenericVector<DANGERR_INFO>; 48 | 49 | 50 | #endif 51 | -------------------------------------------------------------------------------- /src/training/set_unicharset_properties.cpp: -------------------------------------------------------------------------------- 1 | // Licensed under the Apache License, Version 2.0 (the "License"); 2 | // you may not use this file except in compliance with the License.
3 | // You may obtain a copy of the License at 4 | // http://www.apache.org/licenses/LICENSE-2.0 5 | // Unless required by applicable law or agreed to in writing, software 6 | // distributed under the License is distributed on an "AS IS" BASIS, 7 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 8 | // See the License for the specific language governing permissions and 9 | // limitations under the License. 10 | 11 | // This program reads a unicharset file, puts the result in a UNICHARSET 12 | // object, fills it with properties about the unichars it contains and writes 13 | // the result back to a file. 14 | 15 | #include "commandlineflags.h" 16 | #include "commontraining.h" // CheckSharedLibraryVersion 17 | #include "tprintf.h" 18 | #include "unicharset_training_utils.h" 19 | 20 | // The directory that is searched for universal script unicharsets. 21 | STRING_PARAM_FLAG(script_dir, "", 22 | "Directory name for input script unicharsets/xheights"); 23 | 24 | // Flags from commontraining.cpp 25 | DECLARE_STRING_PARAM_FLAG(U); 26 | DECLARE_STRING_PARAM_FLAG(O); 27 | DECLARE_STRING_PARAM_FLAG(X); 28 | 29 | int main(int argc, char** argv) { 30 | tesseract::CheckSharedLibraryVersion(); 31 | tesseract::ParseCommandLineFlags(argv[0], &argc, &argv, true); 32 | 33 | // Check validity of input flags. 
34 | if (FLAGS_U.empty() || FLAGS_O.empty()) { 35 | tprintf("Specify both input and output unicharsets!\n"); 36 | exit(1); 37 | } 38 | if (FLAGS_script_dir.empty()) { 39 | tprintf("Must specify a script_dir!\n"); 40 | exit(1); 41 | } 42 | 43 | tesseract::SetPropertiesForInputFile(FLAGS_script_dir.c_str(), 44 | FLAGS_U.c_str(), FLAGS_O.c_str(), 45 | FLAGS_X.c_str()); 46 | return 0; 47 | } 48 | -------------------------------------------------------------------------------- /src/textord/Makefile.am: -------------------------------------------------------------------------------- 1 | AM_CPPFLAGS += \ 2 | -I$(top_srcdir)/src/ccstruct \ 3 | -I$(top_srcdir)/src/ccutil \ 4 | -I$(top_srcdir)/src/viewer \ 5 | -I$(top_srcdir)/src/ccmain \ 6 | -I$(top_srcdir)/src/wordrec \ 7 | -I$(top_srcdir)/src/api \ 8 | -I$(top_srcdir)/src/cutil \ 9 | -I$(top_srcdir)/src/classify \ 10 | -I$(top_srcdir)/src/dict \ 11 | -I$(top_srcdir)/src/opencl 12 | 13 | AM_CPPFLAGS += $(OPENCL_CPPFLAGS) 14 | 15 | if VISIBILITY 16 | AM_CPPFLAGS += -DTESS_EXPORTS \ 17 | -fvisibility=hidden -fvisibility-inlines-hidden 18 | endif 19 | 20 | 21 | noinst_HEADERS = \ 22 | alignedblob.h baselinedetect.h bbgrid.h blkocc.h blobgrid.h \ 23 | ccnontextdetect.h cjkpitch.h colfind.h colpartition.h colpartitionset.h \ 24 | colpartitiongrid.h \ 25 | devanagari_processing.h drawedg.h drawtord.h edgblob.h edgloop.h \ 26 | equationdetectbase.h \ 27 | fpchop.h gap_map.h imagefind.h linefind.h makerow.h oldbasel.h \ 28 | pithsync.h pitsync1.h scanedg.h sortflts.h strokewidth.h \ 29 | tabfind.h tablefind.h tabvector.h \ 30 | tablerecog.h textlineprojection.h textord.h \ 31 | topitch.h tordmain.h tovars.h \ 32 | underlin.h wordseg.h workingpartset.h 33 | 34 | noinst_LTLIBRARIES = libtesseract_textord.la 35 | 36 | libtesseract_textord_la_SOURCES = \ 37 | alignedblob.cpp baselinedetect.cpp bbgrid.cpp blkocc.cpp blobgrid.cpp \ 38 | ccnontextdetect.cpp cjkpitch.cpp colfind.cpp colpartition.cpp colpartitionset.cpp \ 39 | 
colpartitiongrid.cpp devanagari_processing.cpp \ 40 | drawedg.cpp drawtord.cpp edgblob.cpp edgloop.cpp \ 41 | equationdetectbase.cpp \ 42 | fpchop.cpp gap_map.cpp imagefind.cpp linefind.cpp makerow.cpp oldbasel.cpp \ 43 | pithsync.cpp pitsync1.cpp scanedg.cpp sortflts.cpp strokewidth.cpp \ 44 | tabfind.cpp tablefind.cpp tabvector.cpp \ 45 | tablerecog.cpp textlineprojection.cpp textord.cpp \ 46 | topitch.cpp tordmain.cpp tospace.cpp tovars.cpp \ 47 | underlin.cpp wordseg.cpp workingpartset.cpp 48 | -------------------------------------------------------------------------------- /java/com/google/scrollview/ui/SVEmptyMenuItem.java: -------------------------------------------------------------------------------- 1 | // Copyright 2007 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); You may not 4 | // use this file except in compliance with the License. You may obtain a copy of 5 | // the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by 6 | // applicable law or agreed to in writing, software distributed under the 7 | // License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 8 | // OF ANY KIND, either express or implied. See the License for the specific 9 | // language governing permissions and limitations under the License. 10 | 11 | package com.google.scrollview.ui; 12 | 13 | /** 14 | * A MenuListItem is any sort of menu entry. This can either be within a popup 15 | * menu or within a menubar. It can either be a submenu (only name and 16 | * command-id) or a name with an associated value and possibly description. They 17 | * can also have new entries added (if they are submenus). 
18 | * 19 | * @author wanke@google.com 20 | */ 21 | 22 | import com.google.scrollview.ScrollView; 23 | import com.google.scrollview.events.SVEvent; 24 | import com.google.scrollview.events.SVEventType; 25 | 26 | import javax.swing.JMenuItem; 27 | 28 | /** 29 | * Constructs a new menulistitem which just has an ID and a name attached to 30 | * it. In this case, we will have to ask for the value of the item and its 31 | * description if it gets called. 32 | */ 33 | class SVEmptyMenuItem extends SVAbstractMenuItem { 34 | SVEmptyMenuItem(int id, String name) { 35 | super(id, name, new JMenuItem(name)); 36 | } 37 | /** What to do when user clicks on this item. */ 38 | @Override 39 | public void performAction(SVWindow window, SVEventType eventType) { 40 | // Send an event indicating that someone clicked on an entry. 41 | // Value will be null here. 42 | SVEvent svme = 43 | new SVEvent(eventType, window, id, getValue()); 44 | ScrollView.addMessage(svme); 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/training/validate_myanmar.h: -------------------------------------------------------------------------------- 1 | #ifndef TESSERACT_TRAINING_VALIDATE_MYANMAR_H_ 2 | #define TESSERACT_TRAINING_VALIDATE_MYANMAR_H_ 3 | 4 | #include "validator.h" 5 | 6 | namespace tesseract { 7 | 8 | // Subclass of Validator that validates and segments Myanmar. 9 | class ValidateMyanmar : public Validator { 10 | public: 11 | ValidateMyanmar(ViramaScript script, bool report_errors) 12 | : Validator(script, report_errors) {} 13 | ~ValidateMyanmar() {} 14 | 15 | protected: 16 | // Returns whether codes matches the pattern for a Myanmar Grapheme. 17 | // Consumes the next Grapheme in codes_[codes_used_++...] and copies it to 18 | // parts_ and output_. Returns true if a valid Grapheme was consumed, 19 | // otherwise does not increment codes_used_. 
20 | bool ConsumeGraphemeIfValid() override; 21 | // Returns the CharClass corresponding to the given Unicode ch. 22 | Validator::CharClass UnicodeToCharClass(char32 ch) const override; 23 | 24 | private: 25 | // Helper consumes/copies a virama and any subscript consonant. 26 | // Returns true if the end of input is reached. 27 | bool ConsumeSubscriptIfPresent(); 28 | // Helper consumes/copies a series of optional signs. 29 | // Returns true if the end of input is reached. 30 | bool ConsumeOptionalSignsIfPresent(); 31 | // Returns true if the unicode is a Myanmar "letter" including consonants 32 | // and independent vowels. Although table 16-3 distinguishes between some 33 | // base consonants and vowels, the extensions make no such distinction, so we 34 | // put them all into a single bucket. 35 | static bool IsMyanmarLetter(char32 ch); 36 | // Returns true if ch is a Myanmar digit or other symbol that does not take 37 | // part in being a syllable. 38 | static bool IsMyanmarOther(char32 ch); 39 | 40 | // Some special unicodes used only for Myanmar processing. 41 | static const char32 kMyanmarAsat = 0x103a; 42 | static const char32 kMyanmarMedialYa = 0x103b; 43 | }; 44 | 45 | } // namespace tesseract 46 | 47 | #endif // TESSERACT_TRAINING_VALIDATE_MYANMAR_H_ 48 | -------------------------------------------------------------------------------- /unittest/unichar_test.cc: -------------------------------------------------------------------------------- 1 | // (C) Copyright 2017, Google Inc. 2 | // Licensed under the Apache License, Version 2.0 (the "License"); 3 | // you may not use this file except in compliance with the License. 4 | // You may obtain a copy of the License at 5 | // http://www.apache.org/licenses/LICENSE-2.0 6 | // Unless required by applicable law or agreed to in writing, software 7 | // distributed under the License is distributed on an "AS IS" BASIS, 8 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
9 | // See the License for the specific language governing permissions and 10 | // limitations under the License. 11 | 12 | #include "include_gunit.h" 13 | #include "gmock/gmock.h" // for testing::ElementsAreArray 14 | #include "unichar.h" 15 | 16 | using tesseract::char32; 17 | using tesseract::UNICHAR; 18 | 19 | namespace { 20 | 21 | TEST(UnicharTest, Conversion) { 22 | // This test verifies that Unichar::UTF8ToUTF32 and Unichar::UTF32ToUTF8 23 | // show the required conversion properties. 24 | // Test for round-trip utf8-32-8 for 1, 2, 3 and 4 byte codes. 25 | const char* kUTF8Src = "a\u05d0\u0ca4\U0002a714"; 26 | const std::vector<char32> kUTF32Src = {'a', 0x5d0, 0xca4, 0x2a714}; 27 | // Check for round-trip conversion. 28 | std::vector<char32> utf32 = UNICHAR::UTF8ToUTF32(kUTF8Src); 29 | EXPECT_THAT(utf32, testing::ElementsAreArray(kUTF32Src)); 30 | std::string utf8 = UNICHAR::UTF32ToUTF8(utf32); 31 | EXPECT_STREQ(kUTF8Src, utf8.c_str()); 32 | } 33 | 34 | TEST(UnicharTest, InvalidText) { 35 | // This test verifies that Unichar correctly deals with invalid text. 36 | const char* kInvalidUTF8 = "a b\200d string"; 37 | const std::vector<char32> kInvalidUTF32 = {'a', ' ', 0x200000, 'x'}; 38 | // Invalid utf8 produces an empty vector. 39 | std::vector<char32> utf32 = UNICHAR::UTF8ToUTF32(kInvalidUTF8); 40 | EXPECT_TRUE(utf32.empty()); 41 | // Invalid utf32 produces an empty string. 42 | std::string utf8 = UNICHAR::UTF32ToUTF8(kInvalidUTF32); 43 | EXPECT_TRUE(utf8.empty()); 44 | } 45 | 46 | } // namespace 47 | -------------------------------------------------------------------------------- /unittest/lstm_recode_test.cc: -------------------------------------------------------------------------------- 1 | // (C) Copyright 2017, Google Inc. 2 | // Licensed under the Apache License, Version 2.0 (the "License"); 3 | // you may not use this file except in compliance with the License. 
4 | // You may obtain a copy of the License at 5 | // http://www.apache.org/licenses/LICENSE-2.0 6 | // Unless required by applicable law or agreed to in writing, software 7 | // distributed under the License is distributed on an "AS IS" BASIS, 8 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 9 | // See the License for the specific language governing permissions and 10 | // limitations under the License. 11 | 12 | #include "lstm_test.h" 13 | 14 | namespace tesseract { 15 | 16 | // Tests that training with unicharset recoding learns faster than without, 17 | // for Korean. This test is split in two, so it can be run sharded. 18 | 19 | TEST_F(LSTMTrainerTest, RecodeTestKorBase) { 20 | // A basic single-layer, bi-di 1d LSTM on Korean. 21 | SetupTrainer("[1,1,0,32 Lbx96 O1c1]", "kor-full", "kor/kor.unicharset", 22 | "kor.Arial_Unicode_MS.exp0.lstmf", false, true, 5e-4, false, "kor"); 23 | double kor_full_err = TrainIterations(kTrainerIterations * 2); 24 | EXPECT_LT(kor_full_err, 88); 25 | // EXPECT_GT(kor_full_err, 85); 26 | LOG(INFO) << "********** Expected < 88 ************\n" ; 27 | } 28 | 29 | TEST_F(LSTMTrainerTest, RecodeTestKor) { 30 | // A basic single-layer, bi-di 1d LSTM on Korean. 31 | SetupTrainer("[1,1,0,32 Lbx96 O1c1]", "kor-recode", "kor/kor.unicharset", 32 | "kor.Arial_Unicode_MS.exp0.lstmf", true, true, 5e-4, false, "kor"); 33 | double kor_recode_err = TrainIterations(kTrainerIterations); 34 | EXPECT_LT(kor_recode_err, 60); 35 | LOG(INFO) << "********** Expected < 60 ************\n" ; 36 | } 37 | 38 | // Tests that the given string encodes and decodes back to the same 39 | // with both recode on and off for Korean. 40 | 41 | TEST_F(LSTMTrainerTest, EncodeDecodeBothTestKor) { 42 | TestEncodeDecodeBoth("kor", "한국어 위키백과에 오신 것을 환영합니다!"); 43 | } 44 | 45 | 46 | } // namespace tesseract. 
47 | -------------------------------------------------------------------------------- /unittest/stats_test.cc: -------------------------------------------------------------------------------- 1 | // (C) Copyright 2017, Google Inc. 2 | // Licensed under the Apache License, Version 2.0 (the "License"); 3 | // you may not use this file except in compliance with the License. 4 | // You may obtain a copy of the License at 5 | // http://www.apache.org/licenses/LICENSE-2.0 6 | // Unless required by applicable law or agreed to in writing, software 7 | // distributed under the License is distributed on an "AS IS" BASIS, 8 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 9 | // See the License for the specific language governing permissions and 10 | // limitations under the License. 11 | 12 | #include "genericvector.h" 13 | #include "kdpair.h" 14 | #include "statistc.h" 15 | 16 | #include "include_gunit.h" 17 | 18 | namespace { 19 | 20 | const int kTestData[] = {2, 0, 12, 1, 1, 2, 10, 1, 0, 0, 0, 2, 0, 4, 1, 1}; 21 | 22 | class STATSTest : public testing::Test { 23 | public: 24 | void SetUp() { 25 | stats_.set_range(0, 16); 26 | for (size_t i = 0; i < ARRAYSIZE(kTestData); ++i) 27 | stats_.add(i, kTestData[i]); 28 | } 29 | 30 | void TearDown() {} 31 | 32 | STATS stats_; 33 | }; 34 | 35 | // Tests some basic numbers from the stats_. 36 | TEST_F(STATSTest, BasicStats) { 37 | EXPECT_EQ(37, stats_.get_total()); 38 | EXPECT_EQ(2, stats_.mode()); 39 | EXPECT_EQ(12, stats_.pile_count(2)); 40 | } 41 | 42 | // Tests the top_n_modes function. 43 | TEST_F(STATSTest, TopNModes) { 44 | GenericVector > modes; 45 | int num_modes = stats_.top_n_modes(3, &modes); 46 | EXPECT_EQ(3, num_modes); 47 | // Mode0 is 12 1 1 = 14 total count with a mean of 2 3/14. 48 | EXPECT_FLOAT_EQ(2.0f + 3.0f / 14, modes[0].key); 49 | EXPECT_EQ(14, modes[0].data); 50 | // Mode 1 is 2 10 1 = 13 total count with a mean of 5 12/13. 
51 | EXPECT_FLOAT_EQ(5.0f + 12.0f / 13, modes[1].key); 52 | EXPECT_EQ(13, modes[1].data); 53 | // Mode 2 is 4 1 1 = 6 total count with a mean of 13.5. 54 | EXPECT_FLOAT_EQ(13.5f, modes[2].key); 55 | EXPECT_EQ(6, modes[2].data); 56 | } 57 | 58 | } // namespace. 59 | -------------------------------------------------------------------------------- /cmake/templates/TesseractConfig.cmake.in: -------------------------------------------------------------------------------- 1 | # =================================================================================== 2 | # The Tesseract CMake configuration file 3 | # 4 | # ** File generated automatically, do not modify ** 5 | # 6 | # Usage from an external project: 7 | # In your CMakeLists.txt, add these lines: 8 | # 9 | # find_package(Tesseract REQUIRED) 10 | # include_directories(${Tesseract_INCLUDE_DIRS}) 11 | # target_link_libraries(MY_TARGET_NAME ${Tesseract_LIBRARIES}) 12 | # 13 | # This file will define the following variables: 14 | # - Tesseract_LIBRARIES : The list of all imported targets for Tesseract. 15 | # - Tesseract_INCLUDE_DIRS : The Tesseract include directories. 
16 | # - Tesseract_VERSION : The version of this Tesseract build: "@VERSION_PLAIN@" 17 | # - Tesseract_VERSION_MAJOR : Major version part of Tesseract_VERSION: "@VERSION_MAJOR@" 18 | # - Tesseract_VERSION_MINOR : Minor version part of Tesseract_VERSION: "@VERSION_MINOR@" 19 | # 20 | # =================================================================================== 21 | 22 | include(${CMAKE_CURRENT_LIST_DIR}/TesseractTargets.cmake) 23 | 24 | find_package(Leptonica REQUIRED) 25 | 26 | # ====================================================== 27 | # Version variables: 28 | # ====================================================== 29 | 30 | SET(Tesseract_VERSION @VERSION_PLAIN@) 31 | SET(Tesseract_VERSION_MAJOR @VERSION_MAJOR@) 32 | SET(Tesseract_VERSION_MINOR @VERSION_MINOR@) 33 | 34 | # ====================================================== 35 | # Include directories to add to the user project: 36 | # ====================================================== 37 | 38 | # Provide the include directories to the caller 39 | set(Tesseract_INCLUDE_DIRS @INCLUDE_DIR@) 40 | 41 | # ==================================================================== 42 | # Link libraries: 43 | # ==================================================================== 44 | 45 | set(Tesseract_LIBRARIES libtesseract) 46 | -------------------------------------------------------------------------------- /src/ccutil/lsterr.h: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | * File: lsterr.h (Formerly listerr.h) 3 | * Description: Errors shared by list modules 4 | * Author: Phil Cheatle 5 | * Created: Wed Jan 23 09:10:35 GMT 1991 6 | * 7 | * (C) Copyright 1990, Hewlett-Packard Ltd. 8 | ** Licensed under the Apache License, Version 2.0 (the "License"); 9 | ** you may not use this file except in compliance with the License. 
10 | ** You may obtain a copy of the License at 11 | ** http://www.apache.org/licenses/LICENSE-2.0 12 | ** Unless required by applicable law or agreed to in writing, software 13 | ** distributed under the License is distributed on an "AS IS" BASIS, 14 | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | ** See the License for the specific language governing permissions and 16 | ** limitations under the License. 17 | * 18 | **********************************************************************/ 19 | 20 | #ifndef TESSERACT_CCUTIL_LSTERR_H_ 21 | #define TESSERACT_CCUTIL_LSTERR_H_ 22 | 23 | #include "errcode.h" //must be last include 24 | 25 | const ERRCODE DONT_CONSTRUCT_LIST_BY_COPY = 26 | "Can't create a list by assignment"; 27 | const ERRCODE DONT_ASSIGN_LISTS = "Can't assign to lists"; 28 | const ERRCODE SERIALISE_LINKS = "Attempted to (de)serialise a link element"; 29 | 30 | #ifndef NDEBUG 31 | 32 | const ERRCODE NO_LIST = "Iterator not set to a list"; 33 | const ERRCODE NULL_OBJECT = "List found this = nullptr!"; 34 | const ERRCODE NULL_DATA = "List would have returned a nullptr data pointer"; 35 | const ERRCODE NULL_CURRENT = "List current position is nullptr"; 36 | const ERRCODE NULL_NEXT = "Next element on the list is nullptr"; 37 | const ERRCODE NULL_PREV = "Previous element on the list is nullptr"; 38 | const ERRCODE EMPTY_LIST = "List is empty"; 39 | const ERRCODE BAD_PARAMETER = "List parameter error"; 40 | const ERRCODE STILL_LINKED = 41 | "Attempting to add an element with non nullptr links, to a list"; 42 | 43 | #endif // !NDEBUG 44 | #endif // TESSERACT_CCUTIL_LSTERR_H_ 45 | -------------------------------------------------------------------------------- /doc/unicharset_extractor.1.asc: -------------------------------------------------------------------------------- 1 | UNICHARSET_EXTRACTOR(1) 2 | ======================= 3 | 4 | NAME 5 | ---- 6 | unicharset_extractor - extract unicharset from Tesseract boxfiles 7 | 8 | 
SYNOPSIS 9 | -------- 10 | *unicharset_extractor* '[-D dir]' 'FILE'... 11 | 12 | DESCRIPTION 13 | ----------- 14 | Tesseract needs to know the set of possible characters it can output. 15 | To generate the unicharset data file, use the unicharset_extractor 16 | program on the same training pages bounding box files as used for 17 | clustering: 18 | 19 | unicharset_extractor fontfile_1.box fontfile_2.box ... 20 | 21 | The unicharset will be put into the file 'dir/unicharset', or simply 22 | './unicharset' if no output directory is provided. 23 | 24 | Tesseract also needs to have access to character properties isalpha, 25 | isdigit, isupper, islower, ispunctuation. All of this auxiliary data 26 | and more is encoded in this file. (See unicharset(5)) 27 | 28 | If your system supports the wctype functions, these values will be set 29 | automatically by unicharset_extractor and there is no need to edit the 30 | unicharset file. On some older systems (e.g. Windows 95), the unicharset 31 | file must be edited by hand to add these property description codes. 32 | 33 | *NOTE* The unicharset file must be regenerated whenever inttemp, normproto 34 | and pffmtable are generated (i.e. they must all be recreated when the box 35 | file is changed) as they have to be in sync. This is made easier than in 36 | previous versions by running unicharset_extractor before mftraining and 37 | cntraining, and giving the unicharset to mftraining. 38 | 39 | SEE ALSO 40 | -------- 41 | tesseract(1), unicharset(5) 42 | 43 | 44 | 45 | HISTORY 46 | ------- 47 | unicharset_extractor first appeared in Tesseract 2.00. 48 | 49 | COPYING 50 | ------- 51 | Copyright \(C) 2006, Google Inc. 52 | Licensed under the Apache License, Version 2.0 53 | 54 | AUTHOR 55 | ------ 56 | The Tesseract OCR engine was written by Ray Smith and his research groups 57 | at Hewlett Packard (1985-1995) and Google (2006-present). 
58 | -------------------------------------------------------------------------------- /src/classify/mfx.h: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | ** Filename: mfx.h 3 | ** Purpose: Definition of micro-feature extraction routines 4 | ** Author: Dan Johnson 5 | ** 6 | ** (c) Copyright Hewlett-Packard Company, 1988. 7 | ** Licensed under the Apache License, Version 2.0 (the "License"); 8 | ** you may not use this file except in compliance with the License. 9 | ** You may obtain a copy of the License at 10 | ** http://www.apache.org/licenses/LICENSE-2.0 11 | ** Unless required by applicable law or agreed to in writing, software 12 | ** distributed under the License is distributed on an "AS IS" BASIS, 13 | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | ** See the License for the specific language governing permissions and 15 | ** limitations under the License. 
16 | ******************************************************************************/ 17 | 18 | #ifndef MFX_H 19 | #define MFX_H 20 | 21 | /*---------------------------------------------------------------------------- 22 | Include Files and Type Defines 23 | ----------------------------------------------------------------------------**/ 24 | #include "mfdefs.h" 25 | #include "params.h" 26 | 27 | class DENORM; 28 | struct TBLOB; 29 | 30 | /*---------------------------------------------------------------------------- 31 | Variables 32 | ----------------------------------------------------------------------------**/ 33 | 34 | /* old numbers corresponded to 10.0 degrees and 80.0 degrees */ 35 | extern double_VAR_H(classify_min_slope, 0.414213562, 36 | "Slope below which lines are called horizontal"); 37 | extern double_VAR_H(classify_max_slope, 2.414213562, 38 | "Slope above which lines are called vertical"); 39 | 40 | /*---------------------------------------------------------------------------- 41 | Public Function Prototypes 42 | ----------------------------------------------------------------------------**/ 43 | MICROFEATURES BlobMicroFeatures(TBLOB* Blob, const DENORM& cn_denorm); 44 | 45 | #endif 46 | -------------------------------------------------------------------------------- /src/ccmain/Makefile.am: -------------------------------------------------------------------------------- 1 | AM_CPPFLAGS += \ 2 | -I$(top_srcdir)/src/ccutil \ 3 | -I$(top_srcdir)/src/ccstruct \ 4 | -I$(top_srcdir)/src/arch \ 5 | -I$(top_srcdir)/src/lstm \ 6 | -I$(top_srcdir)/src/viewer \ 7 | -I$(top_srcdir)/src/classify \ 8 | -I$(top_srcdir)/src/dict \ 9 | -I$(top_srcdir)/src/wordrec \ 10 | -I$(top_srcdir)/src/cutil \ 11 | -I$(top_srcdir)/src/textord \ 12 | -I$(top_srcdir)/src/opencl 13 | 14 | AM_CPPFLAGS += $(OPENCL_CPPFLAGS) 15 | AM_CPPFLAGS += $(OPENMP_CXXFLAGS) 16 | 17 | if DISABLED_LEGACY_ENGINE 18 | AM_CPPFLAGS += -DDISABLED_LEGACY_ENGINE 19 | endif 20 | 21 | if VISIBILITY 22 | 
AM_CPPFLAGS += -DTESS_EXPORTS \ 23 | -fvisibility=hidden -fvisibility-inlines-hidden 24 | endif 25 | 26 | pkginclude_HEADERS = \ 27 | thresholder.h \ 28 | osdetect.h \ 29 | ltrresultiterator.h \ 30 | pageiterator.h \ 31 | resultiterator.h 32 | 33 | noinst_HEADERS = \ 34 | control.h \ 35 | equationdetect.h \ 36 | mutableiterator.h \ 37 | output.h \ 38 | paragraphs.h \ 39 | paragraphs_internal.h \ 40 | paramsd.h \ 41 | pgedit.h \ 42 | tesseractclass.h \ 43 | tessvars.h \ 44 | werdit.h 45 | 46 | if !DISABLED_LEGACY_ENGINE 47 | noinst_HEADERS += \ 48 | docqual.h \ 49 | fixspace.h \ 50 | reject.h 51 | endif 52 | 53 | noinst_LTLIBRARIES = libtesseract_main.la 54 | 55 | libtesseract_main_la_SOURCES = \ 56 | applybox.cpp \ 57 | control.cpp \ 58 | linerec.cpp \ 59 | ltrresultiterator.cpp \ 60 | mutableiterator.cpp \ 61 | output.cpp \ 62 | pageiterator.cpp \ 63 | pagesegmain.cpp \ 64 | pagewalk.cpp \ 65 | paragraphs.cpp \ 66 | paramsd.cpp \ 67 | pgedit.cpp \ 68 | recogtraining.cpp \ 69 | reject.cpp \ 70 | resultiterator.cpp \ 71 | tessedit.cpp \ 72 | tesseractclass.cpp \ 73 | tessvars.cpp \ 74 | thresholder.cpp \ 75 | werdit.cpp 76 | 77 | if !DISABLED_LEGACY_ENGINE 78 | libtesseract_main_la_SOURCES += \ 79 | adaptions.cpp \ 80 | docqual.cpp \ 81 | equationdetect.cpp \ 82 | fixspace.cpp \ 83 | fixxht.cpp \ 84 | osdetect.cpp \ 85 | par_control.cpp \ 86 | superscript.cpp \ 87 | tessbox.cpp \ 88 | tfacepp.cpp 89 | endif 90 | -------------------------------------------------------------------------------- /src/ccutil/basedir.cpp: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | * File: basedir.cpp (Formerly getpath.c) 3 | * Description: Find the directory location of the current executable using 4 | *PATH. Author: Ray Smith Created: Mon Jul 09 09:06:39 BST 1990 5 | * 6 | * (C) Copyright 1990, Hewlett-Packard Ltd. 
7 | ** Licensed under the Apache License, Version 2.0 (the "License"); 8 | ** you may not use this file except in compliance with the License. 9 | ** You may obtain a copy of the License at 10 | ** http://www.apache.org/licenses/LICENSE-2.0 11 | ** Unless required by applicable law or agreed to in writing, software 12 | ** distributed under the License is distributed on an "AS IS" BASIS, 13 | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | ** See the License for the specific language governing permissions and 15 | ** limitations under the License. 16 | * 17 | **********************************************************************/ 18 | 19 | #include "basedir.h" 20 | 21 | #include 22 | 23 | // Assuming that code_path is the name of some file in a desired directory, 24 | // returns the given code_path stripped back to the last slash, leaving 25 | // the last slash in place. If there is no slash, returns ./ assuming that 26 | // the input was the name of something in the current directory. 27 | // Useful for getting to the directory of argv[0], but does not search 28 | // any paths. 
29 | TESS_API void truncate_path(const char *code_path, STRING* trunc_path) { 30 | int trunc_index = -1; 31 | if (code_path != nullptr) { 32 | const char* last_slash = strrchr(code_path, '/'); 33 | if (last_slash != nullptr && last_slash + 1 - code_path > trunc_index) 34 | trunc_index = last_slash + 1 - code_path; 35 | last_slash = strrchr(code_path, '\\'); 36 | if (last_slash != nullptr && last_slash + 1 - code_path > trunc_index) 37 | trunc_index = last_slash + 1 - code_path; 38 | } 39 | *trunc_path = code_path; 40 | if (trunc_index >= 0) 41 | trunc_path->truncate_at(trunc_index); 42 | else 43 | *trunc_path = "./"; 44 | } 45 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | # Windows 3 | *.user 4 | *.log 5 | *.tlog 6 | *.cache 7 | *.obj 8 | *.sdf 9 | *.opensdf 10 | *.lastbuildstate 11 | *.unsuccessfulbuild 12 | *.suo 13 | *.res 14 | *.ipch 15 | *.manifest 16 | 17 | # Linux 18 | # ignore local configuration 19 | config.* 20 | config/* 21 | Makefile 22 | Makefile.in 23 | *.m4 24 | 25 | # ignore help scripts/files 26 | configure 27 | libtool 28 | stamp-h1 29 | tesseract.pc 30 | config_auto.h 31 | /doc/html/* 32 | /doc/*.1 33 | /doc/*.5 34 | /doc/*.html 35 | /doc/*.xml 36 | 37 | # generated version file 38 | /src/api/tess_version.h 39 | 40 | # executables 41 | /src/api/tesseract 42 | /src/training/ambiguous_words 43 | /src/training/classifier_tester 44 | /src/training/cntraining 45 | /src/training/combine_tessdata 46 | /src/training/dawg2wordlist 47 | /src/training/merge_unicharsets 48 | /src/training/mftraining 49 | /src/training/set_unicharset_properties 50 | /src/training/shapeclustering 51 | /src/training/text2image 52 | /src/training/unicharset_extractor 53 | /src/training/wordlist2dawg 54 | 55 | *.patch 56 | 57 | # files generated by libtool 58 | /src/training/combine_lang_model 59 | /src/training/lstmeval 60 | 
/src/training/lstmtraining 61 | 62 | # ignore compilation files 63 | build/* 64 | /bin 65 | */.deps/* 66 | */.libs/* 67 | */*/.deps/* 68 | */*/.libs/* 69 | *.lo 70 | *.la 71 | *.o 72 | *.Plo 73 | *.a 74 | *.class 75 | *.jar 76 | __pycache__ 77 | 78 | # tessdata 79 | *.traineddata 80 | 81 | # OpenCL 82 | tesseract_opencl_profile_devices.dat 83 | kernel*.bin 84 | 85 | # build dirs 86 | /build* 87 | /.cppan 88 | /cppan 89 | /*.dll 90 | /*.lib 91 | /*.exe 92 | /*.lnk 93 | /win* 94 | .vs* 95 | .s* 96 | 97 | # files generated by "make check" 98 | /tests/.dirstamp 99 | /unittest/*.trs 100 | /unittest/tmp/* 101 | 102 | # test programs 103 | /unittest/*_test 104 | /unittest/primesbitvector 105 | /unittest/primesmap 106 | 107 | # generated files from unlvtests 108 | times.txt 109 | /unlvtests/results* 110 | 111 | # snap packaging specific rules 112 | /parts/ 113 | /stage/ 114 | /prime/ 115 | /snap/.snapcraft/ 116 | 117 | /*.snap 118 | /*_source.tar.bz2 119 | -------------------------------------------------------------------------------- /src/classify/clusttool.h: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | ** Filename: clusttool.h 3 | ** Purpose: Definition of clustering utility tools 4 | ** Author: Dan Johnson 5 | ** 6 | ** (c) Copyright Hewlett-Packard Company, 1988. 7 | ** Licensed under the Apache License, Version 2.0 (the "License"); 8 | ** you may not use this file except in compliance with the License. 9 | ** You may obtain a copy of the License at 10 | ** http://www.apache.org/licenses/LICENSE-2.0 11 | ** Unless required by applicable law or agreed to in writing, software 12 | ** distributed under the License is distributed on an "AS IS" BASIS, 13 | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | ** See the License for the specific language governing permissions and 15 | ** limitations under the License. 
16 | ******************************************************************************/ 17 | 18 | #ifndef TESSERACT_CLASSIFY_CLUSTTOOL_H_ 19 | #define TESSERACT_CLASSIFY_CLUSTTOOL_H_ 20 | 21 | //--------------------------Include Files--------------------------------------- 22 | #include 23 | #include "cluster.h" 24 | #include "serialis.h" 25 | 26 | /*------------------------------------------------------------------------- 27 | Public Function Prototype 28 | --------------------------------------------------------------------------*/ 29 | uint16_t ReadSampleSize(tesseract::TFile *fp); 30 | 31 | PARAM_DESC *ReadParamDesc(tesseract::TFile *fp, uint16_t N); 32 | 33 | PROTOTYPE *ReadPrototype(tesseract::TFile *fp, uint16_t N); 34 | 35 | float *ReadNFloats(tesseract::TFile *fp, uint16_t N, float Buffer[]); 36 | 37 | void WriteParamDesc(FILE *File, uint16_t N, const PARAM_DESC ParamDesc[]); 38 | 39 | void WritePrototype(FILE *File, uint16_t N, PROTOTYPE *Proto); 40 | 41 | void WriteNFloats (FILE * File, uint16_t N, float Array[]); 42 | 43 | void WriteProtoStyle(FILE *File, PROTOSTYLE ProtoStyle); 44 | 45 | void WriteProtoList(FILE* File, uint16_t N, PARAM_DESC* ParamDesc, 46 | LIST ProtoList, bool WriteSigProtos, 47 | bool WriteInsigProtos); 48 | 49 | #endif // TESSERACT_CLASSIFY_CLUSTTOOL_H_ 50 | -------------------------------------------------------------------------------- /java/com/google/scrollview/ui/SVAbstractMenuItem.java: -------------------------------------------------------------------------------- 1 | // Copyright 2007 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); You may not 4 | // use this file except in compliance with the License. 
You may obtain a copy of 5 | // the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by 6 | // applicable law or agreed to in writing, software distributed under the 7 | // License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 8 | // OF ANY KIND, either express or implied. See the License for the specific 9 | // language governing permissions and limitations under the License. 10 | 11 | package com.google.scrollview.ui; 12 | 13 | /** 14 | * A MenuListItem is any sort of menu entry. This can either be within a popup 15 | * menu or within a menubar. It can either be a submenu (only name and 16 | * command-id) or a name with an associated value and possibly description. They 17 | * can also have new entries added (if they are submenus). 18 | * 19 | * @author wanke@google.com 20 | */ 21 | 22 | import com.google.scrollview.events.SVEventType; 23 | 24 | import javax.swing.JMenu; 25 | import javax.swing.JMenuItem; 26 | 27 | abstract class SVAbstractMenuItem { 28 | JMenuItem mi; 29 | public String name; 30 | public int id; 31 | 32 | /** 33 | * Sets the basic attributes for name, id and the corresponding swing item 34 | */ 35 | SVAbstractMenuItem(int id, String name, JMenuItem jmi) { 36 | this.mi = jmi; 37 | this.name = name; 38 | this.id = id; 39 | } 40 | 41 | /** Returns the actual value of the MenuListItem. */ 42 | public String getValue() { return null; } 43 | 44 | /** Adds a child entry to the submenu. */ 45 | public void add(SVAbstractMenuItem mli) { } 46 | 47 | /** Adds a child menu to the submenu (or root node). */ 48 | public void add(JMenu jli) { } 49 | 50 | /** 51 | * What to do when user clicks on this item. 52 | * @param window The window the event happened. 53 | * @param eventType What kind of event will be associated 54 | * (usually SVET_POPUP or SVET_MENU). 
55 | */ 56 | public void performAction(SVWindow window, SVEventType eventType) {} 57 | } 58 | -------------------------------------------------------------------------------- /java/com/google/scrollview/ui/SVCheckboxMenuItem.java: -------------------------------------------------------------------------------- 1 | // Copyright 2007 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); You may not 4 | // use this file except in compliance with the License. You may obtain a copy of 5 | // the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by 6 | // applicable law or agreed to in writing, software distributed under the 7 | // License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 8 | // OF ANY KIND, either express or implied. See the License for the specific 9 | // language governing permissions and limitations under the License. 10 | 11 | package com.google.scrollview.ui; 12 | 13 | /** 14 | * A MenuListItem is any sort of menu entry. This can either be within a popup 15 | * menu or within a menubar. It can either be a submenu (only name and 16 | * command-id) or a name with an associated value and possibly description. They 17 | * can also have new entries added (if they are submenus). 18 | * 19 | * @author wanke@google.com 20 | */ 21 | 22 | import com.google.scrollview.ScrollView; 23 | import com.google.scrollview.events.SVEvent; 24 | import com.google.scrollview.events.SVEventType; 25 | 26 | import javax.swing.JCheckBoxMenuItem; 27 | 28 | /** 29 | * Constructs a new menulistitem which possesses a flag that can be toggled. 30 | */ 31 | class SVCheckboxMenuItem extends SVAbstractMenuItem { 32 | public boolean bvalue; 33 | 34 | SVCheckboxMenuItem(int id, String name, boolean val) { 35 | super(id, name, new JCheckBoxMenuItem(name, val)); 36 | bvalue = val; 37 | } 38 | 39 | /** What to do when user clicks on this item. 
*/ 40 | @Override 41 | public void performAction(SVWindow window, SVEventType eventType) { 42 | // Checkbox entry - trigger and send event. 43 | if (bvalue) { 44 | bvalue = false; 45 | } else { 46 | bvalue = true; 47 | } 48 | SVEvent svme = new SVEvent(eventType, window, id, getValue()); 49 | ScrollView.addMessage(svme); 50 | } 51 | 52 | /** Returns the actual value of the MenuListItem. */ 53 | @Override 54 | public String getValue() { 55 | return Boolean.toString(bvalue); 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /src/textord/edgloop.h: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | * File: edgloop.h (Formerly edgeloop.h) 3 | * Description: Functions to clean up an outline before approximation. 4 | * Author: Ray Smith 5 | * Created: Tue Mar 26 16:56:25 GMT 1991 6 | * 7 | * (C) Copyright 1991, Hewlett-Packard Ltd. 8 | ** Licensed under the Apache License, Version 2.0 (the "License"); 9 | ** you may not use this file except in compliance with the License. 10 | ** You may obtain a copy of the License at 11 | ** http://www.apache.org/licenses/LICENSE-2.0 12 | ** Unless required by applicable law or agreed to in writing, software 13 | ** distributed under the License is distributed on an "AS IS" BASIS, 14 | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | ** See the License for the specific language governing permissions and 16 | ** limitations under the License. 
17 | * 18 | **********************************************************************/ 19 | 20 | #ifndef EDGLOOP_H 21 | #define EDGLOOP_H 22 | 23 | #include "scrollview.h" 24 | #include "params.h" 25 | #include "pdblock.h" 26 | #include "coutln.h" 27 | #include "crakedge.h" 28 | 29 | #define BUCKETSIZE 16 30 | 31 | 32 | extern INT_VAR_H (edges_children_per_grandchild, 10, 33 | "Importance ratio for chucking outlines"); 34 | extern INT_VAR_H (edges_children_count_limit, 45, 35 | "Max holes allowed in blob"); 36 | extern double_VAR_H (edges_childarea, 0.5, 37 | "Max area fraction of child outline"); 38 | extern double_VAR_H (edges_boxarea, 0.8, 39 | "Min area fraction of grandchild for box"); 40 | void complete_edge(CRACKEDGE *start, //start of loop 41 | C_OUTLINE_IT* outline_it); 42 | ScrollView::Color check_path_legal( //certify outline 43 | CRACKEDGE *start //start of loop 44 | ); 45 | int16_t loop_bounding_box( //get bounding box 46 | CRACKEDGE *&start, //edge loop 47 | ICOORD &botleft, //bounding box 48 | ICOORD &topright); 49 | #endif 50 | -------------------------------------------------------------------------------- /src/ccutil/unicodes.cpp: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | * File: unicodes.cpp 3 | * Description: Unicode related machinery 4 | * Author: David Eger 5 | * 6 | * (C) Copyright 2011, Google, Inc. 7 | ** Licensed under the Apache License, Version 2.0 (the "License"); 8 | ** you may not use this file except in compliance with the License. 9 | ** You may obtain a copy of the License at 10 | ** http://www.apache.org/licenses/LICENSE-2.0 11 | ** Unless required by applicable law or agreed to in writing, software 12 | ** distributed under the License is distributed on an "AS IS" BASIS, 13 | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | ** See the License for the specific language governing permissions and 15 | ** limitations under the License. 16 | * 17 | **********************************************************************/ 18 | 19 | #include "unicodes.h" 20 | 21 | namespace tesseract { 22 | 23 | const char *kUTF8LineSeparator = "\u2028"; // "\xe2\x80\xa8"; 24 | const char *kUTF8ParagraphSeparator = "\u2029"; // "\xe2\x80\xa9"; 25 | const char *kLRM = "\u200E"; // Left-to-Right Mark 26 | const char *kRLM = "\u200F"; // Right-to-Left Mark 27 | const char *kRLE = "\u202A"; // Right-to-Left Embedding. NOTE(review): U+202A is LEFT-TO-RIGHT EMBEDDING; RLE is U+202B - confirm the intended code point 28 | const char *kPDF = "\u202C"; // Pop Directional Formatting 29 | 30 | const char *kHyphenLikeUTF8[] = { 31 | "-", // ASCII hyphen-minus 32 | "\u05BE", // word hyphen in Hebrew (maqaf) 33 | "\u2010", // hyphen 34 | "\u2011", // non-breaking hyphen 35 | "\u2012", // a hyphen the same width as digits 36 | "\u2013", // en dash 37 | "\u2014", // em dash 38 | "\u2015", // horizontal bar 39 | "\u2212", // arithmetic minus sign 40 | "\uFE58", // small em dash 41 | "\uFE63", // small hyphen-minus 42 | "\uFF0D", // fullwidth hyphen-minus 43 | nullptr, // end of our list 44 | }; 45 | 46 | const char *kApostropheLikeUTF8[] = { 47 | "'", // ASCII apostrophe 48 | "`", // ASCII backtick 49 | "\u2018", // opening single quote 50 | "\u2019", // closing single quote 51 | "\u2032", // mathematical prime mark 52 | nullptr, // end of our list. 53 | }; 54 | 55 | } // namespace tesseract 56 | -------------------------------------------------------------------------------- /src/training/tessopt.cpp: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | * File: tessopt.cpp 3 | * Description: Re-implementation of the unix code. 4 | * Author: Ray Smith 5 | * Created: Tue Nov 28 05:52:50 MST 1995 6 | * 7 | * (C) Copyright 1995, Hewlett-Packard Co. 
// Parser state shared with the caller.
int tessoptind;    // index of the next argv entry to examine (0 = not started)
char *tessoptarg;  // argument of the most recently returned option

/**********************************************************************
 * tessopt
 *
 * Minimal getopt-style command line parser. Each call examines
 * argv[tessoptind] and advances tessoptind past what it consumed.
 *
 * argc, argv: the argument vector, as passed to main().
 * arglist:    the accepted option characters; a character followed by
 *             ':' takes an argument.
 *
 * Returns the option character found, '?' for a dud option (unknown
 * character, ':' or a lone "-"; tessoptind is left unchanged in that
 * case), or EOF when the next argument does not start with '-' or no
 * arguments remain.
 *
 * For an option that takes an argument, tessoptarg points either at
 * the text directly following the option letter ("-bvalue") or at the
 * next argv entry ("-b value"); it is nullptr when the command line
 * ends before the argument.
 **********************************************************************/

int tessopt (         //parse args
int32_t argc,         //arg count
char *argv[],         //args
const char *arglist   //string of arg chars
) {
  if (tessoptind == 0)
    tessoptind = 1;   //skip the program name on the first call
  if (tessoptind >= argc || argv[tessoptind][0] != '-')
    return EOF;       //no more options

  const char opt = argv[tessoptind][1];
  // strchr() also matches the terminating '\0' of arglist, so a lone "-"
  // must be rejected up front; otherwise arg[1] below would read one byte
  // past the end of arglist.
  const char *arg = (opt != '\0') ? strchr (arglist, opt) : nullptr;
  if (arg == nullptr || *arg == ':')
    return '?';       //dud option

  tessoptind++;
  // Never read past the argc entries the caller told us about.
  tessoptarg = (tessoptind < argc) ? argv[tessoptind] : nullptr;
  if (arg[1] == ':') {
    if (argv[tessoptind - 1][2] != '\0')
                      //immediately after
      tessoptarg = argv[tessoptind - 1] + 2;
    else
      tessoptind++;   //the argument is the following argv entry
  }
  return *arg;
}
2 | // Licensed under the Apache License, Version 2.0 (the "License"); 3 | // you may not use this file except in compliance with the License. 4 | // You may obtain a copy of the License at 5 | // http://www.apache.org/licenses/LICENSE-2.0 6 | // Unless required by applicable law or agreed to in writing, software 7 | // distributed under the License is distributed on an "AS IS" BASIS, 8 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 9 | // See the License for the specific language governing permissions and 10 | // limitations under the License. 11 | 12 | 13 | #include 14 | #include 15 | 16 | #include "absl/strings/str_split.h" 17 | 18 | #include "fileio.h" 19 | #include "include_gunit.h" 20 | 21 | namespace { 22 | 23 | using tesseract::File; 24 | using tesseract::InputBuffer; 25 | using tesseract::OutputBuffer; 26 | 27 | TEST(FileTest, JoinPath) { 28 | EXPECT_EQ("/abc/def", File::JoinPath("/abc", "def")); 29 | EXPECT_EQ("/abc/def", File::JoinPath("/abc/", "def")); 30 | EXPECT_EQ("def", File::JoinPath("", "def")); 31 | } 32 | 33 | TEST(OutputBufferTest, WriteString) { 34 | const int kMaxBufSize = 128; 35 | char buffer[kMaxBufSize]; 36 | for (int i = 0; i < kMaxBufSize; ++i) buffer[i] = '\0'; 37 | FILE* fp = fmemopen(buffer, kMaxBufSize, "w"); 38 | CHECK(fp != nullptr); 39 | 40 | { 41 | std::unique_ptr output(new OutputBuffer(fp)); 42 | output->WriteString("Hello "); 43 | output->WriteString("world!"); 44 | } 45 | EXPECT_STREQ("Hello world!", buffer); 46 | } 47 | 48 | TEST(InputBufferTest, Read) { 49 | const int kMaxBufSize = 128; 50 | char buffer[kMaxBufSize]; 51 | snprintf(buffer, kMaxBufSize, "Hello\n world!"); 52 | EXPECT_STREQ("Hello\n world!", buffer); 53 | FILE* fp = fmemopen(buffer, kMaxBufSize, "r"); 54 | CHECK(fp != nullptr); 55 | 56 | std::string str; 57 | std::unique_ptr input(new InputBuffer(fp)); 58 | EXPECT_TRUE(input->Read(&str)); 59 | std::vector lines = absl::StrSplit(str, '\n', absl::SkipEmpty()); 60 | EXPECT_EQ(2, 
lines.size()); 61 | EXPECT_EQ("Hello", lines[0]); 62 | EXPECT_EQ(" world!", lines[1]); 63 | } 64 | 65 | } // namespace 66 | -------------------------------------------------------------------------------- /src/arch/dotproductavx.cpp: -------------------------------------------------------------------------------- 1 | /////////////////////////////////////////////////////////////////////// 2 | // File: dotproductavx.cpp 3 | // Description: Architecture-specific dot-product function. 4 | // Author: Ray Smith 5 | // Created: Wed Jul 22 10:48:05 PDT 2015 6 | // 7 | // (C) Copyright 2015, Google Inc. 8 | // Licensed under the Apache License, Version 2.0 (the "License"); 9 | // you may not use this file except in compliance with the License. 10 | // You may obtain a copy of the License at 11 | // http://www.apache.org/licenses/LICENSE-2.0 12 | // Unless required by applicable law or agreed to in writing, software 13 | // distributed under the License is distributed on an "AS IS" BASIS, 14 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | // See the License for the specific language governing permissions and 16 | // limitations under the License. 17 | /////////////////////////////////////////////////////////////////////// 18 | 19 | #if !defined(__AVX__) 20 | #error Implementation only for AVX capable architectures 21 | #endif 22 | 23 | #include 24 | #include 25 | #include "dotproductavx.h" 26 | 27 | namespace tesseract { 28 | 29 | // Computes and returns the dot product of the n-vectors u and v. 30 | // Uses Intel AVX intrinsics to access the SIMD instruction set. 
31 | double DotProductAVX(const double* u, const double* v, int n) { 32 | const unsigned quot = n / 8; 33 | const unsigned rem = n % 8; 34 | __m256d t0 = _mm256_setzero_pd(); 35 | __m256d t1 = _mm256_setzero_pd(); 36 | for (unsigned k = 0; k < quot; k++) { 37 | __m256d f0 = _mm256_loadu_pd(u); 38 | __m256d f1 = _mm256_loadu_pd(v); 39 | f0 = _mm256_mul_pd(f0, f1); 40 | t0 = _mm256_add_pd(t0, f0); 41 | u += 4; 42 | v += 4; 43 | __m256d f2 = _mm256_loadu_pd(u); 44 | __m256d f3 = _mm256_loadu_pd(v); 45 | f2 = _mm256_mul_pd(f2, f3); 46 | t1 = _mm256_add_pd(t1, f2); 47 | u += 4; 48 | v += 4; 49 | } 50 | t0 = _mm256_hadd_pd(t0, t1); 51 | alignas(32) double tmp[4]; 52 | _mm256_store_pd(tmp, t0); 53 | double result = tmp[0] + tmp[1] + tmp[2] + tmp[3]; 54 | for (unsigned k = 0; k < rem; k++) { 55 | result += *u++ * *v++; 56 | } 57 | return result; 58 | } 59 | 60 | } // namespace tesseract. 61 | -------------------------------------------------------------------------------- /src/cutil/structures.h: -------------------------------------------------------------------------------- 1 | /* -*-C-*- 2 | ******************************************************************************** 3 | * 4 | * File: structures.h 5 | * Description: Allocate all the different types of structures. 6 | * Author: Mark Seaman, OCR Technology 7 | * 8 | * (c) Copyright 1990, Hewlett-Packard Company. 9 | ** Licensed under the Apache License, Version 2.0 (the "License"); 10 | ** you may not use this file except in compliance with the License. 11 | ** You may obtain a copy of the License at 12 | ** http://www.apache.org/licenses/LICENSE-2.0 13 | ** Unless required by applicable law or agreed to in writing, software 14 | ** distributed under the License is distributed on an "AS IS" BASIS, 15 | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | ** See the License for the specific language governing permissions and 17 | ** limitations under the License. 
#ifndef STRUCTURES_H
#define STRUCTURES_H

/*----------------------------------------------------------------------
              I n c l u d e s
----------------------------------------------------------------------*/
#include "oldlist.h"

/*----------------------------------------------------------------------
              M a c r o s
----------------------------------------------------------------------*/
/**********************************************************************
 * makestructure
 *
 * Expands to two function definitions for a particular data type:
 *   newfunc()   - allocates one object of `type` with `new` and returns
 *                 a pointer to it.
 *   old(type*)  - releases an object with `delete`.
 * No status printing function is generated by this macro.
 *
 * The expansion produces ordinary (non-inline) definitions, so each
 * newfunc/old pair has to be instantiated in exactly one source file.
 **********************************************************************/

#define makestructure(newfunc, old, type) \
type* newfunc() { return new type; } \
\
void old(type* deadelement) { delete deadelement; }

/*----------------------------------------------------------------------
              F u n c t i o n s
----------------------------------------------------------------------*/
/* Allocation and release of a LIST cell; both are defined outside this
   header. */
extern LIST new_cell();
extern void free_cell(LIST);
#endif