├── VERSION ├── .gitattributes ├── tessdata ├── tessconfigs │ ├── nobatch │ ├── batch.nochop │ ├── batch │ ├── Makefile.am │ ├── matdemo │ ├── segdemo │ └── msdemo ├── configs │ ├── alto │ ├── pdf │ ├── quiet │ ├── tsv │ ├── logfile │ ├── api_config │ ├── get.images │ ├── lstmbox │ ├── makebox │ ├── wordstrbox │ ├── digits │ ├── hocr │ ├── unlv │ ├── inter │ ├── rebox │ ├── linebox │ ├── page │ ├── kannada │ ├── lstmdebug │ ├── bazaar │ ├── bigram │ ├── txt │ ├── ambigs.train │ ├── lstm.train │ ├── box.train │ ├── box.train.stderr │ ├── Makefile.am │ └── strokewidth ├── eng.user-patterns ├── eng.user-words ├── pdf.ttf └── Makefile.am ├── java ├── com │ ├── Makefile.am │ └── google │ │ ├── Makefile.am │ │ └── scrollview │ │ ├── Makefile.am │ │ ├── events │ │ ├── Makefile.am │ │ └── SVEventType.java │ │ └── ui │ │ ├── Makefile.am │ │ ├── SVSubMenuItem.java │ │ ├── SVEmptyMenuItem.java │ │ ├── SVAbstractMenuItem.java │ │ ├── SVCheckboxMenuItem.java │ │ └── SVMenuItem.java └── Manifest.txt ├── docker-compose.yml ├── src ├── training │ ├── common │ │ └── export.h │ ├── pango │ │ ├── export.h │ │ ├── tlog.cpp │ │ └── tlog.h │ ├── unicharset │ │ ├── export.h │ │ ├── validate_khmer.h │ │ ├── icuerrorcode.cpp │ │ ├── validate_grapheme.h │ │ ├── validate_indic.h │ │ └── validate_myanmar.h │ ├── set_unicharset_properties.cpp │ └── merge_unicharsets.cpp ├── lstm │ ├── generate_lut.py │ └── maxpool.h ├── ccmain │ ├── mutableiterator.cpp │ ├── tessvars.cpp │ ├── tessvars.h │ ├── werdit.h │ ├── docqual.h │ ├── control.h │ ├── output.h │ ├── fixspace.h │ ├── reject.h │ ├── pagewalk.cpp │ └── pgedit.h ├── ccutil │ ├── fileerr.h │ ├── scanutils.h │ ├── tesstypes.h │ ├── tprintf.h │ ├── host.h │ └── lsterr.h ├── classify │ ├── mf.h │ ├── normmatch.h │ ├── float2int.h │ ├── mfdefs.h │ ├── normfeat.h │ ├── clusttool.h │ ├── outfeat.h │ ├── mfx.h │ ├── fpoint.h │ └── fpoint.cpp ├── ccstruct │ ├── polyaprx.h │ ├── ccstruct.cpp │ ├── crakedge.h │ ├── blread.h │ ├── 
params_training_featdef.cpp │ ├── ccstruct.h │ ├── image.cpp │ ├── debugpixa.h │ ├── image.h │ ├── quadratc.h │ └── quadlsq.h ├── wordrec │ ├── chop.h │ ├── drawfx.h │ ├── findseam.h │ ├── plotedges.h │ └── render.h ├── arch │ ├── dotproduct.cpp │ ├── dotproduct.h │ └── dotproductneon.cpp ├── textord │ ├── scanedg.h │ ├── tordmain.h │ ├── gap_map.h │ ├── edgloop.h │ ├── blobgrid.h │ ├── blobgrid.cpp │ ├── sortflts.h │ ├── underlin.h │ └── equationdetectbase.h └── dict │ ├── stopper.h │ ├── dawg_cache.h │ └── matchdefs.h ├── .github ├── ISSUE_TEMPLATE │ ├── config.yml │ └── issue-feature-request.yml ├── workflows │ ├── msys2-4.1.1.yml │ └── cifuzz.yml └── ISSUE_TEMPLATE.md ├── .gitmodules ├── unittest ├── tesseract_leaksanitizer.supp ├── third_party │ └── utf │ │ └── utfdef.h ├── capiexample_c_test.c ├── capiexample_test.cc ├── cleanapi_test.cc ├── lstm_squashed_test.cc ├── cycletimer.h ├── unichar_test.cc ├── lstm_recode_test.cc ├── syntaxnet │ └── base.h ├── fileio_test.cc ├── validate_khmer_test.cc ├── util │ └── utf8 │ │ └── unilib.cc ├── log.h ├── stats_test.cc ├── colpartition_test.cc └── fuzzers │ └── oss-fuzz-build.sh ├── tesseract.pc.cmake ├── tesseract.pc.in ├── .clang-format ├── doc ├── cntraining.1.asc ├── ambiguous_words.1.asc ├── tesseract.natvis ├── dawg2wordlist.1.asc ├── generate_manpages.sh ├── merge_unicharsets.1.asc ├── unicharset_extractor.1.asc ├── set_unicharset_properties.1.asc ├── wordlist2dawg.1.asc ├── classifier_tester.1.asc ├── mftraining.1.asc ├── shapeclustering.1.asc └── lstmeval.1.asc ├── AUTHORS ├── cmake ├── BuildFunctions.cmake ├── templates │ ├── cmake_uninstall.cmake.in │ └── TesseractConfig.cmake.in └── SourceGroups.cmake ├── include └── tesseract │ ├── version.h.in │ └── export.h ├── m4 ├── ax_split_version.m4 └── ax_check_compile_flag.m4 ├── snap └── snapcraft.yaml ├── .mailmap ├── appveyor.yml └── .gitignore /VERSION: -------------------------------------------------------------------------------- 1 | 5.4.0-rc2 2 | 
-------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | * text=auto 2 | -------------------------------------------------------------------------------- /tessdata/tessconfigs/nobatch: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /java/com/Makefile.am: -------------------------------------------------------------------------------- 1 | SUBDIRS = google 2 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | tesseract: 2 | build: . 3 | -------------------------------------------------------------------------------- /tessdata/configs/alto: -------------------------------------------------------------------------------- 1 | tessedit_create_alto 1 2 | -------------------------------------------------------------------------------- /tessdata/configs/pdf: -------------------------------------------------------------------------------- 1 | tessedit_create_pdf 1 2 | -------------------------------------------------------------------------------- /tessdata/configs/quiet: -------------------------------------------------------------------------------- 1 | debug_file /dev/null 2 | -------------------------------------------------------------------------------- /tessdata/configs/tsv: -------------------------------------------------------------------------------- 1 | tessedit_create_tsv 1 2 | -------------------------------------------------------------------------------- /java/com/google/Makefile.am: -------------------------------------------------------------------------------- 1 | SUBDIRS = scrollview 2 | -------------------------------------------------------------------------------- 
/tessdata/configs/logfile: -------------------------------------------------------------------------------- 1 | debug_file tesseract.log 2 | -------------------------------------------------------------------------------- /tessdata/configs/api_config: -------------------------------------------------------------------------------- 1 | tessedit_zero_rejection T 2 | -------------------------------------------------------------------------------- /tessdata/configs/get.images: -------------------------------------------------------------------------------- 1 | tessedit_write_images T 2 | -------------------------------------------------------------------------------- /tessdata/configs/lstmbox: -------------------------------------------------------------------------------- 1 | tessedit_create_lstmbox 1 2 | -------------------------------------------------------------------------------- /tessdata/configs/makebox: -------------------------------------------------------------------------------- 1 | tessedit_create_boxfile 1 2 | -------------------------------------------------------------------------------- /tessdata/configs/wordstrbox: -------------------------------------------------------------------------------- 1 | tessedit_create_wordstrbox 1 2 | -------------------------------------------------------------------------------- /tessdata/configs/digits: -------------------------------------------------------------------------------- 1 | tessedit_char_whitelist 0123456789-. 
2 | -------------------------------------------------------------------------------- /tessdata/eng.user-patterns: -------------------------------------------------------------------------------- 1 | 1-\d\d\d-GOOG-411 2 | www.\n\\\*.com 3 | -------------------------------------------------------------------------------- /tessdata/configs/hocr: -------------------------------------------------------------------------------- 1 | tessedit_create_hocr 1 2 | hocr_font_info 0 3 | -------------------------------------------------------------------------------- /tessdata/eng.user-words: -------------------------------------------------------------------------------- 1 | the 2 | quick 3 | brown 4 | fox 5 | jumped 6 | -------------------------------------------------------------------------------- /tessdata/configs/unlv: -------------------------------------------------------------------------------- 1 | tessedit_write_unlv 1 2 | unlv_tilde_crunching T 3 | -------------------------------------------------------------------------------- /tessdata/tessconfigs/batch.nochop: -------------------------------------------------------------------------------- 1 | chop_enable 0 2 | wordrec_enable_assoc 0 3 | -------------------------------------------------------------------------------- /tessdata/tessconfigs/batch: -------------------------------------------------------------------------------- 1 | # No content needed as all defaults are correct. 
2 | -------------------------------------------------------------------------------- /tessdata/pdf.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Joykom/tesseract/HEAD/tessdata/pdf.ttf -------------------------------------------------------------------------------- /tessdata/configs/inter: -------------------------------------------------------------------------------- 1 | interactive_display_mode T 2 | tessedit_display_outwords T 3 | -------------------------------------------------------------------------------- /tessdata/configs/rebox: -------------------------------------------------------------------------------- 1 | tessedit_resegment_from_boxes 1 2 | tessedit_make_boxes_from_boxes 1 3 | -------------------------------------------------------------------------------- /tessdata/configs/linebox: -------------------------------------------------------------------------------- 1 | tessedit_resegment_from_line_boxes 1 2 | tessedit_make_boxes_from_boxes 1 3 | -------------------------------------------------------------------------------- /tessdata/configs/page: -------------------------------------------------------------------------------- 1 | tessedit_create_page_xml 1 2 | # page_xml_polygon 1 3 | # page_xml_level 0 4 | -------------------------------------------------------------------------------- /java/com/google/scrollview/Makefile.am: -------------------------------------------------------------------------------- 1 | SUBDIRS = events ui 2 | 3 | EXTRA_DIST = \ 4 | ScrollView.java 5 | -------------------------------------------------------------------------------- /src/training/common/export.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #ifdef CMAKE_BUILD 4 | # include 5 | #endif 6 | -------------------------------------------------------------------------------- /src/training/pango/export.h: 
-------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #ifdef CMAKE_BUILD 4 | # include 5 | #endif 6 | -------------------------------------------------------------------------------- /src/training/unicharset/export.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #ifdef CMAKE_BUILD 4 | # include 5 | #endif 6 | -------------------------------------------------------------------------------- /tessdata/configs/kannada: -------------------------------------------------------------------------------- 1 | textord_skewsmooth_offset 8 2 | textord_skewsmooth_offset2 8 3 | textord_merge_desc 0.5 4 | textord_no_rejects 1 5 | -------------------------------------------------------------------------------- /tessdata/configs/lstmdebug: -------------------------------------------------------------------------------- 1 | stopper_debug_level 1 2 | classify_debug_level 1 3 | segsearch_debug_level 1 4 | language_model_debug_level 3 5 | -------------------------------------------------------------------------------- /java/com/google/scrollview/events/Makefile.am: -------------------------------------------------------------------------------- 1 | SUBDIRS = 2 | 3 | EXTRA_DIST = \ 4 | SVEvent.java SVEventHandler.java \ 5 | SVEventType.java 6 | -------------------------------------------------------------------------------- /tessdata/configs/bazaar: -------------------------------------------------------------------------------- 1 | load_system_dawg F 2 | load_freq_dawg F 3 | user_words_suffix user-words 4 | user_patterns_suffix user-patterns 5 | -------------------------------------------------------------------------------- /java/Manifest.txt: -------------------------------------------------------------------------------- 1 | Main-Class: com/google/scrollview/ScrollView 2 | Class-Path: ScrollView.jar piccolo2d-core-3.0.1.jar piccolo2d-extras-3.0.1.jar jaxb-api-2.3.1.jar 3 | 
-------------------------------------------------------------------------------- /tessdata/configs/bigram: -------------------------------------------------------------------------------- 1 | load_bigram_dawg True 2 | tessedit_enable_bigram_correction True 3 | tessedit_bigram_debug 3 4 | save_raw_choices True 5 | save_alt_choices True 6 | -------------------------------------------------------------------------------- /tessdata/configs/txt: -------------------------------------------------------------------------------- 1 | # This config file should be used with other config files which create renderers. 2 | # usage example: tesseract eurotext.tif eurotext txt hocr pdf 3 | tessedit_create_txt 1 4 | -------------------------------------------------------------------------------- /tessdata/configs/ambigs.train: -------------------------------------------------------------------------------- 1 | tessedit_ambigs_training 1 2 | load_freq_dawg 0 3 | load_punc_dawg 0 4 | load_system_dawg 0 5 | load_number_dawg 0 6 | ambigs_debug_level 3 7 | load_fixed_length_dawgs 0 8 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: false 2 | contact_links: 3 | - name: Question 4 | url: https://groups.google.com/g/tesseract-ocr 5 | about: Please ask questions in our forum 6 | -------------------------------------------------------------------------------- /tessdata/tessconfigs/Makefile.am: -------------------------------------------------------------------------------- 1 | datadir = @datadir@/tessdata/tessconfigs 2 | data_DATA = batch batch.nochop nobatch matdemo segdemo msdemo 3 | EXTRA_DIST = batch batch.nochop nobatch matdemo segdemo msdemo 4 | -------------------------------------------------------------------------------- /.gitmodules: 
-------------------------------------------------------------------------------- 1 | [submodule "googletest"] 2 | path = unittest/third_party/googletest 3 | url = https://github.com/google/googletest.git 4 | [submodule "test"] 5 | path = test 6 | url = https://github.com/tesseract-ocr/test.git 7 | -------------------------------------------------------------------------------- /tessdata/Makefile.am: -------------------------------------------------------------------------------- 1 | datadir = @datadir@/tessdata 2 | 3 | data_DATA = pdf.ttf 4 | EXTRA_DIST = $(data_DATA) 5 | 6 | SUBDIRS = configs tessconfigs 7 | 8 | langdata = 9 | 10 | uninstall-local: 11 | cd $(DESTDIR)$(datadir); \ 12 | rm -f $(langdata) 13 | -------------------------------------------------------------------------------- /java/com/google/scrollview/ui/Makefile.am: -------------------------------------------------------------------------------- 1 | SUBDIRS = 2 | 3 | EXTRA_DIST = \ 4 | SVAbstractMenuItem.java \ 5 | SVCheckboxMenuItem.java SVEmptyMenuItem.java \ 6 | SVImageHandler.java SVMenuBar.java \ 7 | SVMenuItem.java SVPopupMenu.java SVSubMenuItem.java SVWindow.java 8 | -------------------------------------------------------------------------------- /tessdata/tessconfigs/matdemo: -------------------------------------------------------------------------------- 1 | ################################################# 2 | # Adaptive Matcher Using PreAdapted Templates 3 | ################################################# 4 | 5 | classify_enable_adaptive_debugger 1 6 | matcher_debug_flags 6 7 | matcher_debug_level 1 8 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/issue-feature-request.yml: -------------------------------------------------------------------------------- 1 | name: Feature Request 2 | description: File a feature request 3 | body: 4 | - type: textarea 5 | attributes: 6 | label: Your Feature Request 7 | description: Please look 
first at the [open issues labeled as 'feature request'](https://github.com/tesseract-ocr/tesseract/labels/feature%20request). 8 | -------------------------------------------------------------------------------- /tessdata/configs/lstm.train: -------------------------------------------------------------------------------- 1 | file_type .bl 2 | textord_fast_pitch_test T 3 | tessedit_zero_rejection T 4 | tessedit_minimal_rejection F 5 | tessedit_write_rep_codes F 6 | edges_children_fix F 7 | edges_childarea 0.65 8 | edges_boxarea 0.9 9 | tessedit_train_line_recognizer T 10 | textord_no_rejects T 11 | tessedit_init_config_only T 12 | -------------------------------------------------------------------------------- /tessdata/tessconfigs/segdemo: -------------------------------------------------------------------------------- 1 | ################################################# 2 | # Adaptive Matcher Using PreAdapted Templates 3 | ################################################# 4 | 5 | wordrec_display_splits 0 6 | wordrec_display_all_blobs 1 7 | wordrec_display_segmentations 2 8 | classify_debug_level 1 9 | stopper_debug_level 1 10 | -------------------------------------------------------------------------------- /tessdata/configs/box.train: -------------------------------------------------------------------------------- 1 | disable_character_fragments T 2 | file_type .bl 3 | textord_fast_pitch_test T 4 | tessedit_zero_rejection T 5 | tessedit_minimal_rejection F 6 | tessedit_write_rep_codes F 7 | edges_children_fix F 8 | edges_childarea 0.65 9 | edges_boxarea 0.9 10 | tessedit_resegment_from_boxes T 11 | tessedit_train_from_boxes T 12 | textord_no_rejects T 13 | -------------------------------------------------------------------------------- /tessdata/configs/box.train.stderr: -------------------------------------------------------------------------------- 1 | file_type .bl 2 | #tessedit_use_nn F 3 | textord_fast_pitch_test T 4 | tessedit_zero_rejection T 5 | 
tessedit_minimal_rejection F 6 | tessedit_write_rep_codes F 7 | edges_children_fix F 8 | edges_childarea 0.65 9 | edges_boxarea 0.9 10 | tessedit_resegment_from_boxes T 11 | tessedit_train_from_boxes T 12 | #textord_repeat_extraction F 13 | textord_no_rejects T 14 | -------------------------------------------------------------------------------- /unittest/tesseract_leaksanitizer.supp: -------------------------------------------------------------------------------- 1 | # Suppress memory leaks. 2 | # Use with LSAN_OPTIONS=suppressions=tesseract_leaksanitizer.supp 3 | leak:FcLangSetCreate 4 | leak:FcPatternObjectAddWithBinding 5 | leak:FcPatternObjectInsertElt 6 | leak:FcValueListAppend 7 | leak:FcValueListDuplicate 8 | leak:FcValueListPrepend 9 | leak:IA__FcLangSetCreate 10 | leak:IA__FcValueSave 11 | leak:libfontconfig.so 12 | leak:libfreetype.so 13 | -------------------------------------------------------------------------------- /tessdata/configs/Makefile.am: -------------------------------------------------------------------------------- 1 | datadir = @datadir@/tessdata/configs 2 | data_DATA = inter makebox box.train unlv ambigs.train lstm.train lstmdebug 3 | data_DATA += api_config kannada box.train.stderr quiet logfile digits get.images 4 | data_DATA += lstmbox wordstrbox 5 | # Configurations for OCR output. 
6 | data_DATA += alto hocr page pdf tsv txt 7 | data_DATA += linebox rebox strokewidth bigram 8 | EXTRA_DIST = $(data_DATA) 9 | -------------------------------------------------------------------------------- /unittest/third_party/utf/utfdef.h: -------------------------------------------------------------------------------- 1 | #define uchar _utfuchar 2 | #define ushort _utfushort 3 | #define uint _utfuint 4 | #define ulong _utfulong 5 | #define vlong _utfvlong 6 | #define uvlong _utfuvlong 7 | 8 | typedef unsigned char uchar; 9 | typedef unsigned short ushort; 10 | typedef unsigned int uint; 11 | typedef unsigned long ulong; 12 | 13 | #define nelem(x) (sizeof(x) / sizeof((x)[0])) 14 | #define nil ((void *)0) 15 | -------------------------------------------------------------------------------- /tessdata/tessconfigs/msdemo: -------------------------------------------------------------------------------- 1 | ################################################# 2 | # Adaptive Matcher Using PreAdapted Templates 3 | ################################################# 4 | 5 | classify_enable_adaptive_debugger 1 6 | matcher_debug_flags 6 7 | matcher_debug_level 1 8 | 9 | wordrec_display_splits 0 10 | wordrec_display_all_blobs 1 11 | wordrec_display_segmentations 2 12 | classify_debug_level 1 13 | -------------------------------------------------------------------------------- /tessdata/configs/strokewidth: -------------------------------------------------------------------------------- 1 | textord_show_blobs 0 2 | textord_debug_tabfind 3 3 | textord_tabfind_show_partitions 1 4 | textord_tabfind_show_initial_partitions 1 5 | textord_tabfind_show_columns 1 6 | textord_tabfind_show_blocks 1 7 | textord_tabfind_show_initialtabs 1 8 | textord_tabfind_show_finaltabs 1 9 | textord_tabfind_show_strokewidths 1 10 | textord_tabfind_show_vlines 0 11 | textord_tabfind_show_images 1 12 | tessedit_dump_pageseg_images 0 13 | 
-------------------------------------------------------------------------------- /tesseract.pc.cmake: -------------------------------------------------------------------------------- 1 | prefix=@CMAKE_INSTALL_PREFIX@ 2 | exec_prefix=${prefix}/bin 3 | libdir=${prefix}/@CMAKE_INSTALL_LIBDIR@ 4 | includedir=${prefix}/include 5 | 6 | Name: @tesseract_NAME@ 7 | Description: An OCR Engine that was developed at HP Labs between 1985 and 1995... and now at Google. 8 | URL: https://github.com/tesseract-ocr/tesseract 9 | Version: @tesseract_VERSION@ 10 | Requires.private: lept 11 | Libs: -L${libdir} -l@tesseract_OUTPUT_NAME@ @libarchive_LIBS@ @libcurl_LIBS@ @TENSORFLOW_LIBS@ 12 | Libs.private: 13 | Cflags: -I${includedir} 14 | -------------------------------------------------------------------------------- /tesseract.pc.in: -------------------------------------------------------------------------------- 1 | prefix=@prefix@ 2 | exec_prefix=@exec_prefix@ 3 | bindir=@bindir@ 4 | datarootdir = @datarootdir@ 5 | datadir=@datadir@ 6 | libdir=@libdir@ 7 | includedir=@includedir@ 8 | 9 | Name: @PACKAGE_NAME@ 10 | Description: An OCR Engine that was developed at HP Labs between 1985 and 1995... and now at Google. 11 | URL: https://github.com/tesseract-ocr/tesseract 12 | Version: @VERSION@ 13 | Requires.private: lept 14 | Libs: -L${libdir} -ltesseract @libarchive_LIBS@ @libcurl_LIBS@ @TENSORFLOW_LIBS@ 15 | Libs.private: -lpthread 16 | Cflags: -I${includedir} 17 | -------------------------------------------------------------------------------- /.clang-format: -------------------------------------------------------------------------------- 1 | BasedOnStyle: Google 2 | 3 | # Modifications for Tesseract. 4 | 5 | # Only merge empty functions. 6 | AllowShortFunctionsOnASingleLine: Empty 7 | # Do not allow short if statements. 8 | AllowShortIfStatementsOnASingleLine: false 9 | IndentPPDirectives: AfterHash 10 | 11 | # Default style for some settings. 
12 | 13 | AccessModifierOffset: -2 14 | AllowShortLoopsOnASingleLine: false 15 | # Enforce always the same pointer alignment. 16 | DerivePointerAlignment: false 17 | IncludeBlocks: Preserve 18 | PointerAlignment: Right 19 | SpacesBeforeTrailingComments: 1 20 | -------------------------------------------------------------------------------- /src/lstm/generate_lut.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Create C/C++ code for two lookup tables. 4 | 5 | import math 6 | 7 | # kTableSize and kScaleFactor must match the values in functions.h. 8 | 9 | # Size of static tables. 10 | kTableSize = 4096 11 | # Scale factor for float arg to int index. 12 | kScaleFactor = 256.0 13 | 14 | print("// Generated code with lookup tables (see generate_lut.py)") 15 | print('#include "functions.h"') 16 | print("namespace tesseract {") 17 | 18 | print("const TFloat TanhTable[] = {") 19 | for i in range(kTableSize): 20 | print(" %a," % math.tanh(i / kScaleFactor)) 21 | print("};") 22 | 23 | print("const TFloat LogisticTable[] = {") 24 | for i in range(kTableSize): 25 | print(" %a," % (1 / (1 + math.exp(-i / kScaleFactor)))) 26 | print("};") 27 | print("} // namespace tesseract.") 28 | -------------------------------------------------------------------------------- /.github/workflows/msys2-4.1.1.yml: -------------------------------------------------------------------------------- 1 | name: msys2-4.1.1 2 | on: 3 | #push: 4 | schedule: 5 | - cron: 0 18 1 * * 6 | jobs: 7 | windows: 8 | runs-on: ${{ matrix.os }} 9 | strategy: 10 | fail-fast: false 11 | matrix: 12 | include: 13 | - os: windows-2019 14 | msystem: MINGW32 15 | mingw_package_prefix: mingw-w64-i686 16 | - os: windows-2019 17 | msystem: MINGW64 18 | mingw_package_prefix: mingw-w64-x86_64 19 | defaults: 20 | run: 21 | shell: msys2 {0} 22 | steps: 23 | - uses: msys2/setup-msys2@v2 24 | with: 25 | msystem: ${{ matrix.msystem }} 26 | - run: pacman 
--noconfirm -S ${{ matrix.mingw_package_prefix }}-tesseract-ocr 27 | - name: Display version 28 | run: | 29 | tesseract -v 30 | text2image -v 31 | lstmtraining -v 32 | -------------------------------------------------------------------------------- /doc/cntraining.1.asc: -------------------------------------------------------------------------------- 1 | CNTRAINING(1) 2 | ============= 3 | 4 | NAME 5 | ---- 6 | cntraining - character normalization training for Tesseract 7 | 8 | SYNOPSIS 9 | -------- 10 | *cntraining* [-D 'dir'] 'FILE'... 11 | 12 | DESCRIPTION 13 | ----------- 14 | cntraining takes a list of .tr files, from which it generates the 15 | *normproto* data file (the character normalization sensitivity 16 | prototypes). 17 | 18 | OPTIONS 19 | -------- 20 | -D 'dir':: 21 | Directory to write output files to. 22 | 23 | SEE ALSO 24 | -------- 25 | tesseract(1), shapeclustering(1), mftraining(1) 26 | 27 | 28 | 29 | COPYING 30 | ------- 31 | Copyright (c) Hewlett-Packard Company, 1988 32 | Licensed under the Apache License, Version 2.0 33 | 34 | AUTHOR 35 | ------ 36 | The Tesseract OCR engine was written by Ray Smith and his research groups 37 | at Hewlett Packard (1985-1995) and Google (2006-2018). 38 | -------------------------------------------------------------------------------- /doc/ambiguous_words.1.asc: -------------------------------------------------------------------------------- 1 | AMBIGUOUS_WORDS(1) 2 | ================== 3 | :doctype: manpage 4 | 5 | NAME 6 | ---- 7 | ambiguous_words - generate sets of words Tesseract is likely to find ambiguous 8 | 9 | SYNOPSIS 10 | -------- 11 | *ambiguous_words* [-l lang] 'TESSDATADIR' 'WORDLIST' 'AMBIGUOUSFILE' 12 | 13 | DESCRIPTION 14 | ----------- 15 | ambiguous_words(1) runs Tesseract in a special mode, and for each word 16 | in the word list, produces a set of words which Tesseract thinks might be 17 | ambiguous with it. 
'TESSDATADIR' must be set to the absolute path of 18 | a directory containing 'tessdata/lang.traineddata'. 19 | 20 | SEE ALSO 21 | -------- 22 | tesseract(1) 23 | 24 | COPYING 25 | ------- 26 | Copyright \(C) 2012 Google, Inc. 27 | Licensed under the Apache License, Version 2.0 28 | 29 | AUTHOR 30 | ------ 31 | The Tesseract OCR engine was written by Ray Smith and his research groups 32 | at Hewlett Packard (1985-1995) and Google (2006-2018). 33 | -------------------------------------------------------------------------------- /unittest/capiexample_c_test.c: -------------------------------------------------------------------------------- 1 | /////////////////////////////////////////////////////////////////////// 2 | // Licensed under the Apache License, Version 2.0 (the "License"); 3 | // you may not use this file except in compliance with the License. 4 | // You may obtain a copy of the License at 5 | // http://www.apache.org/licenses/LICENSE-2.0 6 | // Unless required by applicable law or agreed to in writing, software 7 | // distributed under the License is distributed on an "AS IS" BASIS, 8 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 9 | // See the License for the specific language governing permissions and 10 | // limitations under the License. 11 | /////////////////////////////////////////////////////////////////////// 12 | 13 | // Verifies that C is able to include capi header. 14 | #include 15 | 16 | // Verifies that the libtesseract library has C API symbols. 17 | int main() { 18 | printf("%s\n", TessVersion()); 19 | return 0; 20 | } 21 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | Before you submit an issue, please review [the guidelines for this repository](https://github.com/tesseract-ocr/tesseract/blob/main/CONTRIBUTING.md). 
2 | 3 | Please report an issue only for a BUG, not for asking questions. 4 | 5 | Note that it will be much easier for us to fix the issue if a test case that 6 | reproduces the problem is provided. Ideally this test case should not have any 7 | external dependencies. Provide a copy of the image or link to files for the test case. 8 | 9 | Please delete this text and fill in the template below. 10 | 11 | ------------------------ 12 | 13 | ### Environment 14 | 15 | * **Tesseract Version**: 16 | * **Commit Number**: 17 | * **Platform**: 18 | 19 | ### Current Behavior: 20 | 21 | ### Expected Behavior: 22 | 23 | ### Suggested Fix: 24 | -------------------------------------------------------------------------------- /unittest/capiexample_test.cc: -------------------------------------------------------------------------------- 1 | /////////////////////////////////////////////////////////////////////// 2 | // Licensed under the Apache License, Version 2.0 (the "License"); 3 | // you may not use this file except in compliance with the License. 4 | // You may obtain a copy of the License at 5 | // http://www.apache.org/licenses/LICENSE-2.0 6 | // Unless required by applicable law or agreed to in writing, software 7 | // distributed under the License is distributed on an "AS IS" BASIS, 8 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 9 | // See the License for the specific language governing permissions and 10 | // limitations under the License. 11 | /////////////////////////////////////////////////////////////////////// 12 | 13 | // Verifies that C++ is able to include capi header. 14 | #include 15 | 16 | #include 17 | 18 | // Verifies that the libtesseract library has C API symbols. 
19 | TEST(C, VersionTest) { 20 | TessVersion(); 21 | } 22 | -------------------------------------------------------------------------------- /.github/workflows/cifuzz.yml: -------------------------------------------------------------------------------- 1 | name: CIFuzz 2 | # OSS-Fuzz CI 3 | # See https://google.github.io/oss-fuzz/getting-started/continuous-integration/ 4 | on: 5 | pull_request: 6 | branches: 7 | - main 8 | paths: 9 | - '**.cpp' 10 | - '**.h' 11 | jobs: 12 | Fuzzing: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - name: Build Fuzzers 16 | id: build 17 | uses: google/oss-fuzz/infra/cifuzz/actions/build_fuzzers@master 18 | with: 19 | oss-fuzz-project-name: 'tesseract-ocr' 20 | language: c++ 21 | dry-run: false 22 | - name: Run Fuzzers 23 | uses: google/oss-fuzz/infra/cifuzz/actions/run_fuzzers@master 24 | with: 25 | oss-fuzz-project-name: 'tesseract-ocr' 26 | fuzz-seconds: 600 27 | dry-run: false 28 | - name: Upload Crash 29 | uses: actions/upload-artifact@v3 30 | if: failure() && steps.build.outcome == 'success' 31 | with: 32 | name: artifacts 33 | path: ./out/artifacts 34 | -------------------------------------------------------------------------------- /AUTHORS: -------------------------------------------------------------------------------- 1 | Ray Smith (lead developer) 2 | Ahmad Abdulkader 3 | Rika Antonova 4 | Nicholas Beato 5 | Jeff Breidenbach 6 | Samuel Charron 7 | Phil Cheatle 8 | Simon Crouch 9 | David Eger 10 | Sheelagh Huddleston 11 | Dan Johnson 12 | Rajesh Katikam 13 | Thomas Kielbus 14 | Dar-Shyang Lee 15 | Zongyi (Joe) Liu 16 | Robert Moss 17 | Chris Newton 18 | Michael Reimer 19 | Marius Renn 20 | Raquel Romano 21 | Christy Russon 22 | Shobhit Saxena 23 | Mark Seaman 24 | Faisal Shafait 25 | Hiroshi Takenaka 26 | Ranjith Unnikrishnan 27 | Joern Wanke 28 | Ping Ping Xiu 29 | Andrew Ziem 30 | Oscar Zuniga 31 | 32 | Community Contributors: 33 | Zdenko Podobný (Maintainer) 34 | Jim Regan (Maintainer) 35 | James R Barlow 36 | Stefan 
Brechtken 37 | Thomas Breuel 38 | Amit Dovev 39 | Martin Ettl 40 | Shree Devi Kumar 41 | Noah Metzger 42 | Tom Morris 43 | Tobias Müller 44 | Egor Pugin 45 | Robert Sachunsky 46 | Raf Schietekat 47 | Sundar M. Vaidya 48 | Robin Watts 49 | Stefan Weil 50 | Nick White 51 | Alexander Zaitsev 52 | -------------------------------------------------------------------------------- /doc/tesseract.natvis: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | {{size={size_used_}}} 5 | 6 | size_used_ 7 | size_reserved_ 8 | 9 | size_used_ 10 | data_ 11 | 12 | 13 | 14 | 15 | 16 | {value_} 17 | 18 | 19 | {value_} 20 | 21 | 22 | 23 | {value_} 24 | 25 | 26 | 27 | {value_} 28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /src/training/unicharset/validate_khmer.h: -------------------------------------------------------------------------------- 1 | #ifndef TESSERACT_TRAINING_VALIDATE_KHMER_H_ 2 | #define TESSERACT_TRAINING_VALIDATE_KHMER_H_ 3 | 4 | #include "validator.h" 5 | 6 | namespace tesseract { 7 | 8 | // Subclass of Validator that validates and segments Khmer. 9 | class ValidateKhmer : public Validator { 10 | public: 11 | ValidateKhmer(ViramaScript script, bool report_errors) : Validator(script, report_errors) {} 12 | ~ValidateKhmer() override = default; 13 | 14 | protected: 15 | // Returns whether codes match the pattern for a Khmer Grapheme. 16 | // Consumes the next Grapheme in codes_[codes_used_++...] and copies it to 17 | // parts_ and output_. Returns true if a valid Grapheme was consumed, 18 | // otherwise does not increment codes_used_. 19 | bool ConsumeGraphemeIfValid() override; 20 | // Returns the CharClass corresponding to the given Unicode ch.
21 | CharClass UnicodeToCharClass(char32 ch) const override; 22 | }; 23 | 24 | } // namespace tesseract 25 | 26 | #endif // TESSERACT_TRAINING_VALIDATE_KHMER_H_ 27 | -------------------------------------------------------------------------------- /src/ccmain/mutableiterator.cpp: -------------------------------------------------------------------------------- 1 | /////////////////////////////////////////////////////////////////////// 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // http://www.apache.org/licenses/LICENSE-2.0 7 | // Unless required by applicable law or agreed to in writing, software 8 | // distributed under the License is distributed on an "AS IS" BASIS, 9 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | // See the License for the specific language governing permissions and 11 | // limitations under the License. 12 | // 13 | /////////////////////////////////////////////////////////////////////// 14 | 15 | #include "mutableiterator.h" 16 | 17 | namespace tesseract { 18 | 19 | // Destructor. 20 | // It is defined here, so the compiler can create a single vtable 21 | // instead of weak vtables in every compilation unit. 22 | MutableIterator::~MutableIterator() = default; 23 | 24 | } // namespace tesseract. 25 | -------------------------------------------------------------------------------- /src/ccmain/tessvars.cpp: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | * File: tessvars.cpp (Formerly tessvars.c) 3 | * Description: Variables and other globals for tessedit. 4 | * Author: Ray Smith 5 | * Created: Mon Apr 13 13:13:23 BST 1992 6 | * 7 | * (C) Copyright 1992, Hewlett-Packard Ltd.
8 | ** Licensed under the Apache License, Version 2.0 (the "License"); 9 | ** you may not use this file except in compliance with the License. 10 | ** You may obtain a copy of the License at 11 | ** http://www.apache.org/licenses/LICENSE-2.0 12 | ** Unless required by applicable law or agreed to in writing, software 13 | ** distributed under the License is distributed on an "AS IS" BASIS, 14 | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | ** See the License for the specific language governing permissions and 16 | ** limitations under the License. 17 | * 18 | **********************************************************************/ 19 | 20 | #include 21 | 22 | #include "tessvars.h" 23 | 24 | FILE *debug_fp = stderr; // write debug stuff here 25 | -------------------------------------------------------------------------------- /src/training/unicharset/icuerrorcode.cpp: -------------------------------------------------------------------------------- 1 | /////////////////////////////////////////////////////////////////////// 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // http://www.apache.org/licenses/LICENSE-2.0 7 | // Unless required by applicable law or agreed to in writing, software 8 | // distributed under the License is distributed on an "AS IS" BASIS, 9 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | // See the License for the specific language governing permissions and 11 | // limitations under the License. 12 | // 13 | /////////////////////////////////////////////////////////////////////// 14 | 15 | #include "icuerrorcode.h" 16 | 17 | namespace tesseract { 18 | 19 | // Destructor. 20 | // It is defined here, so the compiler can create a single vtable 21 | // instead of weak vtables in every compilation unit. 
22 | IcuErrorCode::~IcuErrorCode() { 23 | if (isFailure()) { 24 | handleFailure(); 25 | } 26 | } 27 | 28 | } // namespace tesseract. 29 | -------------------------------------------------------------------------------- /cmake/BuildFunctions.cmake: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # http://www.apache.org/licenses/LICENSE-2.0 5 | # Unless required by applicable law or agreed to in writing, software 6 | # distributed under the License is distributed on an "AS IS" BASIS, 7 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 8 | # See the License for the specific language governing permissions and 9 | # limitations under the License. 10 | ################################################################################ 11 | # 12 | # macros and functions 13 | # 14 | ################################################################################ 15 | 16 | ######################################## 17 | # FUNCTION project_group 18 | ######################################## 19 | function(project_group target name) 20 | set_target_properties(${target} PROPERTIES FOLDER ${name}) 21 | endfunction(project_group) 22 | 23 | ################################################################################ 24 | -------------------------------------------------------------------------------- /src/ccmain/tessvars.h: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | * File: tessvars.h (Formerly tessvars.h) 3 | * Description: Variables and other globals for tessedit. 4 | * Author: Ray Smith 5 | * Created: Mon Apr 13 13:13:23 BST 1992 6 | * 7 | * (C) Copyright 1992, Hewlett-Packard Ltd. 
8 | ** Licensed under the Apache License, Version 2.0 (the "License"); 9 | ** you may not use this file except in compliance with the License. 10 | ** You may obtain a copy of the License at 11 | ** http://www.apache.org/licenses/LICENSE-2.0 12 | ** Unless required by applicable law or agreed to in writing, software 13 | ** distributed under the License is distributed on an "AS IS" BASIS, 14 | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | ** See the License for the specific language governing permissions and 16 | ** limitations under the License. 17 | * 18 | **********************************************************************/ 19 | 20 | #ifndef TESSVARS_H 21 | #define TESSVARS_H 22 | 23 | #include 24 | 25 | extern FILE *debug_fp; // write debug stuff here 26 | 27 | #endif 28 | -------------------------------------------------------------------------------- /src/ccutil/fileerr.h: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | * File: fileerr.h (Formerly filerr.h) 3 | * Description: Errors for file utilities. 4 | * Author: Ray Smith 5 | * 6 | * (C) Copyright 1990, Hewlett-Packard Ltd. 7 | ** Licensed under the Apache License, Version 2.0 (the "License"); 8 | ** you may not use this file except in compliance with the License. 9 | ** You may obtain a copy of the License at 10 | ** http://www.apache.org/licenses/LICENSE-2.0 11 | ** Unless required by applicable law or agreed to in writing, software 12 | ** distributed under the License is distributed on an "AS IS" BASIS, 13 | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | ** See the License for the specific language governing permissions and 15 | ** limitations under the License. 
16 | * 17 | **********************************************************************/ 18 | 19 | #ifndef FILEERR_H 20 | #define FILEERR_H 21 | 22 | #include "errcode.h" 23 | 24 | namespace tesseract { 25 | 26 | constexpr ERRCODE CANTOPENFILE("Can't open file"); 27 | 28 | } // namespace tesseract 29 | 30 | #endif 31 | -------------------------------------------------------------------------------- /src/training/pango/tlog.cpp: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | * File: tlog.cpp 3 | * Description: Variant of printf with logging level controllable by a 4 | * commandline flag. 5 | * Author: Ranjith Unnikrishnan 6 | * Created: Wed Nov 20 2013 7 | * 8 | * (C) Copyright 2013, Google Inc. 9 | ** Licensed under the Apache License, Version 2.0 (the "License"); 10 | ** you may not use this file except in compliance with the License. 11 | ** You may obtain a copy of the License at 12 | ** http://www.apache.org/licenses/LICENSE-2.0 13 | ** Unless required by applicable law or agreed to in writing, software 14 | ** distributed under the License is distributed on an "AS IS" BASIS, 15 | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | ** See the License for the specific language governing permissions and 17 | ** limitations under the License. 
18 | * 19 | **********************************************************************/ 20 | 21 | #include "tlog.h" 22 | 23 | using namespace tesseract; 24 | 25 | INT_PARAM_FLAG(tlog_level, 0, "Minimum logging level for tlog() output"); 26 | -------------------------------------------------------------------------------- /doc/dawg2wordlist.1.asc: -------------------------------------------------------------------------------- 1 | DAWG2WORDLIST(1) 2 | ================ 3 | :doctype: manpage 4 | 5 | NAME 6 | ---- 7 | dawg2wordlist - convert a Tesseract DAWG to a wordlist 8 | 9 | SYNOPSIS 10 | -------- 11 | *dawg2wordlist* 'UNICHARSET' 'DAWG' 'WORDLIST' 12 | 13 | DESCRIPTION 14 | ----------- 15 | dawg2wordlist(1) converts a Tesseract Directed Acyclic Word 16 | Graph (DAWG) to a list of words using a unicharset as key. 17 | 18 | OPTIONS 19 | ------- 20 | 'UNICHARSET' 21 | The unicharset of the language. This is the unicharset 22 | generated by mftraining(1). 23 | 24 | 'DAWG' 25 | The input DAWG, created by wordlist2dawg(1) 26 | 27 | 'WORDLIST' 28 | Plain text (output) file in UTF-8, one word per line 29 | 30 | SEE ALSO 31 | -------- 32 | tesseract(1), mftraining(1), wordlist2dawg(1), unicharset(5), 33 | combine_tessdata(1) 34 | 35 | 36 | 37 | COPYING 38 | ------- 39 | Copyright \(C) 2012 Google, Inc. 40 | Licensed under the Apache License, Version 2.0 41 | 42 | AUTHOR 43 | ------ 44 | The Tesseract OCR engine was written by Ray Smith and his research groups 45 | at Hewlett Packard (1985-1995) and Google (2006-2018). 
46 | -------------------------------------------------------------------------------- /cmake/templates/cmake_uninstall.cmake.in: -------------------------------------------------------------------------------- 1 | # https://gitlab.kitware.com/cmake/community/wikis/FAQ#can-i-do-make-uninstall-with-cmake 2 | if(NOT EXISTS "@CMAKE_BINARY_DIR@/install_manifest.txt") 3 | message(FATAL_ERROR "Cannot find install manifest: @CMAKE_BINARY_DIR@/install_manifest.txt") 4 | endif(NOT EXISTS "@CMAKE_BINARY_DIR@/install_manifest.txt") 5 | 6 | file(READ "@CMAKE_BINARY_DIR@/install_manifest.txt" files) 7 | string(REGEX REPLACE "\n" ";" files "${files}") 8 | foreach(file ${files}) 9 | message(STATUS "Uninstalling $ENV{DESTDIR}${file}") 10 | if(IS_SYMLINK "$ENV{DESTDIR}${file}" OR EXISTS "$ENV{DESTDIR}${file}") 11 | exec_program( 12 | "@CMAKE_COMMAND@" ARGS "-E remove \"$ENV{DESTDIR}${file}\"" 13 | OUTPUT_VARIABLE rm_out 14 | RETURN_VALUE rm_retval 15 | ) 16 | if(NOT "${rm_retval}" STREQUAL 0) 17 | message(FATAL_ERROR "Problem when removing $ENV{DESTDIR}${file}") 18 | endif(NOT "${rm_retval}" STREQUAL 0) 19 | else(IS_SYMLINK "$ENV{DESTDIR}${file}" OR EXISTS "$ENV{DESTDIR}${file}") 20 | message(STATUS "File $ENV{DESTDIR}${file} does not exist.") 21 | endif(IS_SYMLINK "$ENV{DESTDIR}${file}" OR EXISTS "$ENV{DESTDIR}${file}") 22 | endforeach(file) 23 | -------------------------------------------------------------------------------- /src/classify/mf.h: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | ** Filename: mf.h 3 | ** Purpose: Micro-feature interface to flexible feature extractor. 4 | ** Author: Dan Johnson 5 | ** 6 | ** (c) Copyright Hewlett-Packard Company, 1988. 7 | ** Licensed under the Apache License, Version 2.0 (the "License"); 8 | ** you may not use this file except in compliance with the License. 
9 | ** You may obtain a copy of the License at 10 | ** http://www.apache.org/licenses/LICENSE-2.0 11 | ** Unless required by applicable law or agreed to in writing, software 12 | ** distributed under the License is distributed on an "AS IS" BASIS, 13 | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | ** See the License for the specific language governing permissions and 15 | ** limitations under the License. 16 | ******************************************************************************/ 17 | 18 | #ifndef MF_H 19 | #define MF_H 20 | 21 | #include "blobs.h" 22 | #include "ocrfeatures.h" 23 | 24 | namespace tesseract { 25 | 26 | FEATURE_SET ExtractMicros(TBLOB *Blob, const DENORM &cn_denorm); 27 | 28 | } // namespace tesseract 29 | 30 | #endif 31 | -------------------------------------------------------------------------------- /src/ccstruct/polyaprx.h: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | * File: polyaprx.h 3 | * Description: Code for polygonal approximation from old edgeprog. 4 | * Author: Ray Smith 5 | * 6 | * (C) Copyright 1993, Hewlett-Packard Ltd. 7 | ** Licensed under the Apache License, Version 2.0 (the "License"); 8 | ** you may not use this file except in compliance with the License. 9 | ** You may obtain a copy of the License at 10 | ** http://www.apache.org/licenses/LICENSE-2.0 11 | ** Unless required by applicable law or agreed to in writing, software 12 | ** distributed under the License is distributed on an "AS IS" BASIS, 13 | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | ** See the License for the specific language governing permissions and 15 | ** limitations under the License. 
16 | * 17 | **********************************************************************/ 18 | 19 | #ifndef POLYAPRX_H 20 | #define POLYAPRX_H 21 | 22 | namespace tesseract { 23 | 24 | class C_OUTLINE; 25 | struct TESSLINE; 26 | 27 | // convert a chain-coded input to the old OUTLINE approximation 28 | TESSLINE *ApproximateOutline(bool allow_detailed_fx, C_OUTLINE *c_outline); 29 | 30 | } // namespace tesseract 31 | 32 | #endif 33 | -------------------------------------------------------------------------------- /unittest/cleanapi_test.cc: -------------------------------------------------------------------------------- 1 | // (C) Copyright 2017, Google Inc. 2 | // Licensed under the Apache License, Version 2.0 (the "License"); 3 | // you may not use this file except in compliance with the License. 4 | // You may obtain a copy of the License at 5 | // http://www.apache.org/licenses/LICENSE-2.0 6 | // Unless required by applicable law or agreed to in writing, software 7 | // distributed under the License is distributed on an "AS IS" BASIS, 8 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 9 | // See the License for the specific language governing permissions and 10 | // limitations under the License. 11 | 12 | #include 13 | 14 | // Dummy enum in the global namespace that checks for collision with awkward 15 | // names. 16 | // If this test fails to compile, clean up the includes in tesseract/baseapi.h! 17 | // They are not supposed to drag in definitions of any of the tesseract 18 | // types included in this enum! 19 | enum NameTester { ABORT, OKAY, LOG, BLOB, ELIST, TBOX, TPOINT, WORD }; 20 | 21 | #include "gtest/gtest.h" 22 | 23 | namespace tesseract { 24 | 25 | // Verifies that the global namespace is clean. 
26 | TEST(CleanNamespaceTess, DummyTest) { 27 | tesseract::TessBaseAPI api; 28 | } 29 | 30 | } // namespace tesseract 31 | -------------------------------------------------------------------------------- /src/ccmain/werdit.h: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | * File: werdit.h 3 | * Description: An iterator for passing over all the words in a document. 4 | * Author: Ray Smith 5 | * Created: Mon Apr 27 08:51:22 BST 1992 6 | * 7 | * (C) Copyright 1992, Hewlett-Packard Ltd. 8 | ** Licensed under the Apache License, Version 2.0 (the "License"); 9 | ** you may not use this file except in compliance with the License. 10 | ** You may obtain a copy of the License at 11 | ** http://www.apache.org/licenses/LICENSE-2.0 12 | ** Unless required by applicable law or agreed to in writing, software 13 | ** distributed under the License is distributed on an "AS IS" BASIS, 14 | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | ** See the License for the specific language governing permissions and 16 | ** limitations under the License. 17 | * 18 | **********************************************************************/ 19 | 20 | #ifndef WERDIT_H 21 | #define WERDIT_H 22 | 23 | #include "rect.h" // for TBOX 24 | 25 | namespace tesseract { 26 | 27 | class PAGE_RES; 28 | class PAGE_RES_IT; 29 | 30 | PAGE_RES_IT *make_pseudo_word(PAGE_RES *page_res, const TBOX &selection_box); 31 | 32 | } // namespace tesseract 33 | 34 | #endif 35 | -------------------------------------------------------------------------------- /src/wordrec/chop.h: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * 3 | * File: chop.h 4 | * Author: Mark Seaman, SW Productivity 5 | * 6 | * (c) Copyright 1987, Hewlett-Packard Company.
7 | ** Licensed under the Apache License, Version 2.0 (the "License"); 8 | ** you may not use this file except in compliance with the License. 9 | ** You may obtain a copy of the License at 10 | ** http://www.apache.org/licenses/LICENSE-2.0 11 | ** Unless required by applicable law or agreed to in writing, software 12 | ** distributed under the License is distributed on an "AS IS" BASIS, 13 | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | ** See the License for the specific language governing permissions and 15 | ** limitations under the License. 16 | * 17 | *****************************************************************************/ 18 | 19 | #ifndef CHOP_H 20 | #define CHOP_H 21 | 22 | #include "genericheap.h" 23 | #include "kdpair.h" 24 | #include "seam.h" 25 | 26 | namespace tesseract { 27 | 28 | #define MAX_NUM_POINTS 50 29 | 30 | // The PointPair elements do NOT own the EDGEPTs. 31 | using PointPair = KDPairInc; 32 | using PointHeap = GenericHeap; 33 | 34 | } // namespace tesseract 35 | 36 | #endif 37 | -------------------------------------------------------------------------------- /src/ccutil/scanutils.h: -------------------------------------------------------------------------------- 1 | // Copyright 2006 Google Inc. 2 | // All Rights Reserved. 3 | // Author: renn 4 | // 5 | // Licensed under the Apache License, Version 2.0 (the "License"); 6 | // you may not use this file except in compliance with the License. 7 | // You may obtain a copy of the License at 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | #ifndef TESSERACT_CCUTIL_SCANUTILS_H_ 16 | #define TESSERACT_CCUTIL_SCANUTILS_H_ 17 | 18 | #include 19 | 20 | #include // for FILE 21 | 22 | /** 23 | * fscanf variant to ensure correct reading regardless of locale. 24 | * 25 | * tfscanf parses a file stream according to the given format. See the fscanf 26 | * manpage for more information, as this function attempts to mimic its 27 | * behavior. 28 | * 29 | * @note Scientific floating-point notation is not supported. 30 | * 31 | */ 32 | TESS_API 33 | int tfscanf(FILE *stream, const char *format, ...); 34 | 35 | #endif // TESSERACT_CCUTIL_SCANUTILS_H_ 36 | -------------------------------------------------------------------------------- /include/tesseract/version.h.in: -------------------------------------------------------------------------------- 1 | // SPDX-License-Identifier: Apache-2.0 2 | // File: version.h 3 | // Description: Version information 4 | // 5 | // (C) Copyright 2018, Google Inc. 6 | // Licensed under the Apache License, Version 2.0 (the "License"); 7 | // you may not use this file except in compliance with the License. 8 | // You may obtain a copy of the License at 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License.
15 | 16 | #ifndef TESSERACT_API_VERSION_H_ 17 | #define TESSERACT_API_VERSION_H_ 18 | 19 | // clang-format off 20 | 21 | #define TESSERACT_MAJOR_VERSION @GENERIC_MAJOR_VERSION@ 22 | #define TESSERACT_MINOR_VERSION @GENERIC_MINOR_VERSION@ 23 | #define TESSERACT_MICRO_VERSION @GENERIC_MICRO_VERSION@ 24 | 25 | #define TESSERACT_VERSION \ 26 | (TESSERACT_MAJOR_VERSION << 16 | \ 27 | TESSERACT_MINOR_VERSION << 8 | \ 28 | TESSERACT_MICRO_VERSION) 29 | 30 | #define TESSERACT_VERSION_STR "@PACKAGE_VERSION@" 31 | 32 | // clang-format on 33 | 34 | #endif // TESSERACT_API_VERSION_H_ 35 | -------------------------------------------------------------------------------- /src/arch/dotproduct.cpp: -------------------------------------------------------------------------------- 1 | /////////////////////////////////////////////////////////////////////// 2 | // File: dotproduct.cpp 3 | // Description: Native dot product function. 4 | // 5 | // (C) Copyright 2018, Google Inc. 6 | // Licensed under the Apache License, Version 2.0 (the "License"); 7 | // you may not use this file except in compliance with the License. 8 | // You may obtain a copy of the License at 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | /////////////////////////////////////////////////////////////////////// 16 | 17 | #include "dotproduct.h" 18 | 19 | namespace tesseract { 20 | 21 | // Computes and returns the dot product of the two n-vectors u and v. 
22 | TFloat DotProductNative(const TFloat *u, const TFloat *v, int n) { 23 | TFloat total = 0; 24 | #if defined(OPENMP_SIMD) || defined(_OPENMP) 25 | #pragma omp simd reduction(+:total) 26 | #endif 27 | for (int k = 0; k < n; k++) { 28 | total += u[k] * v[k]; 29 | } 30 | return total; 31 | } 32 | 33 | } // namespace tesseract 34 | -------------------------------------------------------------------------------- /src/ccmain/docqual.h: -------------------------------------------------------------------------------- 1 | /****************************************************************** 2 | * File: docqual.h (Formerly docqual.h) 3 | * Description: Document Quality Metrics 4 | * Author: Phil Cheatle 5 | * 6 | * (C) Copyright 1994, Hewlett-Packard Ltd. 7 | ** Licensed under the Apache License, Version 2.0 (the "License"); 8 | ** you may not use this file except in compliance with the License. 9 | ** You may obtain a copy of the License at 10 | ** http://www.apache.org/licenses/LICENSE-2.0 11 | ** Unless required by applicable law or agreed to in writing, software 12 | ** distributed under the License is distributed on an "AS IS" BASIS, 13 | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | ** See the License for the specific language governing permissions and 15 | ** limitations under the License. 
16 | * 17 | **********************************************************************/ 18 | 19 | #ifndef DOCQUAL_H 20 | #define DOCQUAL_H 21 | 22 | #include // for int16_t 23 | 24 | namespace tesseract { 25 | 26 | class PAGE_RES_IT; 27 | class ROW; 28 | class WERD_RES; 29 | 30 | enum GARBAGE_LEVEL { G_NEVER_CRUNCH, G_OK, G_DODGY, G_TERRIBLE }; 31 | 32 | int16_t word_blob_quality(WERD_RES *word); 33 | void reject_whole_page(PAGE_RES_IT &page_res_it); 34 | 35 | } // namespace tesseract 36 | 37 | #endif 38 | -------------------------------------------------------------------------------- /include/tesseract/export.h: -------------------------------------------------------------------------------- 1 | // SPDX-License-Identifier: Apache-2.0 2 | // File: export.h 3 | // Description: Place holder 4 | // 5 | // (C) Copyright 2006, Google Inc. 6 | // Licensed under the Apache License, Version 2.0 (the "License"); 7 | // you may not use this file except in compliance with the License. 8 | // You may obtain a copy of the License at 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 
15 | 16 | #ifndef TESSERACT_PLATFORM_H_ 17 | #define TESSERACT_PLATFORM_H_ 18 | 19 | #ifndef TESS_API 20 | # if defined(_WIN32) || defined(__CYGWIN__) 21 | # if defined(TESS_EXPORTS) 22 | # define TESS_API __declspec(dllexport) 23 | # elif defined(TESS_IMPORTS) 24 | # define TESS_API __declspec(dllimport) 25 | # else 26 | # define TESS_API 27 | # endif 28 | # else 29 | # if defined(TESS_EXPORTS) || defined(TESS_IMPORTS) 30 | # define TESS_API __attribute__((visibility("default"))) 31 | # else 32 | # define TESS_API 33 | # endif 34 | # endif 35 | #endif 36 | 37 | #endif // TESSERACT_PLATFORM_H_ 38 | -------------------------------------------------------------------------------- /src/classify/normmatch.h: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | ** Filename: normmatch.h 3 | ** Purpose: Simple matcher based on character normalization features. 4 | ** Author: Dan Johnson 5 | ** 6 | ** (c) Copyright Hewlett-Packard Company, 1988. 7 | ** Licensed under the Apache License, Version 2.0 (the "License"); 8 | ** you may not use this file except in compliance with the License. 9 | ** You may obtain a copy of the License at 10 | ** http://www.apache.org/licenses/LICENSE-2.0 11 | ** Unless required by applicable law or agreed to in writing, software 12 | ** distributed under the License is distributed on an "AS IS" BASIS, 13 | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | ** See the License for the specific language governing permissions and 15 | ** limitations under the License. 
16 | ******************************************************************************/ 17 | 18 | #ifndef NORMMATCH_H 19 | #define NORMMATCH_H 20 | 21 | #include "matchdefs.h" 22 | #include "ocrfeatures.h" 23 | #include "params.h" 24 | 25 | namespace tesseract { 26 | 27 | /* control knobs used to control the normalization adjustment process */ 28 | extern double_VAR_H(classify_norm_adj_midpoint); 29 | extern double_VAR_H(classify_norm_adj_curl); 30 | 31 | } // namespace tesseract 32 | 33 | #endif 34 | -------------------------------------------------------------------------------- /src/textord/scanedg.h: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | * File: scanedg.h (Formerly scanedge.h) 3 | * Description: Raster scanning crack based edge extractor. 4 | * Author: Ray Smith 5 | * 6 | * (C) Copyright 1991, Hewlett-Packard Ltd. 7 | ** Licensed under the Apache License, Version 2.0 (the "License"); 8 | ** you may not use this file except in compliance with the License. 9 | ** You may obtain a copy of the License at 10 | ** http://www.apache.org/licenses/LICENSE-2.0 11 | ** Unless required by applicable law or agreed to in writing, software 12 | ** distributed under the License is distributed on an "AS IS" BASIS, 13 | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | ** See the License for the specific language governing permissions and 15 | ** limitations under the License. 
16 | * 17 | **********************************************************************/ 18 | 19 | #ifndef SCANEDG_H 20 | #define SCANEDG_H 21 | 22 | #include "image.h" 23 | #include "params.h" 24 | 25 | struct Pix; 26 | 27 | namespace tesseract { 28 | 29 | class C_OUTLINE_IT; 30 | class PDBLK; 31 | 32 | void block_edges(Image t_image, // thresholded image 33 | PDBLK *block, // block in image 34 | C_OUTLINE_IT *outline_it); 35 | 36 | } // namespace tesseract 37 | 38 | #endif 39 | -------------------------------------------------------------------------------- /src/wordrec/drawfx.h: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | * File: drawfx.h 3 | * Description: Draw things to do with feature extraction. 4 | * Author: Ray Smith 5 | * 6 | * (C) Copyright 1992, Hewlett-Packard Ltd. 7 | ** Licensed under the Apache License, Version 2.0 (the "License"); 8 | ** you may not use this file except in compliance with the License. 9 | ** You may obtain a copy of the License at 10 | ** http://www.apache.org/licenses/LICENSE-2.0 11 | ** Unless required by applicable law or agreed to in writing, software 12 | ** distributed under the License is distributed on an "AS IS" BASIS, 13 | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | ** See the License for the specific language governing permissions and 15 | ** limitations under the License. 
16 | * 17 | **********************************************************************/ 18 | 19 | #ifndef DRAWFX_H 20 | #define DRAWFX_H 21 | 22 | #include "params.h" 23 | #include "scrollview.h" 24 | 25 | namespace tesseract { 26 | 27 | #ifndef GRAPHICS_DISABLED 28 | extern ScrollView *fx_win; 29 | #endif // !GRAPHICS_DISABLED 30 | void create_fx_win(); // make features win 31 | void clear_fx_win(); // clear features win (NOTE(review): was a copy of create_fx_win's comment; confirm) 32 | void create_fxdebug_win(); // make gradients win 33 | 34 | } // namespace tesseract 35 | 36 | #endif 37 | -------------------------------------------------------------------------------- /src/classify/float2int.h: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | ** Filename: float2int.h 3 | ** Purpose: Routines for converting float features to int features 4 | ** Author: Dan Johnson 5 | ** 6 | ** (c) Copyright Hewlett-Packard Company, 1988. 7 | ** Licensed under the Apache License, Version 2.0 (the "License"); 8 | ** you may not use this file except in compliance with the License. 9 | ** You may obtain a copy of the License at 10 | ** http://www.apache.org/licenses/LICENSE-2.0 11 | ** Unless required by applicable law or agreed to in writing, software 12 | ** distributed under the License is distributed on an "AS IS" BASIS, 13 | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | ** See the License for the specific language governing permissions and 15 | ** limitations under the License. 
16 | ******************************************************************************/ 17 | 18 | #ifndef FLOAT2INT_H 19 | #define FLOAT2INT_H 20 | 21 | /*----------------------------------------------------------------------------- 22 | Include Files and Type Defines 23 | -----------------------------------------------------------------------------*/ 24 | #include "intmatcher.h" 25 | #include "ocrfeatures.h" 26 | 27 | #define INT_FEAT_RANGE 256 28 | #define BASELINE_Y_SHIFT (0.25) 29 | 30 | #endif 31 | -------------------------------------------------------------------------------- /src/ccstruct/ccstruct.cpp: -------------------------------------------------------------------------------- 1 | /////////////////////////////////////////////////////////////////////// 2 | // File: ccstruct.cpp 3 | // Description: ccstruct class. 4 | // Author: Samuel Charron 5 | // 6 | // (C) Copyright 2006, Google Inc. 7 | // Licensed under the Apache License, Version 2.0 (the "License"); 8 | // you may not use this file except in compliance with the License. 9 | // You may obtain a copy of the License at 10 | // http://www.apache.org/licenses/LICENSE-2.0 11 | // Unless required by applicable law or agreed to in writing, software 12 | // distributed under the License is distributed on an "AS IS" BASIS, 13 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | // See the License for the specific language governing permissions and 15 | // limitations under the License. 16 | // 17 | /////////////////////////////////////////////////////////////////////// 18 | 19 | #include "ccstruct.h" 20 | 21 | namespace tesseract { 22 | 23 | // APPROXIMATIONS of the fractions of the character cell taken by 24 | // the descenders, ascenders, and x-height. 
25 | const double CCStruct::kDescenderFraction = 0.25; 26 | const double CCStruct::kXHeightFraction = 0.5; 27 | const double CCStruct::kAscenderFraction = 0.25; 28 | const double CCStruct::kXHeightCapRatio = 29 | CCStruct::kXHeightFraction / (CCStruct::kXHeightFraction + CCStruct::kAscenderFraction); 30 | 31 | } // namespace tesseract 32 | -------------------------------------------------------------------------------- /doc/generate_manpages.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # File: generate_manpages.sh 4 | # Description: Converts .asc files into man pages, etc. for Tesseract. 5 | # Author: eger@google.com (David Eger) 6 | # Created: 9 Feb 2012 7 | # 8 | # (C) Copyright 2012 Google Inc. 9 | # Licensed under the Apache License, Version 2.0 (the "License"); 10 | # you may not use this file except in compliance with the License. 11 | # You may obtain a copy of the License at 12 | # http://www.apache.org/licenses/LICENSE-2.0 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | man_xslt=http://docbook.sourceforge.net/release/xsl/current/manpages/docbook.xsl 20 | asciidoc=$(which asciidoc) 21 | xsltproc=$(which xsltproc) 22 | if [[ -z "${asciidoc}" ]] || [[ -z "${xsltproc}" ]]; then 23 | echo "Please make sure asciidoc and xsltproc are installed." 
24 | exit 1 25 | else 26 | for src in *.asc; do 27 | pagename=${src/.asc/} 28 | (${asciidoc} -d manpage ${src} && 29 | ${asciidoc} -d manpage -b docbook ${src} && 30 | ${xsltproc} --nonet ${man_xslt} ${pagename}.xml) || 31 | echo "Error generating ${pagename}" 32 | done 33 | fi 34 | exit 0 35 | -------------------------------------------------------------------------------- /m4/ax_split_version.m4: -------------------------------------------------------------------------------- 1 | # =========================================================================== 2 | # https://www.gnu.org/software/autoconf-archive/ax_split_version.html 3 | # =========================================================================== 4 | # 5 | # SYNOPSIS 6 | # 7 | # AX_SPLIT_VERSION 8 | # 9 | # DESCRIPTION 10 | # 11 | # Splits a version number in the format MAJOR.MINOR.POINT into its 12 | # separate components. 13 | # 14 | # Sets the variables. 15 | # 16 | # LICENSE 17 | # 18 | # Copyright (c) 2008 Tom Howard 19 | # 20 | # Copying and distribution of this file, with or without modification, are 21 | # permitted in any medium without royalty provided the copyright notice 22 | # and this notice are preserved. This file is offered as-is, without any 23 | # warranty. 
24 | 25 | #serial 10 26 | 27 | AC_DEFUN([AX_SPLIT_VERSION],[ 28 | AC_REQUIRE([AC_PROG_SED]) 29 | AX_MAJOR_VERSION=`echo "$VERSION" | $SED 's/\([[^.]][[^.]]*\).*/\1/'` 30 | AX_MINOR_VERSION=`echo "$VERSION" | $SED 's/[[^.]][[^.]]*.\([[^.]][[^.]]*\).*/\1/'` 31 | AX_POINT_VERSION=`echo "$VERSION" | $SED 's/[[^.]][[^.]]*.[[^.]][[^.]]*.\(.*\)/\1/'` 32 | AC_MSG_CHECKING([Major version]) 33 | AC_MSG_RESULT([$AX_MAJOR_VERSION]) 34 | AC_MSG_CHECKING([Minor version]) 35 | AC_MSG_RESULT([$AX_MINOR_VERSION]) 36 | AC_MSG_CHECKING([Point version]) 37 | AC_MSG_RESULT([$AX_POINT_VERSION]) 38 | ]) 39 | -------------------------------------------------------------------------------- /src/wordrec/findseam.h: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * 3 | * File: findseam.h 4 | * Author: Mark Seaman, SW Productivity 5 | * 6 | * (c) Copyright 1987, Hewlett-Packard Company. 7 | ** Licensed under the Apache License, Version 2.0 (the "License"); 8 | ** you may not use this file except in compliance with the License. 9 | ** You may obtain a copy of the License at 10 | ** http://www.apache.org/licenses/LICENSE-2.0 11 | ** Unless required by applicable law or agreed to in writing, software 12 | ** distributed under the License is distributed on an "AS IS" BASIS, 13 | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | ** See the License for the specific language governing permissions and 15 | ** limitations under the License. 16 | * 17 | *****************************************************************************/ 18 | 19 | #ifndef FINDSEAM_H 20 | #define FINDSEAM_H 21 | 22 | #include "chop.h" 23 | #include "genericheap.h" 24 | #include "kdpair.h" 25 | #include "seam.h" 26 | 27 | namespace tesseract { 28 | 29 | // The SeamPair elements own their SEAMs and delete them upon destruction. 
30 | using SeamPair = KDPtrPairInc<float, SEAM>; 31 | using SeamQueue = GenericHeap<SeamPair>; 32 | 33 | using SeamDecPair = KDPtrPairDec<float, SEAM>; 34 | using SeamPile = GenericHeap<SeamDecPair>; 35 | 36 | } // namespace tesseract 37 | 38 | #endif 39 | -------------------------------------------------------------------------------- /src/ccmain/control.h: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | * File: control.h (Formerly control.h) 3 | * Description: Module-independent matcher controller. 4 | * Author: Ray Smith 5 | * Created: Thu Apr 23 11:09:58 BST 1992 6 | * 7 | * (C) Copyright 1992, Hewlett-Packard Ltd. 8 | ** Licensed under the Apache License, Version 2.0 (the "License"); 9 | ** you may not use this file except in compliance with the License. 10 | ** You may obtain a copy of the License at 11 | ** http://www.apache.org/licenses/LICENSE-2.0 12 | ** Unless required by applicable law or agreed to in writing, software 13 | ** distributed under the License is distributed on an "AS IS" BASIS, 14 | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | ** See the License for the specific language governing permissions and 16 | ** limitations under the License. 17 | * 18 | **********************************************************************/ 19 | 20 | /** 21 | * @file control.h 22 | * Module-independent matcher controller. 23 | */ 24 | 25 | #ifndef CONTROL_H 26 | #define CONTROL_H 27 | 28 | enum ACCEPTABLE_WERD_TYPE { 29 | AC_UNACCEPTABLE, ///< Unacceptable word 30 | AC_LOWER_CASE, ///< ALL lower case 31 | AC_UPPER_CASE, ///< ALL upper case 32 | AC_INITIAL_CAP, ///< ALL but initial lc 33 | AC_LC_ABBREV, ///< a.b.c. 34 | AC_UC_ABBREV ///< A.B.C. 
35 | }; 36 | 37 | #endif 38 | -------------------------------------------------------------------------------- /src/ccmain/output.h: -------------------------------------------------------------------------------- 1 | /****************************************************************** 2 | * File: output.h (Formerly output.h) 3 | * Description: Output pass 4 | * Author: Phil Cheatle 5 | * Created: Thu Aug 4 10:56:08 BST 1994 6 | * 7 | * (C) Copyright 1994, Hewlett-Packard Ltd. 8 | ** Licensed under the Apache License, Version 2.0 (the "License"); 9 | ** you may not use this file except in compliance with the License. 10 | ** You may obtain a copy of the License at 11 | ** http://www.apache.org/licenses/LICENSE-2.0 12 | ** Unless required by applicable law or agreed to in writing, software 13 | ** distributed under the License is distributed on an "AS IS" BASIS, 14 | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | ** See the License for the specific language governing permissions and 16 | ** limitations under the License. 17 | * 18 | **********************************************************************/ 19 | 20 | #ifndef OUTPUT_H 21 | #define OUTPUT_H 22 | 23 | namespace tesseract { 24 | 25 | class BLOCK; 26 | class WERD; 27 | 28 | /** test line ends */ 29 | char determine_newline_type(WERD *word, ///< word to do 30 | BLOCK *block, ///< current block 31 | WERD *next_word, ///< next word 32 | BLOCK *next_block ///< block of next word 33 | ); 34 | 35 | } // namespace tesseract 36 | 37 | #endif 38 | -------------------------------------------------------------------------------- /unittest/lstm_squashed_test.cc: -------------------------------------------------------------------------------- 1 | // (C) Copyright 2017, Google Inc. 2 | // Licensed under the Apache License, Version 2.0 (the "License"); 3 | // you may not use this file except in compliance with the License. 
4 | // You may obtain a copy of the License at 5 | // http://www.apache.org/licenses/LICENSE-2.0 6 | // Unless required by applicable law or agreed to in writing, software 7 | // distributed under the License is distributed on an "AS IS" BASIS, 8 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 9 | // See the License for the specific language governing permissions and 10 | // limitations under the License. 11 | 12 | #include "lstm_test.h" 13 | 14 | namespace tesseract { 15 | 16 | // Tests that a Squashed network learns correctly. 17 | // Almost as fast as the 2d-lstm. 18 | TEST_F(LSTMTrainerTest, TestSquashed) { 19 | // A 2-layer LSTM with a Squashed feature-extracting LSTM on the bottom, and 20 | // a small convolution/maxpool below that. 21 | // Match training conditions to those typically used with this spec: 22 | // recoding on, adam on. 23 | SetupTrainerEng("[1,32,0,1 Ct3,3,16 Mp3,3 Lfys48 Lbx96 O1c1]", "SQU-2-layer-lstm", 24 | /*recode*/ true, /*adam*/ true); 25 | double lstm_2d_err = TrainIterations(kTrainerIterations * 3 / 2); 26 | EXPECT_LT(lstm_2d_err, 80); 27 | LOG(INFO) << "********** < 80 ************\n"; 28 | TestIntMode(kTrainerIterations); 29 | } 30 | 31 | } // namespace tesseract. 32 | -------------------------------------------------------------------------------- /src/classify/mfdefs.h: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | ** Filename: mfdefs.h 3 | ** Purpose: Definition of micro-features 4 | ** Author: Dan Johnson 5 | ** 6 | ** (c) Copyright Hewlett-Packard Company, 1988. 7 | ** Licensed under the Apache License, Version 2.0 (the "License"); 8 | ** you may not use this file except in compliance with the License. 
9 | ** You may obtain a copy of the License at 10 | ** http://www.apache.org/licenses/LICENSE-2.0 11 | ** Unless required by applicable law or agreed to in writing, software 12 | ** distributed under the License is distributed on an "AS IS" BASIS, 13 | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | ** See the License for the specific language governing permissions and 15 | ** limitations under the License. 16 | ******************************************************************************/ 17 | #ifndef MFDEFS_H 18 | #define MFDEFS_H 19 | 20 | #include <array> 21 | #include <forward_list> 22 | 23 | namespace tesseract { 24 | 25 | enum class MicroFeatureParameter { 26 | MFXPosition, 27 | MFYPosition, 28 | MFLength, 29 | MFDirection, 30 | MFBulge1, 31 | MFBulge2, 32 | 33 | MFCount // For array sizes. 34 | }; 35 | 36 | using MicroFeature = std::array<float, static_cast<int>(MicroFeatureParameter::MFCount)>; // one slot per MicroFeatureParameter 37 | using MICROFEATURES = std::forward_list<MicroFeature>; // singly linked list of micro-features 38 | 39 | } // namespace tesseract 40 | 41 | #endif 42 | -------------------------------------------------------------------------------- /src/textord/tordmain.h: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | * File: tordmain.h (Formerly textordp.h) 3 | * Description: C++ top level textord code. 4 | * Author: Ray Smith 5 | * 6 | * (C) Copyright 1992, Hewlett-Packard Ltd. 7 | ** Licensed under the Apache License, Version 2.0 (the "License"); 8 | ** you may not use this file except in compliance with the License. 9 | ** You may obtain a copy of the License at 10 | ** http://www.apache.org/licenses/LICENSE-2.0 11 | ** Unless required by applicable law or agreed to in writing, software 12 | ** distributed under the License is distributed on an "AS IS" BASIS, 13 | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | ** See the License for the specific language governing permissions and 15 | ** limitations under the License. 
16 | * 17 | **********************************************************************/ 18 | 19 | #ifndef TORDMAIN_H 20 | #define TORDMAIN_H 21 | 22 | #include "blobbox.h" 23 | #include "blobs.h" 24 | #include "ocrblock.h" 25 | #include "params.h" 26 | 27 | struct Pix; 28 | 29 | namespace tesseract { 30 | 31 | class Tesseract; 32 | 33 | void SetBlobStrokeWidth(Image pix, BLOBNBOX *blob); 34 | void assign_blobs_to_blocks2(Image pix, BLOCK_LIST *blocks, TO_BLOCK_LIST *port_blocks); 35 | 36 | void tweak_row_baseline(ROW *row, double blshift_maxshift, double blshift_xfraction); 37 | 38 | } // namespace tesseract 39 | 40 | #endif 41 | -------------------------------------------------------------------------------- /src/training/unicharset/validate_grapheme.h: -------------------------------------------------------------------------------- 1 | #ifndef TESSERACT_TRAINING_VALIDATE_GRAPHEME_H_ 2 | #define TESSERACT_TRAINING_VALIDATE_GRAPHEME_H_ 3 | 4 | #include "validator.h" 5 | 6 | namespace tesseract { 7 | 8 | // Subclass of Validator that validates and segments generic unicode into 9 | // grapheme clusters, including Latin with diacritics. 10 | class ValidateGrapheme : public Validator { 11 | public: 12 | ValidateGrapheme(ViramaScript script, bool report_errors) : Validator(script, report_errors) {} 13 | ~ValidateGrapheme() override = default; 14 | 15 | protected: 16 | // Consumes the next Grapheme in codes_[codes_used_++...] and copies it to 17 | // parts_ and output_. Returns true if a valid Grapheme was consumed, 18 | // otherwise does not increment codes_used_. 19 | bool ConsumeGraphemeIfValid() override; 20 | // Returns the CharClass corresponding to the given Unicode ch. 21 | CharClass UnicodeToCharClass(char32 ch) const override; 22 | 23 | private: 24 | // Helper returns true if the sequence prev_ch,ch is invalid. 25 | bool IsBadlyFormed(char32 prev_ch, char32 ch); 26 | // Helper returns true if the sequence prev_ch,ch is an invalid Indic vowel. 
27 | static bool IsBadlyFormedIndicVowel(char32 prev_ch, char32 ch); 28 | // Helper returns true if the sequence prev_ch,ch is invalid Thai. 29 | static bool IsBadlyFormedThai(char32 prev_ch, char32 ch); 30 | }; 31 | 32 | } // namespace tesseract 33 | 34 | #endif // TESSERACT_TRAINING_VALIDATE_GRAPHEME_H_ 35 | -------------------------------------------------------------------------------- /doc/merge_unicharsets.1.asc: -------------------------------------------------------------------------------- 1 | MERGE_UNICHARSETS(1) 2 | ==================== 3 | :doctype: manpage 4 | 5 | NAME 6 | ---- 7 | merge_unicharsets - Simple tool to merge two or more unicharsets. 8 | 9 | SYNOPSIS 10 | -------- 11 | *merge_unicharsets* 'unicharset-in-1' ... 'unicharset-in-n' 'unicharset-out' 12 | 13 | DESCRIPTION 14 | ----------- 15 | merge_unicharsets(1) is a simple tool to merge two or more unicharsets. 16 | It could be used to create a combined unicharset for a script-level engine, 17 | like the new Latin or Devanagari. 18 | 19 | IN/OUT ARGUMENTS 20 | ---------------- 21 | 'unicharset-in-1':: 22 | (Input) The name of the first unicharset file to be merged. 23 | 24 | 'unicharset-in-n':: 25 | (Input) The name of the nth unicharset file to be merged. 26 | 27 | 'unicharset-out':: 28 | (Output) The name of the merged unicharset file. 29 | 30 | HISTORY 31 | ------- 32 | merge_unicharsets(1) was first made available for tesseract 4.00.00alpha. 33 | 34 | RESOURCES 35 | --------- 36 | Main web site: <https://github.com/tesseract-ocr> + 37 | Information on training tesseract LSTM: <https://tesseract-ocr.github.io/tessdoc/TrainingTesseract-4.00.html> 38 | 39 | SEE ALSO 40 | -------- 41 | tesseract(1) 42 | 43 | COPYING 44 | ------- 45 | Copyright \(C) 2012 Google, Inc. 46 | Licensed under the Apache License, Version 2.0 47 | 48 | AUTHOR 49 | ------ 50 | The Tesseract OCR engine was written by Ray Smith and his research groups 51 | at Hewlett Packard (1985-1995) and Google (2006-2018). 
52 | -------------------------------------------------------------------------------- /src/classify/normfeat.h: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | ** Filename: normfeat.h 3 | ** Purpose: Definition of character normalization features. 4 | ** Author: Dan Johnson 5 | ** History: 12/14/90, DSJ, Created. 6 | ** 7 | ** (c) Copyright Hewlett-Packard Company, 1988. 8 | ** Licensed under the Apache License, Version 2.0 (the "License"); 9 | ** you may not use this file except in compliance with the License. 10 | ** You may obtain a copy of the License at 11 | ** http://www.apache.org/licenses/LICENSE-2.0 12 | ** Unless required by applicable law or agreed to in writing, software 13 | ** distributed under the License is distributed on an "AS IS" BASIS, 14 | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | ** See the License for the specific language governing permissions and 16 | ** limitations under the License. 
17 | *****************************************************************************/ 18 | 19 | #ifndef NORMFEAT_H 20 | #define NORMFEAT_H 21 | 22 | #include "ocrfeatures.h" 23 | 24 | namespace tesseract { 25 | 26 | #define LENGTH_COMPRESSION (10.0) 27 | 28 | struct INT_FX_RESULT_STRUCT; 29 | 30 | typedef enum { CharNormY, CharNormLength, CharNormRx, CharNormRy } NORM_PARAM_NAME; 31 | 32 | float ActualOutlineLength(FEATURE Feature); 33 | 34 | FEATURE_SET ExtractCharNormFeatures(const INT_FX_RESULT_STRUCT &fx_info); 35 | 36 | } // namespace tesseract 37 | 38 | #endif 39 | -------------------------------------------------------------------------------- /src/ccmain/fixspace.h: -------------------------------------------------------------------------------- 1 | /****************************************************************** 2 | * File: fixspace.h (Formerly fixspace.h) 3 | * Description: Implements a pass over the page res, exploring the alternative 4 | * spacing possibilities, trying to use context to improve the 5 | * word spacing 6 | * Author: Phil Cheatle 7 | * Created: Thu Oct 21 11:38:43 BST 1993 8 | * 9 | * (C) Copyright 1993, Hewlett-Packard Ltd. 10 | ** Licensed under the Apache License, Version 2.0 (the "License"); 11 | ** you may not use this file except in compliance with the License. 12 | ** You may obtain a copy of the License at 13 | ** http://www.apache.org/licenses/LICENSE-2.0 14 | ** Unless required by applicable law or agreed to in writing, software 15 | ** distributed under the License is distributed on an "AS IS" BASIS, 16 | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | ** See the License for the specific language governing permissions and 18 | ** limitations under the License. 
19 | * 20 | **********************************************************************/ 21 | 22 | #ifndef FIXSPACE_H 23 | #define FIXSPACE_H 24 | 25 | namespace tesseract { 26 | 27 | class WERD_RES; 28 | class WERD_RES_LIST; 29 | 30 | void initialise_search(WERD_RES_LIST &src_list, WERD_RES_LIST &new_list); 31 | void transform_to_next_perm(WERD_RES_LIST &words); 32 | void fixspace_dbg(WERD_RES *word); 33 | 34 | } // namespace tesseract 35 | 36 | #endif 37 | -------------------------------------------------------------------------------- /src/ccstruct/crakedge.h: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | * File: crakedge.h (Formerly: crkedge.h) 3 | * Description: Structures for the Crack following edge detector. 4 | * Author: Ray Smith 5 | * Created: Fri Mar 22 16:06:38 GMT 1991 6 | * 7 | * (C) Copyright 1991, Hewlett-Packard Ltd. 8 | ** Licensed under the Apache License, Version 2.0 (the "License"); 9 | ** you may not use this file except in compliance with the License. 10 | ** You may obtain a copy of the License at 11 | ** http://www.apache.org/licenses/LICENSE-2.0 12 | ** Unless required by applicable law or agreed to in writing, software 13 | ** distributed under the License is distributed on an "AS IS" BASIS, 14 | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | ** See the License for the specific language governing permissions and 16 | ** limitations under the License. 
17 | * 18 | **********************************************************************/ 19 | 20 | #ifndef CRAKEDGE_H 21 | #define CRAKEDGE_H 22 | 23 | #include "mod128.h" 24 | #include "points.h" 25 | 26 | namespace tesseract { 27 | 28 | class CRACKEDGE { 29 | public: 30 | CRACKEDGE() = default; 31 | 32 | ICOORD pos; /*position of crack */ 33 | int8_t stepx; // edge step 34 | int8_t stepy; 35 | int8_t stepdir; // chaincode 36 | CRACKEDGE *prev; /*previous point */ 37 | CRACKEDGE *next; /*next point */ 38 | }; 39 | 40 | } // namespace tesseract 41 | 42 | #endif 43 | -------------------------------------------------------------------------------- /src/ccstruct/blread.h: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | * File: blread.h (Formerly pdread.h) 3 | * Description: Friend function of BLOCK to read the uscan pd file. 4 | * Author: Ray Smith 5 | * 6 | * (C) Copyright 1991, Hewlett-Packard Ltd. 7 | ** Licensed under the Apache License, Version 2.0 (the "License"); 8 | ** you may not use this file except in compliance with the License. 9 | ** You may obtain a copy of the License at 10 | ** http://www.apache.org/licenses/LICENSE-2.0 11 | ** Unless required by applicable law or agreed to in writing, software 12 | ** distributed under the License is distributed on an "AS IS" BASIS, 13 | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | ** See the License for the specific language governing permissions and 15 | ** limitations under the License. 
16 | * 17 | **********************************************************************/ 18 | 19 | #ifndef BLREAD_H 20 | #define BLREAD_H 21 | 22 | #include <cstdint> // for int32_t 23 | #include <string> // for std::string 24 | 25 | namespace tesseract { 26 | 27 | class BLOCK_LIST; 28 | 29 | bool read_unlv_file( // print list of sides 30 | std::string &name, // basename of file 31 | int32_t xsize, // image size 32 | int32_t ysize, // image size 33 | BLOCK_LIST *blocks // output list 34 | ); 35 | 36 | void FullPageBlock(int width, int height, BLOCK_LIST *blocks); 37 | 38 | } // namespace tesseract 39 | 40 | #endif 41 | -------------------------------------------------------------------------------- /doc/unicharset_extractor.1.asc: -------------------------------------------------------------------------------- 1 | UNICHARSET_EXTRACTOR(1) 2 | ======================= 3 | 4 | NAME 5 | ---- 6 | unicharset_extractor - Reads box or plain text files to extract the unicharset. 7 | 8 | SYNOPSIS 9 | -------- 10 | *unicharset_extractor* [--output_unicharset filename] [--norm_mode mode] box_or_text_file [...] 11 | 12 | Where mode means: 13 | 1=combine graphemes (use for Latin and other simple scripts) 14 | 2=split graphemes (use for Indic/Khmer/Myanmar) 15 | 3=pure unicode (use for Arabic/Hebrew/Thai/Tibetan) 16 | 17 | DESCRIPTION 18 | ----------- 19 | Tesseract needs to know the set of possible characters it can output. 20 | To generate the unicharset data file, use the unicharset_extractor 21 | program on training pages bounding box files or a plain text file: 22 | 23 | unicharset_extractor fontfile_1.box fontfile_2.box ... 24 | 25 | The unicharset will be put into the file './unicharset' if no output filename is provided. 26 | 27 | *NOTE* Use the appropriate norm_mode based on the language. 28 | 29 | SEE ALSO 30 | -------- 31 | tesseract(1), unicharset(5) 32 | 33 | 34 | 35 | HISTORY 36 | ------- 37 | unicharset_extractor first appeared in Tesseract 2.00. 
38 | 39 | COPYING 40 | ------- 41 | Copyright \(C) 2006, Google Inc. 42 | Licensed under the Apache License, Version 2.0 43 | 44 | AUTHOR 45 | ------ 46 | The Tesseract OCR engine was written by Ray Smith and his research groups 47 | at Hewlett Packard (1985-1995) and Google (2006-2018). 48 | -------------------------------------------------------------------------------- /src/ccutil/tesstypes.h: -------------------------------------------------------------------------------- 1 | /////////////////////////////////////////////////////////////////////// 2 | // File: tesstypes.h 3 | // Description: Simple data types used by Tesseract code. 4 | // Author: Stefan Weil 5 | // 6 | // Licensed under the Apache License, Version 2.0 (the "License"); 7 | // you may not use this file except in compliance with the License. 8 | // You may obtain a copy of the License at 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | /////////////////////////////////////////////////////////////////////// 16 | 17 | #ifndef TESSERACT_TESSTYPES_H 18 | #define TESSERACT_TESSTYPES_H 19 | 20 | #ifdef HAVE_CONFIG_H 21 | # include "config_auto.h" // FAST_FLOAT 22 | #endif 23 | 24 | #include <cstdint> // for int16_t, int32_t 25 | 26 | namespace tesseract { 27 | 28 | // Image dimensions (width and height, coordinates). 29 | #if defined(LARGE_IMAGES) 30 | using TDimension = int32_t; 31 | #else 32 | using TDimension = int16_t; 33 | #endif 34 | 35 | // Floating point data type used for LSTM calculations. 
36 | #if defined(FAST_FLOAT) 37 | using TFloat = float; 38 | #else 39 | using TFloat = double; 40 | #endif 41 | 42 | } // namespace tesseract 43 | 44 | #endif // TESSERACT_TESSTYPES_H 45 | -------------------------------------------------------------------------------- /snap/snapcraft.yaml: -------------------------------------------------------------------------------- 1 | name: tesseract 2 | version: git 3 | summary: open source optical character recognition engine 4 | description: | 5 | Tesseract has unicode (UTF-8) support, and can recognize more than 100 6 | languages "out of the box". It can be trained to recognize other languages. 7 | Tesseract supports various output formats: plain-text, hocr(html), pdf. 8 | 9 | If you want to access the files under /media/* or /run/media/* you'll have 10 | to connect the snap to the `core` snap's `removable-media` interface: 11 | 12 | $ sudo snap connect tesseract:removable-media 13 | 14 | grade: stable # must be 'stable' to release into candidate/stable channels 15 | confinement: strict 16 | base: core22 17 | 18 | apps: 19 | tesseract: 20 | command: usr/local/bin/tesseract 21 | environment: 22 | TESSDATA_PREFIX: $SNAP_USER_COMMON 23 | plugs: 24 | - home 25 | - removable-media 26 | 27 | parts: 28 | tesseract: 29 | source: . 
30 | plugin: autotools 31 | build-packages: 32 | - pkg-config 33 | - libpng-dev 34 | - libjpeg-dev 35 | - libtiff-dev 36 | - zlib1g-dev 37 | - libicu-dev 38 | - libpango1.0-dev 39 | - libcairo2-dev 40 | stage-packages: 41 | - libgomp1 42 | after: [leptonica] 43 | leptonica: 44 | source: https://github.com/DanBloomberg/leptonica/archive/1.83.1.tar.gz 45 | plugin: autotools 46 | stage-packages: 47 | - libjbig0 48 | - libjpeg-turbo8 49 | - libopenjp2-7 50 | - libtiff5 51 | -------------------------------------------------------------------------------- /src/ccmain/reject.h: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | * File: reject.h 3 | * Description: Rejection functions used in tessedit 4 | * Author: Phil Cheatle 5 | * Created: Wed Sep 23 16:50:21 BST 1992 6 | * 7 | * (C) Copyright 1992, Hewlett-Packard Ltd. 8 | ** Licensed under the Apache License, Version 2.0 (the "License"); 9 | ** you may not use this file except in compliance with the License. 10 | ** You may obtain a copy of the License at 11 | ** http://www.apache.org/licenses/LICENSE-2.0 12 | ** Unless required by applicable law or agreed to in writing, software 13 | ** distributed under the License is distributed on an "AS IS" BASIS, 14 | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | ** See the License for the specific language governing permissions and 16 | ** limitations under the License. 
17 | * 18 | **********************************************************************/ 19 | 20 | #ifndef REJECT_H 21 | #define REJECT_H 22 | 23 | namespace tesseract { 24 | 25 | class WERD_CHOICE; 26 | class WERD_RES; 27 | 28 | void reject_blanks(WERD_RES *word); 29 | void reject_poor_matches(WERD_RES *word); 30 | float compute_reject_threshold(WERD_CHOICE *word); 31 | bool word_contains_non_1_digit(const char *word, const char *word_lengths); 32 | void dont_allow_1Il(WERD_RES *word); 33 | void flip_hyphens(WERD_RES *word); 34 | void flip_0O(WERD_RES *word); 35 | bool non_0_digit(const char *str, int length); 36 | 37 | } // namespace tesseract 38 | 39 | #endif 40 | -------------------------------------------------------------------------------- /src/textord/gap_map.h: -------------------------------------------------------------------------------- 1 | // Licensed under the Apache License, Version 2.0 (the "License"); 2 | // you may not use this file except in compliance with the License. 3 | // You may obtain a copy of the License at 4 | // http://www.apache.org/licenses/LICENSE-2.0 5 | // Unless required by applicable law or agreed to in writing, software 6 | // distributed under the License is distributed on an "AS IS" BASIS, 7 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 8 | // See the License for the specific language governing permissions and 9 | // limitations under the License. 10 | 11 | #ifndef GAP_MAP_H 12 | #define GAP_MAP_H 13 | 14 | #include "blobbox.h" 15 | 16 | namespace tesseract { 17 | 18 | class GAPMAP { 19 | public: 20 | GAPMAP( // constructor 21 | TO_BLOCK *block); 22 | 23 | ~GAPMAP() { // destructor 24 | delete[] map; 25 | } 26 | 27 | bool table_gap( // Is gap a table? 
28 | int16_t left, // From here 29 | int16_t right); // To here 30 | 31 | private: 32 | int16_t total_rows; // in block 33 | int16_t min_left; // Left extreme 34 | int16_t max_right; // Right extreme 35 | int16_t bucket_size; // half an x ht 36 | int16_t *map; // empty counts 37 | int16_t map_max; // map[0..max_map] defined 38 | bool any_tabs; 39 | }; 40 | 41 | /*-----------------------------*/ 42 | 43 | extern BOOL_VAR_H(gapmap_debug); 44 | extern BOOL_VAR_H(gapmap_use_ends); 45 | extern BOOL_VAR_H(gapmap_no_isolated_quanta); 46 | extern double_VAR_H(gapmap_big_gaps); 47 | 48 | } // namespace tesseract 49 | 50 | #endif 51 | -------------------------------------------------------------------------------- /cmake/SourceGroups.cmake: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # http://www.apache.org/licenses/LICENSE-2.0 5 | # Unless required by applicable law or agreed to in writing, software 6 | # distributed under the License is distributed on an "AS IS" BASIS, 7 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 8 | # See the License for the specific language governing permissions and 9 | # limitations under the License. 
#include(SourceGroups)

# Regex-based source groups used to organise files in IDE project views.
set(SSRC ${CMAKE_SOURCE_DIR})
set(BSRC ${CMAKE_BINARY_DIR})

# Patterns for C++ sources and headers; the "$" anchors the file extension.
set(_CPP ".*\\.cpp")
set(CPP "${_CPP}$")

set(_H ".*\\.h")
set(H "${_H}$")

set(H_CPP "(${H}|${CPP})")

source_group("Resource files" ".*\\.(rc|ico)")

# The library modules live below <source root>/src/, so the patterns must
# include that directory; rooted directly at ${SSRC} they match no file.
source_group("api" "${SSRC}/src/api/${H_CPP}")
source_group("arch" "${SSRC}/src/arch/${H_CPP}")
source_group("ccmain" "${SSRC}/src/ccmain/${H_CPP}")
source_group("ccstruct" "${SSRC}/src/ccstruct/${H_CPP}")
source_group("ccutil" "${SSRC}/src/ccutil/${H_CPP}")
source_group("classify" "${SSRC}/src/classify/${H_CPP}")
source_group("cutil" "${SSRC}/src/cutil/${H_CPP}")
source_group("dict" "${SSRC}/src/dict/${H_CPP}")
source_group("lstm" "${SSRC}/src/lstm/${H_CPP}")
source_group("textord" "${SSRC}/src/textord/${H_CPP}")
source_group("viewer" "${SSRC}/src/viewer/${H_CPP}")
source_group("wordrec" "${SSRC}/src/wordrec/${H_CPP}")
/**
 * The events which can happen in ScrollView and be transferred to the client.
 *
 * <p>They are the same events as on the client side part of ScrollView
 * (defined in ScrollView.h).
 * NOTE(review): presumably the order of the constants has to stay in step
 * with that client-side list - confirm before reordering or inserting.
 *
 * @author wanke@google.com
 */
public enum SVEventType {
  SVET_DESTROY, // Window has been destroyed by user.
  SVET_EXIT, // User has destroyed the last window by clicking on the 'X'
  SVET_CLICK, // Any button pressed that is not a popup trigger.
  SVET_SELECTION, // Left button selection.
  SVET_INPUT, // Any kind of input
  SVET_MOUSE, // The mouse has moved with a button pressed.
  SVET_MOTION, // The mouse has moved with no button pressed.
  SVET_HOVER, // The mouse has stayed still for a second.
  SVET_POPUP, // A command selected through a popup menu
  SVET_MENU; // A command selected through the menubar
}
18 | * 19 | * @author wanke@google.com 20 | */ 21 | 22 | import javax.swing.JMenu; 23 | 24 | /** Constructs a new submenu which can hold other entries. */ 25 | class SVSubMenuItem extends SVAbstractMenuItem { 26 | public SVSubMenuItem(String name, JMenu jli) { 27 | super(-1, name, jli); 28 | } 29 | /** Adds a child entry to the submenu. */ 30 | @Override 31 | public void add(SVAbstractMenuItem mli) { 32 | mi.add(mli.mi); 33 | } 34 | /** Adds a child menu to the submenu (or root node). */ 35 | @Override 36 | public void add(JMenu jli) { 37 | mi.add(jli); 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /src/classify/clusttool.h: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | ** Filename: clusttool.h 3 | ** Purpose: Definition of clustering utility tools 4 | ** Author: Dan Johnson 5 | ** 6 | ** (c) Copyright Hewlett-Packard Company, 1988. 7 | ** Licensed under the Apache License, Version 2.0 (the "License"); 8 | ** you may not use this file except in compliance with the License. 9 | ** You may obtain a copy of the License at 10 | ** http://www.apache.org/licenses/LICENSE-2.0 11 | ** Unless required by applicable law or agreed to in writing, software 12 | ** distributed under the License is distributed on an "AS IS" BASIS, 13 | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | ** See the License for the specific language governing permissions and 15 | ** limitations under the License. 
16 | ******************************************************************************/ 17 | 18 | #ifndef TESSERACT_CLASSIFY_CLUSTTOOL_H_ 19 | #define TESSERACT_CLASSIFY_CLUSTTOOL_H_ 20 | 21 | #include "cluster.h" 22 | 23 | #include "serialis.h" 24 | 25 | #include 26 | 27 | namespace tesseract { 28 | 29 | uint16_t ReadSampleSize(tesseract::TFile *fp); 30 | 31 | PARAM_DESC *ReadParamDesc(tesseract::TFile *fp, uint16_t N); 32 | 33 | PROTOTYPE *ReadPrototype(tesseract::TFile *fp, uint16_t N); 34 | 35 | TESS_API 36 | void WriteParamDesc(FILE *File, uint16_t N, const PARAM_DESC ParamDesc[]); 37 | 38 | TESS_API 39 | void WritePrototype(FILE *File, uint16_t N, PROTOTYPE *Proto); 40 | 41 | } // namespace tesseract 42 | 43 | #endif // TESSERACT_CLASSIFY_CLUSTTOOL_H_ 44 | -------------------------------------------------------------------------------- /src/ccstruct/params_training_featdef.cpp: -------------------------------------------------------------------------------- 1 | /////////////////////////////////////////////////////////////////////// 2 | // File: params_training_featdef.cpp 3 | // Description: Utility functions for params training features. 4 | // Author: David Eger 5 | // Created: Mon Jun 11 11:26:42 PDT 2012 6 | // 7 | // (C) Copyright 2012, Google Inc. 8 | // Licensed under the Apache License, Version 2.0 (the "License"); 9 | // you may not use this file except in compliance with the License. 10 | // You may obtain a copy of the License at 11 | // http://www.apache.org/licenses/LICENSE-2.0 12 | // Unless required by applicable law or agreed to in writing, software 13 | // distributed under the License is distributed on an "AS IS" BASIS, 14 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | // See the License for the specific language governing permissions and 16 | // limitations under the License. 
17 | // 18 | /////////////////////////////////////////////////////////////////////// 19 | 20 | #include 21 | 22 | #include "params_training_featdef.h" 23 | 24 | namespace tesseract { 25 | 26 | int ParamsTrainingFeatureByName(const char *name) { 27 | if (name == nullptr) { 28 | return -1; 29 | } 30 | int array_size = 31 | sizeof(kParamsTrainingFeatureTypeName) / sizeof(kParamsTrainingFeatureTypeName[0]); 32 | for (int i = 0; i < array_size; i++) { 33 | if (kParamsTrainingFeatureTypeName[i] == nullptr) { 34 | continue; 35 | } 36 | if (strcmp(name, kParamsTrainingFeatureTypeName[i]) == 0) { 37 | return i; 38 | } 39 | } 40 | return -1; 41 | } 42 | 43 | } // namespace tesseract 44 | -------------------------------------------------------------------------------- /doc/set_unicharset_properties.1.asc: -------------------------------------------------------------------------------- 1 | SET_UNICHARSET_PROPERTIES(1) 2 | ============================ 3 | :doctype: manpage 4 | 5 | NAME 6 | ---- 7 | set_unicharset_properties - set properties about the unichars 8 | 9 | SYNOPSIS 10 | -------- 11 | *set_unicharset_properties* --U 'input_unicharsetfile' --script_dir '/path/to/langdata' --O 'output_unicharsetfile' 12 | 13 | DESCRIPTION 14 | ----------- 15 | set_unicharset_properties(1) reads a unicharset file, puts the result in a UNICHARSET object, fills it with properties about the unichars it contains and writes the result back to another unicharset file. 16 | 17 | OPTIONS 18 | ------- 19 | 20 | '--script_dir /path/to/langdata':: 21 | (Input) Specify the location of directory for universal script unicharsets and font xheights (type:string default:) 22 | 23 | '--U unicharsetfile':: 24 | (Input) Specify the location of the unicharset to load as input. 25 | 26 | '--O unicharsetfile':: 27 | (Output) Specify the location of the unicharset to be written with updated properties. 
28 | 29 | HISTORY 30 | ------- 31 | set_unicharset_properties(1) was first made available for tesseract version 3.03. 32 | 33 | RESOURCES 34 | --------- 35 | Main web site: + 36 | Information on training: 37 | 38 | SEE ALSO 39 | -------- 40 | tesseract(1) 41 | 42 | COPYING 43 | ------- 44 | Copyright \(C) 2012 Google, Inc. 45 | Licensed under the Apache License, Version 2.0 46 | 47 | AUTHOR 48 | ------ 49 | The Tesseract OCR engine was written by Ray Smith and his research groups 50 | at Hewlett Packard (1985-1995) and Google (2006-2018). 51 | -------------------------------------------------------------------------------- /src/textord/edgloop.h: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | * File: edgloop.h (Formerly edgeloop.h) 3 | * Description: Functions to clean up an outline before approximation. 4 | * Author: Ray Smith 5 | * 6 | * (C) Copyright 1991, Hewlett-Packard Ltd. 7 | ** Licensed under the Apache License, Version 2.0 (the "License"); 8 | ** you may not use this file except in compliance with the License. 9 | ** You may obtain a copy of the License at 10 | ** http://www.apache.org/licenses/LICENSE-2.0 11 | ** Unless required by applicable law or agreed to in writing, software 12 | ** distributed under the License is distributed on an "AS IS" BASIS, 13 | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | ** See the License for the specific language governing permissions and 15 | ** limitations under the License. 
16 | * 17 | **********************************************************************/ 18 | 19 | #ifndef EDGLOOP_H 20 | #define EDGLOOP_H 21 | 22 | #include "coutln.h" 23 | #include "crakedge.h" 24 | #include "params.h" 25 | #include "pdblock.h" 26 | #include "scrollview.h" 27 | 28 | namespace tesseract { 29 | 30 | #define BUCKETSIZE 16 31 | 32 | void complete_edge(CRACKEDGE *start, // start of loop 33 | C_OUTLINE_IT *outline_it); 34 | ScrollView::Color check_path_legal( // certify outline 35 | CRACKEDGE *start // start of loop 36 | ); 37 | int16_t loop_bounding_box( // get bounding box 38 | CRACKEDGE *&start, // edge loop 39 | ICOORD &botleft, // bounding box 40 | ICOORD &topright); 41 | 42 | } // namespace tesseract 43 | 44 | #endif 45 | -------------------------------------------------------------------------------- /.mailmap: -------------------------------------------------------------------------------- 1 | Amit Dovev 2 | 3 | Egor Pugin 4 | 5 | Jeff Breidenbach 6 | Jeff Breidenbach 7 | 8 | Jim O'Regan 9 | Jim O'Regan 10 | Jim O'Regan 11 | 12 | Ray Smith 13 | Ray Smith 14 | Ray Smith 15 | Ray Smith 16 | Ray Smith 17 | Ray Smith 18 | 19 | Shree Devi Kumar <5095331+Shreeshrii@users.noreply.github.com> 20 | Shree Devi Kumar <5095331+Shreeshrii@users.noreply.github.com> <5095331+Shreeshrii@users.noreply.github.com5095331+Shreeshrii@users.noreply.github.com> 21 | 22 | Stefan Weil 23 | Stefan Weil 24 | Stefan Weil 25 | Stefan Weil 26 | Stefan Weil 27 | Stefan Weil 28 | 29 | Zdenko Podobný 30 | Zdenko Podobný 31 | Zdenko Podobný 32 | Zdenko Podobný 33 | -------------------------------------------------------------------------------- /src/ccstruct/ccstruct.h: -------------------------------------------------------------------------------- 1 | /////////////////////////////////////////////////////////////////////// 2 | // File: ccstruct.h 3 | // Description: ccstruct class. 4 | // Author: Samuel Charron 5 | // 6 | // (C) Copyright 2006, Google Inc. 
// Extends CCUtil with a set of shared constants describing how a character
// cell is split vertically. Only the declarations are in this header; the
// values quoted in the trailing comments are the documented ones.
class CCStruct : public CCUtil {
public:
  // Globally accessible constants.
  // APPROXIMATIONS of the fractions of the character cell taken by
  // the descenders, ascenders, and x-height.
  static const double kDescenderFraction; // = 0.25;
  static const double kXHeightFraction;   // = 0.5;
  static const double kAscenderFraction;  // = 0.25;
  // Derived value giving the x-height as a fraction of cap-height.
  static const double kXHeightCapRatio; // = XHeight/(XHeight + Ascender).
};
-------------------------------------------------------------------------------- /src/ccutil/tprintf.h: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | * File: tprintf.h 3 | * Description: Trace version of printf - portable between UX and NT 4 | * Author: Phil Cheatle 5 | * 6 | * (C) Copyright 1995, Hewlett-Packard Ltd. 7 | ** Licensed under the Apache License, Version 2.0 (the "License"); 8 | ** you may not use this file except in compliance with the License. 9 | ** You may obtain a copy of the License at 10 | ** http://www.apache.org/licenses/LICENSE-2.0 11 | ** Unless required by applicable law or agreed to in writing, software 12 | ** distributed under the License is distributed on an "AS IS" BASIS, 13 | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | ** See the License for the specific language governing permissions and 15 | ** limitations under the License. 16 | * 17 | **********************************************************************/ 18 | 19 | #ifndef TESSERACT_CCUTIL_TPRINTF_H 20 | #define TESSERACT_CCUTIL_TPRINTF_H 21 | 22 | #include "params.h" // for BOOL_VAR_H 23 | #include // for TESS_API 24 | 25 | namespace tesseract { 26 | 27 | #if !defined(__GNUC__) && !defined(__attribute__) 28 | # define __attribute__(attr) // compiler without support for __attribute__ 29 | #endif 30 | 31 | // Disable some log messages by setting log_level > 0. 32 | extern TESS_API INT_VAR_H(log_level); 33 | 34 | // Main logging function. 35 | extern TESS_API void tprintf( // Trace printf 36 | const char *format, ...) 
// Message 37 | __attribute__((format(printf, 1, 2))); 38 | 39 | } // namespace tesseract 40 | 41 | #undef __attribute__ 42 | 43 | #endif // define TESSERACT_CCUTIL_TPRINTF_H 44 | -------------------------------------------------------------------------------- /src/ccstruct/image.cpp: -------------------------------------------------------------------------------- 1 | /////////////////////////////////////////////////////////////////////// 2 | // Licensed under the Apache License, Version 2.0 (the "License"); 3 | // you may not use this file except in compliance with the License. 4 | // You may obtain a copy of the License at 5 | // http://www.apache.org/licenses/LICENSE-2.0 6 | // Unless required by applicable law or agreed to in writing, software 7 | // distributed under the License is distributed on an "AS IS" BASIS, 8 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 9 | // See the License for the specific language governing permissions and 10 | // limitations under the License. 11 | /////////////////////////////////////////////////////////////////////// 12 | 13 | // Include automatically generated configuration file if running autoconf. 14 | #ifdef HAVE_CONFIG_H 15 | # include "config_auto.h" 16 | #endif 17 | 18 | #include "image.h" 19 | 20 | #include 21 | 22 | namespace tesseract { 23 | 24 | Image Image::clone() const { 25 | return pix_ ? 
pixClone(pix_) : nullptr; 26 | } 27 | 28 | Image Image::copy() const { 29 | return pixCopy(nullptr, pix_); 30 | } 31 | 32 | void Image::destroy() { 33 | pixDestroy(&pix_); 34 | } 35 | 36 | bool Image::isZero() const { 37 | l_int32 r = 0; 38 | pixZero(pix_, &r); 39 | return r == 1; 40 | } 41 | 42 | Image Image::operator|(Image i) const { 43 | return pixOr(nullptr, pix_, i); 44 | } 45 | 46 | Image &Image::operator|=(Image i) { 47 | pixOr(pix_, pix_, i); 48 | return *this; 49 | } 50 | 51 | Image Image::operator&(Image i) const { 52 | return pixAnd(nullptr, pix_, i); 53 | } 54 | 55 | Image &Image::operator&=(Image i) { 56 | pixAnd(pix_, pix_, i); 57 | return *this; 58 | } 59 | 60 | } 61 | -------------------------------------------------------------------------------- /src/classify/outfeat.h: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | ** Filename: outfeat.h 3 | ** Purpose: Definition of outline features. 4 | ** Author: Dan Johnson 5 | ** 6 | ** (c) Copyright Hewlett-Packard Company, 1988. 7 | ** Licensed under the Apache License, Version 2.0 (the "License"); 8 | ** you may not use this file except in compliance with the License. 9 | ** You may obtain a copy of the License at 10 | ** http://www.apache.org/licenses/LICENSE-2.0 11 | ** Unless required by applicable law or agreed to in writing, software 12 | ** distributed under the License is distributed on an "AS IS" BASIS, 13 | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | ** See the License for the specific language governing permissions and 15 | ** limitations under the License. 
// Names for the four parameters of an outline feature: x, y, length and
// direction, numbered 0..3 in that order.
// NOTE(review): presumably used as indices into a feature's parameter array -
// confirm against the users of this header.
enum OUTLINE_FEAT_PARAM_NAME {
  OutlineFeatX = 0,
  OutlineFeatY = 1,
  OutlineFeatLength = 2,
  OutlineFeatDir = 3
};
16 | ******************************************************************************/ 17 | 18 | #ifndef MFX_H 19 | #define MFX_H 20 | 21 | #include "mfdefs.h" 22 | #include "params.h" 23 | 24 | namespace tesseract { 25 | 26 | class DENORM; 27 | struct TBLOB; 28 | 29 | /*---------------------------------------------------------------------------- 30 | Variables 31 | ----------------------------------------------------------------------------**/ 32 | 33 | /* old numbers corresponded to 10.0 degrees and 80.0 degrees */ 34 | extern double_VAR_H(classify_min_slope); 35 | extern double_VAR_H(classify_max_slope); 36 | 37 | /*---------------------------------------------------------------------------- 38 | Public Function Prototypes 39 | ----------------------------------------------------------------------------**/ 40 | MICROFEATURES BlobMicroFeatures(TBLOB *Blob, const DENORM &cn_denorm); 41 | 42 | } // namespace tesseract 43 | 44 | #endif 45 | -------------------------------------------------------------------------------- /src/dict/stopper.h: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | ** Filename: stopper.h 3 | ** Purpose: Stopping criteria for word classifier. 4 | ** Author: Dan Johnson 5 | ** 6 | ** (c) Copyright Hewlett-Packard Company, 1988. 7 | ** Licensed under the Apache License, Version 2.0 (the "License"); 8 | ** you may not use this file except in compliance with the License. 9 | ** You may obtain a copy of the License at 10 | ** http://www.apache.org/licenses/LICENSE-2.0 11 | ** Unless required by applicable law or agreed to in writing, software 12 | ** distributed under the License is distributed on an "AS IS" BASIS, 13 | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | ** See the License for the specific language governing permissions and 15 | ** limitations under the License. 
16 | ******************************************************************************/ 17 | #ifndef STOPPER_H 18 | #define STOPPER_H 19 | 20 | #include "params.h" 21 | #include "ratngs.h" 22 | 23 | #include 24 | 25 | namespace tesseract { 26 | 27 | class WERD_CHOICE; 28 | 29 | using BLOB_WIDTH = uint8_t; 30 | 31 | struct DANGERR_INFO { 32 | DANGERR_INFO() 33 | : begin(-1) 34 | , end(-1) 35 | , dangerous(false) 36 | , correct_is_ngram(false) 37 | , leftmost(INVALID_UNICHAR_ID) {} 38 | DANGERR_INFO(int b, int e, bool d, bool n, UNICHAR_ID l) 39 | : begin(b), end(e), dangerous(d), correct_is_ngram(n), leftmost(l) {} 40 | int begin; 41 | int end; 42 | bool dangerous; 43 | bool correct_is_ngram; 44 | UNICHAR_ID leftmost; // in the replacement, what's the leftmost character? 45 | }; 46 | 47 | using DANGERR = std::vector; 48 | 49 | } // namespace tesseract 50 | 51 | #endif 52 | -------------------------------------------------------------------------------- /src/ccmain/pagewalk.cpp: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | * File: pagewalk.cpp (Formerly walkers.c) 3 | * Description: Block list processors 4 | * Author: Phil Cheatle 5 | * Created: Thu Oct 10 16:25:24 BST 1991 6 | * 7 | * (C) Copyright 1991, Hewlett-Packard Ltd. 8 | ** Licensed under the Apache License, Version 2.0 (the "License"); 9 | ** you may not use this file except in compliance with the License. 10 | ** You may obtain a copy of the License at 11 | ** http://www.apache.org/licenses/LICENSE-2.0 12 | ** Unless required by applicable law or agreed to in writing, software 13 | ** distributed under the License is distributed on an "AS IS" BASIS, 14 | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | ** See the License for the specific language governing permissions and 16 | ** limitations under the License. 
17 | * 18 | **********************************************************************/ 19 | 20 | #include "pageres.h" 21 | #include "tesseractclass.h" 22 | 23 | namespace tesseract { 24 | /** 25 | * @name process_selected_words() 26 | * 27 | * Walk the current block list applying the specified word processor function 28 | * to each word that overlaps the selection_box. 29 | */ 30 | void Tesseract::process_selected_words( 31 | PAGE_RES *page_res, // blocks to check 32 | TBOX &selection_box, bool (tesseract::Tesseract::*word_processor)(PAGE_RES_IT *pr_it)) { 33 | for (PAGE_RES_IT page_res_it(page_res); page_res_it.word() != nullptr; page_res_it.forward()) { 34 | WERD *word = page_res_it.word()->word; 35 | if (word->bounding_box().overlap(selection_box)) { 36 | if (!(this->*word_processor)(&page_res_it)) { 37 | return; 38 | } 39 | } 40 | } 41 | } 42 | } // namespace tesseract 43 | -------------------------------------------------------------------------------- /src/wordrec/plotedges.h: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * 3 | * File: plotedges.h 4 | * Description: Convert the various data type into line lists 5 | * Author: Mark Seaman, OCR Technology 6 | * 7 | * (c) Copyright 1989, Hewlett-Packard Company. 8 | ** Licensed under the Apache License, Version 2.0 (the "License"); 9 | ** you may not use this file except in compliance with the License. 10 | ** You may obtain a copy of the License at 11 | ** http://www.apache.org/licenses/LICENSE-2.0 12 | ** Unless required by applicable law or agreed to in writing, software 13 | ** distributed under the License is distributed on an "AS IS" BASIS, 14 | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | ** See the License for the specific language governing permissions and 16 | ** limitations under the License. 
17 | * 18 | *****************************************************************************/ 19 | 20 | #ifndef PLOTEDGES_H 21 | #define PLOTEDGES_H 22 | 23 | #include "oldlist.h" // for LIST 24 | 25 | namespace tesseract { 26 | 27 | class ScrollView; 28 | 29 | struct EDGEPT; 30 | struct TBLOB; 31 | 32 | /*---------------------------------------------------------------------- 33 | V a r i a b l e s 34 | ----------------------------------------------------------------------*/ 35 | extern ScrollView *edge_window; /* Window for edges */ 36 | 37 | /*---------------------------------------------------------------------- 38 | F u n c t i o n s 39 | ---------------------------------------------------------------------*/ 40 | void display_edgepts(LIST outlines); 41 | 42 | void draw_blob_edges(TBLOB *blob); 43 | 44 | void mark_outline(EDGEPT *edgept); 45 | 46 | } // namespace tesseract 47 | 48 | #endif 49 | -------------------------------------------------------------------------------- /src/ccutil/host.h: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | ** Filename: host.h 3 | ** Purpose: This is the system independent typedefs and defines 4 | ** Author: MN, JG, MD 5 | ** 6 | ** (c) Copyright Hewlett-Packard Company, 1988-1996. 7 | ** Licensed under the Apache License, Version 2.0 (the "License"); 8 | ** you may not use this file except in compliance with the License. 9 | ** You may obtain a copy of the License at 10 | ** http://www.apache.org/licenses/LICENSE-2.0 11 | ** Unless required by applicable law or agreed to in writing, software 12 | ** distributed under the License is distributed on an "AS IS" BASIS, 13 | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | ** See the License for the specific language governing permissions and 15 | ** limitations under the License. 
16 | */ 17 | 18 | #ifndef TESSERACT_CCUTIL_HOST_H_ 19 | #define TESSERACT_CCUTIL_HOST_H_ 20 | 21 | #include 22 | 23 | #include 24 | #include 25 | 26 | /* _WIN32 */ 27 | #ifdef _WIN32 28 | # ifndef NOMINMAX 29 | # define NOMINMAX 30 | # endif /* NOMINMAX */ 31 | # ifndef WIN32_LEAN_AND_MEAN 32 | # define WIN32_LEAN_AND_MEAN 33 | # endif 34 | # include 35 | # undef min 36 | # undef max 37 | #endif // _WIN32 38 | 39 | #ifndef _WIN32 40 | # ifndef PATH_MAX 41 | # define MAX_PATH 4096 42 | # else 43 | # define MAX_PATH PATH_MAX 44 | # endif 45 | #endif 46 | 47 | namespace tesseract { 48 | 49 | // Return true if x is within tolerance of y 50 | template 51 | bool NearlyEqual(T x, T y, T tolerance) { 52 | T diff = x - y; 53 | return diff <= tolerance && -diff <= tolerance; 54 | } 55 | 56 | } // namespace tesseract 57 | 58 | #endif // TESSERACT_CCUTIL_HOST_H_ 59 | -------------------------------------------------------------------------------- /src/ccutil/lsterr.h: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | * File: lsterr.h (Formerly listerr.h) 3 | * Description: Errors shared by list modules 4 | * Author: Phil Cheatle 5 | * 6 | * (C) Copyright 1990, Hewlett-Packard Ltd. 7 | ** Licensed under the Apache License, Version 2.0 (the "License"); 8 | ** you may not use this file except in compliance with the License. 9 | ** You may obtain a copy of the License at 10 | ** http://www.apache.org/licenses/LICENSE-2.0 11 | ** Unless required by applicable law or agreed to in writing, software 12 | ** distributed under the License is distributed on an "AS IS" BASIS, 13 | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | ** See the License for the specific language governing permissions and 15 | ** limitations under the License. 
16 | * 17 | **********************************************************************/ 18 | 19 | #ifndef TESSERACT_CCUTIL_LSTERR_H_ 20 | #define TESSERACT_CCUTIL_LSTERR_H_ 21 | 22 | #include "errcode.h" //must be last include 23 | 24 | namespace tesseract { 25 | 26 | #ifndef NDEBUG 27 | 28 | constexpr ERRCODE NO_LIST("Iterator not set to a list"); 29 | constexpr ERRCODE NULL_DATA("List would have returned a nullptr data pointer"); 30 | constexpr ERRCODE NULL_CURRENT("List current position is nullptr"); 31 | constexpr ERRCODE NULL_NEXT("Next element on the list is nullptr"); 32 | constexpr ERRCODE NULL_PREV("Previous element on the list is nullptr"); 33 | constexpr ERRCODE EMPTY_LIST("List is empty"); 34 | constexpr ERRCODE BAD_PARAMETER("List parameter error"); 35 | constexpr ERRCODE STILL_LINKED("Attempting to add an element with non nullptr links, to a list"); 36 | 37 | #endif // !NDEBUG 38 | 39 | } // namespace tesseract 40 | 41 | #endif // TESSERACT_CCUTIL_LSTERR_H_ 42 | -------------------------------------------------------------------------------- /src/ccstruct/debugpixa.h: -------------------------------------------------------------------------------- 1 | #ifndef TESSERACT_CCSTRUCT_DEBUGPIXA_H_ 2 | #define TESSERACT_CCSTRUCT_DEBUGPIXA_H_ 3 | 4 | #include "image.h" 5 | 6 | #include 7 | 8 | namespace tesseract { 9 | 10 | // Class to hold a Pixa collection of debug images with captions and save them 11 | // to a PDF file. 12 | class DebugPixa { 13 | public: 14 | // TODO(rays) add another constructor with size control. 15 | DebugPixa() { 16 | pixa_ = pixaCreate(0); 17 | #ifdef TESSERACT_DISABLE_DEBUG_FONTS 18 | fonts_ = NULL; 19 | #else 20 | fonts_ = bmfCreate(nullptr, 14); 21 | #endif 22 | } 23 | // Frees the image collection and the caption fonts. Nothing is written here, 24 | // so images that were not yet saved with WritePDF() are dropped. 
25 | ~DebugPixa() { 26 | pixaDestroy(&pixa_); 27 | bmfDestroy(&fonts_); 28 | } 29 | 30 | // Renders the given pix together with the given caption and adds the result to 31 | // the set of pages for the PDF file. The caption is placed with L_ADD_BELOW, i.e. underneath the image rather than on top of it. 32 | void AddPix(const Image pix, const char *caption) { 33 | int depth = pixGetDepth(pix); 34 | int color = depth < 8 ? 1 : (depth > 8 ? 0x00ff0000 : 0x80); 35 | Image pix_debug = 36 | pixAddSingleTextblock(pix, fonts_, caption, color, L_ADD_BELOW, nullptr); 37 | pixaAddPix(pixa_, pix_debug, L_INSERT); 38 | } 39 | 40 | // If any images have been added, writes all of them to a PDF at the given 41 | // filename right away and then empties the collection; otherwise does nothing. 42 | void WritePDF(const char *filename) { 43 | if (pixaGetCount(pixa_) > 0) { 44 | pixaConvertToPdf(pixa_, 300, 1.0f, 0, 0, "AllDebugImages", filename); 45 | pixaClear(pixa_); 46 | } 47 | } 48 | 49 | private: 50 | // The collection of images to put in the PDF. 51 | Pixa *pixa_; 52 | // The fonts used to draw text captions. 53 | L_Bmf *fonts_; 54 | }; 55 | 56 | } // namespace tesseract 57 | 58 | #endif // TESSERACT_CCSTRUCT_DEBUGPIXA_H_ 59 | -------------------------------------------------------------------------------- /src/textord/blobgrid.h: -------------------------------------------------------------------------------- 1 | /////////////////////////////////////////////////////////////////////// 2 | // File: blobgrid.h 3 | // Description: BBGrid of BLOBNBOX with useful BLOBNBOX-specific methods. 4 | // Copyright 2011 Google Inc. All Rights Reserved. 5 | // Author: rays@google.com (Ray Smith) 6 | // Created: Sat Jun 11 10:26:01 PST 2011 7 | // 8 | // Licensed under the Apache License, Version 2.0 (the "License"); 9 | // you may not use this file except in compliance with the License. 
10 | // You may obtain a copy of the License at 11 | // http://www.apache.org/licenses/LICENSE-2.0 12 | // Unless required by applicable law or agreed to in writing, software 13 | // distributed under the License is distributed on an "AS IS" BASIS, 14 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | // See the License for the specific language governing permissions and 16 | // limitations under the License. 17 | // 18 | /////////////////////////////////////////////////////////////////////// 19 | 20 | #ifndef TESSERACT_TEXTORD_BLOBGRID_H_ 21 | #define TESSERACT_TEXTORD_BLOBGRID_H_ 22 | 23 | #include "bbgrid.h" 24 | #include "blobbox.h" 25 | 26 | namespace tesseract { 27 | 28 | CLISTIZEH(BLOBNBOX) 29 | 30 | using BlobGridSearch = GridSearch; 31 | 32 | class TESS_API BlobGrid : public BBGrid { 33 | public: 34 | BlobGrid(int gridsize, const ICOORD &bleft, const ICOORD &tright); 35 | ~BlobGrid() override; 36 | 37 | // Inserts all the blobs from the given list, with x and y spreading, 38 | // without removing from the source list, so ownership remains with the 39 | // source list. 40 | void InsertBlobList(BLOBNBOX_LIST *blobs); 41 | }; 42 | 43 | } // namespace tesseract. 
44 | 45 | #endif // TESSERACT_TEXTORD_BLOBGRID_H_ 46 | -------------------------------------------------------------------------------- /doc/wordlist2dawg.1.asc: -------------------------------------------------------------------------------- 1 | WORDLIST2DAWG(1) 2 | ================ 3 | :doctype: manpage 4 | 5 | NAME 6 | ---- 7 | wordlist2dawg - convert a wordlist to a DAWG for Tesseract 8 | 9 | SYNOPSIS 10 | -------- 11 | *wordlist2dawg* 'WORDLIST' 'DAWG' 'lang.unicharset' 12 | 13 | *wordlist2dawg* -t 'WORDLIST' 'DAWG' 'lang.unicharset' 14 | 15 | *wordlist2dawg* -r 1 'WORDLIST' 'DAWG' 'lang.unicharset' 16 | 17 | *wordlist2dawg* -r 2 'WORDLIST' 'DAWG' 'lang.unicharset' 18 | 19 | *wordlist2dawg* -l 'SHORT' 'LONG' 'WORDLIST' 'DAWG' 'lang.unicharset' 20 | 21 | DESCRIPTION 22 | ----------- 23 | wordlist2dawg(1) converts a wordlist to a Directed Acyclic Word Graph 24 | (DAWG) for use with Tesseract. A DAWG is a compressed, space and time 25 | efficient representation of a word list. 26 | 27 | OPTIONS 28 | ------- 29 | -t 30 | Verify that a given dawg file is equivalent to a given wordlist. 31 | 32 | -r 1 33 | Reverse a word if it contains an RTL character. 34 | 35 | -r 2 36 | Reverse all words. 37 | 38 | -l 'SHORT' 'LONG' 39 | Produce a file with several dawgs in it, one each for words 40 | of length 'SHORT', 'SHORT'+1, ... 'LONG'. 41 | 42 | ARGUMENTS 43 | --------- 44 | 45 | 'WORDLIST' 46 | A plain text file in UTF-8, one word per line. 47 | 48 | 'DAWG' 49 | The output DAWG to write. 50 | 51 | 'lang.unicharset' 52 | The unicharset of the language. This is the unicharset 53 | generated by mftraining(1). 54 | 55 | SEE ALSO 56 | -------- 57 | tesseract(1), combine_tessdata(1), dawg2wordlist(1) 58 | 59 | 60 | 61 | COPYING 62 | ------- 63 | Copyright \(C) 2006 Google, Inc. 64 | Licensed under the Apache License, Version 2.0 65 | 66 | AUTHOR 67 | ------ 68 | The Tesseract OCR engine was written by Ray Smith and his research groups 69 | at Hewlett Packard (1985-1995) and Google (2006-2018). 
70 | -------------------------------------------------------------------------------- /src/training/set_unicharset_properties.cpp: -------------------------------------------------------------------------------- 1 | // Licensed under the Apache License, Version 2.0 (the "License"); 2 | // you may not use this file except in compliance with the License. 3 | // You may obtain a copy of the License at 4 | // http://www.apache.org/licenses/LICENSE-2.0 5 | // Unless required by applicable law or agreed to in writing, software 6 | // distributed under the License is distributed on an "AS IS" BASIS, 7 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 8 | // See the License for the specific language governing permissions and 9 | // limitations under the License. 10 | 11 | // This program reads a unicharset file, puts the result in a UNICHARSET 12 | // object, fills it with properties about the unichars it contains and writes 13 | // the result back to a file. 14 | 15 | #include "commandlineflags.h" 16 | #include "commontraining.h" // CheckSharedLibraryVersion 17 | #include "tprintf.h" 18 | #include "unicharset_training_utils.h" 19 | 20 | using namespace tesseract; 21 | 22 | // The directory that is searched for universal script unicharsets. 23 | static STRING_PARAM_FLAG(script_dir, "", "Directory name for input script unicharsets/xheights"); 24 | 25 | int main(int argc, char **argv) { 26 | tesseract::CheckSharedLibraryVersion(); 27 | tesseract::ParseCommandLineFlags(argv[0], &argc, &argv, true); 28 | 29 | // Check validity of input flags. 
30 | if (FLAGS_U.empty() || FLAGS_O.empty()) { 31 | tprintf("Specify both input and output unicharsets!\n"); 32 | return EXIT_FAILURE; 33 | } 34 | if (FLAGS_script_dir.empty()) { 35 | tprintf("Must specify a script_dir!\n"); 36 | return EXIT_FAILURE; 37 | } 38 | 39 | tesseract::SetPropertiesForInputFile(FLAGS_script_dir.c_str(), FLAGS_U.c_str(), FLAGS_O.c_str(), 40 | FLAGS_X.c_str()); 41 | return EXIT_SUCCESS; 42 | } 43 | -------------------------------------------------------------------------------- /src/arch/dotproduct.h: -------------------------------------------------------------------------------- 1 | /////////////////////////////////////////////////////////////////////// 2 | // File: dotproduct.h 3 | // Description: Native dot product function. 4 | // 5 | // (C) Copyright 2018, Google Inc. 6 | // Licensed under the Apache License, Version 2.0 (the "License"); 7 | // you may not use this file except in compliance with the License. 8 | // You may obtain a copy of the License at 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | /////////////////////////////////////////////////////////////////////// 16 | 17 | #ifndef TESSERACT_ARCH_DOTPRODUCT_H_ 18 | #define TESSERACT_ARCH_DOTPRODUCT_H_ 19 | 20 | #include "tesstypes.h" 21 | 22 | namespace tesseract { 23 | 24 | // Computes and returns the dot product of the n-vectors u and v. 25 | TFloat DotProductNative(const TFloat *u, const TFloat *v, int n); 26 | 27 | // Uses Intel AVX intrinsics to access the SIMD instruction set. 28 | TFloat DotProductAVX(const TFloat *u, const TFloat *v, int n); 29 | 30 | // Uses Intel AVX512F intrinsics to access the SIMD instruction set. 
31 | TFloat DotProductAVX512F(const TFloat *u, const TFloat *v, int n); 32 | 33 | // Use Intel FMA. 34 | TFloat DotProductFMA(const TFloat *u, const TFloat *v, int n); 35 | 36 | // Uses Intel SSE intrinsics to access the SIMD instruction set. 37 | TFloat DotProductSSE(const TFloat *u, const TFloat *v, int n); 38 | 39 | // Use NEON intrinsics. 40 | TFloat DotProductNEON(const TFloat *u, const TFloat *v, int n); 41 | 42 | } // namespace tesseract. 43 | 44 | #endif // TESSERACT_ARCH_DOTPRODUCT_H_ 45 | -------------------------------------------------------------------------------- /doc/classifier_tester.1.asc: -------------------------------------------------------------------------------- 1 | CLASSIFIER_TESTER(1) 2 | ==================== 3 | 4 | NAME 5 | ---- 6 | classifier_tester - for *legacy tesseract* engine. 7 | 8 | SYNOPSIS 9 | -------- 10 | *classifier_tester* -U 'unicharset_file' -F 'font_properties_file' -X 'xheights_file' -classifier 'x' -lang 'lang' [-output_trainer trainer] *.tr 11 | 12 | DESCRIPTION 13 | ----------- 14 | classifier_tester(1) runs Tesseract in a special mode. 15 | It takes a list of .tr files and tests a character classifier 16 | on data as formatted for training, 17 | but it doesn't have to be the same as the training data. 18 | 19 | IN/OUT ARGUMENTS 20 | ---------------- 21 | 22 | a list of .tr files 23 | 24 | OPTIONS 25 | ------- 26 | -l 'lang':: 27 | (Input) three character language code; default value 'eng'. 28 | 29 | -classifier 'x':: 30 | (Input) One of "pruner", "full". 31 | 32 | 33 | -U 'unicharset':: 34 | (Input) The unicharset for the language. 
35 | 36 | -F 'font_properties_file':: 37 | (Input) font properties file, each line is of the following form, where each field other than the font name is 0 or 1: 38 | 39 | *font_name* *italic* *bold* *fixed_pitch* *serif* *fraktur* 40 | 41 | -X 'xheights_file':: 42 | (Input) x heights file, each line is of the following form, where xheight is calculated as the pixel x height of a character drawn at 32pt on 300 dpi. [ That is, if base x height + ascenders + descenders = 133, how much is x height? ] 43 | 44 | *font_name* *xheight* 45 | 46 | -output_trainer 'trainer':: 47 | (Output, Optional) Filename for output trainer. 48 | 49 | SEE ALSO 50 | -------- 51 | tesseract(1) 52 | 53 | COPYING 54 | ------- 55 | Copyright \(C) 2012 Google, Inc. 56 | Licensed under the Apache License, Version 2.0 57 | 58 | AUTHOR 59 | ------ 60 | The Tesseract OCR engine was written by Ray Smith and his research groups 61 | at Hewlett Packard (1985-1995) and Google (2006-2018). 62 | -------------------------------------------------------------------------------- /cmake/templates/TesseractConfig.cmake.in: -------------------------------------------------------------------------------- 1 | # =================================================================================== 2 | # The Tesseract CMake configuration file 3 | # 4 | # ** File generated automatically, do not modify ** 5 | # 6 | # Usage from an external project: 7 | # In your CMakeLists.txt, add these lines: 8 | # 9 | # find_package(Tesseract REQUIRED) 10 | # target_link_libraries(MY_TARGET_NAME Tesseract::libtesseract) 11 | # 12 | # This file will define the following variables: 13 | # - Tesseract_LIBRARIES : The list of all imported targets. 14 | # - Tesseract_INCLUDE_DIRS : The Tesseract include directories. 15 | # - Tesseract_LIBRARY_DIRS : The Tesseract library directories. 
16 | # - Tesseract_VERSION : The version of this Tesseract build: "@VERSION_PLAIN@" 17 | # - Tesseract_VERSION_MAJOR : Major version part of Tesseract_VERSION: "@VERSION_MAJOR@" 18 | # - Tesseract_VERSION_MINOR : Minor version part of Tesseract_VERSION: "@VERSION_MINOR@" 19 | # - Tesseract_VERSION_PATCH : Patch version part of Tesseract_VERSION: "@VERSION_PATCH@" 20 | # 21 | # =================================================================================== 22 | 23 | include(CMakeFindDependencyMacro) 24 | find_dependency(Leptonica) 25 | 26 | include(${CMAKE_CURRENT_LIST_DIR}/TesseractTargets.cmake) 27 | 28 | @PACKAGE_INIT@ 29 | 30 | SET(Tesseract_VERSION @VERSION_PLAIN@) 31 | SET(Tesseract_VERSION_MAJOR @VERSION_MAJOR@) 32 | SET(Tesseract_VERSION_MINOR @VERSION_MINOR@) 33 | SET(Tesseract_VERSION_PATCH @VERSION_PATCH@) 34 | 35 | set_and_check(Tesseract_INCLUDE_DIRS "@PACKAGE_INCLUDE_DIR@") 36 | set_and_check(Tesseract_LIBRARY_DIRS "@PACKAGE_LIBRARY_DIRS@") 37 | set(Tesseract_LIBRARIES @tesseract_OUTPUT_NAME@) 38 | 39 | check_required_components(Tesseract) 40 | -------------------------------------------------------------------------------- /doc/mftraining.1.asc: -------------------------------------------------------------------------------- 1 | MFTRAINING(1) 2 | ============= 3 | :doctype: manpage 4 | 5 | NAME 6 | ---- 7 | mftraining - feature training for Tesseract 8 | 9 | SYNOPSIS 10 | -------- 11 | mftraining -U 'unicharset' -O 'lang.unicharset' 'FILE'... 12 | 13 | DESCRIPTION 14 | ----------- 15 | mftraining takes a list of .tr files, from which it generates the 16 | files *inttemp* (the shape prototypes), *shapetable*, and *pffmtable* 17 | (the number of expected features for each character). (A fourth file 18 | called Microfeat is also written by this program, but it is not used.) 
19 | 20 | OPTIONS 21 | ------- 22 | -U 'FILE':: 23 | (Input) The unicharset generated by unicharset_extractor(1) 24 | 25 | -F 'font_properties_file':: 26 | (Input) font properties file, each line is of the following form, where each field other than the font name is 0 or 1: 27 | 28 | *font_name* *italic* *bold* *fixed_pitch* *serif* *fraktur* 29 | 30 | -X 'xheights_file':: 31 | (Input) x heights file, each line is of the following form, where xheight is calculated as the pixel x height of a character drawn at 32pt on 300 dpi. [ That is, if base x height + ascenders + descenders = 133, how much is x height? ] 32 | 33 | *font_name* *xheight* 34 | 35 | -D 'dir':: 36 | Directory to write output files to. 37 | 38 | -O 'FILE':: 39 | (Output) The output unicharset that will be given to combine_tessdata(1) 40 | 41 | SEE ALSO 42 | -------- 43 | tesseract(1), cntraining(1), unicharset_extractor(1), combine_tessdata(1), 44 | shapeclustering(1), unicharset(5) 45 | 46 | 47 | 48 | COPYING 49 | ------- 50 | Copyright \(C) Hewlett-Packard Company, 1988 51 | Licensed under the Apache License, Version 2.0 52 | 53 | AUTHOR 54 | ------ 55 | The Tesseract OCR engine was written by Ray Smith and his research groups 56 | at Hewlett Packard (1985-1995) and Google (2006-2018). 57 | -------------------------------------------------------------------------------- /src/dict/dawg_cache.h: -------------------------------------------------------------------------------- 1 | /////////////////////////////////////////////////////////////////////// 2 | // File: dawg_cache.h 3 | // Description: A class that knows about loading and caching dawgs. 4 | // Author: David Eger 5 | // Created: Fri Jan 27 12:08:00 PST 2012 6 | // 7 | // (C) Copyright 2012, Google Inc. 8 | // Licensed under the Apache License, Version 2.0 (the "License"); 9 | // you may not use this file except in compliance with the License. 
10 | // You may obtain a copy of the License at 11 | // http://www.apache.org/licenses/LICENSE-2.0 12 | // Unless required by applicable law or agreed to in writing, software 13 | // distributed under the License is distributed on an "AS IS" BASIS, 14 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | // See the License for the specific language governing permissions and 16 | // limitations under the License. 17 | // 18 | /////////////////////////////////////////////////////////////////////// 19 | 20 | #ifndef TESSERACT_DICT_DAWG_CACHE_H_ 21 | #define TESSERACT_DICT_DAWG_CACHE_H_ 22 | 23 | #include "dawg.h" 24 | #include "object_cache.h" 25 | #include "tessdatamanager.h" 26 | 27 | namespace tesseract { 28 | 29 | class DawgCache { 30 | public: 31 | Dawg *GetSquishedDawg(const std::string &lang, TessdataType tessdata_dawg_type, int debug_level, 32 | TessdataManager *data_file); 33 | 34 | // If we manage the given dawg, decrement its count, 35 | // and possibly delete it if the count reaches zero. 36 | // If dawg is unknown to us, return false. 37 | bool FreeDawg(Dawg *dawg) { 38 | return dawgs_.Free(dawg); 39 | } 40 | 41 | // Free up any currently unused dawgs. 42 | void DeleteUnusedDawgs() { 43 | dawgs_.DeleteUnusedObjects(); 44 | } 45 | 46 | private: 47 | ObjectCache dawgs_; 48 | }; 49 | 50 | } // namespace tesseract 51 | 52 | #endif // TESSERACT_DICT_DAWG_CACHE_H_ 53 | -------------------------------------------------------------------------------- /unittest/cycletimer.h: -------------------------------------------------------------------------------- 1 | // (C) Copyright 2017, Google Inc. 2 | // Licensed under the Apache License, Version 2.0 (the "License"); 3 | // you may not use this file except in compliance with the License. 
4 | // You may obtain a copy of the License at 5 | // http://www.apache.org/licenses/LICENSE-2.0 6 | // Unless required by applicable law or agreed to in writing, software 7 | // distributed under the License is distributed on an "AS IS" BASIS, 8 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 9 | // See the License for the specific language governing permissions and 10 | // limitations under the License. 11 | // Portability include to match the Google test environment. 12 | 13 | #ifndef TESSERACT_UNITTEST_CYCLETIMER_H 14 | #define TESSERACT_UNITTEST_CYCLETIMER_H 15 | 16 | #include // for std::chrono 17 | 18 | // See https://github.com/google/or-tools/blob/master/ortools/base/timer.h 19 | class CycleTimer { 20 | private: 21 | static int64_t now() { 22 | return std::chrono::duration_cast( 23 | std::chrono::steady_clock::now().time_since_epoch()).count(); 24 | } 25 | 26 | public: 27 | CycleTimer() { 28 | Reset(); 29 | } 30 | 31 | void Reset() { 32 | running_ = false; 33 | sum_ = 0; 34 | start_ = 0; 35 | } 36 | 37 | // When Start() is called multiple times, only the most recent is used. 38 | void Start() { 39 | running_ = true; 40 | start_ = now(); 41 | } 42 | 43 | void Restart() { 44 | sum_ = 0; 45 | Start(); 46 | } 47 | 48 | void Stop() { 49 | if (running_) { 50 | sum_ += now() - start_; 51 | running_ = false; 52 | } 53 | } 54 | int64_t GetInMs() const { 55 | return running_ ? now() - start_ + sum_ : sum_; 56 | } 57 | 58 | private: 59 | bool running_; 60 | int64_t start_; 61 | int64_t sum_; 62 | }; 63 | 64 | #endif // TESSERACT_UNITTEST_CYCLETIMER_H 65 | -------------------------------------------------------------------------------- /src/training/pango/tlog.h: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | * File: tlog.h 3 | * Description: Variant of printf with logging level controllable by a 4 | * commandline flag. 
5 | * Author: Ranjith Unnikrishnan 6 | * Created: Wed Nov 20 2013 7 | * 8 | * (C) Copyright 2013, Google Inc. 9 | ** Licensed under the Apache License, Version 2.0 (the "License"); 10 | ** you may not use this file except in compliance with the License. 11 | ** You may obtain a copy of the License at 12 | ** http://www.apache.org/licenses/LICENSE-2.0 13 | ** Unless required by applicable law or agreed to in writing, software 14 | ** distributed under the License is distributed on an "AS IS" BASIS, 15 | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | ** See the License for the specific language governing permissions and 17 | ** limitations under the License. 18 | * 19 | **********************************************************************/ 20 | #ifndef TESSERACT_TRAINING_TLOG_H_ 21 | #define TESSERACT_TRAINING_TLOG_H_ 22 | 23 | #include "export.h" 24 | 25 | #include "commandlineflags.h" 26 | #include "errcode.h" 27 | #include "tprintf.h" 28 | 29 | TESS_PANGO_TRAINING_API 30 | DECLARE_INT_PARAM_FLAG(tlog_level); 31 | 32 | // Variant guarded by the numeric logging level parameter FLAGS_tlog_level 33 | // (default 0). Code using ParseCommandLineFlags() can control its value using 34 | // the --tlog_level commandline argument. Otherwise it must be specified in a 35 | // config file like other params. 36 | #define tlog(level, ...) 
\ 37 | { \ 38 | if (FLAGS_tlog_level >= level) { \ 39 | tprintf(__VA_ARGS__); \ 40 | } \ 41 | } 42 | 43 | #define TLOG_IS_ON(level) (FLAGS_tlog_level >= level) 44 | 45 | #endif // TESSERACT_TRAINING_TLOG_H_ 46 | -------------------------------------------------------------------------------- /src/training/unicharset/validate_indic.h: -------------------------------------------------------------------------------- 1 | #ifndef TESSERACT_TRAINING_VALIDATE_INDIC_H_ 2 | #define TESSERACT_TRAINING_VALIDATE_INDIC_H_ 3 | 4 | #include "validator.h" 5 | 6 | namespace tesseract { 7 | 8 | // Subclass of Validator that validates and segments Indic scripts in the 9 | // unicode range 0x900-0xdff (Devanagari-Sinhala). 10 | class ValidateIndic : public Validator { 11 | public: 12 | ValidateIndic(ViramaScript script, bool report_errors) : Validator(script, report_errors) {} 13 | ~ValidateIndic() override = default; 14 | 15 | protected: 16 | // Returns whether codes matches the pattern for an Indic Grapheme. 17 | // Consumes the next Grapheme in codes_[codes_used_++...] and copies it to 18 | // parts_ and output_. Returns true if a valid Grapheme was consumed, 19 | // otherwise does not increment codes_used_. 20 | bool ConsumeGraphemeIfValid() override; 21 | // Returns the CharClass corresponding to the given Unicode ch. 22 | Validator::CharClass UnicodeToCharClass(char32 ch) const override; 23 | 24 | private: 25 | // Helper consumes/copies a virama and any associated post-virama joiners. 26 | bool ConsumeViramaIfValid(IndicPair joiner, bool post_matra); 27 | // Helper consumes/copies a series of consonants separated by viramas while 28 | // valid, but not any vowel or other modifiers. 29 | bool ConsumeConsonantHeadIfValid(); 30 | // Helper consumes/copies a tail part of a consonant, comprising optional 31 | // matra/piece, vowel modifier, vedic mark, terminating virama. 32 | bool ConsumeConsonantTailIfValid(); 33 | // Helper consumes/copies a vowel and optional modifiers. 
34 | bool ConsumeVowelIfValid(); 35 | 36 | // Some special unicodes used only for Indic processing. 37 | static const char32 kYayana = 0xdba; // Sinhala Ya 38 | static const char32 kRayana = 0xdbb; // Sinhala Ra 39 | }; 40 | 41 | } // namespace tesseract 42 | 43 | #endif // TESSERACT_TRAINING_VALIDATE_INDIC_H_ 44 | -------------------------------------------------------------------------------- /src/ccstruct/image.h: -------------------------------------------------------------------------------- 1 | /////////////////////////////////////////////////////////////////////// 2 | // File: image.h 3 | // Description: Image wrapper. 4 | // 5 | // Licensed under the Apache License, Version 2.0 (the "License"); 6 | // you may not use this file except in compliance with the License. 7 | // You may obtain a copy of the License at 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | /////////////////////////////////////////////////////////////////////// 15 | 16 | #ifndef TESSERACT_CCSTRUCT_IMAGE_H_ 17 | #define TESSERACT_CCSTRUCT_IMAGE_H_ 18 | 19 | #include 20 | 21 | struct Pix; 22 | 23 | namespace tesseract { 24 | 25 | class TESS_API Image { 26 | public: 27 | Pix *pix_ = nullptr; 28 | 29 | public: 30 | Image() = default; 31 | Image(Pix *pix) : pix_(pix) {} 32 | 33 | // service 34 | bool operator==(decltype(nullptr)) const { return pix_ == nullptr; } 35 | bool operator!=(decltype(nullptr)) const { return pix_ != nullptr; } 36 | explicit operator bool() const { return pix_ != nullptr; } 37 | operator Pix *() const { return pix_; } 38 | explicit operator Pix **() { return &pix_; } 39 | Pix *operator->() const { return pix_; } 40 | 41 | // api 42 | Image clone() const; // increases refcount 43 | Image copy() const; // does full copy 44 | void destroy(); 45 | bool isZero() const; 46 | 47 | // ops 48 | Image operator|(Image) const; 49 | Image &operator|=(Image); 50 | Image operator&(Image) const; 51 | Image &operator&=(Image); 52 | }; 53 | 54 | } // namespace tesseract 55 | 56 | #endif // TESSERACT_CCSTRUCT_IMAGE_H_ 57 | -------------------------------------------------------------------------------- /doc/shapeclustering.1.asc: -------------------------------------------------------------------------------- 1 | SHAPECLUSTERING(1) 2 | ================== 3 | :doctype: manpage 4 | 5 | NAME 6 | ---- 7 | shapeclustering - shape clustering training for Tesseract 8 | 9 | SYNOPSIS 10 | -------- 11 | shapeclustering -D 'output_dir' 12 | -U 'unicharset' -O 'mfunicharset' 13 | -F 'font_props' -X 'xheights' 14 | 'FILE'... 15 | 16 | DESCRIPTION 17 | ----------- 18 | shapeclustering(1) takes extracted feature .tr files (generated by 19 | tesseract(1) run in a special mode from box files) and produces a 20 | file *shapetable* and an enhanced unicharset. This program is still 21 | experimental, and is not required (yet) for training Tesseract. 
22 | 23 | OPTIONS 24 | ------- 25 | -U 'FILE':: 26 | The unicharset generated by unicharset_extractor(1). 27 | 28 | -D 'dir':: 29 | Directory to write output files to. 30 | 31 | -F 'font_properties_file':: 32 | (Input) font properties file, where each line is of the following form, where each field other than the font name is 0 or 1: 33 | 34 | 'font_name' 'italic' 'bold' 'fixed_pitch' 'serif' 'fraktur' 35 | 36 | -X 'xheights_file':: 37 | (Input) x heights file, each line is of the following form, where xheight is calculated as the pixel x height of a character drawn at 32pt on 300 dpi. [ That is, if base x height + ascenders + descenders = 133, how much is x height? ] 38 | 39 | 'font_name' 'xheight' 40 | 41 | -O 'FILE':: 42 | The output unicharset that will be given to combine_tessdata(1). 43 | 44 | SEE ALSO 45 | -------- 46 | tesseract(1), cntraining(1), unicharset_extractor(1), combine_tessdata(1), 47 | unicharset(5) 48 | 49 | 50 | 51 | COPYING 52 | ------- 53 | Copyright \(C) Google, 2011 54 | Licensed under the Apache License, Version 2.0 55 | 56 | AUTHOR 57 | ------ 58 | The Tesseract OCR engine was written by Ray Smith and his research groups 59 | at Hewlett Packard (1985-1995) and Google (2006-2018). 60 | -------------------------------------------------------------------------------- /src/textord/blobgrid.cpp: -------------------------------------------------------------------------------- 1 | /////////////////////////////////////////////////////////////////////// 2 | // File: blobgrid.cpp 3 | // Description: BBGrid of BLOBNBOX with useful BLOBNBOX-specific methods. 4 | // Copyright 2011 Google Inc. All Rights Reserved. 5 | // Author: rays@google.com (Ray Smith) 6 | // Created: Sat Jun 11 10:30:01 PST 2011 7 | // 8 | // Licensed under the Apache License, Version 2.0 (the "License"); 9 | // you may not use this file except in compliance with the License. 
10 | // You may obtain a copy of the License at 11 | // http://www.apache.org/licenses/LICENSE-2.0 12 | // Unless required by applicable law or agreed to in writing, software 13 | // distributed under the License is distributed on an "AS IS" BASIS, 14 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | // See the License for the specific language governing permissions and 16 | // limitations under the License. 17 | // 18 | /////////////////////////////////////////////////////////////////////// 19 | 20 | #include "blobgrid.h" 21 | 22 | namespace tesseract { 23 | 24 | BlobGrid::BlobGrid(int gridsize, const ICOORD &bleft, const ICOORD &tright) 25 | : BBGrid(gridsize, bleft, tright) {} 26 | 27 | // Destructor. 28 | // It is defined here, so the compiler can create a single vtable 29 | // instead of weak vtables in every compilation unit. 30 | BlobGrid::~BlobGrid() = default; 31 | 32 | // Inserts all the blobs from the given list, with x and y spreading, 33 | // without removing from the source list, so ownership remains with the 34 | // source list. 35 | void BlobGrid::InsertBlobList(BLOBNBOX_LIST *blobs) { 36 | BLOBNBOX_IT blob_it(blobs); 37 | for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { 38 | BLOBNBOX *blob = blob_it.data(); 39 | if (!blob->joined_to_prev()) { 40 | InsertBBox(true, true, blob); 41 | } 42 | } 43 | } 44 | 45 | } // namespace tesseract. 46 | -------------------------------------------------------------------------------- /doc/lstmeval.1.asc: -------------------------------------------------------------------------------- 1 | LSTMEVAL(1) 2 | =========== 3 | :doctype: manpage 4 | 5 | NAME 6 | ---- 7 | lstmeval - Evaluation program for LSTM-based networks. 
8 | 9 | SYNOPSIS 10 | -------- 11 | *lstmeval* --model 'lang.lstm|modelname_checkpoint|modelname_N.NN_NN_NN.checkpoint' [--traineddata lang/lang.traineddata] --eval_listfile 'lang.eval_files.txt' [--verbosity N] [--max_image_MB NNNN] 12 | 13 | DESCRIPTION 14 | ----------- 15 | lstmeval(1) evaluates LSTM-based networks. Either a recognition model or a training checkpoint can be given as input for evaluation along with a list of lstmf files. If evaluating a training checkpoint, '--traineddata' should also be specified. Intermediate training checkpoints can also be used. 16 | 17 | OPTIONS 18 | ------- 19 | '--model FILE':: 20 | Name of model file (training or recognition) (type:string default:) 21 | 22 | '--traineddata FILE':: 23 | If model is a training checkpoint, then traineddata must be the traineddata file that was given to the trainer (type:string default:) 24 | 25 | '--eval_listfile FILE':: 26 | File listing sample files in lstmf training format. (type:string default:) 27 | 28 | '--max_image_MB INT':: 29 | Max memory to use for images. (type:int default:2000) 30 | 31 | '--verbosity INT':: 32 | Amount of diagnostic information to output (0-2). (type:int default:1) 33 | 34 | HISTORY 35 | ------- 36 | lstmeval(1) was first made available for tesseract4.00.00alpha. 37 | 38 | RESOURCES 39 | --------- 40 | Main web site: + 41 | Information on training tesseract LSTM: 42 | 43 | SEE ALSO 44 | -------- 45 | tesseract(1) 46 | 47 | COPYING 48 | ------- 49 | Copyright \(C) 2012 Google, Inc. 50 | Licensed under the Apache License, Version 2.0 51 | 52 | AUTHOR 53 | ------ 54 | The Tesseract OCR engine was written by Ray Smith and his research groups 55 | at Hewlett Packard (1985-1995) and Google (2006-2018). 56 | -------------------------------------------------------------------------------- /unittest/unichar_test.cc: -------------------------------------------------------------------------------- 1 | // (C) Copyright 2017, Google Inc.
2 | // Licensed under the Apache License, Version 2.0 (the "License"); 3 | // you may not use this file except in compliance with the License. 4 | // You may obtain a copy of the License at 5 | // http://www.apache.org/licenses/LICENSE-2.0 6 | // Unless required by applicable law or agreed to in writing, software 7 | // distributed under the License is distributed on an "AS IS" BASIS, 8 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 9 | // See the License for the specific language governing permissions and 10 | // limitations under the License. 11 | 12 | #include 13 | #include "gmock/gmock.h" // for testing::ElementsAreArray 14 | #include "include_gunit.h" 15 | 16 | namespace tesseract { 17 | 18 | TEST(UnicharTest, Conversion) { 19 | // This test verifies that Unichar::UTF8ToUTF32 and Unichar::UTF32ToUTF8 20 | // show the required conversion properties. 21 | // Test for round-trip utf8-32-8 for 1, 2, 3 and 4 byte codes. 22 | const char *kUTF8Src = "a\u05d0\u0ca4\U0002a714"; 23 | const std::vector kUTF32Src = {'a', 0x5d0, 0xca4, 0x2a714}; 24 | // Check for round-trip conversion. 25 | std::vector utf32 = UNICHAR::UTF8ToUTF32(kUTF8Src); 26 | EXPECT_THAT(utf32, testing::ElementsAreArray(kUTF32Src)); 27 | std::string utf8 = UNICHAR::UTF32ToUTF8(utf32); 28 | EXPECT_STREQ(kUTF8Src, utf8.c_str()); 29 | } 30 | 31 | TEST(UnicharTest, InvalidText) { 32 | // This test verifies that Unichar correctly deals with invalid text. 33 | const char *kInvalidUTF8 = "a b\200d string"; 34 | const std::vector kInvalidUTF32 = {'a', ' ', 0x200000, 'x'}; 35 | // Invalid utf8 produces an empty vector. 36 | std::vector utf32 = UNICHAR::UTF8ToUTF32(kInvalidUTF8); 37 | EXPECT_TRUE(utf32.empty()); 38 | // Invalid utf32 produces an empty string. 
39 | std::string utf8 = UNICHAR::UTF32ToUTF8(kInvalidUTF32); 40 | EXPECT_TRUE(utf8.empty()); 41 | } 42 | 43 | } // namespace tesseract 44 | -------------------------------------------------------------------------------- /java/com/google/scrollview/ui/SVEmptyMenuItem.java: -------------------------------------------------------------------------------- 1 | // Copyright 2007 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); You may not 4 | // use this file except in compliance with the License. You may obtain a copy of 5 | // the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by 6 | // applicable law or agreed to in writing, software distributed under the 7 | // License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 8 | // OF ANY KIND, either express or implied. See the License for the specific 9 | // language governing permissions and limitations under the License. 10 | 11 | package com.google.scrollview.ui; 12 | 13 | /** 14 | * A MenuListItem is any sort of menu entry. This can either be within a popup 15 | * menu or within a menubar. It can either be a submenu (only name and 16 | * command-id) or a name with an associated value and possibly description. They 17 | * can also have new entries added (if they are submenus). 18 | * 19 | * @author wanke@google.com 20 | */ 21 | 22 | import com.google.scrollview.ScrollView; 23 | import com.google.scrollview.events.SVEvent; 24 | import com.google.scrollview.events.SVEventType; 25 | 26 | import javax.swing.JMenuItem; 27 | 28 | /** 29 | * Constructs a new menulistitem which just has an ID and a name attached to 30 | * it. In this case, we will have to ask for the value of the item and its 31 | * description if it gets called. 
32 | */ 33 | class SVEmptyMenuItem extends SVAbstractMenuItem { 34 | SVEmptyMenuItem(int id, String name) { 35 | super(id, name, new JMenuItem(name)); 36 | } 37 | /** What to do when user clicks on this item. */ 38 | @Override 39 | public void performAction(SVWindow window, SVEventType eventType) { 40 | // Send an event indicating that someone clicked on an entry. 41 | // Value will be null here. 42 | SVEvent svme = 43 | new SVEvent(eventType, window, id, getValue()); 44 | ScrollView.addMessage(svme); 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/dict/matchdefs.h: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | ** Filename: matchdefs.h 3 | ** Purpose: Generic interface definitions for feature matchers. 4 | ** Author: Dan Johnson 5 | ** 6 | ** (c) Copyright Hewlett-Packard Company, 1988. 7 | ** Licensed under the Apache License, Version 2.0 (the "License"); 8 | ** you may not use this file except in compliance with the License. 9 | ** You may obtain a copy of the License at 10 | ** http://www.apache.org/licenses/LICENSE-2.0 11 | ** Unless required by applicable law or agreed to in writing, software 12 | ** distributed under the License is distributed on an "AS IS" BASIS, 13 | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | ** See the License for the specific language governing permissions and 15 | ** limitations under the License. 16 | ******************************************************************************/ 17 | 18 | #ifndef MATCHDEFS_H 19 | #define MATCHDEFS_H 20 | 21 | #include 22 | 23 | #include // INT16_MAX 24 | #include // int16_t 25 | 26 | namespace tesseract { 27 | 28 | /* define the maximum number of classes defined for any matcher 29 | and the maximum class id for any matcher. 
This must be changed 30 | if more different classes need to be classified */ 31 | #define MAX_NUM_CLASSES INT16_MAX 32 | 33 | /** a CLASS_ID is the ascii character to be associated with a class */ 34 | using CLASS_ID = UNICHAR_ID; 35 | #define NO_CLASS (0) 36 | 37 | /** a PROTO_ID is the index of a prototype within its class. Valid proto 38 | ids are 0 to N-1 where N is the number of prototypes that make up the 39 | class. */ 40 | using PROTO_ID = int16_t; 41 | #define NO_PROTO (-1) 42 | 43 | /** FEATURE_ID is the index of a feature within a character description 44 | The feature id ranges from 0 to N-1 where N is the number 45 | of features in a character description. */ 46 | using FEATURE_ID = uint8_t; 47 | 48 | } // namespace tesseract 49 | 50 | #endif 51 | -------------------------------------------------------------------------------- /src/training/unicharset/validate_myanmar.h: -------------------------------------------------------------------------------- 1 | #ifndef TESSERACT_TRAINING_VALIDATE_MYANMAR_H_ 2 | #define TESSERACT_TRAINING_VALIDATE_MYANMAR_H_ 3 | 4 | #include "validator.h" 5 | 6 | namespace tesseract { 7 | 8 | // Subclass of Validator that validates and segments Myanmar. 9 | class ValidateMyanmar : public Validator { 10 | public: 11 | ValidateMyanmar(ViramaScript script, bool report_errors) : Validator(script, report_errors) {} 12 | ~ValidateMyanmar() override = default; 13 | 14 | protected: 15 | // Returns whether codes matches the pattern for a Myanmar Grapheme. 16 | // Consumes the next Grapheme in codes_[codes_used_++...] and copies it to 17 | // parts_ and output_. Returns true if a valid Grapheme was consumed, 18 | // otherwise does not increment codes_used_. 19 | bool ConsumeGraphemeIfValid() override; 20 | // Returns the CharClass corresponding to the given Unicode ch. 21 | Validator::CharClass UnicodeToCharClass(char32 ch) const override; 22 | 23 | private: 24 | // Helper consumes/copies a virama and any subscript consonant.
25 | // Returns true if the end of input is reached. 26 | bool ConsumeSubscriptIfPresent(); 27 | // Helper consumes/copies a series of optional signs. 28 | // Returns true if the end of input is reached. 29 | bool ConsumeOptionalSignsIfPresent(); 30 | // Returns true if the unicode is a Myanmar "letter" including consonants 31 | // and independent vowels. Although table 16-3 distinguishes between some 32 | // base consonants and vowels, the extensions make no such distinction, so we 33 | // put them all into a single bucket. 34 | static bool IsMyanmarLetter(char32 ch); 35 | // Returns true if ch is a Myanmar digit or other symbol that does not take 36 | // part in being a syllable. 37 | static bool IsMyanmarOther(char32 ch); 38 | 39 | // Some special unicodes used only for Myanmar processing. 40 | static const char32 kMyanmarAsat = 0x103a; 41 | static const char32 kMyanmarMedialYa = 0x103b; 42 | }; 43 | 44 | } // namespace tesseract 45 | 46 | #endif // TESSERACT_TRAINING_VALIDATE_MYANMAR_H_ 47 | -------------------------------------------------------------------------------- /unittest/lstm_recode_test.cc: -------------------------------------------------------------------------------- 1 | // (C) Copyright 2017, Google Inc. 2 | // Licensed under the Apache License, Version 2.0 (the "License"); 3 | // you may not use this file except in compliance with the License. 4 | // You may obtain a copy of the License at 5 | // http://www.apache.org/licenses/LICENSE-2.0 6 | // Unless required by applicable law or agreed to in writing, software 7 | // distributed under the License is distributed on an "AS IS" BASIS, 8 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 9 | // See the License for the specific language governing permissions and 10 | // limitations under the License. 11 | 12 | #include "lstm_test.h" 13 | 14 | namespace tesseract { 15 | 16 | // Tests that training with unicharset recoding learns faster than without, 17 | // for Korean. 
This test is split in two, so it can be run sharded. 18 | 19 | TEST_F(LSTMTrainerTest, RecodeTestKorBase) { 20 | // A basic single-layer, bi-di 1d LSTM on Korean. 21 | SetupTrainer("[1,1,0,32 Lbx96 O1c1]", "kor-full", "kor/kor.unicharset", 22 | "kor.Arial_Unicode_MS.exp0.lstmf", false, true, 5e-4, false, "kor"); 23 | double kor_full_err = TrainIterations(kTrainerIterations * 2); 24 | EXPECT_LT(kor_full_err, 88); 25 | // EXPECT_GT(kor_full_err, 85); 26 | LOG(INFO) << "********** Expected < 88 ************\n"; 27 | } 28 | 29 | TEST_F(LSTMTrainerTest, RecodeTestKor) { 30 | // A basic single-layer, bi-di 1d LSTM on Korean. 31 | SetupTrainer("[1,1,0,32 Lbx96 O1c1]", "kor-recode", "kor/kor.unicharset", 32 | "kor.Arial_Unicode_MS.exp0.lstmf", true, true, 5e-4, false, "kor"); 33 | double kor_recode_err = TrainIterations(kTrainerIterations); 34 | EXPECT_LT(kor_recode_err, 60); 35 | LOG(INFO) << "********** Expected < 60 ************\n"; 36 | } 37 | 38 | // Tests that the given string encodes and decodes back to the same 39 | // with both recode on and off for Korean. 40 | 41 | TEST_F(LSTMTrainerTest, EncodeDecodeBothTestKor) { 42 | TestEncodeDecodeBoth("kor", "한국어 위키백과에 오신 것을 환영합니다!"); 43 | } 44 | 45 | } // namespace tesseract. 46 | -------------------------------------------------------------------------------- /unittest/syntaxnet/base.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2016 Google Inc. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #ifndef SYNTAXNET_BASE_H_ 17 | #define SYNTAXNET_BASE_H_ 18 | 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | 26 | #ifdef INCLUDE_TENSORFLOW 27 | 28 | #include "google/protobuf/util/message_differencer.h" 29 | 30 | #include "tensorflow/core/lib/core/status.h" 31 | #include "tensorflow/core/lib/strings/strcat.h" 32 | #include "tensorflow/core/lib/strings/stringprintf.h" 33 | #include "tensorflow/core/platform/default/integral_types.h" 34 | #include "tensorflow/core/platform/mutex.h" 35 | #include "tensorflow/core/platform/protobuf.h" 36 | 37 | #endif 38 | 39 | using std::map; 40 | using std::pair; 41 | using std::unordered_map; 42 | using std::unordered_set; 43 | using std::vector; 44 | #ifdef INCLUDE_TENSORFLOW 45 | using tensorflow::int16; 46 | using tensorflow::int32; 47 | using tensorflow::int64; 48 | using tensorflow::int8; 49 | using tensorflow::mutex; 50 | using tensorflow::mutex_lock; 51 | using tensorflow::uint16; 52 | using tensorflow::uint32; 53 | using tensorflow::uint64; 54 | using tensorflow::uint8; 55 | using tensorflow::protobuf::TextFormat; 56 | #endif 57 | typedef signed int char32; 58 | 59 | using std::string; 60 | #ifdef INCLUDE_TENSORFLOW 61 | using tensorflow::StringPiece; 62 | #endif 63 | 64 | // namespace syntaxnet 65 | 66 | #endif // SYNTAXNET_BASE_H_ 67 | -------------------------------------------------------------------------------- /unittest/fileio_test.cc: -------------------------------------------------------------------------------- 1 | // (C) Copyright 2017, Google Inc. 2 | // Licensed under the Apache License, Version 2.0 (the "License"); 3 | // you may not use this file except in compliance with the License. 
4 | // You may obtain a copy of the License at 5 | // http://www.apache.org/licenses/LICENSE-2.0 6 | // Unless required by applicable law or agreed to in writing, software 7 | // distributed under the License is distributed on an "AS IS" BASIS, 8 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 9 | // See the License for the specific language governing permissions and 10 | // limitations under the License. 11 | 12 | #include 13 | #include 14 | 15 | #include "fileio.h" 16 | #include "include_gunit.h" 17 | 18 | namespace tesseract { 19 | 20 | TEST(FileTest, JoinPath) { 21 | EXPECT_EQ("/abc/def", File::JoinPath("/abc", "def")); 22 | EXPECT_EQ("/abc/def", File::JoinPath("/abc/", "def")); 23 | EXPECT_EQ("def", File::JoinPath("", "def")); 24 | } 25 | 26 | TEST(OutputBufferTest, WriteString) { 27 | const int kMaxBufSize = 128; 28 | char buffer[kMaxBufSize]; 29 | for (char &i : buffer) { 30 | i = '\0'; 31 | } 32 | FILE *fp = tmpfile(); 33 | CHECK(fp != nullptr); 34 | 35 | auto output = std::make_unique(fp); 36 | output->WriteString("Hello "); 37 | output->WriteString("world!"); 38 | 39 | rewind(fp); 40 | auto s = "Hello world!"; 41 | fread(buffer, strlen(s), 1, fp); 42 | EXPECT_STREQ(s, buffer); 43 | } 44 | 45 | TEST(InputBufferTest, Read) { 46 | const int kMaxBufSize = 128; 47 | char buffer[kMaxBufSize]; 48 | auto s = "Hello\n world!"; 49 | strncpy(buffer, s, kMaxBufSize); 50 | EXPECT_STREQ(s, buffer); 51 | FILE *fp = tmpfile(); 52 | CHECK(fp != nullptr); 53 | fwrite(buffer, strlen(s), 1, fp); 54 | rewind(fp); 55 | 56 | std::string str; 57 | auto input = std::make_unique(fp); 58 | EXPECT_TRUE(input->Read(&str)); 59 | std::vector lines = split(str, '\n'); 60 | EXPECT_EQ(2, lines.size()); 61 | EXPECT_EQ("Hello", lines[0]); 62 | EXPECT_EQ(" world!", lines[1]); 63 | } 64 | 65 | } // namespace tesseract 66 | -------------------------------------------------------------------------------- /src/ccmain/pgedit.h: 
-------------------------------------------------------------------------------- 1 | /////////////////////////////////////////////////////////////////////// 2 | // File: pgedit.h 3 | // Description: Page structure file editor 4 | // Author: Joern Wanke 5 | // 6 | // (C) Copyright 2007, Google Inc. 7 | // Licensed under the Apache License, Version 2.0 (the "License"); 8 | // you may not use this file except in compliance with the License. 9 | // You may obtain a copy of the License at 10 | // http://www.apache.org/licenses/LICENSE-2.0 11 | // Unless required by applicable law or agreed to in writing, software 12 | // distributed under the License is distributed on an "AS IS" BASIS, 13 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | // See the License for the specific language governing permissions and 15 | // limitations under the License. 16 | // 17 | /////////////////////////////////////////////////////////////////////// 18 | 19 | #ifndef PGEDIT_H 20 | #define PGEDIT_H 21 | 22 | #include "params.h" // for INT_VAR_H, IntParam, STRING_VAR_H, StringParam 23 | #include "scrollview.h" // for SVEvent (ptr only), SVEventHandler, ScrollView 24 | 25 | namespace tesseract { 26 | 27 | class BLOCK_LIST; 28 | class PAGE_RES; 29 | 30 | class Tesseract; 31 | 32 | #ifndef GRAPHICS_DISABLED 33 | // A small event handler class to process incoming events to 34 | // this window. 
35 | class PGEventHandler : public SVEventHandler { 36 | public: 37 | PGEventHandler(tesseract::Tesseract *tess) : tess_(tess) {} 38 | void Notify(const SVEvent *sve) override; 39 | 40 | private: 41 | tesseract::Tesseract *tess_; 42 | }; 43 | #endif // !GRAPHICS_DISABLED 44 | 45 | extern BLOCK_LIST *current_block_list; 46 | extern STRING_VAR_H(editor_image_win_name); 47 | extern INT_VAR_H(editor_image_xpos); 48 | extern INT_VAR_H(editor_image_ypos); 49 | extern INT_VAR_H(editor_image_word_bb_color); 50 | extern INT_VAR_H(editor_image_blob_bb_color); 51 | extern STRING_VAR_H(editor_word_name); 52 | extern INT_VAR_H(editor_word_xpos); 53 | extern INT_VAR_H(editor_word_ypos); 54 | extern INT_VAR_H(editor_word_height); 55 | extern INT_VAR_H(editor_word_width); 56 | 57 | } // namespace tesseract 58 | 59 | #endif 60 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | # Windows 3 | *.user.* 4 | *.idea* 5 | *.log 6 | *.tlog 7 | *.cache 8 | *.obj 9 | *.sdf 10 | *.opensdf 11 | *.lastbuildstate 12 | *.unsuccessfulbuild 13 | *.suo 14 | *.res 15 | *.ipch 16 | *.manifest 17 | 18 | # Linux 19 | # ignore local configuration 20 | config.* 21 | config/* 22 | Makefile 23 | Makefile.in 24 | *.m4 25 | 26 | # ignore help scripts/files 27 | configure 28 | libtool 29 | stamp-h1 30 | tesseract.pc 31 | config_auto.h 32 | /doc/html/* 33 | /doc/*.1 34 | /doc/*.5 35 | /doc/*.html 36 | /doc/*.xml 37 | 38 | # generated version file 39 | /include/tesseract/version.h 40 | 41 | # executables 42 | /tesseract 43 | /src/training/ambiguous_words 44 | /src/training/classifier_tester 45 | /src/training/cntraining 46 | /src/training/combine_tessdata 47 | /src/training/dawg2wordlist 48 | /src/training/merge_unicharsets 49 | /src/training/mftraining 50 | /src/training/set_unicharset_properties 51 | /src/training/shapeclustering 52 | /src/training/text2image 53 | 
/src/training/unicharset_extractor 54 | /src/training/wordlist2dawg 55 | 56 | *.patch 57 | 58 | # files generated by libtool 59 | /src/training/combine_lang_model 60 | /src/training/lstmeval 61 | /src/training/lstmtraining 62 | 63 | # ignore compilation files 64 | build/* 65 | /bin 66 | /cmake-* 67 | .deps 68 | .dirstamp 69 | /.libs 70 | */.libs/* 71 | */*/.deps/* 72 | */*/.libs/* 73 | *.lo 74 | *.la 75 | *.o 76 | *.Plo 77 | *.a 78 | *.class 79 | *.jar 80 | __pycache__ 81 | 82 | # tessdata 83 | *.traineddata 84 | tessdata_* 85 | 86 | # build dirs 87 | /build* 88 | /*.dll 89 | /*.lib 90 | /*.exe 91 | /*.lnk 92 | /win* 93 | .vs* 94 | .s* 95 | 96 | # files generated by "make check" 97 | /tests/.dirstamp 98 | /unittest/*.trs 99 | /unittest/tmp/* 100 | 101 | # test programs 102 | /unittest/*_test 103 | /unittest/primesbitvector 104 | /unittest/primesmap 105 | 106 | # generated files from unlvtests 107 | times.txt 108 | /unlvtests/results* 109 | 110 | # snap packaging specific rules 111 | /parts/ 112 | /stage/ 113 | /prime/ 114 | /snap/.snapcraft/ 115 | 116 | /*.snap 117 | /*_source.tar.bz2 118 | -------------------------------------------------------------------------------- /unittest/validate_khmer_test.cc: -------------------------------------------------------------------------------- 1 | // (C) Copyright 2017, Google Inc. 2 | // Licensed under the Apache License, Version 2.0 (the "License"); 3 | // you may not use this file except in compliance with the License. 4 | // You may obtain a copy of the License at 5 | // http://www.apache.org/licenses/LICENSE-2.0 6 | // Unless required by applicable law or agreed to in writing, software 7 | // distributed under the License is distributed on an "AS IS" BASIS, 8 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 9 | // See the License for the specific language governing permissions and 10 | // limitations under the License. 
11 | 12 | #include "include_gunit.h" 13 | #include "normstrngs.h" 14 | #include "normstrngs_test.h" 15 | 16 | namespace tesseract { 17 | 18 | // Test some random Khmer words. 19 | TEST(ValidateKhmerTest, GoodKhmerWords) { 20 | std::string str = "ព័ត៏មានប្លែកៗ"; 21 | ExpectGraphemeModeResults(str, UnicodeNormMode::kNFC, 13, 12, 7, str); 22 | str = "ទំនុកច្រៀង"; 23 | ExpectGraphemeModeResults(str, UnicodeNormMode::kNFC, 10, 9, 5, str); 24 | str = "កាលីហ្វូញ៉ា"; 25 | ExpectGraphemeModeResults(str, UnicodeNormMode::kNFC, 11, 10, 4, str); 26 | str = "ចាប់ពីផ្លូវ"; 27 | ExpectGraphemeModeResults(str, UnicodeNormMode::kNFC, 11, 10, 5, str); 28 | } 29 | 30 | // Test some random Khmer words with dotted circles. 31 | TEST(ValidateKhmerTest, BadKhmerWords) { 32 | std::string result; 33 | // Multiple dependent vowels not allowed 34 | std::string str = "\u1796\u17b6\u17b7"; 35 | EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNorm::kNormalize, 36 | str.c_str(), &result)); 37 | // Multiple shifters not allowed 38 | str = "\u1798\u17c9\u17ca"; 39 | EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNorm::kNormalize, 40 | str.c_str(), &result)); 41 | // Multiple signs not allowed 42 | str = "\u1780\u17b6\u17cb\u17cd"; 43 | EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNorm::kNormalize, 44 | str.c_str(), &result)); 45 | } 46 | 47 | } // namespace tesseract 48 | -------------------------------------------------------------------------------- /src/ccstruct/quadratc.h: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | * File: quadratc.h (Formerly quadrtic.h) 3 | * Description: Code for the QUAD_COEFFS class. 4 | * Author: Ray Smith 5 | * Created: Tue Oct 08 17:24:40 BST 1991 6 | * 7 | * (C) Copyright 1991, Hewlett-Packard Ltd. 
8 | ** Licensed under the Apache License, Version 2.0 (the "License"); 9 | ** you may not use this file except in compliance with the License. 10 | ** You may obtain a copy of the License at 11 | ** http://www.apache.org/licenses/LICENSE-2.0 12 | ** Unless required by applicable law or agreed to in writing, software 13 | ** distributed under the License is distributed on an "AS IS" BASIS, 14 | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | ** See the License for the specific language governing permissions and 16 | ** limitations under the License. 17 | * 18 | **********************************************************************/ 19 | 20 | #ifndef QUADRATC_H 21 | #define QUADRATC_H 22 | 23 | #include "points.h" 24 | 25 | namespace tesseract { 26 | 27 | class QUAD_COEFFS { 28 | public: 29 | QUAD_COEFFS() = default; 30 | QUAD_COEFFS( // constructor 31 | double xsq, // coefficients 32 | float x, float constant) { 33 | a = xsq; 34 | b = x; 35 | c = constant; 36 | } 37 | 38 | float y( // evaluate 39 | float x) const { // at x 40 | return static_cast((a * x + b) * x + c); 41 | } 42 | 43 | void move( // reposition word 44 | ICOORD vec) { // by vector 45 | /************************************************************ 46 | y - q = a (x - p)^2 + b (x - p) + c 47 | y - q = ax^2 - 2apx + ap^2 + bx - bp + c 48 | y = ax^2 + (b - 2ap)x + (c - bp + ap^2 + q) 49 | ************************************************************/ 50 | int16_t p = vec.x(); 51 | int16_t q = vec.y(); 52 | 53 | c = static_cast(c - b * p + a * p * p + q); 54 | b = static_cast(b - 2 * a * p); 55 | } 56 | 57 | double a; // x squared 58 | float b; // x 59 | float c; // constant 60 | private: 61 | }; 62 | 63 | } // namespace tesseract 64 | 65 | #endif 66 | -------------------------------------------------------------------------------- /java/com/google/scrollview/ui/SVAbstractMenuItem.java: -------------------------------------------------------------------------------- 1 | // 
Copyright 2007 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); You may not 4 | // use this file except in compliance with the License. You may obtain a copy of 5 | // the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by 6 | // applicable law or agreed to in writing, software distributed under the 7 | // License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 8 | // OF ANY KIND, either express or implied. See the License for the specific 9 | // language governing permissions and limitations under the License. 10 | 11 | package com.google.scrollview.ui; 12 | 13 | /** 14 | * A MenuListItem is any sort of menu entry. This can either be within a popup 15 | * menu or within a menubar. It can either be a submenu (only name and 16 | * command-id) or a name with an associated value and possibly description. They 17 | * can also have new entries added (if they are submenus). 18 | * 19 | * @author wanke@google.com 20 | */ 21 | 22 | import com.google.scrollview.events.SVEventType; 23 | 24 | import javax.swing.JMenu; 25 | import javax.swing.JMenuItem; 26 | 27 | abstract class SVAbstractMenuItem { 28 | JMenuItem mi; 29 | public String name; 30 | public int id; 31 | 32 | /** 33 | * Sets the basic attributes for name, id and the corresponding swing item 34 | */ 35 | SVAbstractMenuItem(int id, String name, JMenuItem jmi) { 36 | this.mi = jmi; 37 | this.name = name; 38 | this.id = id; 39 | } 40 | 41 | /** Returns the actual value of the MenuListItem. */ 42 | public String getValue() { return null; } 43 | 44 | /** Adds a child entry to the submenu. */ 45 | public void add(SVAbstractMenuItem mli) { } 46 | 47 | /** Adds a child menu to the submenu (or root node). */ 48 | public void add(JMenu jli) { } 49 | 50 | /** 51 | * What to do when user clicks on this item. 52 | * @param window The window the event happened. 
53 | * @param eventType What kind of event will be associated 54 | * (usually SVET_POPUP or SVET_MENU). 55 | */ 56 | public void performAction(SVWindow window, SVEventType eventType) {} 57 | } 58 | -------------------------------------------------------------------------------- /java/com/google/scrollview/ui/SVCheckboxMenuItem.java: -------------------------------------------------------------------------------- 1 | // Copyright 2007 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); You may not 4 | // use this file except in compliance with the License. You may obtain a copy of 5 | // the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by 6 | // applicable law or agreed to in writing, software distributed under the 7 | // License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 8 | // OF ANY KIND, either express or implied. See the License for the specific 9 | // language governing permissions and limitations under the License. 10 | 11 | package com.google.scrollview.ui; 12 | 13 | /** 14 | * A MenuListItem is any sort of menu entry. This can either be within a popup 15 | * menu or within a menubar. It can either be a submenu (only name and 16 | * command-id) or a name with an associated value and possibly description. They 17 | * can also have new entries added (if they are submenus). 18 | * 19 | * @author wanke@google.com 20 | */ 21 | 22 | import com.google.scrollview.ScrollView; 23 | import com.google.scrollview.events.SVEvent; 24 | import com.google.scrollview.events.SVEventType; 25 | 26 | import javax.swing.JCheckBoxMenuItem; 27 | 28 | /** 29 | * Constructs a new menulistitem which possesses a flag that can be toggled. 
30 | */ 31 | class SVCheckboxMenuItem extends SVAbstractMenuItem { 32 | public boolean bvalue; 33 | 34 | SVCheckboxMenuItem(int id, String name, boolean val) { 35 | super(id, name, new JCheckBoxMenuItem(name, val)); 36 | bvalue = val; 37 | } 38 | 39 | /** What to do when user clicks on this item. */ 40 | @Override 41 | public void performAction(SVWindow window, SVEventType eventType) { 42 | // Checkbox entry - trigger and send event. 43 | if (bvalue) { 44 | bvalue = false; 45 | } else { 46 | bvalue = true; 47 | } 48 | SVEvent svme = new SVEvent(eventType, window, id, getValue()); 49 | ScrollView.addMessage(svme); 50 | } 51 | 52 | /** Returns the actual value of the MenuListItem. */ 53 | @Override 54 | public String getValue() { 55 | return Boolean.toString(bvalue); 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /src/textord/sortflts.h: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | * File: sortflts.h (Formerly sfloats.h) 3 | * Description: Code to maintain a sorted list of floats. 4 | * Author: Ray Smith 5 | * 6 | * (C) Copyright 1993, Hewlett-Packard Ltd. 7 | ** Licensed under the Apache License, Version 2.0 (the "License"); 8 | ** you may not use this file except in compliance with the License. 9 | ** You may obtain a copy of the License at 10 | ** http://www.apache.org/licenses/LICENSE-2.0 11 | ** Unless required by applicable law or agreed to in writing, software 12 | ** distributed under the License is distributed on an "AS IS" BASIS, 13 | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | ** See the License for the specific language governing permissions and 15 | ** limitations under the License. 
16 | * 17 | **********************************************************************/ 18 | 19 | #ifndef SORTFLTS_H 20 | #define SORTFLTS_H 21 | 22 | #include "elst.h" 23 | 24 | namespace tesseract { 25 | 26 | class SORTED_FLOAT : public ELIST_LINK { 27 | friend class SORTED_FLOATS; 28 | 29 | public: 30 | SORTED_FLOAT() = default; 31 | SORTED_FLOAT( // create one 32 | float value, // value of entry 33 | int32_t key) { // reference 34 | entry = value; 35 | address = key; 36 | } 37 | 38 | private: 39 | float entry; // value of float 40 | int32_t address; // key 41 | }; 42 | 43 | ELISTIZEH(SORTED_FLOAT) 44 | class SORTED_FLOATS { 45 | public: 46 | /** empty constructor */ 47 | SORTED_FLOATS() { 48 | it.set_to_list(&list); 49 | } 50 | /** 51 | * add sample 52 | * @param value sample float 53 | * @param key retrieval key 54 | */ 55 | void add(float value, int32_t key); 56 | /** 57 | * delete sample 58 | * @param key key to delete 59 | */ 60 | void remove(int32_t key); 61 | /** 62 | * index to list 63 | * @param index item to get 64 | */ 65 | float operator[](int32_t index); 66 | 67 | private: 68 | SORTED_FLOAT_LIST list; // list of floats 69 | SORTED_FLOAT_IT it; // iterator built-in 70 | }; 71 | 72 | } // namespace tesseract 73 | 74 | #endif 75 | -------------------------------------------------------------------------------- /src/wordrec/render.h: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * 3 | * File: render.h 4 | * Description: Convert the various data type into line lists 5 | * Author: Mark Seaman, OCR Technology 6 | * 7 | * (c) Copyright 1989, Hewlett-Packard Company. 8 | ** Licensed under the Apache License, Version 2.0 (the "License"); 9 | ** you may not use this file except in compliance with the License. 
/* Declarations for displaying blobs, outlines and edge points in a ScrollView window. */
namespace tesseract {

// Forward declarations: only pointers to these types appear below.
struct EDGEPT;
struct TBLOB;
struct TESSLINE;

/*----------------------------------------------------------------------
        V a r i a b l e s
----------------------------------------------------------------------*/
extern ScrollView *blob_window;        // Window for blobs
extern ScrollView::Color color_list[]; // Colors for outlines

extern BOOL_VAR_H(wordrec_display_all_blobs);

extern BOOL_VAR_H(wordrec_blob_pause);

// NOTE(review): presumably the number of entries in color_list - confirm
// against the definition.
#define NUM_COLORS 6

/*----------------------------------------------------------------------
        F u n c t i o n s
----------------------------------------------------------------------*/
// Displays a blob in the given color.
// NOTE(review): presumably draws into blob_window - confirm in render.cpp.
void display_blob(TBLOB *blob, ScrollView::Color color);

// Renders a blob into the given window in the given color.
void render_blob(ScrollView *window, TBLOB *blob, ScrollView::Color color);

// Renders a chain of edge points into the given window in the given color.
void render_edgepts(ScrollView *window, EDGEPT *edgept, ScrollView::Color color);

// Renders an outline into the given window in the given color.
void render_outline(ScrollView *window, TESSLINE *outline, ScrollView::Color color);

} // namespace tesseract
/** 2 | * Copyright 2010 Google Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | // Author: sligocki@google.com (Shawn Ligocki) 18 | 19 | #include "util/utf8/unilib.h" 20 | 21 | #include "syntaxnet/base.h" 22 | #include "third_party/utf/utf.h" 23 | 24 | namespace UniLib { 25 | 26 | // Codepoints not allowed for interchange are: 27 | // C0 (ASCII) controls: U+0000 to U+001F excluding Space (SP, U+0020), 28 | // Horizontal Tab (HT, U+0009), Line-Feed (LF, U+000A), 29 | // Form Feed (FF, U+000C) and Carriage-Return (CR, U+000D) 30 | // C1 controls: U+007F to U+009F 31 | // Surrogates: U+D800 to U+DFFF 32 | // Non-characters: U+FDD0 to U+FDEF and U+xxFFFE to U+xxFFFF for all xx 33 | bool IsInterchangeValid(char32 c) { 34 | return !((c >= 0x00 && c <= 0x08) || c == 0x0B || (c >= 0x0E && c <= 0x1F) || 35 | (c >= 0x7F && c <= 0x9F) || (c >= 0xD800 && c <= 0xDFFF) || 36 | (c >= 0xFDD0 && c <= 0xFDEF) || (c & 0xFFFE) == 0xFFFE); 37 | } 38 | 39 | int SpanInterchangeValid(const char *begin, int byte_length) { 40 | char32 rune; 41 | const char *p = begin; 42 | const char *end = begin + byte_length; 43 | while (p < end) { 44 | int bytes_consumed = charntorune(&rune, p, end - p); 45 | // We want to accept Runeerror == U+FFFD as a valid char, but it is used 46 | // by chartorune to indicate error. Luckily, the real codepoint is size 3 47 | // while errors return bytes_consumed <= 1. 
48 | if ((rune == Runeerror && bytes_consumed <= 1) || !IsInterchangeValid(rune)) { 49 | break; // Found 50 | } 51 | p += bytes_consumed; 52 | } 53 | return p - begin; 54 | } 55 | 56 | } // namespace UniLib 57 | -------------------------------------------------------------------------------- /unittest/log.h: -------------------------------------------------------------------------------- 1 | /////////////////////////////////////////////////////////////////////// 2 | // File: log.h 3 | // Description: Include for custom log message for unittest for tesseract. 4 | // based on 5 | // https://stackoverflow.com/questions/16491675/how-to-send-custom-message-in-google-c-testing-framework 6 | // 7 | // Licensed under the Apache License, Version 2.0 (the "License"); 8 | // you may not use this file except in compliance with the License. 9 | // You may obtain a copy of the License at 10 | // http://www.apache.org/licenses/LICENSE-2.0 11 | // Unless required by applicable law or agreed to in writing, software 12 | // distributed under the License is distributed on an "AS IS" BASIS, 13 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | // See the License for the specific language governing permissions and 15 | // limitations under the License. 16 | /////////////////////////////////////////////////////////////////////// 17 | 18 | #ifndef TESSERACT_UNITTEST_LOG_H_ 19 | #define TESSERACT_UNITTEST_LOG_H_ 20 | 21 | // This is a minimal implementation of the TensorFlow logging API 22 | // which is sufficient for the Tesseract unit tests. 23 | 24 | // See tensorflow/core/platform/default/logging.h for the original code. 25 | 26 | #include 27 | 28 | enum LogLevel { INFO, WARNING, ERROR, FATAL }; 29 | 30 | // Avoid conflict with logging.h from TensorFlow. 
31 | #undef LOG 32 | 33 | static inline std::ostream &LOG(enum LogLevel level) { 34 | switch (level) { 35 | case INFO: 36 | std::cout << "[INFO] "; 37 | break; 38 | case WARNING: 39 | std::cout << "[WARN] "; 40 | break; 41 | case ERROR: 42 | std::cout << "[ERROR] "; 43 | break; 44 | case FATAL: 45 | std::cout << "[FATAL] "; 46 | break; 47 | } 48 | return std::cout; 49 | } 50 | 51 | // Avoid conflict with logging.h from TensorFlow. 52 | #undef QCHECK 53 | 54 | // https://github.com/google/ion/blob/master/ion/base/logging.h 55 | static inline std::ostream &QCHECK(bool condition) { 56 | if (condition) { 57 | static std::ostream null_stream(nullptr); 58 | return null_stream; 59 | } 60 | return std::cout; 61 | } 62 | 63 | #endif // TESSERACT_UNITTEST_LOG_H_ 64 | -------------------------------------------------------------------------------- /src/ccstruct/quadlsq.h: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | * File: quadlsq.h (Formerly qlsq.h) 3 | * Description: Code for least squares approximation of quadratics. 4 | * Author: Ray Smith 5 | * 6 | * (C) Copyright 1993, Hewlett-Packard Ltd. 7 | ** Licensed under the Apache License, Version 2.0 (the "License"); 8 | ** you may not use this file except in compliance with the License. 9 | ** You may obtain a copy of the License at 10 | ** http://www.apache.org/licenses/LICENSE-2.0 11 | ** Unless required by applicable law or agreed to in writing, software 12 | ** distributed under the License is distributed on an "AS IS" BASIS, 13 | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | ** See the License for the specific language governing permissions and 15 | ** limitations under the License. 
/* Least squares approximation of quadratics. */
namespace tesseract {

/**
 * Accumulator for a least-squares fit.
 * Samples are fed in with add() (and taken out with remove()); fit() computes
 * the result, which is then read through get_a(), get_b() and get_c().
 * Only the inline members are defined here; clear(), add(), remove() and
 * fit() are defined elsewhere.
 */
class QLSQ {
public:
  QLSQ() {   // constructor
    clear(); // set to zeros
  }
  void clear(); // initialize

  void add(     // add element
      double x, // coords to add
      double y);
  void remove(  // delete element
      double x, // coords to delete
      double y);
  /// Number of elements currently accumulated. Const-qualified because it
  /// only reads state.
  int32_t count() const { // no of elements
    return n;
  }

  /// Fits to the accumulated samples.
  /// @param degree requested degree of the fit.
  /// NOTE(review): the original comment says "return actual", but the
  /// function returns void - confirm how a reduced degree is reported.
  void fit(         // fit the given
      int degree);
  double get_a() const { // result a: x squared coefficient
    return a;
  }
  // NOTE(review): the original comments on get_b/get_c were copies of the
  // get_a comment. Presumably b is the linear coefficient and c the constant
  // term - confirm against fit().
  double get_b() const { // result b
    return b;
  }
  double get_c() const { // result c
    return c;
  }

private:
  int32_t n;           // no of elements
  double a, b, c;      // result
  double sigx;         // sum of x
  double sigy;         // sum of y
  double sigxx;        // sum x squared
  double sigxy;        // sum of xy
  double sigyy;        // sum y squared
  long double sigxxx;  // sum x cubed
  long double sigxxy;  // sum xsquared y
  long double sigxxxx; // sum x fourth
};

} // namespace tesseract
/* Declarations of the code that chops blobs apart from underlines. */
namespace tesseract {

extern double_VAR_H(textord_underline_offset);
extern BOOL_VAR_H(textord_restore_underlines);
void restore_underlined_blobs( // get chop points
    TO_BLOCK *block            // block to do
);
TO_ROW *most_overlapping_row( // find best row
    TO_ROW_LIST *rows,        // list of rows
    BLOBNBOX *blob            // blob to place
);
void find_underlined_blobs(    // get chop points
    BLOBNBOX *u_line,          // underlined unit
    QSPLINE *baseline,         // actual baseline
    float xheight,             // height of line
    float baseline_offset,     // amount to shrink it
    ICOORDELT_LIST *chop_cells // places to chop
);
void vertical_cunderline_projection( // project outlines
    C_OUTLINE *outline,              // outline to project
    QSPLINE *baseline,               // actual baseline
    float xheight,                   // height of line
    float baseline_offset,           // amount to shrink it
    STATS *lower_proj,               // below baseline
    STATS *middle_proj,              // centre region
    STATS *upper_proj                // top region
);

} // namespace tesseract
https://www.gnu.org/software/autoconf-archive/ax_check_compile_flag.html 3 | # =========================================================================== 4 | # 5 | # SYNOPSIS 6 | # 7 | # AX_CHECK_COMPILE_FLAG(FLAG, [ACTION-SUCCESS], [ACTION-FAILURE], [EXTRA-FLAGS], [INPUT]) 8 | # 9 | # DESCRIPTION 10 | # 11 | # Check whether the given FLAG works with the current language's compiler 12 | # or gives an error. (Warnings, however, are ignored) 13 | # 14 | # ACTION-SUCCESS/ACTION-FAILURE are shell commands to execute on 15 | # success/failure. 16 | # 17 | # If EXTRA-FLAGS is defined, it is added to the current language's default 18 | # flags (e.g. CFLAGS) when the check is done. The check is thus made with 19 | # the flags: "CFLAGS EXTRA-FLAGS FLAG". This can for example be used to 20 | # force the compiler to issue an error when a bad flag is given. 21 | # 22 | # INPUT gives an alternative input source to AC_COMPILE_IFELSE. 23 | # 24 | # NOTE: Implementation based on AX_CFLAGS_GCC_OPTION. Please keep this 25 | # macro in sync with AX_CHECK_{PREPROC,LINK}_FLAG. 26 | # 27 | # LICENSE 28 | # 29 | # Copyright (c) 2008 Guido U. Draheim 30 | # Copyright (c) 2011 Maarten Bosmans 31 | # 32 | # Copying and distribution of this file, with or without modification, are 33 | # permitted in any medium without royalty provided the copyright notice 34 | # and this notice are preserved. This file is offered as-is, without any 35 | # warranty. 
#serial 6

dnl Temporarily appends EXTRA-FLAGS and FLAG to the current language's FLAGS
dnl variable, test-compiles INPUT (or an empty program), caches the outcome
dnl and restores the saved flags before running the success/failure action.
AC_DEFUN([AX_CHECK_COMPILE_FLAG],
[AC_PREREQ(2.64)dnl for _AC_LANG_PREFIX and AS_VAR_IF
AS_VAR_PUSHDEF([CACHEVAR],[ax_cv_check_[]_AC_LANG_ABBREV[]flags_$4_$1])dnl
AC_CACHE_CHECK([whether _AC_LANG compiler accepts $1], CACHEVAR, [
  ax_check_save_flags=$[]_AC_LANG_PREFIX[]FLAGS
  _AC_LANG_PREFIX[]FLAGS="$[]_AC_LANG_PREFIX[]FLAGS $4 $1"
  AC_COMPILE_IFELSE([m4_default([$5],[AC_LANG_PROGRAM()])],
    [AS_VAR_SET(CACHEVAR,[yes])],
    [AS_VAR_SET(CACHEVAR,[no])])
  _AC_LANG_PREFIX[]FLAGS=$ax_check_save_flags])
AS_VAR_IF(CACHEVAR,yes,
  [m4_default([$2], :)],
  [m4_default([$3], :)])
AS_VAR_POPDEF([CACHEVAR])dnl
])dnl AX_CHECK_COMPILE_FLAG
namespace tesseract {

// Pile counts for the values 0..15: value i is added kTestData[i] times.
// The counts sum to 37 and the largest pile (12) belongs to value 2.
const int kTestData[] = {2, 0, 12, 1, 1, 2, 10, 1, 0, 0, 0, 2, 0, 4, 1, 1};

// Fixture that fills a STATS object over the range set by set_range(0, 15)
// with kTestData.
class STATSTest : public testing::Test {
public:
  void SetUp() override {
    // Switch the global locale to the user's environment locale.
    std::locale::global(std::locale(""));
    stats_.set_range(0, 15);
    for (size_t i = 0; i < countof(kTestData); ++i) {
      stats_.add(i, kTestData[i]);
    }
  }

  void TearDown() override {}

  STATS stats_;
};

// Tests some basic numbers from the stats_.
TEST_F(STATSTest, BasicStats) {
  EXPECT_EQ(37, stats_.get_total());
  EXPECT_EQ(2, stats_.mode());
  EXPECT_EQ(12, stats_.pile_count(2));
}

// A default-constructed STATS reports zero for total, mode and pile count.
TEST_F(STATSTest, InitStats) {
  STATS stats;
  EXPECT_EQ(0, stats.get_total());
  EXPECT_EQ(0, stats.mode());
  EXPECT_EQ(0, stats.pile_count(2));
}

// Tests the top_n_modes function.
TEST_F(STATSTest, TopNModes) {
  // NOTE(review): the element type of this vector appears to have been lost
  // in this copy of the file (the template argument list is missing); the
  // uses below need a pair-like type with key() and data() - restore it from
  // upstream.
  std::vector > modes;
  int num_modes = stats_.top_n_modes(3, modes);
  EXPECT_EQ(3, num_modes);
  // Mode0 is 12 1 1 = 14 total count with a mean of 2 3/14.
  EXPECT_FLOAT_EQ(2.0f + 3.0f / 14, modes[0].key());
  EXPECT_EQ(14, modes[0].data());
  // Mode 1 is 2 10 1 = 13 total count with a mean of 5 12/13.
  EXPECT_FLOAT_EQ(5.0f + 12.0f / 13, modes[1].key());
  EXPECT_EQ(13, modes[1].data());
  // Mode 2 is 4 1 1 = 6 total count with a mean of 13.5.
  EXPECT_FLOAT_EQ(13.5f, modes[2].key());
  EXPECT_EQ(6, modes[2].data());
}

} // namespace tesseract
2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); You may not 4 | // use this file except in compliance with the License. You may obtain a copy of 5 | // the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by 6 | // applicable law or agreed to in writing, software distributed under the 7 | // License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 8 | // OF ANY KIND, either express or implied. See the License for the specific 9 | // language governing permissions and limitations under the License. 10 | 11 | package com.google.scrollview.ui; 12 | 13 | /** 14 | * A MenuListItem is any sort of menu entry. This can either be within a popup 15 | * menu or within a menubar. It can either be a submenu (only name and 16 | * command-id) or a name with an associated value and possibly description. They 17 | * can also have new entries added (if they are submenus). 18 | * 19 | * @author wanke@google.com 20 | */ 21 | 22 | import com.google.scrollview.events.SVEventType; 23 | 24 | import javax.swing.JMenuItem; 25 | 26 | /** 27 | * Constructs a new menulistitem which also has a value and a description. For 28 | * these, we will not have to ask the server what the value is when the user 29 | * wants to change it, but can just call the client with the new value. 30 | */ 31 | class SVMenuItem extends SVAbstractMenuItem { 32 | public String value = null; 33 | public String desc = null; 34 | 35 | SVMenuItem(int id, String name, String v, String d) { 36 | super(id, name, new JMenuItem(name)); 37 | value = v; 38 | desc = d; 39 | } 40 | 41 | /** 42 | * Ask the user for new input for a variable and send it. 43 | * Depending on whether there is a description given for the entry, show 44 | * the description in the dialog or just show the name. 
45 | */ 46 | @Override 47 | public void performAction(SVWindow window, SVEventType eventType) { 48 | if (desc != null) { 49 | window.showInputDialog(desc, value, id, eventType); 50 | } else { 51 | window.showInputDialog(name, value, id, eventType); 52 | } 53 | } 54 | 55 | /** Returns the actual value of the MenuListItem. */ 56 | @Override 57 | public String getValue() { 58 | return value; 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /unittest/colpartition_test.cc: -------------------------------------------------------------------------------- 1 | // (C) Copyright 2017, Google Inc. 2 | // Licensed under the Apache License, Version 2.0 (the "License"); 3 | // you may not use this file except in compliance with the License. 4 | // You may obtain a copy of the License at 5 | // http://www.apache.org/licenses/LICENSE-2.0 6 | // Unless required by applicable law or agreed to in writing, software 7 | // distributed under the License is distributed on an "AS IS" BASIS, 8 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 9 | // See the License for the specific language governing permissions and 10 | // limitations under the License. 
namespace tesseract {

// Test helper that exposes a way to set the first and last column of a
// ColPartition in one call.
class TestableColPartition : public ColPartition {
public:
  void SetColumnRange(int first, int last) {
    set_first_column(first);
    set_last_column(last);
  }
};

// Fixture for the IsInSameColumnAs tests.
class ColPartitionTest : public testing::Test {
protected:
  void SetUp() override {
    // Switch the global locale to the user's environment locale.
    std::locale::global(std::locale(""));
  }

  void TearDown() override {}
};

// A partition is always in the same column as itself.
TEST_F(ColPartitionTest, IsInSameColumnAsReflexive) {
  TestableColPartition a, b;
  a.SetColumnRange(1, 2);
  b.SetColumnRange(3, 3);

  EXPECT_TRUE(a.IsInSameColumnAs(a));
  EXPECT_TRUE(b.IsInSameColumnAs(b));
}

// Ranges that share a border column match (in both directions); ranges with
// no common column do not.
TEST_F(ColPartitionTest, IsInSameColumnAsBorders) {
  TestableColPartition a, b, c, d;
  a.SetColumnRange(0, 1);
  b.SetColumnRange(1, 2);
  c.SetColumnRange(2, 3);
  d.SetColumnRange(4, 5);

  EXPECT_TRUE(a.IsInSameColumnAs(b));
  EXPECT_TRUE(b.IsInSameColumnAs(a));
  EXPECT_FALSE(c.IsInSameColumnAs(d));
  EXPECT_FALSE(d.IsInSameColumnAs(c));
  EXPECT_FALSE(a.IsInSameColumnAs(d));
}

// A range fully contained in another one matches it in both directions.
TEST_F(ColPartitionTest, IsInSameColumnAsSuperset) {
  TestableColPartition a, b;
  a.SetColumnRange(4, 7);
  b.SetColumnRange(2, 8);

  EXPECT_TRUE(a.IsInSameColumnAs(b));
  EXPECT_TRUE(b.IsInSameColumnAs(a));
}

// Partially overlapping ranges match in both directions.
TEST_F(ColPartitionTest, IsInSameColumnAsPartialOverlap) {
  TestableColPartition a, b;
  a.SetColumnRange(3, 8);
  b.SetColumnRange(6, 10);

  EXPECT_TRUE(a.IsInSameColumnAs(b));
  EXPECT_TRUE(b.IsInSameColumnAs(a));
}

} // namespace tesseract
################################################################################

# Build and install a static Leptonica, which the Tesseract build below uses.
cd "$SRC/leptonica"
./autogen.sh
./configure --disable-shared
make SUBDIRS=src install -j"$(nproc)"
ldconfig

# Build a static Tesseract with _GLIBCXX_DEBUG defined.
cd "$SRC/tesseract"
./autogen.sh
CXXFLAGS="$CXXFLAGS -D_GLIBCXX_DEBUG" ./configure --disable-graphics --disable-shared
make -j"$(nproc)"

# Get the models which are needed for the fuzzers.

mkdir -p "$OUT/tessdata"
(
  cd "$OUT/tessdata"
  # --fail makes curl exit non-zero on an HTTP error instead of saving the
  # error page under the name eng.traineddata.
  test -f eng.traineddata || \
    curl --fail -L -O https://github.com/tesseract-ocr/tessdata/raw/main/eng.traineddata
)

# OSS-Fuzz requires static linking for the project specific libraries,
# so get the list of those libraries for Leptonica and TIFF.
# Note that libm must be linked dynamically to avoid linker errors.

LEPTONICA_CFLAGS=$(pkg-config --cflags lept)
LEPTONICA_LIBS=$(pkg-config --static --libs lept)
LIBTIFF_LIBS=$(pkg-config --static --libs libtiff-4 | sed 's/ -lm//')

# Compiles fuzzer-api.cpp into $OUT/<name>.
# Usage: build_fuzzer <name> [extra compiler flags...]
build_fuzzer() {
  local output_name=$1
  shift
  # The flag variables hold several words each and must stay unquoted so
  # that the shell splits them into separate arguments.
  # shellcheck disable=SC2086
  $CXX $CXXFLAGS \
    "$@" \
    -I "$SRC/tesseract/include" \
    "$SRC/tesseract/unittest/fuzzers/fuzzer-api.cpp" -o "$OUT/$output_name" \
    "$SRC/tesseract/.libs/libtesseract.a" \
    $LEPTONICA_CFLAGS \
    -Wl,-Bstatic $LEPTONICA_LIBS $LIBTIFF_LIBS -Wl,-Bdynamic \
    $LIB_FUZZING_ENGINE
}

build_fuzzer fuzzer-api

build_fuzzer fuzzer-api-512x256 \
  -DTESSERACT_FUZZER_WIDTH=512 \
  -DTESSERACT_FUZZER_HEIGHT=256
// Computes and returns the dot product of the two n-vectors u and v.
// Eight products are accumulated per iteration in two 4-lane registers with
// fused multiply-add; the remaining (fewer than eight) elements are added
// one by one.
float DotProductNEON(const float *u, const float *v, int n) {
  float32x4_t acc_lo = vdupq_n_f32(0.0f); // lanes for elements 0..3 of each group
  float32x4_t acc_hi = vdupq_n_f32(0.0f); // lanes for elements 4..7 of each group
  for (; n >= 8; n -= 8, u += 8, v += 8) {
    const float32x4_t u_lo = vld1q_f32(u);
    const float32x4_t v_lo = vld1q_f32(v);
    const float32x4_t u_hi = vld1q_f32(u + 4);
    const float32x4_t v_hi = vld1q_f32(v + 4);
    acc_lo = vfmaq_f32(acc_lo, u_lo, v_lo);
    acc_hi = vfmaq_f32(acc_hi, u_hi, v_hi);
  }
  // Horizontal sums of both accumulators, low half first.
  float total = vaddvq_f32(acc_lo);
  total += vaddvq_f32(acc_hi);
  // Scalar tail.
  for (; n > 0; --n) {
    total += *u++ * *v++;
  }
  return total;
}
56 | TFloat DotProductNEON(const TFloat *u, const TFloat *v, int n) { 57 | TFloat total = 0; 58 | #if defined(OPENMP_SIMD) || defined(_OPENMP) 59 | #pragma omp simd reduction(+:total) 60 | #endif 61 | for (int k = 0; k < n; k++) { 62 | total += u[k] * v[k]; 63 | } 64 | return total; 65 | } 66 | 67 | #endif 68 | 69 | } // namespace tesseract 70 | 71 | #endif /* __ARM_NEON */ 72 | -------------------------------------------------------------------------------- /src/classify/fpoint.h: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | ** Filename: fpoint.h 3 | ** Purpose: Abstract data type for 2D points (floating point coords) 4 | ** Author: Dan Johnson 5 | ** 6 | ** (c) Copyright Hewlett-Packard Company, 1988. 7 | ** (c) Copyright Hewlett-Packard Company, 1988. 8 | ** Licensed under the Apache License, Version 2.0 (the "License"); 9 | ** you may not use this file except in compliance with the License. 10 | ** You may obtain a copy of the License at 11 | ** http://www.apache.org/licenses/LICENSE-2.0 12 | ** Unless required by applicable law or agreed to in writing, software 13 | ** distributed under the License is distributed on an "AS IS" BASIS, 14 | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | ** See the License for the specific language governing permissions and 16 | ** limitations under the License. 
/* define data structure to hold 2D points or vectors using floating point */
struct FPOINT {
  float x, y;
};
using FVECTOR = FPOINT;

/**----------------------------------------------------------------------------
            Macros
----------------------------------------------------------------------------**/
/* macros for computing miscellaneous functions of 2 points */
/* Every macro argument is parenthesized, so arbitrary expressions (for
   example *ptr) can be passed. Arguments may be evaluated more than once. */
#define XDelta(A, B) ((B).x - (A).x)
#define YDelta(A, B) ((B).y - (A).y)
/* Slope of the line from A to B; divides by XDelta, so a vertical line
   (equal x coordinates) yields a floating point division by zero. */
#define SlopeFrom(A, B) (YDelta(A, B) / XDelta(A, B))
/* Angle of the vector from A to B as returned by atan2 (a double). */
#define AngleFrom(A, B) (atan2((double)YDelta(A, B), (double)XDelta(A, B)))

/* Evaluates the line through A and B at abscissa X, i.e. yields the y value
   of that line at X. */
#define XIntersectionOf(A, B, X) (SlopeFrom(A, B) * ((X) - (A).x) + (A).y)

/*-------------------------------------------------------------------------
        Public Function Prototypes
---------------------------------------------------------------------------*/

float DistanceBetween(FPOINT A, FPOINT B);

float NormalizedAngleFrom(FPOINT *Point1, FPOINT *Point2, float FullScale);
7 | // Licensed under the Apache License, Version 2.0 (the "License"); 8 | // you may not use this file except in compliance with the License. 9 | // You may obtain a copy of the License at 10 | // http://www.apache.org/licenses/LICENSE-2.0 11 | // Unless required by applicable law or agreed to in writing, software 12 | // distributed under the License is distributed on an "AS IS" BASIS, 13 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | // See the License for the specific language governing permissions and 15 | // limitations under the License. 16 | // 17 | /////////////////////////////////////////////////////////////////////// 18 | 19 | #include "commontraining.h" // CheckSharedLibraryVersion 20 | #include "unicharset.h" 21 | 22 | int main(int argc, char **argv) { 23 | tesseract::CheckSharedLibraryVersion(); 24 | 25 | if (argc > 1 && (!strcmp(argv[1], "-v") || !strcmp(argv[1], "--version"))) { 26 | printf("%s\n", tesseract::TessBaseAPI::Version()); 27 | return EXIT_SUCCESS; 28 | } else if (argc < 4) { 29 | // Print usage 30 | printf( 31 | "Usage: %s -v | --version |\n" 32 | " %s unicharset-in-1 ... unicharset-in-n unicharset-out\n", 33 | argv[0], argv[0]); 34 | return EXIT_FAILURE; 35 | } 36 | 37 | tesseract::UNICHARSET input_unicharset, result_unicharset; 38 | for (int arg = 1; arg < argc - 1; ++arg) { 39 | // Load the input unicharset 40 | if (input_unicharset.load_from_file(argv[arg])) { 41 | printf("Loaded unicharset of size %zu from file %s\n", input_unicharset.size(), argv[arg]); 42 | result_unicharset.AppendOtherUnicharset(input_unicharset); 43 | } else { 44 | printf("Failed to load unicharset from file %s!!\n", argv[arg]); 45 | return EXIT_FAILURE; 46 | } 47 | } 48 | 49 | // Save the combined unicharset. 
50 | if (result_unicharset.save_to_file(argv[argc - 1])) { 51 | printf("Wrote unicharset file %s.\n", argv[argc - 1]); 52 | } else { 53 | printf("Cannot save unicharset file %s.\n", argv[argc - 1]); 54 | return EXIT_FAILURE; 55 | } 56 | return EXIT_SUCCESS; 57 | } 58 | -------------------------------------------------------------------------------- /src/textord/equationdetectbase.h: -------------------------------------------------------------------------------- 1 | /////////////////////////////////////////////////////////////////////// 2 | // File: equationdetectbase.h 3 | // Description: The base class equation detection class. 4 | // Author: Zongyi (Joe) Liu (joeliu@google.com) 5 | // Created: Fri Aug 31 11:13:01 PST 2011 6 | // 7 | // (C) Copyright 2011, Google Inc. 8 | // Licensed under the Apache License, Version 2.0 (the "License"); 9 | // you may not use this file except in compliance with the License. 10 | // You may obtain a copy of the License at 11 | // http://www.apache.org/licenses/LICENSE-2.0 12 | // Unless required by applicable law or agreed to in writing, software 13 | // distributed under the License is distributed on an "AS IS" BASIS, 14 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | // See the License for the specific language governing permissions and 16 | // limitations under the License. 
//
///////////////////////////////////////////////////////////////////////

#ifndef TESSERACT_TEXTORD_EQUATIONDETECTBASE_H_
#define TESSERACT_TEXTORD_EQUATIONDETECTBASE_H_

#include "image.h"

// Forward declarations: this header only uses these types through pointers.
// NOTE(review): these three are declared in the global namespace while
// BLOBNBOX below is declared inside tesseract -- confirm this matches the
// namespaces in which the types are actually defined.
class BLOBNBOX_LIST;
class TO_BLOCK;
struct Pix;

namespace tesseract {

class ColPartitionGrid;
class ColPartitionSet;
class BLOBNBOX;

// Abstract interface for equation detection. The two detection steps are pure
// virtual and must be supplied by a derived class; RenderSpecialText is a
// static debugging helper.
class TESS_API EquationDetectBase {
public:
  EquationDetectBase() = default;
  // Virtual so that implementations can be destroyed through a base pointer.
  // Only declared here; the definition is not part of this header.
  virtual ~EquationDetectBase();

  // Iterate over the blobs inside to_block, and set the blobs that we want to
  // process to BSTT_NONE. (By default, they should be BSTT_SKIP). The function
  // returns 0 upon success.
  virtual int LabelSpecialText(TO_BLOCK *to_block) = 0;

  // Interface to find possible equation partition grid from part_grid. This
  // should be called after IdentifySpecialText function.
  // NOTE(review): no IdentifySpecialText is declared in this interface;
  // presumably the sentence above refers to LabelSpecialText -- confirm.
  virtual int FindEquationParts(ColPartitionGrid *part_grid, ColPartitionSet **best_columns) = 0;

  // Debug function: Render a bounding box on pix based on the value of its
  // special_text_type, specifically:
  // BSTT_MATH: red box
  // BSTT_DIGIT: cyan box
  // BSTT_ITALIC: green box
  // BSTT_UNCLEAR: blue box
  // All others: yellow box
  static void RenderSpecialText(Image pix, BLOBNBOX *blob);
};

} // namespace tesseract

#endif // TESSERACT_TEXTORD_EQUATIONDETECTBASE_H_
--------------------------------------------------------------------------------
/src/classify/fpoint.cpp:
--------------------------------------------------------------------------------
/******************************************************************************
** Filename: fpoint.cpp
** Purpose: Abstract data type for a 2D point (floating point coords)
** Author: Dan Johnson
**
** (c) Copyright Hewlett-Packard Company, 1988.
7 | ** Licensed under the Apache License, Version 2.0 (the "License"); 8 | ** you may not use this file except in compliance with the License. 9 | ** You may obtain a copy of the License at 10 | ** http://www.apache.org/licenses/LICENSE-2.0 11 | ** Unless required by applicable law or agreed to in writing, software 12 | ** distributed under the License is distributed on an "AS IS" BASIS, 13 | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | ** See the License for the specific language governing permissions and 15 | ** limitations under the License. 16 | ******************************************************************************/ 17 | /*---------------------------------------------------------------------------- 18 | Include Files and Type Defines 19 | ----------------------------------------------------------------------------*/ 20 | #define _USE_MATH_DEFINES // for M_PI 21 | #include "fpoint.h" 22 | #include // for M_PI 23 | #include 24 | 25 | /*---------------------------------------------------------------------------- 26 | Public Code 27 | ----------------------------------------------------------------------------*/ 28 | 29 | float DistanceBetween(FPOINT A, FPOINT B) { 30 | const double xd = XDelta(A, B); 31 | const double yd = YDelta(A, B); 32 | return sqrt(static_cast(xd * xd + yd * yd)); 33 | } 34 | 35 | /** 36 | * Return the angle from Point1 to Point2 normalized to 37 | * lie in the range 0 to FullScale (where FullScale corresponds 38 | * to 2*pi or 360 degrees). 
39 | * @param Point1 points to compute angle between 40 | * @param Point2 points to compute angle between 41 | * @param FullScale value to associate with 2*pi 42 | * @return angle 43 | */ 44 | float NormalizedAngleFrom(FPOINT *Point1, FPOINT *Point2, float FullScale) { 45 | float NumRadsInCircle = 2.0 * M_PI; 46 | 47 | float Angle = AngleFrom(*Point1, *Point2); 48 | if (Angle < 0.0) { 49 | Angle += NumRadsInCircle; 50 | } 51 | Angle *= FullScale / NumRadsInCircle; 52 | if (Angle < 0.0 || Angle >= FullScale) { 53 | Angle = 0.0; 54 | } 55 | return (Angle); 56 | } 57 | -------------------------------------------------------------------------------- /src/lstm/maxpool.h: -------------------------------------------------------------------------------- 1 | /////////////////////////////////////////////////////////////////////// 2 | // File: maxpool.h 3 | // Description: Standard Max-Pooling layer. 4 | // Author: Ray Smith 5 | // 6 | // (C) Copyright 2014, Google Inc. 7 | // Licensed under the Apache License, Version 2.0 (the "License"); 8 | // you may not use this file except in compliance with the License. 9 | // You may obtain a copy of the License at 10 | // http://www.apache.org/licenses/LICENSE-2.0 11 | // Unless required by applicable law or agreed to in writing, software 12 | // distributed under the License is distributed on an "AS IS" BASIS, 13 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | // See the License for the specific language governing permissions and 15 | // limitations under the License. 16 | /////////////////////////////////////////////////////////////////////// 17 | 18 | #ifndef TESSERACT_LSTM_MAXPOOL_H_ 19 | #define TESSERACT_LSTM_MAXPOOL_H_ 20 | 21 | #include "reconfig.h" 22 | 23 | namespace tesseract { 24 | 25 | // Maxpooling reduction. Independently for each input, selects the location 26 | // in the rectangle that contains the max value. 27 | // Backprop propagates only to the position that was the max. 
28 | class Maxpool : public Reconfig { 29 | public: 30 | TESS_API 31 | Maxpool(const std::string &name, int ni, int x_scale, int y_scale); 32 | ~Maxpool() override = default; 33 | 34 | // Accessors. 35 | std::string spec() const override { 36 | return "Mp" + std::to_string(y_scale_) + "," + std::to_string(x_scale_); 37 | } 38 | 39 | // Reads from the given file. Returns false in case of error. 40 | bool DeSerialize(TFile *fp) override; 41 | 42 | // Runs forward propagation of activations on the input line. 43 | // See Network for a detailed discussion of the arguments. 44 | void Forward(bool debug, const NetworkIO &input, const TransposedArray *input_transpose, 45 | NetworkScratch *scratch, NetworkIO *output) override; 46 | 47 | // Runs backward propagation of errors on the deltas line. 48 | // See Network for a detailed discussion of the arguments. 49 | bool Backward(bool debug, const NetworkIO &fwd_deltas, NetworkScratch *scratch, 50 | NetworkIO *back_deltas) override; 51 | 52 | private: 53 | // Memory of which input was the max. 54 | GENERIC_2D_ARRAY maxes_; 55 | }; 56 | 57 | } // namespace tesseract. 58 | 59 | #endif // TESSERACT_LSTM_MAXPOOL_H_ 60 | --------------------------------------------------------------------------------