├── bin └── .gitignore ├── test ├── .gitignore ├── test_utils.h ├── run.c ├── Makefile ├── test_utils.c ├── test_accuracy_large_files.py ├── text_test.c ├── word_test.c └── test_accsum_graphic_characters.py ├── _config.yml ├── user-guide.pdf ├── .gitignore ├── libexec ├── WordBreakProperty.txt.gz ├── Unicode-License.txt └── generate_word_break.py ├── lib └── .gitignore ├── src ├── ocreval_version.h ├── charclass.h ├── accsum.c ├── wordaccsum.c ├── editopsum.c ├── sort.h ├── stopword.h ├── accdist.c ├── word.h ├── dist.h ├── wordaccdist.c ├── dist.c ├── ci.h ├── sort.c ├── edorpt.h ├── editopcost.c ├── accci.c ├── wordaccci.c ├── nonstopacc.c ├── groupacc.c ├── ci.c ├── sync.h ├── wordfreq.c ├── list.h ├── wacrpt.h ├── table.h ├── table.c ├── stopword.c ├── accrpt.h ├── edorpt.c ├── list.c ├── accuracy.c ├── util.h ├── ngram.c ├── synctext.c ├── text.h ├── charclass.c ├── editop.c ├── wordacc.c ├── util.c ├── wacrpt.c └── vote.c ├── .travis.yml ├── share └── man │ └── man1 │ ├── wordfreq.1 │ ├── accsum.1 │ ├── accdist.1 │ ├── wordaccdist.1 │ ├── editopsum.1 │ ├── wordaccsum.1 │ ├── accuracy.1 │ ├── groupacc.1 │ ├── editop.1 │ ├── nonstopacc.1 │ ├── wordaccci.1 │ ├── accci.1 │ ├── ngram.1 │ ├── editopcost.1 │ ├── wordacc.1 │ ├── vote.1 │ └── synctext.1 ├── install_utf8proc.sh ├── docs └── install_utf8proc.sh ├── NOTICE ├── use-libocreval-internal.mk ├── CHANGELOG.md ├── Makefile └── README.md /bin/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | -------------------------------------------------------------------------------- /test/.gitignore: -------------------------------------------------------------------------------- 1 | run 2 | -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-minimal 
-------------------------------------------------------------------------------- /user-guide.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eddieantonio/ocreval/HEAD/user-guide.pdf -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Objects and dependencies. 2 | *.o 3 | *.d 4 | # Debugging 5 | *.dSYM/ 6 | core 7 | -------------------------------------------------------------------------------- /libexec/WordBreakProperty.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eddieantonio/ocreval/HEAD/libexec/WordBreakProperty.txt.gz -------------------------------------------------------------------------------- /lib/.gitignore: -------------------------------------------------------------------------------- 1 | # Ensure that this directory exists in git 2 | # but ignore every file that it will be used for. 3 | *.a 4 | *.so 5 | *.so.* 6 | -------------------------------------------------------------------------------- /src/ocreval_version.h: -------------------------------------------------------------------------------- 1 | #define OCREVAL_VERSION_MAJOR "7" 2 | #define OCREVAL_VERSION_MINOR "0" 3 | #define OCREVAL_VERSION OCREVAL_VERSION_MAJOR "." 
OCREVAL_VERSION_MINOR 4 | -------------------------------------------------------------------------------- /test/test_utils.h: -------------------------------------------------------------------------------- 1 | #ifndef TEST_UTILS_H 2 | #define TEST_UTILS_H 3 | 4 | #include 5 | 6 | extern Text* text; 7 | 8 | void initialize_texts(void *list); 9 | void deinitialize_texts(void *list); 10 | 11 | #endif /* TEST_UTILS_H */ 12 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: c 2 | compiler: 3 | - clang 4 | - gcc 5 | 6 | os: 7 | - linux 8 | - osx 9 | dist: xenial 10 | 11 | addons: 12 | apt: 13 | packages: libutf8proc-dev 14 | homebrew: 15 | packages: utf8proc 16 | update: true 17 | 18 | script: 19 | - make 20 | - make test 21 | -------------------------------------------------------------------------------- /test/run.c: -------------------------------------------------------------------------------- 1 | #include "text_test.c" 2 | #include "word_test.c" 3 | /* The *_test.c sources are included as text, presumably so that the suites they define are visible to RUN_SUITE below. */ 4 | #include "greatest.h" 5 | 6 | GREATEST_MAIN_DEFS(); 7 | /* Test-runner entry point: runs three suites between the greatest begin/end macros. */ 8 | int main(int argc, char **argv) { 9 | GREATEST_MAIN_BEGIN(); 10 | /* presumably defined in text_test.c -- confirm */ 11 | RUN_SUITE(cstring_to_text_suite); 12 | RUN_SUITE(char_to_string_suite); 13 | /* presumably defined in word_test.c -- confirm */ 14 | RUN_SUITE(find_words_suite); 15 | /* no explicit return: NOTE(review) GREATEST_MAIN_END() is expected to report and return the status -- confirm in greatest.h */ 16 | GREATEST_MAIN_END(); 17 | } 18 | -------------------------------------------------------------------------------- /share/man/man1/wordfreq.1: -------------------------------------------------------------------------------- 1 | .TH WORDFREQ 1 2 | .SH NAME 3 | wordfreq \- determines the frequency of words 4 | .SH SYNOPSIS 5 | .B wordfreq 6 | textfile1 textfile2 ... >resultfile 7 | .SH DESCRIPTION 8 | .I Wordfreq 9 | reads one or more text files and writes to stdout the number of occurrences of 10 | each distinct word found in these files, where a word is defined to be any 11 | sequence of one or more letters. 
12 | .SH "SEE ALSO" 13 | .IR ngram (1). 14 | -------------------------------------------------------------------------------- /share/man/man1/accsum.1: -------------------------------------------------------------------------------- 1 | .TH ACCSUM 1 2 | .SH NAME 3 | accsum \- combines character accuracy reports 4 | .SH SYNOPSIS 5 | .B accsum 6 | accuracy_report1 accuracy_report2 ... >accuracy_report 7 | .SH DESCRIPTION 8 | .I Accsum 9 | combines two or more character accuracy reports and writes an aggregate report 10 | to stdout. The input reports must have been produced by either 11 | .I accuracy 12 | or 13 | .IR accsum . 14 | .SH "SEE ALSO" 15 | .IR accuracy (1), 16 | .IR editopsum (1) 17 | and 18 | .IR wordaccsum (1). 19 | -------------------------------------------------------------------------------- /test/Makefile: -------------------------------------------------------------------------------- 1 | include ../use-libocreval-internal.mk 2 | 3 | test: unit-test large-file-test accsum-test 4 | 5 | unit-test: run 6 | ./$< $(TEST_ARGS) 7 | 8 | large-file-test: 9 | ./test_accuracy_large_files.py 10 | 11 | accsum-test: 12 | ./test_accsum_graphic_characters.py 13 | 14 | clean: 15 | $(RM) run 16 | 17 | run: run.c $(wildcard *_test.c) $(LIBOCREVAL) test_utils.c test_utils.h 18 | $(LINK.c) test_utils.c $< -locreval -lutf8proc -o $@ 19 | 20 | .PHONY: test clean accsum-test large-file-test unit-test 21 | -------------------------------------------------------------------------------- /share/man/man1/accdist.1: -------------------------------------------------------------------------------- 1 | .TH ACCDIST 1 2 | .SH NAME 3 | accdist \- displays the distribution of character accuracies 4 | .SH SYNOPSIS 5 | .B accdist 6 | accuracy_report1 accuracy_report2 ... >xyfile 7 | .SH DESCRIPTION 8 | .I Accdist 9 | writes to stdout the distribution of character accuracies found in the input 10 | reports. 
For 11 | .I X 12 | = 0 to 100, the percentage of characters recognized with at least 13 | .IR X % 14 | accuracy is reported. 15 | .SH "SEE ALSO" 16 | .IR accuracy (1) 17 | and 18 | .IR wordaccdist (1). 19 | 20 | -------------------------------------------------------------------------------- /share/man/man1/wordaccdist.1: -------------------------------------------------------------------------------- 1 | .TH WORDACCDIST 1 2 | .SH NAME 3 | wordaccdist \- displays the distribution of word accuracies 4 | .SH SYNOPSIS 5 | .B wordaccdist 6 | wordacc_report1 wordacc_report2 ... >xyfile 7 | .SH DESCRIPTION 8 | .I Wordaccdist 9 | writes to stdout the distribution of word accuracies found in the input 10 | reports. For 11 | .I X 12 | = 0 to 100, the percentage of words recognized with at least 13 | .IR X % 14 | accuracy is reported. 15 | .SH "SEE ALSO" 16 | .IR accdist (1) 17 | and 18 | .IR wordacc (1). 19 | 20 | -------------------------------------------------------------------------------- /share/man/man1/editopsum.1: -------------------------------------------------------------------------------- 1 | .TH EDITOPSUM 1 2 | .SH NAME 3 | editopsum \- combines edit operation reports 4 | .SH SYNOPSIS 5 | .B editopsum 6 | editop_report1 editop_report2 ... >editop_report 7 | .SH DESCRIPTION 8 | .I Editopsum 9 | combines two or more edit operation reports and writes an aggregate report 10 | to stdout. The input reports must have been produced by either 11 | .I editop 12 | or 13 | .IR editopsum . 14 | .SH "SEE ALSO" 15 | .IR accsum (1), 16 | .IR editop (1), 17 | .IR editopcost (1) 18 | and 19 | .IR wordaccsum (1). 20 | -------------------------------------------------------------------------------- /share/man/man1/wordaccsum.1: -------------------------------------------------------------------------------- 1 | .TH WORDACCSUM 1 2 | .SH NAME 3 | wordaccsum \- combines word accuracy reports 4 | .SH SYNOPSIS 5 | .B wordaccsum 6 | wordacc_report1 wordacc_report2 ... 
>wordacc_report 7 | .SH DESCRIPTION 8 | .I Wordaccsum 9 | combines two or more word accuracy reports and writes an aggregate report 10 | to stdout. The input reports must have been produced by either 11 | .I wordacc 12 | or 13 | .IR wordaccsum . 14 | .SH "SEE ALSO" 15 | .IR accsum (1), 16 | .IR editopsum (1), 17 | .IR nonstopacc (1) 18 | and 19 | .IR wordacc (1). 20 | -------------------------------------------------------------------------------- /test/test_utils.c: -------------------------------------------------------------------------------- 1 | #include "test_utils.h" 2 | 3 | #include 4 | /* Backing storage for the shared "text" pointer that test_utils.h declares extern. */ 5 | static Text text_; 6 | Text* text = &text_; 7 | /* Calls list_initialize() on each Text in "list"; "list" is cast to Text** and walked until a NULL entry is reached, so callers must NULL-terminate it. */ 8 | void initialize_texts(void *list) { 9 | Text** texts = (Text**) list; 10 | for (; *texts != NULL; texts++) { 11 | list_initialize(*texts); 12 | } 13 | } 14 | /* Empties each Text in the NULL-terminated "list" via list_empty(), passing free as the per-element callback. */ 15 | void deinitialize_texts(void *list) { 16 | Text** texts = (Text**) list; 17 | for (; *texts != NULL; texts++) { 18 | /* Frees each character and clears the text. */ 19 | list_empty(*texts, free); 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /install_utf8proc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | PROJECT=utf8proc 4 | VERSION=1.3.1 5 | DIRECTORY="$PROJECT-$VERSION" 6 | TAR_NAME="v${VERSION}.tar.gz" 7 | TAR_URL="https://github.com/JuliaLang/$PROJECT/archive/$TAR_NAME" 8 | 9 | set -ex 10 | 11 | cd /tmp/ 12 | curl -OL $TAR_URL 13 | tar xzf $TAR_NAME 14 | cd $DIRECTORY/ 15 | make 16 | sudo make install 17 | 18 | if [ `uname -s` != Darwin ] ; then 19 | # Rebuild the shared object cache - needed to load the library 20 | # at runtime 21 | sudo ldconfig 22 | fi 23 | -------------------------------------------------------------------------------- /docs/install_utf8proc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | PROJECT=utf8proc 4 | VERSION=1.3.1 5 | DIRECTORY="$PROJECT-$VERSION" 6 | 
TAR_NAME="v${VERSION}.tar.gz" 7 | TAR_URL="https://github.com/JuliaLang/$PROJECT/archive/$TAR_NAME" 8 | 9 | set -ex 10 | 11 | cd /tmp/ 12 | curl -OL $TAR_URL 13 | tar xzf $TAR_NAME 14 | cd $DIRECTORY/ 15 | make 16 | sudo make install 17 | 18 | if [ `uname -s` != Darwin ] ; then 19 | # Rebuild the shared object cache - needed to load the library 20 | # at runtime 21 | sudo ldconfig 22 | fi 23 | -------------------------------------------------------------------------------- /share/man/man1/accuracy.1: -------------------------------------------------------------------------------- 1 | .TH ACCURACY 1 2 | .SH NAME 3 | accuracy \- computes character accuracy 4 | .SH SYNOPSIS 5 | .B accuracy 6 | correctfile generatedfile [ accuracy_report ] 7 | .SH DESCRIPTION 8 | .I Accuracy 9 | computes the character accuracy of the OCR-generated text in 10 | .I generatedfile 11 | using the correct text found in 12 | .IR correctfile . 13 | A report containing accuracy statistics is written to 14 | .I accuracy_report 15 | if specified; otherwise, it is written to stdout. 16 | .SH "SEE ALSO" 17 | .IR accci (1), 18 | .IR accsum (1), 19 | .IR editop (1), 20 | .IR synctext (1) 21 | and 22 | .IR wordacc (1). 23 | -------------------------------------------------------------------------------- /share/man/man1/groupacc.1: -------------------------------------------------------------------------------- 1 | .TH GROUPACC 1 2 | .SH NAME 3 | groupacc \- shows the accuracy for characters belonging to a group 4 | .SH SYNOPSIS 5 | .B groupacc 6 | groupfile accuracy_report [ groupacc_report ] 7 | .SH DESCRIPTION 8 | .I Groupacc 9 | extracts the accuracy results from 10 | .I accuracy_report 11 | for each character found in 12 | .IR groupfile . 13 | The results are written to 14 | .I groupacc_report 15 | if specified; otherwise, they are written to stdout. 16 | The input character accuracy report must have been produced by either 17 | .I accuracy 18 | or 19 | .IR accsum . 
20 | .SH "SEE ALSO" 21 | .IR accsum (1) 22 | and 23 | .IR accuracy (1). 24 | -------------------------------------------------------------------------------- /share/man/man1/editop.1: -------------------------------------------------------------------------------- 1 | .TH EDITOP 1 2 | .SH NAME 3 | editop \- counts edit operations 4 | .SH SYNOPSIS 5 | .B editop 6 | correctfile generatedfile [ editop_report ] 7 | .SH DESCRIPTION 8 | .I Editop 9 | counts the edit operations (character insertions, character deletions, and 10 | block move operations) required to transform the OCR-generated text in 11 | .I generatedfile 12 | to the correct text found in 13 | .IR correctfile . 14 | A report containing edit operation statistics is written to 15 | .I editop_report 16 | if specified; otherwise, it is written to stdout. 17 | .SH "SEE ALSO" 18 | .IR accuracy (1), 19 | .IR editopcost (1), 20 | .IR editopsum (1), 21 | .IR synctext (1) 22 | and 23 | .IR wordacc (1). 24 | -------------------------------------------------------------------------------- /share/man/man1/nonstopacc.1: -------------------------------------------------------------------------------- 1 | .TH NONSTOPACC 1 2 | .SH NAME 3 | nonstopacc \- computes non-stopword accuracy 4 | .SH SYNOPSIS 5 | .B nonstopacc 6 | stopwordfile wordacc_report >xyfile 7 | .SH DESCRIPTION 8 | .I Nonstopacc 9 | computes non-stopword accuracy as a function of the number of stopwords. 10 | Stopwords are specified in 11 | .I stopwordfile 12 | in decreasing order of frequency. Word accuracy data is supplied by 13 | .IR wordacc_report , 14 | which must have been produced by either 15 | .I wordacc 16 | or 17 | .IR wordaccsum . 18 | Non-stopword accuracy is computed and written to stdout using no stopwords, 19 | one stopword, two stopwords, ..., and all stopwords from 20 | .IR stopwordfile . 21 | .SH "SEE ALSO" 22 | .IR wordacc (1) 23 | and 24 | .IR wordaccsum (1). 
25 | -------------------------------------------------------------------------------- /share/man/man1/wordaccci.1: -------------------------------------------------------------------------------- 1 | .TH WORDACCCI 1 2 | .SH NAME 3 | wordaccci \- computes a confidence interval for word accuracy 4 | .SH SYNOPSIS 5 | .B wordaccci 6 | wordacc_report1 wordacc_report2 ... >resultfile 7 | .SH DESCRIPTION 8 | .I Wordaccci 9 | reads two or more word accuracy reports and writes to stdout an 10 | approximate 95% confidence interval for word accuracy. Each input report 11 | is treated as one observation, and normally has been produced for a single page 12 | using 13 | .IR wordacc . 14 | The confidence interval is computed using a technique known as jackknife 15 | estimation which assumes that the observations are independent. For best 16 | results, at least 30 observations are needed. 17 | .SH "SEE ALSO" 18 | .IR accci (1) 19 | and 20 | .IR wordacc (1). 21 | -------------------------------------------------------------------------------- /share/man/man1/accci.1: -------------------------------------------------------------------------------- 1 | .TH ACCCI 1 2 | .SH NAME 3 | accci \- computes a confidence interval for character accuracy 4 | .SH SYNOPSIS 5 | .B accci 6 | accuracy_report1 accuracy_report2 ... >resultfile 7 | .SH DESCRIPTION 8 | .I Accci 9 | reads two or more character accuracy reports and writes to stdout an 10 | approximate 95% confidence interval for character accuracy. Each input report 11 | is treated as one observation, and normally has been produced for a single page 12 | using 13 | .IR accuracy . 14 | The confidence interval is computed using a technique known as jackknife 15 | estimation which assumes that the observations are independent. For best 16 | results, at least 30 observations are needed. 17 | .SH "SEE ALSO" 18 | .IR accuracy (1) 19 | and 20 | .IR wordaccci (1). 
21 | -------------------------------------------------------------------------------- /share/man/man1/ngram.1: -------------------------------------------------------------------------------- 1 | .TH NGRAM 1 2 | .SH NAME 3 | ngram \- computes 4 | .IR n -grams 5 | .SH SYNOPSIS 6 | .B ngram 7 | [ 8 | .B \-n 9 | 1 | 2 | 3 10 | ] textfile1 textfile2 ... >resultfile 11 | .SH DESCRIPTION 12 | .I Ngram 13 | reads one or more text files and writes to stdout the 14 | .IR n -gram 15 | statistics for the text found in these files. The `\-n' option specifies the 16 | value of 17 | .IR n : 18 | 1 for uni-grams, 2 for bi-grams, or 3 for tri-grams, where the default is 1 19 | (uni-grams). The output shows the number of occurrences of each distinct 20 | .IR n -character 21 | sequence and indicates the number of those occurrences that are suspect 22 | (i.e., have at least one character marked as suspect). 23 | .SH OPTIONS 24 | .TP 25 | .B \-n 26 | Specify the value of 27 | .IR n . 28 | .SH "SEE ALSO" 29 | .IR wordfreq (1). 30 | -------------------------------------------------------------------------------- /share/man/man1/editopcost.1: -------------------------------------------------------------------------------- 1 | .TH EDITOPCOST 1 2 | .SH NAME 3 | editopcost \- computes the cost of edit operations 4 | .SH SYNOPSIS 5 | .B editopcost 6 | editop_report [ editop_report2 ] >xyfile 7 | .SH DESCRIPTION 8 | .I Editopcost 9 | computes the cost of the edit operations described in 10 | .IR editop_report , 11 | less the cost of the edit operations described in 12 | .IR editop_report2 , 13 | if specified. 14 | The cost is based on the number of insertions, the number and lengths of move 15 | operations, and a threshold value, 16 | .IR T , 17 | used to convert move operations into an equivalent number of insertions. 18 | The cost is computed and written to stdout for 19 | .I T 20 | = 0 to 100. 
21 | The input reports must have been produced by either 22 | .I editop 23 | or 24 | .IR editopsum . 25 | .SH "SEE ALSO" 26 | .IR editop (1) 27 | and 28 | .IR editopsum (1). 29 | -------------------------------------------------------------------------------- /share/man/man1/wordacc.1: -------------------------------------------------------------------------------- 1 | .TH WORDACC 1 2 | .SH NAME 3 | wordacc \- computes word accuracy 4 | .SH SYNOPSIS 5 | .B wordacc 6 | [ 7 | .B \-S 8 | stopwordfile ] correctfile generatedfile [ wordacc_report ] 9 | .SH DESCRIPTION 10 | .I Wordacc 11 | computes the word accuracy of the OCR-generated text in 12 | .I generatedfile 13 | using the correct text found in 14 | .IR correctfile . 15 | A report containing accuracy statistics is written to 16 | .I wordacc_report 17 | if specified; otherwise, it is written to stdout. Stopwords are taken from 18 | .I stopwordfile 19 | if specified; otherwise, the BASISplus default set of 110 stopwords is 20 | utilized. 21 | .SH OPTIONS 22 | .TP 23 | .B \-S 24 | Specify the name of a file containing stopwords. 25 | .SH "SEE ALSO" 26 | .IR accuracy (1), 27 | .IR editop (1), 28 | .IR nonstopacc (1), 29 | .IR wordaccci (1) 30 | and 31 | .IR wordaccsum (1). 32 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | The ISRI Analytic Tools for OCR Evaluation 2 | 3 | Originally developed by Dr. Stephen V. Rice in 1996 for his doctoral dissertation. 4 | 5 | ocreval 6 | 7 | An updated port of the ISRI Analytic Tools written by @eddieantonio to 8 | continue its OCR evaluation goodness for all of the languages representable by 9 | Unicode! 
10 | 11 | --- 12 | 13 | Copyright 2015–2018 Eddie Antonio Santos 14 | 15 | Copyright 1996 The Board of Regents of the Nevada System of Higher 16 | Education, on behalf, of the University of Nevada, Las Vegas, 17 | Information Science Research Institute 18 | 19 | This product includes software developed at The Information Science 20 | Research Institute (http://www.isri.unlv.edu/). 21 | 22 | Additional information and a large collection of ground truth data are 23 | available at the ISRI OCR Performance Toolkit website: 24 | http://code.google.com/p/isri-ocr-evaluation-tools 25 | 26 | -------------------------------------------------------------------------------- /use-libocreval-internal.mk: -------------------------------------------------------------------------------- 1 | # Get absolute path to containing directory: http://stackoverflow.com/a/324782 2 | TOP := $(dir $(CURDIR)/$(word $(words $(MAKEFILE_LIST)),$(MAKEFILE_LIST))) 3 | # For INTERNAL headers! 4 | LOCAL_INCLUDE_DIR := $(abspath $(TOP)src) 5 | # For use with the -L option. 6 | LOCAL_LINK_DIR := $(abspath $(TOP)lib) 7 | 8 | LIBOCREVAL = $(LOCAL_LINK_DIR)/libocreval.a 9 | 10 | # Compilation flags for all files. 11 | override CFLAGS += -ansi 12 | # X/Open 6.0 standardizes features used in this K&R C source... 13 | CPPDEFINES = -D_XOPEN_SOURCE=600 14 | # Create dependency files. 
15 | CPPFLAGS = -MMD 16 | # utf8proc lib usually lives in here: 17 | override CPPFLAGS += -I/usr/local/include $(CPPDEFINES) 18 | LDFLAGS += -L/usr/local/lib 19 | LDLIBS = -lm -lutf8proc 20 | 21 | # Use libocreval, created in lib/ 22 | override CPPFLAGS += -I$(LOCAL_INCLUDE_DIR) 23 | LDFLAGS += -L$(LOCAL_LINK_DIR) 24 | LDLIBS := -locreval $(LDLIBS) 25 | -------------------------------------------------------------------------------- /share/man/man1/vote.1: -------------------------------------------------------------------------------- 1 | .TH VOTE 1 2 | .SH NAME 3 | vote \- applies voting to text files 4 | .SH SYNOPSIS 5 | .B vote 6 | [ 7 | .B \-O 8 | ] [ 9 | .B \-o 10 | outputfile ] [ 11 | .B \-s 12 | m/n ] [ 13 | .B \-w 14 | m/n ] textfile1 textfile2 ... 15 | .SH DESCRIPTION 16 | .I Vote 17 | applies a voting algorithm to two or more text files. The resulting text is 18 | written to 19 | .I outputfile 20 | if specified; otherwise, it is written to stdout. 21 | .PP 22 | An unmarked character in the input receives one vote. 23 | A reject character receives no votes. 24 | If a fraction is specified by the `\-w' option, then a character marked as 25 | suspect receives this fraction of a vote; otherwise, it receives a full vote. 26 | If a fraction is specified by the `\-s' option, and an output character 27 | receives no more than this fraction of the possible number of votes, then it is 28 | marked as suspect; otherwise, it is unmarked. For both of these options, 29 | .I m 30 | and 31 | .I n 32 | must satisfy 1 <= 33 | .I m 34 | <= 35 | .I n 36 | <= 9. 37 | .SH OPTIONS 38 | .TP 39 | .B \-O 40 | Enable optimization. 41 | .TP 42 | .B \-o 43 | Specify the name of the output file. 44 | .TP 45 | .B \-s 46 | Specify the threshold for marking output characters. 47 | .TP 48 | .B \-w 49 | Specify the weight of marked input characters. 
50 | -------------------------------------------------------------------------------- /share/man/man1/synctext.1: -------------------------------------------------------------------------------- 1 | .TH SYNCTEXT 1 2 | .SH NAME 3 | synctext \- synchronizes text files 4 | .SH SYNOPSIS 5 | .B synctext 6 | [ 7 | .B \-H 8 | ] [ 9 | .B \-i 10 | ] [ 11 | .B \-s 12 | ] [ 13 | .B \-T 14 | ] textfile1 textfile2 ... >resultfile 15 | .SH DESCRIPTION 16 | .I Synctext 17 | synchronizes two or more text files and writes to stdout the differences 18 | among these files. The output shows the substrings that are common to all of 19 | the files followed by footnotes indicating what the differences are. 20 | .PP 21 | If more than two input files are specified, then a heuristic (sub-optimal) 22 | algorithm is used to find matches. If only two input files are specified, then 23 | an optimal algorithm is used, unless the `\-H' or `\-T' option is specified. 24 | If the `\-H' option is specified, then the heuristic algorithm is applied to 25 | the two input files. If the `\-T' option is specified, then a heuristic 26 | algorithm that can find transposed matches is utilized, and the output takes a 27 | different form: each match is numbered and appears within braces. 28 | .SH OPTIONS 29 | .TP 30 | .B \-H 31 | Use heuristic algorithm. 32 | .TP 33 | .B \-i 34 | Ignore case (i.e., case-insensitive). 35 | .TP 36 | .B \-s 37 | Show suspect markers. 38 | .TP 39 | .B \-T 40 | Find transposed matches. 41 | .SH "SEE ALSO" 42 | .IR accuracy (1) 43 | and 44 | .IR editop (1). 45 | -------------------------------------------------------------------------------- /src/charclass.h: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | * 3 | * charclass.h 4 | * 5 | * This module provides definitions and utility routines pertaining to 6 | * character classes. 7 | * 8 | * Author: Stephen V. 
Rice 9 | * 10 | * Copyright 1996 The Board of Regents of the Nevada System of Higher 11 | * Education, on behalf, of the University of Nevada, Las Vegas, 12 | * Information Science Research Institute 13 | * 14 | * Licensed under the Apache License, Version 2.0 (the "License"); you 15 | * may not use this file except in compliance with the License. You 16 | * may obtain a copy of the License at 17 | * 18 | * http://www.apache.org/licenses/LICENSE-2.0 19 | * 20 | * Unless required by applicable law or agreed to in writing, software 21 | * distributed under the License is distributed on an "AS IS" BASIS, 22 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 23 | * implied. See the License for the specific language governing 24 | * permissions and limitations under the License. 25 | * 26 | **********************************************************************/ 27 | 28 | #ifndef _CHARCLASS_ 29 | #define _CHARCLASS_ 30 | 31 | #include "text.h" 32 | 33 | typedef unsigned char Charclass; 34 | #define MAX_CHARCLASSES 256 35 | 36 | Charclass charclass(/* Charvalue value */); 37 | /* returns the character class for the given character 38 | value */ 39 | 40 | char *charclass_name(/* Charclass class */); 41 | /* returns the name of the given character class */ 42 | 43 | #endif 44 | -------------------------------------------------------------------------------- /src/accsum.c: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | * 3 | * accsum.c 4 | * 5 | * Author: Stephen V. Rice 6 | * 7 | * Copyright 1996 The Board of Regents of the Nevada System of Higher 8 | * Education, on behalf, of the University of Nevada, Las Vegas, 9 | * Information Science Research Institute 10 | * 11 | * Licensed under the Apache License, Version 2.0 (the "License"); you 12 | * may not use this file except in compliance with the License. 
You 13 | * may obtain a copy of the License at 14 | * 15 | * http://www.apache.org/licenses/LICENSE-2.0 16 | * 17 | * Unless required by applicable law or agreed to in writing, software 18 | * distributed under the License is distributed on an "AS IS" BASIS, 19 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 20 | * implied. See the License for the specific language governing 21 | * permissions and limitations under the License. 22 | * 23 | **********************************************************************/ 24 | 25 | #include "accrpt.h" 26 | 27 | #define usage "accuracy_report1 accuracy_report2 ... >accuracy_report" 28 | 29 | Accdata accdata; 30 | 31 | /**********************************************************************/ 32 | 33 | main(argc, argv) 34 | int argc; 35 | char *argv[]; 36 | { 37 | int i; 38 | initialize(&argc, argv, usage, NULL); 39 | if (argc < 2) 40 | error("not enough input files"); 41 | for (i = 0; i < argc; i++) 42 | read_accrpt(&accdata, argv[i]); 43 | write_accrpt(&accdata, NULL); 44 | terminate(); 45 | } 46 | -------------------------------------------------------------------------------- /src/wordaccsum.c: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | * 3 | * wordaccsum.c 4 | * 5 | * Author: Stephen V. Rice 6 | * 7 | * Copyright 1996 The Board of Regents of the Nevada System of Higher 8 | * Education, on behalf, of the University of Nevada, Las Vegas, 9 | * Information Science Research Institute 10 | * 11 | * Licensed under the Apache License, Version 2.0 (the "License"); you 12 | * may not use this file except in compliance with the License. 
You 13 | * may obtain a copy of the License at 14 | * 15 | * http://www.apache.org/licenses/LICENSE-2.0 16 | * 17 | * Unless required by applicable law or agreed to in writing, software 18 | * distributed under the License is distributed on an "AS IS" BASIS, 19 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 20 | * implied. See the License for the specific language governing 21 | * permissions and limitations under the License. 22 | * 23 | **********************************************************************/ 24 | 25 | #include "wacrpt.h" 26 | 27 | #define usage "wordacc_report1 wordacc_report2 ... >wordacc_report" 28 | 29 | Wacdata wacdata; 30 | 31 | /**********************************************************************/ 32 | 33 | main(argc, argv) 34 | int argc; 35 | char *argv[]; 36 | { 37 | int i; 38 | initialize(&argc, argv, usage, NULL); 39 | if (argc < 2) 40 | error("not enough input files"); 41 | for (i = 0; i < argc; i++) 42 | read_wacrpt(&wacdata, argv[i]); 43 | write_wacrpt(&wacdata, NULL); 44 | terminate(); 45 | } 46 | -------------------------------------------------------------------------------- /src/editopsum.c: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | * 3 | * editopsum.c 4 | * 5 | * Author: Stephen V. Rice 6 | * 7 | * Copyright 1996 The Board of Regents of the Nevada System of Higher 8 | * Education, on behalf, of the University of Nevada, Las Vegas, 9 | * Information Science Research Institute 10 | * 11 | * Licensed under the Apache License, Version 2.0 (the "License"); you 12 | * may not use this file except in compliance with the License. 
You 13 | * may obtain a copy of the License at 14 | * 15 | * http://www.apache.org/licenses/LICENSE-2.0 16 | * 17 | * Unless required by applicable law or agreed to in writing, software 18 | * distributed under the License is distributed on an "AS IS" BASIS, 19 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 20 | * implied. See the License for the specific language governing 21 | * permissions and limitations under the License. 22 | * 23 | **********************************************************************/ 24 | 25 | #include "edorpt.h" 26 | #include "util.h" 27 | 28 | #define usage "editop_report1 editop_report2 ... >editop_report" 29 | 30 | Edodata edodata; 31 | 32 | /**********************************************************************/ 33 | 34 | main(argc, argv) 35 | int argc; 36 | char *argv[]; 37 | { 38 | int i; 39 | initialize(&argc, argv, usage, NULL); 40 | if (argc < 2) 41 | error("not enough input files"); 42 | for (i = 0; i < argc; i++) 43 | read_edorpt(&edodata, argv[i]); 44 | write_edorpt(&edodata, NULL); 45 | terminate(); 46 | } 47 | -------------------------------------------------------------------------------- /src/sort.h: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | * 3 | * sort.h 4 | * 5 | * This module provides a general-purpose sorting routine. 6 | * 7 | * Author: Stephen V. Rice 8 | * 9 | * Copyright 1996 The Board of Regents of the Nevada System of Higher 10 | * Education, on behalf, of the University of Nevada, Las Vegas, 11 | * Information Science Research Institute 12 | * 13 | * Licensed under the Apache License, Version 2.0 (the "License"); you 14 | * may not use this file except in compliance with the License. 
You 15 | * may obtain a copy of the License at 16 | * 17 | * http://www.apache.org/licenses/LICENSE-2.0 18 | * 19 | * Unless required by applicable law or agreed to in writing, software 20 | * distributed under the License is distributed on an "AS IS" BASIS, 21 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 22 | * implied. See the License for the specific language governing 23 | * permissions and limitations under the License. 24 | * 25 | **********************************************************************/ 26 | 27 | #ifndef _SORT_ 28 | #define _SORT_ 29 | 30 | void sort(/* long num_elements, void *array[], 31 | int (*compare)(void *element1, void *element2) */); 32 | /* given an array of pointers having the specified 33 | number of elements, sorts the array using the 34 | comparison routine provided; this routine returns 35 | a negative value if "element1" precedes "element2", 36 | a positive value if "element1" follows "element2", 37 | and a zero value if they are equal */ 38 | 39 | #endif 40 | -------------------------------------------------------------------------------- /src/stopword.h: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | * 3 | * stopword.h 4 | * 5 | * This module provides routines pertaining to stopwords. 6 | * 7 | * Author: Stephen V. Rice 8 | * 9 | * Copyright 1996 The Board of Regents of the Nevada System of Higher 10 | * Education, on behalf, of the University of Nevada, Las Vegas, 11 | * Information Science Research Institute 12 | * 13 | * Licensed under the Apache License, Version 2.0 (the "License"); you 14 | * may not use this file except in compliance with the License. 
You 15 | * may obtain a copy of the License at 16 | * 17 | * http://www.apache.org/licenses/LICENSE-2.0 18 | * 19 | * Unless required by applicable law or agreed to in writing, software 20 | * distributed under the License is distributed on an "AS IS" BASIS, 21 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 22 | * implied. See the License for the specific language governing 23 | * permissions and limitations under the License. 24 | * 25 | **********************************************************************/ 26 | 27 | #ifndef _STOPWORD_ 28 | #define _STOPWORD_ 29 | 30 | #include "util.h" 31 | 32 | void init_stopwords(/* char *filename */); 33 | /* reads stopwords from the named file, and reports an 34 | error and quits if unable to open the file; if 35 | "filename" is NULL, the default set of 110 stopwords 36 | from BASISplus is used */ 37 | 38 | Boolean is_stopword(/* unsigned char *string */); 39 | /* returns True if "string" represents a stopword; this 40 | routine can be called only after stopwords have been 41 | initialized using "init_stopwords" */ 42 | 43 | #endif 44 | -------------------------------------------------------------------------------- /src/accdist.c: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | * 3 | * accdist.c 4 | * 5 | * Author: Stephen V. Rice 6 | * 7 | * Copyright 1996 The Board of Regents of the Nevada System of Higher 8 | * Education, on behalf, of the University of Nevada, Las Vegas, 9 | * Information Science Research Institute 10 | * 11 | * Licensed under the Apache License, Version 2.0 (the "License"); you 12 | * may not use this file except in compliance with the License. 
You 13 | * may obtain a copy of the License at 14 | * 15 | * http://www.apache.org/licenses/LICENSE-2.0 16 | * 17 | * Unless required by applicable law or agreed to in writing, software 18 | * distributed under the License is distributed on an "AS IS" BASIS, 19 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 20 | * implied. See the License for the specific language governing 21 | * permissions and limitations under the License. 22 | * 23 | **********************************************************************/ 24 | 25 | #include "accrpt.h" 26 | #include "dist.h" 27 | 28 | #define usage "accuracy_report1 accuracy_report2 ... >xyfile" 29 | 30 | Accdata accdata; 31 | Dist dist; 32 | 33 | /**********************************************************************/ 34 | 35 | void process_file(filename) 36 | char *filename; 37 | { 38 | long chars, errors; 39 | chars = accdata.characters; 40 | errors = accdata.errors; 41 | read_accrpt(&accdata, filename); 42 | update_dist(&dist, accdata.characters - chars, accdata.errors - errors); 43 | } 44 | /**********************************************************************/ 45 | 46 | main(argc, argv) 47 | int argc; 48 | char *argv[]; 49 | { 50 | int i; 51 | initialize(&argc, argv, usage, NULL); 52 | for (i = 0; i < argc; i++) 53 | process_file(argv[i]); 54 | write_dist(&dist, NULL); 55 | terminate(); 56 | } 57 | -------------------------------------------------------------------------------- /src/word.h: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | * 3 | * word.h 4 | * 5 | * This module provides definitions and routines pertaining to words. 6 | * 7 | * Author: Stephen V. 
Rice 8 | * 9 | * Copyright 1996 The Board of Regents of the Nevada System of Higher 10 | * Education, on behalf, of the University of Nevada, Las Vegas, 11 | * Information Science Research Institute 12 | * 13 | * Licensed under the Apache License, Version 2.0 (the "License"); you 14 | * may not use this file except in compliance with the License. You 15 | * may obtain a copy of the License at 16 | * 17 | * http://www.apache.org/licenses/LICENSE-2.0 18 | * 19 | * Unless required by applicable law or agreed to in writing, software 20 | * distributed under the License is distributed on an "AS IS" BASIS, 21 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 22 | * implied. See the License for the specific language governing 23 | * permissions and limitations under the License. 24 | * 25 | **********************************************************************/ 26 | 27 | #ifndef _WORD_ 28 | #define _WORD_ 29 | 30 | #include "text.h" 31 | 32 | #define MAX_WORDLENGTH 50 33 | 34 | BEGIN_ITEM(Word) 35 | char *string; 36 | /* character string representation of the word */ 37 | END_ITEM(Word); /* an occurrence of a word */ 38 | 39 | BEGIN_LIST_OF(Word) 40 | END_LIST(Wordlist); /* a list of word occurrences */ 41 | 42 | void find_words(/* Wordlist *wordlist, Text *text */); 43 | /* finds the word occurrences in "text" and appends 44 | them to "wordlist" in sequence; all letters in 45 | "text" are assumed to be in lowercase */ 46 | 47 | void free_word(/* Word *word */); 48 | /* de-allocates a Word structure */ 49 | 50 | #endif 51 | -------------------------------------------------------------------------------- /src/dist.h: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | * 3 | * dist.h 4 | * 5 | * This module provides support for updating and writing a structure 6 | * that describes a distribution of character or word accuracies. 
7 | * 8 | * Author: Stephen V. Rice 9 | * 10 | * Copyright 1996 The Board of Regents of the Nevada System of Higher 11 | * Education, on behalf, of the University of Nevada, Las Vegas, 12 | * Information Science Research Institute 13 | * 14 | * Licensed under the Apache License, Version 2.0 (the "License"); you 15 | * may not use this file except in compliance with the License. You 16 | * may obtain a copy of the License at 17 | * 18 | * http://www.apache.org/licenses/LICENSE-2.0 19 | * 20 | * Unless required by applicable law or agreed to in writing, software 21 | * distributed under the License is distributed on an "AS IS" BASIS, 22 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 23 | * implied. See the License for the specific language governing 24 | * permissions and limitations under the License. 25 | * 26 | **********************************************************************/ 27 | 28 | #ifndef _DIST_ 29 | #define _DIST_ 30 | 31 | typedef 32 | struct 33 | { 34 | long count[101]; /* the (i)th element contains the total count for which 35 | accuracies are greater than or equal to (i)% */ 36 | long total_count; /* total count for all accuracies */ 37 | } Dist; 38 | 39 | void update_dist(/* Dist *dist, long count, long missed */); 40 | /* updates "dist" to reflect the accuracy described by 41 | "count" and "missed" */ 42 | 43 | void write_dist(/* Dist *dist, char *filename */); 44 | /* writes the distribution represented by "dist" to the 45 | named file (or stdout if "filename" is NULL); 46 | reports an error and quits if unable to create the 47 | file */ 48 | 49 | #endif 50 | -------------------------------------------------------------------------------- /src/wordaccdist.c: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | * 3 | * wordaccdist.c 4 | * 5 | * Author: Stephen V. 
Rice 6 | * 7 | * Copyright 1996 The Board of Regents of the Nevada System of Higher 8 | * Education, on behalf, of the University of Nevada, Las Vegas, 9 | * Information Science Research Institute 10 | * 11 | * Licensed under the Apache License, Version 2.0 (the "License"); you 12 | * may not use this file except in compliance with the License. You 13 | * may obtain a copy of the License at 14 | * 15 | * http://www.apache.org/licenses/LICENSE-2.0 16 | * 17 | * Unless required by applicable law or agreed to in writing, software 18 | * distributed under the License is distributed on an "AS IS" BASIS, 19 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 20 | * implied. See the License for the specific language governing 21 | * permissions and limitations under the License. 22 | * 23 | **********************************************************************/ 24 | 25 | #include "dist.h" 26 | #include "wacrpt.h" 27 | 28 | #define usage "wordacc_report1 wordacc_report2 ... >xyfile" 29 | 30 | Wacdata wacdata; 31 | Dist dist; 32 | 33 | /**********************************************************************/ 34 | 35 | void process_file(filename) 36 | char *filename; 37 | { 38 | long count, missed; 39 | count = wacdata.total.count; 40 | missed = wacdata.total.missed; 41 | read_wacrpt(&wacdata, filename); 42 | update_dist(&dist, wacdata.total.count - count, 43 | wacdata.total.missed - missed); 44 | } 45 | /**********************************************************************/ 46 | 47 | main(argc, argv) 48 | int argc; 49 | char *argv[]; 50 | { 51 | int i; 52 | initialize(&argc, argv, usage, NULL); 53 | for (i = 0; i < argc; i++) 54 | process_file(argv[i]); 55 | write_dist(&dist, NULL); 56 | terminate(); 57 | } 58 | -------------------------------------------------------------------------------- /src/dist.c: -------------------------------------------------------------------------------- 1 | /********************************************************************** 
2 | * 3 | * dist.c 4 | * 5 | * Author: Stephen V. Rice 6 | * 7 | * Copyright 1996 The Board of Regents of the Nevada System of Higher 8 | * Education, on behalf, of the University of Nevada, Las Vegas, 9 | * Information Science Research Institute 10 | * 11 | * Licensed under the Apache License, Version 2.0 (the "License"); you 12 | * may not use this file except in compliance with the License. You 13 | * may obtain a copy of the License at 14 | * 15 | * http://www.apache.org/licenses/LICENSE-2.0 16 | * 17 | * Unless required by applicable law or agreed to in writing, software 18 | * distributed under the License is distributed on an "AS IS" BASIS, 19 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 20 | * implied. See the License for the specific language governing 21 | * permissions and limitations under the License. 22 | * 23 | **********************************************************************/ 24 | 25 | #include "dist.h" 26 | #include "util.h" 27 | 28 | /**********************************************************************/ 29 | 30 | void update_dist(dist, count, missed) 31 | Dist *dist; 32 | long count, missed; 33 | { 34 | double accuracy; 35 | short i; 36 | if (count == 0) 37 | return; 38 | accuracy = 100.0 * (count - missed) / count; 39 | for (i = 0; accuracy >= i; i++) 40 | dist->count[i] += count; 41 | dist->total_count += count; 42 | } 43 | /**********************************************************************/ 44 | 45 | void write_dist(dist, filename) 46 | Dist *dist; 47 | char *filename; 48 | { 49 | FILE *f; 50 | short i; 51 | if (dist->total_count == 0) 52 | return; 53 | f = open_file(filename, "w"); 54 | for (i = 0; i <= 100; i++) 55 | fprintf(f, "%3d %6.2f\n", i, 56 | 100.0 * dist->count[i] / dist->total_count); 57 | close_file(f); 58 | } 59 | -------------------------------------------------------------------------------- /src/ci.h: -------------------------------------------------------------------------------- 1 | 
/********************************************************************** 2 | * 3 | * ci.h 4 | * 5 | * This module provides definitions and utility routines pertaining to 6 | * confidence intervals. 7 | * 8 | * Author: Stephen V. Rice 9 | * 10 | * Copyright 1996 The Board of Regents of the Nevada System of Higher 11 | * Education, on behalf, of the University of Nevada, Las Vegas, 12 | * Information Science Research Institute 13 | * 14 | * Licensed under the Apache License, Version 2.0 (the "License"); you 15 | * may not use this file except in compliance with the License. You 16 | * may obtain a copy of the License at 17 | * 18 | * http://www.apache.org/licenses/LICENSE-2.0 19 | * 20 | * Unless required by applicable law or agreed to in writing, software 21 | * distributed under the License is distributed on an "AS IS" BASIS, 22 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 23 | * implied. See the License for the specific language governing 24 | * permissions and limitations under the License. 
25 | * 26 | **********************************************************************/ 27 | 28 | #ifndef _CI_ 29 | #define _CI_ 30 | 31 | #include "list.h" 32 | 33 | BEGIN_ITEM(Obs) 34 | long count; /* total number */ 35 | long missed; /* number that were misrecognized */ 36 | double theta; /* estimator */ 37 | double j; /* pseudovalue */ 38 | END_ITEM(Obs); /* an observation */ 39 | 40 | BEGIN_LIST_OF(Obs) 41 | Obs total; /* total for all observations */ 42 | END_LIST(Obslist); /* a list of observations */ 43 | 44 | void append_obs(/* Obslist *obslist, long count, long missed */); 45 | /* appends the given observation to "obslist" */ 46 | 47 | void compute_ci(/* Obslist *obslist, double *lower, double *upper */); 48 | /* computes and returns an approximate 95% confidence 49 | interval for accuracy for the given set of 50 | observations */ 51 | 52 | #endif 53 | -------------------------------------------------------------------------------- /src/sort.c: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | * 3 | * sort.c 4 | * 5 | * Author: Stephen V. Rice 6 | * 7 | * Copyright 1996 The Board of Regents of the Nevada System of Higher 8 | * Education, on behalf, of the University of Nevada, Las Vegas, 9 | * Information Science Research Institute 10 | * 11 | * Licensed under the Apache License, Version 2.0 (the "License"); you 12 | * may not use this file except in compliance with the License. You 13 | * may obtain a copy of the License at 14 | * 15 | * http://www.apache.org/licenses/LICENSE-2.0 16 | * 17 | * Unless required by applicable law or agreed to in writing, software 18 | * distributed under the License is distributed on an "AS IS" BASIS, 19 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 20 | * implied. See the License for the specific language governing 21 | * permissions and limitations under the License. 
22 | * 23 | **********************************************************************/ 24 | 25 | static void **a; 26 | static int (*cmp)(); 27 | 28 | /**********************************************************************/ 29 | 30 | static void quicksort(left, right) 31 | long left, right; 32 | { 33 | long i, j; 34 | void *ref; 35 | i = left; 36 | j = right; 37 | ref = a[i]; 38 | while (i < j) 39 | { 40 | while (i < j && (*cmp)(ref, a[j]) < 0) 41 | j--; 42 | if (i != j) 43 | a[i++] = a[j]; 44 | while (i < j && (*cmp)(ref, a[i]) > 0) 45 | i++; 46 | if (i != j) 47 | a[j--] = a[i]; 48 | } 49 | a[j] = ref; 50 | if (left < --j) 51 | quicksort(left, j); 52 | if (++i < right) 53 | quicksort(i, right); 54 | } 55 | /**********************************************************************/ 56 | 57 | void sort(num_elements, array, compare) 58 | long num_elements; 59 | void *array[]; 60 | int (*compare)(); 61 | { 62 | if (num_elements < 2) 63 | return; 64 | a = array; 65 | cmp = compare; 66 | quicksort(0, num_elements - 1); 67 | } 68 | -------------------------------------------------------------------------------- /libexec/Unicode-License.txt: -------------------------------------------------------------------------------- 1 | The file WordBreakProperty.txt.gz is distributed under the following license: 2 | 3 | COPYRIGHT AND PERMISSION NOTICE 4 | 5 | Copyright © 1991-2015 Unicode, Inc. All rights reserved. 6 | Distributed under the Terms of Use in 7 | http://www.unicode.org/copyright.html. 
8 | 9 | Permission is hereby granted, free of charge, to any person obtaining 10 | a copy of the Unicode data files and any associated documentation 11 | (the "Data Files") or Unicode software and any associated documentation 12 | (the "Software") to deal in the Data Files or Software 13 | without restriction, including without limitation the rights to use, 14 | copy, modify, merge, publish, distribute, and/or sell copies of 15 | the Data Files or Software, and to permit persons to whom the Data Files 16 | or Software are furnished to do so, provided that 17 | (a) this copyright and permission notice appear with all copies 18 | of the Data Files or Software, 19 | (b) this copyright and permission notice appear in associated 20 | documentation, and 21 | (c) there is clear notice in each modified Data File or in the Software 22 | as well as in the documentation associated with the Data File(s) or 23 | Software that the data or software has been modified. 24 | 25 | THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF 26 | ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE 27 | WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 28 | NONINFRINGEMENT OF THIRD PARTY RIGHTS. 29 | IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS 30 | NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL 31 | DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, 32 | DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER 33 | TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 34 | PERFORMANCE OF THE DATA FILES OR SOFTWARE. 35 | 36 | Except as contained in this notice, the name of a copyright holder 37 | shall not be used in advertising or otherwise to promote the sale, 38 | use or other dealings in these Data Files or Software without prior 39 | written authorization of the copyright holder. 
40 | -------------------------------------------------------------------------------- /src/edorpt.h: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | * 3 | * edorpt.h 4 | * 5 | * This module provides support for reading and writing edit operation 6 | * reports. The contents of one of these reports is represented by an 7 | * "Edodata" structure. 8 | * 9 | * Author: Stephen V. Rice 10 | * 11 | * Copyright 1996 The Board of Regents of the Nevada System of Higher 12 | * Education, on behalf, of the University of Nevada, Las Vegas, 13 | * Information Science Research Institute 14 | * 15 | * Licensed under the Apache License, Version 2.0 (the "License"); you 16 | * may not use this file except in compliance with the License. You 17 | * may obtain a copy of the License at 18 | * 19 | * http://www.apache.org/licenses/LICENSE-2.0 20 | * 21 | * Unless required by applicable law or agreed to in writing, software 22 | * distributed under the License is distributed on an "AS IS" BASIS, 23 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 24 | * implied. See the License for the specific language governing 25 | * permissions and limitations under the License. 
26 | * 27 | **********************************************************************/ 28 | 29 | #ifndef _EDORPT_ 30 | #define _EDORPT_ 31 | 32 | #define MAX_MOVE_LENGTH 100 33 | /* maximum length of a move operation, given in number 34 | of characters moved; longer moves are counted as 35 | moves of this length */ 36 | 37 | typedef 38 | struct 39 | { 40 | long total_insertions; 41 | /* number of "character insert" operations */ 42 | long total_deletions; 43 | /* number of "character delete" operations */ 44 | long total_moves; /* number of "block move" operations */ 45 | long moves[MAX_MOVE_LENGTH + 1]; 46 | /* number of move operations for each length */ 47 | } Edodata; 48 | 49 | void read_edorpt(/* Edodata *edodata, char *filename */); 50 | /* reads the named file (or stdin if "filename" is NULL) 51 | and adds its contents to "edodata"; reports an error 52 | and quits if unable to open the file, or if the file 53 | does not contain an edit operation report */ 54 | 55 | void write_edorpt(/* Edodata *edodata, char *filename */); 56 | /* writes the contents of "edodata" to the named file 57 | (or stdout if "filename" is NULL); reports an error 58 | and quits if unable to create the file */ 59 | 60 | #endif 61 | -------------------------------------------------------------------------------- /src/editopcost.c: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | * 3 | * editopcost.c 4 | * 5 | * Author: Stephen V. Rice 6 | * 7 | * Copyright 1996 The Board of Regents of the Nevada System of Higher 8 | * Education, on behalf, of the University of Nevada, Las Vegas, 9 | * Information Science Research Institute 10 | * 11 | * Licensed under the Apache License, Version 2.0 (the "License"); you 12 | * may not use this file except in compliance with the License. 
You 13 | * may obtain a copy of the License at 14 | * 15 | * http://www.apache.org/licenses/LICENSE-2.0 16 | * 17 | * Unless required by applicable law or agreed to in writing, software 18 | * distributed under the License is distributed on an "AS IS" BASIS, 19 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 20 | * implied. See the License for the specific language governing 21 | * permissions and limitations under the License. 22 | * 23 | **********************************************************************/ 24 | 25 | #include "edorpt.h" 26 | #include "util.h" 27 | 28 | #define usage "editop_report [editop_report2] >xyfile" 29 | 30 | Edodata edodata, edodata2; 31 | 32 | /**********************************************************************/ 33 | 34 | void decrement_edodata() 35 | { 36 | short i; 37 | edodata.total_insertions -= edodata2.total_insertions; 38 | edodata.total_deletions -= edodata2.total_deletions; 39 | edodata.total_moves -= edodata2.total_moves; 40 | for (i = 1; i <= MAX_MOVE_LENGTH; i++) 41 | edodata.moves[i] -= edodata2.moves[i]; 42 | } 43 | /**********************************************************************/ 44 | 45 | void write_results() 46 | { 47 | long insertions, moves, i; 48 | insertions = edodata.total_insertions; 49 | moves = edodata.total_moves; 50 | for (i = 0; i <= MAX_MOVE_LENGTH; i++) 51 | { 52 | printf("%3ld %10ld\n", i, insertions + i * moves); 53 | insertions += i * edodata.moves[i]; 54 | moves -= edodata.moves[i]; 55 | } 56 | } 57 | /**********************************************************************/ 58 | 59 | main(argc, argv) 60 | int argc; 61 | char *argv[]; 62 | { 63 | initialize(&argc, argv, usage, NULL); 64 | if (argc < 1 || argc > 2) 65 | error("invalid number of files"); 66 | read_edorpt(&edodata, argv[0]); 67 | if (argc == 2) 68 | { 69 | read_edorpt(&edodata2, argv[1]); 70 | decrement_edodata(); 71 | } 72 | write_results(); 73 | terminate(); 74 | } 75 | 
-------------------------------------------------------------------------------- /src/accci.c: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | * 3 | * accci.c 4 | * 5 | * Author: Stephen V. Rice 6 | * 7 | * Copyright 1996 The Board of Regents of the Nevada System of Higher 8 | * Education, on behalf, of the University of Nevada, Las Vegas, 9 | * Information Science Research Institute 10 | * 11 | * Licensed under the Apache License, Version 2.0 (the "License"); you 12 | * may not use this file except in compliance with the License. You 13 | * may obtain a copy of the License at 14 | * 15 | * http://www.apache.org/licenses/LICENSE-2.0 16 | * 17 | * Unless required by applicable law or agreed to in writing, software 18 | * distributed under the License is distributed on an "AS IS" BASIS, 19 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 20 | * implied. See the License for the specific language governing 21 | * permissions and limitations under the License. 22 | * 23 | **********************************************************************/ 24 | 25 | #include "accrpt.h" 26 | #include "ci.h" 27 | 28 | #define usage "accuracy_report1 accuracy_report2 ... 
>resultfile" 29 | 30 | Accdata accdata; 31 | Obslist obslist; 32 | 33 | /**********************************************************************/ 34 | 35 | void process_file(filename) 36 | char *filename; 37 | { 38 | long chars, errors; 39 | chars = accdata.characters; 40 | errors = accdata.errors; 41 | read_accrpt(&accdata, filename); 42 | append_obs(&obslist, accdata.characters - chars, accdata.errors - errors); 43 | } 44 | /**********************************************************************/ 45 | 46 | void write_results() 47 | { 48 | double lower, upper; 49 | compute_ci(&obslist, &lower, &upper); 50 | printf("%14ld Observations\n", obslist.count); 51 | printf("%14ld Characters\n", accdata.characters); 52 | printf("%14ld Errors\n", accdata.errors); 53 | printf("%14.2f%% Accuracy\n", 54 | 100.0 * (accdata.characters - accdata.errors) / accdata.characters); 55 | printf("%6.2f%%,%6.2f%% %s\n", lower, upper, 56 | "Approximate 95% Confidence Interval for Accuracy"); 57 | } 58 | /**********************************************************************/ 59 | 60 | main(argc, argv) 61 | int argc; 62 | char *argv[]; 63 | { 64 | int i; 65 | initialize(&argc, argv, usage, NULL); 66 | if (argc < 2) 67 | error("not enough input files"); 68 | for (i = 0; i < argc; i++) 69 | process_file(argv[i]); 70 | write_results(); 71 | terminate(); 72 | } 73 | -------------------------------------------------------------------------------- /src/wordaccci.c: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | * 3 | * wordaccci.c 4 | * 5 | * Author: Stephen V. 
Rice 6 | * 7 | * Copyright 1996 The Board of Regents of the Nevada System of Higher 8 | * Education, on behalf, of the University of Nevada, Las Vegas, 9 | * Information Science Research Institute 10 | * 11 | * Licensed under the Apache License, Version 2.0 (the "License"); you 12 | * may not use this file except in compliance with the License. You 13 | * may obtain a copy of the License at 14 | * 15 | * http://www.apache.org/licenses/LICENSE-2.0 16 | * 17 | * Unless required by applicable law or agreed to in writing, software 18 | * distributed under the License is distributed on an "AS IS" BASIS, 19 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 20 | * implied. See the License for the specific language governing 21 | * permissions and limitations under the License. 22 | * 23 | **********************************************************************/ 24 | 25 | #include "ci.h" 26 | #include "wacrpt.h" 27 | 28 | #define usage "wordacc_report1 wordacc_report2 ... >resultfile" 29 | 30 | Wacdata wacdata; 31 | Obslist obslist; 32 | 33 | /**********************************************************************/ 34 | 35 | void process_file(filename) 36 | char *filename; 37 | { 38 | long count, missed; 39 | count = wacdata.total.count; 40 | missed = wacdata.total.missed; 41 | read_wacrpt(&wacdata, filename); 42 | append_obs(&obslist, wacdata.total.count - count, 43 | wacdata.total.missed - missed); 44 | } 45 | /**********************************************************************/ 46 | 47 | void write_results() 48 | { 49 | double lower, upper; 50 | compute_ci(&obslist, &lower, &upper); 51 | printf("%14ld Observations\n", obslist.count); 52 | printf("%14ld Words\n", wacdata.total.count); 53 | printf("%14ld Misrecognized\n", wacdata.total.missed); 54 | printf("%14.2f%% Accuracy\n", 55 | 100.0 * (wacdata.total.count - wacdata.total.missed) / wacdata.total.count); 56 | printf("%6.2f%%,%6.2f%% %s\n", lower, upper, 57 | "Approximate 95% Confidence Interval for 
Accuracy"); 58 | } 59 | /**********************************************************************/ 60 | 61 | main(argc, argv) 62 | int argc; 63 | char *argv[]; 64 | { 65 | int i; 66 | initialize(&argc, argv, usage, NULL); 67 | if (argc < 2) 68 | error("not enough input files"); 69 | for (i = 0; i < argc; i++) 70 | process_file(argv[i]); 71 | write_results(); 72 | terminate(); 73 | } 74 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Change Log 2 | All notable changes to this project will be documented in this file. 3 | This project adheres to [Semantic Versioning](http://semver.org/). 4 | This file is inspired by [Keep a `CHANGELOG`](http://keepachangelog.com/). 5 | 6 | ## [7.0.1] - 2018-11-21 7 | ### Fixed 8 | - Compile error involving inappropriate use of `ssize_t` 9 | 10 | ## [7.0.0] - 2018-11-21 11 | ### Changed 12 | - Changed name from `isri-ocr-evaluation-tools` to `ocreval` ([#21]) 13 | 14 | ## [6.1.2] - 2017-01-04 15 | ### Fixed 16 | - Read in UTF-8 characters in `accsum` ([#14]) 17 | 18 | ## [6.1.1] - 2016-02-22 19 | ### Fixed 20 | - No longer spuriously abort if inputs are longer than 65,536 characters ([#10]) 21 | 22 | ## [6.1.0] - 2016-01-01 23 | ### Added 24 | - `make exports` which outputs shell `export` commands (to avoid global installation) 25 | 26 | ### Changed 27 | - More conventional directory layout ([#4]) 28 | 29 | ## [6.0.1] - 2016-01-04 30 | ### Fixed 31 | - Bug in implementation of [WB6](http://unicode.org/reports/tr29/#WB6) 32 | - Special case U+0020 SPACE ' ' as a graphic character 33 | - Clang warnings 34 | 35 | ## [6.0.0] - 2016-01-04 36 | ### Added 37 | - Word segmentation using [Unicode word boundaries](http://unicode.org/reports/tr29/#Word_Boundaries). 38 | 39 | ### Changed 40 | - Start following [SemVer](http://semver.org) properly. 
41 | - All input and output is in UTF-8 42 | - Fixes to handle non-BMP code points (code points beyond U+FFFF) 43 | 44 | ### Removed 45 | - `uni2asc` and `asc2uni` (redundant due to change to UTF-8) 46 | 47 | ## [5.1.3] - 2015-11-15 48 | ### Changed 49 | - More idiomatic `make` build system 50 | 51 | ### Fixed 52 | - Compiles on modern OS X and Ubuntu 53 | 54 | [7.0.1]: https://github.com/eddieantonio/ocreval/compare/v7.0.0...v7.0.1 55 | [7.0.0]: https://github.com/eddieantonio/ocreval/compare/v6.1.2...v7.0.0 56 | [6.1.2]: https://github.com/eddieantonio/isri-ocr-evaluation-tools/compare/v6.1.1...v6.1.2 57 | [6.1.1]: https://github.com/eddieantonio/isri-ocr-evaluation-tools/compare/v6.1.0...v6.1.1 58 | [6.1.0]: https://github.com/eddieantonio/isri-ocr-evaluation-tools/compare/v6.0.1...v6.1.0 59 | [6.0.1]: https://github.com/eddieantonio/isri-ocr-evaluation-tools/compare/v6.0.0...v6.0.1 60 | [6.0.0]: https://github.com/eddieantonio/isri-ocr-evaluation-tools/compare/v5.1.3...v6.0.0 61 | [5.1.3]: https://github.com/eddieantonio/isri-ocr-evaluation-tools/compare/v5.1.0...v5.1.3 62 | 63 | [#4]: https://github.com/eddieantonio/isri-ocr-evaluation-tools/issues/4 64 | [#10]: https://github.com/eddieantonio/isri-ocr-evaluation-tools/issues/10 65 | [#14]: https://github.com/eddieantonio/isri-ocr-evaluation-tools/issues/14 66 | [#21]: https://github.com/eddieantonio/ocreval/issues/21 67 | -------------------------------------------------------------------------------- /src/nonstopacc.c: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | * 3 | * nonstopacc.c 4 | * 5 | * Author: Stephen V. 
Rice 6 | * 7 | * Copyright 1996 The Board of Regents of the Nevada System of Higher 8 | * Education, on behalf, of the University of Nevada, Las Vegas, 9 | * Information Science Research Institute 10 | * 11 | * Licensed under the Apache License, Version 2.0 (the "License"); you 12 | * may not use this file except in compliance with the License. You 13 | * may obtain a copy of the License at 14 | * 15 | * http://www.apache.org/licenses/LICENSE-2.0 16 | * 17 | * Unless required by applicable law or agreed to in writing, software 18 | * distributed under the License is distributed on an "AS IS" BASIS, 19 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 20 | * implied. See the License for the specific language governing 21 | * permissions and limitations under the License. 22 | * 23 | **********************************************************************/ 24 | 25 | #include "wacrpt.h" 26 | 27 | #define usage "stopwordfile wordacc_report >xyfile" 28 | 29 | Textopt textopt = { False, False, 0, True, True, True }; 30 | Text text; 31 | Wordlist wordlist; 32 | 33 | Wacdata wacdata; 34 | 35 | long count, missed; 36 | 37 | /**********************************************************************/ 38 | 39 | void find_stopword(termtable, string) 40 | Termtable *termtable; 41 | char *string; 42 | { 43 | Term *term; 44 | term = table_lookup(termtable, string); 45 | if (term) 46 | { 47 | count -= term->wac.count; 48 | missed -= term->wac.missed; 49 | } 50 | } 51 | /**********************************************************************/ 52 | 53 | void write_line() 54 | { 55 | static long linenum = 0; 56 | printf("%3ld %6.2f\n", linenum++, 100.0 * (count - missed) / count); 57 | } 58 | /**********************************************************************/ 59 | 60 | void write_results() 61 | { 62 | Word *word; 63 | count = wacdata.total.count; 64 | missed = wacdata.total.missed; 65 | if (count == 0) 66 | return; 67 | write_line(); 68 | for (word = wordlist.first; 
word; word = word->next) 69 | { 70 | find_stopword(&wacdata.stopword_table, word->string); 71 | find_stopword(&wacdata.non_stopword_table, word->string); 72 | if (count == 0) 73 | return; 74 | write_line(); 75 | } 76 | } 77 | /**********************************************************************/ 78 | 79 | main(argc, argv) 80 | int argc; 81 | char *argv[]; 82 | { 83 | initialize(&argc, argv, usage, NULL); 84 | if (argc != 2) 85 | error("invalid number of files"); 86 | read_text(&text, argv[0], &textopt); 87 | find_words(&wordlist, &text); 88 | read_wacrpt(&wacdata, argv[1]); 89 | write_results(); 90 | terminate(); 91 | } 92 | -------------------------------------------------------------------------------- /src/groupacc.c: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | * 3 | * groupacc.c 4 | * 5 | * Author: Stephen V. Rice 6 | * 7 | * Copyright 1996 The Board of Regents of the Nevada System of Higher 8 | * Education, on behalf, of the University of Nevada, Las Vegas, 9 | * Information Science Research Institute 10 | * 11 | * Licensed under the Apache License, Version 2.0 (the "License"); you 12 | * may not use this file except in compliance with the License. You 13 | * may obtain a copy of the License at 14 | * 15 | * http://www.apache.org/licenses/LICENSE-2.0 16 | * 17 | * Unless required by applicable law or agreed to in writing, software 18 | * distributed under the License is distributed on an "AS IS" BASIS, 19 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 20 | * implied. See the License for the specific language governing 21 | * permissions and limitations under the License. 
22 | * 23 | **********************************************************************/ 24 | 25 | #include "accrpt.h" 26 | 27 | #define usage "groupfile accuracy_report [groupacc_report]" 28 | 29 | Textopt textopt = { False, True, 0, True, True }; 30 | 31 | Text text; 32 | 33 | Accdata accdata; 34 | 35 | /**********************************************************************/ 36 | 37 | void write_line(f, count, missed) 38 | FILE *f; 39 | long count, missed; 40 | { 41 | fprintf(f, "%8ld %8ld ", count, missed); 42 | if (count == 0) 43 | fprintf(f, " ------ "); 44 | else 45 | fprintf(f, "%8.2f ", 100.0 * (count - missed) / count); 46 | } 47 | /**********************************************************************/ 48 | 49 | void write_report(filename) 50 | char *filename; 51 | { 52 | FILE *f; 53 | long total_count = 0, total_missed = 0, count, missed; 54 | Char *c; 55 | char buffer[STRING_SIZE]; 56 | f = open_file(filename, "w"); 57 | fprintf(f, " Count Missed %%Right\n"); 58 | for (c = text.first; c; c = c->next) 59 | if (accdata.small_class[c->value].count > 0 && c->value != BLANK && 60 | c->value != NEWLINE) 61 | { 62 | count = accdata.small_class[c->value].count; 63 | missed = accdata.small_class[c->value].missed; 64 | write_line(f, count, missed); 65 | char_to_string(False, c->value, buffer, True); 66 | fprintf(f, "{%s}\n", buffer); 67 | total_count += count; 68 | total_missed += missed; 69 | } 70 | write_line(f, total_count, total_missed); 71 | fprintf(f, "Total\n"); 72 | close_file(f); 73 | } 74 | /**********************************************************************/ 75 | 76 | main(argc, argv) 77 | int argc; 78 | char *argv[]; 79 | { 80 | initialize(&argc, argv, usage, NULL); 81 | if (argc < 2 || argc > 3) 82 | error("invalid number of files"); 83 | read_text(&text, argv[0], &textopt); 84 | read_accrpt(&accdata, argv[1]); 85 | write_report(argc == 3 ? 
argv[2] : NULL); 86 | terminate(); 87 | } 88 | -------------------------------------------------------------------------------- /src/ci.c: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | * 3 | * ci.c 4 | * 5 | * Author: Stephen V. Rice 6 | * 7 | * Copyright 1996 The Board of Regents of the Nevada System of Higher 8 | * Education, on behalf, of the University of Nevada, Las Vegas, 9 | * Information Science Research Institute 10 | * 11 | * Licensed under the Apache License, Version 2.0 (the "License"); you 12 | * may not use this file except in compliance with the License. You 13 | * may obtain a copy of the License at 14 | * 15 | * http://www.apache.org/licenses/LICENSE-2.0 16 | * 17 | * Unless required by applicable law or agreed to in writing, software 18 | * distributed under the License is distributed on an "AS IS" BASIS, 19 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 20 | * implied. See the License for the specific language governing 21 | * permissions and limitations under the License. 22 | * 23 | **********************************************************************/ 24 | 25 | #include <math.h> 26 | 27 | #include "ci.h" 28 | #include "util.h" 29 | 30 | /**********************************************************************/ 31 | 32 | void append_obs(obslist, count, missed) 33 | Obslist *obslist; 34 | long count, missed; 35 | { 36 | Obs *obs; 37 | obs = NEW(Obs); 38 | obs->count = count; 39 | obs->missed = missed; 40 | obslist->total.count += count; 41 | obslist->total.missed += missed; 42 | list_insert_last(obslist, obs); 43 | } 44 | /**********************************************************************/ 45 | 46 | static Boolean valid_obslist(obslist) 47 | Obslist *obslist; 48 | { 49 | long n = 0; 50 | Obs *obs; 51 | for (obs = obslist->first; obs; obs = obs->next) 52 | if (obs->count > 0) 53 | n++; 54 | return(n > 1 ?
True : False); 55 | } 56 | /**********************************************************************/ 57 | 58 | static double accuracy(count, missed) 59 | long count, missed; 60 | { 61 | return(100.0 * (count - missed) / count); 62 | } 63 | /**********************************************************************/ 64 | 65 | void compute_ci(obslist, lower, upper) 66 | Obslist *obslist; 67 | double *lower, *upper; 68 | { 69 | long n; 70 | double ntheta, sum = 0.0, w; 71 | Obs *obs; 72 | if (!valid_obslist(obslist)) 73 | error("not enough observations"); 74 | n = obslist->count; 75 | obslist->total.theta = accuracy(obslist->total.count, 76 | obslist->total.missed); 77 | ntheta = n * obslist->total.theta; 78 | for (obs = obslist->first; obs; obs = obs->next) 79 | { 80 | obs->theta = accuracy(obslist->total.count - obs->count, 81 | obslist->total.missed - obs->missed); 82 | obs->j = ntheta - (n - 1) * obs->theta; 83 | sum += obs->theta; 84 | } 85 | obslist->total.j = ntheta - (n - 1) * sum / n; 86 | sum = 0.0; 87 | for (obs = obslist->first; obs; obs = obs->next) 88 | sum += (obs->j - obslist->total.j) * (obs->j - obslist->total.j); 89 | w = 1.96 * sqrt(sum / (n - 1) / n); 90 | *lower = max(0.0, min(100.0, obslist->total.j - w)); 91 | *upper = max(0.0, min(100.0, obslist->total.j + w)); 92 | } 93 | -------------------------------------------------------------------------------- /test/test_accuracy_large_files.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: UTF-8 -*- 3 | 4 | """ 5 | Tests accuracy on LARGE UTF-8 files. 
6 | """ 7 | 8 | import random 9 | import subprocess 10 | import tempfile 11 | 12 | import os.path as p 13 | 14 | 15 | # Alias range as xrange in Python 3: 16 | try: 17 | xrange 18 | except NameError: 19 | xrange = range 20 | 21 | # Create a Python 2/3 Unicode string literal: 22 | try: 23 | unicode 24 | except NameError: 25 | u = str 26 | else: 27 | u = lambda s: s.decode('UTF-8') 28 | 29 | # Path to accuracy program 30 | ACCURACY = p.join(p.dirname(p.dirname(p.realpath(__file__))), 31 | 'bin', 'accuracy') 32 | assert p.exists(ACCURACY), 'Could not find ' + ACCURACY 33 | 34 | 35 | # http://www.languagegeek.com/isolate/haidastory.html 36 | corpus = u('''\ 37 | Aaniisuu tangaa g̱aging.ang ’wan suuga. ’Ll xidgwangaas, x̱uyaa’aa. Tllgu 38 | ḵ’aawgyaa hllng.aaygi ’laa ḵyaang.aas. Ḵawdiuu gwaay g̱ud gwaa nang ḵadlaa 39 | ḵ’ayg̱udyaas ’laagu ḵ’aawgaay g̱an ’laa g̱á ’laa xidaas. Á tl’l sg̱aana ḵidaads 40 | ’yaahlgaagaas g̱iinuus gangaang ’laagu gud gwii x̱iihlt’ahliyaagaas. Ga 41 | sg̱aanag̱waa g̱ax̱aas ’laa t’isda ḵ’a sḵ’agilaang.aas, tll gwii x̱an, hahl gwii’ad 42 | wah gwii’aa. G̱adagaas gyaanuu’asing g̱aalgaagaang ’wan suuga. 43 | 44 | Nang kilsdlaas naag̱ag̱a.aw tadll chi’a’aawaagan. Sing ḵ’alg̱ada ’ll ḵaaxuhls 45 | gyaan ’ll kindagaang.aas. Sda ’laa xid k’udahldats’aasii gyaan gagu ’laa 46 | ḵ’aw’aawaasgu x̱an ’laa ḵ’aawgangas. 47 | ''') 48 | 49 | dictionary = tuple(word for word in corpus.split()) 50 | alphabet = [char for char in corpus if char not in ' \n'] 51 | 52 | 53 | def one_in(n): 54 | return random.choice(xrange(n)) == 1 55 | 56 | 57 | def change_letter(word): 58 | letter_index = random.choice(xrange(len(word))) 59 | mutation = random.choice(alphabet) 60 | return word[:letter_index] + mutation + word[letter_index + 1:] 61 | 62 | 63 | if __name__ == '__main__': 64 | import sys 65 | amount_of_words = int(sys.argv[1]) if len(sys.argv) > 1 else 32768 66 | 67 | # Create temporary files for each... 
68 | with tempfile.NamedTemporaryFile('wb') as correct_file,\ 69 | tempfile.NamedTemporaryFile('wb') as generated_file: 70 | 71 | # Generate A LOT of random words 72 | for _ in xrange(amount_of_words): 73 | end = b'\n' if one_in(10) else b' ' 74 | 75 | word = random.choice(dictionary) 76 | correct_file.write(word.encode('UTF-8')) 77 | 78 | # Occasionally, typo a word in the generated file. 79 | generated_word = change_letter(word) if one_in(1000) else word 80 | generated_file.write(generated_word.encode('UTF-8')) 81 | 82 | # Write a space or newline. 83 | correct_file.write(end) 84 | generated_file.write(end) 85 | 86 | # Finish off the file with a new line and flush the output. 87 | if end != b'\n': 88 | correct_file.write(b'\n') 89 | generated_file.write(b'\n') 90 | 91 | correct_file.flush() 92 | generated_file.flush() 93 | 94 | # This will fail if accuracy itself fails. 95 | subprocess.check_call([ACCURACY, 96 | correct_file.name, generated_file.name]) 97 | -------------------------------------------------------------------------------- /src/sync.h: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | * 3 | * sync.h 4 | * 5 | * This module provides support for string synchronization. There are 6 | * two algorithms based on the detection of long common substrings: 7 | * "synchronize" can match two or more text streams but does not find 8 | * transposed matches, while "transpose_sync" locates transposed 9 | * matches but can be applied to only two text streams. A third 10 | * algorithm, "fastukk_sync", is based on an algorithm by Ukkonen, and 11 | * finds an optimal match of two text streams using cost function 12 | * (1,1,1). 13 | * 14 | * Author: Stephen V. 
Rice 15 | * 16 | * Copyright 1996 The Board of Regents of the Nevada System of Higher 17 | * Education, on behalf, of the University of Nevada, Las Vegas, 18 | * Information Science Research Institute 19 | * 20 | * Licensed under the Apache License, Version 2.0 (the "License"); you 21 | * may not use this file except in compliance with the License. You 22 | * may obtain a copy of the License at 23 | * 24 | * http://www.apache.org/licenses/LICENSE-2.0 25 | * 26 | * Unless required by applicable law or agreed to in writing, software 27 | * distributed under the License is distributed on an "AS IS" BASIS, 28 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 29 | * implied. See the License for the specific language governing 30 | * permissions and limitations under the License. 31 | * 32 | **********************************************************************/ 33 | 34 | #ifndef _SYNC_ 35 | #define _SYNC_ 36 | 37 | #include "text.h" 38 | 39 | typedef 40 | struct 41 | { 42 | long start; /* index of the first character of the substring */ 43 | long stop; /* index of the last character of the substring */ 44 | long length; /* length of the substring in characters */ 45 | } Substr; /* describes a substring of a text value by specifying 46 | its indices within the "array" representation of the 47 | text value */ 48 | 49 | BEGIN_ITEM(Sync) 50 | Substr *substr; /* array describing one substring for each text value */ 51 | long *match; /* if non-NULL, then the substrings have been matched 52 | and this points to the match number; otherwise, the 53 | substrings are unmatched */ 54 | END_ITEM(Sync); 55 | 56 | BEGIN_LIST_OF(Sync) 57 | END_LIST(Synclist); /* list of matched and unmatched substrings */ 58 | 59 | void synchronize(/* Synclist *synclist, short num_text, Text *text */); 60 | /* given "num_text" streams of text, synchronizes the 61 | streams and stores the results in "synclist"; each 62 | item in the list points to an array of "num_text" 63 | substrings 
*/ 64 | 65 | void transpose_sync(/* Synclist *synclist1, Synclist *synclist2, 66 | Text *text1, Text *text2 */); 67 | /* synchronizes two streams of text while allowing for 68 | transposed matches; each stream has its results 69 | stored in its own list, and each item of its list 70 | points to only a single substring */ 71 | 72 | void fastukk_sync(/* Synclist *synclist, Text *text */); 73 | /* given two streams in the array "text", synchronizes 74 | them optimally and stores the results in "synclist"; 75 | each item in the list points to an array of two 76 | substrings */ 77 | 78 | #endif 79 | -------------------------------------------------------------------------------- /src/wordfreq.c: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | * 3 | * wordfreq.c 4 | * 5 | * Author: Stephen V. Rice 6 | * 7 | * Copyright 1996 The Board of Regents of the Nevada System of Higher 8 | * Education, on behalf, of the University of Nevada, Las Vegas, 9 | * Information Science Research Institute 10 | * 11 | * Licensed under the Apache License, Version 2.0 (the "License"); you 12 | * may not use this file except in compliance with the License. You 13 | * may obtain a copy of the License at 14 | * 15 | * http://www.apache.org/licenses/LICENSE-2.0 16 | * 17 | * Unless required by applicable law or agreed to in writing, software 18 | * distributed under the License is distributed on an "AS IS" BASIS, 19 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 20 | * implied. See the License for the specific language governing 21 | * permissions and limitations under the License. 22 | * 23 | **********************************************************************/ 24 | 25 | #include "sort.h" 26 | #include "wacrpt.h" 27 | 28 | #define usage "textfile1 textfile2 ... 
>resultfile" 29 | 30 | Textopt textopt = { True, True, 0, True, True, True }; 31 | Text text; 32 | 33 | Wordlist wordlist; 34 | 35 | Termtable termtable; 36 | 37 | /**********************************************************************/ 38 | 39 | void process_file(filename) 40 | char *filename; 41 | { 42 | Word *word; 43 | read_text(&text, filename, &textopt); 44 | find_words(&wordlist, &text); 45 | for (word = wordlist.first; word; word = word->next) 46 | add_term(&termtable, word->string, 1, 0); 47 | list_empty(&text, free); 48 | list_empty(&wordlist, free_word); 49 | } 50 | /**********************************************************************/ 51 | 52 | int order_by_key(term1, term2) 53 | Term *term1, *term2; 54 | { 55 | return(ustrcmp(term1->key, term2->key)); 56 | } 57 | /**********************************************************************/ 58 | 59 | int order_by_count(term1, term2) 60 | Term *term1, *term2; 61 | { 62 | if (term1->wac.count != term2->wac.count) 63 | return(term2->wac.count - term1->wac.count); 64 | return(order_by_key(term1, term2)); 65 | } 66 | /**********************************************************************/ 67 | 68 | void write_array() 69 | { 70 | long i, total = 0; 71 | printf(" Count\n"); 72 | for (i = 0; i < termtable.count; i++) 73 | { 74 | printf("%8ld %s\n", termtable.array[i]->wac.count, 75 | termtable.array[i]->key); 76 | total += termtable.array[i]->wac.count; 77 | } 78 | printf("%8ld Total\n", total); 79 | } 80 | /**********************************************************************/ 81 | 82 | void write_report() 83 | { 84 | table_in_array(&termtable); 85 | sort(termtable.count, termtable.array, order_by_key); 86 | write_array(); 87 | printf("\n\n"); 88 | sort(termtable.count, termtable.array, order_by_count); 89 | write_array(); 90 | } 91 | /**********************************************************************/ 92 | 93 | main(argc, argv) 94 | int argc; 95 | char *argv[]; 96 | { 97 | int i; 98 | initialize(&argc, 
argv, usage, NULL); 99 | if (argc == 0) 100 | error("no text files specified"); 101 | for (i = 0; i < argc; i++) 102 | process_file(argv[i]); 103 | write_report(); 104 | terminate(); 105 | } 106 | -------------------------------------------------------------------------------- /test/text_test.c: -------------------------------------------------------------------------------- 1 | #include "greatest.h" 2 | #include "test_utils.h" 3 | 4 | #include 5 | #include 6 | 7 | TEST cstring_to_text_should_handle_ascii_strings() { 8 | ASSERT(cstring_to_text(text, "hello")); 9 | ASSERT_EQ_FMT(5, text->count, "%d"); 10 | 11 | PASS(); 12 | } 13 | 14 | /* Handle a 2 character UTF-8 string. */ 15 | TEST cstring_to_text_should_handle_latin() { 16 | ASSERT(cstring_to_text(text, "łódź")); 17 | ASSERT_EQ_FMT(4, text->count, "%d"); 18 | 19 | PASS(); 20 | } 21 | 22 | /* Handle a 3 character UTF-8 string. */ 23 | TEST cstring_to_text_should_handle_bmp() { 24 | ASSERT(cstring_to_text(text, "働")); 25 | ASSERT_EQ_FMT(1, text->count, "%d"); 26 | 27 | PASS(); 28 | } 29 | 30 | /* Handle a 4 character UTF-8 string. */ 31 | TEST cstring_to_text_should_handle_astral_code_points() { 32 | /* You could say I'm a flan of this test case. 
*/ 33 | ASSERT(cstring_to_text(text, "🍮")); 34 | ASSERT_EQ_FMT(1, text->count, "%d"); 35 | 36 | PASS(); 37 | } 38 | 39 | TEST char_to_string_converts_space() { 40 | char buffer[STRING_SIZE]; 41 | 42 | char_to_string(False, ' ', buffer, False); 43 | ASSERT_STR_EQ(" ", buffer); 44 | 45 | PASS(); 46 | } 47 | 48 | TEST char_to_string_converts_printable_ascii() { 49 | char buffer[STRING_SIZE]; 50 | 51 | char_to_string(False, '%', buffer, False); 52 | ASSERT_STR_EQ("%", buffer); 53 | 54 | PASS(); 55 | } 56 | 57 | TEST char_to_string_converts_non_printable_ascii() { 58 | char buffer[STRING_SIZE]; 59 | 60 | char_to_string(False, 0x0007, buffer, False); 61 | ASSERT_STR_EQ("<07>", buffer); 62 | 63 | PASS(); 64 | } 65 | 66 | TEST char_to_string_converts_printable_bmp() { 67 | char buffer[STRING_SIZE]; 68 | 69 | char_to_string(False, 0x50cd, buffer, False); 70 | ASSERT_STR_EQ("働", buffer); 71 | 72 | PASS(); 73 | } 74 | 75 | TEST char_to_string_converts_astral_code_points() { 76 | char buffer[STRING_SIZE]; 77 | 78 | char_to_string(False, 0x0101E1, buffer, False); 79 | ASSERT_STR_EQ("𐇡", buffer); 80 | 81 | PASS(); 82 | } 83 | 84 | TEST char_to_string_converts_bmp_combiner() { 85 | char buffer[STRING_SIZE]; 86 | 87 | char_to_string(False, 0x0309, buffer, False); 88 | ASSERT_STR_EQ("◌̉", buffer); 89 | 90 | PASS(); 91 | } 92 | 93 | TEST char_to_string_converts_astral_combiner() { 94 | char buffer[STRING_SIZE]; 95 | 96 | char_to_string(False, 0x0101FD, buffer, False); 97 | ASSERT_STR_EQ("◌𐇽", buffer); 98 | 99 | PASS(); 100 | } 101 | 102 | SUITE(cstring_to_text_suite) { 103 | SET_SETUP(initialize_texts, (Text*[]) {text, NULL}); 104 | SET_TEARDOWN(deinitialize_texts, (Text*[]) {text, NULL}); 105 | 106 | RUN_TEST(cstring_to_text_should_handle_ascii_strings); 107 | RUN_TEST(cstring_to_text_should_handle_latin); 108 | RUN_TEST(cstring_to_text_should_handle_bmp); 109 | RUN_TEST(cstring_to_text_should_handle_astral_code_points); 110 | } 111 | 112 | SUITE(char_to_string_suite) { 113 | 
RUN_TEST(char_to_string_converts_printable_ascii); 114 | RUN_TEST(char_to_string_converts_space); 115 | RUN_TEST(char_to_string_converts_non_printable_ascii); 116 | RUN_TEST(char_to_string_converts_printable_bmp); 117 | RUN_TEST(char_to_string_converts_astral_code_points); 118 | RUN_TEST(char_to_string_converts_bmp_combiner); 119 | RUN_TEST(char_to_string_converts_astral_combiner); 120 | } 121 | -------------------------------------------------------------------------------- /src/list.h: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | * 3 | * list.h 4 | * 5 | * This module provides a general-purpose linked-list capability. A 6 | * "list" contains zero or more "items". 7 | * 8 | * An item is a record structure having the following initial fields: 9 | * 10 | * Item *prev; - pointer to the previous item in the list 11 | * Item *next; - pointer to the next item in the list 12 | * 13 | * This structure is declared by 14 | * 15 | * BEGIN_ITEM(Item) 16 | * 17 | * END_ITEM(Item); 18 | * 19 | * A list is a record structure having the following initial fields: 20 | * 21 | * Item *first; - pointer to the first item in the list 22 | * Item *last; - pointer to the last item in the list 23 | * Item *array[]; - array of pointers to the items in the list 24 | * long count; - number of items in the list 25 | * 26 | * This structure is declared by 27 | * 28 | * BEGIN_LIST_OF(Item) 29 | * 30 | * END_LIST(List); 31 | * 32 | * Author: Stephen V. Rice 33 | * 34 | * Copyright 1996 The Board of Regents of the Nevada System of Higher 35 | * Education, on behalf, of the University of Nevada, Las Vegas, 36 | * Information Science Research Institute 37 | * 38 | * Licensed under the Apache License, Version 2.0 (the "License"); you 39 | * may not use this file except in compliance with the License. 
You 40 | * may obtain a copy of the License at 41 | * 42 | * http://www.apache.org/licenses/LICENSE-2.0 43 | * 44 | * Unless required by applicable law or agreed to in writing, software 45 | * distributed under the License is distributed on an "AS IS" BASIS, 46 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 47 | * implied. See the License for the specific language governing 48 | * permissions and limitations under the License. 49 | * 50 | **********************************************************************/ 51 | 52 | #ifndef _LIST_ 53 | #define _LIST_ 54 | 55 | #define BEGIN_ITEM(Item) \ 56 | typedef \ 57 | struct Item \ 58 | { \ 59 | struct Item *prev, *next; 60 | 61 | #define END_ITEM(Item) \ 62 | } Item 63 | 64 | #define BEGIN_LIST_OF(Item) \ 65 | typedef \ 66 | struct \ 67 | { \ 68 | Item *first, *last, **array; \ 69 | long count; 70 | 71 | #define END_LIST(List) \ 72 | } List 73 | 74 | void list_initialize(/* List *list */); 75 | /* initializes the list; this routine does not need to 76 | be called if the list structure was initialized 77 | statically or dynamically */ 78 | 79 | void list_insert_first(/* List *list, Item *item */); 80 | /* inserts the item at the beginning of the list */ 81 | 82 | void list_insert_last(/* List *list, Item *item */); 83 | /* inserts the item at the end of the list */ 84 | 85 | void list_insert_before(/* List *list, Item *item, Item *ref */); 86 | /* inserts the item before "ref" in the list */ 87 | 88 | void list_insert_after(/* List *list, Item *ref, Item *item */); 89 | /* inserts the item after "ref" in the list */ 90 | 91 | void list_remove(/* List *list, Item *item */); 92 | /* removes the item from the list */ 93 | 94 | void list_in_array(/* List *list */); 95 | /* if the list is non-empty, creates and initializes 96 | the array of pointers to the items in the list */ 97 | 98 | void list_empty(/* List *list, void (*process_item)(Item *item) */); 99 | /* empties the list; as each item is removed, the 
given 100 | routine is called to process it */ 101 | 102 | #endif 103 | -------------------------------------------------------------------------------- /src/wacrpt.h: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | * 3 | * wacrpt.h 4 | * 5 | * This module provides support for reading and writing word accuracy 6 | * reports. The contents of one of these reports is represented by a 7 | * "Wacdata" structure. 8 | * 9 | * Author: Stephen V. Rice 10 | * 11 | * Copyright 1996 The Board of Regents of the Nevada System of Higher 12 | * Education, on behalf, of the University of Nevada, Las Vegas, 13 | * Information Science Research Institute 14 | * 15 | * Licensed under the Apache License, Version 2.0 (the "License"); you 16 | * may not use this file except in compliance with the License. You 17 | * may obtain a copy of the License at 18 | * 19 | * http://www.apache.org/licenses/LICENSE-2.0 20 | * 21 | * Unless required by applicable law or agreed to in writing, software 22 | * distributed under the License is distributed on an "AS IS" BASIS, 23 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 24 | * implied. See the License for the specific language governing 25 | * permissions and limitations under the License. 
26 | * 27 | **********************************************************************/ 28 | 29 | #ifndef _WACRPT_ 30 | #define _WACRPT_ 31 | 32 | #include "table.h" 33 | #include "word.h" 34 | 35 | #define MAX_OCCURRENCES 10 36 | #define MAX_PHRASELENGTH 8 37 | 38 | typedef 39 | struct 40 | { 41 | long count; /* number of word occurrences */ 42 | long missed; /* number of these that were misrecognized */ 43 | } Wac; 44 | 45 | BEGIN_ENTRY(Term) 46 | Wac wac; 47 | END_ENTRY(Term); /* a distinct word */ 48 | 49 | BEGIN_TABLE_OF(Term, Termlist) 50 | END_TABLE(Termtable); /* table of distinct words */ 51 | 52 | typedef 53 | struct 54 | { 55 | Wac total; /* all words */ 56 | Wac stopword[MAX_WORDLENGTH + 1]; 57 | /* stopwords by word length (in characters); the total 58 | for all stopwords is in the 0th element */ 59 | Wac non_stopword[MAX_WORDLENGTH + 1]; 60 | /* non-stopwords by word length (in characters); the 61 | total for all non-stopwords is in the 0th element */ 62 | Wac distinct_non_stopword[MAX_OCCURRENCES + 2]; 63 | /* distinct non-stopwords by number of occurrences on 64 | a page; the (MAX_OCCURRENCES + 1) element groups 65 | all occurring more than MAX_OCCURRENCES times; the 66 | total for all distinct non-stopwords is in the 0th 67 | element */ 68 | Wac phrase[MAX_PHRASELENGTH + 1]; 69 | /* phrases by phrase length (in words); the 0th element 70 | is not used */ 71 | Termtable stopword_table; 72 | /* table of distinct stopwords */ 73 | Termtable non_stopword_table; 74 | /* table of distinct non-stopwords */ 75 | } Wacdata; 76 | 77 | void increment_wac(/* Wac *wac, long count, long missed */); 78 | /* adds "count" and "missed" to the respective fields 79 | of "wac" */ 80 | 81 | void add_term(/* Termtable *termtable, char *key, long count, long missed */); 82 | /* adds the given word to "termtable"; "key" contains 83 | the character string representation of the word; 84 | a copy of this string is stored in the table */ 85 | 86 | void read_wacrpt(/* Wacdata 
*wacdata, char *filename */); 87 | /* reads the named file (or stdin if "filename" is NULL) 88 | and adds its contents to "wacdata"; reports an error 89 | and quits if unable to open the file, or if the file 90 | does not contain a word accuracy report */ 91 | 92 | void write_wacrpt(/* Wacdata *wacdata, char *filename */); 93 | /* writes the contents of "wacdata" to the named file 94 | (or stdout if "filename" is NULL); reports an error 95 | and quits if unable to create the file */ 96 | 97 | #endif 98 | -------------------------------------------------------------------------------- /src/table.h: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | * 3 | * table.h 4 | * 5 | * This module provides a general-purpose hash table capability. A 6 | * "table" contains zero or more "entries". 7 | * 8 | * An entry is a record structure having the following initial fields: 9 | * 10 | * Entry *prev; - pointer to the previous entry in the list 11 | * Entry *next; - pointer to the next entry in the list 12 | * char *key; - null-terminated string containing the hash key 13 | * 14 | * This structure is declared by 15 | * 16 | * BEGIN_ENTRY(Entry) 17 | * 18 | * END_ENTRY(Entry); 19 | * 20 | * A table is a record structure having the following initial fields: 21 | * 22 | * Entrylist list[TABLE_SIZE]; 23 | * - array of lists of entries 24 | * Entry *array[]; - array of pointers to the entries in the table 25 | * long count; - number of entries in the table 26 | * 27 | * The following declares this structure and the Entrylist structure: 28 | * 29 | * BEGIN_TABLE_OF(Entry, Entrylist) 30 | * 31 | * END_TABLE(Table); 32 | * 33 | * Author: Stephen V. 
Rice 34 | * 35 | * Copyright 1996 The Board of Regents of the Nevada System of Higher 36 | * Education, on behalf, of the University of Nevada, Las Vegas, 37 | * Information Science Research Institute 38 | * 39 | * Licensed under the Apache License, Version 2.0 (the "License"); you 40 | * may not use this file except in compliance with the License. You 41 | * may obtain a copy of the License at 42 | * 43 | * http://www.apache.org/licenses/LICENSE-2.0 44 | * 45 | * Unless required by applicable law or agreed to in writing, software 46 | * distributed under the License is distributed on an "AS IS" BASIS, 47 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 48 | * implied. See the License for the specific language governing 49 | * permissions and limitations under the License. 50 | * 51 | **********************************************************************/ 52 | 53 | #ifndef _TABLE_ 54 | #define _TABLE_ 55 | 56 | #include "list.h" 57 | 58 | #define TABLE_SIZE 503 59 | 60 | #define BEGIN_ENTRY(Entry) \ 61 | BEGIN_ITEM(Entry) \ 62 | char *key; 63 | 64 | #define END_ENTRY(Entry) \ 65 | END_ITEM(Entry) 66 | 67 | #define BEGIN_TABLE_OF(Entry, Entrylist) \ 68 | BEGIN_LIST_OF(Entry) \ 69 | END_LIST(Entrylist); \ 70 | typedef \ 71 | struct \ 72 | { \ 73 | Entrylist list[TABLE_SIZE]; \ 74 | Entry **array; \ 75 | long count; 76 | 77 | #define END_TABLE(Table) \ 78 | } Table 79 | 80 | void table_initialize(/* Table *table */); 81 | /* initializes the table; this routine does not need to 82 | be called if the table structure was initialized 83 | statically or dynamically */ 84 | 85 | void *table_lookup(/* Table *table, char *key */); 86 | /* searches the table for an entry having the specified 87 | key value; returns a pointer to it if found; returns 88 | NULL if not found */ 89 | 90 | void table_insert(/* Table *table, Entry *entry */); 91 | /* inserts the entry into the table */ 92 | 93 | void table_remove(/* Table *table, Entry *entry */); 94 | /* removes 
the entry from the table */ 95 | 96 | void table_in_array(/* Table *table */); 97 | /* if the table is non-empty, creates and initializes 98 | the array of pointers to the entries in the table */ 99 | 100 | void table_empty(/* Table *table, void (*process_entry)(Entry *entry) */); 101 | /* empties the table; as each entry is removed, the 102 | given routine is called to process it */ 103 | 104 | #endif 105 | -------------------------------------------------------------------------------- /src/table.c: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | * 3 | * table.c 4 | * 5 | * Author: Stephen V. Rice 6 | * 7 | * Copyright 1996 The Board of Regents of the Nevada System of Higher 8 | * Education, on behalf, of the University of Nevada, Las Vegas, 9 | * Information Science Research Institute 10 | * 11 | * Licensed under the Apache License, Version 2.0 (the "License"); you 12 | * may not use this file except in compliance with the License. You 13 | * may obtain a copy of the License at 14 | * 15 | * http://www.apache.org/licenses/LICENSE-2.0 16 | * 17 | * Unless required by applicable law or agreed to in writing, software 18 | * distributed under the License is distributed on an "AS IS" BASIS, 19 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 20 | * implied. See the License for the specific language governing 21 | * permissions and limitations under the License. 
22 | * 23 | **********************************************************************/ 24 | 25 | #include "table.h" 26 | #include "util.h" 27 | 28 | BEGIN_ENTRY(Entry) 29 | END_ENTRY(Entry); 30 | 31 | BEGIN_TABLE_OF(Entry, Entrylist) 32 | END_TABLE(Table); 33 | 34 | /**********************************************************************/ 35 | 36 | void table_initialize(table) 37 | Table *table; 38 | { 39 | short i; 40 | for (i = 0; i < TABLE_SIZE; i++) 41 | list_initialize(&table->list[i]); 42 | table->array = NULL; 43 | table->count = 0; 44 | } 45 | /**********************************************************************/ 46 | 47 | static short table_index(key_string) 48 | const char *key_string; 49 | { 50 | const unsigned char *key = (const unsigned char*) key_string; 51 | long i, sum = 0; 52 | 53 | for (i = 0; key[i]; i++) 54 | sum += key[i]; 55 | return(sum % TABLE_SIZE); 56 | } 57 | /**********************************************************************/ 58 | 59 | void *table_lookup(table, key) 60 | Table *table; 61 | char *key; 62 | { 63 | Entry *entry; 64 | for (entry = table->list[table_index(key)].first; entry && 65 | strcmp(key, entry->key) != 0; entry = entry->next); 66 | return(entry); 67 | } 68 | /**********************************************************************/ 69 | 70 | static void free_array(table) 71 | Table *table; 72 | { 73 | if (table->array) 74 | { 75 | free(table->array); 76 | table->array = NULL; 77 | } 78 | } 79 | /**********************************************************************/ 80 | 81 | void table_insert(table, entry) 82 | Table *table; 83 | Entry *entry; 84 | { 85 | list_insert_first(&table->list[table_index(entry->key)], entry); 86 | free_array(table); 87 | table->count++; 88 | } 89 | /**********************************************************************/ 90 | 91 | void table_remove(table, entry) 92 | Table *table; 93 | Entry *entry; 94 | { 95 | list_remove(&table->list[table_index(entry->key)], entry); 96 | free_array(table); 
97 | table->count--; 98 | } 99 | /**********************************************************************/ 100 | 101 | void table_in_array(table) 102 | Table *table; 103 | { 104 | Entry *entry; 105 | long i, j = 0; 106 | if (table->array || table->count == 0) 107 | return; 108 | table->array = NEW_ARRAY(table->count, Entry *); 109 | for (i = 0; i < TABLE_SIZE; i++) 110 | for (entry = table->list[i].first; entry; entry = entry->next) 111 | table->array[j++] = entry; 112 | } 113 | /**********************************************************************/ 114 | 115 | void table_empty(table, process_entry) 116 | Table *table; 117 | void (*process_entry)(); 118 | { 119 | short i; 120 | for (i = 0; i < TABLE_SIZE; i++) 121 | list_empty(&table->list[i], process_entry); 122 | free_array(table); 123 | table->count = 0; 124 | } 125 | -------------------------------------------------------------------------------- /src/stopword.c: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | * 3 | * stopword.c 4 | * 5 | * Author: Stephen V. Rice 6 | * 7 | * Copyright 1996 The Board of Regents of the Nevada System of Higher 8 | * Education, on behalf, of the University of Nevada, Las Vegas, 9 | * Information Science Research Institute 10 | * 11 | * Licensed under the Apache License, Version 2.0 (the "License"); you 12 | * may not use this file except in compliance with the License. You 13 | * may obtain a copy of the License at 14 | * 15 | * http://www.apache.org/licenses/LICENSE-2.0 16 | * 17 | * Unless required by applicable law or agreed to in writing, software 18 | * distributed under the License is distributed on an "AS IS" BASIS, 19 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 20 | * implied. See the License for the specific language governing 21 | * permissions and limitations under the License. 
22 | * 23 | **********************************************************************/ 24 | 25 | #include "stopword.h" 26 | #include "table.h" 27 | #include "word.h" 28 | 29 | BEGIN_ENTRY(Stopword) 30 | END_ENTRY(Stopword); 31 | 32 | BEGIN_TABLE_OF(Stopword, Stopwordlist) 33 | END_TABLE(Stopwordtable); 34 | static Stopwordtable stopwordtable; 35 | static Boolean initialized = False; 36 | 37 | static Textopt textopt = { False, False, 0, True, True, True }; 38 | static Text text; 39 | static Wordlist wordlist; 40 | 41 | static char *default_stopword[] = 42 | { 43 | "a", "about", "after", "all", "also", 44 | "an", "and", "any", "are", "as", 45 | "at", "back", "be", "because", "been", 46 | "but", "by", "can", "could", "did", 47 | "do", "does", "down", "each", "first", 48 | "for", "from", "get", "good", "had", 49 | "has", "have", "he", "her", "him", 50 | "his", "how", "i", "if", "in", 51 | "into", "is", "it", "its", "just", 52 | "know", "like", "little", "long", "made", 53 | "make", "man", "many", "may", "me", 54 | "more", "most", "my", "new", "no", 55 | "not", "now", "of", "on", "one", 56 | "only", "or", "other", "our", "out", 57 | "over", "said", "same", "see", "she", 58 | "so", "some", "than", "that", "the", 59 | "their", "them", "then", "there", "these", 60 | "they", "this", "to", "too", "two", 61 | "up", "us", "used", "very", "was", 62 | "way", "we", "were", "what", "when", 63 | "where", "which", "who", "why", "will", 64 | "with", "woman", "would", "you", "your" 65 | }; 66 | 67 | /**********************************************************************/ 68 | 69 | static void save_stopword(key) 70 | char *key; 71 | { 72 | Stopword *stopword; 73 | stopword = table_lookup(&stopwordtable, key); 74 | if (stopword) 75 | warning_string("duplicate stopword", key); 76 | else 77 | { 78 | stopword = NEW(Stopword); 79 | stopword->key = key; 80 | table_insert(&stopwordtable, stopword); 81 | } 82 | } 83 | /**********************************************************************/ 84 | 
85 | void init_stopwords(filename) 86 | char *filename; 87 | { 88 | Word *word; 89 | short i; 90 | if (initialized) 91 | error("stopwords already initialized"); 92 | if (filename) 93 | { 94 | read_text(&text, filename, &textopt); 95 | find_words(&wordlist, &text); 96 | list_empty(&text, free); 97 | for (word = wordlist.first; word; word = word->next) 98 | save_stopword(word->string); 99 | } 100 | else 101 | for (i = 0; i < sizeof(default_stopword) / sizeof(char *); i++) 102 | save_stopword(default_stopword[i]); 103 | initialized = True; 104 | } 105 | /**********************************************************************/ 106 | 107 | Boolean is_stopword(string) 108 | unsigned char *string; 109 | { 110 | if (!initialized) 111 | error("stopwords not initialized"); 112 | return(table_lookup(&stopwordtable, string) ? True : False); 113 | } 114 | -------------------------------------------------------------------------------- /src/accrpt.h: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | * 3 | * accrpt.h 4 | * 5 | * This module provides support for reading and writing character 6 | * accuracy reports. The contents of one of these reports is 7 | * represented by an "Accdata" structure. 8 | * 9 | * Author: Stephen V. Rice 10 | * 11 | * Copyright 1996 The Board of Regents of the Nevada System of Higher 12 | * Education, on behalf, of the University of Nevada, Las Vegas, 13 | * Information Science Research Institute 14 | * 15 | * Licensed under the Apache License, Version 2.0 (the "License"); you 16 | * may not use this file except in compliance with the License. 
You 17 | * may obtain a copy of the License at 18 | * 19 | * http://www.apache.org/licenses/LICENSE-2.0 20 | * 21 | * Unless required by applicable law or agreed to in writing, software 22 | * distributed under the License is distributed on an "AS IS" BASIS, 23 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 24 | * implied. See the License for the specific language governing 25 | * permissions and limitations under the License. 26 | * 27 | **********************************************************************/ 28 | 29 | #ifndef _ACCRPT_ 30 | #define _ACCRPT_ 31 | 32 | #include "charclass.h" 33 | #include "table.h" 34 | 35 | typedef 36 | struct 37 | { 38 | long ins; /* number of insertions */ 39 | long subst; /* number of substitutions */ 40 | long del; /* number of deletions */ 41 | long errors; /* number of errors = ins + subst + del */ 42 | } Accops; 43 | 44 | typedef 45 | struct 46 | { 47 | long count; /* number of ground-truth characters in this class */ 48 | long missed; /* number of these that were misrecognized */ 49 | } Accclass; 50 | 51 | BEGIN_ENTRY(Conf) 52 | long errors; /* number of errors caused by this confusion */ 53 | long marked; /* number of these that were marked */ 54 | END_ENTRY(Conf); 55 | 56 | BEGIN_TABLE_OF(Conf, Conflist) 57 | END_TABLE(Conftable); /* table of confusions */ 58 | 59 | typedef 60 | struct 61 | { 62 | long characters; /* number of ground-truth characters */ 63 | long errors; /* number of errors made */ 64 | long reject_characters; 65 | /* number of reject characters generated */ 66 | long suspect_markers; 67 | /* number of characters marked as suspect */ 68 | long false_marks; /* number of false marks */ 69 | Accops marked_ops; /* edit operations for marked errors */ 70 | Accops unmarked_ops;/* edit operations for unmarked errors */ 71 | Accops total_ops; /* edit operations for all errors */ 72 | Accclass large_class[MAX_CHARCLASSES]; 73 | /* enumeration for each character class */ 74 | Accclass 
total_class; 75 | /* enumeration for all classes combined */ 76 | Conftable conftable;/* table of confusions */ 77 | Accclass small_class[NUM_CHARVALUES]; 78 | /* enumeration for each character code */ 79 | } Accdata; 80 | 81 | void add_class(/* Accdata *accdata, Charvalue value, long count, 82 | long missed */); 83 | /* adds the given character value to "accdata", 84 | updating all relevant class enumerations */ 85 | 86 | void add_conf(/* Accdata *accdata, char *key, long errors, long marked */); 87 | /* adds the given confusion to "accdata"; "key" contains 88 | the character string representation of the confusion 89 | that will appear in the accuracy report (including 90 | the trailing newline character); a copy of this 91 | string is stored in the table */ 92 | 93 | void read_accrpt(/* Accdata *accdata, char *filename */); 94 | /* reads the named file (or stdin if "filename" is NULL) 95 | and adds its contents to "accdata"; reports an error 96 | and quits if unable to open the file, or if the file 97 | does not contain an accuracy report */ 98 | 99 | void write_accrpt(/* Accdata *accdata, char *filename */); 100 | /* writes the contents of "accdata" to the named file 101 | (or stdout if "filename" is NULL); reports an error 102 | and quits if unable to create the file */ 103 | 104 | #endif 105 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Copyright 2016 Eddie Antonio Santos 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # Install prefix, if installing globally. 16 | # See also: `exports` target 17 | PREFIX = /usr/local 18 | BINDIR = $(PREFIX)/bin 19 | MANDIR = $(PREFIX)/share/man/man1 20 | 21 | # List of all the tools (executables + manual pages) 22 | TOOLS = accci accdist accsum accuracy editop editopcost editopsum \ 23 | groupacc ngram nonstopacc synctext vote wordacc wordaccci \ 24 | wordaccdist wordaccsum wordfreq 25 | 26 | # Name: libocreval, or -locreval 27 | NAME = ocreval 28 | MAJOR_VERSION = 7 29 | MINOR_VERSION = 0 30 | 31 | # All the executables go in bin/ 32 | EXECUTABLES = $(addprefix bin/,$(TOOLS)) 33 | # All manual pages go in share/man/man1 34 | MANPAGES = $(foreach TOOL,$(TOOLS),share/man/man1/$(TOOL).1) 35 | 36 | include use-libocreval-internal.mk 37 | 38 | LIBRARY.a = lib/lib$(NAME).a 39 | ifeq ($(shell uname -s),Darwin) 40 | LIBRARY.so = $(LIBRARY.a:.a=.dylib) 41 | else 42 | LIBRARY.so = $(LIBRARY.a:.a=.so.$(MAJOR_VERSION).$(MINOR_VERSION)) 43 | endif 44 | 45 | ################################################################################ 46 | 47 | # Allows for proper compilation and linking settings for libocreval 48 | 49 | all: $(EXECUTABLES) 50 | 51 | install: install-bin install-man 52 | 53 | install-bin: $(EXECUTABLES) 54 | mkdir -p $(BINDIR) 55 | cp $(EXECUTABLES) $(BINDIR)/ 56 | 57 | install-man: $(MANPAGES) 58 | mkdir -p $(MANDIR) 59 | cp $(MANPAGES) $(MANDIR)/ 60 | 61 | # Prints a bunch of exports you can source in your shell's startup file. 
# Prints shell export lines for PATH, MANPATH and the platform's
# dynamic-library search path, all rooted at $(TOP).  Nothing is
# copied or built.
exports:
	@echo '#' ocreval
	@echo export PATH='$(TOP)bin:$$PATH'
	@echo export MANPATH='$(TOP)share/man:$$MANPATH'
ifeq ($(shell uname -s),Darwin)
	@echo export DYLD_LIBRARY_PATH='$(TOP)lib:$$DYLD_LIBRARY_PATH'
else
	@echo export LD_LIBRARY_PATH='$(TOP)lib:$$LD_LIBRARY_PATH'
endif

clean: clean-objs clean-execs clean-libs clean-deps clean-test

clean-libs:
	$(RM) $(LIBRARY.a) $(LIBRARY.so)

clean-objs:
	$(RM) $(MODULES:.c=.o)

clean-deps:
	$(RM) $(DEPENDENCIES)

clean-execs:
	$(RM) $(EXECUTABLES)

clean-test:
	$(MAKE) -C test clean

TEST_ARGS =
test: $(EXECUTABLES) $(LIBRARY.a)
	$(MAKE) -C test

# Uses https://github.com/alexch/rerun
# $ gem install rerun
watch:
	rerun --clear --exit --pattern '**/*.{c,h}' -- make test

# Every target listed here names an action, not a file.  The clean
# entry must read "clean-libs" (the actual target above), and "exports"
# is listed so that a stray file called "exports" cannot suppress it.
.PHONY: all
.PHONY: install install-bin install-man
.PHONY: exports
.PHONY: clean clean-deps clean-execs clean-libs clean-objs clean-test
.PHONY: test watch
################################################################################

# Executable sources are C files that provide a main() for executables.
EXECUTABLE_SOURCES := $(foreach TOOL,$(TOOLS),src/$(TOOL).c)
# Modules are all object files exclusively for inclusion in libocreval
MODULE_SOURCES := $(filter-out $(EXECUTABLE_SOURCES),$(wildcard src/*.c))
MODULES := $(MODULE_SOURCES:.c=.o)
# Dependencies are .d files included by Make
DEPENDENCIES := $(EXECUTABLES:=.d) $(MODULES:.o=.d)

-include $(DEPENDENCIES)

# Rules for building executables; they are statically linked with the library.
115 | bin/%: src/%.c $(LIBRARY.a) 116 | $(LINK.c) -o $@ $< $(LDLIBS) 117 | 118 | $(LIBRARY.a): $(MODULES) 119 | $(AR) $(ARFLAGS) -s $@ $^ 120 | 121 | # Special case: Generate this include file, required by libocreval.a 122 | $(TOP)src/word_break_property.h src/word_break_property.h: \ 123 | libexec/generate_word_break.py libexec/WordBreakProperty.txt.gz 124 | ./$< > $@ 125 | -------------------------------------------------------------------------------- /src/edorpt.c: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | * 3 | * edorpt.c 4 | * 5 | * Author: Stephen V. Rice 6 | * 7 | * Copyright 1996 The Board of Regents of the Nevada System of Higher 8 | * Education, on behalf, of the University of Nevada, Las Vegas, 9 | * Information Science Research Institute 10 | * 11 | * Licensed under the Apache License, Version 2.0 (the "License"); you 12 | * may not use this file except in compliance with the License. You 13 | * may obtain a copy of the License at 14 | * 15 | * http://www.apache.org/licenses/LICENSE-2.0 16 | * 17 | * Unless required by applicable law or agreed to in writing, software 18 | * distributed under the License is distributed on an "AS IS" BASIS, 19 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 20 | * implied. See the License for the specific language governing 21 | * permissions and limitations under the License. 
22 | * 23 | **********************************************************************/ 24 | 25 | #include "edorpt.h" 26 | #include "util.h" 27 | #include "ocreval_version.h" 28 | 29 | #define TITLE "ocreval Edit Operation Report Version " OCREVAL_VERSION "\n" 30 | #define DIVIDER "-----------------------------------------\n" 31 | 32 | static char line[100]; 33 | 34 | /**********************************************************************/ 35 | 36 | static Boolean read_line(f) 37 | FILE *f; 38 | { 39 | return(fgets(line, sizeof(line) - 1, f) ? True : False); 40 | } 41 | /**********************************************************************/ 42 | 43 | static Boolean read_value(f, value, sum_value) 44 | FILE *f; 45 | long *value, *sum_value; 46 | { 47 | if (read_line(f) && sscanf(line, "%ld", value) == 1) 48 | { 49 | *sum_value += *value; 50 | return(True); 51 | } 52 | else 53 | return(False); 54 | } 55 | /**********************************************************************/ 56 | 57 | static Boolean read_two(f, value1, value2) 58 | FILE *f; 59 | long *value1, *value2; 60 | { 61 | return(read_line(f) && sscanf(line, "%ld %ld", value1, value2) == 2 ? 62 | True : False); 63 | } 64 | /**********************************************************************/ 65 | 66 | void read_edorpt(edodata, filename) 67 | Edodata *edodata; 68 | char *filename; 69 | { 70 | FILE *f; 71 | long moves, value1, value2; 72 | f = open_file(filename, "r"); 73 | if (read_line(f) && strncmp(line, TITLE, sizeof(TITLE) - 3) == 0 && 74 | read_line(f) && strcmp(line, DIVIDER) == 0 && 75 | read_value(f, &value1, &edodata->total_insertions) && 76 | read_value(f, &value1, &edodata->total_deletions) && 77 | read_value(f, &moves, &edodata->total_moves)) 78 | { 79 | if (moves > 0 && read_line(f) && read_line(f) && read_line(f)) 80 | while (read_two(f, &value1, &value2)) 81 | edodata->moves[value2] += value1; 82 | } 83 | else 84 | error_string("invalid format in", (filename ? 
filename : "stdin")); 85 | close_file(f); 86 | } 87 | /**********************************************************************/ 88 | 89 | static void write_value(f, value, string) 90 | FILE *f; 91 | long value; 92 | char *string; 93 | { 94 | fprintf(f, "%8ld %s\n", value, string); 95 | } 96 | /**********************************************************************/ 97 | 98 | static void write_move(f, count, length) 99 | FILE *f; 100 | long count, length; 101 | { 102 | fprintf(f, "%8ld %8ld\n", count, length); 103 | } 104 | /**********************************************************************/ 105 | 106 | void write_edorpt(edodata, filename) 107 | Edodata *edodata; 108 | char *filename; 109 | { 110 | FILE *f; 111 | long i; 112 | f = open_file(filename, "w"); 113 | fprintf(f, "%s%s", TITLE, DIVIDER); 114 | write_value(f, edodata->total_insertions, "Insertions"); 115 | write_value(f, edodata->total_deletions, "Deletions"); 116 | write_value(f, edodata->total_moves, "Moves"); 117 | if (edodata->total_moves > 0) 118 | { 119 | fprintf(f, "\nMoves\n Count Length\n"); 120 | for (i = 1; i <= MAX_MOVE_LENGTH; i++) 121 | if (edodata->moves[i] > 0) 122 | write_move(f, edodata->moves[i], i); 123 | } 124 | close_file(f); 125 | } 126 | -------------------------------------------------------------------------------- /src/list.c: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | * 3 | * list.c 4 | * 5 | * Author: Stephen V. Rice 6 | * 7 | * Copyright 1996 The Board of Regents of the Nevada System of Higher 8 | * Education, on behalf, of the University of Nevada, Las Vegas, 9 | * Information Science Research Institute 10 | * 11 | * Licensed under the Apache License, Version 2.0 (the "License"); you 12 | * may not use this file except in compliance with the License. 
You 13 | * may obtain a copy of the License at 14 | * 15 | * http://www.apache.org/licenses/LICENSE-2.0 16 | * 17 | * Unless required by applicable law or agreed to in writing, software 18 | * distributed under the License is distributed on an "AS IS" BASIS, 19 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 20 | * implied. See the License for the specific language governing 21 | * permissions and limitations under the License. 22 | * 23 | **********************************************************************/ 24 | 25 | #include "list.h" 26 | #include "util.h" 27 | 28 | BEGIN_ITEM(Item) 29 | END_ITEM(Item); 30 | 31 | BEGIN_LIST_OF(Item) 32 | END_LIST(List); 33 | 34 | /**********************************************************************/ 35 | 36 | void list_initialize(list) 37 | List *list; 38 | { 39 | list->first = list->last = NULL; 40 | list->array = NULL; 41 | list->count = 0; 42 | } 43 | /**********************************************************************/ 44 | 45 | static void free_array(list) 46 | List *list; 47 | { 48 | if (list->array) 49 | { 50 | free(list->array); 51 | list->array = NULL; 52 | } 53 | } 54 | /**********************************************************************/ 55 | 56 | static void list_insert(list, ref1, item, ref2) 57 | List *list; 58 | Item *ref1, *item, *ref2; 59 | { 60 | item->prev = ref1; 61 | item->next = ref2; 62 | if (ref1) 63 | ref1->next = item; 64 | else 65 | list->first = item; 66 | if (ref2) 67 | ref2->prev = item; 68 | else 69 | list->last = item; 70 | free_array(list); 71 | list->count++; 72 | } 73 | /**********************************************************************/ 74 | 75 | void list_insert_first(list, item) 76 | List *list; 77 | Item *item; 78 | { 79 | list_insert(list, NULL, item, list->first); 80 | } 81 | /**********************************************************************/ 82 | 83 | void list_insert_last(list, item) 84 | List *list; 85 | Item *item; 86 | { 87 | list_insert(list, 
list->last, item, NULL); 88 | } 89 | /**********************************************************************/ 90 | 91 | void list_insert_before(list, item, ref) 92 | List *list; 93 | Item *item, *ref; 94 | { 95 | list_insert(list, ref->prev, item, ref); 96 | } 97 | /**********************************************************************/ 98 | 99 | void list_insert_after(list, ref, item) 100 | List *list; 101 | Item *ref, *item; 102 | { 103 | list_insert(list, ref, item, ref->next); 104 | } 105 | /**********************************************************************/ 106 | 107 | void list_remove(list, item) 108 | List *list; 109 | Item *item; 110 | { 111 | if (item->prev) 112 | item->prev->next = item->next; 113 | else 114 | list->first = item->next; 115 | if (item->next) 116 | item->next->prev = item->prev; 117 | else 118 | list->last = item->prev; 119 | item->prev = item->next = NULL; 120 | free_array(list); 121 | list->count--; 122 | } 123 | /**********************************************************************/ 124 | 125 | void list_in_array(list) 126 | List *list; 127 | { 128 | Item *item; 129 | long i = 0; 130 | if (list->array || list->count == 0) 131 | return; 132 | list->array = NEW_ARRAY(list->count, Item *); 133 | for (item = list->first; item; item = item->next) 134 | list->array[i++] = item; 135 | } 136 | /**********************************************************************/ 137 | 138 | void list_empty(list, process_item) 139 | List *list; 140 | void (*process_item)(); 141 | { 142 | Item *item; 143 | while (list->first) 144 | { 145 | item = list->first; 146 | list_remove(list, item); 147 | (*process_item)(item); 148 | } 149 | } 150 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ocreval 2 | 3 | [![Build Status](https://travis-ci.org/eddieantonio/ocreval.svg?branch=master)](https://travis-ci.org/eddieantonio/ocreval) 4 
| 5 | The `ocreval` suite consists of 17 tools for measuring the 6 | performance of and experimenting with OCR output. See [the user 7 | guide][user-guide] for more information. 8 | 9 | [user-guide]: https://github.com/eddieantonio/ocreval/raw/master/user-guide.pdf 10 | 11 | `ocreval` is a modern port of the [ISRI Analytic Tools for OCR Evaluation][isri], 12 | with UTF-8 support and other improvements. 13 | 14 | See [the archived Google Code repository of the original 15 | project][isri-code]! 16 | 17 | [isri]: http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.216.9427&rep=rep1&type=pdf 18 | [isri-code]: http://code.google.com/p/isri-ocr-evaluation-tools 19 | 20 | # Install (macOS) 21 | 22 | Using [Homebrew][brew]: 23 | 24 | brew install eddieantonio/eddieantonio/ocreval 25 | 26 | [brew]: http://brew.sh/ 27 | 28 | 29 | Building 30 | ======== 31 | 32 | To build the library and all of the programs, ensure that you have all 33 | required [dependencies](#dependencies). 34 | 35 | ## Dependencies 36 | 37 | `ocreval` requires [utf8proc](https://github.com/JuliaStrings/utf8proc) 38 | to build from source. 
39 | 40 | ### macOS 41 | 42 | Using [Homebrew][brew]: 43 | 44 | brew install utf8proc 45 | 46 | ### Ubuntu/Debian 47 | 48 | You may need to install `make` and a C compiler: 49 | 50 | sudo apt install build-essential 51 | 52 | Then install `libutf8proc-dev`: 53 | 54 | sudo apt install libutf8proc-dev 55 | 56 | If `libutf8proc-dev` cannot be installed using `apt`, follow 57 | [Other Linux](#other-linux) below. 58 | 59 | ### Other Linux 60 | 61 | Install `libutf8proc-dev` manually: 62 | 63 | curl -OL https://github.com/JuliaStrings/utf8proc/archive/v1.3.1.tar.gz 64 | tar xzf v1.3.1.tar.gz 65 | cd utf8proc-1.3.1/ 66 | make 67 | sudo make install 68 | # Rebuild the shared object cache - needed to load the library 69 | # at runtime 70 | sudo ldconfig 71 | cd - 72 | 73 | ## Building the tools 74 | 75 | Once all dependencies are installed, you may compile all of the 76 | utilities using `make`: 77 | 78 | make 79 | 80 | ## Installing 81 | 82 | Install to `/usr/local/`: 83 | 84 | sudo make install 85 | 86 | Note: You will not need `sudo` on macOS if you have `brew` installed. 87 | 88 | ## Installing "locally" 89 | 90 | This will not copy any files at all, but instead create the appropriate 91 | shell commands to add all executables, man pages, and libraries to 92 | the correct path (replace `~/.bashrc` with your start-up file): 93 | 94 | make exports >> ~/.bashrc 95 | 96 | # Porting Credits 97 | 98 | Ported by Eddie Antonio Santos, 2015, 2016. See `NOTICE` for copyright 99 | information regarding the original code. 
100 | 101 | # Citation 102 | 103 | ```bibtex 104 | @inproceedings{santos-2019-ocr, 105 | title = "{OCR} evaluation tools for the 21st century", 106 | author = "Santos, Eddie Antonio", 107 | booktitle = "Proceedings of the 3rd Workshop on the Use of Computational Methods in the Study of Endangered Languages Volume 1 (Papers)", 108 | month = feb, 109 | year = "2019", 110 | address = "Honolulu", 111 | publisher = "Association for Computational Linguistics", 112 | url = "https://www.aclweb.org/anthology/W19-6004", 113 | pages = "23--27", 114 | } 115 | ``` 116 | 117 | See: 118 | 119 | 120 | # License 121 | 122 | ### ocreval 123 | 124 | Copyright 2015–2017 Eddie Antonio Santos 125 | 126 | Copyright © 2018–2021 National Research Council Canada 127 | 128 | ### The ISRI Analytic Tools for OCR Evaluation 129 | 130 | Copyright 1996 The Board of Regents of the Nevada System of Higher 131 | Education, on behalf, of the University of Nevada, Las Vegas, 132 | Information Science Research Institute 133 | 134 | Licensed under the Apache License, Version 2.0 (the "License"); you 135 | may not use this file except in compliance with the License. You may 136 | obtain a copy of the License at 137 | 138 | http://www.apache.org/licenses/LICENSE-2.0 139 | 140 | Unless required by applicable law or agreed to in writing, software 141 | distributed under the License is distributed on an "AS IS" BASIS, 142 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 143 | implied. See the License for the specific language governing 144 | permissions and limitations under the License. 
#!/usr/bin/env python
# coding: UTF-8

# Copyright 2016 Eddie Antonio Santos
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Parses WordBreakProperty.txt and generates a binary search table as a C header
file. Note! This header file must be included only ONCE in only ONE
translation unit (i.e. C file).
"""

import os
import sys
import gzip

# The generated header names the script that produced it; the script lives
# in libexec/ next to the compressed property file it reads.
PROLOGUE = '''\
/* AUTOGENERATED FILE! DO NOT MODIFY.
 * See libexec/generate_word_break.py */
'''

STRUCT_DEF = '''\
typedef struct {
    Charvalue start, end;
    wb_property value;
} wb_range;
'''

ENUM_TEMP = '''\
typedef enum {
    %s
} %s;
'''

TABLE_TEMP = '''\
static const wb_range WORD_BREAK_PROPERTY[] = {
    %s
};
'''

# Order matters: it fixes the numeric value of each generated enum member.
CATEGORY_NAMES = '''\
Other
CR
LF
Newline
Extend
Regional_Indicator
Format
Katakana
Hebrew_Letter
ALetter
Single_Quote
Double_Quote
MidNumLet
MidLetter
MidNum
Numeric
ExtendNumLet
sot
eot
'''.strip().split()


def open_word_break_file():
    """Open the gzipped property file that sits beside this script.

    The file is opened in binary mode, so iterating it yields byte strings;
    ``parse_line`` decodes them.
    """
    filename = 'WordBreakProperty.txt.gz'
    path = os.path.join(os.path.dirname(__file__), filename)
    return gzip.open(path, 'rb')


def _as_text(line):
    """Return *line* as text, decoding it as UTF-8 when it is a byte string.

    ``gzip.open(..., 'rb')`` yields ``bytes``, and on Python 3 comparing those
    against ``'#'`` or ``';'`` raises ``TypeError``.  Decoding here keeps the
    parser working for byte and text input alike.
    """
    if isinstance(line, bytes):
        return line.decode('utf-8')
    return line


def blank_or_comment(line):
    """Return True for a line that is empty, whitespace only, or a comment."""
    return not line.strip() or line.startswith('#')


def parse_range(text):
    """Parse a hexadecimal code point or ``start..end`` range.

    Returns an inclusive ``(start, end)`` tuple of integers.

    >>> parse_range('11730..11739')
    (71472, 71481)
    >>> parse_range('FF3F')
    (65343, 65343)
    """
    components = text.strip().split('..')[0:2]

    if len(components) == 2:
        start, end = (int(component, 16) for component in components)
    else:
        assert len(components) == 1
        start = end = int(components[0], 16)

    return (start, end)


def parse_line(line):
    """Parse one line of the property file.

    Accepts text or bytes.  Returns ``None`` for blank and comment lines,
    otherwise ``((start, end), category)``.
    """
    line = _as_text(line)
    if blank_or_comment(line):
        return None

    cp_range, cat_and_comment = line.split(';')[0:2]
    # Anything after '#' on a data line is a trailing comment.
    category = cat_and_comment.split('#')[0].strip()

    return parse_range(cp_range), category


def parse_lines(word_break_file):
    """Yield ``((start, end), category)`` for every data line of the file."""
    for line in word_break_file:
        contents = parse_line(line)
        if contents is not None:
            yield contents


def to_c_header(values):
    """Yield the pieces of the generated C header, in output order.

    *values* is an iterable of ``((start, end), category)`` pairs.  The table
    is emitted sorted by range start; the caller's sequence is left untouched.
    """
    # Materialise once: the values are checked and then sorted, and a
    # one-shot iterator would be exhausted by the check.
    values = list(values)
    unknown = set(category for _, category in values) - set(CATEGORY_NAMES)
    assert not unknown, (
        'unknown word break categories: %s' % ', '.join(sorted(unknown)))
    ordered = sorted(values, key=lambda value: value[0][0])

    yield PROLOGUE
    yield '\n'
    yield generate_enum('wb_property', CATEGORY_NAMES)
    yield '\n'
    yield STRUCT_DEF
    yield '\n'
    yield generate_table(ordered)


def enum_name(name):
    """
    Originally, this added a prefix, but since this file generates a header
    that is only included internally and only in *one* file, this is
    unnecessary and just clutters up things.
    """
    return name


def generate_enum(name, categories):
    """Return a C ``typedef enum`` called *name* listing *categories*."""
    str_values = ',\n    '.join(enum_name(category) for category in categories)
    return ENUM_TEMP % (str_values, name)


def generate_literal(value):
    """Return the C initialiser for one ``wb_range`` table row.

    Rows whose end lies above U+FFFF are printed with six hex digits, all
    others with four.
    """
    cp_range, category = value
    start, end = cp_range
    if end > 0xffff:
        template = '{0x%06X, 0x%06X, %s}'
    else:
        template = '{0x%04X, 0x%04X, %s}'
    return template % (start, end, enum_name(category))


def generate_table(values):
    """Return the C definition of the ``WORD_BREAK_PROPERTY`` table."""
    str_values = ',\n    '.join(generate_literal(value) for value in values)
    return TABLE_TEMP % (str_values,)


def main():
    """Read the bundled property file and write the header to stdout."""
    with open_word_break_file() as word_break_file:
        big_list = list(parse_lines(word_break_file))

    for text in to_c_header(big_list):
        sys.stdout.write(text)


if __name__ == '__main__':
    main()
You 13 | * may obtain a copy of the License at 14 | * 15 | * http://www.apache.org/licenses/LICENSE-2.0 16 | * 17 | * Unless required by applicable law or agreed to in writing, software 18 | * distributed under the License is distributed on an "AS IS" BASIS, 19 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 20 | * implied. See the License for the specific language governing 21 | * permissions and limitations under the License. 22 | * 23 | **********************************************************************/ 24 | 25 | #include "accrpt.h" 26 | #include "sync.h" 27 | 28 | #define usage "correctfile generatedfile [accuracy_report]" 29 | 30 | #define MAX_DISPLAY 24 31 | 32 | Textopt textopt = { True, True, 0, True, True }; 33 | 34 | Text text[2]; 35 | 36 | Accdata accdata; 37 | 38 | /**********************************************************************/ 39 | 40 | void make_key(key, sync) 41 | char *key; 42 | Sync *sync; 43 | { 44 | long i, j; 45 | char buffer[2][MAX_DISPLAY + 4], string[STRING_SIZE]; 46 | for (i = 0; i < 2; i++) 47 | { 48 | buffer[i][0] = '\0'; 49 | for (j = sync->substr[i].start; j <= sync->substr[i].stop; j++) 50 | { 51 | char_to_string(False, text[i].array[j]->value, string, True); 52 | if (strlen(buffer[i]) + strlen(string) > MAX_DISPLAY) 53 | { 54 | strcat(buffer[i], "..."); 55 | break; 56 | } 57 | strcat(buffer[i], string); 58 | } 59 | } 60 | sprintf(key, "{%s}-{%s}\n", buffer[0], buffer[1]); 61 | } 62 | /**********************************************************************/ 63 | 64 | void add_ops(sum_ops, ops) 65 | Accops *sum_ops, *ops; 66 | { 67 | sum_ops->ins += ops->ins; 68 | sum_ops->subst += ops->subst; 69 | sum_ops->del += ops->del; 70 | sum_ops->errors += ops->errors; 71 | } 72 | /**********************************************************************/ 73 | 74 | void process_synclist(synclist) 75 | Synclist *synclist; 76 | { 77 | Sync *sync; 78 | long i, characters, wildcards, reject_characters, suspect_markers, 
genchars; 79 | Accops ops; 80 | char key[100]; 81 | for (sync = synclist->first; sync; sync = sync->next) 82 | { 83 | characters = wildcards = 0; 84 | for (i = sync->substr[0].start; i <= sync->substr[0].stop; i++) 85 | if (text[0].array[i]->value == REJECT_CHARACTER) 86 | wildcards++; 87 | else 88 | { 89 | characters++; 90 | add_class(&accdata, text[0].array[i]->value, 1, 91 | (sync->match ? 0 : 1)); 92 | } 93 | accdata.characters += characters; 94 | reject_characters = suspect_markers = 0; 95 | for (i = sync->substr[1].start; i <= sync->substr[1].stop; i++) 96 | if (text[1].array[i]->value == REJECT_CHARACTER) 97 | reject_characters++; 98 | else if (text[1].array[i]->suspect) 99 | suspect_markers++; 100 | accdata.reject_characters += reject_characters; 101 | accdata.suspect_markers += suspect_markers; 102 | if (sync->match) 103 | accdata.false_marks += suspect_markers; 104 | else 105 | { 106 | genchars = max(0, sync->substr[1].length - wildcards); 107 | ops.errors = max(characters, genchars); 108 | if (ops.errors > 0) 109 | { 110 | accdata.errors += ops.errors; 111 | ops.ins = max(0, characters - genchars); 112 | ops.subst = min(characters, genchars); 113 | ops.del = max(0, genchars - characters); 114 | make_key(key, sync); 115 | if (reject_characters + suspect_markers > 0) 116 | { 117 | add_ops(&accdata.marked_ops, &ops); 118 | add_conf(&accdata, key, ops.errors, ops.errors); 119 | } 120 | else 121 | { 122 | add_ops(&accdata.unmarked_ops, &ops); 123 | add_conf(&accdata, key, ops.errors, 0); 124 | } 125 | add_ops(&accdata.total_ops, &ops); 126 | } 127 | } 128 | } 129 | } 130 | /**********************************************************************/ 131 | 132 | main(argc, argv) 133 | int argc; 134 | char *argv[]; 135 | { 136 | Synclist synclist; 137 | initialize(&argc, argv, usage, NULL); 138 | if (argc < 2 || argc > 3) 139 | error("invalid number of files"); 140 | read_text(&text[0], argv[0], &textopt); 141 | if (textopt.found_header) 142 | error("no correct file 
specified"); 143 | read_text(&text[1], argv[1], &textopt); 144 | fastukk_sync(&synclist, text); 145 | process_synclist(&synclist); 146 | write_accrpt(&accdata, (argc == 3 ? argv[2] : NULL)); 147 | terminate(); 148 | } 149 | -------------------------------------------------------------------------------- /src/util.h: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | * 3 | * util.h 4 | * 5 | * This module contains basic definitions and utility routines that 6 | * are needed by almost every module/program in the OCR Experimental 7 | * Environment. 8 | * 9 | * Author: Stephen V. Rice 10 | * 11 | * Copyright 1996 The Board of Regents of the Nevada System of Higher 12 | * Education, on behalf, of the University of Nevada, Las Vegas, 13 | * Information Science Research Institute 14 | * 15 | * Licensed under the Apache License, Version 2.0 (the "License"); you 16 | * may not use this file except in compliance with the License. You 17 | * may obtain a copy of the License at 18 | * 19 | * http://www.apache.org/licenses/LICENSE-2.0 20 | * 21 | * Unless required by applicable law or agreed to in writing, software 22 | * distributed under the License is distributed on an "AS IS" BASIS, 23 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 24 | * implied. See the License for the specific language governing 25 | * permissions and limitations under the License. 26 | * 27 | **********************************************************************/ 28 | 29 | #ifndef _UTIL_ 30 | #define _UTIL_ 31 | 32 | #include 33 | #include 34 | #include 35 | 36 | #ifndef unix 37 | /* On Linux and Mac OS X */ 38 | #if defined(__unix__) || defined(__MACH__) 39 | #define unix 40 | #endif 41 | #endif 42 | 43 | #ifndef True 44 | typedef char Boolean; 45 | #define True 1 46 | #define False 0 47 | #endif 48 | 49 | #ifndef max 50 | #define max(a, b) ((a) > (b) ? 
/**********************************************************************
 *
 *  util.h
 *
 *  This module contains basic definitions and utility routines that
 *  are needed by almost every module/program in the OCR Experimental
 *  Environment.
 *
 *  Author: Stephen V. Rice
 *
 * Copyright 1996 The Board of Regents of the Nevada System of Higher
 * Education, on behalf, of the University of Nevada, Las Vegas,
 * Information Science Research Institute
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License. You
 * may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 *
 **********************************************************************/

#ifndef _UTIL_
#define _UTIL_

#include <stdio.h>      /* FILE */
#include <stdlib.h>     /* size_t */
#include <string.h>

#ifndef unix
/* On Linux and Mac OS X */
#if defined(__unix__) || defined(__MACH__)
#define unix
#endif
#endif

#ifndef True
typedef char Boolean;
#define True   1
#define False  0
#endif

/* NOTE: both macros evaluate their arguments more than once, so arguments
   must not have side effects */
#ifndef max
#define max(a, b)  ((a) > (b) ? (a) : (b))
#define min(a, b)  ((a) < (b) ? (a) : (b))
#endif

#define NEW(type)  ((type *) allocate((size_t) 1, sizeof(type)))
                        /* allocate one instance of "type" */
#define NEW_ARRAY(number, type) \
    ((type *) allocate((size_t) (number), sizeof(type)))
                        /* allocate an array of "type" */
void *allocate(/* size_t number, size_t size */);

int ustrcmp(/* unsigned char *s1, unsigned char *s2 */);
                        /* compares strings like "strcmp" but treats
                           characters as unsigned */

FILE *open_file(/* char *filename, char *mode */);
                        /* opens the named file in the specified mode;
                           reports an error and quits if unable to open the
                           file; if "filename" is NULL, returns stdin or
                           stdout, depending on the mode */

void close_file(/* FILE *f */);
                        /* closes the specified file */

Boolean file_exists(/* char *filename */);
                        /* returns True if the named file exists */

char *tempfilename();   /* creates and returns a unique name for a temporary
                           file */

char *basefilename(/* char *pathname */);
                        /* given a pathname, returns the base filename;
                           e.g., basefilename("/local/isri/bin/ocr") returns
                           "ocr" */

extern char *exec_name; /* base filename of the executable */

extern Boolean usage_when_no_args;
                        /* indicates whether the usage should be displayed
                           when no arguments have been specified to the
                           program; by default it is True; to override this,
                           set it to False before calling "initialize" */

extern void (*usage_routine)();
                        /* specifies a routine to be called to display the
                           usage, overriding the "usage" argument passed to
                           "initialize"; this must be set before calling
                           "initialize" */

extern void (*cleanup_routine)();
                        /* specifies a routine to be called upon exit */

typedef
struct
{
    char name;          /* character identifying the option; e.g., 'D' for
                           -D; '\0' to terminate an array of these */
    char **string;      /* if a string-valued option, address of variable to
                           hold the string value; e.g., for -Dcaere or
                           -D caere, the variable gets "caere"; if a
                           Boolean-valued option, this should be NULL */
    Boolean *boolean;   /* if a Boolean-valued option, address of variable
                           to set to True to indicate the option was
                           specified; if a string-valued option, this should
                           be NULL */
} Option;

void initialize(/* int *argc, char *argv[], char *usage, Option option[] */);
                        /* parses the command line arguments looking for any
                           of the allowed options; reports any invalid
                           option and quits; updates "argc" and "argv" to
                           contain only the non-option arguments; displays
                           "usage" when appropriate; "usage" may be NULL if
                           "usage_routine" has been set; "option" may be
                           NULL if there are no options */

void terminate()        /* terminates the program with exit status 0 */
    __attribute__ ((noreturn));

extern int errstatus;   /* status returned when exiting due to an error;
                           this is 1 by default */

void error(/* char *message */)
    __attribute__ ((noreturn));
                        /* writes an error message, then quits */
                        /* NOTE(review): this comment used to read "then
                           quits or returns"; the declaration is noreturn,
                           so the "or returns" case looks stale -- confirm
                           against util.c */

void error_string(/* char *message, char *string */)
    __attribute__ ((noreturn));
                        /* writes an error message, including "string" in
                           quotes, then quits */

void warning_string(/* char *message, char *string */);
                        /* writes a warning message, including "string" */

#endif
Rice 6 | * 7 | * Copyright 1996 The Board of Regents of the Nevada System of Higher 8 | * Education, on behalf, of the University of Nevada, Las Vegas, 9 | * Information Science Research Institute 10 | * 11 | * Licensed under the Apache License, Version 2.0 (the "License"); you 12 | * may not use this file except in compliance with the License. You 13 | * may obtain a copy of the License at 14 | * 15 | * http://www.apache.org/licenses/LICENSE-2.0 16 | * 17 | * Unless required by applicable law or agreed to in writing, software 18 | * distributed under the License is distributed on an "AS IS" BASIS, 19 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 20 | * implied. See the License for the specific language governing 21 | * permissions and limitations under the License. 22 | * 23 | **********************************************************************/ 24 | 25 | #include "sort.h" 26 | #include "table.h" 27 | #include "text.h" 28 | 29 | #define usage "[-n 1|2|3] textfile1 textfile2 ... 
>resultfile" 30 | 31 | #define MAX_N 3 32 | short n; 33 | char *nstring; 34 | 35 | Option option[] = 36 | { 37 | 'n', &nstring, NULL, 38 | '\0' 39 | }; 40 | 41 | Textopt textopt = { True, True, 0, True, True }; 42 | 43 | Text text; 44 | 45 | BEGIN_ENTRY(Sequence) 46 | Charvalue value[MAX_N]; 47 | long count, suspect; 48 | END_ENTRY(Sequence); 49 | 50 | BEGIN_TABLE_OF(Sequence, Seqlist) 51 | long total_count, total_suspect; 52 | END_TABLE(Seqtable); 53 | Seqtable seqtable; 54 | 55 | /**********************************************************************/ 56 | 57 | short get_n() 58 | { 59 | if (!nstring) 60 | return(1); 61 | if (nstring[0] >= '1' && nstring[0] <= '0' + MAX_N && !nstring[1]) 62 | return(nstring[0] - '0'); 63 | error_string("invalid value", nstring); 64 | } 65 | /**********************************************************************/ 66 | 67 | void add_sequence(key, value, suspect) 68 | char *key; 69 | Charvalue value[]; 70 | Boolean suspect; 71 | { 72 | Sequence *sequence; 73 | short i; 74 | sequence = table_lookup(&seqtable, key); 75 | if (!sequence) 76 | { 77 | sequence = NEW(Sequence); 78 | sequence->key = strdup(key); 79 | for (i = 0; i < n; i++) 80 | sequence->value[i] = value[i]; 81 | table_insert(&seqtable, sequence); 82 | } 83 | sequence->count++; 84 | seqtable.total_count++; 85 | if (suspect) 86 | { 87 | sequence->suspect++; 88 | seqtable.total_suspect++; 89 | } 90 | } 91 | /**********************************************************************/ 92 | 93 | void process_file(filename) 94 | char *filename; 95 | { 96 | Char *start, *c; 97 | char key[MAX_N * STRING_SIZE], string[STRING_SIZE]; 98 | Boolean suspect; 99 | short i; 100 | Charvalue value[MAX_N]; 101 | list_empty(&text, free); 102 | read_text(&text, filename, &textopt); 103 | for (start = text.first; start; start = start->next) 104 | { 105 | key[0] = '\0'; 106 | suspect = False; 107 | for (i = 0, c = start; i < n; i++, c = c->next) 108 | { 109 | if (!c) 110 | return; 111 | 
char_to_string(False, c->value, string, True); 112 | strcat(key, string); 113 | value[i] = c->value; 114 | if (c->suspect) 115 | suspect = True; 116 | } 117 | add_sequence(key, value, suspect); 118 | } 119 | } 120 | /**********************************************************************/ 121 | 122 | int order_by_value(sequence1, sequence2) 123 | Sequence *sequence1, *sequence2; 124 | { 125 | short i; 126 | for (i = 0; i < n && sequence1->value[i] == sequence2->value[i]; i++); 127 | if (i < n) 128 | return(sequence1->value[i] < sequence2->value[i] ? -1 : 1); 129 | return(0); 130 | } 131 | /**********************************************************************/ 132 | 133 | int order_by_count(sequence1, sequence2) 134 | Sequence *sequence1, *sequence2; 135 | { 136 | if (sequence1->count != sequence2->count) 137 | return(sequence2->count - sequence1->count); 138 | if (sequence1->suspect != sequence2->suspect) 139 | return(sequence2->suspect - sequence1->suspect); 140 | return(order_by_value(sequence1, sequence2)); 141 | } 142 | /**********************************************************************/ 143 | 144 | void write_array() 145 | { 146 | long i; 147 | printf(" Count Suspect\n"); 148 | for (i = 0; i < seqtable.count; i++) 149 | printf("%8ld %8ld {%s}\n", seqtable.array[i]->count, 150 | seqtable.array[i]->suspect, seqtable.array[i]->key); 151 | printf("%8ld %8ld Total\n", seqtable.total_count, 152 | seqtable.total_suspect); 153 | } 154 | /**********************************************************************/ 155 | 156 | void write_report() 157 | { 158 | table_in_array(&seqtable); 159 | sort(seqtable.count, seqtable.array, order_by_value); 160 | write_array(); 161 | printf("\n\n"); 162 | sort(seqtable.count, seqtable.array, order_by_count); 163 | write_array(); 164 | } 165 | /**********************************************************************/ 166 | 167 | main(argc, argv) 168 | int argc; 169 | char *argv[]; 170 | { 171 | int i; 172 | initialize(&argc, argv, 
usage, option); 173 | if (argc == 0) 174 | error("no text files specified"); 175 | n = get_n(); 176 | for (i = 0; i < argc; i++) 177 | process_file(argv[i]); 178 | write_report(); 179 | terminate(); 180 | } 181 | -------------------------------------------------------------------------------- /src/synctext.c: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | * 3 | * synctext.c 4 | * 5 | * Author: Stephen V. Rice 6 | * 7 | * Copyright 1996 The Board of Regents of the Nevada System of Higher 8 | * Education, on behalf, of the University of Nevada, Las Vegas, 9 | * Information Science Research Institute 10 | * 11 | * Licensed under the Apache License, Version 2.0 (the "License"); you 12 | * may not use this file except in compliance with the License. You 13 | * may obtain a copy of the License at 14 | * 15 | * http://www.apache.org/licenses/LICENSE-2.0 16 | * 17 | * Unless required by applicable law or agreed to in writing, software 18 | * distributed under the License is distributed on an "AS IS" BASIS, 19 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 20 | * implied. See the License for the specific language governing 21 | * permissions and limitations under the License. 22 | * 23 | **********************************************************************/ 24 | 25 | #include "sync.h" 26 | 27 | #define usage "[-H] [-i] [-s] [-T] textfile1 textfile2 ... 
>resultfile" 28 | 29 | Textopt textopt = { True, True, 0, True, True }; 30 | 31 | Boolean heuristic, show_suspect, transpose; 32 | 33 | Option option[] = 34 | { 35 | 'H', NULL, &heuristic, 36 | 'i', NULL, &textopt.case_insensitive, 37 | 's', NULL, &show_suspect, 38 | 'T', NULL, &transpose, 39 | '\0' 40 | }; 41 | 42 | /**********************************************************************/ 43 | 44 | void write_separator() 45 | { 46 | short i; 47 | for (i = 1; i < 80; i++) 48 | putchar('='); 49 | putchar(NEWLINE); 50 | } 51 | /**********************************************************************/ 52 | 53 | void write_transposed(synclist, text) 54 | Synclist *synclist; 55 | Text *text; 56 | { 57 | Sync *sync; 58 | long i; 59 | char string[STRING_SIZE]; 60 | write_separator(); 61 | putchar(NEWLINE); 62 | for (sync = synclist->first; sync; sync = sync->next) 63 | { 64 | if (sync->match) 65 | printf("{%ld:", *sync->match); 66 | for (i = sync->substr->start; i <= sync->substr->stop; i++) 67 | { 68 | char_to_string(show_suspect & text->array[i]->suspect, 69 | text->array[i]->value, string, False); 70 | printf("%s", string); 71 | } 72 | if (sync->match) 73 | putchar('}'); 74 | } 75 | putchar(NEWLINE); 76 | } 77 | /**********************************************************************/ 78 | 79 | void write_matches(synclist, num_text, text) 80 | Synclist *synclist; 81 | short num_text; 82 | Text *text; 83 | { 84 | Sync *sync; 85 | long i, j, footnote = 0; 86 | Boolean suspect; 87 | char string[STRING_SIZE]; 88 | write_separator(); 89 | putchar(NEWLINE); 90 | for (sync = synclist->first; sync; sync = sync->next) 91 | if (sync->match) 92 | for (i = 0; i < sync->substr[0].length; i++) 93 | { 94 | suspect = False; 95 | if (show_suspect) 96 | for (j = 0; j < num_text && !suspect; j++) 97 | if (text[j].array[sync->substr[j].start + i]->suspect) 98 | suspect = True; 99 | char_to_string(suspect, 100 | text[0].array[sync->substr[0].start + i]->value, string, False); 101 | 
printf("%s", string); 102 | } 103 | else 104 | printf("{%ld}", ++footnote); 105 | putchar(NEWLINE); 106 | } 107 | /**********************************************************************/ 108 | 109 | void write_differences(synclist, num_text, text, filename) 110 | Synclist *synclist; 111 | short num_text; 112 | Text *text; 113 | char *filename[]; 114 | { 115 | long i, j, maxlen = 0, footnote = 0; 116 | char format[20], string[STRING_SIZE]; 117 | Sync *sync; 118 | for (i = 0; i < num_text; i++) 119 | maxlen = max(maxlen, strlen(filename[i])); 120 | sprintf(format, "%%-%lds", maxlen); 121 | for (sync = synclist->first; sync; sync = sync->next) 122 | if (!sync->match) 123 | { 124 | write_separator(); 125 | printf("{%ld}\n", ++footnote); 126 | for (i = 0; i < num_text; i++) 127 | { 128 | printf(format, filename[i]); 129 | printf(" {"); 130 | for (j = sync->substr[i].start; j <= sync->substr[i].stop; j++) 131 | { 132 | char_to_string(show_suspect & text[i].array[j]->suspect, 133 | text[i].array[j]->value, string, False); 134 | printf("%s", string); 135 | if (text[i].array[j]->value == NEWLINE) 136 | { 137 | printf(format, ""); 138 | printf(" "); 139 | } 140 | } 141 | printf("}\n"); 142 | } 143 | } 144 | } 145 | /**********************************************************************/ 146 | 147 | main(argc, argv) 148 | int argc; 149 | char *argv[]; 150 | { 151 | Text *text; 152 | int i; 153 | Synclist synclist1, synclist2; 154 | initialize(&argc, argv, usage, option); 155 | if (argc < 2 || (transpose && argc > 2)) 156 | error("invalid number of text files"); 157 | text = NEW_ARRAY(argc, Text); 158 | for (i = 0; i < argc; i++) 159 | read_text(&text[i], argv[i], &textopt); 160 | if (transpose) 161 | { 162 | transpose_sync(&synclist1, &synclist2, &text[0], &text[1]); 163 | write_transposed(&synclist1, &text[0]); 164 | write_transposed(&synclist2, &text[1]); 165 | } 166 | else 167 | { 168 | if (heuristic || argc > 2) 169 | synchronize(&synclist1, argc, text); 170 | else 171 | 
fastukk_sync(&synclist1, text); 172 | write_matches(&synclist1, argc, text); 173 | write_differences(&synclist1, argc, text, argv); 174 | } 175 | write_separator(); 176 | terminate(); 177 | } 178 | -------------------------------------------------------------------------------- /src/text.h: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | * 3 | * text.h 4 | * 5 | * This module provides definitions and routines to support the reading 6 | * and writing of OCR-generated text. A "Text" structure is defined to 7 | * be a linked list of "Char" structures, where each "Char" structure 8 | * gives the value of one 32-bit Unicode character and indicates whether it 9 | * is suspect. 10 | * 11 | * Author: Stephen V. Rice (1996) 12 | * Author: Eddie Antonio Santos (2015) 13 | * 14 | * Copyright 1996 The Board of Regents of the Nevada System of Higher 15 | * Education, on behalf, of the University of Nevada, Las Vegas, 16 | * Information Science Research Institute 17 | * 18 | * Licensed under the Apache License, Version 2.0 (the "License"); you 19 | * may not use this file except in compliance with the License. You 20 | * may obtain a copy of the License at 21 | * 22 | * http://www.apache.org/licenses/LICENSE-2.0 23 | * 24 | * Unless required by applicable law or agreed to in writing, software 25 | * distributed under the License is distributed on an "AS IS" BASIS, 26 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 27 | * implied. See the License for the specific language governing 28 | * permissions and limitations under the License. 
29 | * 30 | **********************************************************************/ 31 | 32 | #ifndef _TEXT_ 33 | #define _TEXT_ 34 | 35 | #include 36 | 37 | #include "list.h" 38 | #include "util.h" 39 | 40 | #define BLANK ' ' 41 | #define NEWLINE '\n' 42 | #define LINE_TABULATION 0x00B 43 | #define FORM_FEED 0x00C 44 | #define CARRIAGE_RETURN 0x00D 45 | #define NON_BREAKING_SPACE 0x0A0 46 | #define REJECT_CHARACTER '~' 47 | #define SUSPECT_MARKER '^' 48 | #define COMBINING_MARK_BASE ((Charvalue) 0x25CC) /* ◌ -- Dotted circle */ 49 | #define INVALID_CHARVALUE ((Charvalue) (-1)) 50 | 51 | /* Use UTF-32 internally. */ 52 | typedef uint32_t Charvalue; 53 | /* Technically, there are far less *scalar values* (what you and I call 54 | * "characters") in Unicode, but using the max number of *code points* here 55 | * makes the implementation simpler. See: http://www.unicode.org/glossary/ */ 56 | #define NUM_CHARVALUES 0x10FFFF 57 | 58 | /* Maximum char size of a single char_to_string() operation, including the 59 | * null-terminator. It's either the largetst size of an encoded non-graphic 60 | * character (this happens to be U+10FFFF) OR the size of an astral (non-BMP) 61 | * combining character, that combines on top of U+25CC DOTTED CIRCLE 62 | * WITH a suspect marker! 
*/ 63 | #define STRING_SIZE (max(sizeof("^◌𐇽"), sizeof("<10FFFF>"))) 64 | 65 | BEGIN_ITEM(Char) 66 | Boolean suspect; 67 | Charvalue value; 68 | END_ITEM(Char); 69 | 70 | BEGIN_LIST_OF(Char) 71 | END_LIST(Text); 72 | 73 | void append_char(/* Text *text, Boolean suspect, Charvalue value */); 74 | /* appends the given character to "text" */ 75 | 76 | typedef 77 | struct 78 | { 79 | Boolean find_header;/* if True, a header will be looked for and skipped if 80 | present; if False and a header is present, it will be 81 | stored as text */ 82 | Boolean find_markers; 83 | /* if True, any occurrence of the "suspect_marker" 84 | character will be interpreted as marking the 85 | following character as suspect */ 86 | 87 | Charvalue suspect_marker; 88 | /* applicable when "find_markers" is True; if zero, 89 | SUSPECT_MARKER will be used */ 90 | Boolean find_hex_values; 91 | /* deprecated and silently ignored */ 92 | Boolean normalize; /* if True, spacing is compressed */ 93 | Boolean case_insensitive; 94 | /* if True, letters are converted to lower-case */ 95 | Boolean found_header; 96 | /* set to True if a header was found; applicable when 97 | "find_header" is True */ 98 | } Textopt; 99 | 100 | void read_text(/* Text *text, char *filename, Textopt *textopt */); 101 | /* reads the named file (or stdin if "filename" is NULL 102 | and "textopt->find_header" is False) based on the 103 | options specified in "textopt", and appends each 104 | character to "text"; reports an error and quits if 105 | unable to open the file */ 106 | 107 | void char_to_string(/* Boolean suspect, Charvalue value, char *string, 108 | Boolean fake_newline */); 109 | /* stores a representation of the given character in 110 | "string", which must be at least STRING_SIZE bytes; a 111 | non-printable character is represented by a hex value 112 | of the form or ; if "fake_newline" is 113 | True, the newline character is represented by <\n>, 114 | which is desirable for some reports */ 115 | 116 | signed 
char encode_or_die(/* Charvalue value, char *string */); 117 | /* writes a Unicode value to the given string; 118 | * exits if the value cannot be written; 119 | * returns characters written to string */ 120 | 121 | Boolean cstring_to_text(Text* text, const char *string); 122 | /* appends the UTF-8 string to the text list; 123 | * exits if the value cannot be written; */ 124 | 125 | 126 | void write_text(/* Text *text, char *filename, 127 | void (*write_header)(FILE *f) */); 128 | /* writes each character of "text" to the named file 129 | (or stdout if "filename" is NULL) using 130 | "char_to_string" to represent the characters; 131 | if "write_header" is non-NULL, this routine is 132 | called to write a header to the file; reports an 133 | error and quits if unable to create the file */ 134 | 135 | #endif 136 | -------------------------------------------------------------------------------- /src/charclass.c: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | * 3 | * charclass.c 4 | * 5 | * Author: Stephen V. Rice 6 | * 7 | * Copyright 1996 The Board of Regents of the Nevada System of Higher 8 | * Education, on behalf, of the University of Nevada, Las Vegas, 9 | * Information Science Research Institute 10 | * 11 | * Licensed under the Apache License, Version 2.0 (the "License"); you 12 | * may not use this file except in compliance with the License. You 13 | * may obtain a copy of the License at 14 | * 15 | * http://www.apache.org/licenses/LICENSE-2.0 16 | * 17 | * Unless required by applicable law or agreed to in writing, software 18 | * distributed under the License is distributed on an "AS IS" BASIS, 19 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 20 | * implied. See the License for the specific language governing 21 | * permissions and limitations under the License. 
22 | * 23 | **********************************************************************/ 24 | 25 | #include "charclass.h" 26 | /* Lazily built lookup state: class names in order of first appearance in range[], and the class index of every character value (zero-initialized, so values not listed in range[] keep class 0). */ 27 | static short num_classes; 28 | static char *class_name[MAX_CHARCLASSES]; 29 | static Charclass class[NUM_CHARVALUES]; 30 | /* Character ranges and the name of the class each belongs to; the first entry makes class 0 "Unassigned". */ 31 | static struct range 32 | { 33 | Charvalue start, stop; 34 | char *name; 35 | } range[] = 36 | { 37 | 0x0000, 0x0000, "Unassigned", 38 | 0x0000, 0x0008, "ASCII Control Codes", 39 | 0x0009, 0x000D, "ASCII Spacing Characters", 40 | 0x000E, 0x001F, "ASCII Control Codes", 41 | 0x0020, 0x0020, "ASCII Spacing Characters", 42 | 0x0021, 0x002F, "ASCII Special Symbols", 43 | 0x0030, 0x0039, "ASCII Digits", 44 | 0x003A, 0x0040, "ASCII Special Symbols", 45 | 0x0041, 0x005A, "ASCII Uppercase Letters", 46 | 0x005B, 0x0060, "ASCII Special Symbols", 47 | 0x0061, 0x007A, "ASCII Lowercase Letters", 48 | 0x007B, 0x007E, "ASCII Special Symbols", 49 | 0x007F, 0x007F, "ASCII Control Codes", 50 | 0x0080, 0x009F, "Latin1 Control Codes", 51 | 0x00A0, 0x00A0, "Latin1 Spacing Characters", 52 | 0x00A1, 0x00BF, "Latin1 Special Symbols", 53 | 0x00C0, 0x00D6, "Latin1 Uppercase Letters", 54 | 0x00D7, 0x00D7, "Latin1 Special Symbols", 55 | 0x00D8, 0x00DE, "Latin1 Uppercase Letters", 56 | 0x00DF, 0x00F6, "Latin1 Lowercase Letters", 57 | 0x00F7, 0x00F7, "Latin1 Special Symbols", 58 | 0x00F8, 0x00FF, "Latin1 Lowercase Letters", 59 | 0x0100, 0x017F, "Latin Extended-A", 60 | 0x0180, 0x024F, "Latin Extended-B", 61 | 0x0250, 0x02AF, "IPA Extensions", 62 | 0x02B0, 0x02FF, "Spacing Modifier Letters", 63 | 0x0300, 0x036F, "Combining Diacritical Marks", 64 | 0x0370, 0x03CF, "Basic Greek", 65 | 0x03D0, 0x03FF, "Greek Symbols and Coptic", 66 | 0x0400, 0x04FF, "Cyrillic", 67 | 0x0530, 0x058F, "Armenian", 68 | 0x0590, 0x05CF, "Hebrew Extended-A", 69 | 0x05D0, 0x05EA, "Basic Hebrew", 70 | 0x05EB, 0x05FF, "Hebrew Extended-B", 71 | 0x0600, 0x0652, "Basic Arabic", 72 | 0x0653, 0x06FF, "Arabic Extended", 73 | 0x0900, 0x097F, "Devanagari", 74 | 0x0980, 0x09FF,
"Bengali", 75 | 0x0A00, 0x0A7F, "Gurmukhi", 76 | 0x0A80, 0x0AFF, "Gujarati", 77 | 0x0B00, 0x0B7F, "Oriya", 78 | 0x0B80, 0x0BFF, "Tamil", 79 | 0x0C00, 0x0C7F, "Telugu", 80 | 0x0C80, 0x0CFF, "Kannada", 81 | 0x0D00, 0x0D7F, "Malayalam", 82 | 0x0E00, 0x0E7F, "Thai", 83 | 0x0E80, 0x0EFF, "Lao", 84 | 0x10A0, 0x10CF, "Georgian Extended", 85 | 0x10D0, 0x10FF, "Basic Georgian", 86 | 0x1100, 0x11FF, "Hanguljamo", 87 | 0x1E00, 0x1EFF, "Latin Extended Additional", 88 | 0x1F00, 0x1FFF, "Greek Extended", 89 | 0x2000, 0x206F, "General Punctuation", 90 | 0x2070, 0x209F, "Superscripts and Subscripts", 91 | 0x20A0, 0x20CF, "Currency Symbols", 92 | 0x20D0, 0x20FF, "Combining Diacritical Marks for Symbols", 93 | 0x2100, 0x214F, "Letterlike Symbols", 94 | 0x2150, 0x218F, "Number Forms", 95 | 0x2190, 0x21FF, "Arrows", 96 | 0x2200, 0x22FF, "Mathematical Operators", 97 | 0x2300, 0x23FF, "Miscellaneous Technical", 98 | 0x2400, 0x243F, "Control Pictures", 99 | 0x2440, 0x245F, "Optical Character Recognition", 100 | 0x2460, 0x24FF, "Enclosed Alphanumerics", 101 | 0x2500, 0x257F, "Box Drawing", 102 | 0x2580, 0x259F, "Block Elements", 103 | 0x25A0, 0x25FF, "Geometric Shapes", 104 | 0x2600, 0x26FF, "Miscellaneous Symbols", 105 | 0x2700, 0x27BF, "Dingbats", 106 | 0x3000, 0x303F, "CJK Symbols and Punctuation", 107 | 0x3040, 0x309F, "Hiragana", 108 | 0x30A0, 0x30FF, "Katakana", 109 | 0x3100, 0x312F, "Bopomofo", 110 | 0x3130, 0x318F, "Hangul Compatibility Jamo", 111 | 0x3190, 0x319F, "CJK Miscellaneous", 112 | 0x3200, 0x32FF, "Enclosed CJK Letters and Months", 113 | 0x3300, 0x33FF, "CJK Compatibility", 114 | 0x3400, 0x3D2D, "Hangul", 115 | 0x3D2E, 0x44B7, "Hangul Supplementary-A", 116 | 0x44B8, 0x4DFF, "Hangul Supplementary-B", 117 | 0x4E00, 0x9FFF, "CJK Unified Ideographs", 118 | 0xE000, 0xF8FF, "Private Use Area", 119 | 0xF900, 0xFAFF, "CJK Compatibility Ideographs", 120 | 0xFB00, 0xFB4F, "Alphabetic Presentation Forms", 121 | 0xFB50, 0xFDFF, "Arabic Presentation Forms-A", 122 | 0xFE20, 0xFE2F,
"Combining Half Marks", 123 | 0xFE30, 0xFE4F, "CJK Compatibility Forms", 124 | 0xFE50, 0xFE6F, "Small Form Variants", 125 | 0xFE70, 0xFEFE, "Arabic Presentation Forms-B", 126 | 0xFF00, 0xFFEF, "Halfwidth and Fullwidth Forms", 127 | 0xFFF0, 0xFFFD, "Specials" 128 | }; 129 | 130 | /**********************************************************************/ 131 | /* Builds class_name[] and class[] from range[]; ranges that share a name share one class index, and a later range overwrites an earlier one where they overlap. */ 132 | static void initialize_charclass() 133 | { 134 | long i, j, k; 135 | for (i = 0; i < sizeof(range) / sizeof(struct range); i++) 136 | { 137 | for (j = 0; j < num_classes && 138 | strcmp(range[i].name, class_name[j]) != 0; j++); 139 | if (j == num_classes) 140 | { 141 | if (num_classes == MAX_CHARCLASSES) 142 | error("too many character classes"); 143 | num_classes++; 144 | class_name[j] = range[i].name; 145 | } 146 | for (k = range[i].start; k <= range[i].stop; k++) 147 | class[k] = j; 148 | } 149 | } 150 | /**********************************************************************/ 151 | /* Returns the class index of "value", building the tables on first use; a value that does not fit in class[] (U+10FFFF itself, or INVALID_CHARVALUE) is reported as class 0, "Unassigned", instead of being read past the end of the array. */ 152 | Charclass charclass(value) 153 | Charvalue value; 154 | { 155 | if (num_classes == 0) 156 | initialize_charclass(); 157 | return(value < NUM_CHARVALUES ? class[value] : 0); 158 | } 159 | /**********************************************************************/ 160 | /* Returns the name of the given class, building the tables on first use; reports an error if the index is not below the number of known classes. */ 161 | char *charclass_name(class) 162 | Charclass class; 163 | { 164 | if (num_classes == 0) 165 | initialize_charclass(); 166 | if (class >= num_classes) 167 | error("invalid character class"); 168 | return(class_name[class]); 169 | } 170 | -------------------------------------------------------------------------------- /test/word_test.c: -------------------------------------------------------------------------------- 1 | #include "greatest.h" 2 | #include "test_utils.h" 3 | 4 | #include 5 | 6 | static Wordlist wordlist_; 7 | static Wordlist *wordlist = &wordlist_; 8 | 9 | /* Aliases for traversing the linked list. */ 10 | /* ALWAYS ensure sufficient length before using these aliases.
*/ 11 | #define second first->next 12 | #define third first->next->next 13 | #define fourth first->next->next->next 14 | #define fifth first->next->next->next->next 15 | #define sixth first->next->next->next->next->next 16 | #define seventh first->next->next->next->next->next->next 17 | #define eighth first->next->next->next->next->next->next->next 18 | #define ninth first->next->next->next->next->next->next->next->next 19 | 20 | /* A lone alphanumeric token is returned as exactly one word. */ 21 | TEST find_words_segments_a_single_ascii_word() { 22 | cstring_to_text(text, "C11"); 23 | find_words(wordlist, text); 24 | 25 | ASSERT_EQ_FMT(1, wordlist->count, "%d"); 26 | ASSERT_STR_EQ("C11", wordlist->first->string); 27 | PASS(); 28 | } 29 | /* Decomposed input (base letter followed by combining marks) must come back with the precomposed character U+1EDF. */ 30 | TEST find_words_returns_nfc() { 31 | char pho_nfc[] = { 'p', 'h', 0xE1, 0xBB, 0x9F, 0 }; 32 | /* With two combining characters. */ 33 | cstring_to_text(text, (char []) { 'p', 'h', 'o', 34 | 0xCC, 0x9B, /* ◌̛ */ 35 | 0xCC, 0x89, /* ◌̉ */ 36 | 0 }); 37 | find_words(wordlist, text); 38 | 39 | ASSERT_EQ_FMT(1, wordlist->count, "%d"); 40 | ASSERT_STR_EQ(pho_nfc, wordlist->first->string); 41 | PASS(); 42 | } 43 | 44 | TEST find_words_returns_zero_when_not_given_words() { 45 | /* Only punctuation, symbols, and whitespace: nothing here is a word. */ 46 | cstring_to_text(text, "#$#@! #@!\n#@!!!$#"); 47 | find_words(wordlist, text); 48 | 49 | ASSERT_EQ_FMT(0, wordlist->count, "%d"); 50 | PASS(); 51 | } 52 | 53 | 54 | /* Exercises ASCII characters.
*/ 55 | TEST find_words_segments_english_with_punctuation() { 56 | /* From: http://unicode.org/reports/tr29/#Word_Boundaries */ 57 | cstring_to_text(text, "The quick (\"brown\") fox can’t jump 32.3 feet, " 58 | "right?"); 59 | find_words(wordlist, text); 60 | /* Only the count covers the first and eighth words; the others are checked individually. */ 61 | ASSERT_EQ_FMT(9, wordlist->count, "%d"); 62 | ASSERT_STR_EQ("quick", wordlist->second->string); 63 | ASSERT_STR_EQ("brown", wordlist->third->string); 64 | ASSERT_STR_EQ("fox", wordlist->fourth->string); 65 | ASSERT_STR_EQ("can’t", wordlist->fifth->string); 66 | ASSERT_STR_EQ("jump", wordlist->sixth->string); 67 | ASSERT_STR_EQ("32.3", wordlist->seventh->string); 68 | ASSERT_STR_EQ("right", wordlist->ninth->string); 69 | 70 | PASS(); 71 | } 72 | 73 | /* Exercises Latin-1 characters and punctuation. */ 74 | TEST find_words_segments_spanish_words() { 75 | cstring_to_text(text, "¡Feliz año nuevo!"); 76 | find_words(wordlist, text); 77 | 78 | ASSERT_EQ_FMT(3, wordlist->count, "%d"); 79 | ASSERT_STR_EQ("Feliz", wordlist->first->string); 80 | ASSERT_STR_EQ("año", wordlist->second->string); 81 | ASSERT_STR_EQ("nuevo", wordlist->third->string); 82 | 83 | PASS(); 84 | } 85 | 86 | /* Exercises numeric processing. */ 87 | TEST find_words_segments_numerals() { 88 | /* From https://github.com/eddieantonio/ocreval/issues/3 */ 89 | cstring_to_text(text, "PLASTIK-KARTON BARDA %18 *1,75"); 90 | find_words(wordlist, text); 91 | 92 | ASSERT_EQ_FMT(5, wordlist->count, "%d"); 93 | ASSERT_STR_EQ("PLASTIK", wordlist->first->string); 94 | ASSERT_STR_EQ("KARTON", wordlist->second->string); 95 | ASSERT_STR_EQ("BARDA", wordlist->third->string); 96 | ASSERT_STR_EQ("18", wordlist->fourth->string); 97 | ASSERT_STR_EQ("1,75", wordlist->fifth->string); 98 | 99 | PASS(); 100 | } 101 | 102 | /* Exercises segmentation of text written without spaces (katakana, hiragana, kanji). */ 103 | TEST find_words_segments_japanese() { 104 | /* This phrase -- rōkaraizu no densetsu 'Legend of Localization' -- is 105 | * conveniently written in katakana, hiragana, and kanji, respectively.
*/ 106 | cstring_to_text(text, "ローカライズの伝説"); 107 | find_words(wordlist, text); 108 | 109 | ASSERT_EQ_FMT(4, wordlist->count, "%d"); 110 | ASSERT_STR_EQ("ローカライズ", wordlist->first->string); 111 | ASSERT_STR_EQ("の", wordlist->second->string); 112 | /* A Japanese- tailored algorithm would segment this into three words 113 | * instead of four, however, that would involve incorporating a Japanese 114 | * dictionary in order to look-up Kanji words... */ 115 | ASSERT_STR_EQ("伝", wordlist->third->string); 116 | ASSERT_STR_EQ("説", wordlist->fourth->string); 117 | 118 | PASS(); 119 | } 120 | /* Exercises words containing small capitals, modifier letters, and combining marks; the seventh word is covered only by the count. */ 121 | TEST find_words_segments_haida_words() { 122 | cstring_to_text(text, "Wᴀˊstᴀ haˊoîsîn ᵋāl ʟēˊłas ʟ̣ū haoîsîˊn\n" 123 | "l’ sᵋaiˊᵋänᴀn."); 124 | find_words(wordlist, text); 125 | 126 | ASSERT_EQ_FMT(8, wordlist->count, "%d"); 127 | ASSERT_STR_EQ("Wᴀˊstᴀ", wordlist->first->string); 128 | ASSERT_STR_EQ("haˊoîsîn", wordlist->second->string); 129 | ASSERT_STR_EQ("ᵋāl", wordlist->third->string); 130 | ASSERT_STR_EQ("ʟēˊłas", wordlist->fourth->string); 131 | ASSERT_STR_EQ("ʟ̣ū", wordlist->fifth->string); 132 | ASSERT_STR_EQ("haoîsîˊn", wordlist->sixth->string); 133 | ASSERT_STR_EQ("sᵋaiˊᵋänᴀn", wordlist->eighth->string); 134 | 135 | PASS(); 136 | } 137 | 138 | /* Regression test -- word boundaries segfaults on control characters.
139 | * See: https://github.com/eddieantonio/ocreval/issues/22#issuecomment-491448129 */ 140 | TEST find_words_control_character() { 141 | cstring_to_text(text, "\003"); 142 | find_words(wordlist, text); 143 | 144 | ASSERT_EQ_FMT(0, wordlist->count, "%d"); 145 | 146 | PASS(); 147 | } 148 | 149 | /* Drop the ordinal aliases so they cannot leak into the code below. */ 150 | #undef second 151 | #undef third 152 | #undef fourth 153 | #undef fifth 154 | #undef sixth 155 | #undef seventh 156 | #undef eighth 157 | #undef ninth 158 | /* Setup hook (registered with SET_SETUP below): initializes "text" and the word list. */ 159 | static void setup_find_words(void *unused) { 160 | initialize_texts((Text*[]) {text, NULL}); 161 | list_initialize(wordlist); 162 | } 163 | /* Teardown hook (registered with SET_TEARDOWN below): deinitializes "text" and empties the word list, passing free_word for its items. */ 164 | static void teardown_find_words(void *unused) { 165 | deinitialize_texts((Text*[]) {text, NULL}); 166 | list_empty(wordlist, free_word); 167 | } 168 | /* Registers the hooks above and runs every find_words test. */ 169 | SUITE(find_words_suite) { 170 | SET_SETUP(setup_find_words, NULL); 171 | SET_TEARDOWN(teardown_find_words, NULL); 172 | 173 | RUN_TEST(find_words_segments_a_single_ascii_word); 174 | RUN_TEST(find_words_returns_nfc); 175 | RUN_TEST(find_words_returns_zero_when_not_given_words); 176 | RUN_TEST(find_words_segments_english_with_punctuation); 177 | RUN_TEST(find_words_segments_spanish_words); 178 | RUN_TEST(find_words_segments_haida_words); 179 | RUN_TEST(find_words_segments_numerals); 180 | RUN_TEST(find_words_segments_japanese); 181 | RUN_TEST(find_words_control_character); 182 | } 183 | -------------------------------------------------------------------------------- /src/editop.c: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | * 3 | * editop.c 4 | * 5 | * Author: Stephen V.
Rice 6 | * 7 | * Copyright 1996 The Board of Regents of the Nevada System of Higher 8 | * Education, on behalf, of the University of Nevada, Las Vegas, 9 | * Information Science Research Institute 10 | * 11 | * Licensed under the Apache License, Version 2.0 (the "License"); you 12 | * may not use this file except in compliance with the License. You 13 | * may obtain a copy of the License at 14 | * 15 | * http://www.apache.org/licenses/LICENSE-2.0 16 | * 17 | * Unless required by applicable law or agreed to in writing, software 18 | * distributed under the License is distributed on an "AS IS" BASIS, 19 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 20 | * implied. See the License for the specific language governing 21 | * permissions and limitations under the License. 22 | * 23 | **********************************************************************/ 24 | 25 | #include "edorpt.h" 26 | #include "sync.h" 27 | 28 | #define usage "correctfile generatedfile [editop_report]" 29 | 30 | Boolean debug; 31 | 32 | Option option[] = 33 | { 34 | 'D', NULL, &debug, 35 | '\0' 36 | }; 37 | 38 | Textopt textopt = { True, True, 0, True, True }; 39 | 40 | Text text1, text2; 41 | 42 | Edodata edodata; 43 | 44 | typedef 45 | struct 46 | { 47 | Boolean for_insertion, for_deletion; 48 | Sync *sync; 49 | } Candidate; 50 | 51 | /**********************************************************************/ 52 | /* Debug aid: prints "label" followed by {match:length} for every sync in the list; every sync must have a match. */ 53 | void display(synclist, label) 54 | Synclist *synclist; 55 | char *label; 56 | { 57 | Sync *sync; 58 | printf("%-9s:", label); 59 | for (sync = synclist->first; sync; sync = sync->next) 60 | printf(" {%ld:%ld}", *sync->match, sync->substr->length); 61 | printf("\n\n"); 62 | } 63 | /**********************************************************************/ 64 | /* Removes "sync" from the list and frees it together with its substr and, when present, its match. */ 65 | void discard_sync(synclist, sync) 66 | Synclist *synclist; 67 | Sync *sync; 68 | { 69 | list_remove(synclist, sync); 70 | free(sync->substr); 71 | if (sync->match) 72 | free(sync->match); 73 | free(sync); 74
| } 75 | /**********************************************************************/ 76 | /* Adds to edodata.total_insertions every character of "text" that lies in an unmatched sync and is not the REJECT_CHARACTER. */ 77 | void count_insertions(synclist, text) 78 | Synclist *synclist; 79 | Text *text; 80 | { 81 | Sync *sync; 82 | long i; 83 | for (sync = synclist->first; sync; sync = sync->next) 84 | if (!sync->match) 85 | for (i = sync->substr->start; i <= sync->substr->stop; i++) 86 | if (text->array[i]->value != REJECT_CHARACTER) 87 | edodata.total_insertions++; 88 | } 89 | /**********************************************************************/ 90 | /* Adds the length of every unmatched sync to edodata.total_deletions and discards it, leaving only matched syncs in the list. */ 91 | void count_deletions(synclist) 92 | Synclist *synclist; 93 | { 94 | Sync *sync, *next; 95 | sync = synclist->first; 96 | while (sync) 97 | { 98 | next = sync->next; 99 | if (!sync->match) 100 | { 101 | edodata.total_deletions += sync->substr->length; 102 | discard_sync(synclist, sync); 103 | } 104 | sync = next; 105 | } 106 | } 107 | /**********************************************************************/ 108 | /* Decrements the match number of every sync whose match number exceeds "limit". */ 109 | void decrement_match(synclist, limit) 110 | Synclist *synclist; 111 | long limit; 112 | { 113 | Sync *sync; 114 | for (sync = synclist->first; sync; sync = sync->next) 115 | if (*sync->match > limit) 116 | *sync->match -= 1; 117 | } 118 | /**********************************************************************/ 119 | /* Merges a sync with its successor whenever the successor's match number is exactly one greater, renumbering the later matches after each merge; the same sync is re-examined until no merge applies. */ 120 | void combine_adjacent(synclist) 121 | Synclist *synclist; 122 | { 123 | Sync *sync, *next; 124 | sync = synclist->first; 125 | while (sync) 126 | { 127 | next = sync->next; 128 | if (next && *next->match == *sync->match + 1) 129 | { 130 | sync->substr->length += next->substr->length; 131 | discard_sync(synclist, next); 132 | decrement_match(synclist, *sync->match); 133 | } 134 | else 135 | sync = next; 136 | } 137 | if (debug) 138 | display(synclist, "combined"); 139 | } 140 | /**********************************************************************/ 141 | /* Returns a newly allocated array indexed by match number (1..count) giving the sync with that number and whether it sits in a gap between two syncs that would otherwise be adjacent (for_insertion) or separates two such syncs (for_deletion); the caller frees it. */ 142 | Candidate *find_candidates(synclist) 143 | Synclist *synclist; 144 | { 145 | Candidate *candidate; 146 | Sync *sync; 147 | candidate =
NEW_ARRAY(synclist->count + 1, Candidate); 148 | for (sync = synclist->first; sync; sync = sync->next) 149 | { 150 | if (sync->next && *sync->next->match == *sync->match + 2) 151 | candidate[*sync->match + 1].for_insertion = True; 152 | if (sync->next && sync->next->next && 153 | *sync->next->next->match == *sync->match + 1) 154 | candidate[*sync->next->match].for_deletion = True; 155 | candidate[*sync->match].sync = sync; 156 | } 157 | return(candidate); 158 | } 159 | /**********************************************************************/ 160 | /* Picks the sync to move: the one with the greatest reduction (1, plus 1 for each candidate flag set), ties going to the shorter substring; move_i and move_length are initialized so an empty list cannot yield an indeterminate index. */ 161 | Sync *find_move(synclist, candidate) 162 | Synclist *synclist; 163 | Candidate candidate[]; 164 | { 165 | long i, reduction, move_i = 0, move_length = 0, move_reduction = 0; 166 | for (i = 1; i <= synclist->count; i++) 167 | { 168 | reduction = 1; 169 | if (candidate[i].for_insertion) 170 | reduction++; 171 | if (candidate[i].for_deletion) 172 | reduction++; 173 | if (reduction > move_reduction || (reduction == move_reduction && 174 | candidate[i].sync->substr->length < move_length)) 175 | { 176 | move_i = i; 177 | move_length = candidate[i].sync->substr->length; 178 | move_reduction = reduction; 179 | } 180 | } 181 | return(candidate[move_i].sync); 182 | } 183 | /**********************************************************************/ 184 | /* Relocates "sync" next to the sync with the neighbouring match number (before match 2 when it is match 1, otherwise after match - 1) and tallies the move by length, capped at MAX_MOVE_LENGTH. */ 185 | void perform_move(synclist, candidate, sync) 186 | Synclist *synclist; 187 | Candidate candidate[]; 188 | Sync *sync; 189 | { 190 | short length; 191 | list_remove(synclist, sync); 192 | if (*sync->match == 1) 193 | list_insert_before(synclist, sync, candidate[2].sync); 194 | else 195 | list_insert_after(synclist, candidate[*sync->match - 1].sync, sync); 196 | edodata.total_moves++; 197 | length = min(sync->substr->length, MAX_MOVE_LENGTH); 198 | edodata.moves[length]++; 199 | if (debug) 200 | { 201 | char label[32]; /* room for "moved " plus any 64-bit long */ 202 | snprintf(label, sizeof(label), "moved %ld", *sync->match); 203 | display(synclist, label); 204 | } 205 | } 206 |
/**********************************************************************/ 207 | /* Combines adjacent syncs, then repeatedly picks and performs one move and combines again until at most one sync remains; each move is tallied by perform_move(). */ 208 | void count_moves(synclist) 209 | Synclist *synclist; 210 | { 211 | Candidate *candidate; 212 | Sync *sync; 213 | if (debug) 214 | display(synclist, "original"); 215 | combine_adjacent(synclist); 216 | while (synclist->count > 1) 217 | { 218 | candidate = find_candidates(synclist); 219 | sync = find_move(synclist, candidate); 220 | perform_move(synclist, candidate, sync); 221 | free(candidate); 222 | combine_adjacent(synclist); 223 | } 224 | } 225 | /**********************************************************************/ 226 | /* Reads the correct and generated files, counts insertions, deletions and moves, and writes the report to the third file argument when given (NULL is passed otherwise). */ 227 | main(argc, argv) 228 | int argc; 229 | char *argv[]; 230 | { 231 | Synclist synclist1, synclist2; 232 | initialize(&argc, argv, usage, option); 233 | if (argc < 2 || argc > 3) 234 | error("invalid number of files"); 235 | read_text(&text1, argv[0], &textopt); 236 | if (textopt.found_header) 237 | error("no correct file specified"); 238 | read_text(&text2, argv[1], &textopt); 239 | transpose_sync(&synclist1, &synclist2, &text1, &text2); 240 | count_insertions(&synclist1, &text1); 241 | count_deletions(&synclist2); 242 | count_moves(&synclist2); 243 | write_edorpt(&edodata, (argc == 3 ? argv[2] : NULL)); 244 | terminate(); 245 | } 246 | -------------------------------------------------------------------------------- /src/wordacc.c: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | * 3 | * wordacc.c 4 | * 5 | * Author: Stephen V. Rice 6 | * 7 | * Copyright 1996 The Board of Regents of the Nevada System of Higher 8 | * Education, on behalf, of the University of Nevada, Las Vegas, 9 | * Information Science Research Institute 10 | * 11 | * Licensed under the Apache License, Version 2.0 (the "License"); you 12 | * may not use this file except in compliance with the License.
You 13 | * may obtain a copy of the License at 14 | * 15 | * http://www.apache.org/licenses/LICENSE-2.0 16 | * 17 | * Unless required by applicable law or agreed to in writing, software 18 | * distributed under the License is distributed on an "AS IS" BASIS, 19 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 20 | * implied. See the License for the specific language governing 21 | * permissions and limitations under the License. 22 | * 23 | **********************************************************************/ 24 | 25 | #include "stopword.h" 26 | #include "wacrpt.h" 27 | 28 | #define usage "[-S stopwordfile] correctfile generatedfile [wordacc_report]" 29 | 30 | char *stopwordfilename; 31 | 32 | Option option[] = 33 | { 34 | 'S', &stopwordfilename, NULL, 35 | '\0' 36 | }; 37 | 38 | Textopt textopt = { True, True, 0, True, True, True }; 39 | Text text[2]; 40 | 41 | Wordlist wordlist[2]; 42 | /* One Id per distinct word string; found[i] records whether the word occurs in text i. */ 43 | BEGIN_ENTRY(Id) 44 | Boolean found[2]; 45 | END_ENTRY(Id); 46 | 47 | BEGIN_TABLE_OF(Id, Idlist) 48 | END_TABLE(Idtable); 49 | Idtable idtable; 50 | /* One Symbol per word occurrence: its Id and whether it was matched (set in obtain_matches). */ 51 | typedef 52 | struct 53 | { 54 | Id *id; 55 | Boolean recognized; 56 | } Symbol; 57 | /* symbol[i] holds every word of text i; a and b (lengths m and n) hold only the words that also occur in the other text. */ 58 | Symbol *symbol[2], **a, **b; 59 | 60 | long m, n, min_k, max_k; 61 | /* Element type of Path.f; setup() rejects inputs whose m exceeds MAX_F. */ 62 | typedef unsigned short F; 63 | #define MAX_F 65535 64 | 65 | BEGIN_ITEM(Path) 66 | long p; 67 | F *f; 68 | END_ITEM(Path); 69 | 70 | BEGIN_LIST_OF(Path) 71 | END_LIST(Pathlist); 72 | Pathlist pathlist; 73 | 74 | Wacdata wacdata; 75 | 76 | /**********************************************************************/ 77 | /* Returns a newly allocated array of pointers to the symbols of text "index" whose word also occurs in the other text, and stores the number of entries in "*length". */ 78 | Symbol **setup_array(index, length) 79 | long index, *length; 80 | { 81 | Symbol **array; 82 | long i, j = 0; 83 | array = NEW_ARRAY(wordlist[index].count + 1, Symbol *); 84 | for (i = 0; i < wordlist[index].count; i++) 85 | if (symbol[index][i].id->found[1 - index]) 86 | array[j++] = &symbol[index][i]; 87 | *length = j; 88 | return(array); 89 | } 90 | /**********************************************************************/ 91 |
92 | void setup(filename) 93 | char *filename[]; 94 | { 95 | long i, j; 96 | Word *word; 97 | Id *id; 98 | for (i = 0; i < 2; i++) 99 | { 100 | read_text(&text[i], filename[i], &textopt); 101 | if (i == 0 && textopt.found_header) 102 | error("no correct file specified"); 103 | find_words(&wordlist[i], &text[i]); 104 | symbol[i] = NEW_ARRAY(wordlist[i].count + 1, Symbol); 105 | j = 0; 106 | for (word = wordlist[i].first; word; word = word->next) 107 | { 108 | id = table_lookup(&idtable, word->string); 109 | if (!id) 110 | { 111 | id = NEW(Id); 112 | id->key = (char *) word->string; 113 | table_insert(&idtable, id); 114 | } 115 | id->found[i] = True; 116 | symbol[i][j++].id = id; 117 | } 118 | } 119 | a = setup_array(0, &m); 120 | if (m > MAX_F) 121 | error("text stream is too long"); 122 | b = setup_array(1, &n); 123 | } 124 | /**********************************************************************/ 125 | 126 | long initial_f(k, prev_path, prev_k) 127 | long k, *prev_k; 128 | Path *prev_path; 129 | { 130 | long value, result = 0; 131 | if (prev_path) 132 | { 133 | if (k > -prev_path->p) 134 | { 135 | result = prev_path->f[(k - 1 + prev_path->p) >> 1]; 136 | *prev_k = k - 1; 137 | } 138 | if (k < prev_path->p) 139 | { 140 | value = prev_path->f[(k + 1 + prev_path->p) >> 1] + 1; 141 | if (value >= result) 142 | { 143 | result = value; 144 | *prev_k = k + 1; 145 | } 146 | } 147 | } 148 | return(result); 149 | } 150 | /**********************************************************************/ 151 | 152 | void compute_f(k, path) 153 | long k; 154 | Path *path; 155 | { 156 | long i, j, value; 157 | i = initial_f(k, path->prev, &value); 158 | j = i + k; 159 | while (i < m && j < n && a[i]->id == b[j]->id) 160 | { 161 | i++; 162 | j++; 163 | } 164 | if (i == m) 165 | min_k = k + 1; 166 | if (j == n) 167 | max_k = k - 1; 168 | path->f[(k + path->p) >> 1] = i; 169 | } 170 | /**********************************************************************/ 171 | 172 | void 
compute_pathlist() 173 | { 174 | long p = -1, k; 175 | Path *path; 176 | min_k = -m; 177 | max_k = n; 178 | while (min_k <= n - m) 179 | { 180 | path = NEW(Path); 181 | path->p = ++p; 182 | path->f = NEW_ARRAY(p + 1, F); 183 | list_insert_last(&pathlist, path); 184 | k = -p; 185 | while (k <= p) 186 | { 187 | if (k >= min_k && k <= max_k) 188 | compute_f(k, path); 189 | k += 2; 190 | } 191 | } 192 | } 193 | /**********************************************************************/ 194 | 195 | void obtain_matches() 196 | { 197 | long k, f, start, prev_k; 198 | Path *path; 199 | k = n - m; 200 | for (path = pathlist.last; path; path = path->prev) 201 | { 202 | f = path->f[(k + path->p) >> 1]; 203 | start = initial_f(k, path->prev, &prev_k); 204 | while (f > start) 205 | a[--f]->recognized = True; 206 | k = prev_k; 207 | } 208 | } 209 | /**********************************************************************/ 210 | 211 | void process_terms(termtable, length, occurs) 212 | Termtable *termtable; 213 | Wac length[], occurs[]; 214 | { 215 | long i, count, missed; 216 | table_in_array(termtable); 217 | for (i = 0; i < termtable->count; i++) 218 | { 219 | count = termtable->array[i]->wac.count; 220 | missed = termtable->array[i]->wac.missed; 221 | increment_wac(&wacdata.total, count, missed); 222 | increment_wac(&length[0], count, missed); 223 | increment_wac(&length[strlen(termtable->array[i]->key)], count, missed); 224 | if (occurs) 225 | { 226 | increment_wac(&occurs[0], 1, (count == missed ? 1 : 0)); 227 | increment_wac(&occurs[min(count, MAX_OCCURRENCES + 1)], 1, 228 | (count == missed ? 
1 : 0)); 229 | } 230 | } 231 | } 232 | /**********************************************************************/ 233 | 234 | void process_phrases() 235 | { 236 | long i, j; 237 | Boolean recognized; 238 | for (i = 0; i < wordlist[0].count; i++) 239 | { 240 | recognized = True; 241 | for (j = 0; j < MAX_PHRASELENGTH && i + j < wordlist[0].count; j++) 242 | { 243 | recognized &= symbol[0][i + j].recognized; 244 | increment_wac(&wacdata.phrase[j + 1], 1, (recognized ? 0 : 1)); 245 | } 246 | } 247 | } 248 | /**********************************************************************/ 249 | 250 | void determine_wacdata() 251 | { 252 | long i; 253 | for (i = 0; i < wordlist[0].count; i++) 254 | add_term((is_stopword(symbol[0][i].id->key) ? 255 | &wacdata.stopword_table : &wacdata.non_stopword_table), 256 | symbol[0][i].id->key, 1, (symbol[0][i].recognized ? 0 : 1)); 257 | process_terms(&wacdata.stopword_table, wacdata.stopword, NULL); 258 | process_terms(&wacdata.non_stopword_table, wacdata.non_stopword, 259 | wacdata.distinct_non_stopword); 260 | process_phrases(); 261 | } 262 | /**********************************************************************/ 263 | 264 | main(argc, argv) 265 | int argc; 266 | char *argv[]; 267 | { 268 | initialize(&argc, argv, usage, option); 269 | if (argc < 2 || argc > 3) 270 | error("invalid number of files"); 271 | init_stopwords(stopwordfilename); 272 | setup(argv); 273 | compute_pathlist(); 274 | obtain_matches(); 275 | determine_wacdata(); 276 | write_wacrpt(&wacdata, (argc == 3 ? argv[2] : NULL)); 277 | terminate(); 278 | } 279 | -------------------------------------------------------------------------------- /src/util.c: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | * 3 | * util.c 4 | * 5 | * Author: Stephen V. 
Rice 6 | * 7 | * Copyright 1996 The Board of Regents of the Nevada System of Higher 8 | * Education, on behalf, of the University of Nevada, Las Vegas, 9 | * Information Science Research Institute 10 | * 11 | * Licensed under the Apache License, Version 2.0 (the "License"); you 12 | * may not use this file except in compliance with the License. You 13 | * may obtain a copy of the License at 14 | * 15 | * http://www.apache.org/licenses/LICENSE-2.0 16 | * 17 | * Unless required by applicable law or agreed to in writing, software 18 | * distributed under the License is distributed on an "AS IS" BASIS, 19 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 20 | * implied. See the License for the specific language governing 21 | * permissions and limitations under the License. 22 | * 23 | **********************************************************************/ 24 | 25 | #include 26 | 27 | #if defined(__unix__) || defined(__MACH__) 28 | #include 29 | #include 30 | #else 31 | #include 32 | #include 33 | #endif 34 | 35 | #include "util.h" 36 | 37 | char *exec_name; 38 | 39 | Boolean usage_when_no_args = True; 40 | void (*usage_routine)(); 41 | 42 | void (*cleanup_routine)(); 43 | 44 | int errstatus = 1; 45 | 46 | static short tempfile_id; 47 | static void quit(/* int status */) __attribute__ ((noreturn)); 48 | 49 | /**********************************************************************/ 50 | 51 | void *allocate(number, size) 52 | size_t number, size; 53 | { 54 | void *p; 55 | p = calloc(number, size); 56 | if (!p) 57 | error("unable to allocate memory"); 58 | return(p); 59 | } 60 | /**********************************************************************/ 61 | 62 | int ustrcmp(s1, s2) 63 | unsigned char *s1, *s2; 64 | { 65 | long i; 66 | for (i = 0; s1[i] && s1[i] == s2[i]; i++); 67 | if (s1[i] || s2[i]) 68 | return(s1[i] < s2[i] ? 
-1 : 1); 69 | else 70 | return(0); 71 | } 72 | /**********************************************************************/ 73 | 74 | FILE *open_file(filename, mode) 75 | char *filename, *mode; 76 | { 77 | FILE *f; 78 | if (!filename) 79 | return(mode[0] == 'r' ? stdin : stdout); 80 | f = fopen(filename, mode); 81 | if (f) 82 | return(f); 83 | if (mode[0] == 'w') 84 | error_string("unable to create file", filename); 85 | else 86 | error_string("unable to open file", filename); 87 | } 88 | /**********************************************************************/ 89 | 90 | void close_file(f) 91 | FILE *f; 92 | { 93 | if (f != stdin && f != stdout) 94 | fclose(f); 95 | } 96 | /**********************************************************************/ 97 | 98 | Boolean file_exists(filename) 99 | char *filename; 100 | { 101 | struct stat buffer; 102 | return(stat(filename, &buffer) == 0 ? True : False); 103 | } 104 | /**********************************************************************/ 105 | 106 | static char *create_tempfilename(id) 107 | short id; 108 | { 109 | char name[100]; 110 | #ifdef unix 111 | sprintf(name, "/tmp/.%s%d-%d", exec_name, getpid(), id); 112 | #else 113 | sprintf(name, "c:\\temp\\tempfile.%d", id); 114 | #endif 115 | return(strdup(name)); 116 | } 117 | /**********************************************************************/ 118 | 119 | char *tempfilename() 120 | { 121 | char *name; 122 | name = create_tempfilename(++tempfile_id); 123 | unlink(name); 124 | return(name); 125 | } 126 | /**********************************************************************/ 127 | 128 | static void delete_tempfiles() 129 | { 130 | short i; 131 | char *name; 132 | for (i = 1; i <= tempfile_id; i++) 133 | { 134 | name = create_tempfilename(i); 135 | unlink(name); 136 | free(name); 137 | } 138 | } 139 | /**********************************************************************/ 140 | 141 | char *basefilename(pathname) 142 | char *pathname; 143 | { 144 | #ifdef unix 145 | char 
delimiter = '/'; 146 | #else 147 | char delimiter = '\\'; 148 | #endif 149 | short i; 150 | for (i = strlen(pathname) - 1; i >= 0 && pathname[i] != delimiter; i--); 151 | return(&pathname[i + 1]); 152 | } 153 | /**********************************************************************/ 154 | 155 | static void handle_interrupt(signal) 156 | int signal; 157 | { 158 | static Boolean handling_interrupt = False; 159 | if (handling_interrupt) 160 | return; 161 | handling_interrupt = True; 162 | error("process killed"); 163 | } 164 | /**********************************************************************/ 165 | 166 | static void trap_interrupts() 167 | { 168 | signal(SIGINT, handle_interrupt); 169 | signal(SIGTERM, handle_interrupt); 170 | } 171 | /**********************************************************************/ 172 | 173 | static void show_usage(usage) 174 | char *usage; 175 | { 176 | if (usage_routine) 177 | (*usage_routine)(); 178 | else 179 | fprintf(stderr, "Usage: %s %s\n", exec_name, usage); 180 | terminate(); 181 | } 182 | /**********************************************************************/ 183 | 184 | static Boolean split_option(arg, next_arg, option) 185 | char *arg, *next_arg; 186 | Option option[]; 187 | { 188 | short i; 189 | if (!option) 190 | goto invalid_option; 191 | for (i = 0; option[i].name && arg[1] != option[i].name; i++); 192 | if (!option[i].name) 193 | goto invalid_option; 194 | if (option[i].string && (arg[2] || next_arg)) 195 | { 196 | if (*option[i].string) 197 | goto duplicate_option; 198 | *option[i].string = (arg[2] ? &arg[2] : next_arg); 199 | return(arg[2] ? 
False : True); 200 | } 201 | if (option[i].boolean && !arg[2]) 202 | { 203 | if (*option[i].boolean) 204 | goto duplicate_option; 205 | *option[i].boolean = True; 206 | return(False); 207 | } 208 | invalid_option: 209 | error_string("invalid option", arg); 210 | duplicate_option: 211 | error_string("duplicate option", arg); 212 | } 213 | /**********************************************************************/ 214 | 215 | static void parse_args(argc, argv, usage, option) 216 | int *argc; 217 | char *argv[], *usage; 218 | Option option[]; 219 | { 220 | short i, j = 0; 221 | if (*argc == 1 && usage_when_no_args) 222 | show_usage(usage); 223 | for (i = 1; i < *argc; i++) 224 | if (argv[i][0] == '-' && argv[i][1]) 225 | { 226 | if (strncmp("-help", argv[i], strlen(argv[i])) == 0) 227 | show_usage(usage); 228 | if (split_option(argv[i], (i + 1 < *argc ? argv[i + 1] : NULL), 229 | option)) 230 | i++; 231 | } 232 | else 233 | argv[j++] = argv[i]; 234 | *argc = j; 235 | } 236 | /**********************************************************************/ 237 | 238 | void initialize(argc, argv, usage, option) 239 | int *argc; 240 | char *argv[], *usage; 241 | Option option[]; 242 | { 243 | exec_name = basefilename(argv[0]); 244 | trap_interrupts(); 245 | parse_args(argc, argv, usage, option); 246 | } 247 | /**********************************************************************/ 248 | 249 | static void quit(status) 250 | int status; 251 | { 252 | if (cleanup_routine) 253 | (*cleanup_routine)(); 254 | delete_tempfiles(); 255 | exit(status); 256 | } 257 | /**********************************************************************/ 258 | 259 | void terminate() 260 | { 261 | quit(0); 262 | } 263 | /**********************************************************************/ 264 | 265 | void error(message) 266 | char *message; 267 | { 268 | fprintf(stderr, "%s: %s\n", exec_name, message); 269 | quit(errstatus); 270 | } 271 | 
/**********************************************************************/ 272 | 273 | void error_string(message, string) 274 | char *message, *string; 275 | { 276 | fprintf(stderr, "%s: %s \"%s\"\n", exec_name, message, string); 277 | quit(errstatus); 278 | } 279 | 280 | void warning_string(message, string) 281 | char *message, *string; 282 | { 283 | fprintf(stderr, "%s: %s \"%s\"\n", exec_name, message, string); 284 | } 285 | -------------------------------------------------------------------------------- /src/wacrpt.c: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | * 3 | * wacrpt.c 4 | * 5 | * Author: Stephen V. Rice 6 | * 7 | * Copyright 1996 The Board of Regents of the Nevada System of Higher 8 | * Education, on behalf, of the University of Nevada, Las Vegas, 9 | * Information Science Research Institute 10 | * 11 | * Licensed under the Apache License, Version 2.0 (the "License"); you 12 | * may not use this file except in compliance with the License. You 13 | * may obtain a copy of the License at 14 | * 15 | * http://www.apache.org/licenses/LICENSE-2.0 16 | * 17 | * Unless required by applicable law or agreed to in writing, software 18 | * distributed under the License is distributed on an "AS IS" BASIS, 19 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 20 | * implied. See the License for the specific language governing 21 | * permissions and limitations under the License. 
22 | * 23 | **********************************************************************/ 24 | 25 | #include "sort.h" 26 | #include "wacrpt.h" 27 | #include "ocreval_version.h" 28 | 29 | #define TITLE "ocreval Word Accuracy Report Version " OCREVAL_VERSION "\n" 30 | #define DIVIDER "----------------------------------------\n" 31 | 32 | #define TOTAL " Total\n" 33 | #define OFFSET 29 34 | 35 | static char line[100]; 36 | 37 | /**********************************************************************/ 38 | 39 | void increment_wac(wac, count, missed) 40 | Wac *wac; 41 | long count, missed; 42 | { 43 | wac->count += count; 44 | wac->missed += missed; 45 | } 46 | /**********************************************************************/ 47 | 48 | void add_term(termtable, key, count, missed) 49 | Termtable *termtable; 50 | char *key; 51 | long count, missed; 52 | { 53 | Term *term; 54 | term = table_lookup(termtable, key); 55 | if (!term) 56 | { 57 | term = NEW(Term); 58 | term->key = strdup(key); 59 | table_insert(termtable, term); 60 | } 61 | increment_wac(&term->wac, count, missed); 62 | } 63 | /**********************************************************************/ 64 | 65 | static Boolean read_line(f) 66 | FILE *f; 67 | { 68 | return(fgets(line, sizeof(line) - 1, f) ? True : False); 69 | } 70 | /**********************************************************************/ 71 | 72 | static Boolean read_one(f, value) 73 | FILE *f; 74 | long *value; 75 | { 76 | return(read_line(f) && sscanf(line, "%ld", value) == 1 ? True : False); 77 | } 78 | /**********************************************************************/ 79 | 80 | static Boolean read_two(f, value1, value2) 81 | FILE *f; 82 | long *value1, *value2; 83 | { 84 | return(read_line(f) && sscanf(line, "%ld %ld", value1, value2) == 2 ? 
85 | True : False); 86 | } 87 | /**********************************************************************/ 88 | 89 | static long read_numbers(f, wac) 90 | FILE *f; 91 | Wac wac[]; 92 | { 93 | long count, missed, index, total_count = 0; 94 | if (read_line(f) && read_line(f)) 95 | while (read_two(f, &count, &missed)) 96 | { 97 | index = atoi(&line[OFFSET]); 98 | if (index == 0) { 99 | if (strcmp(&line[OFFSET], TOTAL) == 0) 100 | total_count = count; 101 | else /* excess */ 102 | index = MAX_OCCURRENCES + 1; 103 | } 104 | increment_wac(&wac[index], count, missed); 105 | } 106 | return(total_count); 107 | } 108 | /**********************************************************************/ 109 | 110 | static void read_terms(f, termtable) 111 | FILE *f; 112 | Termtable *termtable; 113 | { 114 | long count, missed; 115 | if (read_line(f) && read_line(f)) 116 | while (read_two(f, &count, &missed)) 117 | { 118 | line[strlen(line) - 1] = '\0'; 119 | add_term(termtable, &line[OFFSET], count, missed); 120 | } 121 | } 122 | /**********************************************************************/ 123 | 124 | void read_wacrpt(wacdata, filename) 125 | Wacdata *wacdata; 126 | char *filename; 127 | { 128 | FILE *f; 129 | long words, missed, stopwords, non_stopwords; 130 | f = open_file(filename, "r"); 131 | if (read_line(f) && strncmp(line, TITLE, sizeof(TITLE) - 3) == 0 && 132 | read_line(f) && strcmp(line, DIVIDER) == 0 && 133 | read_one(f, &words) && read_one(f, &missed) && 134 | read_line(f) && read_line(f)) 135 | { 136 | increment_wac(&wacdata->total, words, missed); 137 | stopwords = read_numbers(f, wacdata->stopword); 138 | non_stopwords = read_numbers(f, wacdata->non_stopword); 139 | read_numbers(f, wacdata->distinct_non_stopword); 140 | if (words > 0) 141 | { 142 | read_numbers(f, wacdata->phrase); 143 | if (stopwords > 0) 144 | read_terms(f, &wacdata->stopword_table); 145 | if (non_stopwords > 0) 146 | read_terms(f, &wacdata->non_stopword_table); 147 | } 148 | } 149 | else 150 | 
error_string("invalid format in", (filename ? filename : "stdin")); 151 | close_file(f); 152 | } 153 | /**********************************************************************/ 154 | 155 | static void write_pct(f, wac) 156 | FILE *f; 157 | Wac *wac; 158 | { 159 | if (wac->count == 0) 160 | fputs(" ------", f); 161 | else 162 | fprintf(f, "%8.2f", 100.0 * (wac->count - wac->missed) / wac->count); 163 | } 164 | /**********************************************************************/ 165 | 166 | static void write_wac(f, wac) 167 | FILE *f; 168 | Wac *wac; 169 | { 170 | if (wac) 171 | { 172 | fprintf(f, "%8ld %8ld ", wac->count, wac->missed); 173 | write_pct(f, wac); 174 | } 175 | else 176 | fputs(" Count Missed %Right", f); 177 | fputs(" ", f); 178 | } 179 | /**********************************************************************/ 180 | 181 | static void write_numbers(f, wac, limit, title, excess, total) 182 | FILE *f; 183 | Wac wac[]; 184 | short limit; 185 | char *title; 186 | Boolean excess, total; 187 | { 188 | short i; 189 | fprintf(f, "\n%s\n", title); 190 | write_wac(f, NULL); 191 | fprintf(f, "%s\n", (excess ? 
"Occurs" : "Length")); 192 | for (i = 1; i <= limit; i++) 193 | if (wac[i].count > 0) 194 | { 195 | write_wac(f, &wac[i]); 196 | fprintf(f, " %2d\n", i); 197 | } 198 | if (excess && wac[limit + 1].count > 0) 199 | { 200 | write_wac(f, &wac[limit + 1]); 201 | fprintf(f, " >%2d\n", limit); 202 | } 203 | if (total) 204 | { 205 | write_wac(f, &wac[0]); 206 | fputs(TOTAL, f); 207 | } 208 | } 209 | /**********************************************************************/ 210 | 211 | static int compare_term(term1, term2) 212 | Term *term1, *term2; 213 | { 214 | return(ustrcmp(term1->key, term2->key)); 215 | } 216 | /**********************************************************************/ 217 | 218 | static void write_terms(f, termtable, title) 219 | FILE *f; 220 | Termtable *termtable; 221 | char *title; 222 | { 223 | long i; 224 | table_in_array(termtable); 225 | sort(termtable->count, termtable->array, compare_term); 226 | fprintf(f, "\n%s\n", title); 227 | write_wac(f, NULL); 228 | fputc('\n', f); 229 | for (i = 0; i < termtable->count; i++) 230 | { 231 | write_wac(f, &termtable->array[i]->wac); 232 | fprintf(f, "%s\n", termtable->array[i]->key); 233 | } 234 | } 235 | /**********************************************************************/ 236 | 237 | void write_wacrpt(wacdata, filename) 238 | Wacdata *wacdata; 239 | char *filename; 240 | { 241 | FILE *f; 242 | f = open_file(filename, "w"); 243 | fprintf(f, "%s%s", TITLE, DIVIDER); 244 | fprintf(f, "%8ld Words\n", wacdata->total.count); 245 | fprintf(f, "%8ld Misrecognized\n", wacdata->total.missed); 246 | write_pct(f, &wacdata->total); 247 | fputs("% Accuracy\n", f); 248 | write_numbers(f, wacdata->stopword, MAX_WORDLENGTH, 249 | "Stopwords", False, True); 250 | write_numbers(f, wacdata->non_stopword, MAX_WORDLENGTH, 251 | "Non-stopwords", False, True); 252 | write_numbers(f, wacdata->distinct_non_stopword, MAX_OCCURRENCES, 253 | "Distinct Non-stopwords", True, True); 254 | if (wacdata->total.count > 0) 255 | { 256 | 
write_numbers(f, wacdata->phrase, MAX_PHRASELENGTH, 257 | "Phrases", False, False); 258 | if (wacdata->stopword[0].count > 0) 259 | write_terms(f, &wacdata->stopword_table, "Stopwords"); 260 | if (wacdata->non_stopword[0].count > 0) 261 | write_terms(f, &wacdata->non_stopword_table, "Non-stopwords"); 262 | } 263 | close_file(f); 264 | } 265 | -------------------------------------------------------------------------------- /src/vote.c: -------------------------------------------------------------------------------- 1 | /********************************************************************** 2 | * 3 | * vote.c 4 | * 5 | * Author: Stephen V. Rice 6 | * 7 | * Copyright 1996 The Board of Regents of the Nevada System of Higher 8 | * Education, on behalf, of the University of Nevada, Las Vegas, 9 | * Information Science Research Institute 10 | * 11 | * Licensed under the Apache License, Version 2.0 (the "License"); you 12 | * may not use this file except in compliance with the License. You 13 | * may obtain a copy of the License at 14 | * 15 | * http://www.apache.org/licenses/LICENSE-2.0 16 | * 17 | * Unless required by applicable law or agreed to in writing, software 18 | * distributed under the License is distributed on an "AS IS" BASIS, 19 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 20 | * implied. See the License for the specific language governing 21 | * permissions and limitations under the License. 22 | * 23 | **********************************************************************/ 24 | 25 | #include "sort.h" 26 | #include "sync.h" 27 | #include "table.h" 28 | 29 | #define usage "[-O] [-o outputfile] [-s m/n] [-w m/n] textfile1 textfile2 ..." 
30 | 31 | Boolean debug, optimize; 32 | char *outputfilename, *sfraction, *wfraction; 33 | 34 | Option option[] = 35 | { 36 | 'D', NULL, &debug, 37 | 'O', NULL, &optimize, 38 | 'o', &outputfilename, NULL, 39 | 's', &sfraction, NULL, 40 | 'w', &wfraction, NULL, 41 | '\0' 42 | }; 43 | 44 | Textopt textopt = { True, True, 0, True, True }; 45 | 46 | #define MIN_VOTERS 2 47 | #define MAX_VOTERS 16 48 | 49 | typedef 50 | struct 51 | { 52 | short argnum; 53 | char *filename; 54 | Text text; 55 | double distance; 56 | } Voter; 57 | Voter *voter[MAX_VOTERS]; 58 | short num_voters, actual_voters = 3; 59 | 60 | #define N 2 61 | 62 | short suspect_threshold; 63 | short suspect_weight = 1, unmarked_weight = 1; 64 | 65 | BEGIN_ENTRY(Sequence) 66 | long count[MAX_VOTERS]; 67 | float median; 68 | END_ENTRY(Sequence); 69 | 70 | BEGIN_TABLE_OF(Sequence, Seqlist) 71 | END_TABLE(Seqtable); 72 | Seqtable seqtable; 73 | 74 | Text input[MAX_VOTERS], output; 75 | 76 | struct 77 | { 78 | Char *c; 79 | short num_votes; 80 | } candidate[MAX_VOTERS]; 81 | short num_candidates; 82 | 83 | /**********************************************************************/ 84 | 85 | Boolean valid_fraction(fraction, m, n) 86 | char *fraction; 87 | short *m, *n; 88 | { 89 | if (fraction[0] >= '1' && fraction[1] == '/' && fraction[2] <= '9' && 90 | fraction[0] <= fraction[2] && !fraction[3]) 91 | { 92 | *m = fraction[0] - '0'; 93 | *n = fraction[2] - '0'; 94 | return(True); 95 | } 96 | else 97 | return(False); 98 | } 99 | /**********************************************************************/ 100 | 101 | void validate_args(argc, argv) 102 | int argc; 103 | char *argv[]; 104 | { 105 | short i, m, n; 106 | if (argc < MIN_VOTERS || argc > MAX_VOTERS) 107 | error("invalid number of voters"); 108 | for (i = 0; i < argc; i++) 109 | { 110 | voter[i] = NEW(Voter); 111 | voter[i]->argnum = i; 112 | voter[i]->filename = argv[i]; 113 | read_text(&voter[i]->text, argv[i], &textopt); 114 | } 115 | num_voters = argc; 116 | 
if (!optimize || num_voters < actual_voters) 117 | actual_voters = num_voters; 118 | if (wfraction && 119 | !valid_fraction(wfraction, &suspect_weight, &unmarked_weight)) 120 | error_string("invalid weight", wfraction); 121 | if (sfraction) { 122 | if (valid_fraction(sfraction, &m, &n)) 123 | suspect_threshold = actual_voters * unmarked_weight * m / n; 124 | else 125 | error_string("invalid threshold", sfraction); 126 | } 127 | } 128 | /**********************************************************************/ 129 | 130 | void add_sequence(key, reject, index) 131 | char *key; 132 | Boolean reject; 133 | short index; 134 | { 135 | Sequence *sequence; 136 | sequence = table_lookup(&seqtable, key); 137 | if (!sequence) 138 | { 139 | sequence = NEW(Sequence); 140 | sequence->key = strdup(key); 141 | if (!reject) 142 | sequence->median = 1; 143 | table_insert(&seqtable, sequence); 144 | } 145 | sequence->count[index]++; 146 | } 147 | /**********************************************************************/ 148 | 149 | void count_sequences(index) 150 | short index; 151 | { 152 | Char *start, *c; 153 | char key[N * STRING_SIZE], string[STRING_SIZE]; 154 | Boolean reject; 155 | short i; 156 | for (start = voter[index]->text.first; start; start = start->next) 157 | { 158 | key[0] = '\0'; 159 | reject = False; 160 | for (i = 0, c = start; i < N; i++, c = c->next) 161 | { 162 | if (!c) 163 | return; 164 | char_to_string(False, c->value, string, False); 165 | strcat(key, string); 166 | if (c->value == REJECT_CHARACTER) 167 | reject = True; 168 | } 169 | add_sequence(key, reject, index); 170 | } 171 | } 172 | /**********************************************************************/ 173 | 174 | int compare_counts(count1, count2) 175 | long *count1, *count2; 176 | { 177 | return(*count1 - *count2); 178 | } 179 | /**********************************************************************/ 180 | 181 | void compute_median(sequence) 182 | Sequence *sequence; 183 | { 184 | static long 
*count[MAX_VOTERS]; 185 | long i; 186 | if (!sequence->median) 187 | return; 188 | if (!count[0]) 189 | for (i = 0; i < num_voters; i++) 190 | count[i] = NEW(long); 191 | for (i = 0; i < num_voters; i++) 192 | *count[i] = sequence->count[i]; 193 | sort(i, count, compare_counts); 194 | sequence->median = 195 | (i & 1 ? *count[i / 2] : (*count[i / 2 - 1] + *count[i / 2]) / 2.0); 196 | } 197 | /**********************************************************************/ 198 | 199 | void compute_distance(index) 200 | short index; 201 | { 202 | long i; 203 | double difference; 204 | for (i = 0; i < seqtable.count; i++) 205 | { 206 | difference = 207 | seqtable.array[i]->count[index] - seqtable.array[i]->median; 208 | if (difference < 0) 209 | difference = -difference; 210 | voter[index]->distance += difference; 211 | } 212 | } 213 | /**********************************************************************/ 214 | 215 | int compare_distances(voter1, voter2) 216 | Voter *voter1, *voter2; 217 | { 218 | if (voter1->distance != voter2->distance) 219 | return(voter1->distance < voter2->distance ? 
-1 : 1); 220 | return(voter1->argnum - voter2->argnum); 221 | } 222 | /**********************************************************************/ 223 | 224 | void select_voters() 225 | { 226 | long i; 227 | if (optimize) 228 | { 229 | for (i = 0; i < num_voters; i++) 230 | count_sequences(i); 231 | table_in_array(&seqtable); 232 | for (i = 0; i < seqtable.count; i++) 233 | compute_median(seqtable.array[i]); 234 | for (i = 0; i < num_voters; i++) 235 | compute_distance(i); 236 | sort(i, voter, compare_distances); 237 | if (debug) 238 | for (i = 0; i < num_voters; i++) 239 | printf("%11.1f %s\n", voter[i]->distance, voter[i]->filename); 240 | } 241 | for (i = 0; i < actual_voters; i++) 242 | input[i] = voter[i]->text; 243 | } 244 | /**********************************************************************/ 245 | 246 | void place_vote(c) 247 | Char *c; 248 | { 249 | short num_votes, i; 250 | num_votes = unmarked_weight; 251 | if (c) 252 | { 253 | if (c->value == REJECT_CHARACTER) 254 | return; 255 | if (c->suspect) 256 | num_votes = suspect_weight; 257 | for (i = 0; i < num_candidates && (!candidate[i].c || 258 | c->value != candidate[i].c->value); i++); 259 | } 260 | else 261 | for (i = 0; i < num_candidates && candidate[i].c; i++); 262 | if (i < num_candidates) 263 | candidate[i].num_votes += num_votes; 264 | else 265 | { 266 | num_candidates++; 267 | candidate[i].c = c; 268 | candidate[i].num_votes = num_votes; 269 | } 270 | } 271 | /**********************************************************************/ 272 | 273 | Boolean winner() 274 | { 275 | short i, leader = 0; 276 | if (num_candidates == 0) 277 | { 278 | append_char(&output, False, REJECT_CHARACTER); 279 | return(True); 280 | } 281 | for (i = 1; i < num_candidates; i++) 282 | if (candidate[i].num_votes > candidate[leader].num_votes) 283 | leader = i; 284 | num_candidates = 0; 285 | if (!candidate[leader].c) 286 | return(False); 287 | append_char(&output, 288 | (candidate[leader].num_votes <= suspect_threshold ? 
True : False), 289 | candidate[leader].c->value); 290 | return(True); 291 | } 292 | /**********************************************************************/ 293 | 294 | void perform_vote(synclist) 295 | Synclist *synclist; 296 | { 297 | Sync *sync; 298 | short i; 299 | for (sync = synclist->first; sync; sync = sync->next) 300 | do 301 | for (i = 0; i < actual_voters; i++) 302 | if (sync->substr[i].start <= sync->substr[i].stop) 303 | place_vote(input[i].array[sync->substr[i].start++]); 304 | else 305 | place_vote(NULL); 306 | while (winner()); 307 | } 308 | /**********************************************************************/ 309 | 310 | main(argc, argv) 311 | int argc; 312 | char *argv[]; 313 | { 314 | Synclist synclist; 315 | initialize(&argc, argv, usage, option); 316 | validate_args(argc, argv); 317 | select_voters(); 318 | synchronize(&synclist, actual_voters, input); 319 | perform_vote(&synclist); 320 | write_text(&output, outputfilename, NULL); 321 | terminate(); 322 | } 323 | -------------------------------------------------------------------------------- /test/test_accsum_graphic_characters.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: UTF-8 -*- 3 | # Copyright 2017 Eddie Antonio Santos 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | 18 | """ 19 | Tests accsum on UTF-8 files. 
20 | """ 21 | 22 | import glob 23 | import re 24 | import shutil 25 | import subprocess 26 | import tempfile 27 | import unicodedata 28 | 29 | import os.path as p 30 | from collections import namedtuple, OrderedDict 31 | 32 | # Alias range as xrange in Python 3: 33 | try: 34 | xrange 35 | except NameError: 36 | xrange = range 37 | 38 | # Create a Python 2/3 Unicode string literal: 39 | try: 40 | unicode 41 | except NameError: 42 | u = str 43 | else: 44 | u = lambda s: s.decode('UTF-8') 45 | 46 | 47 | # pathlib doesn't exist in Python 2, so make a small version of it: 48 | class Path(str): 49 | def __div__(self, other): 50 | return Path(p.join(self, other)) 51 | 52 | def exists(self): 53 | return p.exists(self) 54 | 55 | def create_file(self, filename, contents): 56 | with open(self / filename, 'w') as fp: 57 | fp.write(contents.encode('UTF-8')) 58 | if not contents.endswith('\n'): 59 | fp.write(b'\x0A') 60 | 61 | 62 | # Path to accuracy program 63 | BIN_DIR = Path(p.dirname(p.dirname(p.realpath(__file__)))) / 'bin' 64 | ACCURACY_BIN = BIN_DIR / 'accuracy' 65 | ACCSUM_BIN = BIN_DIR / 'accsum' 66 | assert ACCURACY_BIN.exists(), 'Could not find ' + ACCURACY_BIN 67 | assert ACCSUM_BIN.exists(), 'Could not find ' + ACCSUM_BIN 68 | 69 | 70 | class FilePair(namedtuple('FilePairBase', 'correct generated')): 71 | """ 72 | Pair of tests that are written as documents. Then an accuracy report may 73 | be produced. 
74 | """ 75 | 76 | @property 77 | def prefix(self): 78 | return str(hash(self.correct)).replace('-', '_') 79 | 80 | def write_to_dir(self, directory): 81 | directory.create_file(self.correct_filename, self.correct) 82 | directory.create_file(self.generated_filename, self.generated) 83 | 84 | @property 85 | def correct_filename(self): 86 | return '%s_correct' % self.prefix 87 | 88 | @property 89 | def generated_filename(self): 90 | return '%s_generated' % self.prefix 91 | 92 | @property 93 | def report_filename(self): 94 | return '%s_report' % self.prefix 95 | 96 | def write_accuracy_report(self, directory): 97 | self.write_to_dir(directory) 98 | 99 | # Return name of the report? 100 | subprocess.check_call([ 101 | ACCURACY_BIN, 102 | directory / self.correct_filename, 103 | directory / self.generated_filename, 104 | directory / self.report_filename 105 | ]) 106 | 107 | 108 | class ClassResult(namedtuple('ResultBase', 'count missed right character')): 109 | pass 110 | 111 | 112 | def extract_bracketed_char(text): 113 | match = re.match('^{(.+)}$', text) 114 | return match.group(1) 115 | 116 | 117 | def nfc(text): 118 | """ 119 | Returns NFC normalized text. 120 | """ 121 | return unicodedata.normalize('NFC', u(text)) 122 | 123 | 124 | def nfd(text): 125 | """ 126 | Returns NFD normalized text. 127 | """ 128 | return unicodedata.normalize('NFD', u(text)) 129 | 130 | 131 | class ClassReport(object): 132 | """ 133 | Wraps an accuracy report. 
134 | """ 135 | def __init__(self, *results): 136 | self._results = OrderedDict(( 137 | (result.character, result) for result in results 138 | )) 139 | 140 | def __getitem__(self, key): 141 | return self._results[key] 142 | 143 | def __iter__(self): 144 | return iter(self._results) 145 | 146 | def __contains__(self, key): 147 | return key in self._results 148 | 149 | @classmethod 150 | def from_accuracy_report(cls, report_text): 151 | lines = report_text.split('\n\n')[-1].rstrip('\n').split('\n') 152 | 153 | # Assert we've got the right header 154 | count, missed, right = lines.pop(0).split() 155 | assert count == u('Count') 156 | assert missed == u('Missed') 157 | assert right == u('%Right') 158 | 159 | def generate_results(): 160 | for line in lines: 161 | count, missed, right, char = line.lstrip().split(None, 3) 162 | char = extract_bracketed_char(char) 163 | yield ClassResult(int(count), int(missed), right, char) 164 | 165 | return cls(*list(generate_results())) 166 | 167 | 168 | class TemporaryDirectory(object): 169 | """ 170 | Context manager: creates a temporary directory and removes it when 171 | finished. 172 | """ 173 | def __enter__(self): 174 | self._name = Path(tempfile.mkdtemp()) 175 | return self._name 176 | 177 | def __exit__(self, *exc_info): 178 | shutil.rmtree(self._name) 179 | 180 | 181 | def accsum(reports): 182 | """ 183 | Runs accsum, returning a ClassReport (the final section in the report). 
184 | """ 185 | report_bytes = subprocess.check_output( 186 | [ACCSUM_BIN] + reports, 187 | stderr=subprocess.STDOUT 188 | ) 189 | contents = report_bytes.decode('UTF-8') 190 | 191 | return ClassReport.from_accuracy_report(contents) 192 | 193 | 194 | tests = [ 195 | # Test some delimiting and special characters 196 | FilePair(correct= nfc("{{"), 197 | generated=nfc("{<")), 198 | FilePair(correct= nfc("<<"), 199 | generated=nfc("<{")), 200 | FilePair(correct= nfc("q\\z"), 201 | generated=nfc("q|z")), 202 | 203 | # Latin scripts 204 | FilePair(correct= nfc("Mirosław"), 205 | generated=nfc("Miroslaw")), 206 | # From: https://fi.wikipedia.org/w/index.php?title=Tekstintunnistus&oldid=15178566 207 | FilePair(correct= nfc("""käsin kirjoittamalla"""), 208 | generated=nfc("""kasin kirjoittämalla""")), 209 | FilePair(correct= nfc("""sähköisesti muokattavaan muotoon"""), 210 | generated=nfc("""sähköisesti muökattavaan muotoon""")), 211 | 212 | # Combining characters. Notice the use of NFD (decomposed) 213 | FilePair(correct =nfd("q̃◌q̃"), 214 | generated=nfd("q̃◌q̂")), 215 | 216 | # Hiragana 217 | FilePair(correct= nfc("""びょおいん"""), 218 | generated=nfc("""びよおいん""")), 219 | 220 | # Emoji 221 | FilePair(correct= nfc("""💩"""), 222 | generated=nfc("""👜""")), 223 | ] 224 | 225 | # TODO: Change this for an ACTUAL expected report (this one is incomplete) 226 | expected_report = ClassReport.from_accuracy_report(u( 227 | r"""ocreval Accuracy Report Version 7.0 228 | ----------------------------------- 229 | 230 | Count Missed %Right 231 | 9 0 100.00 {<\n>} 232 | 3 0 100.00 { } 233 | 2 1 50.00 {<} 234 | 2 1 50.00 {{} 235 | 1 1 50.00 {\} 236 | 1 0 100.00 {M} 237 | 8 1 80.00 {a} 238 | 1 0 80.00 {e} 239 | 1 0 100.00 {h} 240 | 6 0 100.00 {i} 241 | 1 0 100.00 {j} 242 | 4 0 100.00 {k} 243 | 2 0 100.00 {l} 244 | 3 0 100.00 {m} 245 | 3 0 100.00 {n} 246 | 6 1 83.33 {o} 247 | 3 0 100.00 {q} 248 | 2 0 100.00 {r} 249 | 5 0 100.00 {s} 250 | 6 0 100.00 {t} 251 | 2 0 100.00 {u} 252 | 1 0 100.00 {v} 
253 | 1 0 100.00 {w} 254 | 1 0 100.00 {z} 255 | 1 1 0.00 {ł} 256 | 2 1 50.00 {ä} 257 | 1 0 100.00 {ö} 258 | 1 0 100.00 {び} 259 | 1 1 0.00 {ょ} 260 | 1 0 100.00 {お} 261 | 1 0 100.00 {い} 262 | 1 0 100.00 {ん} 263 | 1 1 0.00 {💩} 264 | 2 1 50.00 {◌̃} 265 | 1 0 100.00 {◌} 266 | """)) 267 | 268 | 269 | def main(temp_dir): 270 | # Create each individual accuracy report: 271 | for test in tests: 272 | test.write_accuracy_report(temp_dir) 273 | 274 | reports = glob.glob(temp_dir / '*_report') 275 | assert len(reports) == len(tests) 276 | 277 | # Create the accuracy summary! 278 | actual_report = accsum(reports) 279 | 280 | for char in expected_report: 281 | # Check if the character is even in the report. 282 | assert char in actual_report, ( 283 | '{%s} not in report: %r' % (char, set(actual_report)) 284 | ) 285 | 286 | # Check that the counts match 287 | expected, actual = expected_report[char], actual_report[char] 288 | assert expected.count == actual.count, ( 289 | '{%s}: counts does not match expected: %d; actual: %d' % ( 290 | char, expected.count, actual.count 291 | ) 292 | ) 293 | assert expected.missed == actual.missed, ( 294 | '{%s}: #missed does not match expected: %d; actual: %d' % ( 295 | char, expected.missed, actual.missed 296 | ) 297 | ) 298 | 299 | difference = set(actual_report) - set(expected_report) 300 | assert len(difference) == 0, ( 301 | 'Actual report has extra characters: %r' % (difference,) 302 | ) 303 | 304 | 305 | if __name__ == '__main__': 306 | import sys 307 | try: 308 | _, flag = sys.argv 309 | except: 310 | debug = False 311 | else: 312 | debug = flag == '--debug' 313 | 314 | # Create temporary files for each... 
315 | with TemporaryDirectory() as temp_dir: 316 | try: 317 | main(temp_dir) 318 | except subprocess.CalledProcessError as error: 319 | sys.stderr.write("Error %d running command: %s" % ( 320 | error.returncode, 321 | ' '.join(error.cmd) 322 | )) 323 | sys.stderr.write("\n") 324 | 325 | if error.output is not None: 326 | sys.stderr.write("\n--- stdout ---\n") 327 | sys.stderr.write(error.output) 328 | 329 | if debug: 330 | import pdb 331 | pdb.set_trace() 332 | sys.exit(-1) 333 | except AssertionError as error: 334 | if debug: 335 | import pdb 336 | pdb.set_trace() 337 | print(error.message) 338 | sys.exit(-1) 339 | --------------------------------------------------------------------------------