├── bin
    └── .gitignore
├── test
    ├── .gitignore
    ├── test_utils.h
    ├── run.c
    ├── Makefile
    ├── test_utils.c
    ├── test_accuracy_large_files.py
    ├── text_test.c
    ├── word_test.c
    └── test_accsum_graphic_characters.py
├── _config.yml
├── user-guide.pdf
├── .gitignore
├── libexec
    ├── WordBreakProperty.txt.gz
    ├── Unicode-License.txt
    └── generate_word_break.py
├── lib
    └── .gitignore
├── src
    ├── ocreval_version.h
    ├── charclass.h
    ├── accsum.c
    ├── wordaccsum.c
    ├── editopsum.c
    ├── sort.h
    ├── stopword.h
    ├── accdist.c
    ├── word.h
    ├── dist.h
    ├── wordaccdist.c
    ├── dist.c
    ├── ci.h
    ├── sort.c
    ├── edorpt.h
    ├── editopcost.c
    ├── accci.c
    ├── wordaccci.c
    ├── nonstopacc.c
    ├── groupacc.c
    ├── ci.c
    ├── sync.h
    ├── wordfreq.c
    ├── list.h
    ├── wacrpt.h
    ├── table.h
    ├── table.c
    ├── stopword.c
    ├── accrpt.h
    ├── edorpt.c
    ├── list.c
    ├── accuracy.c
    ├── util.h
    ├── ngram.c
    ├── synctext.c
    ├── text.h
    ├── charclass.c
    ├── editop.c
    ├── wordacc.c
    ├── util.c
    ├── wacrpt.c
    └── vote.c
├── .travis.yml
├── share
    └── man
    │   └── man1
    │       ├── wordfreq.1
    │       ├── accsum.1
    │       ├── accdist.1
    │       ├── wordaccdist.1
    │       ├── editopsum.1
    │       ├── wordaccsum.1
    │       ├── accuracy.1
    │       ├── groupacc.1
    │       ├── editop.1
    │       ├── nonstopacc.1
    │       ├── wordaccci.1
    │       ├── accci.1
    │       ├── ngram.1
    │       ├── editopcost.1
    │       ├── wordacc.1
    │       ├── vote.1
    │       └── synctext.1
├── install_utf8proc.sh
├── docs
    └── install_utf8proc.sh
├── NOTICE
├── use-libocreval-internal.mk
├── CHANGELOG.md
├── Makefile
└── README.md


/bin/.gitignore:
--------------------------------------------------------------------------------
1 | *
2 | 


--------------------------------------------------------------------------------
/test/.gitignore:
--------------------------------------------------------------------------------
1 | run
2 | 


--------------------------------------------------------------------------------
/_config.yml:
--------------------------------------------------------------------------------
1 | theme: jekyll-theme-minimal


--------------------------------------------------------------------------------
/user-guide.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eddieantonio/ocreval/HEAD/user-guide.pdf


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Objects and dependencies.
2 | *.o
3 | *.d
4 | # Debugging
5 | *.dSYM/
6 | core
7 | 


--------------------------------------------------------------------------------
/libexec/WordBreakProperty.txt.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eddieantonio/ocreval/HEAD/libexec/WordBreakProperty.txt.gz


--------------------------------------------------------------------------------
/lib/.gitignore:
--------------------------------------------------------------------------------
1 | # Ensure that this directory exists in git
2 | # but ignore every file that it will be used for.
3 | *.a
4 | *.so
5 | *.so.*
6 | 


--------------------------------------------------------------------------------
/src/ocreval_version.h:
--------------------------------------------------------------------------------
1 | #define OCREVAL_VERSION_MAJOR "7"
2 | #define OCREVAL_VERSION_MINOR "0"
3 | #define OCREVAL_VERSION OCREVAL_VERSION_MAJOR "." OCREVAL_VERSION_MINOR
4 | 


--------------------------------------------------------------------------------
/test/test_utils.h:
--------------------------------------------------------------------------------
 1 | #ifndef TEST_UTILS_H
 2 | #define TEST_UTILS_H
 3 | 
 4 | #include <text.h>
 5 | /* Shared Text instance for the tests; defined in test_utils.c. */
 6 | extern Text* text;
 7 | /* Both helpers walk `list` as a NULL-terminated array of Text pointers. */
 8 | void initialize_texts(void *list);
 9 | void deinitialize_texts(void *list);
10 | 
11 | #endif /* TEST_UTILS_H */
12 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: c
 2 | compiler:
 3 | - clang
 4 | - gcc
 5 | 
 6 | os:
 7 | - linux
 8 | - osx
 9 | dist: xenial
10 | 
11 | addons:
12 |   apt:
13 |     packages: libutf8proc-dev
14 |   homebrew:
15 |     packages: utf8proc
16 |     update: true
17 | 
18 | script:
19 | - make
20 | - make test
21 | 


--------------------------------------------------------------------------------
/test/run.c:
--------------------------------------------------------------------------------
 1 | #include "text_test.c"
 2 | #include "word_test.c"
 3 | 
 4 | #include "greatest.h"
 5 | 
 6 | GREATEST_MAIN_DEFS();
 7 | 
 8 | int main(int argc, char **argv) {
 9 |     GREATEST_MAIN_BEGIN();
10 |     /* Suites presumably defined in text_test.c, #included above -- confirm. */
11 |     RUN_SUITE(cstring_to_text_suite);
12 |     RUN_SUITE(char_to_string_suite);
13 |     /* Suite presumably defined in word_test.c, #included above -- confirm. */
14 |     RUN_SUITE(find_words_suite);
15 | 
16 |     GREATEST_MAIN_END();
17 | }
18 | 


--------------------------------------------------------------------------------
/share/man/man1/wordfreq.1:
--------------------------------------------------------------------------------
 1 | .TH WORDFREQ 1
 2 | .SH NAME
 3 | wordfreq \- determines the frequency of words
 4 | .SH SYNOPSIS
 5 | .B wordfreq
 6 | textfile1 textfile2 ... >resultfile
 7 | .SH DESCRIPTION
 8 | .I Wordfreq
 9 | reads one or more text files and writes to stdout the number of occurrences of
10 | each distinct word found in these files, where a word is defined to be any
11 | sequence of one or more letters.
12 | .SH "SEE ALSO"
13 | .IR ngram (1).
14 | 


--------------------------------------------------------------------------------
/share/man/man1/accsum.1:
--------------------------------------------------------------------------------
 1 | .TH ACCSUM 1
 2 | .SH NAME
 3 | accsum \- combines character accuracy reports
 4 | .SH SYNOPSIS
 5 | .B accsum
 6 | accuracy_report1 accuracy_report2 ... >accuracy_report
 7 | .SH DESCRIPTION
 8 | .I Accsum
 9 | combines two or more character accuracy reports and writes an aggregate report
10 | to stdout.  The input reports must have been produced by either
11 | .I accuracy
12 | or
13 | .IR accsum .
14 | .SH "SEE ALSO"
15 | .IR accuracy (1),
16 | .IR editopsum (1)
17 | and
18 | .IR wordaccsum (1).
19 | 


--------------------------------------------------------------------------------
/test/Makefile:
--------------------------------------------------------------------------------
 1 | include ../use-libocreval-internal.mk
 2 | # Default goal: run the C unit tests and both Python test scripts.
 3 | test: unit-test large-file-test accsum-test
 4 | # Runs the `run` binary; extra arguments can be passed through TEST_ARGS.
 5 | unit-test: run
 6 | 	./$< $(TEST_ARGS)
 7 | 
 8 | large-file-test:
 9 | 	./test_accuracy_large_files.py
10 | 
11 | accsum-test:
12 | 	./test_accsum_graphic_characters.py
13 | 
14 | clean:
15 | 	$(RM) run
16 | # run.c #includes text_test.c and word_test.c, so only run.c and test_utils.c are compiled here.
17 | run: run.c $(wildcard *_test.c) $(LIBOCREVAL) test_utils.c test_utils.h
18 | 	$(LINK.c) test_utils.c $< -locreval -lutf8proc -o $@
19 | 
20 | .PHONY: test clean accsum-test large-file-test unit-test
21 | 


--------------------------------------------------------------------------------
/share/man/man1/accdist.1:
--------------------------------------------------------------------------------
 1 | .TH ACCDIST 1
 2 | .SH NAME
 3 | accdist \- displays the distribution of character accuracies
 4 | .SH SYNOPSIS
 5 | .B accdist
 6 | accuracy_report1 accuracy_report2 ... >xyfile
 7 | .SH DESCRIPTION
 8 | .I Accdist
 9 | writes to stdout the distribution of character accuracies found in the input
10 | reports.  For
11 | .I X
12 | = 0 to 100, the percentage of characters recognized with at least
13 | .IR X %
14 | accuracy is reported.
15 | .SH "SEE ALSO"
16 | .IR accuracy (1)
17 | and
18 | .IR wordaccdist (1).
19 | 
20 | 


--------------------------------------------------------------------------------
/share/man/man1/wordaccdist.1:
--------------------------------------------------------------------------------
 1 | .TH WORDACCDIST 1
 2 | .SH NAME
 3 | wordaccdist \- displays the distribution of word accuracies
 4 | .SH SYNOPSIS
 5 | .B wordaccdist
 6 | wordacc_report1 wordacc_report2 ... >xyfile
 7 | .SH DESCRIPTION
 8 | .I Wordaccdist
 9 | writes to stdout the distribution of word accuracies found in the input
10 | reports.  For
11 | .I X
12 | = 0 to 100, the percentage of words recognized with at least
13 | .IR X %
14 | accuracy is reported.
15 | .SH "SEE ALSO"
16 | .IR accdist (1)
17 | and
18 | .IR wordacc (1).
19 | 
20 | 


--------------------------------------------------------------------------------
/share/man/man1/editopsum.1:
--------------------------------------------------------------------------------
 1 | .TH EDITOPSUM 1
 2 | .SH NAME
 3 | editopsum \- combines edit operation reports
 4 | .SH SYNOPSIS
 5 | .B editopsum
 6 | editop_report1 editop_report2 ... >editop_report
 7 | .SH DESCRIPTION
 8 | .I Editopsum
 9 | combines two or more edit operation reports and writes an aggregate report
10 | to stdout.  The input reports must have been produced by either
11 | .I editop
12 | or
13 | .IR editopsum .
14 | .SH "SEE ALSO"
15 | .IR accsum (1),
16 | .IR editop (1),
17 | .IR editopcost (1)
18 | and
19 | .IR wordaccsum (1).
20 | 


--------------------------------------------------------------------------------
/share/man/man1/wordaccsum.1:
--------------------------------------------------------------------------------
 1 | .TH WORDACCSUM 1
 2 | .SH NAME
 3 | wordaccsum \- combines word accuracy reports
 4 | .SH SYNOPSIS
 5 | .B wordaccsum
 6 | wordacc_report1 wordacc_report2 ... >wordacc_report
 7 | .SH DESCRIPTION
 8 | .I Wordaccsum
 9 | combines two or more word accuracy reports and writes an aggregate report
10 | to stdout.  The input reports must have been produced by either
11 | .I wordacc
12 | or
13 | .IR wordaccsum .
14 | .SH "SEE ALSO"
15 | .IR accsum (1),
16 | .IR editopsum (1),
17 | .IR nonstopacc (1)
18 | and
19 | .IR wordacc (1).
20 | 


--------------------------------------------------------------------------------
/test/test_utils.c:
--------------------------------------------------------------------------------
 1 | #include "test_utils.h"
 2 | 
 3 | #include <list.h>
 4 | 
 5 | static Text text_;
 6 | Text* text = &text_;
 7 | 
 8 | void initialize_texts(void *list) {
 9 |     Text** cursor;
10 |     /* Visit each entry of the NULL-terminated array and initialize it. */
11 |     for (cursor = (Text**) list; *cursor != NULL; ++cursor)
12 |         list_initialize(*cursor);
13 | }
14 | 
15 | void deinitialize_texts(void *list) {
16 |     Text** cursor;
17 |     /* Visit each entry of the NULL-terminated array and empty it,
18 |        handing free() to list_empty() so each character is released. */
19 |     for (cursor = (Text**) list; *cursor != NULL; ++cursor)
20 |         list_empty(*cursor, free);
21 | }
22 | 


--------------------------------------------------------------------------------
/install_utf8proc.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | # Downloads, builds and installs utf8proc $VERSION from its GitHub release tarball.
 3 | PROJECT=utf8proc
 4 | VERSION=1.3.1
 5 | DIRECTORY="$PROJECT-$VERSION"
 6 | TAR_NAME="v${VERSION}.tar.gz"
 7 | TAR_URL="https://github.com/JuliaLang/$PROJECT/archive/$TAR_NAME"
 8 | # -e: stop at the first failing command; -x: echo each command as it runs.
 9 | set -ex
10 | 
11 | cd /tmp/
12 | curl -OL "$TAR_URL"
13 | tar xzf "$TAR_NAME"
14 | cd "$DIRECTORY/"
15 | make
16 | sudo make install
17 | 
18 | if [ "$(uname -s)" != Darwin ] ; then
19 |     # Rebuild the shared object cache - needed to load the library
20 |     # at runtime <http://linux.die.net/man/8/ldconfig>
21 |     sudo ldconfig
22 | fi
23 | 


--------------------------------------------------------------------------------
/docs/install_utf8proc.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | # Downloads, builds and installs utf8proc $VERSION from its GitHub release tarball.
 3 | PROJECT=utf8proc
 4 | VERSION=1.3.1
 5 | DIRECTORY="$PROJECT-$VERSION"
 6 | TAR_NAME="v${VERSION}.tar.gz"
 7 | TAR_URL="https://github.com/JuliaLang/$PROJECT/archive/$TAR_NAME"
 8 | # -e: stop at the first failing command; -x: echo each command as it runs.
 9 | set -ex
10 | 
11 | cd /tmp/
12 | curl -OL "$TAR_URL"
13 | tar xzf "$TAR_NAME"
14 | cd "$DIRECTORY/"
15 | make
16 | sudo make install
17 | 
18 | if [ "$(uname -s)" != Darwin ] ; then
19 |     # Rebuild the shared object cache - needed to load the library
20 |     # at runtime <http://linux.die.net/man/8/ldconfig>
21 |     sudo ldconfig
22 | fi
23 | 


--------------------------------------------------------------------------------
/share/man/man1/accuracy.1:
--------------------------------------------------------------------------------
 1 | .TH ACCURACY 1
 2 | .SH NAME
 3 | accuracy \- computes character accuracy
 4 | .SH SYNOPSIS
 5 | .B accuracy
 6 | correctfile generatedfile [ accuracy_report ]
 7 | .SH DESCRIPTION
 8 | .I Accuracy
 9 | computes the character accuracy of the OCR-generated text in
10 | .I generatedfile
11 | using the correct text found in
12 | .IR correctfile .
13 | A report containing accuracy statistics is written to
14 | .I accuracy_report
15 | if specified; otherwise, it is written to stdout.
16 | .SH "SEE ALSO"
17 | .IR accci (1),
18 | .IR accsum (1),
19 | .IR editop (1),
20 | .IR synctext (1)
21 | and
22 | .IR wordacc (1).
23 | 


--------------------------------------------------------------------------------
/share/man/man1/groupacc.1:
--------------------------------------------------------------------------------
 1 | .TH GROUPACC 1
 2 | .SH NAME
 3 | groupacc \- shows the accuracy for characters belonging to a group
 4 | .SH SYNOPSIS
 5 | .B groupacc
 6 | groupfile accuracy_report [ groupacc_report ]
 7 | .SH DESCRIPTION
 8 | .I Groupacc
 9 | extracts the accuracy results from
10 | .I accuracy_report
11 | for each character found in
12 | .IR groupfile .
13 | The results are written to
14 | .I groupacc_report
15 | if specified; otherwise, they are written to stdout.
16 | The input character accuracy report must have been produced by either
17 | .I accuracy
18 | or
19 | .IR accsum .
20 | .SH "SEE ALSO"
21 | .IR accsum (1)
22 | and
23 | .IR accuracy (1).
24 | 


--------------------------------------------------------------------------------
/share/man/man1/editop.1:
--------------------------------------------------------------------------------
 1 | .TH EDITOP 1
 2 | .SH NAME
 3 | editop \- counts edit operations
 4 | .SH SYNOPSIS
 5 | .B editop
 6 | correctfile generatedfile [ editop_report ]
 7 | .SH DESCRIPTION
 8 | .I Editop
 9 | counts the edit operations (character insertions, character deletions, and
10 | block move operations) required to transform the OCR-generated text in
11 | .I generatedfile
12 | to the correct text found in
13 | .IR correctfile .
14 | A report containing edit operation statistics is written to
15 | .I editop_report
16 | if specified; otherwise, it is written to stdout.
17 | .SH "SEE ALSO"
18 | .IR accuracy (1),
19 | .IR editopcost (1),
20 | .IR editopsum (1),
21 | .IR synctext (1)
22 | and
23 | .IR wordacc (1).
24 | 


--------------------------------------------------------------------------------
/share/man/man1/nonstopacc.1:
--------------------------------------------------------------------------------
 1 | .TH NONSTOPACC 1
 2 | .SH NAME
 3 | nonstopacc \- computes non-stopword accuracy
 4 | .SH SYNOPSIS
 5 | .B nonstopacc
 6 | stopwordfile wordacc_report >xyfile
 7 | .SH DESCRIPTION
 8 | .I Nonstopacc
 9 | computes non-stopword accuracy as a function of the number of stopwords.
10 | Stopwords are specified in
11 | .I stopwordfile
12 | in decreasing order of frequency.  Word accuracy data is supplied by
13 | .IR wordacc_report ,
14 | which must have been produced by either
15 | .I wordacc
16 | or
17 | .IR wordaccsum .
18 | Non-stopword accuracy is computed and written to stdout using no stopwords,
19 | one stopword, two stopwords, ..., and all stopwords from
20 | .IR stopwordfile .
21 | .SH "SEE ALSO"
22 | .IR wordacc (1)
23 | and
24 | .IR wordaccsum (1).
25 | 


--------------------------------------------------------------------------------
/share/man/man1/wordaccci.1:
--------------------------------------------------------------------------------
 1 | .TH WORDACCCI 1
 2 | .SH NAME
 3 | wordaccci \- computes a confidence interval for word accuracy
 4 | .SH SYNOPSIS
 5 | .B wordaccci
 6 | wordacc_report1 wordacc_report2 ... >resultfile
 7 | .SH DESCRIPTION
 8 | .I Wordaccci
 9 | reads two or more word accuracy reports and writes to stdout an
10 | approximate 95% confidence interval for word accuracy.  Each input report
11 | is treated as one observation, and normally has been produced for a single page
12 | using
13 | .IR wordacc .
14 | The confidence interval is computed using a technique known as jackknife
15 | estimation which assumes that the observations are independent.  For best
16 | results, at least 30 observations are needed.
17 | .SH "SEE ALSO"
18 | .IR accci (1)
19 | and
20 | .IR wordacc (1).
21 | 


--------------------------------------------------------------------------------
/share/man/man1/accci.1:
--------------------------------------------------------------------------------
 1 | .TH ACCCI 1
 2 | .SH NAME
 3 | accci \- computes a confidence interval for character accuracy
 4 | .SH SYNOPSIS
 5 | .B accci
 6 | accuracy_report1 accuracy_report2 ... >resultfile
 7 | .SH DESCRIPTION
 8 | .I Accci
 9 | reads two or more character accuracy reports and writes to stdout an
10 | approximate 95% confidence interval for character accuracy.  Each input report
11 | is treated as one observation, and normally has been produced for a single page
12 | using
13 | .IR accuracy .
14 | The confidence interval is computed using a technique known as jackknife
15 | estimation which assumes that the observations are independent.  For best
16 | results, at least 30 observations are needed.
17 | .SH "SEE ALSO"
18 | .IR accuracy (1)
19 | and
20 | .IR wordaccci (1).
21 | 


--------------------------------------------------------------------------------
/share/man/man1/ngram.1:
--------------------------------------------------------------------------------
 1 | .TH NGRAM 1
 2 | .SH NAME
 3 | ngram \- computes 
 4 | .IR n -grams
 5 | .SH SYNOPSIS
 6 | .B ngram
 7 | [
 8 | .B \-n
 9 | 1 | 2 | 3
10 | ] textfile1 textfile2 ... >resultfile
11 | .SH DESCRIPTION
12 | .I Ngram
13 | reads one or more text files and writes to stdout the
14 | .IR n -gram
15 | statistics for the text found in these files.  The `\-n' option specifies the
16 | value of
17 | .IR n :
18 | 1 for uni-grams, 2 for bi-grams, or 3 for tri-grams, where the default is 1
19 | (uni-grams).  The output shows the number of occurrences of each distinct
20 | .IR n -character
21 | sequence and indicates the number of those occurrences that are suspect
22 | (i.e., have at least one character marked as suspect).
23 | .SH OPTIONS
24 | .TP
25 | .B \-n
26 | Specify the value of
27 | .IR n .
28 | .SH "SEE ALSO"
29 | .IR wordfreq (1).
30 | 


--------------------------------------------------------------------------------
/share/man/man1/editopcost.1:
--------------------------------------------------------------------------------
 1 | .TH EDITOPCOST 1
 2 | .SH NAME
 3 | editopcost \- computes the cost of edit operations
 4 | .SH SYNOPSIS
 5 | .B editopcost
 6 | editop_report [ editop_report2 ] >xyfile
 7 | .SH DESCRIPTION
 8 | .I Editopcost
 9 | computes the cost of the edit operations described in
10 | .IR editop_report ,
11 | less the cost of the edit operations described in
12 | .IR editop_report2 ,
13 | if specified.
14 | The cost is based on the number of insertions, the number and lengths of move
15 | operations, and a threshold value,
16 | .IR T ,
17 | used to convert move operations into an equivalent number of insertions.
18 | The cost is computed and written to stdout for
19 | .I T
20 | = 0 to 100.
21 | The input reports must have been produced by either
22 | .I editop
23 | or
24 | .IR editopsum .
25 | .SH "SEE ALSO"
26 | .IR editop (1)
27 | and
28 | .IR editopsum (1).
29 | 


--------------------------------------------------------------------------------
/share/man/man1/wordacc.1:
--------------------------------------------------------------------------------
 1 | .TH WORDACC 1
 2 | .SH NAME
 3 | wordacc \- computes word accuracy
 4 | .SH SYNOPSIS
 5 | .B wordacc
 6 | [
 7 | .B \-S
 8 | stopwordfile ] correctfile generatedfile [ wordacc_report ]
 9 | .SH DESCRIPTION
10 | .I Wordacc
11 | computes the word accuracy of the OCR-generated text in
12 | .I generatedfile
13 | using the correct text found in
14 | .IR correctfile .
15 | A report containing accuracy statistics is written to
16 | .I wordacc_report
17 | if specified; otherwise, it is written to stdout.  Stopwords are taken from
18 | .I stopwordfile
19 | if specified; otherwise, the BASISplus default set of 110 stopwords is
20 | utilized.
21 | .SH OPTIONS
22 | .TP
23 | .B \-S
24 | Specify the name of a file containing stopwords.
25 | .SH "SEE ALSO"
26 | .IR accuracy (1),
27 | .IR editop (1),
28 | .IR nonstopacc (1),
29 | .IR wordaccci (1)
30 | and
31 | .IR wordaccsum (1).
32 | 


--------------------------------------------------------------------------------
/NOTICE:
--------------------------------------------------------------------------------
 1 | The ISRI Analytic Tools for OCR Evaluation
 2 | 
 3 | Originally developed by Dr. Steven Rice in 1996 for his doctoral dissertation.
 4 | 
 5 | ocreval
 6 | 
 7 | An updated port of the ISRI Analytic Tools written by @eddieantonio to
 8 | continue its OCR evaluation goodness for all of the languages representable by
 9 | Unicode!
10 | 
11 | ---
12 | 
13 | Copyright 2015–2018 Eddie Antonio Santos
14 | 
15 | Copyright 1996 The Board of Regents of the Nevada System of Higher
16 | Education, on behalf, of the University of Nevada, Las Vegas,
17 | Information Science Research Institute
18 | 
19 | This product includes software developed at The Information Science
20 | Research Institute (http://www.isri.unlv.edu/).
21 | 
22 | Additional information and a large collection of ground truth data is
23 | available at the ISRI OCR Performance Toolkit website:
24 |   http://code.google.com/p/isri-ocr-evaluation-tools
25 | 
26 | 


--------------------------------------------------------------------------------
/use-libocreval-internal.mk:
--------------------------------------------------------------------------------
 1 | # Absolute path to the directory containing this makefile (the last entry of MAKEFILE_LIST): http://stackoverflow.com/a/324782
 2 | TOP := $(dir $(CURDIR)/$(word $(words $(MAKEFILE_LIST)),$(MAKEFILE_LIST)))
 3 | # For INTERNAL headers!
 4 | LOCAL_INCLUDE_DIR := $(abspath $(TOP)src)
 5 | # For use with the -L option.
 6 | LOCAL_LINK_DIR := $(abspath $(TOP)lib)
 7 | # Path of the static library; including makefiles can list it as a prerequisite.
 8 | LIBOCREVAL = $(LOCAL_LINK_DIR)/libocreval.a
 9 | 
10 | # Compilation flags for all files; `override` appends -ansi even when CFLAGS is set on the command line.
11 | override CFLAGS += -ansi
12 | # X/Open 6.0 standardizes features used in this K&R C source...
13 | CPPDEFINES = -D_XOPEN_SOURCE=600
14 | # Create dependency files.
15 | CPPFLAGS = -MMD
16 | # utf8proc lib usually lives in here:
17 | override CPPFLAGS += -I/usr/local/include $(CPPDEFINES)
18 | LDFLAGS += -L/usr/local/lib
19 | LDLIBS = -lm -lutf8proc
20 | 
21 | # Use libocreval, created in lib/
22 | override CPPFLAGS += -I$(LOCAL_INCLUDE_DIR)
23 | LDFLAGS += -L$(LOCAL_LINK_DIR)
24 | LDLIBS := -locreval $(LDLIBS)
25 | 


--------------------------------------------------------------------------------
/share/man/man1/vote.1:
--------------------------------------------------------------------------------
 1 | .TH VOTE 1
 2 | .SH NAME
 3 | vote \- applies voting to text files
 4 | .SH SYNOPSIS
 5 | .B vote
 6 | [
 7 | .B \-O
 8 | ] [
 9 | .B \-o
10 | outputfile ] [
11 | .B \-s
12 | m/n ] [
13 | .B \-w
14 | m/n ] textfile1 textfile2 ...
15 | .SH DESCRIPTION
16 | .I Vote
17 | applies a voting algorithm to two or more text files.  The resulting text is
18 | written to
19 | .I outputfile
20 | if specified; otherwise, it is written to stdout.
21 | .PP
22 | An unmarked character in the input receives one vote.
23 | A reject character receives no votes.
24 | If a fraction is specified by the `\-w' option, then a character marked as
25 | suspect receives this fraction of a vote; otherwise, it receives a full vote.
26 | If a fraction is specified by the `\-s' option, and an output character
27 | receives no more than this fraction of the possible number of votes, then it is
28 | marked as suspect; otherwise, it is unmarked.  For both of these options,
29 | .I m
30 | and
31 | .I n
32 | must satisfy 1 <=
33 | .I m
34 | <=
35 | .I n
36 | <= 9.
37 | .SH OPTIONS
38 | .TP
39 | .B \-O
40 | Enable optimization.
41 | .TP
42 | .B \-o
43 | Specify the name of the output file.
44 | .TP
45 | .B \-s
46 | Specify the threshold for marking output characters.
47 | .TP
48 | .B \-w
49 | Specify the weight of marked input characters.
50 | 


--------------------------------------------------------------------------------
/share/man/man1/synctext.1:
--------------------------------------------------------------------------------
 1 | .TH SYNCTEXT 1
 2 | .SH NAME
 3 | synctext \- synchronizes text files
 4 | .SH SYNOPSIS
 5 | .B synctext
 6 | [
 7 | .B \-H
 8 | ] [
 9 | .B \-i
10 | ] [
11 | .B \-s
12 | ] [
13 | .B \-T
14 | ] textfile1 textfile2 ... >resultfile
15 | .SH DESCRIPTION
16 | .I Synctext
17 | synchronizes two or more text files and writes to stdout the differences
18 | among these files.  The output shows the substrings that are common to all of
19 | the files followed by footnotes indicating what the differences are.
20 | .PP
21 | If more than two input files are specified, then a heuristic (sub-optimal)
22 | algorithm is used to find matches.  If only two input files are specified, then
23 | an optimal algorithm is used, unless the `\-H' or `\-T' option is specified.
24 | If the `\-H' option is specified, then the heuristic algorithm is applied to
25 | the two input files.  If the `\-T' option is specified, then a heuristic
26 | algorithm that can find transposed matches is utilized, and the output takes a
27 | different form: each match is numbered and appears within braces.
28 | .SH OPTIONS
29 | .TP
30 | .B \-H
31 | Use heuristic algorithm.
32 | .TP
33 | .B \-i
34 | Ignore case (i.e., case-insensitive).
35 | .TP
36 | .B \-s
37 | Show suspect markers.
38 | .TP
39 | .B \-T
40 | Find transposed matches.
41 | .SH "SEE ALSO"
42 | .IR accuracy (1)
43 | and
44 | .IR editop (1).
45 | 


--------------------------------------------------------------------------------
/src/charclass.h:
--------------------------------------------------------------------------------
 1 | /**********************************************************************
 2 |  *
 3 |  *  charclass.h
 4 |  *
 5 |  *  This module provides definitions and utility routines pertaining to
 6 |  *  character classes.
 7 |  *
 8 |  *  Author: Stephen V. Rice
 9 |  *  
10 |  * Copyright 1996 The Board of Regents of the Nevada System of Higher
11 |  * Education, on behalf, of the University of Nevada, Las Vegas,
12 |  * Information Science Research Institute
13 |  *
14 |  * Licensed under the Apache License, Version 2.0 (the "License"); you
15 |  * may not use this file except in compliance with the License.  You
16 |  * may obtain a copy of the License at
17 |  *
18 |  *    http://www.apache.org/licenses/LICENSE-2.0
19 |  *
20 |  * Unless required by applicable law or agreed to in writing, software
21 |  * distributed under the License is distributed on an "AS IS" BASIS,
22 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
23 |  * implied. See the License for the specific language governing
24 |  * permissions and limitations under the License.
25 |  *
26 |  **********************************************************************/
27 | 
28 | #ifndef _CHARCLASS_
29 | #define _CHARCLASS_
30 | 
31 | #include "text.h"
32 | 
33 | typedef unsigned char Charclass;
34 | #define MAX_CHARCLASSES  256
35 | /* Old-style declarations: parameter types appear only in the comments, so calls are not type-checked. */
36 | Charclass charclass(/* Charvalue value */);
37 | 			/* returns the character class for the given character
38 | 			   value */
39 | 
40 | char *charclass_name(/* Charclass class */);
41 | 			/* returns the name of the given character class */
42 | 
43 | #endif
44 | 


--------------------------------------------------------------------------------
/src/accsum.c:
--------------------------------------------------------------------------------
 1 | /**********************************************************************
 2 |  *
 3 |  *  accsum.c
 4 |  *
 5 |  *  Author: Stephen V. Rice
 6 |  *  
 7 |  * Copyright 1996 The Board of Regents of the Nevada System of Higher
 8 |  * Education, on behalf, of the University of Nevada, Las Vegas,
 9 |  * Information Science Research Institute
10 |  *
11 |  * Licensed under the Apache License, Version 2.0 (the "License"); you
12 |  * may not use this file except in compliance with the License.  You
13 |  * may obtain a copy of the License at
14 |  *
15 |  *    http://www.apache.org/licenses/LICENSE-2.0
16 |  *
17 |  * Unless required by applicable law or agreed to in writing, software
18 |  * distributed under the License is distributed on an "AS IS" BASIS,
19 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
20 |  * implied. See the License for the specific language governing
21 |  * permissions and limitations under the License.
22 |  *
23 |  **********************************************************************/
24 | 
25 | #include "accrpt.h"
26 | 
27 | #define usage  "accuracy_report1 accuracy_report2 ... >accuracy_report"
28 | 
29 | Accdata accdata;
30 | 
31 | /**********************************************************************/
32 | 
33 | /* Reads every accuracy report named in argv into accdata, then writes the aggregate (stdout, per accsum(1)). */
34 | int main(int argc, char *argv[])
35 | {
36 |     int i;
37 |     initialize(&argc, argv, usage, NULL);
38 |     if (argc < 2)
39 | 	error("not enough input files");
40 |     for (i = 0; i < argc; i++)
41 | 	read_accrpt(&accdata, argv[i]);
42 |     write_accrpt(&accdata, NULL);
43 |     terminate();
44 |     return 0;	/* defined exit status should terminate() ever return */
45 | }
46 | 


--------------------------------------------------------------------------------
/src/wordaccsum.c:
--------------------------------------------------------------------------------
 1 | /**********************************************************************
 2 |  *
 3 |  *  wordaccsum.c
 4 |  *
 5 |  *  Author: Stephen V. Rice
 6 |  *  
 7 |  * Copyright 1996 The Board of Regents of the Nevada System of Higher
 8 |  * Education, on behalf, of the University of Nevada, Las Vegas,
 9 |  * Information Science Research Institute
10 |  *
11 |  * Licensed under the Apache License, Version 2.0 (the "License"); you
12 |  * may not use this file except in compliance with the License.  You
13 |  * may obtain a copy of the License at
14 |  *
15 |  *    http://www.apache.org/licenses/LICENSE-2.0
16 |  *
17 |  * Unless required by applicable law or agreed to in writing, software
18 |  * distributed under the License is distributed on an "AS IS" BASIS,
19 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
20 |  * implied. See the License for the specific language governing
21 |  * permissions and limitations under the License.
22 |  *
23 |  **********************************************************************/
24 | 
25 | #include "wacrpt.h"
26 | 
27 | #define usage  "wordacc_report1 wordacc_report2 ... >wordacc_report"
28 | 
29 | Wacdata wacdata;
30 | 
31 | /**********************************************************************/
32 | 
33 | /* Reads every word accuracy report named in argv into wacdata, then writes the aggregate (stdout, per wordaccsum(1)). */
34 | int main(int argc, char *argv[])
35 | {
36 |     int i;
37 |     initialize(&argc, argv, usage, NULL);
38 |     if (argc < 2)
39 | 	error("not enough input files");
40 |     for (i = 0; i < argc; i++)
41 | 	read_wacrpt(&wacdata, argv[i]);
42 |     write_wacrpt(&wacdata, NULL);
43 |     terminate();
44 |     return 0;	/* defined exit status should terminate() ever return */
45 | }
46 | 


--------------------------------------------------------------------------------
/src/editopsum.c:
--------------------------------------------------------------------------------
 1 | /**********************************************************************
 2 |  *
 3 |  *  editopsum.c
 4 |  *
 5 |  *  Author: Stephen V. Rice
 6 |  *  
 7 |  * Copyright 1996 The Board of Regents of the Nevada System of Higher
 8 |  * Education, on behalf, of the University of Nevada, Las Vegas,
 9 |  * Information Science Research Institute
10 |  *
11 |  * Licensed under the Apache License, Version 2.0 (the "License"); you
12 |  * may not use this file except in compliance with the License.  You
13 |  * may obtain a copy of the License at
14 |  *
15 |  *    http://www.apache.org/licenses/LICENSE-2.0
16 |  *
17 |  * Unless required by applicable law or agreed to in writing, software
18 |  * distributed under the License is distributed on an "AS IS" BASIS,
19 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
20 |  * implied. See the License for the specific language governing
21 |  * permissions and limitations under the License.
22 |  *
23 |  **********************************************************************/
24 | 
25 | #include "edorpt.h"
26 | #include "util.h"
27 | 
28 | #define usage  "editop_report1 editop_report2 ... >editop_report"	/* argument synopsis handed to initialize() */
29 | 
30 | Edodata edodata;	/* accumulates the contents of every input report via read_edorpt() */
31 | 
32 | /**********************************************************************/
33 | 
34 | main(argc, argv)
35 | int argc;
36 | char *argv[];
37 | {
38 |     int i;
39 |     initialize(&argc, argv, usage, NULL);
40 |     if (argc < 2)
41 | 	error("not enough input files");
42 |     for (i = 0; i < argc; i++)
43 | 	read_edorpt(&edodata, argv[i]);
44 |     write_edorpt(&edodata, NULL);
45 |     terminate();
46 | }
47 | 


--------------------------------------------------------------------------------
/src/sort.h:
--------------------------------------------------------------------------------
 1 | /**********************************************************************
 2 |  *
 3 |  *  sort.h
 4 |  *
 5 |  *  This module provides a general-purpose sorting routine.
 6 |  *
 7 |  *  Author: Stephen V. Rice
 8 |  *  
 9 |  * Copyright 1996 The Board of Regents of the Nevada System of Higher
10 |  * Education, on behalf, of the University of Nevada, Las Vegas,
11 |  * Information Science Research Institute
12 |  *
13 |  * Licensed under the Apache License, Version 2.0 (the "License"); you
14 |  * may not use this file except in compliance with the License.  You
15 |  * may obtain a copy of the License at
16 |  *
17 |  *    http://www.apache.org/licenses/LICENSE-2.0
18 |  *
19 |  * Unless required by applicable law or agreed to in writing, software
20 |  * distributed under the License is distributed on an "AS IS" BASIS,
21 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
22 |  * implied. See the License for the specific language governing
23 |  * permissions and limitations under the License.
24 |  *
25 |  **********************************************************************/
26 | 
27 | #ifndef _SORT_
28 | #define _SORT_
29 | 
30 | void sort(/* long num_elements, void *array[],
31 |              int (*compare)(void *element1, void *element2) */);
32 | 			/* sorts, in place, the first "num_elements" pointers
33 | 			   of "array" into the order defined by "compare";
34 | 			   "compare" is given two elements of the array and
35 | 			   returns a negative value if "element1" precedes
36 | 			   "element2", a positive value if "element1" follows
37 | 			   "element2", and a zero value if they are equal */
38 | 
39 | #endif
40 | 


--------------------------------------------------------------------------------
/src/stopword.h:
--------------------------------------------------------------------------------
 1 | /**********************************************************************
 2 |  *
 3 |  *  stopword.h
 4 |  *
 5 |  *  This module provides routines pertaining to stopwords.
 6 |  *
 7 |  *  Author: Stephen V. Rice
 8 |  *  
 9 |  * Copyright 1996 The Board of Regents of the Nevada System of Higher
10 |  * Education, on behalf, of the University of Nevada, Las Vegas,
11 |  * Information Science Research Institute
12 |  *
13 |  * Licensed under the Apache License, Version 2.0 (the "License"); you
14 |  * may not use this file except in compliance with the License.  You
15 |  * may obtain a copy of the License at
16 |  *
17 |  *    http://www.apache.org/licenses/LICENSE-2.0
18 |  *
19 |  * Unless required by applicable law or agreed to in writing, software
20 |  * distributed under the License is distributed on an "AS IS" BASIS,
21 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
22 |  * implied. See the License for the specific language governing
23 |  * permissions and limitations under the License.
24 |  *
25 |  **********************************************************************/
26 | 
27 | #ifndef _STOPWORD_
28 | #define _STOPWORD_
29 | 
30 | #include "util.h"
31 | 
32 | void init_stopwords(/* char *filename */);
33 | 			/* establishes the stopword set tested by "is_stopword";
34 | 			   reads stopwords from the named file, and reports an
35 | 			   error and quits if unable to open the file; if "filename"
36 | 			   is NULL, the default set of 110 stopwords from BASISplus is used */
37 | 
38 | Boolean is_stopword(/* unsigned char *string */);
39 | 			/* tests whether "string" represents a stopword, returning
40 | 			   True if so; this routine can be called only after the
41 | 			   stopwords have been initialized using "init_stopwords" */
42 | 
43 | #endif
44 | 


--------------------------------------------------------------------------------
/src/accdist.c:
--------------------------------------------------------------------------------
 1 | /**********************************************************************
 2 |  *
 3 |  *  accdist.c
 4 |  *
 5 |  *  Author: Stephen V. Rice
 6 |  *  
 7 |  * Copyright 1996 The Board of Regents of the Nevada System of Higher
 8 |  * Education, on behalf, of the University of Nevada, Las Vegas,
 9 |  * Information Science Research Institute
10 |  *
11 |  * Licensed under the Apache License, Version 2.0 (the "License"); you
12 |  * may not use this file except in compliance with the License.  You
13 |  * may obtain a copy of the License at
14 |  *
15 |  *    http://www.apache.org/licenses/LICENSE-2.0
16 |  *
17 |  * Unless required by applicable law or agreed to in writing, software
18 |  * distributed under the License is distributed on an "AS IS" BASIS,
19 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
20 |  * implied. See the License for the specific language governing
21 |  * permissions and limitations under the License.
22 |  *
23 |  **********************************************************************/
24 | 
25 | #include "accrpt.h"
26 | #include "dist.h"
27 | 
28 | #define usage  "accuracy_report1 accuracy_report2 ... >xyfile"	/* argument synopsis handed to initialize() */
29 | 
30 | Accdata accdata;	/* running totals over all reports read so far; process_file() samples it before and after each read */
31 | Dist dist;	/* distribution built up by update_dist(), one update per input report */
32 | 
33 | /**********************************************************************/
34 | 
35 | void process_file(filename)
36 | char *filename;
37 | {
38 |     long chars, errors;
39 |     chars  = accdata.characters;
40 |     errors = accdata.errors;
41 |     read_accrpt(&accdata, filename);
42 |     update_dist(&dist, accdata.characters - chars, accdata.errors - errors);
43 | }
44 | /**********************************************************************/
45 | 
46 | main(argc, argv)
47 | int argc;
48 | char *argv[];
49 | {
50 |     int i;
51 |     initialize(&argc, argv, usage, NULL);
52 |     for (i = 0; i < argc; i++)
53 | 	process_file(argv[i]);
54 |     write_dist(&dist, NULL);
55 |     terminate();
56 | }
57 | 


--------------------------------------------------------------------------------
/src/word.h:
--------------------------------------------------------------------------------
 1 | /**********************************************************************
 2 |  *
 3 |  *  word.h
 4 |  *
 5 |  *  This module provides definitions and routines pertaining to words.
 6 |  *
 7 |  *  Author: Stephen V. Rice
 8 |  *
 9 |  * Copyright 1996 The Board of Regents of the Nevada System of Higher
10 |  * Education, on behalf, of the University of Nevada, Las Vegas,
11 |  * Information Science Research Institute
12 |  *
13 |  * Licensed under the Apache License, Version 2.0 (the "License"); you
14 |  * may not use this file except in compliance with the License.  You
15 |  * may obtain a copy of the License at
16 |  *
17 |  *    http://www.apache.org/licenses/LICENSE-2.0
18 |  *
19 |  * Unless required by applicable law or agreed to in writing, software
20 |  * distributed under the License is distributed on an "AS IS" BASIS,
21 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
22 |  * implied. See the License for the specific language governing
23 |  * permissions and limitations under the License.
24 |  *
25 |  **********************************************************************/
26 | 
27 | #ifndef _WORD_
28 | #define _WORD_
29 | 
30 | #include "text.h"
31 | 
32 | #define MAX_WORDLENGTH  50	/* NOTE(review): presumably the longest word, in characters, handled by find_words -- its use is not visible here; confirm in word.c */
33 | 
34 | BEGIN_ITEM(Word)
35 |     char *string;
36 |                         /* character string representation of the word */
37 | END_ITEM(Word);         /* an occurrence of a word; NOTE(review): the ITEM/LIST macros are presumably those of list.h, reached through text.h -- confirm */
38 | 
39 | BEGIN_LIST_OF(Word)
40 | END_LIST(Wordlist);     /* a list of word occurrences, with no members beyond those the list macros supply */
41 | 
42 | void find_words(/* Wordlist *wordlist, Text *text */);
43 |                         /* finds the word occurrences in "text" and appends
44 |                            them to "wordlist" in the sequence they occur; the caller
45 |                            must supply "text" with all letters already in lowercase */
46 | 
47 | void free_word(/* Word *word */);
48 |                         /* de-allocates a Word structure (NOTE(review): whether "string" is released as well is not visible here -- confirm in word.c) */
49 | 
50 | #endif
51 | 


--------------------------------------------------------------------------------
/src/dist.h:
--------------------------------------------------------------------------------
 1 | /**********************************************************************
 2 |  *
 3 |  *  dist.h
 4 |  *
 5 |  *  This module provides support for updating and writing a structure
 6 |  *  that describes a distribution of character or word accuracies.
 7 |  *
 8 |  *  Author: Stephen V. Rice
 9 |  *  
10 |  * Copyright 1996 The Board of Regents of the Nevada System of Higher
11 |  * Education, on behalf, of the University of Nevada, Las Vegas,
12 |  * Information Science Research Institute
13 |  *
14 |  * Licensed under the Apache License, Version 2.0 (the "License"); you
15 |  * may not use this file except in compliance with the License.  You
16 |  * may obtain a copy of the License at
17 |  *
18 |  *    http://www.apache.org/licenses/LICENSE-2.0
19 |  *
20 |  * Unless required by applicable law or agreed to in writing, software
21 |  * distributed under the License is distributed on an "AS IS" BASIS,
22 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
23 |  * implied. See the License for the specific language governing
24 |  * permissions and limitations under the License.
25 |  *
26 |  **********************************************************************/
27 | 
28 | #ifndef _DIST_
29 | #define _DIST_
30 | 
/* A cumulative distribution of accuracies in whole-percent steps. */
typedef struct
{
    long count[101];
			/* one entry per percentage 0..100: entry (i) holds the
			   total count for which accuracies are greater than
			   or equal to (i)% */
    long total_count;
			/* total count over all accuracies */
} Dist;
38 | 
39 | void update_dist(/* Dist *dist, long count, long missed */);
40 | 			/* adds "count" to "total_count" and to each "count[i]" for which the accuracy
41 | 			   100 * (count - missed) / count is at least (i)%; does nothing if "count" is zero */
42 | 
43 | void write_dist(/* Dist *dist, char *filename */);
44 | 			/* writes the distribution represented by "dist" to the
45 | 			   named file (or stdout if "filename" is NULL), one line per
46 | 			   percentage 0..100 giving 100 * count[i] / total_count; reports an error and quits
47 | 			   if unable to create the file; writes nothing at all if "total_count" is zero */
48 | 
49 | #endif
50 | 


--------------------------------------------------------------------------------
/src/wordaccdist.c:
--------------------------------------------------------------------------------
 1 | /**********************************************************************
 2 |  *
 3 |  *  wordaccdist.c
 4 |  *
 5 |  *  Author: Stephen V. Rice
 6 |  *  
 7 |  * Copyright 1996 The Board of Regents of the Nevada System of Higher
 8 |  * Education, on behalf, of the University of Nevada, Las Vegas,
 9 |  * Information Science Research Institute
10 |  *
11 |  * Licensed under the Apache License, Version 2.0 (the "License"); you
12 |  * may not use this file except in compliance with the License.  You
13 |  * may obtain a copy of the License at
14 |  *
15 |  *    http://www.apache.org/licenses/LICENSE-2.0
16 |  *
17 |  * Unless required by applicable law or agreed to in writing, software
18 |  * distributed under the License is distributed on an "AS IS" BASIS,
19 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
20 |  * implied. See the License for the specific language governing
21 |  * permissions and limitations under the License.
22 |  *
23 |  **********************************************************************/
24 | 
25 | #include "dist.h"
26 | #include "wacrpt.h"
27 | 
28 | #define usage  "wordacc_report1 wordacc_report2 ... >xyfile"	/* argument synopsis handed to initialize() */
29 | 
30 | Wacdata wacdata;	/* running totals over all reports read so far; process_file() samples it before and after each read */
31 | Dist dist;	/* distribution built up by update_dist(), one update per input report */
32 | 
33 | /**********************************************************************/
34 | 
35 | void process_file(filename)
36 | char *filename;
37 | {
38 |     long count, missed;
39 |     count  = wacdata.total.count;
40 |     missed = wacdata.total.missed;
41 |     read_wacrpt(&wacdata, filename);
42 |     update_dist(&dist, wacdata.total.count - count,
43 |     wacdata.total.missed - missed);
44 | }
45 | /**********************************************************************/
46 | 
47 | main(argc, argv)
48 | int argc;
49 | char *argv[];
50 | {
51 |     int i;
52 |     initialize(&argc, argv, usage, NULL);
53 |     for (i = 0; i < argc; i++)
54 | 	process_file(argv[i]);
55 |     write_dist(&dist, NULL);
56 |     terminate();
57 | }
58 | 


--------------------------------------------------------------------------------
/src/dist.c:
--------------------------------------------------------------------------------
 1 | /**********************************************************************
 2 |  *
 3 |  *  dist.c
 4 |  *
 5 |  *  Author: Stephen V. Rice
 6 |  *  
 7 |  * Copyright 1996 The Board of Regents of the Nevada System of Higher
 8 |  * Education, on behalf, of the University of Nevada, Las Vegas,
 9 |  * Information Science Research Institute
10 |  *
11 |  * Licensed under the Apache License, Version 2.0 (the "License"); you
12 |  * may not use this file except in compliance with the License.  You
13 |  * may obtain a copy of the License at
14 |  *
15 |  *    http://www.apache.org/licenses/LICENSE-2.0
16 |  *
17 |  * Unless required by applicable law or agreed to in writing, software
18 |  * distributed under the License is distributed on an "AS IS" BASIS,
19 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
20 |  * implied. See the License for the specific language governing
21 |  * permissions and limitations under the License.
22 |  *
23 |  **********************************************************************/
24 | 
25 | #include "dist.h"
26 | #include "util.h"
27 | 
28 | /**********************************************************************/
29 | 
30 | void update_dist(dist, count, missed)
31 | Dist *dist;
32 | long count, missed;
33 | {
34 |     double accuracy;
35 |     short i;
36 |     if (count == 0)
37 | 	return;
38 |     accuracy = 100.0 * (count - missed) / count;
39 |     for (i = 0; accuracy >= i; i++)
40 | 	dist->count[i] += count;
41 |     dist->total_count += count;
42 | }
43 | /**********************************************************************/
44 | 
45 | void write_dist(dist, filename)
46 | Dist *dist;
47 | char *filename;
48 | {
49 |     FILE *f;
50 |     short i;
51 |     if (dist->total_count == 0)
52 | 	return;
53 |     f = open_file(filename, "w");
54 |     for (i = 0; i <= 100; i++)
55 | 	fprintf(f, "%3d %6.2f\n", i,
56 | 	100.0 * dist->count[i] / dist->total_count);
57 |     close_file(f);
58 | }
59 | 


--------------------------------------------------------------------------------
/src/ci.h:
--------------------------------------------------------------------------------
 1 | /**********************************************************************
 2 |  *
 3 |  *  ci.h
 4 |  *
 5 |  *  This module provides definitions and utility routines pertaining to
 6 |  *  confidence intervals.
 7 |  *
 8 |  *  Author: Stephen V. Rice
 9 |  *  
10 |  * Copyright 1996 The Board of Regents of the Nevada System of Higher
11 |  * Education, on behalf, of the University of Nevada, Las Vegas,
12 |  * Information Science Research Institute
13 |  *
14 |  * Licensed under the Apache License, Version 2.0 (the "License"); you
15 |  * may not use this file except in compliance with the License.  You
16 |  * may obtain a copy of the License at
17 |  *
18 |  *    http://www.apache.org/licenses/LICENSE-2.0
19 |  *
20 |  * Unless required by applicable law or agreed to in writing, software
21 |  * distributed under the License is distributed on an "AS IS" BASIS,
22 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
23 |  * implied. See the License for the specific language governing
24 |  * permissions and limitations under the License.
25 |  *
26 |  **********************************************************************/
27 | 
28 | #ifndef _CI_
29 | #define _CI_
30 | 
31 | #include "list.h"
32 | 
33 | BEGIN_ITEM(Obs)
34 |     long count;		/* total number of items in the observation */
35 |     long missed;	/* number of those that were misrecognized */
36 |     double theta;	/* estimator (NOTE(review): presumably filled in by compute_ci -- confirm in ci.c) */
37 |     double j;		/* pseudovalue (NOTE(review): presumably filled in by compute_ci -- confirm in ci.c) */
38 | END_ITEM(Obs);		/* an observation; the ITEM/LIST macros are presumably those of list.h, included above */
39 | 
40 | BEGIN_LIST_OF(Obs)
41 |     Obs total;		/* total for all observations */
42 | END_LIST(Obslist);	/* a list of observations */
43 | 
44 | void append_obs(/* Obslist *obslist, long count, long missed */);
45 | 			/* appends the observation given by "count" and "missed" to "obslist" */
46 | 
47 | void compute_ci(/* Obslist *obslist, double *lower, double *upper */);
48 | 			/* computes an approximate 95% confidence interval for
49 | 			   accuracy for the given set of observations, and returns
50 | 			   its bounds through "lower" and "upper" */
51 | 
52 | #endif
53 | 


--------------------------------------------------------------------------------
/src/sort.c:
--------------------------------------------------------------------------------
 1 | /**********************************************************************
 2 |  *
 3 |  *  sort.c
 4 |  *
 5 |  *  Author: Stephen V. Rice
 6 |  *  
 7 |  * Copyright 1996 The Board of Regents of the Nevada System of Higher
 8 |  * Education, on behalf, of the University of Nevada, Las Vegas,
 9 |  * Information Science Research Institute
10 |  *
11 |  * Licensed under the Apache License, Version 2.0 (the "License"); you
12 |  * may not use this file except in compliance with the License.  You
13 |  * may obtain a copy of the License at
14 |  *
15 |  *    http://www.apache.org/licenses/LICENSE-2.0
16 |  *
17 |  * Unless required by applicable law or agreed to in writing, software
18 |  * distributed under the License is distributed on an "AS IS" BASIS,
19 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
20 |  * implied. See the License for the specific language governing
21 |  * permissions and limitations under the License.
22 |  *
23 |  **********************************************************************/
24 | 
/* State shared by sort() and quicksort() for one sorting pass: the array
   of element pointers being rearranged and the caller's comparison
   routine.  Because they live at file scope, a nested call to sort()
   from inside the comparison routine would overwrite them. */
static void **a;
static int (*cmp)();

/**********************************************************************/

/* Sorts a[left..right] (both bounds inclusive) in place using "cmp".

   The first element of the range is the pivot.  It is lifted out,
   leaving a hole that moves between the two ends while elements are
   moved across it; the pivot is dropped into the hole once the two
   scans meet.

   Only the smaller of the two resulting partitions is handled by a
   recursive call; the larger one is handled by the enclosing loop.
   This keeps the recursion depth logarithmic in the range size even for
   input that is already ordered, where recursing into both partitions
   would nest once per element.  The partitions are disjoint, so the
   order in which they are processed does not affect the result.

   The parameters are declared with a prototype so that integer
   arguments of any width are converted to long at the call. */
static void quicksort(long left, long right)
{
    long i, j;
    void *ref;
    while (left < right)
    {
	i = left;
	j = right;
	ref = a[i];
	while (i < j)
	{
	    while (i < j && (*cmp)(ref, a[j]) < 0)
		j--;
	    if (i != j)
		a[i++] = a[j];
	    while (i < j && (*cmp)(ref, a[i]) > 0)
		i++;
	    if (i != j)
		a[j--] = a[i];
	}
	a[j] = ref;	/* here i == j: the pivot's final position */
	if (j - left < right - j)
	{
	    if (left < j - 1)
		quicksort(left, j - 1);
	    left = j + 1;
	}
	else
	{
	    if (j + 1 < right)
		quicksort(j + 1, right);
	    right = j - 1;
	}
    }
}
55 | /**********************************************************************/
56 | 
57 | void sort(num_elements, array, compare)
58 | long num_elements;
59 | void *array[];
60 | int (*compare)();
61 | {
62 |     if (num_elements < 2)
63 | 	return;
64 |     a = array;
65 |     cmp = compare;
66 |     quicksort(0, num_elements - 1);
67 | }
68 | 


--------------------------------------------------------------------------------
/libexec/Unicode-License.txt:
--------------------------------------------------------------------------------
 1 | The file WordBreakProperty.txt.gz is distributed under the following license:
 2 | 
 3 | COPYRIGHT AND PERMISSION NOTICE
 4 | 
 5 | Copyright © 1991-2015 Unicode, Inc. All rights reserved.
 6 | Distributed under the Terms of Use in 
 7 | http://www.unicode.org/copyright.html.
 8 | 
 9 | Permission is hereby granted, free of charge, to any person obtaining
10 | a copy of the Unicode data files and any associated documentation
11 | (the "Data Files") or Unicode software and any associated documentation
12 | (the "Software") to deal in the Data Files or Software
13 | without restriction, including without limitation the rights to use,
14 | copy, modify, merge, publish, distribute, and/or sell copies of
15 | the Data Files or Software, and to permit persons to whom the Data Files
16 | or Software are furnished to do so, provided that
17 | (a) this copyright and permission notice appear with all copies 
18 | of the Data Files or Software,
19 | (b) this copyright and permission notice appear in associated 
20 | documentation, and
21 | (c) there is clear notice in each modified Data File or in the Software
22 | as well as in the documentation associated with the Data File(s) or
23 | Software that the data or software has been modified.
24 | 
25 | THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
26 | ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
27 | WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
28 | NONINFRINGEMENT OF THIRD PARTY RIGHTS.
29 | IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
30 | NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
31 | DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
32 | DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
33 | TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
34 | PERFORMANCE OF THE DATA FILES OR SOFTWARE.
35 | 
36 | Except as contained in this notice, the name of a copyright holder
37 | shall not be used in advertising or otherwise to promote the sale,
38 | use or other dealings in these Data Files or Software without prior
39 | written authorization of the copyright holder.
40 | 


--------------------------------------------------------------------------------
/src/edorpt.h:
--------------------------------------------------------------------------------
 1 | /**********************************************************************
 2 |  *
 3 |  *  edorpt.h
 4 |  *
 5 |  *  This module provides support for reading and writing edit operation
 6 |  *  reports.  The contents of one of these reports is represented by an
 7 |  *  "Edodata" structure.
 8 |  *
 9 |  *  Author: Stephen V. Rice
10 |  *  
11 |  * Copyright 1996 The Board of Regents of the Nevada System of Higher
12 |  * Education, on behalf, of the University of Nevada, Las Vegas,
13 |  * Information Science Research Institute
14 |  *
15 |  * Licensed under the Apache License, Version 2.0 (the "License"); you
16 |  * may not use this file except in compliance with the License.  You
17 |  * may obtain a copy of the License at
18 |  *
19 |  *    http://www.apache.org/licenses/LICENSE-2.0
20 |  *
21 |  * Unless required by applicable law or agreed to in writing, software
22 |  * distributed under the License is distributed on an "AS IS" BASIS,
23 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
24 |  * implied. See the License for the specific language governing
25 |  * permissions and limitations under the License.
26 |  *
27 |  **********************************************************************/
28 | 
29 | #ifndef _EDORPT_
30 | #define _EDORPT_
31 | 
/* Maximum length of a move operation, given in number of characters
   moved; longer moves are counted as moves of this length. */
#define MAX_MOVE_LENGTH  100

/* The contents of an edit operation report. */
typedef struct
{
    long total_insertions;	/* number of "character insert" operations */
    long total_deletions;	/* number of "character delete" operations */
    long total_moves;		/* number of "block move" operations */
    long moves[MAX_MOVE_LENGTH + 1];
				/* number of move operations for each length,
				   indexed by the length itself */
} Edodata;
48 | 
49 | void read_edorpt(/* Edodata *edodata, char *filename */);
50 | 			/* reads the named file (or stdin if "filename" is NULL)
51 | 			   and adds its contents to those already in "edodata", so
52 | 			   that several reports can be summed; reports an error and quits if unable to
53 | 			   open the file, or if the file does not contain an edit operation report */
54 | 
55 | void write_edorpt(/* Edodata *edodata, char *filename */);
56 | 			/* writes the contents of "edodata" as an edit operation
57 | 			   report to the named file (or stdout if "filename" is NULL);
58 | 			   reports an error and quits if unable to create the file */
59 | 
60 | #endif
61 | 


--------------------------------------------------------------------------------
/src/editopcost.c:
--------------------------------------------------------------------------------
 1 | /**********************************************************************
 2 |  *
 3 |  *  editopcost.c
 4 |  *
 5 |  *  Author: Stephen V. Rice
 6 |  *  
 7 |  * Copyright 1996 The Board of Regents of the Nevada System of Higher
 8 |  * Education, on behalf, of the University of Nevada, Las Vegas,
 9 |  * Information Science Research Institute
10 |  *
11 |  * Licensed under the Apache License, Version 2.0 (the "License"); you
12 |  * may not use this file except in compliance with the License.  You
13 |  * may obtain a copy of the License at
14 |  *
15 |  *    http://www.apache.org/licenses/LICENSE-2.0
16 |  *
17 |  * Unless required by applicable law or agreed to in writing, software
18 |  * distributed under the License is distributed on an "AS IS" BASIS,
19 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
20 |  * implied. See the License for the specific language governing
21 |  * permissions and limitations under the License.
22 |  *
23 |  **********************************************************************/
24 | 
25 | #include "edorpt.h"
26 | #include "util.h"
27 | 
28 | #define usage  "editop_report [editop_report2] >xyfile"	/* argument synopsis handed to initialize() */
29 | 
30 | Edodata edodata, edodata2;	/* first report, reduced by decrement_edodata() when a second is given; and that optional second report */
31 | 
32 | /**********************************************************************/
33 | 
34 | void decrement_edodata()
35 | {
36 |     short i;
37 |     edodata.total_insertions -= edodata2.total_insertions;
38 |     edodata.total_deletions  -= edodata2.total_deletions;
39 |     edodata.total_moves      -= edodata2.total_moves;
40 |     for (i = 1; i <= MAX_MOVE_LENGTH; i++)
41 | 	edodata.moves[i] -= edodata2.moves[i];
42 | }
43 | /**********************************************************************/
44 | 
45 | void write_results()
46 | {
47 |     long insertions, moves, i;
48 |     insertions = edodata.total_insertions;
49 |     moves = edodata.total_moves;
50 |     for (i = 0; i <= MAX_MOVE_LENGTH; i++)
51 |     {
52 | 	printf("%3ld %10ld\n", i, insertions + i * moves);
53 | 	insertions += i * edodata.moves[i];
54 | 	moves -= edodata.moves[i];
55 |     }
56 | }
57 | /**********************************************************************/
58 | 
59 | main(argc, argv)
60 | int argc;
61 | char *argv[];
62 | {
63 |     initialize(&argc, argv, usage, NULL);
64 |     if (argc < 1 || argc > 2)
65 | 	error("invalid number of files");
66 |     read_edorpt(&edodata, argv[0]);
67 |     if (argc == 2)
68 |     {
69 | 	read_edorpt(&edodata2, argv[1]);
70 | 	decrement_edodata();
71 |     }
72 |     write_results();
73 |     terminate();
74 | }
75 | 


--------------------------------------------------------------------------------
/src/accci.c:
--------------------------------------------------------------------------------
 1 | /**********************************************************************
 2 |  *
 3 |  *  accci.c
 4 |  *
 5 |  *  Author: Stephen V. Rice
 6 |  *  
 7 |  * Copyright 1996 The Board of Regents of the Nevada System of Higher
 8 |  * Education, on behalf, of the University of Nevada, Las Vegas,
 9 |  * Information Science Research Institute
10 |  *
11 |  * Licensed under the Apache License, Version 2.0 (the "License"); you
12 |  * may not use this file except in compliance with the License.  You
13 |  * may obtain a copy of the License at
14 |  *
15 |  *    http://www.apache.org/licenses/LICENSE-2.0
16 |  *
17 |  * Unless required by applicable law or agreed to in writing, software
18 |  * distributed under the License is distributed on an "AS IS" BASIS,
19 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
20 |  * implied. See the License for the specific language governing
21 |  * permissions and limitations under the License.
22 |  *
23 |  **********************************************************************/
24 | 
25 | #include "accrpt.h"
26 | #include "ci.h"
27 | 
28 | #define usage  "accuracy_report1 accuracy_report2 ... >resultfile"	/* argument synopsis handed to initialize() */
29 | 
30 | Accdata accdata;	/* running totals over all reports read so far; process_file() samples it before and after each read */
31 | Obslist obslist;	/* one observation per input report, appended by process_file() */
32 | 
33 | /**********************************************************************/
34 | 
35 | void process_file(filename)
36 | char *filename;
37 | {
38 |     long chars, errors;
39 |     chars  = accdata.characters;
40 |     errors = accdata.errors;
41 |     read_accrpt(&accdata, filename);
42 |     append_obs(&obslist, accdata.characters - chars, accdata.errors - errors);
43 | }
44 | /**********************************************************************/
45 | 
46 | void write_results()
47 | {
48 |     double lower, upper;
49 |     compute_ci(&obslist, &lower, &upper);
50 |     printf("%14ld   Observations\n", obslist.count);
51 |     printf("%14ld   Characters\n", accdata.characters);
52 |     printf("%14ld   Errors\n", accdata.errors);
53 |     printf("%14.2f%%  Accuracy\n",
54 |     100.0 * (accdata.characters - accdata.errors) / accdata.characters);
55 |     printf("%6.2f%%,%6.2f%%  %s\n", lower, upper,
56 |     "Approximate 95% Confidence Interval for Accuracy");
57 | }
58 | /**********************************************************************/
59 | 
60 | main(argc, argv)
61 | int argc;
62 | char *argv[];
63 | {
64 |     int i;
65 |     initialize(&argc, argv, usage, NULL);
66 |     if (argc < 2)
67 | 	error("not enough input files");
68 |     for (i = 0; i < argc; i++)
69 | 	process_file(argv[i]);
70 |     write_results();
71 |     terminate();
72 | }
73 | 


--------------------------------------------------------------------------------
/src/wordaccci.c:
--------------------------------------------------------------------------------
 1 | /**********************************************************************
 2 |  *
 3 |  *  wordaccci.c
 4 |  *
 5 |  *  Author: Stephen V. Rice
 6 |  *  
 7 |  * Copyright 1996 The Board of Regents of the Nevada System of Higher
 8 |  * Education, on behalf, of the University of Nevada, Las Vegas,
 9 |  * Information Science Research Institute
10 |  *
11 |  * Licensed under the Apache License, Version 2.0 (the "License"); you
12 |  * may not use this file except in compliance with the License.  You
13 |  * may obtain a copy of the License at
14 |  *
15 |  *    http://www.apache.org/licenses/LICENSE-2.0
16 |  *
17 |  * Unless required by applicable law or agreed to in writing, software
18 |  * distributed under the License is distributed on an "AS IS" BASIS,
19 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
20 |  * implied. See the License for the specific language governing
21 |  * permissions and limitations under the License.
22 |  *
23 |  **********************************************************************/
24 | 
25 | #include "ci.h"
26 | #include "wacrpt.h"
27 | 
/* argument synopsis handed to initialize() in main() */
#define usage  "wordacc_report1 wordacc_report2 ... >resultfile"

Wacdata wacdata;	/* report data passed to read_wacrpt() for every input
			   file; process_file() measures how much its totals
			   grow with each file read */
Obslist obslist;	/* one observation per input file, later handed to
			   compute_ci() */
32 | 
33 | /**********************************************************************/
34 | 
35 | void process_file(filename)
36 | char *filename;
37 | {
38 |     long count, missed;
39 |     count  = wacdata.total.count;
40 |     missed = wacdata.total.missed;
41 |     read_wacrpt(&wacdata, filename);
42 |     append_obs(&obslist, wacdata.total.count - count,
43 |     wacdata.total.missed - missed);
44 | }
45 | /**********************************************************************/
46 | 
47 | void write_results()
48 | {
49 |     double lower, upper;
50 |     compute_ci(&obslist, &lower, &upper);
51 |     printf("%14ld   Observations\n", obslist.count);
52 |     printf("%14ld   Words\n", wacdata.total.count);
53 |     printf("%14ld   Misrecognized\n", wacdata.total.missed);
54 |     printf("%14.2f%%  Accuracy\n",
55 |     100.0 * (wacdata.total.count - wacdata.total.missed) / wacdata.total.count);
56 |     printf("%6.2f%%,%6.2f%%  %s\n", lower, upper,
57 |     "Approximate 95% Confidence Interval for Accuracy");
58 | }
59 | /**********************************************************************/
60 | 
61 | main(argc, argv)
62 | int argc;
63 | char *argv[];
64 | {
65 |     int i;
66 |     initialize(&argc, argv, usage, NULL);
67 |     if (argc < 2)
68 | 	error("not enough input files");
69 |     for (i = 0; i < argc; i++)
70 | 	process_file(argv[i]);
71 |     write_results();
72 |     terminate();
73 | }
74 | 


--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
 1 | # Change Log
 2 | All notable changes to this project will be documented in this file.
 3 | This project adheres to [Semantic Versioning](http://semver.org/).
 4 | This file is inspired by [Keep a `CHANGELOG`](http://keepachangelog.com/).
 5 | 
 6 | ## [7.0.1] - 2018-11-21
 7 | ### Fixed
 8 | - Compile error involving inappropriate use of `ssize_t`
 9 | 
10 | ## [7.0.0] - 2018-11-21
11 | ### Changed
12 | - Changed name from `isri-ocr-evaluation-tools` to `ocreval` ([#21])
13 | 
14 | ## [6.1.2] - 2017-01-04
15 | ### Fixed
16 | - Read in UTF-8 characters in `accsum` ([#14])
17 | 
18 | ## [6.1.1] - 2016-02-22
19 | ### Fixed
20 | - No longer spuriously abort if inputs are longer than 65,536 characters ([#10])
21 | 
22 | ## [6.1.0] - 2016-01-01
23 | ### Added
24 | - `make exports` which outputs shell `export` commands (to avoid global installation)
25 | 
26 | ### Changed
27 | - More conventional directory layout ([#4])
28 | 
29 | ## [6.0.1] - 2016-01-04
30 | ### Fixed
31 | - Bug in implementation of [WB6](http://unicode.org/reports/tr29/#WB6)
32 | - Special case U+0020 SPACE ' ' as a graphic character
33 | - Clang warnings
34 | 
35 | ## [6.0.0] - 2016-01-04
36 | ### Added
37 | - Word segmentation using [Unicode word boundaries](http://unicode.org/reports/tr29/#Word_Boundaries).
38 | 
39 | ### Changed
40 | - Start following [SemVer](http://semver.org) properly.
41 | - All input and output is in UTF-8
42 | - Fixes to handle non-BMP code points (code points beyond U+FFFF)
43 | 
44 | ### Removed
45 | - `uni2asc` and `asc2uni` (redundant due to change to UTF-8)
46 | 
47 | ## [5.1.3] - 2015-11-15
48 | ### Changed
49 | - More idiomatic `make` build system
50 | 
51 | ### Fixed
52 | - Compiles on modern OS X and Ubuntu
53 | 
54 | [7.0.1]: https://github.com/eddieantonio/ocreval/compare/v7.0.0...v7.0.1
55 | [7.0.0]: https://github.com/eddieantonio/ocreval/compare/v6.1.2...v7.0.0
56 | [6.1.2]: https://github.com/eddieantonio/isri-ocr-evaluation-tools/compare/v6.1.1...v6.1.2
57 | [6.1.1]: https://github.com/eddieantonio/isri-ocr-evaluation-tools/compare/v6.1.0...v6.1.1
58 | [6.1.0]: https://github.com/eddieantonio/isri-ocr-evaluation-tools/compare/v6.0.1...v6.1.0
59 | [6.0.1]: https://github.com/eddieantonio/isri-ocr-evaluation-tools/compare/v6.0.0...v6.0.1
60 | [6.0.0]: https://github.com/eddieantonio/isri-ocr-evaluation-tools/compare/v5.1.3...v6.0.0
61 | [5.1.3]: https://github.com/eddieantonio/isri-ocr-evaluation-tools/compare/v5.1.0...v5.1.3
62 | 
63 | [#4]: https://github.com/eddieantonio/isri-ocr-evaluation-tools/issues/4
64 | [#10]: https://github.com/eddieantonio/isri-ocr-evaluation-tools/issues/10
65 | [#14]: https://github.com/eddieantonio/isri-ocr-evaluation-tools/issues/14
66 | [#21]: https://github.com/eddieantonio/ocreval/issues/21
67 | 


--------------------------------------------------------------------------------
/src/nonstopacc.c:
--------------------------------------------------------------------------------
 1 | /**********************************************************************
 2 |  *
 3 |  *  nonstopacc.c
 4 |  *
 5 |  *  Author: Stephen V. Rice
 6 |  *  
 7 |  * Copyright 1996 The Board of Regents of the Nevada System of Higher
 8 |  * Education, on behalf, of the University of Nevada, Las Vegas,
 9 |  * Information Science Research Institute
10 |  *
11 |  * Licensed under the Apache License, Version 2.0 (the "License"); you
12 |  * may not use this file except in compliance with the License.  You
13 |  * may obtain a copy of the License at
14 |  *
15 |  *    http://www.apache.org/licenses/LICENSE-2.0
16 |  *
17 |  * Unless required by applicable law or agreed to in writing, software
18 |  * distributed under the License is distributed on an "AS IS" BASIS,
19 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
20 |  * implied. See the License for the specific language governing
21 |  * permissions and limitations under the License.
22 |  *
23 |  **********************************************************************/
24 | 
25 | #include "wacrpt.h"
26 | 
/* argument synopsis handed to initialize() in main() */
#define usage  "stopwordfile wordacc_report >xyfile"

/* options given to read_text() for the stopword file; the meaning of each
   field is defined with Textopt, not here */
Textopt textopt = { False, False, 0, True, True, True };
Text text;		/* contents of the stopword file (first argument) */
Wordlist wordlist;	/* words that find_words() extracted from "text" */

Wacdata wacdata;	/* word accuracy report read from the second argument */

/* running totals: write_results() starts them at the report totals and
   find_stopword() subtracts from them */
long count, missed;
36 | 
37 | /**********************************************************************/
38 | 
39 | void find_stopword(termtable, string)
40 | Termtable *termtable;
41 | char *string;
42 | {
43 |     Term *term;
44 |     term = table_lookup(termtable, string);
45 |     if (term)
46 |     {
47 | 	count  -= term->wac.count;
48 | 	missed -= term->wac.missed;
49 |     }
50 | }
51 | /**********************************************************************/
52 | 
53 | void write_line()
54 | {
55 |     static long linenum = 0;
56 |     printf("%3ld %6.2f\n", linenum++, 100.0 * (count - missed) / count);
57 | }
58 | /**********************************************************************/
59 | 
60 | void write_results()
61 | {
62 |     Word *word;
63 |     count  = wacdata.total.count;
64 |     missed = wacdata.total.missed;
65 |     if (count == 0)
66 | 	return;
67 |     write_line();
68 |     for (word = wordlist.first; word; word = word->next)
69 |     {
70 | 	find_stopword(&wacdata.stopword_table, word->string);
71 | 	find_stopword(&wacdata.non_stopword_table, word->string);
72 | 	if (count == 0)
73 | 	    return;
74 | 	write_line();
75 |     }
76 | }
77 | /**********************************************************************/
78 | 
79 | main(argc, argv)
80 | int argc;
81 | char *argv[];
82 | {
83 |     initialize(&argc, argv, usage, NULL);
84 |     if (argc != 2)
85 | 	error("invalid number of files");
86 |     read_text(&text, argv[0], &textopt);
87 |     find_words(&wordlist, &text);
88 |     read_wacrpt(&wacdata, argv[1]);
89 |     write_results();
90 |     terminate();
91 | }
92 | 


--------------------------------------------------------------------------------
/src/groupacc.c:
--------------------------------------------------------------------------------
 1 | /**********************************************************************
 2 |  *
 3 |  *  groupacc.c
 4 |  *
 5 |  *  Author: Stephen V. Rice
 6 |  *  
 7 |  * Copyright 1996 The Board of Regents of the Nevada System of Higher
 8 |  * Education, on behalf, of the University of Nevada, Las Vegas,
 9 |  * Information Science Research Institute
10 |  *
11 |  * Licensed under the Apache License, Version 2.0 (the "License"); you
12 |  * may not use this file except in compliance with the License.  You
13 |  * may obtain a copy of the License at
14 |  *
15 |  *    http://www.apache.org/licenses/LICENSE-2.0
16 |  *
17 |  * Unless required by applicable law or agreed to in writing, software
18 |  * distributed under the License is distributed on an "AS IS" BASIS,
19 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
20 |  * implied. See the License for the specific language governing
21 |  * permissions and limitations under the License.
22 |  *
23 |  **********************************************************************/
24 | 
25 | #include "accrpt.h"
26 | 
/* argument synopsis handed to initialize() in main() */
#define usage  "groupfile accuracy_report [groupacc_report]"

/* options given to read_text() for the group file; the meaning of each field
   is defined with Textopt, not here.
   NOTE(review): only five initializers are given here, while other programs
   in this project give six; a trailing member, if any, is zero-initialized --
   confirm that is intended. */
Textopt textopt = { False, True, 0, True, True };

Text text;		/* characters read from the group file */

Accdata accdata;	/* accuracy report read from the second argument */
34 | 
35 | /**********************************************************************/
36 | 
/* Writes the three leading columns of a report row to "f": the count, the
   missed count, and the percentage right.  When "count" is zero the
   percentage is undefined and a dashed placeholder is written instead.
   No newline is written; the caller appends the row label. */
void write_line(f, count, missed)
FILE *f;
long count, missed;
{
    fprintf(f, "%8ld %8ld ", count, missed);
    if (count != 0)
    {
	double percent = 100.0 * (count - missed) / count;
	fprintf(f, "%8.2f   ", percent);
	return;
    }
    fputs("  ------   ", f);
}
47 | /**********************************************************************/
48 | 
49 | void write_report(filename)
50 | char *filename;
51 | {
52 |     FILE *f;
53 |     long total_count = 0, total_missed = 0, count, missed;
54 |     Char *c;
55 |     char buffer[STRING_SIZE];
56 |     f = open_file(filename, "w");
57 |     fprintf(f, "   Count   Missed   %%Right\n");
58 |     for (c = text.first; c; c = c->next)
59 | 	if (accdata.small_class[c->value].count > 0 && c->value != BLANK &&
60 | 	c->value != NEWLINE)
61 | 	{
62 | 	    count  = accdata.small_class[c->value].count;
63 | 	    missed = accdata.small_class[c->value].missed;
64 | 	    write_line(f, count, missed);
65 | 	    char_to_string(False, c->value, buffer, True);
66 | 	    fprintf(f, "{%s}\n", buffer);
67 | 	    total_count  += count;
68 | 	    total_missed += missed;
69 | 	}
70 |     write_line(f, total_count, total_missed);
71 |     fprintf(f, "Total\n");
72 |     close_file(f);
73 | }
74 | /**********************************************************************/
75 | 
76 | main(argc, argv)
77 | int argc;
78 | char *argv[];
79 | {
80 |     initialize(&argc, argv, usage, NULL);
81 |     if (argc < 2 || argc > 3)
82 | 	error("invalid number of files");
83 |     read_text(&text, argv[0], &textopt);
84 |     read_accrpt(&accdata, argv[1]);
85 |     write_report(argc == 3 ? argv[2] : NULL);
86 |     terminate();
87 | }
88 | 


--------------------------------------------------------------------------------
/src/ci.c:
--------------------------------------------------------------------------------
 1 | /**********************************************************************
 2 |  *
 3 |  *  ci.c
 4 |  *
 5 |  *  Author: Stephen V. Rice
 6 |  *  
 7 |  * Copyright 1996 The Board of Regents of the Nevada System of Higher
 8 |  * Education, on behalf, of the University of Nevada, Las Vegas,
 9 |  * Information Science Research Institute
10 |  *
11 |  * Licensed under the Apache License, Version 2.0 (the "License"); you
12 |  * may not use this file except in compliance with the License.  You
13 |  * may obtain a copy of the License at
14 |  *
15 |  *    http://www.apache.org/licenses/LICENSE-2.0
16 |  *
17 |  * Unless required by applicable law or agreed to in writing, software
18 |  * distributed under the License is distributed on an "AS IS" BASIS,
19 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
20 |  * implied. See the License for the specific language governing
21 |  * permissions and limitations under the License.
22 |  *
23 |  **********************************************************************/
24 | 
25 | #include <math.h>
26 | 
27 | #include "ci.h"
28 | #include "util.h"
29 | 
30 | /**********************************************************************/
31 | 
32 | void append_obs(obslist, count, missed)
33 | Obslist *obslist;
34 | long count, missed;
35 | {
36 |     Obs *obs;
37 |     obs = NEW(Obs);
38 |     obs->count  = count;
39 |     obs->missed = missed;
40 |     obslist->total.count  += count;
41 |     obslist->total.missed += missed;
42 |     list_insert_last(obslist, obs);
43 | }
44 | /**********************************************************************/
45 | 
46 | static Boolean valid_obslist(obslist)
47 | Obslist *obslist;
48 | {
49 |     long n = 0;
50 |     Obs *obs;
51 |     for (obs = obslist->first; obs; obs = obs->next)
52 | 	if (obs->count > 0)
53 | 	    n++;
54 |     return(n > 1 ? True : False);
55 | }
56 | /**********************************************************************/
57 | 
/* Returns the percentage of "count" that was not missed.  The division is
   done in floating point; callers are expected to pass a non-zero count. */
static double accuracy(count, missed)
long count, missed;
{
    long correct = count - missed;

    return(100.0 * correct / count);
}
63 | /**********************************************************************/
64 | 
65 | void compute_ci(obslist, lower, upper)
66 | Obslist *obslist;
67 | double *lower, *upper;
68 | {
69 |     long n;
70 |     double ntheta, sum = 0.0, w;
71 |     Obs *obs;
72 |     if (!valid_obslist(obslist))
73 | 	error("not enough observations");
74 |     n = obslist->count;
75 |     obslist->total.theta = accuracy(obslist->total.count,
76 |     obslist->total.missed);
77 |     ntheta = n * obslist->total.theta;
78 |     for (obs = obslist->first; obs; obs = obs->next)
79 |     {
80 | 	obs->theta = accuracy(obslist->total.count - obs->count,
81 | 	obslist->total.missed - obs->missed);
82 | 	obs->j = ntheta - (n - 1) * obs->theta;
83 | 	sum += obs->theta;
84 |     }
85 |     obslist->total.j = ntheta - (n - 1) * sum / n;
86 |     sum = 0.0;
87 |     for (obs = obslist->first; obs; obs = obs->next)
88 | 	sum += (obs->j - obslist->total.j) * (obs->j - obslist->total.j);
89 |     w = 1.96 * sqrt(sum / (n - 1) / n);
90 |     *lower = max(0.0, min(100.0, obslist->total.j - w));
91 |     *upper = max(0.0, min(100.0, obslist->total.j + w));
92 | }
93 | 


--------------------------------------------------------------------------------
/test/test_accuracy_large_files.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- encoding: UTF-8 -*-
 3 | 
 4 | """
 5 | Tests accuracy on LARGE UTF-8 files.
 6 | """
 7 | 
 8 | import random
 9 | import subprocess
10 | import tempfile
11 | 
12 | import os.path as p
13 | 
14 | 
# Alias range as xrange in Python 3, where the bare name `xrange` is not
# defined and looking it up raises NameError:
try:
    xrange
except NameError:
    xrange = range

# Create a Python 2/3 Unicode string literal: when `unicode` exists
# (Python 2) u() decodes the byte-string literal as UTF-8; otherwise u is
# simply `str`, which returns the text unchanged.
try:
    unicode
except NameError:
    u = str
else:
    u = lambda s: s.decode('UTF-8')

# Path to accuracy program: bin/accuracy under the parent of the directory
# containing this file (symlinks resolved).  The assert stops the script
# right away when that file does not exist.
ACCURACY = p.join(p.dirname(p.dirname(p.realpath(__file__))),
                  'bin', 'accuracy')
assert p.exists(ACCURACY), 'Could not find ' + ACCURACY
33 | 
34 | 
35 | # http://www.languagegeek.com/isolate/haidastory.html
36 | corpus = u('''\
37 | Aaniisuu tangaa g̱aging.ang ’wan suuga. ’Ll xidgwangaas, x̱uyaa’aa. Tllgu
38 | ḵ’aawgyaa hllng.aaygi ’laa ḵyaang.aas. Ḵawdiuu gwaay g̱ud gwaa nang ḵadlaa
39 | ḵ’ayg̱udyaas ’laagu ḵ’aawgaay g̱an ’laa g̱á ’laa xidaas. Á tl’l sg̱aana ḵidaads
40 | ’yaahlgaagaas g̱iinuus gangaang ’laagu gud gwii x̱iihlt’ahliyaagaas. Ga
41 | sg̱aanag̱waa g̱ax̱aas ’laa t’isda ḵ’a sḵ’agilaang.aas, tll gwii x̱an, hahl gwii’ad
42 | wah gwii’aa. G̱adagaas gyaanuu’asing g̱aalgaagaang ’wan suuga.
43 | 
44 | Nang kilsdlaas naag̱ag̱a.aw tadll chi’a’aawaagan. Sing ḵ’alg̱ada ’ll ḵaaxuhls
45 | gyaan ’ll kindagaang.aas. Sda ’laa xid k’udahldats’aasii gyaan gagu ’laa
46 | ḵ’aw’aawaasgu x̱an ’laa ḵ’aawgangas.
47 | ''')
48 | 
# Every whitespace-separated token of the corpus, in order of appearance
# (duplicates are kept).
dictionary = tuple(corpus.split())
# Every character of the corpus except spaces and newlines (duplicates kept).
alphabet = [symbol for symbol in corpus if symbol != ' ' and symbol != '\n']
51 | 
52 | 
def one_in(n):
    """Return True with probability 1/n, for a positive integer n."""
    # randrange(n) is uniform over 0..n-1.  Comparing against 0 (instead of 1)
    # keeps the odds at exactly 1/n for every n >= 1; comparing against 1 made
    # one_in(1) impossible, because the only value drawn is 0.
    return random.randrange(n) == 0
55 | 
56 | 
def change_letter(word):
    """Return a copy of word with one randomly chosen position replaced by a
    random character from `alphabet`.

    The replacement is drawn independently of the character it replaces, so
    the result can occasionally equal the input.
    """
    position = random.choice(xrange(len(word)))
    replacement = random.choice(alphabet)
    head, tail = word[:position], word[position + 1:]
    return head + replacement + tail
61 | 
62 | 
if __name__ == '__main__':
    import sys
    # Optional first argument: how many words to generate.
    amount_of_words = int(sys.argv[1]) if len(sys.argv) > 1 else 32768

    # Create temporary files for each...  Both stay open until the accuracy
    # program has been run, because it is given their names and they are
    # removed when the `with` block ends.
    with tempfile.NamedTemporaryFile('wb') as correct_file,\
            tempfile.NamedTemporaryFile('wb') as generated_file:

        # Separator written after the most recent word.  It starts as None so
        # that a run with zero words still reaches the final-newline check
        # below instead of failing with NameError.
        end = None

        # Generate A LOT of random words
        for _ in xrange(amount_of_words):
            end = b'\n' if one_in(10) else b' '

            word = random.choice(dictionary)
            correct_file.write(word.encode('UTF-8'))

            # Occasionally, typo a word in the generated file.
            generated_word = change_letter(word) if one_in(1000) else word
            generated_file.write(generated_word.encode('UTF-8'))

            # Write a space or newline.
            correct_file.write(end)
            generated_file.write(end)

        # Finish off the file with a new line and flush the output.
        if end != b'\n':
            correct_file.write(b'\n')
            generated_file.write(b'\n')

        correct_file.flush()
        generated_file.flush()

        # This will fail if accuracy itself fails.
        subprocess.check_call([ACCURACY,
                               correct_file.name, generated_file.name])
97 | 


--------------------------------------------------------------------------------
/src/sync.h:
--------------------------------------------------------------------------------
 1 | /**********************************************************************
 2 |  *
 3 |  *  sync.h
 4 |  *
 5 |  *  This module provides support for string synchronization.  There are
 6 |  *  two algorithms based on the detection of long common substrings:
 7 |  *  "synchronize" can match two or more text streams but does not find
 8 |  *  transposed matches, while "transpose_sync" locates transposed
 9 |  *  matches but can be applied to only two text streams.  A third
10 |  *  algorithm, "fastukk_sync", is based on an algorithm by Ukkonen, and
11 |  *  finds an optimal match of two text streams using cost function
12 |  *  (1,1,1).
13 |  *
14 |  *  Author: Stephen V. Rice
15 |  *  
16 |  * Copyright 1996 The Board of Regents of the Nevada System of Higher
17 |  * Education, on behalf, of the University of Nevada, Las Vegas,
18 |  * Information Science Research Institute
19 |  *
20 |  * Licensed under the Apache License, Version 2.0 (the "License"); you
21 |  * may not use this file except in compliance with the License.  You
22 |  * may obtain a copy of the License at
23 |  *
24 |  *    http://www.apache.org/licenses/LICENSE-2.0
25 |  *
26 |  * Unless required by applicable law or agreed to in writing, software
27 |  * distributed under the License is distributed on an "AS IS" BASIS,
28 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
29 |  * implied. See the License for the specific language governing
30 |  * permissions and limitations under the License.
31 |  *
32 |  **********************************************************************/
33 | 
34 | #ifndef _SYNC_
35 | #define _SYNC_
36 | 
37 | #include "text.h"
38 | 
/* One substring of a text value, given by positions rather than pointers.
   "stop" is the index of the last character itself, i.e. the range is
   inclusive at both ends. */
typedef
struct
{
    long start;		/* index of the first character of the substring */
    long stop;		/* index of the last character of the substring */
    long length;	/* length of the substring in characters */
} Substr;		/* describes a substring of a text value by specifying
			   its indices within the "array" representation of the
			   text value */
48 | 
/* One entry of a synchronization result: a group of substrings (one per text
   value) that are either matched to each other or unmatched. */
BEGIN_ITEM(Sync)
    Substr *substr;	/* array describing one substring for each text value */
    long *match;	/* if non-NULL, then the substrings have been matched
			   and this points to the match number; otherwise, the
			   substrings are unmatched */
END_ITEM(Sync);

/* Linked list of Sync entries; it declares no fields beyond those supplied
   by BEGIN_LIST_OF. */
BEGIN_LIST_OF(Sync)
END_LIST(Synclist);	/* list of matched and unmatched substrings */
58 | 
59 | void synchronize(/* Synclist *synclist, short num_text, Text *text */);
60 | 			/* given "num_text" streams of text, synchronizes the
61 | 			   streams and stores the results in "synclist"; each
62 | 			   item in the list points to an array of "num_text"
63 | 			   substrings */
64 | 
65 | void transpose_sync(/* Synclist *synclist1, Synclist *synclist2, 
66 | 		       Text *text1, Text *text2 */);
67 | 			/* synchronizes two streams of text while allowing for
68 | 			   transposed matches; each stream has its results
69 | 			   stored in its own list, and each item of its list
70 | 			   points to only a single substring */
71 | 
72 | void fastukk_sync(/* Synclist *synclist, Text *text */);
73 | 			/* given two streams in the array "text", synchronizes
74 | 			   them optimally and stores the results in "synclist";
75 | 			   each item in the list points to an array of two
76 | 			   substrings */
77 | 
78 | #endif
79 | 


--------------------------------------------------------------------------------
/src/wordfreq.c:
--------------------------------------------------------------------------------
  1 | /**********************************************************************
  2 |  *
  3 |  *  wordfreq.c
  4 |  *
  5 |  *  Author: Stephen V. Rice
  6 |  *  
  7 |  * Copyright 1996 The Board of Regents of the Nevada System of Higher
  8 |  * Education, on behalf, of the University of Nevada, Las Vegas,
  9 |  * Information Science Research Institute
 10 |  *
 11 |  * Licensed under the Apache License, Version 2.0 (the "License"); you
 12 |  * may not use this file except in compliance with the License.  You
 13 |  * may obtain a copy of the License at
 14 |  *
 15 |  *    http://www.apache.org/licenses/LICENSE-2.0
 16 |  *
 17 |  * Unless required by applicable law or agreed to in writing, software
 18 |  * distributed under the License is distributed on an "AS IS" BASIS,
 19 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 20 |  * implied. See the License for the specific language governing
 21 |  * permissions and limitations under the License.
 22 |  *
 23 |  **********************************************************************/
 24 | 
 25 | #include "sort.h"
 26 | #include "wacrpt.h"
 27 | 
/* argument synopsis handed to initialize() in main() */
#define usage  "textfile1 textfile2 ... >resultfile"

/* options given to read_text() for every input file; the meaning of each
   field is defined with Textopt, not here */
Textopt textopt = { True, True, 0, True, True, True };
Text text;		/* characters of the file currently being processed;
			   emptied again at the end of process_file() */

Wordlist wordlist;	/* words of the current file; emptied per file too */

Termtable termtable;	/* accumulates one term per distinct word across all
			   input files */
 36 | 
 37 | /**********************************************************************/
 38 | 
 39 | void process_file(filename)
 40 | char *filename;
 41 | {
 42 |     Word *word;
 43 |     read_text(&text, filename, &textopt);
 44 |     find_words(&wordlist, &text);
 45 |     for (word = wordlist.first; word; word = word->next)
 46 | 	add_term(&termtable, word->string, 1, 0);
 47 |     list_empty(&text, free);
 48 |     list_empty(&wordlist, free_word);
 49 | }
 50 | /**********************************************************************/
 51 | 
 52 | int order_by_key(term1, term2)
 53 | Term *term1, *term2;
 54 | {
 55 |     return(ustrcmp(term1->key, term2->key));
 56 | }
 57 | /**********************************************************************/
 58 | 
 59 | int order_by_count(term1, term2)
 60 | Term *term1, *term2;
 61 | {
 62 |     if (term1->wac.count != term2->wac.count)
 63 | 	return(term2->wac.count - term1->wac.count);
 64 |     return(order_by_key(term1, term2));
 65 | }
 66 | /**********************************************************************/
 67 | 
 68 | void write_array()
 69 | {
 70 |     long i, total = 0;
 71 |     printf("   Count\n");
 72 |     for (i = 0; i < termtable.count; i++)
 73 |     {
 74 | 	printf("%8ld   %s\n", termtable.array[i]->wac.count,
 75 | 	termtable.array[i]->key);
 76 | 	total += termtable.array[i]->wac.count;
 77 |     }
 78 |     printf("%8ld   Total\n", total);
 79 | }
 80 | /**********************************************************************/
 81 | 
 82 | void write_report()
 83 | {
 84 |     table_in_array(&termtable);
 85 |     sort(termtable.count, termtable.array, order_by_key);
 86 |     write_array();
 87 |     printf("\n\n");
 88 |     sort(termtable.count, termtable.array, order_by_count);
 89 |     write_array();
 90 | }
 91 | /**********************************************************************/
 92 | 
 93 | main(argc, argv)
 94 | int argc;
 95 | char *argv[];
 96 | {
 97 |     int i;
 98 |     initialize(&argc, argv, usage, NULL);
 99 |     if (argc == 0)
100 | 	error("no text files specified");
101 |     for (i = 0; i < argc; i++)
102 | 	process_file(argv[i]);
103 |     write_report();
104 |     terminate();
105 | }
106 | 


--------------------------------------------------------------------------------
/test/text_test.c:
--------------------------------------------------------------------------------
  1 | #include "greatest.h"
  2 | #include "test_utils.h"
  3 | 
  4 | #include <list.h>
  5 | #include <text.h>
  6 | 
  7 | TEST cstring_to_text_should_handle_ascii_strings() {
  8 |     ASSERT(cstring_to_text(text, "hello"));
  9 |     ASSERT_EQ_FMT(5, text->count, "%d");
 10 | 
 11 |     PASS();
 12 | }
 13 | 
 14 | /* Handle a 2 character UTF-8 string. */
 15 | TEST cstring_to_text_should_handle_latin() {
 16 |     ASSERT(cstring_to_text(text, "łódź"));
 17 |     ASSERT_EQ_FMT(4, text->count, "%d");
 18 | 
 19 |     PASS();
 20 | }
 21 | 
 22 | /* Handle a 3 character UTF-8 string. */
 23 | TEST cstring_to_text_should_handle_bmp() {
 24 |     ASSERT(cstring_to_text(text, "働"));
 25 |     ASSERT_EQ_FMT(1, text->count, "%d");
 26 | 
 27 |     PASS();
 28 | }
 29 | 
 30 | /* Handle a 4 character UTF-8 string. */
 31 | TEST cstring_to_text_should_handle_astral_code_points() {
 32 |     /* You could say I'm a flan of this test case. */
 33 |     ASSERT(cstring_to_text(text, "🍮"));
 34 |     ASSERT_EQ_FMT(1, text->count, "%d");
 35 | 
 36 |     PASS();
 37 | }
 38 | 
/* With these arguments, a space comes out as a literal space rather than an
 * escaped form. */
TEST char_to_string_converts_space() {
    char buffer[STRING_SIZE];

    char_to_string(False, ' ', buffer, False);
    ASSERT_STR_EQ(" ", buffer);

    PASS();
}
 47 | 
/* A printable ASCII character comes out unchanged. */
TEST char_to_string_converts_printable_ascii() {
    char buffer[STRING_SIZE];

    char_to_string(False, '%', buffer, False);
    ASSERT_STR_EQ("%", buffer);

    PASS();
}
 56 | 
/* A control character (here 0x07) comes out as its two-digit hexadecimal
 * value inside angle brackets. */
TEST char_to_string_converts_non_printable_ascii() {
    char buffer[STRING_SIZE];

    char_to_string(False, 0x0007, buffer, False);
    ASSERT_STR_EQ("<07>", buffer);

    PASS();
}
 65 | 
/* A printable non-ASCII code point (U+50CD) comes out as that character's
 * UTF-8 encoding. */
TEST char_to_string_converts_printable_bmp() {
    char buffer[STRING_SIZE];

    char_to_string(False, 0x50cd, buffer, False);
    ASSERT_STR_EQ("働", buffer);

    PASS();
}
 74 | 
/* A code point above U+FFFF (U+101E1) comes out as that character's UTF-8
 * encoding. */
TEST char_to_string_converts_astral_code_points() {
    char buffer[STRING_SIZE];

    char_to_string(False, 0x0101E1, buffer, False);
    ASSERT_STR_EQ("𐇡", buffer);

    PASS();
}
 83 | 
/* A combining mark (U+0309) comes out preceded by a dotted circle, so the
 * expected string is the dotted circle followed by the mark. */
TEST char_to_string_converts_bmp_combiner() {
    char buffer[STRING_SIZE];

    char_to_string(False, 0x0309, buffer, False);
    ASSERT_STR_EQ("◌̉", buffer);

    PASS();
}
 92 | 
/* A combining mark above U+FFFF (U+101FD) is likewise preceded by a dotted
 * circle in the output. */
TEST char_to_string_converts_astral_combiner() {
    char buffer[STRING_SIZE];

    char_to_string(False, 0x0101FD, buffer, False);
    ASSERT_STR_EQ("◌𐇽", buffer);

    PASS();
}
101 | 
/* Runs the cstring_to_text tests.  The setup and teardown hooks both receive
 * a NULL-terminated array holding "text" (declared outside this file), the
 * object every test in this suite converts into. */
SUITE(cstring_to_text_suite) {
    SET_SETUP(initialize_texts, (Text*[]) {text, NULL});
    SET_TEARDOWN(deinitialize_texts, (Text*[]) {text, NULL});

    RUN_TEST(cstring_to_text_should_handle_ascii_strings);
    RUN_TEST(cstring_to_text_should_handle_latin);
    RUN_TEST(cstring_to_text_should_handle_bmp);
    RUN_TEST(cstring_to_text_should_handle_astral_code_points);
}
111 | 
/* Runs the char_to_string tests.  No setup or teardown is registered here:
 * each test writes into its own local buffer. */
SUITE(char_to_string_suite) {
    RUN_TEST(char_to_string_converts_printable_ascii);
    RUN_TEST(char_to_string_converts_space);
    RUN_TEST(char_to_string_converts_non_printable_ascii);
    RUN_TEST(char_to_string_converts_printable_bmp);
    RUN_TEST(char_to_string_converts_astral_code_points);
    RUN_TEST(char_to_string_converts_bmp_combiner);
    RUN_TEST(char_to_string_converts_astral_combiner);
}
121 | 


--------------------------------------------------------------------------------
/src/list.h:
--------------------------------------------------------------------------------
  1 | /**********************************************************************
  2 |  *
  3 |  *  list.h
  4 |  *
  5 |  *  This module provides a general-purpose linked-list capability.  A
  6 |  *  "list" contains zero or more "items".
  7 |  *
  8 |  *  An item is a record structure having the following initial fields:
  9 |  *
 10 |  *	Item *prev;	- pointer to the previous item in the list
 11 |  *	Item *next;	- pointer to the next item in the list
 12 |  *
 13 |  *  This structure is declared by
 14 |  *
 15 |  *  BEGIN_ITEM(Item)
 16 |  * 	<declare other fields here>
 17 |  *  END_ITEM(Item);
 18 |  *
 19 |  *  A list is a record structure having the following initial fields:
 20 |  *
 21 |  *	Item *first;	- pointer to the first item in the list
 22 |  *	Item *last;	- pointer to the last item in the list
 23 |  *	Item *array[];  - array of pointers to the items in the list
 24 |  *	long count;	- number of items in the list
 25 |  *
 26 |  *  This structure is declared by
 27 |  *
 28 |  *  BEGIN_LIST_OF(Item)
 29 |  *	<declare other fields here, if any>
 30 |  *  END_LIST(List);
 31 |  *
 32 |  *  Author: Stephen V. Rice
 33 |  *  
 34 |  * Copyright 1996 The Board of Regents of the Nevada System of Higher
 35 |  * Education, on behalf, of the University of Nevada, Las Vegas,
 36 |  * Information Science Research Institute
 37 |  *
 38 |  * Licensed under the Apache License, Version 2.0 (the "License"); you
 39 |  * may not use this file except in compliance with the License.  You
 40 |  * may obtain a copy of the License at
 41 |  *
 42 |  *    http://www.apache.org/licenses/LICENSE-2.0
 43 |  *
 44 |  * Unless required by applicable law or agreed to in writing, software
 45 |  * distributed under the License is distributed on an "AS IS" BASIS,
 46 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 47 |  * implied. See the License for the specific language governing
 48 |  * permissions and limitations under the License.
 49 |  *
 50 |  **********************************************************************/
 51 | 
 52 | #ifndef _LIST_
 53 | #define _LIST_
 54 | 
/* Opens the declaration of an item type: a struct tagged "Item" whose first
   fields are the "prev" and "next" links.  The caller's own fields follow
   the macro. */
#define BEGIN_ITEM(Item)		\
typedef					\
struct Item				\
{					\
    struct Item *prev, *next;

/* Closes the declaration opened by BEGIN_ITEM and names the typedef. */
#define END_ITEM(Item)			\
} Item

/* Opens the declaration of a list type whose items are of type "Item": an
   untagged struct that starts with the "first"/"last" pointers, the "array"
   of item pointers and the item "count".  Extra fields may follow. */
#define BEGIN_LIST_OF(Item)		\
typedef					\
struct					\
{					\
    Item *first, *last, **array;	\
    long count;

/* Closes the declaration opened by BEGIN_LIST_OF and names the typedef. */
#define END_LIST(List)			\
} List
 73 | 
/*
 * The routines below are declared without parameter types; the intended
 * arguments are shown in the comment inside each parameter list.
 * NOTE(review): presumably this lets any structure pair declared with
 * BEGIN_ITEM / BEGIN_LIST_OF be passed without casts -- confirm before
 * converting these to full prototypes.
 */

void list_initialize(/* List *list */);
			/* resets the list to empty (first, last and array are
			   set to NULL, count to 0); this routine does not need
			   to be called if the list structure was initialized
			   statically or dynamically */

void list_insert_first(/* List *list, Item *item */);
			/* inserts the item at the beginning of the list */

void list_insert_last(/* List *list, Item *item */);
			/* inserts the item at the end of the list */

void list_insert_before(/* List *list, Item *item, Item *ref */);
			/* inserts the item before "ref" in the list; note that
			   here the new item precedes "ref" in the argument
			   list */

void list_insert_after(/* List *list, Item *ref, Item *item */);
			/* inserts the item after "ref" in the list; note that
			   here "ref" precedes the new item in the argument
			   list */

void list_remove(/* List *list, Item *item */);
			/* removes the item from the list */

void list_in_array(/* List *list */);
			/* if the list is non-empty, creates and initializes
			   the array of pointers to the items in the list */

void list_empty(/* List *list, void (*process_item)(Item *item) */);
			/* empties the list; as each item is removed, the given
			   routine is called to process it */
101 | 
102 | #endif
103 | 


--------------------------------------------------------------------------------
/src/wacrpt.h:
--------------------------------------------------------------------------------
 1 | /**********************************************************************
 2 |  *
 3 |  *  wacrpt.h
 4 |  *
 5 |  *  This module provides support for reading and writing word accuracy
 6 |  *  reports.  The contents of one of these reports is represented by a
 7 |  *  "Wacdata" structure.
 8 |  *
 9 |  *  Author: Stephen V. Rice
10 |  *  
11 |  * Copyright 1996 The Board of Regents of the Nevada System of Higher
12 |  * Education, on behalf, of the University of Nevada, Las Vegas,
13 |  * Information Science Research Institute
14 |  *
15 |  * Licensed under the Apache License, Version 2.0 (the "License"); you
16 |  * may not use this file except in compliance with the License.  You
17 |  * may obtain a copy of the License at
18 |  *
19 |  *    http://www.apache.org/licenses/LICENSE-2.0
20 |  *
21 |  * Unless required by applicable law or agreed to in writing, software
22 |  * distributed under the License is distributed on an "AS IS" BASIS,
23 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
24 |  * implied. See the License for the specific language governing
25 |  * permissions and limitations under the License.
26 |  *
27 |  **********************************************************************/
28 | 
29 | #ifndef _WACRPT_
30 | #define _WACRPT_
31 | 
32 | #include "table.h"
33 | #include "word.h"
34 | 
/* largest per-page occurrence count that has its own element in
   Wacdata.distinct_non_stopword; higher counts share one extra element */
#define MAX_OCCURRENCES  10
/* largest phrase length (in words) that has an element in Wacdata.phrase */
#define MAX_PHRASELENGTH  8
37 | 
/* Word accuracy tally: how many words were seen and how many of those
   were misrecognized. */
typedef struct {
    long count;     /* number of word occurrences */
    long missed;    /* number of these that were misrecognized */
} Wac;
44 | 
/* Term is a table entry (BEGIN_ENTRY from table.h supplies the list links
   and the "key" string) that carries the accuracy tally of one word. */
BEGIN_ENTRY(Term)
    Wac wac;
END_ENTRY(Term);	/* a distinct word */

/* Termlist is the per-bucket list type and Termtable the hash table type
   generated by BEGIN_TABLE_OF for Term entries. */
BEGIN_TABLE_OF(Term, Termlist)
END_TABLE(Termtable);	/* table of distinct words */
51 | 
52 | typedef
53 | struct
54 | {
55 |     Wac total;		/* all words */
56 |     Wac stopword[MAX_WORDLENGTH + 1];
57 | 			/* stopwords by word length (in characters); the total
58 | 			   for all stopwords is in the 0th element */
59 |     Wac non_stopword[MAX_WORDLENGTH + 1];
60 | 			/* non-stopwords by word length (in characters); the
61 | 			   total for all non-stopwords is in the 0th element */
62 |     Wac distinct_non_stopword[MAX_OCCURRENCES + 2];
63 | 			/* distinct non-stopwords by number of occurrences on
64 | 			   a page; the (MAX_OCCURRENCES + 1) element groups
65 | 			   all occurring more than MAX_OCCURRENCES times; the
66 | 			   total for all distinct non-stopwords is in the 0th
67 | 			   element */
68 |     Wac phrase[MAX_PHRASELENGTH + 1];
69 | 			/* phrases by phrase length (in words); the 0th element
70 | 			   is not used */
71 |     Termtable stopword_table;
72 | 			/* table of distinct stopwords */
73 |     Termtable non_stopword_table;
74 | 			/* table of distinct non-stopwords */
75 | } Wacdata;
76 | 
/*
 * The routines below are declared without parameter types; the intended
 * arguments are shown in the comment inside each parameter list.
 */

void increment_wac(/* Wac *wac, long count, long missed */);
			/* adds "count" and "missed" to the respective fields
			   of "wac" */

void add_term(/* Termtable *termtable, char *key, long count, long missed */);
			/* adds the given word to "termtable"; "key" contains
			   the character string representation of the word;
			   a copy of this string is stored in the table, so the
			   caller keeps ownership of "key" */

void read_wacrpt(/* Wacdata *wacdata, char *filename */);
			/* reads the named file (or stdin if "filename" is NULL)
			   and adds its contents to "wacdata"; reports an error
			   and quits if unable to open the file, or if the file
			   does not contain a word accuracy report */

void write_wacrpt(/* Wacdata *wacdata, char *filename */);
			/* writes the contents of "wacdata" to the named file
			   (or stdout if "filename" is NULL); reports an error
			   and quits if unable to create the file */
96 | 
97 | #endif
98 | 


--------------------------------------------------------------------------------
/src/table.h:
--------------------------------------------------------------------------------
  1 | /**********************************************************************
  2 |  *
  3 |  *  table.h
  4 |  *
  5 |  *  This module provides a general-purpose hash table capability.  A
  6 |  *  "table" contains zero or more "entries".
  7 |  *
  8 |  *  An entry is a record structure having the following initial fields:
  9 |  *
 10 |  *	Entry *prev;	- pointer to the previous entry in the list
 11 |  *	Entry *next;	- pointer to the next entry in the list
 12 |  *	char *key;	- null-terminated string containing the hash key
 13 |  *
 14 |  *  This structure is declared by
 15 |  *
 16 |  *  BEGIN_ENTRY(Entry)
 17 |  *	<declare other fields here>
 18 |  *  END_ENTRY(Entry);
 19 |  *
 20 |  *  A table is a record structure having the following initial fields:
 21 |  *
 22 |  *	Entrylist list[TABLE_SIZE];
 23 |  *			- array of lists of entries
 24 |  *	Entry *array[]; - array of pointers to the entries in the table
 25 |  *	long count;	- number of entries in the table
 26 |  *
 27 |  *  The following declares this structure and the Entrylist structure:
 28 |  *
 29 |  *  BEGIN_TABLE_OF(Entry, Entrylist)
 30 |  *	<declare other fields here, if any>
 31 |  *  END_TABLE(Table);
 32 |  *
 33 |  *  Author: Stephen V. Rice
 34 |  *  
 35 |  * Copyright 1996 The Board of Regents of the Nevada System of Higher
 36 |  * Education, on behalf, of the University of Nevada, Las Vegas,
 37 |  * Information Science Research Institute
 38 |  *
 39 |  * Licensed under the Apache License, Version 2.0 (the "License"); you
 40 |  * may not use this file except in compliance with the License.  You
 41 |  * may obtain a copy of the License at
 42 |  *
 43 |  *    http://www.apache.org/licenses/LICENSE-2.0
 44 |  *
 45 |  * Unless required by applicable law or agreed to in writing, software
 46 |  * distributed under the License is distributed on an "AS IS" BASIS,
 47 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 48 |  * implied. See the License for the specific language governing
 49 |  * permissions and limitations under the License.
 50 |  *
 51 |  **********************************************************************/
 52 | 
 53 | #ifndef _TABLE_
 54 | #define _TABLE_
 55 | 
 56 | #include "list.h"
 57 | 
/* number of hash buckets (entry lists) in every table; 503 is prime */
#define TABLE_SIZE  503
 59 | 
 60 | #define BEGIN_ENTRY(Entry)			\
 61 | BEGIN_ITEM(Entry)				\
 62 |     char *key;
 63 | 
 64 | #define END_ENTRY(Entry)			\
 65 | END_ITEM(Entry)
 66 | 
 67 | #define BEGIN_TABLE_OF(Entry, Entrylist)	\
 68 | BEGIN_LIST_OF(Entry)				\
 69 | END_LIST(Entrylist);				\
 70 | typedef						\
 71 | struct						\
 72 | {						\
 73 |     Entrylist list[TABLE_SIZE];			\
 74 |     Entry **array;				\
 75 |     long count;
 76 | 
 77 | #define END_TABLE(Table)			\
 78 | } Table
 79 | 
/*
 * The routines below are declared without parameter types; the intended
 * arguments are shown in the comment inside each parameter list.
 */

void table_initialize(/* Table *table */);
			/* initializes the table (every bucket list is
			   initialized, the array pointer is set to NULL and
			   the count to 0); this routine does not need to
			   be called if the table structure was initialized
			   statically or dynamically */

void *table_lookup(/* Table *table, char *key */);
			/* searches the table for an entry having the specified
			   key value (keys are compared with strcmp); returns a
			   pointer to it if found; returns NULL if not found */

void table_insert(/* Table *table, Entry *entry */);
			/* inserts the entry into the table, at the front of
			   the bucket list selected by its key; no check is
			   made for an existing entry with the same key; any
			   array built by table_in_array is discarded */

void table_remove(/* Table *table, Entry *entry */);
			/* removes the entry from the table; any array built
			   by table_in_array is discarded */

void table_in_array(/* Table *table */);
			/* if the table is non-empty, creates and initializes
			   the array of pointers to the entries in the table;
			   does nothing if the array already exists */

void table_empty(/* Table *table, void (*process_entry)(Entry *entry) */);
			/* empties the table; as each entry is removed, the
			   given routine is called to process it */
103 | 
104 | #endif
105 | 


--------------------------------------------------------------------------------
/src/table.c:
--------------------------------------------------------------------------------
  1 | /**********************************************************************
  2 |  *
  3 |  *  table.c
  4 |  *
  5 |  *  Author: Stephen V. Rice
  6 |  *  
  7 |  * Copyright 1996 The Board of Regents of the Nevada System of Higher
  8 |  * Education, on behalf, of the University of Nevada, Las Vegas,
  9 |  * Information Science Research Institute
 10 |  *
 11 |  * Licensed under the Apache License, Version 2.0 (the "License"); you
 12 |  * may not use this file except in compliance with the License.  You
 13 |  * may obtain a copy of the License at
 14 |  *
 15 |  *    http://www.apache.org/licenses/LICENSE-2.0
 16 |  *
 17 |  * Unless required by applicable law or agreed to in writing, software
 18 |  * distributed under the License is distributed on an "AS IS" BASIS,
 19 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 20 |  * implied. See the License for the specific language governing
 21 |  * permissions and limitations under the License.
 22 |  *
 23 |  **********************************************************************/
 24 | 
 25 | #include "table.h"
 26 | #include "util.h"
 27 | 
/* Minimal entry type, holding only the link fields and the key supplied
   by BEGIN_ENTRY; the routines in this file access entries through it. */
BEGIN_ENTRY(Entry)
END_ENTRY(Entry);

/* Minimal table type (bucket lists, array and count only) through which
   the routines in this file access tables. */
BEGIN_TABLE_OF(Entry, Entrylist)
END_TABLE(Table);
 33 | 
 34 | /**********************************************************************/
 35 | 
 36 | void table_initialize(table)
 37 | Table *table;
 38 | {
 39 |     short i;
 40 |     for (i = 0; i < TABLE_SIZE; i++)
 41 | 	list_initialize(&table->list[i]);
 42 |     table->array = NULL;
 43 |     table->count = 0;
 44 | }
 45 | /**********************************************************************/
 46 | 
 47 | static short table_index(key_string)
 48 | const char *key_string;
 49 | {
 50 |     const unsigned char *key = (const unsigned char*) key_string;
 51 |     long i, sum = 0;
 52 | 
 53 |     for (i = 0; key[i]; i++)
 54 | 	sum += key[i];
 55 |     return(sum % TABLE_SIZE);
 56 | }
 57 | /**********************************************************************/
 58 | 
 59 | void *table_lookup(table, key)
 60 | Table *table;
 61 | char *key;
 62 | {
 63 |     Entry *entry;
 64 |     for (entry = table->list[table_index(key)].first; entry &&
 65 |     strcmp(key, entry->key) != 0; entry = entry->next);
 66 |     return(entry);
 67 | }
 68 | /**********************************************************************/
 69 | 
 70 | static void free_array(table)
 71 | Table *table;
 72 | {
 73 |     if (table->array)
 74 |     {
 75 | 	free(table->array);
 76 | 	table->array = NULL;
 77 |     }
 78 | }
 79 | /**********************************************************************/
 80 | 
 81 | void table_insert(table, entry)
 82 | Table *table;
 83 | Entry *entry;
 84 | {
 85 |     list_insert_first(&table->list[table_index(entry->key)], entry);
 86 |     free_array(table);
 87 |     table->count++;
 88 | }
 89 | /**********************************************************************/
 90 | 
 91 | void table_remove(table, entry)
 92 | Table *table;
 93 | Entry *entry;
 94 | {
 95 |     list_remove(&table->list[table_index(entry->key)], entry);
 96 |     free_array(table);
 97 |     table->count--;
 98 | }
 99 | /**********************************************************************/
100 | 
101 | void table_in_array(table)
102 | Table *table;
103 | {
104 |     Entry *entry;
105 |     long i, j = 0;
106 |     if (table->array || table->count == 0)
107 | 	return;
108 |     table->array = NEW_ARRAY(table->count, Entry *);
109 |     for (i = 0; i < TABLE_SIZE; i++)
110 | 	for (entry = table->list[i].first; entry; entry = entry->next)
111 | 	    table->array[j++] = entry;
112 | }
113 | /**********************************************************************/
114 | 
115 | void table_empty(table, process_entry)
116 | Table *table;
117 | void (*process_entry)();
118 | {
119 |     short i;
120 |     for (i = 0; i < TABLE_SIZE; i++)
121 | 	list_empty(&table->list[i], process_entry);
122 |     free_array(table);
123 |     table->count = 0;
124 | }
125 | 


--------------------------------------------------------------------------------
/src/stopword.c:
--------------------------------------------------------------------------------
  1 | /**********************************************************************
  2 |  *
  3 |  *  stopword.c
  4 |  *
  5 |  *  Author: Stephen V. Rice
  6 |  *  
  7 |  * Copyright 1996 The Board of Regents of the Nevada System of Higher
  8 |  * Education, on behalf, of the University of Nevada, Las Vegas,
  9 |  * Information Science Research Institute
 10 |  *
 11 |  * Licensed under the Apache License, Version 2.0 (the "License"); you
 12 |  * may not use this file except in compliance with the License.  You
 13 |  * may obtain a copy of the License at
 14 |  *
 15 |  *    http://www.apache.org/licenses/LICENSE-2.0
 16 |  *
 17 |  * Unless required by applicable law or agreed to in writing, software
 18 |  * distributed under the License is distributed on an "AS IS" BASIS,
 19 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 20 |  * implied. See the License for the specific language governing
 21 |  * permissions and limitations under the License.
 22 |  *
 23 |  **********************************************************************/
 24 | 
 25 | #include "stopword.h"
 26 | #include "table.h"
 27 | #include "word.h"
 28 | 
/* A stopword is a table entry with no fields beyond the key. */
BEGIN_ENTRY(Stopword)
END_ENTRY(Stopword);

BEGIN_TABLE_OF(Stopword, Stopwordlist)
END_TABLE(Stopwordtable);
/* the one table of stopwords, filled by init_stopwords */
static Stopwordtable stopwordtable;
/* set once init_stopwords has run; checked by is_stopword */
static Boolean initialized = False;

/* options passed to read_text when loading a stopword file
   NOTE(review): the meaning of each initializer is fixed by the Textopt
   declaration, which is not in this file (presumably text.h) -- confirm */
static Textopt textopt = { False, False, 0, True, True, True };
/* text of the stopword file; its items are freed once the words are found */
static Text text;
/* words found in the stopword file; their strings become the table keys */
static Wordlist wordlist;
 40 | 
/* Built-in stopwords, in alphabetical order; init_stopwords loads these
   when it is called with a NULL filename. */
static char *default_stopword[] =
{
"a",		"about",	"after",	"all",		"also",
"an",		"and",		"any",		"are",		"as",
"at",		"back",		"be",		"because",	"been",
"but",		"by",		"can",		"could",	"did",
"do",		"does",		"down",		"each",		"first",
"for",		"from",		"get",		"good",		"had",
"has",		"have",		"he",		"her",		"him",
"his",		"how",		"i",		"if",		"in",
"into",		"is",		"it",		"its",		"just",
"know",		"like",		"little",	"long",		"made",
"make",		"man",		"many",		"may",		"me",
"more",		"most",		"my",		"new",		"no",
"not",		"now",		"of",		"on",		"one",
"only",		"or",		"other",	"our",		"out",
"over",		"said",		"same",		"see",		"she",
"so",		"some",		"than",		"that",		"the",
"their",	"them",		"then",		"there",	"these",
"they",		"this",		"to",		"too",		"two",
"up",		"us",		"used",		"very",		"was",
"way",		"we",		"were",		"what",		"when",
"where",	"which",	"who",		"why",		"will",
"with",		"woman",	"would",	"you",		"your"
};
 66 | 
 67 | /**********************************************************************/
 68 | 
 69 | static void save_stopword(key)
 70 | char *key;
 71 | {
 72 |     Stopword *stopword;
 73 |     stopword = table_lookup(&stopwordtable, key);
 74 |     if (stopword)
 75 | 	warning_string("duplicate stopword", key);
 76 |     else
 77 |     {
 78 | 	stopword = NEW(Stopword);
 79 | 	stopword->key = key;
 80 | 	table_insert(&stopwordtable, stopword);
 81 |     }
 82 | }
 83 | /**********************************************************************/
 84 | 
 85 | void init_stopwords(filename)
 86 | char *filename;
 87 | {
 88 |     Word *word;
 89 |     short i;
 90 |     if (initialized)
 91 | 	error("stopwords already initialized");
 92 |     if (filename)
 93 |     {
 94 | 	read_text(&text, filename, &textopt);
 95 | 	find_words(&wordlist, &text);
 96 | 	list_empty(&text, free);
 97 | 	for (word = wordlist.first; word; word = word->next)
 98 | 	    save_stopword(word->string);
 99 |     }
100 |     else
101 | 	for (i = 0; i < sizeof(default_stopword) / sizeof(char *); i++)
102 | 	    save_stopword(default_stopword[i]);
103 |     initialized = True;
104 | }
105 | /**********************************************************************/
106 | 
107 | Boolean is_stopword(string)
108 | unsigned char *string;
109 | {
110 |     if (!initialized)
111 | 	error("stopwords not initialized");
112 |     return(table_lookup(&stopwordtable, string) ? True : False);
113 | }
114 | 


--------------------------------------------------------------------------------
/src/accrpt.h:
--------------------------------------------------------------------------------
  1 | /**********************************************************************
  2 |  *
  3 |  *  accrpt.h
  4 |  *
  5 |  *  This module provides support for reading and writing character
  6 |  *  accuracy reports.  The contents of one of these reports is
  7 |  *  represented by an "Accdata" structure.
  8 |  *
  9 |  *  Author: Stephen V. Rice
 10 |  *  
 11 |  * Copyright 1996 The Board of Regents of the Nevada System of Higher
 12 |  * Education, on behalf, of the University of Nevada, Las Vegas,
 13 |  * Information Science Research Institute
 14 |  *
 15 |  * Licensed under the Apache License, Version 2.0 (the "License"); you
 16 |  * may not use this file except in compliance with the License.  You
 17 |  * may obtain a copy of the License at
 18 |  *
 19 |  *    http://www.apache.org/licenses/LICENSE-2.0
 20 |  *
 21 |  * Unless required by applicable law or agreed to in writing, software
 22 |  * distributed under the License is distributed on an "AS IS" BASIS,
 23 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 24 |  * implied. See the License for the specific language governing
 25 |  * permissions and limitations under the License.
 26 |  *
 27 |  **********************************************************************/
 28 | 
 29 | #ifndef _ACCRPT_
 30 | #define _ACCRPT_
 31 | 
 32 | #include "charclass.h"
 33 | #include "table.h"
 34 | 
 35 | typedef
 36 | struct
 37 | {
 38 |     long ins;		/* number of insertions */
 39 |     long subst;		/* number of substitutions */
 40 |     long del;		/* number of deletions */
 41 |     long errors;	/* number of errors = ins + subst + del */
 42 | } Accops;
 43 | 
 44 | typedef
 45 | struct
 46 | {
 47 |     long count;		/* number of ground-truth characters in this class */
 48 |     long missed;	/* number of these that were misrecognized */
 49 | } Accclass;
 50 | 
/* Conf is a table entry (BEGIN_ENTRY from table.h supplies the list links
   and the "key" string) describing one confusion. */
BEGIN_ENTRY(Conf)
    long errors;	/* number of errors caused by this confusion */
    long marked;	/* number of these that were marked */
END_ENTRY(Conf);

/* Conflist is the per-bucket list type and Conftable the hash table type
   generated by BEGIN_TABLE_OF for Conf entries. */
BEGIN_TABLE_OF(Conf, Conflist)
END_TABLE(Conftable);	/* table of confusions */
 58 | 
 59 | typedef
 60 | struct
 61 | {
 62 |     long characters;	/* number of ground-truth characters */
 63 |     long errors;	/* number of errors made */
 64 |     long reject_characters;
 65 | 			/* number of reject characters generated */
 66 |     long suspect_markers;
 67 | 			/* number of characters marked as suspect */
 68 |     long false_marks;	/* number of false marks */
 69 |     Accops marked_ops;	/* edit operations for marked errors */
 70 |     Accops unmarked_ops;/* edit operations for unmarked errors */
 71 |     Accops total_ops;	/* edit operations for all errors */
 72 |     Accclass large_class[MAX_CHARCLASSES];
 73 | 			/* enumeration for each character class */
 74 |     Accclass total_class;
 75 | 			/* enumeration for all classes combined */
 76 |     Conftable conftable;/* table of confusions */
 77 |     Accclass small_class[NUM_CHARVALUES];
 78 | 			/* enumeration for each character code */
 79 | } Accdata;
 80 | 
/*
 * The routines below are declared without parameter types; the intended
 * arguments are shown in the comment inside each parameter list.
 */

void add_class(/* Accdata *accdata, Charvalue value, long count,
                  long missed */);
			/* adds the given character value to "accdata",
                           updating all relevant class enumerations */

void add_conf(/* Accdata *accdata, char *key, long errors, long marked */);
			/* adds the given confusion to "accdata"; "key" contains
			   the character string representation of the confusion
			   that will appear in the accuracy report (including
			   the trailing newline character); a copy of this
			   string is stored in the table, so the caller keeps
			   ownership of "key" */

void read_accrpt(/* Accdata *accdata, char *filename */);
			/* reads the named file (or stdin if "filename" is NULL)
			   and adds its contents to "accdata"; reports an error
			   and quits if unable to open the file, or if the file
			   does not contain an accuracy report */

void write_accrpt(/* Accdata *accdata, char *filename */);
			/* writes the contents of "accdata" to the named file
			   (or stdout if "filename" is NULL); reports an error
			   and quits if unable to create the file */
103 | 
104 | #endif
105 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
  1 | # Copyright 2016 Eddie Antonio Santos <easantos@ualberta.ca>
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #   http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | 
 15 | # Install prefix, if installing globally.
 16 | # See also: `exports` target
 17 | PREFIX = /usr/local
 18 | BINDIR = $(PREFIX)/bin
 19 | MANDIR = $(PREFIX)/share/man/man1
 20 | 
 21 | # List of all the tools (executables + manual pages)
 22 | TOOLS = accci accdist accsum accuracy editop editopcost editopsum \
 23 | 		groupacc ngram nonstopacc synctext vote wordacc wordaccci \
 24 | 		wordaccdist wordaccsum wordfreq
 25 | 
 26 | # Name: libocreval, or -locreval
 27 | NAME = ocreval
 28 | MAJOR_VERSION = 7
 29 | MINOR_VERSION = 0
 30 | 
 31 | # All the executables go in bin/
 32 | EXECUTABLES = $(addprefix bin/,$(TOOLS))
 33 | # All manual pages go in share/man/man1
 34 | MANPAGES = $(foreach TOOL,$(TOOLS),share/man/man1/$(TOOL).1)
 35 | 
 36 | include use-libocreval-internal.mk
 37 | 
 38 | LIBRARY.a = lib/lib$(NAME).a
 39 | ifeq ($(shell uname -s),Darwin)
 40 | LIBRARY.so = $(LIBRARY.a:.a=.dylib)
 41 | else
 42 | LIBRARY.so = $(LIBRARY.a:.a=.so.$(MAJOR_VERSION).$(MINOR_VERSION))
 43 | endif
 44 | 
 45 | ################################################################################
 46 | 
 47 | # Allows for proper compilation and linking settings for libocreval
 48 | 
 49 | all: $(EXECUTABLES)
 50 | 
 51 | install: install-bin install-man
 52 | 
 53 | install-bin: $(EXECUTABLES)
 54 | 	mkdir -p $(BINDIR)
 55 | 	cp $(EXECUTABLES) $(BINDIR)/
 56 | 
 57 | install-man: $(MANPAGES)
 58 | 	mkdir -p $(MANDIR)
 59 | 	cp $(MANPAGES) $(MANDIR)/
 60 | 
 61 | # Prints a bunch of exports you can source in your shell's startup file.
 62 | exports:
 63 | 	@echo '#' ocreval
 64 | 	@echo export PATH='$(TOP)bin:$$PATH'
 65 | 	@echo export MANPATH='$(TOP)share/man:$$MANPATH'
 66 | ifeq ($(shell uname -s),Darwin)
 67 | 	@echo export DYLD_LIBRARY_PATH='$(TOP)lib:$$DYLD_LIBRARY_PATH'
 68 | else
 69 | 	@echo export LD_LIBRARY_PATH='$(TOP)lib:$$LD_LIBRARY_PATH'
 70 | endif
 71 | 
 72 | clean: clean-objs clean-execs clean-libs clean-deps clean-test
 73 | 
 74 | clean-libs:
 75 | 	$(RM) $(LIBRARY.a) $(LIBRARY.so)
 76 | 
 77 | clean-objs:
 78 | 	$(RM) $(MODULES:.c=.o)
 79 | 
 80 | clean-deps:
 81 | 	$(RM) $(DEPENDENCIES)
 82 | 
 83 | clean-execs:
 84 | 	$(RM) $(EXECUTABLES)
 85 | 
 86 | clean-test:
 87 | 	$(MAKE) -C test clean
 88 | 
 89 | TEST_ARGS =
 90 | test: $(EXECUTABLES) $(LIBRARY.a)
 91 | 	$(MAKE) -C test
 92 | 
 93 | # Uses https://github.com/alexch/rerun
 94 | # $ gem install rerun
 95 | watch:
 96 | 	rerun --clear --exit --pattern '**/*.{c,h}' -- make test
 97 | 
 98 | .PHONY: all
 99 | .PHONY: install install-bin install-man
100 | .PHONY: clean clean-deps clean-execs clean-lib clean-objs clean-test
101 | .PHONY: test watch
102 | ################################################################################
103 | 
104 | # Executable sources are C files that provide a main() for executables.
105 | EXECUTABLE_SOURCES := $(foreach TOOL,$(TOOLS),src/$(TOOL).c)
106 | # Modules are all object files exclusively for inclusion in libocreval
107 | MODULE_SOURCES := $(filter-out $(EXECUTABLE_SOURCES),$(wildcard src/*.c))
108 | MODULES := $(MODULE_SOURCES:.c=.o)
109 | # Dependencies are .d files included by Make
110 | DEPENDENCIES := $(EXECUTABLES:=.d) $(MODULES:.o=.d)
111 | 
112 | -include $(DEPENDENCIES)
113 | 
114 | # Rules for building executables; they are statically linked with the library.
115 | bin/%: src/%.c $(LIBRARY.a)
116 | 	$(LINK.c) -o $@ $< $(LDLIBS)
117 | 
118 | $(LIBRARY.a): $(MODULES)
119 | 	$(AR) $(ARFLAGS) -s $@ $^
120 | 
121 | # Special case: Generate this include file, required by libocreval.a
122 | $(TOP)src/word_break_property.h src/word_break_property.h: \
123 | 		libexec/generate_word_break.py libexec/WordBreakProperty.txt.gz
124 | 	./$< > $@
125 | 


--------------------------------------------------------------------------------
/src/edorpt.c:
--------------------------------------------------------------------------------
  1 | /**********************************************************************
  2 |  *
  3 |  *  edorpt.c
  4 |  *
  5 |  *  Author: Stephen V. Rice
  6 |  *  
  7 |  * Copyright 1996 The Board of Regents of the Nevada System of Higher
  8 |  * Education, on behalf, of the University of Nevada, Las Vegas,
  9 |  * Information Science Research Institute
 10 |  *
 11 |  * Licensed under the Apache License, Version 2.0 (the "License"); you
 12 |  * may not use this file except in compliance with the License.  You
 13 |  * may obtain a copy of the License at
 14 |  *
 15 |  *    http://www.apache.org/licenses/LICENSE-2.0
 16 |  *
 17 |  * Unless required by applicable law or agreed to in writing, software
 18 |  * distributed under the License is distributed on an "AS IS" BASIS,
 19 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 20 |  * implied. See the License for the specific language governing
 21 |  * permissions and limitations under the License.
 22 |  *
 23 |  **********************************************************************/
 24 | 
 25 | #include "edorpt.h"
 26 | #include "util.h"
 27 | #include "ocreval_version.h"
 28 | 
/* first line of an edit operation report; it embeds OCREVAL_VERSION */
#define TITLE    "ocreval Edit Operation Report Version " OCREVAL_VERSION "\n"
/* second line of an edit operation report */
#define DIVIDER  "-----------------------------------------\n"

/* the report line most recently read by read_line */
static char line[100];
 33 | 
 34 | /**********************************************************************/
 35 | 
 36 | static Boolean read_line(f)
 37 | FILE *f;
 38 | {
 39 |     return(fgets(line, sizeof(line) - 1, f) ? True : False);
 40 | }
 41 | /**********************************************************************/
 42 | 
 43 | static Boolean read_value(f, value, sum_value)
 44 | FILE *f;
 45 | long *value, *sum_value;
 46 | {
 47 |     if (read_line(f) && sscanf(line, "%ld", value) == 1)
 48 |     {
 49 | 	*sum_value += *value;
 50 | 	return(True);
 51 |     }
 52 |     else
 53 | 	return(False);
 54 | }
 55 | /**********************************************************************/
 56 | 
 57 | static Boolean read_two(f, value1, value2)
 58 | FILE *f;
 59 | long *value1, *value2;
 60 | {
 61 |     return(read_line(f) && sscanf(line, "%ld %ld", value1, value2) == 2 ?
 62 |     True : False);
 63 | }
 64 | /**********************************************************************/
 65 | 
 66 | void read_edorpt(edodata, filename)
 67 | Edodata *edodata;
 68 | char *filename;
 69 | {
 70 |     FILE *f;
 71 |     long moves, value1, value2;
 72 |     f = open_file(filename, "r");
 73 |     if (read_line(f) && strncmp(line, TITLE, sizeof(TITLE) - 3) == 0 &&
 74 |     read_line(f) && strcmp(line, DIVIDER) == 0 &&
 75 |     read_value(f, &value1, &edodata->total_insertions) &&
 76 |     read_value(f, &value1, &edodata->total_deletions) &&
 77 |     read_value(f, &moves, &edodata->total_moves))
 78 |     {
 79 | 	if (moves > 0 && read_line(f) && read_line(f) && read_line(f))
 80 | 	    while (read_two(f, &value1, &value2))
 81 | 		edodata->moves[value2] += value1;
 82 |     }
 83 |     else
 84 | 	error_string("invalid format in", (filename ? filename : "stdin"));
 85 |     close_file(f);
 86 | }
 87 | /**********************************************************************/
 88 | 
 89 | static void write_value(f, value, string)
 90 | FILE *f;
 91 | long value;
 92 | char *string;
 93 | {
 94 |     fprintf(f, "%8ld   %s\n", value, string);
 95 | }
 96 | /**********************************************************************/
 97 | 
 98 | static void write_move(f, count, length)
 99 | FILE *f;
100 | long count, length;
101 | {
102 |     fprintf(f, "%8ld %8ld\n", count, length);
103 | }
104 | /**********************************************************************/
105 | 
106 | void write_edorpt(edodata, filename)
107 | Edodata *edodata;
108 | char *filename;
109 | {
110 |     FILE *f;
111 |     long i;
112 |     f = open_file(filename, "w");
113 |     fprintf(f, "%s%s", TITLE, DIVIDER);
114 |     write_value(f, edodata->total_insertions, "Insertions");
115 |     write_value(f, edodata->total_deletions, "Deletions");
116 |     write_value(f, edodata->total_moves, "Moves");
117 |     if (edodata->total_moves > 0)
118 |     {
119 | 	fprintf(f, "\nMoves\n   Count   Length\n");
120 | 	for (i = 1; i <= MAX_MOVE_LENGTH; i++)
121 | 	    if (edodata->moves[i] > 0)
122 | 		write_move(f, edodata->moves[i], i);
123 |     }
124 |     close_file(f);
125 | }
126 | 


--------------------------------------------------------------------------------
/src/list.c:
--------------------------------------------------------------------------------
  1 | /**********************************************************************
  2 |  *
  3 |  *  list.c
  4 |  *
  5 |  *  Author: Stephen V. Rice
  6 |  *  
  7 |  * Copyright 1996 The Board of Regents of the Nevada System of Higher
  8 |  * Education, on behalf, of the University of Nevada, Las Vegas,
  9 |  * Information Science Research Institute
 10 |  *
 11 |  * Licensed under the Apache License, Version 2.0 (the "License"); you
 12 |  * may not use this file except in compliance with the License.  You
 13 |  * may obtain a copy of the License at
 14 |  *
 15 |  *    http://www.apache.org/licenses/LICENSE-2.0
 16 |  *
 17 |  * Unless required by applicable law or agreed to in writing, software
 18 |  * distributed under the License is distributed on an "AS IS" BASIS,
 19 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 20 |  * implied. See the License for the specific language governing
 21 |  * permissions and limitations under the License.
 22 |  *
 23 |  **********************************************************************/
 24 | 
 25 | #include "list.h"
 26 | #include "util.h"
 27 | 
/* Generic item and list types for this module, declared with the
   BEGIN_ITEM/END_ITEM and BEGIN_LIST_OF/END_LIST macros (presumably
   supplied by list.h -- confirm there).  No extra members are added to
   either type here.  The routines below use only the "prev" and "next"
   members of an Item and the "first", "last", "array" and "count"
   members of a List. */
BEGIN_ITEM(Item)
END_ITEM(Item);

BEGIN_LIST_OF(Item)
END_LIST(List);
 33 | 
 34 | /**********************************************************************/
 35 | 
 36 | void list_initialize(list)
 37 | List *list;
 38 | {
 39 |     list->first = list->last = NULL;
 40 |     list->array = NULL;
 41 |     list->count = 0;
 42 | }
 43 | /**********************************************************************/
 44 | 
 45 | static void free_array(list)
 46 | List *list;
 47 | {
 48 |     if (list->array)
 49 |     {
 50 | 	free(list->array);
 51 | 	list->array = NULL;
 52 |     }
 53 | }
 54 | /**********************************************************************/
 55 | 
 56 | static void list_insert(list, ref1, item, ref2)
 57 | List *list;
 58 | Item *ref1, *item, *ref2;
 59 | {
 60 |     item->prev = ref1;
 61 |     item->next = ref2;
 62 |     if (ref1)
 63 | 	ref1->next = item;
 64 |     else
 65 | 	list->first = item;
 66 |     if (ref2)
 67 | 	ref2->prev = item;
 68 |     else
 69 | 	list->last = item;
 70 |     free_array(list);
 71 |     list->count++;
 72 | }
 73 | /**********************************************************************/
 74 | 
 75 | void list_insert_first(list, item)
 76 | List *list;
 77 | Item *item;
 78 | {
 79 |     list_insert(list, NULL, item, list->first);
 80 | }
 81 | /**********************************************************************/
 82 | 
 83 | void list_insert_last(list, item)
 84 | List *list;
 85 | Item *item;
 86 | {
 87 |     list_insert(list, list->last, item, NULL);
 88 | }
 89 | /**********************************************************************/
 90 | 
 91 | void list_insert_before(list, item, ref)
 92 | List *list;
 93 | Item *item, *ref;
 94 | {
 95 |     list_insert(list, ref->prev, item, ref);
 96 | }
 97 | /**********************************************************************/
 98 | 
 99 | void list_insert_after(list, ref, item)
100 | List *list;
101 | Item *ref, *item;
102 | {
103 |     list_insert(list, ref, item, ref->next);
104 | }
105 | /**********************************************************************/
106 | 
107 | void list_remove(list, item)
108 | List *list;
109 | Item *item;
110 | {
111 |     if (item->prev)
112 | 	item->prev->next = item->next;
113 |     else
114 | 	list->first = item->next;
115 |     if (item->next)
116 | 	item->next->prev = item->prev;
117 |     else
118 | 	list->last = item->prev;
119 |     item->prev = item->next = NULL;
120 |     free_array(list);
121 |     list->count--;
122 | }
123 | /**********************************************************************/
124 | 
125 | void list_in_array(list)
126 | List *list;
127 | {
128 |     Item *item;
129 |     long i = 0;
130 |     if (list->array || list->count == 0)
131 | 	return;
132 |     list->array = NEW_ARRAY(list->count, Item *);
133 |     for (item = list->first; item; item = item->next)
134 | 	list->array[i++] = item;
135 | }
136 | /**********************************************************************/
137 | 
138 | void list_empty(list, process_item)
139 | List *list;
140 | void (*process_item)();
141 | {
142 |     Item *item;
143 |     while (list->first)
144 |     {
145 | 	item = list->first;
146 | 	list_remove(list, item);
147 | 	(*process_item)(item);
148 |     }
149 | }
150 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # ocreval
  2 | 
  3 | [![Build Status](https://travis-ci.org/eddieantonio/ocreval.svg?branch=master)](https://travis-ci.org/eddieantonio/ocreval)
  4 | 
  5 | `ocreval` consists of 17 tools for measuring the
  6 | performance of and experimenting with OCR output. See [the user
  7 | guide][user-guide] for more information.
  8 | 
  9 | [user-guide]: https://github.com/eddieantonio/ocreval/raw/master/user-guide.pdf
 10 | 
 11 | `ocreval` is a modern port of the [ISRI Analytic Tools for OCR Evaluation][isri],
 12 | with UTF-8 support and other improvements.
 13 | 
 14 | See [the archived Google Code repository of the original
 15 | project][isri-code]!
 16 | 
 17 | [isri]: http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.216.9427&rep=rep1&type=pdf
 18 | [isri-code]: http://code.google.com/p/isri-ocr-evaluation-tools
 19 | 
 20 | # Install (macOS)
 21 | 
 22 | Using [Homebrew][brew]:
 23 | 
 24 |     brew install eddieantonio/eddieantonio/ocreval
 25 | 
 26 | [brew]: http://brew.sh/
 27 | 
 28 | 
 29 | Building
 30 | ========
 31 | 
 32 | To build the library and all of the programs, ensure that you have all
 33 | required [dependencies](#dependencies).
 34 | 
 35 | ## Dependencies
 36 | 
 37 | `ocreval` requires [utf8proc](https://github.com/JuliaStrings/utf8proc)
 38 | to build from source.
 39 | 
 40 | ### macOS
 41 | 
 42 | Using [Homebrew][brew]:
 43 | 
 44 |     brew install utf8proc
 45 | 
 46 | ### Ubuntu/Debian
 47 | 
 48 | You may need to install `make` and a C compiler:
 49 | 
 50 |     sudo apt install build-essential
 51 | 
 52 | Then install `libutf8proc-dev`:
 53 | 
 54 |     sudo apt install libutf8proc-dev
 55 | 
 56 | If `libutf8proc-dev` cannot be installed using `apt`, follow the
 57 | instructions under [Other Linux](#other-linux) below.
 58 | 
 59 | ### Other Linux
 60 | 
 61 | Install `libutf8proc-dev` manually:
 62 | 
 63 |     curl -OL https://github.com/JuliaStrings/utf8proc/archive/v1.3.1.tar.gz
 64 |     tar xzf v1.3.1.tar.gz
 65 |     cd utf8proc-1.3.1/
 66 |     make
 67 |     sudo make install
 68 |     # Rebuild the shared object cache - needed to load the library
 69 |     # at runtime <http://linux.die.net/man/8/ldconfig>
 70 |     sudo ldconfig
 71 |     cd -
 72 | 
 73 | ## Building the tools
 74 | 
 75 | Once all dependencies are installed, you may compile all of the
 76 | utilities using `make`:
 77 | 
 78 |     make
 79 | 
 80 | ## Installing
 81 | 
 82 | Install to `/usr/local/`:
 83 | 
 84 |     sudo make install
 85 | 
 86 | Note: You will not need `sudo` on macOS if you have `brew` installed.
 87 | 
 88 | ## Installing "locally"
 89 | 
 90 | This will not copy any files at all, but instead create the appropriate
 91 | shell commands to add all executables, man pages, and libraries to
 92 | the correct path (replace `~/.bashrc` with your start-up file):
 93 | 
 94 |     make exports >> ~/.bashrc
 95 | 
 96 | # Porting Credits
 97 | 
 98 | Ported by Eddie Antonio Santos, 2015, 2016. See `NOTICE` for copyright
 99 | information regarding the original code.
100 | 
101 | # Citation
102 | 
103 | ```bibtex
104 | @inproceedings{santos-2019-ocr,
105 |     title = "{OCR} evaluation tools for the 21st century",
106 |     author = "Santos, Eddie Antonio",
107 |     booktitle = "Proceedings of the 3rd Workshop on the Use of Computational Methods in the Study of Endangered Languages Volume 1 (Papers)",
108 |     month = feb,
109 |     year = "2019",
110 |     address = "Honolulu",
111 |     publisher = "Association for Computational Linguistics",
112 |     url = "https://www.aclweb.org/anthology/W19-6004",
113 |     pages = "23--27",
114 | }
115 | ```
116 | 
117 | See: <https://www.aclweb.org/anthology/W19-6004/>
118 | 
119 | 
120 | # License
121 | 
122 | ### ocreval
123 | 
124 | Copyright 2015–2017 Eddie Antonio Santos
125 | 
126 | Copyright © 2018–2021 National Research Council Canada
127 | 
128 | ### The ISRI Analytic Tools for OCR Evaluation
129 | 
130 | Copyright 1996 The Board of Regents of the Nevada System of Higher
131 | Education, on behalf, of the University of Nevada, Las Vegas,
132 | Information Science Research Institute
133 | 
134 | Licensed under the Apache License, Version 2.0 (the "License"); you
135 | may not use this file except in compliance with the License.  You may
136 | obtain a copy of the License at
137 | 
138 |    http://www.apache.org/licenses/LICENSE-2.0
139 | 
140 | Unless required by applicable law or agreed to in writing, software
141 | distributed under the License is distributed on an "AS IS" BASIS,
142 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
143 | implied. See the License for the specific language governing
144 | permissions and limitations under the License.
145 | 


--------------------------------------------------------------------------------
/libexec/generate_word_break.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # coding: UTF-8
  3 | 
  4 | # Copyright 2016 Eddie Antonio Santos <easantos@ualberta.ca>
  5 | #
  6 | # Licensed under the Apache License, Version 2.0 (the "License");
  7 | # you may not use this file except in compliance with the License.
  8 | # You may obtain a copy of the License at
  9 | #
 10 | #   http://www.apache.org/licenses/LICENSE-2.0
 11 | #
 12 | # Unless required by applicable law or agreed to in writing, software
 13 | # distributed under the License is distributed on an "AS IS" BASIS,
 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 15 | # See the License for the specific language governing permissions and
 16 | # limitations under the License.
 17 | 
 18 | """
 19 | Parses WordBreakProperty.txt and generates a binary search table as a C header
 20 | file. Note! This header file must be included only ONCE in only ONE
 21 | translation unit (i.e. C file).
 22 | """
 23 | 
 24 | import os
 25 | import sys
 26 | import gzip
 27 | 
# Banner comment written at the very top of the generated header.
# NOTE(review): the banner names "Supplement/generate_word_break.py", while
# this script appears to live under libexec/ -- confirm before changing it.
PROLOGUE = '''\
/* AUTOGENERATED FILE! DO NOT MODIFY.
 * See Supplement/generate_word_break.py */
'''

# C definition of one table row: an inclusive code point range and the
# word-break property assigned to it.
STRUCT_DEF = '''\
typedef struct {
    Charvalue start, end;
    wb_property value;
} wb_range;
'''

# Template for a C enum typedef; filled with (enumerator list, type name).
ENUM_TEMP = '''\
typedef enum {
    %s
} %s;
'''

# Template for the C lookup table; filled with the joined row literals.
TABLE_TEMP = '''\
static const wb_range WORD_BREAK_PROPERTY[] = {
    %s
};
'''

# Every category that may appear in the generated enum, in the order the
# enumerators are emitted.  to_c_header() asserts that each category parsed
# from the data file is one of these names.
# NOTE(review): "sot" and "eot" look like start-of-text / end-of-text
# markers rather than properties from the data file -- confirm.
CATEGORY_NAMES = '''\
Other
CR
LF
Newline
Extend
Regional_Indicator
Format
Katakana
Hebrew_Letter
ALetter
Single_Quote
Double_Quote
MidNumLet
MidLetter
MidNum
Numeric
ExtendNumLet
sot
eot
'''.strip().split()
 73 | 
 74 | def open_word_break_file():
 75 |     filename = 'WordBreakProperty.txt.gz'
 76 |     path = os.path.join(os.path.dirname(__file__), filename)
 77 |     return gzip.open(path, 'rb')
 78 | 
 79 | def blank_or_comment(line):
 80 |     return line.startswith('#') or len(line.strip()) == 0
 81 | 
 82 | def parse_range(text):
 83 |     """
 84 |     >>> parse_range('11730..11739')
 85 |     (71472, 71481)
 86 |     >>> parse_range('FF3F')
 87 |     (65343, 65343)
 88 |     """
 89 |     components = text.strip().split('..')[0:2]
 90 |     parse_num = lambda string: int(string, base=16)
 91 | 
 92 |     if len(components) == 2:
 93 |         start, end = map(parse_num, components)
 94 |     else:
 95 |         assert len(components) == 1
 96 |         start = end = parse_num(components[0])
 97 | 
 98 |     return (start, end)
 99 | 
def parse_line(line):
    """
    Parses one line of WordBreakProperty.txt.

    Returns None for blank and comment lines; otherwise returns
    ((start, end), category), where the range comes from the text before
    the first ';' and the category is the text after it with any
    trailing '#' comment removed.

    The line may be str or bytes.  Reading the gzip file in binary mode
    yields bytes on Python 3, and mixing bytes with the str literals
    below raises TypeError, so bytes are decoded first.  On Python 2
    bytes is str and the line is used unchanged.
    """
    if isinstance(line, bytes) and not isinstance(line, str):
        line = line.decode('utf-8')

    if blank_or_comment(line):
        return None

    cp_range, cat_and_comment = line.split(';')[0:2]
    category = cat_and_comment.split('#')[0].strip()
    cp_range = parse_range(cp_range)

    return cp_range, category
109 | 
def parse_lines(word_break_file):
    """
    Lazily yields the parsed contents of every line in word_break_file
    for which parse_line() returns something other than None.
    """
    parsed = (parse_line(line) for line in word_break_file)
    for contents in parsed:
        if contents is not None:
            yield contents
117 | 
def to_c_header(values):
    """
    Generates the pieces of the C header, in order: prologue, the
    wb_property enum, the wb_range struct and the lookup table, with a
    newline between them.

    values is a list of ((start, end), category) tuples.  Every category
    must be listed in CATEGORY_NAMES.  The list is sorted IN PLACE by
    range start before anything is yielded.
    """
    known_categories = set(CATEGORY_NAMES)
    seen_categories = set(category for _, category in values)
    assert seen_categories.issubset(known_categories)

    values.sort(key=lambda entry: entry[0][0])

    yield PROLOGUE
    yield '\n'
    yield generate_enum('wb_property', CATEGORY_NAMES)
    yield '\n'
    yield STRUCT_DEF
    yield '\n'
    yield generate_table(values)
129 | 
def enum_name(name):
    """
    Maps a category name to its C enumerator name.  This is currently
    the identity: an earlier version added a prefix, but the generated
    header is included internally by a single file, so the prefix only
    added clutter.
    """
    return name
137 | 
def generate_enum(name, categories):
    """
    Returns the text of a C enum typedef called name whose enumerators
    are the given categories, one per line, in order.
    """
    members = [enum_name(category) for category in categories]
    return ENUM_TEMP % (',\n    '.join(members), name)
141 | 
def generate_literal(value):
    """
    Returns the C initializer for one ((start, end), category) entry.
    Code points are printed with six hex digits when the end of the
    range lies above 0xFFFF and with four hex digits otherwise.
    """
    (start, end), category = value
    if end > 0xffff:
        template = '{0x%06X, 0x%06X, %s}'
    else:
        template = '{0x%04X, 0x%04X, %s}'
    return template % (start, end, enum_name(category))
147 | 
def generate_table(values):
    """
    Returns the text of the WORD_BREAK_PROPERTY table, with one
    initializer per entry of values, in the order given.
    """
    rows = [generate_literal(value) for value in values]
    return TABLE_TEMP % (',\n    '.join(rows),)
151 | 
if __name__ == '__main__':
    # Read and parse the whole data file before generating any output,
    # because to_c_header() has to sort the complete list.
    with open_word_break_file() as word_break_file:
        big_list = [entry for entry in parse_lines(word_break_file)]

    # The header goes to standard output, one piece at a time.
    write = sys.stdout.write
    for text in to_c_header(big_list):
        write(text)
158 | 


--------------------------------------------------------------------------------
/src/accuracy.c:
--------------------------------------------------------------------------------
  1 | /**********************************************************************
  2 |  *
  3 |  *  accuracy.c
  4 |  *
  5 |  *  Author: Stephen V. Rice
  6 |  *  
  7 |  * Copyright 1996 The Board of Regents of the Nevada System of Higher
  8 |  * Education, on behalf, of the University of Nevada, Las Vegas,
  9 |  * Information Science Research Institute
 10 |  *
 11 |  * Licensed under the Apache License, Version 2.0 (the "License"); you
 12 |  * may not use this file except in compliance with the License.  You
 13 |  * may obtain a copy of the License at
 14 |  *
 15 |  *    http://www.apache.org/licenses/LICENSE-2.0
 16 |  *
 17 |  * Unless required by applicable law or agreed to in writing, software
 18 |  * distributed under the License is distributed on an "AS IS" BASIS,
 19 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 20 |  * implied. See the License for the specific language governing
 21 |  * permissions and limitations under the License.
 22 |  *
 23 |  **********************************************************************/
 24 | 
 25 | #include "accrpt.h"
 26 | #include "sync.h"
 27 | 
/* Usage text handed to initialize(). */
#define usage  "correctfile generatedfile [accuracy_report]"

/* Limit on the length of each displayed substring in a confusion key;
   make_key() appends "..." once adding another character would exceed
   it. */
#define MAX_DISPLAY  24

/* Options passed to read_text() for both input files.
   NOTE(review): the meaning of each initializer depends on the member
   order of Textopt, which is declared elsewhere -- confirm there. */
Textopt textopt = { True, True, 0, True, True };

/* text[0] is read from the first file argument and text[1] from the
   second (see main()). */
Text text[2];

/* Statistics accumulated by process_synclist() and written out by
   write_accrpt() at the end of main(). */
Accdata accdata;
 37 | 
 38 | /**********************************************************************/
 39 | 
 40 | void make_key(key, sync)
 41 | char *key;
 42 | Sync *sync;
 43 | {
 44 |     long i, j;
 45 |     char buffer[2][MAX_DISPLAY + 4], string[STRING_SIZE];
 46 |     for (i = 0; i < 2; i++)
 47 |     {
 48 | 	buffer[i][0] = '\0';
 49 | 	for (j = sync->substr[i].start; j <= sync->substr[i].stop; j++)
 50 | 	{
 51 | 	    char_to_string(False, text[i].array[j]->value, string, True);
 52 | 	    if (strlen(buffer[i]) + strlen(string) > MAX_DISPLAY)
 53 | 	    {
 54 | 		strcat(buffer[i], "...");
 55 | 		break;
 56 | 	    }
 57 | 	    strcat(buffer[i], string);
 58 | 	}
 59 |     }
 60 |     sprintf(key, "{%s}-{%s}\n", buffer[0], buffer[1]);
 61 | }
 62 | /**********************************************************************/
 63 | 
 64 | void add_ops(sum_ops, ops)
 65 | Accops *sum_ops, *ops;
 66 | {
 67 |     sum_ops->ins    += ops->ins;
 68 |     sum_ops->subst  += ops->subst;
 69 |     sum_ops->del    += ops->del;
 70 |     sum_ops->errors += ops->errors;
 71 | }
 72 | /**********************************************************************/
 73 | 
 74 | void process_synclist(synclist)
 75 | Synclist *synclist;
 76 | {
 77 |     Sync *sync;
 78 |     long i, characters, wildcards, reject_characters, suspect_markers, genchars;
 79 |     Accops ops;
 80 |     char key[100];
 81 |     for (sync = synclist->first; sync; sync = sync->next)
 82 |     {
 83 | 	characters = wildcards = 0;
 84 | 	for (i = sync->substr[0].start; i <= sync->substr[0].stop; i++)
 85 | 	    if (text[0].array[i]->value == REJECT_CHARACTER)
 86 | 		wildcards++;
 87 | 	    else
 88 | 	    {
 89 | 		characters++;
 90 | 		add_class(&accdata, text[0].array[i]->value, 1,
 91 | 		(sync->match ? 0 : 1));
 92 | 	    }
 93 | 	accdata.characters += characters;
 94 | 	reject_characters = suspect_markers = 0;
 95 | 	for (i = sync->substr[1].start; i <= sync->substr[1].stop; i++)
 96 | 	    if (text[1].array[i]->value == REJECT_CHARACTER)
 97 | 		reject_characters++;
 98 | 	    else if (text[1].array[i]->suspect)
 99 | 		suspect_markers++;
100 | 	accdata.reject_characters += reject_characters;
101 | 	accdata.suspect_markers += suspect_markers;
102 | 	if (sync->match)
103 | 	    accdata.false_marks += suspect_markers;
104 | 	else
105 | 	{
106 | 	    genchars = max(0, sync->substr[1].length - wildcards);
107 | 	    ops.errors = max(characters, genchars);
108 | 	    if (ops.errors > 0)
109 | 	    {
110 | 		accdata.errors += ops.errors;
111 | 		ops.ins   = max(0, characters - genchars);
112 | 		ops.subst = min(characters, genchars);
113 | 		ops.del   = max(0, genchars - characters);
114 | 		make_key(key, sync);
115 | 		if (reject_characters + suspect_markers > 0)
116 | 		{
117 | 		    add_ops(&accdata.marked_ops, &ops);
118 | 		    add_conf(&accdata, key, ops.errors, ops.errors);
119 | 		}
120 | 		else
121 | 		{
122 | 		    add_ops(&accdata.unmarked_ops, &ops);
123 | 		    add_conf(&accdata, key, ops.errors, 0);
124 | 		}
125 | 		add_ops(&accdata.total_ops, &ops);
126 | 	    }
127 | 	}
128 |     }
129 | }
130 | /**********************************************************************/
131 | 
132 | main(argc, argv)
133 | int argc;
134 | char *argv[];
135 | {
136 |     Synclist synclist;
137 |     initialize(&argc, argv, usage, NULL);
138 |     if (argc < 2 || argc > 3)
139 | 	error("invalid number of files");
140 |     read_text(&text[0], argv[0], &textopt);
141 |     if (textopt.found_header)
142 | 	error("no correct file specified");
143 |     read_text(&text[1], argv[1], &textopt);
144 |     fastukk_sync(&synclist, text);
145 |     process_synclist(&synclist);
146 |     write_accrpt(&accdata, (argc == 3 ? argv[2] : NULL));
147 |     terminate();
148 | }
149 | 


--------------------------------------------------------------------------------
/src/util.h:
--------------------------------------------------------------------------------
  1 | /**********************************************************************
  2 |  *
  3 |  *  util.h
  4 |  *
  5 |  *  This module contains basic definitions and utility routines that
  6 |  *  are needed by almost every module/program in the OCR Experimental
  7 |  *  Environment.
  8 |  *
  9 |  *  Author: Stephen V. Rice
 10 |  *
 11 |  * Copyright 1996 The Board of Regents of the Nevada System of Higher
 12 |  * Education, on behalf, of the University of Nevada, Las Vegas,
 13 |  * Information Science Research Institute
 14 |  *
 15 |  * Licensed under the Apache License, Version 2.0 (the "License"); you
 16 |  * may not use this file except in compliance with the License.  You
 17 |  * may obtain a copy of the License at
 18 |  *
 19 |  *    http://www.apache.org/licenses/LICENSE-2.0
 20 |  *
 21 |  * Unless required by applicable law or agreed to in writing, software
 22 |  * distributed under the License is distributed on an "AS IS" BASIS,
 23 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 24 |  * implied. See the License for the specific language governing
 25 |  * permissions and limitations under the License.
 26 |  *
 27 |  **********************************************************************/
 28 | 
 29 | #ifndef _UTIL_
 30 | #define _UTIL_
 31 | 
 32 | #include <stdlib.h>
 33 | #include <stdio.h>
 34 | #include <string.h>
 35 | 
 36 | #ifndef unix
 37 | /* On Linux and Mac OS X */
 38 | #if defined(__unix__) || defined(__MACH__)
 39 | #define unix
 40 | #endif
 41 | #endif
 42 | 
 43 | #ifndef True
 44 | typedef char Boolean;
 45 | #define True   1
 46 | #define False  0
 47 | #endif
 48 | 
/* max/min helpers, defined only when "max" is not already defined.
   Each argument is expanded twice, so callers should pass expressions
   without side effects. */
#ifndef max
#define max(a, b)  ((a) > (b) ? (a) : (b))
#define min(a, b)  ((a) < (b) ? (a) : (b))
#endif
 53 | 
 54 | #define NEW(type)  ((type *) allocate((size_t) 1, sizeof(type)))
 55 | 			/* allocate one instance of "type" */
 56 | #define NEW_ARRAY(number, type) \
 57 |                    ((type *) allocate((size_t) (number), sizeof(type)))
 58 | 			/* allocate an array of "type" */
 59 | void *allocate(/* size_t number, size_t size */);
 60 | 
 61 | int ustrcmp(/* unsigned char *s1, unsigned char *s2 */);
 62 | 			/* compares strings like "strcmp" but treats characters
 63 | 			   as unsigned */
 64 | 
 65 | FILE *open_file(/* char *filename, char *mode */);
 66 | 			/* opens the named file in the specified mode; reports
 67 | 			   an error and quits if unable to open the file; if
 68 | 			   "filename" is NULL, returns stdin or stdout,
 69 | 			   depending on the mode */
 70 | 
 71 | void close_file(/* FILE *f */);
 72 | 			/* closes the specified file */
 73 | 
 74 | Boolean file_exists(/* char *filename */);
 75 | 			/* returns True if the named file exists */
 76 | 
 77 | char *tempfilename();	/* creates and returns a unique name for a temporary
 78 | 			   file */
 79 | 
 80 | char *basefilename(/* char *pathname */);
 81 | 			/* given a pathname, returns the base filename; e.g.,
 82 | 			   basefilename("/local/isri/bin/ocr") returns "ocr" */
 83 | 
 84 | extern char *exec_name; /* base filename of the executable */
 85 | 
 86 | extern Boolean usage_when_no_args;
 87 | 			/* indicates whether the usage should be displayed when
 88 | 			   no arguments have been specified to the program; by
 89 | 			   default it is True; to override this, set it to False
 90 | 			   before calling "initialize" */
 91 | 
 92 | extern void (*usage_routine)();
 93 | 			/* specifies a routine to be called to display the
 94 | 			   usage, overriding the "usage" argument passed to
 95 | 			   "initialize"; this must be set before calling
 96 | 			   "initialize" */
 97 | 
 98 | extern void (*cleanup_routine)();
 99 | 			/* specifies a routine to be called upon exit */
100 | 
101 | typedef
102 | struct
103 | {
104 |     char name;		/* character identifying the option; e.g., 'D' for -D;
105 | 			   '\0' to terminate an array of these */
106 |     char **string;	/* if a string-valued option, address of variable to
107 | 			   hold the string value; e.g., for -Dcaere or -D caere,
108 | 			   the variable gets "caere"; if a Boolean-valued
109 | 			   option, this should be NULL */
110 |     Boolean *boolean;	/* if a Boolean-valued option, address of variable to
111 | 			   set to True to indicate the option was specified; if
112 | 			   a string-valued option, this should be NULL */
113 | } Option;
114 | 
115 | void initialize(/* int *argc, char *argv[], char *usage, Option option[] */);
116 | 			/* parses the command line arguments looking for any
117 | 			   of the allowed options; reports any invalid option
118 | 			   and quits; updates "argc" and "argv" to contain only
119 | 			   the non-option arguments; displays "usage" when
120 | 			   appropriate; "usage" may be NULL if "usage_routine"
121 | 			   has been set; "option" may be NULL if there are no
122 | 			   options */
123 | 
124 | void terminate()	/* terminates the program with exit status 0 */
125 |     __attribute__ ((noreturn));
126 | 
127 | extern int errstatus;	/* status returned when exiting due to an error; this is
128 | 			   1 by default */
129 | 
130 | void error(/* char *message */)
131 |     __attribute__ ((noreturn));
132 | 			/* writes an error message, then quits or returns */
133 | 
134 | void error_string(/* char *message, char *string */)
135 |     __attribute__ ((noreturn));
136 | 			/* writes an error message, including "string" in
137 | 			   quotes, then quits */
138 | 
139 | void warning_string(/* char *message, char *string */);
140 | 			/* writes a warning message, including "string" */
141 | 
142 | #endif
143 | 


--------------------------------------------------------------------------------
/src/ngram.c:
--------------------------------------------------------------------------------
  1 | /**********************************************************************
  2 |  *
  3 |  *  ngram.c
  4 |  *
  5 |  *  Author: Stephen V. Rice
  6 |  *  
  7 |  * Copyright 1996 The Board of Regents of the Nevada System of Higher
  8 |  * Education, on behalf, of the University of Nevada, Las Vegas,
  9 |  * Information Science Research Institute
 10 |  *
 11 |  * Licensed under the Apache License, Version 2.0 (the "License"); you
 12 |  * may not use this file except in compliance with the License.  You
 13 |  * may obtain a copy of the License at
 14 |  *
 15 |  *    http://www.apache.org/licenses/LICENSE-2.0
 16 |  *
 17 |  * Unless required by applicable law or agreed to in writing, software
 18 |  * distributed under the License is distributed on an "AS IS" BASIS,
 19 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 20 |  * implied. See the License for the specific language governing
 21 |  * permissions and limitations under the License.
 22 |  *
 23 |  **********************************************************************/
 24 | 
 25 | #include "sort.h"
 26 | #include "table.h"
 27 | #include "text.h"
 28 | 
/* Usage text handed to initialize(). */
#define usage  "[-n 1|2|3] textfile1 textfile2 ... >resultfile"

/* Largest supported n-gram length; also the size of the per-sequence
   value arrays below. */
#define MAX_N  3
/* n-gram length in effect; assigned from get_n() in main() */
short n;
/* argument of the -n option; get_n() treats NULL as "not given" */
char *nstring;

/* Command line options: -n takes a string value, stored in "nstring".
   The entry with name '\0' terminates the table. */
Option option[] =
{
    'n', &nstring, NULL,
    '\0'
};

/* Options passed to read_text().
   NOTE(review): the meaning of each initializer depends on the member
   order of Textopt, which is declared elsewhere -- confirm there. */
Textopt textopt = { True, True, 0, True, True };

/* The text file currently being processed; emptied and refilled by
   process_file(). */
Text text;

/* Table entry for one distinct n-gram: the character values that make
   it up, how often it occurred, and how many of those occurrences
   contained a suspect character.  The "key" member used by
   add_sequence() is presumably supplied by BEGIN_ENTRY -- confirm in
   table.h. */
BEGIN_ENTRY(Sequence)
    Charvalue value[MAX_N];
    long count, suspect;
END_ENTRY(Sequence);

/* Table of all n-grams seen so far, with running totals over all
   entries. */
BEGIN_TABLE_OF(Sequence, Seqlist)
    long total_count, total_suspect;
END_TABLE(Seqtable);
Seqtable seqtable;
 54 | 
 55 | /**********************************************************************/
 56 | 
 57 | short get_n()
 58 | {
 59 |     if (!nstring)
 60 | 	return(1);
 61 |     if (nstring[0] >= '1' && nstring[0] <= '0' + MAX_N && !nstring[1])
 62 | 	return(nstring[0] - '0');
 63 |     error_string("invalid value", nstring);
 64 | }
 65 | /**********************************************************************/
 66 | 
 67 | void add_sequence(key, value, suspect)
 68 | char *key;
 69 | Charvalue value[];
 70 | Boolean suspect;
 71 | {
 72 |     Sequence *sequence;
 73 |     short i;
 74 |     sequence = table_lookup(&seqtable, key);
 75 |     if (!sequence)
 76 |     {
 77 | 	sequence = NEW(Sequence);
 78 | 	sequence->key = strdup(key);
 79 | 	for (i = 0; i < n; i++)
 80 | 	    sequence->value[i] = value[i];
 81 | 	table_insert(&seqtable, sequence);
 82 |     }
 83 |     sequence->count++;
 84 |     seqtable.total_count++;
 85 |     if (suspect)
 86 |     {
 87 | 	sequence->suspect++;
 88 | 	seqtable.total_suspect++;
 89 |     }
 90 | }
 91 | /**********************************************************************/
 92 | 
 93 | void process_file(filename)
 94 | char *filename;
 95 | {
 96 |     Char *start, *c;
 97 |     char key[MAX_N * STRING_SIZE], string[STRING_SIZE];
 98 |     Boolean suspect;
 99 |     short i;
100 |     Charvalue value[MAX_N];
101 |     list_empty(&text, free);
102 |     read_text(&text, filename, &textopt);
103 |     for (start = text.first; start; start = start->next)
104 |     {
105 | 	key[0] = '\0';
106 | 	suspect = False;
107 | 	for (i = 0, c = start; i < n; i++, c = c->next)
108 | 	{
109 | 	    if (!c)
110 | 		return;
111 | 	    char_to_string(False, c->value, string, True);
112 | 	    strcat(key, string);
113 | 	    value[i] = c->value;
114 | 	    if (c->suspect)
115 | 		suspect = True;
116 | 	}
117 | 	add_sequence(key, value, suspect);
118 |     }
119 | }
120 | /**********************************************************************/
121 | 
122 | int order_by_value(sequence1, sequence2)
123 | Sequence *sequence1, *sequence2;
124 | {
125 |     short i;
126 |     for (i = 0; i < n && sequence1->value[i] == sequence2->value[i]; i++);
127 |     if (i < n)
128 | 	return(sequence1->value[i] < sequence2->value[i] ? -1 : 1);
129 |     return(0);
130 | }
131 | /**********************************************************************/
132 | 
133 | int order_by_count(sequence1, sequence2)
134 | Sequence *sequence1, *sequence2;
135 | {
136 |     if (sequence1->count != sequence2->count)
137 | 	return(sequence2->count - sequence1->count);
138 |     if (sequence1->suspect != sequence2->suspect)
139 | 	return(sequence2->suspect - sequence1->suspect);
140 |     return(order_by_value(sequence1, sequence2));
141 | }
142 | /**********************************************************************/
143 | 
144 | void write_array()
145 | {
146 |     long i;
147 |     printf("   Count  Suspect\n");
148 |     for (i = 0; i < seqtable.count; i++)
149 | 	printf("%8ld %8ld   {%s}\n", seqtable.array[i]->count,
150 | 	seqtable.array[i]->suspect, seqtable.array[i]->key);
151 |     printf("%8ld %8ld   Total\n", seqtable.total_count,
152 |     seqtable.total_suspect);
153 | }
154 | /**********************************************************************/
155 | 
156 | void write_report()
157 | {
158 |     table_in_array(&seqtable);
159 |     sort(seqtable.count, seqtable.array, order_by_value);
160 |     write_array();
161 |     printf("\n\n");
162 |     sort(seqtable.count, seqtable.array, order_by_count);
163 |     write_array();
164 | }
165 | /**********************************************************************/
166 | 
167 | main(argc, argv)
168 | int argc;
169 | char *argv[];
170 | {
171 |     int i;
172 |     initialize(&argc, argv, usage, option);
173 |     if (argc == 0)
174 | 	error("no text files specified");
175 |     n = get_n();
176 |     for (i = 0; i < argc; i++)
177 | 	process_file(argv[i]);
178 |     write_report();
179 |     terminate();
180 | }
181 | 


--------------------------------------------------------------------------------
/src/synctext.c:
--------------------------------------------------------------------------------
  1 | /**********************************************************************
  2 |  *
  3 |  *  synctext.c
  4 |  *
  5 |  *  Author: Stephen V. Rice
  6 |  *  
  7 |  * Copyright 1996 The Board of Regents of the Nevada System of Higher
  8 |  * Education, on behalf, of the University of Nevada, Las Vegas,
  9 |  * Information Science Research Institute
 10 |  *
 11 |  * Licensed under the Apache License, Version 2.0 (the "License"); you
 12 |  * may not use this file except in compliance with the License.  You
 13 |  * may obtain a copy of the License at
 14 |  *
 15 |  *    http://www.apache.org/licenses/LICENSE-2.0
 16 |  *
 17 |  * Unless required by applicable law or agreed to in writing, software
 18 |  * distributed under the License is distributed on an "AS IS" BASIS,
 19 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 20 |  * implied. See the License for the specific language governing
 21 |  * permissions and limitations under the License.
 22 |  *
 23 |  **********************************************************************/
 24 | 
 25 | #include "sync.h"
 26 | 
 27 | #define usage  "[-H] [-i] [-s] [-T] textfile1 textfile2 ... >resultfile"
 28 | /* Positional Textopt fields: find_header, find_markers, suspect_marker (0 selects SUSPECT_MARKER), find_hex_values, normalize; the remaining fields start out zero. */
 29 | Textopt textopt = { True, True, 0, True, True };
 30 | /* Flags whose addresses appear in the option table below. */
 31 | Boolean heuristic, show_suspect, transpose;
 32 | /* NOTE(review): rows look like {option letter, NULL, address of the Boolean for that letter}, with '\0' ending the table -- confirm against the Option type. */
 33 | Option option[] =
 34 | {
 35 |     'H', NULL, &heuristic,
 36 |     'i', NULL, &textopt.case_insensitive,
 37 |     's', NULL, &show_suspect,
 38 |     'T', NULL, &transpose,
 39 |     '\0'
 40 | };
 41 | 
 42 | /**********************************************************************/
 43 | 
 44 | /* Writes a rule of 79 '=' characters followed by a newline to stdout. */
 45 | void write_separator()
 46 | {
 47 |     short remaining;
 48 |     for (remaining = 79; remaining > 0; remaining--)
 49 |         putchar('=');
 50 |     putchar(NEWLINE);
 51 | }
 51 | /**********************************************************************/
 52 | 
 53 | /* Writes one side of a transposed synchronization: a separator line and an
 54 |  * empty line, then the characters of every substring of "text" in synclist
 55 |  * order.  A substring whose Sync has a match number N is wrapped as
 56 |  * {N:...}; substrings without a match are written bare.  Characters are
 57 |  * rendered through char_to_string(), and the suspect flag is passed on only
 58 |  * when show_suspect is set. */
 59 | void write_transposed(synclist, text)
 60 | Synclist *synclist;
 61 | Text *text;
 62 | {
 63 |     char buffer[STRING_SIZE];
 64 |     Sync *cur;
 65 |     long pos;
 66 | 
 67 |     write_separator();
 68 |     putchar(NEWLINE);
 69 |     cur = synclist->first;
 70 |     while (cur)
 71 |     {
 72 |         if (cur->match)
 73 |             printf("{%ld:", *cur->match);
 74 |         pos = cur->substr->start;
 75 |         while (pos <= cur->substr->stop)
 76 |         {
 77 |             char_to_string(show_suspect & text->array[pos]->suspect,
 78 |                            text->array[pos]->value, buffer, False);
 79 |             printf("%s", buffer);
 80 |             pos++;
 81 |         }
 82 |         if (cur->match)
 83 |             putchar('}');
 84 |         cur = cur->next;
 85 |     }
 86 |     putchar(NEWLINE);
 87 | }
 77 | /**********************************************************************/
 78 | 
 79 | /* Writes the text the inputs agree on.  After a separator and an empty
 80 |  * line, each matched Sync contributes the characters of text[0] for that
 81 |  * substring, and each unmatched Sync contributes a footnote reference {k},
 82 |  * with k counting up from 1 in list order.  When show_suspect is set, a
 83 |  * matched character is rendered as suspect if any of the num_text texts
 84 |  * marks the character at that offset as suspect. */
 85 | void write_matches(synclist, num_text, text)
 86 | Synclist *synclist;
 87 | short num_text;
 88 | Text *text;
 89 | {
 90 |     char buffer[STRING_SIZE];
 91 |     Boolean flagged;
 92 |     long offset, t, footnote = 0;
 93 |     Sync *cur;
 94 | 
 95 |     write_separator();
 96 |     putchar(NEWLINE);
 97 |     for (cur = synclist->first; cur; cur = cur->next)
 98 |     {
 99 |         if (!cur->match)
100 |         {
101 |             printf("{%ld}", ++footnote);
102 |             continue;
103 |         }
104 |         for (offset = 0; offset < cur->substr[0].length; offset++)
105 |         {
106 |             flagged = False;
107 |             if (show_suspect)
108 |             {
109 |                 /* one suspect copy is enough; stop at the first hit */
110 |                 for (t = 0; t < num_text; t++)
111 |                     if (text[t].array[cur->substr[t].start + offset]->suspect)
112 |                     {
113 |                         flagged = True;
114 |                         break;
115 |                     }
116 |             }
117 |             char_to_string(flagged,
118 |                            text[0].array[cur->substr[0].start + offset]->value,
119 |                            buffer, False);
120 |             printf("%s", buffer);
121 |         }
122 |     }
123 |     putchar(NEWLINE);
124 | }
107 | /**********************************************************************/
108 | 
109 | /* Writes one numbered section per unmatched Sync.  Each section is a
110 |  * separator line, the footnote number as {k} (counting from 1 in list
111 |  * order), and then, for every input, its file name padded to the longest
112 |  * name followed by the differing characters in braces.  After a newline
113 |  * inside the braces the continuation is indented by the name width plus two
114 |  * columns so the text stays aligned.
115 |  *
116 |  * The column width is passed to printf() through "%-*s" instead of building
117 |  * a format string at run time, so no fixed-size format buffer can overflow
118 |  * and a width of zero is well defined. */
119 | void write_differences(synclist, num_text, text, filename)
120 | Synclist *synclist;
121 | short num_text;
122 | Text *text;
123 | char *filename[];
124 | {
125 |     long i, j, footnote = 0;
126 |     size_t namelen;
127 |     int width = 0;
128 |     char string[STRING_SIZE];
129 |     Sync *sync;
130 |     for (i = 0; i < num_text; i++)
131 |     {
132 |         namelen = strlen(filename[i]);
133 |         if (namelen > (size_t) width)
134 |             width = (int) namelen;
135 |     }
136 |     for (sync = synclist->first; sync; sync = sync->next)
137 |     {
138 |         if (sync->match)
139 |             continue;
140 |         write_separator();
141 |         printf("{%ld}\n", ++footnote);
142 |         for (i = 0; i < num_text; i++)
143 |         {
144 |             printf("%-*s {", width, filename[i]);
145 |             for (j = sync->substr[i].start; j <= sync->substr[i].stop; j++)
146 |             {
147 |                 char_to_string(show_suspect & text[i].array[j]->suspect,
148 |                                text[i].array[j]->value, string, False);
149 |                 printf("%s", string);
150 |                 if (text[i].array[j]->value == NEWLINE)
151 |                     printf("%-*s  ", width, "");
152 |             }
153 |             printf("}\n");
154 |         }
155 |     }
156 | }
145 | /**********************************************************************/
146 | 
147 | /* Entry point.  After initialize() has processed the options, argv[0..argc-1]
148 |  * are the text files.  At least two are required, and exactly two when the
149 |  * transpose flag ('T') is set.  All files are read with the shared textopt.
150 |  *   - transpose: transpose_sync() fills two sync lists, each written with
151 |  *     write_transposed() against its own text;
152 |  *   - otherwise: synchronize() is used when the heuristic flag ('H') is set
153 |  *     or there are more than two files, fastukk_sync() for the plain
154 |  *     two-file case; matches are written first, then the differences.
155 |  * A closing separator is written in both modes.
156 |  * The return type is spelled out because implicit int is not valid C99. */
157 | int main(int argc, char *argv[])
158 | {
159 |     Text *text;
160 |     int i;
161 |     Synclist synclist1, synclist2;
162 |     initialize(&argc, argv, usage, option);
163 |     if (argc < 2 || (transpose && argc > 2))
164 |         error("invalid number of text files");
165 |     text = NEW_ARRAY(argc, Text);
166 |     for (i = 0; i < argc; i++)
167 |         read_text(&text[i], argv[i], &textopt);
168 |     if (transpose)
169 |     {
170 |         transpose_sync(&synclist1, &synclist2, &text[0], &text[1]);
171 |         write_transposed(&synclist1, &text[0]);
172 |         write_transposed(&synclist2, &text[1]);
173 |     }
174 |     else
175 |     {
176 |         if (heuristic || argc > 2)
177 |             synchronize(&synclist1, argc, text);
178 |         else
179 |             fastukk_sync(&synclist1, text);
180 |         write_matches(&synclist1, argc, text);
181 |         write_differences(&synclist1, argc, text, argv);
182 |     }
183 |     write_separator();
184 |     terminate();
185 |     return(0);  /* reached only if terminate() returns */
186 | }
178 | 


--------------------------------------------------------------------------------
/src/text.h:
--------------------------------------------------------------------------------
  1 | /**********************************************************************
  2 |  *
  3 |  *  text.h
  4 |  *
  5 |  *  This module provides definitions and routines to support the reading
  6 |  *  and writing of OCR-generated text.  A "Text" structure is defined to
  7 |  *  be a linked list of "Char" structures, where each "Char" structure
  8 |  *  gives the value of one 32-bit Unicode character and indicates whether it
  9 |  *  is suspect.
 10 |  *
 11 |  *  Author: Stephen V. Rice (1996)
 12 |  *  Author: Eddie Antonio Santos (2015)
 13 |  *
 14 |  * Copyright 1996 The Board of Regents of the Nevada System of Higher
 15 |  * Education, on behalf, of the University of Nevada, Las Vegas,
 16 |  * Information Science Research Institute
 17 |  *
 18 |  * Licensed under the Apache License, Version 2.0 (the "License"); you
 19 |  * may not use this file except in compliance with the License.  You
 20 |  * may obtain a copy of the License at
 21 |  *
 22 |  *    http://www.apache.org/licenses/LICENSE-2.0
 23 |  *
 24 |  * Unless required by applicable law or agreed to in writing, software
 25 |  * distributed under the License is distributed on an "AS IS" BASIS,
 26 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 27 |  * implied. See the License for the specific language governing
 28 |  * permissions and limitations under the License.
 29 |  *
 30 |  **********************************************************************/
 31 | 
 32 | #ifndef _TEXT_
 33 | #define _TEXT_
 34 | 
 35 | #include <stdint.h>
 36 | 
 37 | #include "list.h"
 38 | #include "util.h"
 39 | 
 40 | #define BLANK               ' '
 41 | #define NEWLINE             '\n'
 42 | #define LINE_TABULATION     0x00B
 43 | #define FORM_FEED           0x00C
 44 | #define CARRIAGE_RETURN     0x00D
 45 | #define NON_BREAKING_SPACE  0x0A0
 46 | #define REJECT_CHARACTER    '~'
 47 | #define SUSPECT_MARKER      '^'
 48 | #define COMBINING_MARK_BASE ((Charvalue) 0x25CC) /* ◌ -- Dotted circle */
 49 | #define INVALID_CHARVALUE   ((Charvalue) (-1))
 50 | 
 51 | /* Use UTF-32 internally. */
 52 | typedef uint32_t Charvalue;
 53 | /* Technically, there are far fewer *scalar values* (what you and I call
 54 |  * "characters") in Unicode, but using the max number of *code points* here
 55 |  * makes the implementation simpler. See: http://www.unicode.org/glossary/ */
 56 | #define NUM_CHARVALUES  0x10FFFF /* NOTE(review): this is the highest code point, one less than the number of code points (0x110000); confirm that tables of this size never need a slot for U+10FFFF */
 57 | 
 58 | /* Maximum size, in chars, of the result of a single char_to_string() call,
 59 |  * including the null-terminator. It's either the largest size of an encoded
 60 |  * non-graphic character (this happens to be U+10FFFF) OR the size of an
 61 |  * astral (non-BMP) combining character that combines on top of U+25CC
 62 |  * DOTTED CIRCLE WITH a suspect marker! */
 63 | #define STRING_SIZE (max(sizeof("^◌𐇽"), sizeof("<10FFFF>")))
 64 | 
 65 | BEGIN_ITEM(Char)        /* one character of text */
 66 |     Boolean suspect;    /* whether the character is marked as suspect */
 67 |     Charvalue value;    /* its Unicode value (UTF-32, see Charvalue) */
 68 | END_ITEM(Char);
 69 | 
 70 | BEGIN_LIST_OF(Char)     /* a Text is a list of Char items */
 71 | END_LIST(Text);
 72 | 
 73 | void append_char(/* Text *text, Boolean suspect, Charvalue value */);
 74 |                         /* appends the given character to "text" */
 75 | 
 76 | typedef                 /* options handed to read_text(); see each field */
 77 | struct
 78 | {
 79 |     Boolean find_header;/* if True, a header will be looked for and skipped if
 80 |                            present; if False and a header is present, it will be
 81 |                            stored as text */
 82 |     Boolean find_markers;
 83 |                         /* if True, any occurrence of the "suspect_marker"
 84 |                            character will be interpreted as marking the
 85 |                            following character as suspect */
 86 | 
 87 |     Charvalue suspect_marker;
 88 |                         /* applicable when "find_markers" is True; if zero,
 89 |                            SUSPECT_MARKER will be used */
 90 |     Boolean find_hex_values;
 91 |                         /* deprecated and silently ignored */
 92 |     Boolean normalize;  /* if True, spacing is compressed */
 93 |     Boolean case_insensitive;
 94 |                         /* if True, letters are converted to lower-case */
 95 |     Boolean found_header;
 96 |                         /* output field: set to True if a header was found;
 97 |                            applicable when "find_header" is True */
 98 | } Textopt;
 99 | 
100 | void read_text(/* Text *text, char *filename, Textopt *textopt */);
101 |                         /* reads the named file (or stdin if "filename" is NULL
102 |                            and "textopt->find_header" is False) based on the
103 |                            options specified in "textopt", and appends each
104 |                            character to "text"; reports an error and quits if
105 |                            unable to open the file */
106 | 
107 | void char_to_string(/* Boolean suspect, Charvalue value, char *string,
108 |                        Boolean fake_newline */);
109 |                         /* stores a representation of the given character in
110 |                            "string", which must be at least STRING_SIZE bytes; a
111 |                            non-printable character is represented by a hex value
112 |                            of the form <FF> or <FFFF> (STRING_SIZE allows for up to <10FFFF>); if "fake_newline" is
113 |                            True, the newline character is represented by <\n>,
114 |                            which is desirable for some reports */
115 | 
116 | signed char encode_or_die(/* Charvalue value, char *string */);
117 |                         /* writes a Unicode value to the given string;
118 |                          * exits if the value cannot be written;
119 |                          * returns the number of chars written to string */
120 | 
121 | Boolean cstring_to_text(Text* text, const char *string);
122 |                         /* appends the UTF-8 string to the text list;
123 |                          * exits if the value cannot be written.  NOTE(review): the meaning of the Boolean result is not stated here. */
124 | 
125 | 
126 | void write_text(/* Text *text, char *filename,
127 |                    void (*write_header)(FILE *f) */);
128 |                         /* writes each character of "text" to the named file
129 |                            (or stdout if "filename" is NULL) using
130 |                            "char_to_string" to represent the characters;
131 |                            if "write_header" is non-NULL, this routine is
132 |                            called to write a header to the file; reports an
133 |                            error and quits if unable to create the file */
134 | 
135 | #endif
136 | 


--------------------------------------------------------------------------------
/src/charclass.c:
--------------------------------------------------------------------------------
  1 | /**********************************************************************
  2 |  *
  3 |  *  charclass.c
  4 |  *
  5 |  *  Author: Stephen V. Rice
  6 |  *  
  7 |  * Copyright 1996 The Board of Regents of the Nevada System of Higher
  8 |  * Education, on behalf, of the University of Nevada, Las Vegas,
  9 |  * Information Science Research Institute
 10 |  *
 11 |  * Licensed under the Apache License, Version 2.0 (the "License"); you
 12 |  * may not use this file except in compliance with the License.  You
 13 |  * may obtain a copy of the License at
 14 |  *
 15 |  *    http://www.apache.org/licenses/LICENSE-2.0
 16 |  *
 17 |  * Unless required by applicable law or agreed to in writing, software
 18 |  * distributed under the License is distributed on an "AS IS" BASIS,
 19 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 20 |  * implied. See the License for the specific language governing
 21 |  * permissions and limitations under the License.
 22 |  *
 23 |  **********************************************************************/
 24 | 
 25 | #include "charclass.h"
 26 | 
 27 | static short num_classes;               /* class names registered so far; 0 until initialize_charclass() has run */
 28 | static char *class_name[MAX_CHARCLASSES]; /* class_name[c] is the name of class c */
 29 | static Charclass class[NUM_CHARVALUES]; /* class[v] is the class of code point v; slots no range covers stay 0 */
 30 | /* Code point ranges (inclusive) and the class name each belongs to.  Rows sharing a name share a class; classes are numbered in order of first appearance. */
 31 | static struct range
 32 | {
 33 |     Charvalue start, stop;
 34 |     char *name;
 35 | } range[] =
 36 | {
 37 |     0x0000, 0x0000, "Unassigned", /* listed first so "Unassigned" becomes class 0, the value of every slot no row covers; 0x0000 itself is reclassified by the next row */
 38 |     0x0000, 0x0008, "ASCII Control Codes",
 39 |     0x0009, 0x000D, "ASCII Spacing Characters",
 40 |     0x000E, 0x001F, "ASCII Control Codes",
 41 |     0x0020, 0x0020, "ASCII Spacing Characters",
 42 |     0x0021, 0x002F, "ASCII Special Symbols",
 43 |     0x0030, 0x0039, "ASCII Digits",
 44 |     0x003A, 0x0040, "ASCII Special Symbols",
 45 |     0x0041, 0x005A, "ASCII Uppercase Letters",
 46 |     0x005B, 0x0060, "ASCII Special Symbols",
 47 |     0x0061, 0x007A, "ASCII Lowercase Letters",
 48 |     0x007B, 0x007E, "ASCII Special Symbols",
 49 |     0x007F, 0x007F, "ASCII Control Codes",
 50 |     0x0080, 0x009F, "Latin1 Control Codes",
 51 |     0x00A0, 0x00A0, "Latin1 Spacing Characters",
 52 |     0x00A1, 0x00BF, "Latin1 Special Symbols",
 53 |     0x00C0, 0x00D6, "Latin1 Uppercase Letters",
 54 |     0x00D7, 0x00D7, "Latin1 Special Symbols",
 55 |     0x00D8, 0x00DE, "Latin1 Uppercase Letters",
 56 |     0x00DF, 0x00F6, "Latin1 Lowercase Letters",
 57 |     0x00F7, 0x00F7, "Latin1 Special Symbols",
 58 |     0x00F8, 0x00FF, "Latin1 Lowercase Letters",
 59 |     0x0100, 0x017F, "Latin Extended-A",
 60 |     0x0180, 0x024F, "Latin Extended-B",
 61 |     0x0250, 0x02AF, "IPA Extensions",
 62 |     0x02B0, 0x02FF, "Spacing Modifier Letters",
 63 |     0x0300, 0x036F, "Combining Diacritical Marks",
 64 |     0x0370, 0x03CF, "Basic Greek",
 65 |     0x03D0, 0x03FF, "Greek Symbols and Coptic",
 66 |     0x0400, 0x04FF, "Cyrillic",
 67 |     0x0530, 0x058F, "Armenian",
 68 |     0x0590, 0x05CF, "Hebrew Extended-A",
 69 |     0x05D0, 0x05EA, "Basic Hebrew",
 70 |     0x05EB, 0x05FF, "Hebrew Extended-B",
 71 |     0x0600, 0x0652, "Basic Arabic",
 72 |     0x0653, 0x06FF, "Arabic Extended",
 73 |     0x0900, 0x097F, "Devanagari",
 74 |     0x0980, 0x09FF, "Bengali",
 75 |     0x0A00, 0x0A7F, "Gurmukhi",
 76 |     0x0A80, 0x0AFF, "Gujarati",
 77 |     0x0B00, 0x0B7F, "Oriya",
 78 |     0x0B80, 0x0BFF, "Tamil",
 79 |     0x0C00, 0x0C7F, "Telugu",
 80 |     0x0C80, 0x0CFF, "Kannada",
 81 |     0x0D00, 0x0D7F, "Malayalam",
 82 |     0x0E00, 0x0E7F, "Thai",
 83 |     0x0E80, 0x0EFF, "Lao",
 84 |     0x10A0, 0x10CF, "Georgian Extended",
 85 |     0x10D0, 0x10FF, "Basic Georgian",
 86 |     0x1100, 0x11FF, "Hanguljamo",
 87 |     0x1E00, 0x1EFF, "Latin Extended Additional",
 88 |     0x1F00, 0x1FFF, "Greek Extended",
 89 |     0x2000, 0x206F, "General Punctuation",
 90 |     0x2070, 0x209F, "Superscripts and Subscripts",
 91 |     0x20A0, 0x20CF, "Currency Symbols",
 92 |     0x20D0, 0x20FF, "Combining Diacritical Marks for Symbols",
 93 |     0x2100, 0x214F, "Letterlike Symbols",
 94 |     0x2150, 0x218F, "Number Forms",
 95 |     0x2190, 0x21FF, "Arrows",
 96 |     0x2200, 0x22FF, "Mathematical Operators",
 97 |     0x2300, 0x23FF, "Miscellaneous Technical",
 98 |     0x2400, 0x243F, "Control Pictures",
 99 |     0x2440, 0x245F, "Optical Character Recognition",
100 |     0x2460, 0x24FF, "Enclosed Alphanumerics",
101 |     0x2500, 0x257F, "Box Drawing",
102 |     0x2580, 0x259F, "Block Elements",
103 |     0x25A0, 0x25FF, "Geometric Shapes",
104 |     0x2600, 0x26FF, "Miscellaneous Symbols",
105 |     0x2700, 0x27BF, "Dingbats",
106 |     0x3000, 0x303F, "CJK Symbols and Punctuation",
107 |     0x3040, 0x309F, "Hiragana",
108 |     0x30A0, 0x30FF, "Katakana",
109 |     0x3100, 0x312F, "Bopomofo",
110 |     0x3130, 0x318F, "Hangul Compatibility Jamo",
111 |     0x3190, 0x319F, "CJK Miscellaneous",
112 |     0x3200, 0x32FF, "Enclosed CJK Letters and Months",
113 |     0x3300, 0x33FF, "CJK Compatibility",
114 |     0x3400, 0x3D2D, "Hangul",
115 |     0x3D2E, 0x44B7, "Hangul Supplementary-A",
116 |     0x44B8, 0x4DFF, "Hangul Supplementary-B",
117 |     0x4E00, 0x9FFF, "CJK Unified Ideographs",
118 |     0xE000, 0xF8FF, "Private Use Area",
119 |     0xF900, 0xFAFF, "CJK Compatibility Ideographs",
120 |     0xFB00, 0xFB4F, "Alphabetic Presentation Forms",
121 |     0xFB50, 0xFDFF, "Arabic Presentation Forms-A",
122 |     0xFE20, 0xFE2F, "Combining Half Marks",
123 |     0xFE30, 0xFE4F, "CJK Compatibility Forms",
124 |     0xFE50, 0xFE6F, "Small Form Variants",
125 |     0xFE70, 0xFEFE, "Arabic Presentation Forms-B",
126 |     0xFF00, 0xFFEF, "Halfwidth and Fullwidth Forms",
127 |     0xFFF0, 0xFFFD, "Specials"
128 | };
129 | 
130 | /**********************************************************************/
131 | 
132 | /* Builds the lookup tables from range[].  Each distinct range name receives
133 |  * the next free class number, in order of first appearance, and every code
134 |  * point in [start, stop] is stamped with the number belonging to its row's
135 |  * name.  Where rows overlap, the later row wins.  Calls error() if more
136 |  * than MAX_CHARCLASSES distinct names are present. */
137 | static void initialize_charclass()
138 | {
139 |     long row, id, value;
140 |     long num_ranges = sizeof(range) / sizeof(struct range);
141 | 
142 |     for (row = 0; row < num_ranges; row++)
143 |     {
144 |         /* look for a class that already carries this name */
145 |         id = 0;
146 |         while (id < num_classes &&
147 |                strcmp(range[row].name, class_name[id]) != 0)
148 |             id++;
149 | 
150 |         /* none found: register a new class at the end */
151 |         if (id == num_classes)
152 |         {
153 |             if (num_classes == MAX_CHARCLASSES)
154 |                 error("too many character classes");
155 |             num_classes++;
156 |             class_name[id] = range[row].name;
157 |         }
158 | 
159 |         value = range[row].start;
160 |         while (value <= range[row].stop)
161 |         {
162 |             class[value] = id;
163 |             value++;
164 |         }
165 |     }
166 | }
150 | /**********************************************************************/
151 | 
152 | /* Returns the class of code point "value", building the tables on first use.
153 |  * Values that cannot index the table (>= NUM_CHARVALUES, which includes
154 |  * INVALID_CHARVALUE) are reported as class 0 -- "Unassigned", the class of
155 |  * every code point that no range covers -- instead of reading past the end
156 |  * of class[]. */
157 | Charclass charclass(value)
158 | Charvalue value;
159 | {
160 |     if (num_classes == 0)
161 |         initialize_charclass();
162 |     if (value >= NUM_CHARVALUES)
163 |         return(0);
164 |     return(class[value]);
165 | }
159 | /**********************************************************************/
160 | 
161 | /* Returns the name registered for "class", building the tables on first
162 |  * use.  error() is called when "class" is not below the number of
163 |  * registered classes. */
164 | char *charclass_name(class)
165 | Charclass class;
166 | {
167 |     if (num_classes == 0)
168 |         initialize_charclass();
169 |     if (class < num_classes)
170 |         return(class_name[class]);
171 |     error("invalid character class");
172 |     return(class_name[class]);  /* reached only if error() returns */
173 | }
170 | 


--------------------------------------------------------------------------------
/test/word_test.c:
--------------------------------------------------------------------------------
  1 | #include "greatest.h"
  2 | #include "test_utils.h"
  3 | 
  4 | #include <word.h>
  5 | 
  6 | static Wordlist wordlist_;              /* storage for the list under test */
  7 | static Wordlist *wordlist = &wordlist_; /* what the tests pass to find_words() */
  8 | 
  9 | /* Ordinal aliases for walking the list: wordlist->third expands to wordlist->first->next->next. */
 10 | /* ALWAYS assert wordlist->count before using an alias; an alias beyond the last word walks off the end of the list. */
 11 | #define second  first->next
 12 | #define third   first->next->next
 13 | #define fourth  first->next->next->next
 14 | #define fifth   first->next->next->next->next
 15 | #define sixth   first->next->next->next->next->next
 16 | #define seventh first->next->next->next->next->next->next
 17 | #define eighth  first->next->next->next->next->next->next->next
 18 | #define ninth   first->next->next->next->next->next->next->next->next
 19 | 
 20 | /* A lone alphanumeric ASCII token must come back as exactly one word. */
 21 | TEST find_words_segments_a_single_ascii_word() {
 22 |     cstring_to_text(text, "C11");
 23 |     find_words(wordlist, text);
 24 | 
 25 |     ASSERT_EQ_FMT(1, wordlist->count, "%d");
 26 |     ASSERT_STR_EQ("C11", wordlist->first->string);
 27 |     PASS();
 28 | }
 29 | /* Decomposed input ("pho" plus two combining marks) must come back as the precomposed spelling held in pho_nfc. */
 30 | TEST find_words_returns_nfc() {
 31 |     char pho_nfc[] = { 'p', 'h', 0xE1, 0xBB, 0x9F, 0 };
 32 |     /* Decomposed form: the base letters followed by two combining characters. */
 33 |     cstring_to_text(text, (char []) { 'p', 'h', 'o',
 34 |                                       0xCC, 0x9B, /* ◌̛ */
 35 |                                       0xCC, 0x89, /* ◌̉ */
 36 |                                       0 });
 37 |     find_words(wordlist, text);
 38 | 
 39 |     ASSERT_EQ_FMT(1, wordlist->count, "%d");
 40 |     ASSERT_STR_EQ(pho_nfc, wordlist->first->string);
 41 |     PASS();
 42 | }
 43 | /* Input without any word-like content must produce an empty word list. */
 44 | TEST find_words_returns_zero_when_not_given_words() {
 45 |     /* Only punctuation, symbols and whitespace -- no letters or digits. */
 46 |     cstring_to_text(text, "#$#@! #@!\n#@!!!$#");
 47 |     find_words(wordlist, text);
 48 | 
 49 |     ASSERT_EQ_FMT(0, wordlist->count, "%d");
 50 |     PASS();
 51 | }
 52 | 
 53 | 
 54 | /* Exercises mostly-ASCII English with punctuation: "can’t" and "32.3" must each stay one word. */
 55 | TEST find_words_segments_english_with_punctuation() {
 56 |     /* From: http://unicode.org/reports/tr29/#Word_Boundaries */
 57 |     cstring_to_text(text, "The quick (\"brown\") fox can’t jump 32.3 feet, "
 58 |                           "right?");
 59 |     find_words(wordlist, text);
 60 | 
 61 |     ASSERT_EQ_FMT(9, wordlist->count, "%d"); /* The quick brown fox can’t jump 32.3 feet right */
 62 |     ASSERT_STR_EQ("quick", wordlist->second->string);
 63 |     ASSERT_STR_EQ("brown", wordlist->third->string);
 64 |     ASSERT_STR_EQ("fox",   wordlist->fourth->string);
 65 |     ASSERT_STR_EQ("can’t", wordlist->fifth->string);
 66 |     ASSERT_STR_EQ("jump",  wordlist->sixth->string);
 67 |     ASSERT_STR_EQ("32.3",  wordlist->seventh->string);
 68 |     ASSERT_STR_EQ("right",  wordlist->ninth->string);
 69 | 
 70 |     PASS();
 71 | }
 72 | 
 73 | /* Exercises Latin-1 letters (ñ) and punctuation (¡): only the three words are kept. */
 74 | TEST find_words_segments_spanish_words() {
 75 |     cstring_to_text(text, "¡Feliz año nuevo!");
 76 |     find_words(wordlist, text);
 77 | 
 78 |     ASSERT_EQ_FMT(3, wordlist->count, "%d");
 79 |     ASSERT_STR_EQ("Feliz", wordlist->first->string);
 80 |     ASSERT_STR_EQ("año", wordlist->second->string);
 81 |     ASSERT_STR_EQ("nuevo", wordlist->third->string);
 82 | 
 83 |     PASS();
 84 | }
 85 | 
 86 | /* Exercises numeric processing: "1,75" stays one word, '-' splits PLASTIK-KARTON, '%' and '*' are dropped. */
 87 | TEST find_words_segments_numerals() {
 88 |     /* From https://github.com/eddieantonio/ocreval/issues/3 */
 89 |     cstring_to_text(text, "PLASTIK-KARTON BARDA %18 *1,75");
 90 |     find_words(wordlist, text);
 91 | 
 92 |     ASSERT_EQ_FMT(5, wordlist->count, "%d");
 93 |     ASSERT_STR_EQ("PLASTIK", wordlist->first->string);
 94 |     ASSERT_STR_EQ("KARTON", wordlist->second->string);
 95 |     ASSERT_STR_EQ("BARDA", wordlist->third->string);
 96 |     ASSERT_STR_EQ("18", wordlist->fourth->string);
 97 |     ASSERT_STR_EQ("1,75", wordlist->fifth->string);
 98 | 
 99 |     PASS();
100 | }
101 | 
102 | /* Exercises text written without spaces between words (katakana, hiragana and kanji). */
103 | TEST find_words_segments_japanese() {
104 |     /* This phrase -- rōkaraizu no densetsu 'Legend of Localization' -- is
105 |      * conveniently written in katakana, hiragana, and kanji, respectively. */
106 |     cstring_to_text(text, "ローカライズの伝説");
107 |     find_words(wordlist, text);
108 | 
109 |     ASSERT_EQ_FMT(4, wordlist->count, "%d");
110 |     ASSERT_STR_EQ("ローカライズ", wordlist->first->string);
111 |     ASSERT_STR_EQ("の", wordlist->second->string);
112 |     /* A Japanese-tailored algorithm would segment this into three words
113 |      * instead of four; however, that would involve incorporating a Japanese
114 |      * dictionary in order to look up kanji words... */
115 |     ASSERT_STR_EQ("伝", wordlist->third->string);
116 |     ASSERT_STR_EQ("説", wordlist->fourth->string);
117 | 
118 |     PASS();
119 | }
120 | /* Exercises modifier letters and combining marks inside words; the seventh word is counted but not checked. */
121 | TEST find_words_segments_haida_words() {
122 |     cstring_to_text(text, "Wᴀˊstᴀ haˊoîsîn ᵋāl ʟēˊłas ʟ̣ū haoîsîˊn\n"
123 |                           "l’ sᵋaiˊᵋänᴀn.");
124 |     find_words(wordlist, text);
125 | 
126 |     ASSERT_EQ_FMT(8, wordlist->count, "%d");
127 |     ASSERT_STR_EQ("Wᴀˊstᴀ",     wordlist->first->string);
128 |     ASSERT_STR_EQ("haˊoîsîn",   wordlist->second->string);
129 |     ASSERT_STR_EQ("ᵋāl",        wordlist->third->string);
130 |     ASSERT_STR_EQ("ʟēˊłas",     wordlist->fourth->string);
131 |     ASSERT_STR_EQ("ʟ̣ū",         wordlist->fifth->string);
132 |     ASSERT_STR_EQ("haoîsîˊn",   wordlist->sixth->string);
133 |     ASSERT_STR_EQ("sᵋaiˊᵋänᴀn", wordlist->eighth->string);
134 | 
135 |     PASS();
136 | }
137 | 
138 | /* Regression test -- word segmentation used to segfault on control characters.
139 |  * See: https://github.com/eddieantonio/ocreval/issues/22#issuecomment-491448129 */
140 | TEST find_words_control_character() {
141 |     cstring_to_text(text, "\003"); /* a single control character (code 3) */
142 |     find_words(wordlist, text);
143 | 
144 |     ASSERT_EQ_FMT(0, wordlist->count, "%d");
145 | 
146 |     PASS();
147 | }
148 | 
149 | /* The ordinal aliases are only meant for the tests above. */
150 | #undef second
151 | #undef third
152 | #undef fourth
153 | #undef fifth
154 | #undef sixth
155 | #undef seventh
156 | #undef eighth
157 | #undef ninth
158 | /* Setup hook: initializes "text" (passed as a NULL-terminated array) and the word list; the argument is not used. */
159 | static void setup_find_words(void *unused) {
160 |     initialize_texts((Text*[]) {text, NULL});
161 |     list_initialize(wordlist);
162 | }
163 | /* Teardown hook: releases "text" and empties the word list, passing free_word to list_empty(); the argument is not used. */
164 | static void teardown_find_words(void *unused) {
165 |     deinitialize_texts((Text*[]) {text, NULL});
166 |     list_empty(wordlist, free_word);
167 | }
168 | /* Suite of all find_words() tests; the setup and teardown hooks registered first are run by greatest around each test. */
169 | SUITE(find_words_suite) {
170 |     SET_SETUP(setup_find_words, NULL);
171 |     SET_TEARDOWN(teardown_find_words, NULL);
172 | 
173 |     RUN_TEST(find_words_segments_a_single_ascii_word);
174 |     RUN_TEST(find_words_returns_nfc);
175 |     RUN_TEST(find_words_returns_zero_when_not_given_words);
176 |     RUN_TEST(find_words_segments_english_with_punctuation);
177 |     RUN_TEST(find_words_segments_spanish_words);
178 |     RUN_TEST(find_words_segments_haida_words);
179 |     RUN_TEST(find_words_segments_numerals);
180 |     RUN_TEST(find_words_segments_japanese);
181 |     RUN_TEST(find_words_control_character);
182 | }
183 | 


--------------------------------------------------------------------------------
/src/editop.c:
--------------------------------------------------------------------------------
  1 | /**********************************************************************
  2 |  *
  3 |  *  editop.c
  4 |  *
  5 |  *  Author: Stephen V. Rice
  6 |  *  
  7 |  * Copyright 1996 The Board of Regents of the Nevada System of Higher
  8 |  * Education, on behalf, of the University of Nevada, Las Vegas,
  9 |  * Information Science Research Institute
 10 |  *
 11 |  * Licensed under the Apache License, Version 2.0 (the "License"); you
 12 |  * may not use this file except in compliance with the License.  You
 13 |  * may obtain a copy of the License at
 14 |  *
 15 |  *    http://www.apache.org/licenses/LICENSE-2.0
 16 |  *
 17 |  * Unless required by applicable law or agreed to in writing, software
 18 |  * distributed under the License is distributed on an "AS IS" BASIS,
 19 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 20 |  * implied. See the License for the specific language governing
 21 |  * permissions and limitations under the License.
 22 |  *
 23 |  **********************************************************************/
 24 | 
 25 | #include "edorpt.h"
 26 | #include "sync.h"
 27 | 
 28 | #define usage  "correctfile generatedfile [editop_report]"
 29 | /* When set, the sync list is printed at each step of count_moves(). NOTE(review): the 'D' option below does not appear in the usage string. */
 30 | Boolean debug;
 31 | 
 32 | Option option[] =
 33 | {
 34 |     'D', NULL, &debug,
 35 |     '\0'
 36 | };
 37 | /* Positional Textopt fields: find_header, find_markers, suspect_marker (0 selects SUSPECT_MARKER), find_hex_values, normalize; the remaining fields start out zero. */
 38 | Textopt textopt = { True, True, 0, True, True };
 39 | 
 40 | Text text1, text2;      /* text1 is read from argv[0], the correct file */
 41 | /* Running totals (insertions, deletions, moves) updated by the count_* routines and perform_move(). */
 42 | Edodata edodata;
 43 | /* Scratch entry built by find_candidates(), indexed by match number: candidate[m].sync is the Sync whose match number is m. */
 44 | typedef
 45 | struct
 46 | {
 47 |     Boolean for_insertion, for_deletion; /* each flag that is set adds one to the reduction find_move() credits this entry with */
 48 |     Sync *sync;
 49 | } Candidate;
 50 | 
 51 | /**********************************************************************/
 52 | 
 53 | /* Debug aid: prints "label" left-justified in nine columns and a colon,
 54 |  * then " {match:length}" for every entry of the list, then two newlines.
 55 |  * Every entry must have a non-NULL match, since it is dereferenced. */
 56 | void display(synclist, label)
 57 | Synclist *synclist;
 58 | char *label;
 59 | {
 60 |     Sync *cur = synclist->first;
 61 |     printf("%-9s:", label);
 62 |     while (cur)
 63 |     {
 64 |         printf(" {%ld:%ld}", *cur->match, cur->substr->length);
 65 |         cur = cur->next;
 66 |     }
 67 |     printf("\n\n");
 68 | }
 63 | /**********************************************************************/
 64 | 
 65 | /* Unlinks "sync" from the list and releases it together with its substr
 66 |  * and, when present, its match number. */
 67 | void discard_sync(synclist, sync)
 68 | Synclist *synclist;
 69 | Sync *sync;
 70 | {
 71 |     list_remove(synclist, sync);
 72 |     free(sync->substr);
 73 |     free(sync->match);  /* may be NULL; free(NULL) does nothing */
 74 |     free(sync);
 75 | }
 75 | /**********************************************************************/
 76 | 
 77 | /* For every unmatched entry of the list, adds to edodata.total_insertions
 78 |  * the number of characters of "text" in [start, stop] that are not
 79 |  * REJECT_CHARACTER. */
 80 | void count_insertions(synclist, text)
 81 | Synclist *synclist;
 82 | Text *text;
 83 | {
 84 |     Sync *cur;
 85 |     long pos;
 86 |     for (cur = synclist->first; cur; cur = cur->next)
 87 |     {
 88 |         if (cur->match)
 89 |             continue;
 90 |         pos = cur->substr->start;
 91 |         while (pos <= cur->substr->stop)
 92 |         {
 93 |             if (text->array[pos]->value != REJECT_CHARACTER)
 94 |                 edodata.total_insertions++;
 95 |             pos++;
 96 |         }
 97 |     }
 98 | }
 89 | /**********************************************************************/
 90 | 
 91 | /* Adds the substring length of every unmatched entry to
 92 |  * edodata.total_deletions and discards that entry, so only matched entries
 93 |  * are left in the list afterwards. */
 94 | void count_deletions(synclist)
 95 | Synclist *synclist;
 96 | {
 97 |     Sync *cur, *successor;
 98 |     for (cur = synclist->first; cur; cur = successor)
 99 |     {
100 |         /* remember the successor first: cur may be freed below */
101 |         successor = cur->next;
102 |         if (cur->match)
103 |             continue;
104 |         edodata.total_deletions += cur->substr->length;
105 |         discard_sync(synclist, cur);
106 |     }
107 | }
107 | /**********************************************************************/
108 | 
109 | /* Subtracts one from every match number in the list that is greater than
110 |  * "limit".  Every entry must have a non-NULL match. */
111 | void decrement_match(synclist, limit)
112 | Synclist *synclist;
113 | long limit;
114 | {
115 |     Sync *cur = synclist->first;
116 |     while (cur)
117 |     {
118 |         if (*cur->match > limit)
119 |             --*cur->match;
120 |         cur = cur->next;
121 |     }
122 | }
118 | /**********************************************************************/
119 | 
120 | /* Merges runs of list neighbours whose match numbers are consecutive
121 |  * (m followed directly by m + 1).  The follower's length is added to the
122 |  * first entry, the follower is discarded and every match number above m is
123 |  * lowered by one.  The same entry is then checked against its new
124 |  * neighbour.  With debug set, the resulting list is displayed. */
125 | void combine_adjacent(synclist)
126 | Synclist *synclist;
127 | {
128 |     Sync *cur = synclist->first, *follower;
129 |     while (cur)
130 |     {
131 |         follower = cur->next;
132 |         if (!follower || *follower->match != *cur->match + 1)
133 |         {
134 |             cur = follower;     /* nothing to merge here; advance */
135 |             continue;
136 |         }
137 |         cur->substr->length += follower->substr->length;
138 |         discard_sync(synclist, follower);
139 |         decrement_match(synclist, *cur->match);
140 |     }
141 |     if (debug)
142 |         display(synclist, "combined");
143 | }
140 | /**********************************************************************/
141 | 
142 | /* Builds a table indexed by match number (count + 1 slots, slot 0 unused
143 |  * by find_move()) describing every entry of the list:
144 |  *   - table[m].sync is the entry whose match number is m;
145 |  *   - table[m + 1].for_insertion is set when entry m is directly followed
146 |  *     by entry m + 2, i.e. m + 1 would fit exactly between them;
147 |  *   - table[x].for_deletion is set for the entry x sitting between m and
148 |  *     m + 1, i.e. taking x away would make them neighbours.
149 |  * The caller frees the table.
150 |  * NOTE(review): the flags are only ever set to True here, so this relies on
151 |  * NEW_ARRAY handing back zeroed storage -- confirm. */
152 | Candidate *find_candidates(synclist)
153 | Synclist *synclist;
154 | {
155 |     Candidate *table = NEW_ARRAY(synclist->count + 1, Candidate);
156 |     Sync *cur, *after, *after2;
157 |     for (cur = synclist->first; cur; cur = cur->next)
158 |     {
159 |         after = cur->next;
160 |         after2 = after ? after->next : NULL;
161 |         if (after && *after->match == *cur->match + 2)
162 |             table[*cur->match + 1].for_insertion = True;
163 |         if (after2 && *after2->match == *cur->match + 1)
164 |             table[*after->match].for_deletion = True;
165 |         table[*cur->match].sync = cur;
166 |     }
167 |     return(table);
168 | }
159 | /**********************************************************************/
160 | 
161 | /* Chooses the entry to move next.  Each entry scores a reduction of 1,
162 |  * plus 1 if it is flagged for_insertion, plus 1 if flagged for_deletion.
163 |  * The highest reduction wins; between equal reductions the shorter
164 |  * substring wins; between equal lengths the lower match number wins.
165 |  * Returns the chosen Sync, or NULL when the list is empty.  The selection
166 |  * variables are initialized so that no indeterminate value is ever read or
167 |  * used as an index. */
168 | Sync *find_move(synclist, candidate)
169 | Synclist *synclist;
170 | Candidate candidate[];
171 | {
172 |     long i, reduction;
173 |     long move_i = 0, move_length = 0, move_reduction = 0;
174 |     for (i = 1; i <= synclist->count; i++)
175 |     {
176 |         reduction = 1;
177 |         if (candidate[i].for_insertion)
178 |             reduction++;
179 |         if (candidate[i].for_deletion)
180 |             reduction++;
181 |         /* the first entry always wins here (reduction >= 1 > 0), so
182 |            move_length is only compared after it has been set */
183 |         if (reduction > move_reduction || (reduction == move_reduction &&
184 |         candidate[i].sync->substr->length < move_length))
185 |         {
186 |             move_i = i;
187 |             move_length = candidate[i].sync->substr->length;
188 |             move_reduction = reduction;
189 |         }
190 |     }
191 |     if (move_i == 0)
192 |         return(NULL);   /* empty list: nothing to move */
193 |     return(candidate[move_i].sync);
194 | }
183 | /**********************************************************************/
184 | 
185 | void perform_move(synclist, candidate, sync)
186 | Synclist *synclist;
187 | Candidate candidate[];
188 | Sync *sync;
189 | {
190 |     short length;
191 |     list_remove(synclist, sync);
192 |     if (*sync->match == 1)
193 | 	list_insert_before(synclist, sync, candidate[2].sync);
194 |     else
195 | 	list_insert_after(synclist, candidate[*sync->match - 1].sync, sync);
196 |     edodata.total_moves++;
197 |     length = min(sync->substr->length, MAX_MOVE_LENGTH);
198 |     edodata.moves[length]++;
199 |     if (debug)
200 |     {
201 | 	char label[20];
202 | 	sprintf(label, "moved %ld", *sync->match);
203 | 	display(synclist, label);
204 |     }
205 | }
206 | /**********************************************************************/
207 | 
208 | void count_moves(synclist)
209 | Synclist *synclist;
210 | {
211 |     Candidate *candidate;
212 |     Sync *sync;
213 |     if (debug)
214 | 	display(synclist, "original");
215 |     combine_adjacent(synclist);
216 |     while (synclist->count > 1)
217 |     {
218 | 	candidate = find_candidates(synclist);
219 | 	sync = find_move(synclist, candidate);
220 | 	perform_move(synclist, candidate, sync);
221 | 	free(candidate);
222 | 	combine_adjacent(synclist);
223 |     }
224 | }
225 | /**********************************************************************/
226 | 
227 | main(argc, argv)
228 | int argc;
229 | char *argv[];
230 | {
231 |     Synclist synclist1, synclist2;
232 |     initialize(&argc, argv, usage, option);
233 |     if (argc < 2 || argc > 3)
234 | 	error("invalid number of files");
235 |     read_text(&text1, argv[0], &textopt);
236 |     if (textopt.found_header)
237 | 	error("no correct file specified");
238 |     read_text(&text2, argv[1], &textopt);
239 |     transpose_sync(&synclist1, &synclist2, &text1, &text2);
240 |     count_insertions(&synclist1, &text1);
241 |     count_deletions(&synclist2);
242 |     count_moves(&synclist2);
243 |     write_edorpt(&edodata, (argc == 3 ? argv[2] : NULL));
244 |     terminate();
245 | }
246 | 


--------------------------------------------------------------------------------
/src/wordacc.c:
--------------------------------------------------------------------------------
  1 | /**********************************************************************
  2 |  *
  3 |  *  wordacc.c
  4 |  *
  5 |  *  Author: Stephen V. Rice
  6 |  *  
  7 |  * Copyright 1996 The Board of Regents of the Nevada System of Higher
  8 |  * Education, on behalf, of the University of Nevada, Las Vegas,
  9 |  * Information Science Research Institute
 10 |  *
 11 |  * Licensed under the Apache License, Version 2.0 (the "License"); you
 12 |  * may not use this file except in compliance with the License.  You
 13 |  * may obtain a copy of the License at
 14 |  *
 15 |  *    http://www.apache.org/licenses/LICENSE-2.0
 16 |  *
 17 |  * Unless required by applicable law or agreed to in writing, software
 18 |  * distributed under the License is distributed on an "AS IS" BASIS,
 19 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 20 |  * implied. See the License for the specific language governing
 21 |  * permissions and limitations under the License.
 22 |  *
 23 |  **********************************************************************/
 24 | 
 25 | #include "stopword.h"
 26 | #include "wacrpt.h"
 27 | 
/* Command-line synopsis shown by the usage message. */
#define usage  "[-S stopwordfile] correctfile generatedfile [wordacc_report]"

/* Value of the -S option (set by the option parser). */
char *stopwordfilename;

Option option[] =
{
    'S', &stopwordfilename, NULL,
    '\0'
};

/* Options handed to read_text(); field meanings are declared in text.h
   (not visible here). */
Textopt textopt = { True, True, 0, True, True, True };
/* Index 0 is the correct file, index 1 the generated file (see setup()). */
Text text[2];

/* Words found in each of the two texts, same indexing as text[]. */
Wordlist wordlist[2];

/* One entry per distinct word string; found[i] is set when the word
   occurs in text i. */
BEGIN_ENTRY(Id)
    Boolean found[2];
END_ENTRY(Id);

BEGIN_TABLE_OF(Id, Idlist)
END_TABLE(Idtable);
Idtable idtable;

/* One word occurrence: its shared Id and whether it was matched
   (set by obtain_matches()). */
typedef
struct
{
    Id *id;
    Boolean recognized;
} Symbol;

/* symbol[i] holds every word of text i in order; a and b are the
   filtered sequences built by setup_array() for texts 0 and 1. */
Symbol *symbol[2], **a, **b;

/* m and n are the lengths of a and b; min_k and max_k bound the
   diagonals that compute_pathlist() still evaluates. */
long m, n, min_k, max_k;

/* Storage type for positions in a; setup() rejects m > MAX_F. */
typedef unsigned short F;
#define MAX_F  65535

/* One step of the path computation: p is the step number and f holds
   the values for diagonals -p, -p+2, ..., p (indexed by (k + p) >> 1). */
BEGIN_ITEM(Path)
    long p;
    F *f;
END_ITEM(Path);

BEGIN_LIST_OF(Path)
END_LIST(Pathlist);
Pathlist pathlist;

/* Accumulated word-accuracy statistics written by write_wacrpt(). */
Wacdata wacdata;
 75 | 
 76 | /**********************************************************************/
 77 | 
 78 | Symbol **setup_array(index, length)
 79 | long index, *length;
 80 | {
 81 |     Symbol **array;
 82 |     long i, j = 0;
 83 |     array = NEW_ARRAY(wordlist[index].count + 1, Symbol *);
 84 |     for (i = 0; i < wordlist[index].count; i++)
 85 | 	if (symbol[index][i].id->found[1 - index])
 86 | 	    array[j++] = &symbol[index][i];
 87 |     *length = j;
 88 |     return(array);
 89 | }
 90 | /**********************************************************************/
 91 | 
 92 | void setup(filename)
 93 | char *filename[];
 94 | {
 95 |     long i, j;
 96 |     Word *word;
 97 |     Id *id;
 98 |     for (i = 0; i < 2; i++)
 99 |     {
100 | 	read_text(&text[i], filename[i], &textopt);
101 | 	if (i == 0 && textopt.found_header)
102 | 	    error("no correct file specified");
103 | 	find_words(&wordlist[i], &text[i]);
104 | 	symbol[i] = NEW_ARRAY(wordlist[i].count + 1, Symbol);
105 | 	j = 0;
106 | 	for (word = wordlist[i].first; word; word = word->next)
107 | 	{
108 | 	    id = table_lookup(&idtable, word->string);
109 | 	    if (!id)
110 | 	    {
111 | 		id = NEW(Id);
112 | 		id->key = (char *) word->string;
113 | 		table_insert(&idtable, id);
114 | 	    }
115 | 	    id->found[i] = True;
116 | 	    symbol[i][j++].id = id;
117 | 	}
118 |     }
119 |     a = setup_array(0, &m);
120 |     if (m > MAX_F)
121 | 	error("text stream is too long");
122 |     b = setup_array(1, &n);
123 | }
124 | /**********************************************************************/
125 | 
/* Returns the starting f value for diagonal k, derived from the
   previous path step, and stores in *prev_k the neighbouring diagonal
   (k - 1 or k + 1) that supplied it.
   With no previous step (prev_path is NULL) the result is 0 and
   *prev_k is left untouched. */
long initial_f(k, prev_path, prev_k)
long k, *prev_k;
Path *prev_path;
{
    long value, result = 0;
    if (prev_path)
    {
	/* diagonal k - 1 exists in the previous step: take its f as is */
	if (k > -prev_path->p)
	{
	    result = prev_path->f[(k - 1 + prev_path->p) >> 1];
	    *prev_k = k - 1;
	}
	/* diagonal k + 1 exists in the previous step: its f plus one */
	if (k < prev_path->p)
	{
	    value = prev_path->f[(k + 1 + prev_path->p) >> 1] + 1;
	    /* on a tie the k + 1 neighbour wins */
	    if (value >= result)
	    {
		result = value;
		*prev_k = k + 1;
	    }
	}
    }
    return(result);
}
150 | /**********************************************************************/
151 | 
152 | void compute_f(k, path)
153 | long k;
154 | Path *path;
155 | {
156 |     long i, j, value;
157 |     i = initial_f(k, path->prev, &value);
158 |     j = i + k;
159 |     while (i < m && j < n && a[i]->id == b[j]->id)
160 |     {
161 | 	i++;
162 | 	j++;
163 |     }
164 |     if (i == m)
165 | 	min_k = k + 1;
166 |     if (j == n)
167 | 	max_k = k - 1;
168 |     path->f[(k + path->p) >> 1] = i;
169 | }
170 | /**********************************************************************/
171 | 
172 | void compute_pathlist()
173 | {
174 |     long p = -1, k;
175 |     Path *path;
176 |     min_k = -m;
177 |     max_k = n;
178 |     while (min_k <= n - m)
179 |     {
180 | 	path = NEW(Path);
181 | 	path->p = ++p;
182 | 	path->f = NEW_ARRAY(p + 1, F);
183 | 	list_insert_last(&pathlist, path);
184 | 	k = -p;
185 | 	while (k <= p)
186 | 	{
187 | 	    if (k >= min_k && k <= max_k)
188 | 		compute_f(k, path);
189 | 	    k += 2;
190 | 	}
191 |     }
192 | }
193 | /**********************************************************************/
194 | 
195 | void obtain_matches()
196 | {
197 |     long k, f, start, prev_k;
198 |     Path *path;
199 |     k = n - m;
200 |     for (path = pathlist.last; path; path = path->prev)
201 |     {
202 | 	f = path->f[(k + path->p) >> 1];
203 | 	start = initial_f(k, path->prev, &prev_k);
204 | 	while (f > start)
205 | 	    a[--f]->recognized = True;
206 | 	k = prev_k;
207 |     }
208 | }
209 | /**********************************************************************/
210 | 
211 | void process_terms(termtable, length, occurs)
212 | Termtable *termtable;
213 | Wac length[], occurs[];
214 | {
215 |     long i, count, missed;
216 |     table_in_array(termtable);
217 |     for (i = 0; i < termtable->count; i++)
218 |     {
219 | 	count  = termtable->array[i]->wac.count;
220 | 	missed = termtable->array[i]->wac.missed;
221 | 	increment_wac(&wacdata.total, count, missed);
222 | 	increment_wac(&length[0], count, missed);
223 | 	increment_wac(&length[strlen(termtable->array[i]->key)], count, missed);
224 | 	if (occurs)
225 | 	{
226 | 	    increment_wac(&occurs[0], 1, (count == missed ? 1 : 0));
227 | 	    increment_wac(&occurs[min(count, MAX_OCCURRENCES + 1)], 1,
228 | 	    (count == missed ? 1 : 0));
229 | 	}
230 |     }
231 | }
232 | /**********************************************************************/
233 | 
234 | void process_phrases()
235 | {
236 |     long i, j;
237 |     Boolean recognized;
238 |     for (i = 0; i < wordlist[0].count; i++)
239 |     {
240 | 	recognized = True;
241 | 	for (j = 0; j < MAX_PHRASELENGTH && i + j < wordlist[0].count; j++)
242 | 	{
243 | 	    recognized &= symbol[0][i + j].recognized;
244 | 	    increment_wac(&wacdata.phrase[j + 1], 1, (recognized ? 0 : 1));
245 | 	}
246 |     }
247 | }
248 | /**********************************************************************/
249 | 
250 | void determine_wacdata()
251 | {
252 |     long i;
253 |     for (i = 0; i < wordlist[0].count; i++)
254 | 	add_term((is_stopword(symbol[0][i].id->key) ?
255 | 	&wacdata.stopword_table : &wacdata.non_stopword_table),
256 | 	symbol[0][i].id->key, 1, (symbol[0][i].recognized ? 0 : 1));
257 |     process_terms(&wacdata.stopword_table, wacdata.stopword, NULL);
258 |     process_terms(&wacdata.non_stopword_table, wacdata.non_stopword,
259 |     wacdata.distinct_non_stopword);
260 |     process_phrases();
261 | }
262 | /**********************************************************************/
263 | 
264 | main(argc, argv)
265 | int argc;
266 | char *argv[];
267 | {
268 |     initialize(&argc, argv, usage, option);
269 |     if (argc < 2 || argc > 3)
270 | 	error("invalid number of files");
271 |     init_stopwords(stopwordfilename);
272 |     setup(argv);
273 |     compute_pathlist();
274 |     obtain_matches();
275 |     determine_wacdata();
276 |     write_wacrpt(&wacdata, (argc == 3 ? argv[2] : NULL));
277 |     terminate();
278 | }
279 | 


--------------------------------------------------------------------------------
/src/util.c:
--------------------------------------------------------------------------------
  1 | /**********************************************************************
  2 |  *
  3 |  *  util.c
  4 |  *
  5 |  *  Author: Stephen V. Rice
  6 |  *  
  7 |  * Copyright 1996 The Board of Regents of the Nevada System of Higher
  8 |  * Education, on behalf, of the University of Nevada, Las Vegas,
  9 |  * Information Science Research Institute
 10 |  *
 11 |  * Licensed under the Apache License, Version 2.0 (the "License"); you
 12 |  * may not use this file except in compliance with the License.  You
 13 |  * may obtain a copy of the License at
 14 |  *
 15 |  *    http://www.apache.org/licenses/LICENSE-2.0
 16 |  *
 17 |  * Unless required by applicable law or agreed to in writing, software
 18 |  * distributed under the License is distributed on an "AS IS" BASIS,
 19 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 20 |  * implied. See the License for the specific language governing
 21 |  * permissions and limitations under the License.
 22 |  *
 23 |  **********************************************************************/
 24 | 
 25 | #include <signal.h>
 26 | 
 27 | #if defined(__unix__) || defined(__MACH__)
 28 | #include <sys/types.h>
 29 | #include <sys/stat.h>
 30 | #else
 31 | #include <sys\types.h>
 32 | #include <sys\stat.h>
 33 | #endif
 34 | 
 35 | #include "util.h"
 36 | 
/* Base name of the running program; set by initialize() and used as
   the prefix of every diagnostic. */
char *exec_name;

/* When True, running with no arguments prints the usage message. */
Boolean usage_when_no_args = True;
/* Optional replacement for the default usage message (see show_usage()). */
void (*usage_routine)();

/* Optional hook called by quit() before temporary files are removed. */
void (*cleanup_routine)();

/* Exit status used by error() and error_string(). */
int errstatus = 1;

/* Number of temporary file names handed out so far. */
static short tempfile_id;
static void quit(/* int status */) __attribute__ ((noreturn));
 48 | 
 49 | /**********************************************************************/
 50 | 
 51 | void *allocate(number, size)
 52 | size_t number, size;
 53 | {
 54 |     void *p;
 55 |     p = calloc(number, size);
 56 |     if (!p)
 57 | 	error("unable to allocate memory");
 58 |     return(p);
 59 | }
 60 | /**********************************************************************/
 61 | 
 62 | int ustrcmp(s1, s2)
 63 | unsigned char *s1, *s2;
 64 | {
 65 |     long i;
 66 |     for (i = 0; s1[i] && s1[i] == s2[i]; i++);
 67 |     if (s1[i] || s2[i])
 68 | 	return(s1[i] < s2[i] ? -1 : 1);
 69 |     else
 70 | 	return(0);
 71 | }
 72 | /**********************************************************************/
 73 | 
 74 | FILE *open_file(filename, mode)
 75 | char *filename, *mode;
 76 | {
 77 |     FILE *f;
 78 |     if (!filename)
 79 | 	return(mode[0] == 'r' ? stdin : stdout);
 80 |     f = fopen(filename, mode);
 81 |     if (f)
 82 | 	return(f);
 83 |     if (mode[0] == 'w')
 84 | 	error_string("unable to create file", filename);
 85 |     else
 86 | 	error_string("unable to open file", filename);
 87 | }
 88 | /**********************************************************************/
 89 | 
 90 | void close_file(f)
 91 | FILE *f;
 92 | {
 93 |     if (f != stdin && f != stdout)
 94 | 	fclose(f);
 95 | }
 96 | /**********************************************************************/
 97 | 
 98 | Boolean file_exists(filename)
 99 | char *filename;
100 | {
101 |     struct stat buffer;
102 |     return(stat(filename, &buffer) == 0 ? True : False);
103 | }
104 | /**********************************************************************/
105 | 
106 | static char *create_tempfilename(id)
107 | short id;
108 | {
109 |     char name[100];
110 | #ifdef unix
111 |     sprintf(name, "/tmp/.%s%d-%d", exec_name, getpid(), id);
112 | #else
113 |     sprintf(name, "c:\\temp\\tempfile.%d", id);
114 | #endif
115 |     return(strdup(name));
116 | }
117 | /**********************************************************************/
118 | 
119 | char *tempfilename()
120 | {
121 |     char *name;
122 |     name = create_tempfilename(++tempfile_id);
123 |     unlink(name);
124 |     return(name);
125 | }
126 | /**********************************************************************/
127 | 
128 | static void delete_tempfiles()
129 | {
130 |     short i;
131 |     char *name;
132 |     for (i = 1; i <= tempfile_id; i++)
133 |     {
134 | 	name = create_tempfilename(i);
135 | 	unlink(name);
136 | 	free(name);
137 |     }
138 | }
139 | /**********************************************************************/
140 | 
/* Returns a pointer into `pathname` just past its last directory
   delimiter, or `pathname` itself when it contains none.
   The platform test now matches the one guarding this file's includes
   (plus the legacy `unix` macro), so '/' is used on macOS and in
   strict-ISO builds as well.  strrchr replaces the hand-written scan,
   whose `short` index could not cover long path names. */
char *basefilename(pathname)
char *pathname;
{
#if defined(unix) || defined(__unix__) || defined(__MACH__)
    char delimiter = '/';
#else
    char delimiter = '\\';
#endif
    char *last = strrchr(pathname, delimiter);
    return(last ? last + 1 : pathname);
}
153 | /**********************************************************************/
154 | 
155 | static void handle_interrupt(signal)
156 | int signal;
157 | {
158 |     static Boolean handling_interrupt = False;
159 |     if (handling_interrupt)
160 | 	return;
161 |     handling_interrupt = True;
162 |     error("process killed");
163 | }
164 | /**********************************************************************/
165 | 
166 | static void trap_interrupts()
167 | {
168 |     signal(SIGINT, handle_interrupt);
169 |     signal(SIGTERM, handle_interrupt);
170 | }
171 | /**********************************************************************/
172 | 
173 | static void show_usage(usage)
174 | char *usage;
175 | {
176 |     if (usage_routine)
177 | 	(*usage_routine)();
178 |     else
179 | 	fprintf(stderr, "Usage:  %s %s\n", exec_name, usage);
180 |     terminate();
181 | }
182 | /**********************************************************************/
183 | 
/* Interprets one "-x..." argument against the option table.
   arg      - the argument, starting with '-'
   next_arg - the following command-line argument, or NULL if none
   option   - table terminated by an entry whose name is 0; may be NULL
   Returns True when next_arg was consumed as the option's value, False
   otherwise.  Unknown, malformed or repeated options are reported
   through error_string(). */
static Boolean split_option(arg, next_arg, option)
char *arg, *next_arg;
Option option[];
{
    short i;
    if (!option)
	goto invalid_option;
    /* look the option letter up in the table */
    for (i = 0; option[i].name && arg[1] != option[i].name; i++);
    if (!option[i].name)
	goto invalid_option;
    /* string option: the value is the rest of arg, else the next argument */
    if (option[i].string && (arg[2] || next_arg))
    {
	if (*option[i].string)
	    goto duplicate_option;
	*option[i].string = (arg[2] ? &arg[2] : next_arg);
	return(arg[2] ? False : True);
    }
    /* boolean option: must be the bare letter */
    if (option[i].boolean && !arg[2])
    {
	if (*option[i].boolean)
	    goto duplicate_option;
	*option[i].boolean = True;
	return(False);
    }
    /* error_string() ends in quit(), which is declared noreturn, so the
       first call below does not fall through into the second */
invalid_option:
    error_string("invalid option", arg);
duplicate_option:
    error_string("duplicate option", arg);
}
213 | /**********************************************************************/
214 | 
215 | static void parse_args(argc, argv, usage, option)
216 | int *argc;
217 | char *argv[], *usage;
218 | Option option[];
219 | {
220 |     short i, j = 0;
221 |     if (*argc == 1 && usage_when_no_args)
222 | 	show_usage(usage);
223 |     for (i = 1; i < *argc; i++)
224 | 	if (argv[i][0] == '-' && argv[i][1])
225 | 	{
226 | 	    if (strncmp("-help", argv[i], strlen(argv[i])) == 0)
227 | 		show_usage(usage);
228 | 	    if (split_option(argv[i], (i + 1 < *argc ? argv[i + 1] : NULL),
229 | 	    option))
230 | 		i++;
231 | 	}
232 | 	else
233 | 	    argv[j++] = argv[i];
234 |     *argc = j;
235 | }
236 | /**********************************************************************/
237 | 
/* Common program start-up: records the program's base name in
   exec_name, installs the signal handlers, and parses the command line
   (which rewrites *argc and argv; see parse_args()). */
void initialize(argc, argv, usage, option)
int *argc;
char *argv[], *usage;
Option option[];
{
    exec_name = basefilename(argv[0]);
    trap_interrupts();
    parse_args(argc, argv, usage, option);
}
247 | /**********************************************************************/
248 | 
249 | static void quit(status)
250 | int status;
251 | {
252 |     if (cleanup_routine)
253 | 	(*cleanup_routine)();
254 |     delete_tempfiles();
255 |     exit(status);
256 | }
257 | /**********************************************************************/
258 | 
/* Normal program exit: cleans up through quit() and exits with status 0. */
void terminate()
{
    quit(0);
}
263 | /**********************************************************************/
264 | 
/* Prints "<program>: <message>" on stderr, then cleans up and exits
   with errstatus. */
void error(message)
char *message;
{
    fprintf(stderr, "%s: %s\n", exec_name, message);
    quit(errstatus);
}
271 | /**********************************************************************/
272 | 
/* Prints "<program>: <message> "<string>"" on stderr, then cleans up
   and exits with errstatus. */
void error_string(message, string)
char *message, *string;
{
    fprintf(stderr, "%s: %s \"%s\"\n", exec_name, message, string);
    quit(errstatus);
}
279 | 
/* Prints "<program>: <message> "<string>"" on stderr and returns;
   unlike error_string() it does not end the program. */
void warning_string(message, string)
char *message, *string;
{
    fprintf(stderr, "%s: %s \"%s\"\n", exec_name, message, string);
}
285 | 


--------------------------------------------------------------------------------
/src/wacrpt.c:
--------------------------------------------------------------------------------
  1 | /**********************************************************************
  2 |  *
  3 |  *  wacrpt.c
  4 |  *
  5 |  *  Author: Stephen V. Rice
  6 |  *
  7 |  * Copyright 1996 The Board of Regents of the Nevada System of Higher
  8 |  * Education, on behalf, of the University of Nevada, Las Vegas,
  9 |  * Information Science Research Institute
 10 |  *
 11 |  * Licensed under the Apache License, Version 2.0 (the "License"); you
 12 |  * may not use this file except in compliance with the License.  You
 13 |  * may obtain a copy of the License at
 14 |  *
 15 |  *    http://www.apache.org/licenses/LICENSE-2.0
 16 |  *
 17 |  * Unless required by applicable law or agreed to in writing, software
 18 |  * distributed under the License is distributed on an "AS IS" BASIS,
 19 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 20 |  * implied. See the License for the specific language governing
 21 |  * permissions and limitations under the License.
 22 |  *
 23 |  **********************************************************************/
 24 | 
 25 | #include "sort.h"
 26 | #include "wacrpt.h"
 27 | #include "ocreval_version.h"
 28 | 
/* First two lines of every report. */
#define TITLE    "ocreval Word Accuracy Report Version " OCREVAL_VERSION "\n"
#define DIVIDER  "----------------------------------------\n"

/* Label of the totals row written by write_numbers(). */
#define TOTAL    " Total\n"
/* Column at which the label of a row starts: write_wac() emits
   8 + 1 + 8 + 1 + 8 + 3 = 29 characters before it. */
#define OFFSET   29

/* Buffer holding the report line most recently read by read_line(). */
static char line[100];
 36 | 
 37 | /**********************************************************************/
 38 | 
/* Adds `count` occurrences, `missed` of them misrecognized, to *wac. */
void increment_wac(wac, count, missed)
Wac *wac;
long count, missed;
{
    wac->count  += count;
    wac->missed += missed;
}
 46 | /**********************************************************************/
 47 | 
 48 | void add_term(termtable, key, count, missed)
 49 | Termtable *termtable;
 50 | char *key;
 51 | long count, missed;
 52 | {
 53 |     Term *term;
 54 |     term = table_lookup(termtable, key);
 55 |     if (!term)
 56 |     {
 57 | 	term = NEW(Term);
 58 | 	term->key = strdup(key);
 59 | 	table_insert(termtable, term);
 60 |     }
 61 |     increment_wac(&term->wac, count, missed);
 62 | }
 63 | /**********************************************************************/
 64 | 
 65 | static Boolean read_line(f)
 66 | FILE *f;
 67 | {
 68 |     return(fgets(line, sizeof(line) - 1, f) ? True : False);
 69 | }
 70 | /**********************************************************************/
 71 | 
 72 | static Boolean read_one(f, value)
 73 | FILE *f;
 74 | long *value;
 75 | {
 76 |     return(read_line(f) && sscanf(line, "%ld", value) == 1 ? True : False);
 77 | }
 78 | /**********************************************************************/
 79 | 
 80 | static Boolean read_two(f, value1, value2)
 81 | FILE *f;
 82 | long *value1, *value2;
 83 | {
 84 |     return(read_line(f) && sscanf(line, "%ld %ld", value1, value2) == 2 ?
 85 |     True : False);
 86 | }
 87 | /**********************************************************************/
 88 | 
/* Reads one numeric section of a report and adds its rows to wac[].
   Two lines are skipped first; rows are then read until a line does not
   start with two integers (that line is consumed as well).
   Returns the count found on the " Total" row, or 0 if there was none.
   NOTE(review): the row index comes straight from the file and is used
   unchecked, and the label is read at column OFFSET without checking
   that the line is that long -- a malformed report can index outside
   wac[].  The array bound is not available here; confirm with callers. */
static long read_numbers(f, wac)
FILE *f;
Wac wac[];
{
    long count, missed, index, total_count = 0;
    if (read_line(f) && read_line(f))
	while (read_two(f, &count, &missed))
	{
	    /* a numeric label is the row's index */
	    index = atoi(&line[OFFSET]);
	    if (index == 0) {
		/* non-numeric label: the totals row keeps index 0 ... */
		if (strcmp(&line[OFFSET], TOTAL) == 0)
		    total_count = count;
		/* ... anything else is taken to be the ">limit" row */
		else /* excess */
		    index = MAX_OCCURRENCES + 1;
	    }
	    increment_wac(&wac[index], count, missed);
	}
    return(total_count);
}
108 | /**********************************************************************/
109 | 
110 | static void read_terms(f, termtable)
111 | FILE *f;
112 | Termtable *termtable;
113 | {
114 |     long count, missed;
115 |     if (read_line(f) && read_line(f))
116 | 	while (read_two(f, &count, &missed))
117 | 	{
118 | 	    line[strlen(line) - 1] = '\0';
119 | 	    add_term(termtable, &line[OFFSET], count, missed);
120 | 	}
121 | }
122 | /**********************************************************************/
123 | 
124 | void read_wacrpt(wacdata, filename)
125 | Wacdata *wacdata;
126 | char *filename;
127 | {
128 |     FILE *f;
129 |     long words, missed, stopwords, non_stopwords;
130 |     f = open_file(filename, "r");
131 |     if (read_line(f) && strncmp(line, TITLE, sizeof(TITLE) - 3) == 0 &&
132 |     read_line(f) && strcmp(line, DIVIDER) == 0 &&
133 |     read_one(f, &words) && read_one(f, &missed) &&
134 |     read_line(f) && read_line(f))
135 |     {
136 | 	increment_wac(&wacdata->total, words, missed);
137 | 	stopwords = read_numbers(f, wacdata->stopword);
138 | 	non_stopwords = read_numbers(f, wacdata->non_stopword);
139 | 	read_numbers(f, wacdata->distinct_non_stopword);
140 | 	if (words > 0)
141 | 	{
142 | 	    read_numbers(f, wacdata->phrase);
143 | 	    if (stopwords > 0)
144 | 		read_terms(f, &wacdata->stopword_table);
145 | 	    if (non_stopwords > 0)
146 | 		read_terms(f, &wacdata->non_stopword_table);
147 | 	}
148 |     }
149 |     else
150 | 	error_string("invalid format in", (filename ? filename : "stdin"));
151 |     close_file(f);
152 | }
153 | /**********************************************************************/
154 | 
155 | static void write_pct(f, wac)
156 | FILE *f;
157 | Wac *wac;
158 | {
159 |     if (wac->count == 0)
160 | 	fputs("  ------", f);
161 |     else
162 | 	fprintf(f, "%8.2f", 100.0 * (wac->count - wac->missed) / wac->count);
163 | }
164 | /**********************************************************************/
165 | 
166 | static void write_wac(f, wac)
167 | FILE *f;
168 | Wac *wac;
169 | {
170 |     if (wac)
171 |     {
172 | 	fprintf(f, "%8ld %8ld ", wac->count, wac->missed);
173 | 	write_pct(f, wac);
174 |     }
175 |     else
176 | 	fputs("   Count   Missed   %Right", f);
177 |     fputs("   ", f);
178 | }
179 | /**********************************************************************/
180 | 
181 | static void write_numbers(f, wac, limit, title, excess, total)
182 | FILE *f;
183 | Wac wac[];
184 | short limit;
185 | char *title;
186 | Boolean excess, total;
187 | {
188 |     short i;
189 |     fprintf(f, "\n%s\n", title);
190 |     write_wac(f, NULL);
191 |     fprintf(f, "%s\n", (excess ? "Occurs" : "Length"));
192 |     for (i = 1; i <= limit; i++)
193 | 	if (wac[i].count > 0)
194 | 	{
195 | 	    write_wac(f, &wac[i]);
196 | 	    fprintf(f, "    %2d\n", i);
197 | 	}
198 |     if (excess && wac[limit + 1].count > 0)
199 |     {
200 | 	write_wac(f, &wac[limit + 1]);
201 | 	fprintf(f, "   >%2d\n", limit);
202 |     }
203 |     if (total)
204 |     {
205 | 	write_wac(f, &wac[0]);
206 | 	fputs(TOTAL, f);
207 |     }
208 | }
209 | /**********************************************************************/
210 | 
/* Ordering used when sorting terms: compares the keys with ustrcmp(). */
static int compare_term(term1, term2)
Term *term1, *term2;
{
    return(ustrcmp(term1->key, term2->key));
}
216 | /**********************************************************************/
217 | 
218 | static void write_terms(f, termtable, title)
219 | FILE *f;
220 | Termtable *termtable;
221 | char *title;
222 | {
223 |     long i;
224 |     table_in_array(termtable);
225 |     sort(termtable->count, termtable->array, compare_term);
226 |     fprintf(f, "\n%s\n", title);
227 |     write_wac(f, NULL);
228 |     fputc('\n', f);
229 |     for (i = 0; i < termtable->count; i++)
230 |     {
231 | 	write_wac(f, &termtable->array[i]->wac);
232 | 	fprintf(f, "%s\n", termtable->array[i]->key);
233 |     }
234 | }
235 | /**********************************************************************/
236 | 
237 | void write_wacrpt(wacdata, filename)
238 | Wacdata *wacdata;
239 | char *filename;
240 | {
241 |     FILE *f;
242 |     f = open_file(filename, "w");
243 |     fprintf(f, "%s%s", TITLE, DIVIDER);
244 |     fprintf(f, "%8ld   Words\n", wacdata->total.count);
245 |     fprintf(f, "%8ld   Misrecognized\n", wacdata->total.missed);
246 |     write_pct(f, &wacdata->total);
247 |     fputs("%  Accuracy\n", f);
248 |     write_numbers(f, wacdata->stopword, MAX_WORDLENGTH,
249 |     "Stopwords", False, True);
250 |     write_numbers(f, wacdata->non_stopword, MAX_WORDLENGTH,
251 |     "Non-stopwords", False, True);
252 |     write_numbers(f, wacdata->distinct_non_stopword, MAX_OCCURRENCES,
253 |     "Distinct Non-stopwords", True, True);
254 |     if (wacdata->total.count > 0)
255 |     {
256 | 	write_numbers(f, wacdata->phrase, MAX_PHRASELENGTH,
257 | 	"Phrases", False, False);
258 | 	if (wacdata->stopword[0].count > 0)
259 | 	    write_terms(f, &wacdata->stopword_table, "Stopwords");
260 | 	if (wacdata->non_stopword[0].count > 0)
261 | 	    write_terms(f, &wacdata->non_stopword_table, "Non-stopwords");
262 |     }
263 |     close_file(f);
264 | }
265 | 


--------------------------------------------------------------------------------
/src/vote.c:
--------------------------------------------------------------------------------
  1 | /**********************************************************************
  2 |  *
  3 |  *  vote.c
  4 |  *
  5 |  *  Author: Stephen V. Rice
  6 |  *  
  7 |  * Copyright 1996 The Board of Regents of the Nevada System of Higher
  8 |  * Education, on behalf, of the University of Nevada, Las Vegas,
  9 |  * Information Science Research Institute
 10 |  *
 11 |  * Licensed under the Apache License, Version 2.0 (the "License"); you
 12 |  * may not use this file except in compliance with the License.  You
 13 |  * may obtain a copy of the License at
 14 |  *
 15 |  *    http://www.apache.org/licenses/LICENSE-2.0
 16 |  *
 17 |  * Unless required by applicable law or agreed to in writing, software
 18 |  * distributed under the License is distributed on an "AS IS" BASIS,
 19 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 20 |  * implied. See the License for the specific language governing
 21 |  * permissions and limitations under the License.
 22 |  *
 23 |  **********************************************************************/
 24 | 
 25 | #include "sort.h"
 26 | #include "sync.h"
 27 | #include "table.h"
 28 | 
/* Command-line synopsis shown by the usage message (the -D option in
   the table below is not listed). */
#define usage  "[-O] [-o outputfile] [-s m/n] [-w m/n] textfile1 textfile2 ..."

/* Flags and values filled in by the option parser. */
Boolean debug, optimize;
char *outputfilename, *sfraction, *wfraction;

/* Each row: option letter, string destination, boolean destination. */
Option option[] =
{
    'D', NULL,            &debug,
    'O', NULL,            &optimize,
    'o', &outputfilename, NULL,
    's', &sfraction,      NULL,
    'w', &wfraction,      NULL,
    '\0'
};

/* Options handed to read_text(); field meanings are declared in text.h
   (not visible here). */
Textopt textopt = { True, True, 0, True, True };

/* Accepted range for the number of input files (see validate_args()). */
#define MIN_VOTERS   2
#define MAX_VOTERS  16

/* One input file: its position on the command line, its name and its
   text.  `distance` is not set in the code visible here. */
typedef
struct
{
    short argnum;
    char *filename;
    Text text;
    double distance;
} Voter;
Voter *voter[MAX_VOTERS];
/* actual_voters starts at 3 and is lowered to num_voters by
   validate_args() when -O is absent or fewer files were given. */
short num_voters, actual_voters = 3;

/* Number of consecutive characters forming a sequence key
   (see count_sequences()). */
#define N  2

/* Derived from the -s fraction in validate_args(). */
short suspect_threshold;
/* Set from the -w fraction m/n: suspect_weight = m, unmarked_weight = n. */
short suspect_weight = 1, unmarked_weight = 1;

/* One distinct character sequence: count[i] is the number of times it
   occurs in voter i's text; median starts at 1 for sequences without a
   reject character and at 0 otherwise (see add_sequence()). */
BEGIN_ENTRY(Sequence)
    long count[MAX_VOTERS];
    float median;
END_ENTRY(Sequence);

BEGIN_TABLE_OF(Sequence, Seqlist)
END_TABLE(Seqtable);
Seqtable seqtable;

Text input[MAX_VOTERS], output;

struct
{
    Char *c;
    short num_votes;
} candidate[MAX_VOTERS];
short num_candidates;
 82 | 
 83 | /**********************************************************************/
 84 | 
 85 | Boolean valid_fraction(fraction, m, n)
 86 | char *fraction;
 87 | short *m, *n;
 88 | {
 89 |     if (fraction[0] >= '1' && fraction[1] == '/' && fraction[2] <= '9' &&
 90 |     fraction[0] <= fraction[2] && !fraction[3])
 91 |     {
 92 | 	*m = fraction[0] - '0';
 93 | 	*n = fraction[2] - '0';
 94 | 	return(True);
 95 |     }
 96 |     else
 97 | 	return(False);
 98 | }
 99 | /**********************************************************************/
100 | 
101 | void validate_args(argc, argv)
102 | int argc;
103 | char *argv[];
104 | {
105 |     short i, m, n;
106 |     if (argc < MIN_VOTERS || argc > MAX_VOTERS)
107 | 	error("invalid number of voters");
108 |     for (i = 0; i < argc; i++)
109 |     {
110 | 	voter[i] = NEW(Voter);
111 | 	voter[i]->argnum = i;
112 | 	voter[i]->filename = argv[i];
113 | 	read_text(&voter[i]->text, argv[i], &textopt);
114 |     }
115 |     num_voters = argc;
116 |     if (!optimize || num_voters < actual_voters)
117 | 	actual_voters = num_voters;
118 |     if (wfraction &&
119 |     !valid_fraction(wfraction, &suspect_weight, &unmarked_weight))
120 | 	error_string("invalid weight", wfraction);
121 |     if (sfraction) {
122 | 	if (valid_fraction(sfraction, &m, &n))
123 | 	    suspect_threshold = actual_voters * unmarked_weight * m / n;
124 | 	else
125 | 	    error_string("invalid threshold", sfraction);
126 |     }
127 | }
128 | /**********************************************************************/
129 | 
130 | void add_sequence(key, reject, index)
131 | char *key;
132 | Boolean reject;
133 | short index;
134 | {
135 |     Sequence *sequence;
136 |     sequence = table_lookup(&seqtable, key);
137 |     if (!sequence)
138 |     {
139 | 	sequence = NEW(Sequence);
140 | 	sequence->key = strdup(key);
141 | 	if (!reject)
142 | 	    sequence->median = 1;
143 | 	table_insert(&seqtable, sequence);
144 |     }
145 |     sequence->count[index]++;
146 | }
147 | /**********************************************************************/
148 | 
149 | void count_sequences(index)
150 | short index;
151 | {
152 |     Char *start, *c;
153 |     char key[N * STRING_SIZE], string[STRING_SIZE];
154 |     Boolean reject;
155 |     short i;
156 |     for (start = voter[index]->text.first; start; start = start->next)
157 |     {
158 | 	key[0] = '\0';
159 | 	reject = False;
160 | 	for (i = 0, c = start; i < N; i++, c = c->next)
161 | 	{
162 | 	    if (!c)
163 | 		return;
164 | 	    char_to_string(False, c->value, string, False);
165 | 	    strcat(key, string);
166 | 	    if (c->value == REJECT_CHARACTER)
167 | 		reject = True;
168 | 	}
169 | 	add_sequence(key, reject, index);
170 |     }
171 | }
172 | /**********************************************************************/
173 | 
/*
 * Ordering callback for sort(): compares the two counts pointed to.
 *
 * Returns a negative value, zero, or a positive value when *count1 is
 * less than, equal to, or greater than *count2.
 *
 * The result is derived from explicit comparisons rather than from the
 * difference of the two longs: a long difference does not fit in an int in
 * general, so returning it could flip the sign or collapse to zero.
 */
int compare_counts(count1, count2)
long *count1, *count2;
{
    if (*count1 < *count2)
        return(-1);
    return(*count1 > *count2 ? 1 : 0);
}
179 | /**********************************************************************/
180 | 
181 | void compute_median(sequence)
182 | Sequence *sequence;
183 | {
184 |     static long *count[MAX_VOTERS];
185 |     long i;
186 |     if (!sequence->median)
187 | 	return;
188 |     if (!count[0])
189 | 	for (i = 0; i < num_voters; i++)
190 | 	    count[i] = NEW(long);
191 |     for (i = 0; i < num_voters; i++)
192 | 	*count[i] = sequence->count[i];
193 |     sort(i, count, compare_counts);
194 |     sequence->median =
195 |     (i & 1 ? *count[i / 2] : (*count[i / 2 - 1] + *count[i / 2]) / 2.0);
196 | }
197 | /**********************************************************************/
198 | 
199 | void compute_distance(index)
200 | short index;
201 | {
202 |     long i;
203 |     double difference;
204 |     for (i = 0; i < seqtable.count; i++)
205 |     {
206 | 	difference =
207 |         seqtable.array[i]->count[index] - seqtable.array[i]->median;
208 | 	if (difference < 0)
209 | 	    difference = -difference;
210 | 	voter[index]->distance += difference;
211 |     }
212 | }
213 | /**********************************************************************/
214 | 
215 | int compare_distances(voter1, voter2)
216 | Voter *voter1, *voter2;
217 | {
218 |     if (voter1->distance != voter2->distance)
219 | 	return(voter1->distance < voter2->distance ? -1 : 1);
220 |     return(voter1->argnum - voter2->argnum);
221 | }
222 | /**********************************************************************/
223 | 
224 | void select_voters()
225 | {
226 |     long i;
227 |     if (optimize)
228 |     {
229 | 	for (i = 0; i < num_voters; i++)
230 | 	    count_sequences(i);
231 | 	table_in_array(&seqtable);
232 | 	for (i = 0; i < seqtable.count; i++)
233 | 	    compute_median(seqtable.array[i]);
234 | 	for (i = 0; i < num_voters; i++)
235 | 	    compute_distance(i);
236 | 	sort(i, voter, compare_distances);
237 | 	if (debug)
238 | 	    for (i = 0; i < num_voters; i++)
239 | 		printf("%11.1f %s\n", voter[i]->distance, voter[i]->filename);
240 |     }
241 |     for (i = 0; i < actual_voters; i++)
242 | 	input[i] = voter[i]->text;
243 | }
244 | /**********************************************************************/
245 | 
246 | void place_vote(c)
247 | Char *c;
248 | {
249 |     short num_votes, i;
250 |     num_votes = unmarked_weight;
251 |     if (c)
252 |     {
253 | 	if (c->value == REJECT_CHARACTER)
254 | 	    return;
255 | 	if (c->suspect)
256 | 	    num_votes = suspect_weight;
257 | 	for (i = 0; i < num_candidates && (!candidate[i].c ||
258 | 	c->value != candidate[i].c->value); i++);
259 |     }
260 |     else
261 | 	for (i = 0; i < num_candidates && candidate[i].c; i++);
262 |     if (i < num_candidates)
263 | 	candidate[i].num_votes += num_votes;
264 |     else
265 |     {
266 | 	num_candidates++;
267 | 	candidate[i].c = c;
268 | 	candidate[i].num_votes = num_votes;
269 |     }
270 | }
271 | /**********************************************************************/
272 | 
273 | Boolean winner()
274 | {
275 |     short i, leader = 0;
276 |     if (num_candidates == 0)
277 |     {
278 | 	append_char(&output, False, REJECT_CHARACTER);
279 | 	return(True);
280 |     }
281 |     for (i = 1; i < num_candidates; i++)
282 | 	if (candidate[i].num_votes > candidate[leader].num_votes)
283 | 	    leader = i;
284 |     num_candidates = 0;
285 |     if (!candidate[leader].c)
286 | 	return(False);
287 |     append_char(&output,
288 |     (candidate[leader].num_votes <= suspect_threshold ? True : False),
289 |     candidate[leader].c->value);
290 |     return(True);
291 | }
292 | /**********************************************************************/
293 | 
294 | void perform_vote(synclist)
295 | Synclist *synclist;
296 | {
297 |     Sync *sync;
298 |     short i;
299 |     for (sync = synclist->first; sync; sync = sync->next)
300 | 	do
301 | 	    for (i = 0; i < actual_voters; i++)
302 | 		if (sync->substr[i].start <= sync->substr[i].stop)
303 | 		    place_vote(input[i].array[sync->substr[i].start++]);
304 | 		else
305 | 		    place_vote(NULL);
306 | 	while (winner());
307 | }
308 | /**********************************************************************/
309 | 
310 | main(argc, argv)
311 | int argc;
312 | char *argv[];
313 | {
314 |     Synclist synclist;
315 |     initialize(&argc, argv, usage, option);
316 |     validate_args(argc, argv);
317 |     select_voters();
318 |     synchronize(&synclist, actual_voters, input);
319 |     perform_vote(&synclist);
320 |     write_text(&output, outputfilename, NULL);
321 |     terminate();
322 | }
323 | 


--------------------------------------------------------------------------------
/test/test_accsum_graphic_characters.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- encoding: UTF-8 -*-
  3 | # Copyright 2017 Eddie Antonio Santos <easantos@ualberta.ca>
  4 | #
  5 | # Licensed under the Apache License, Version 2.0 (the "License");
  6 | # you may not use this file except in compliance with the License.
  7 | # You may obtain a copy of the License at
  8 | #
  9 | #   http://www.apache.org/licenses/LICENSE-2.0
 10 | #
 11 | # Unless required by applicable law or agreed to in writing, software
 12 | # distributed under the License is distributed on an "AS IS" BASIS,
 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14 | # See the License for the specific language governing permissions and
 15 | # limitations under the License.
 16 | 
 17 | 
 18 | """
 19 | Tests accsum on UTF-8 files.
 20 | """
 21 | 
 22 | import glob
 23 | import re
 24 | import shutil
 25 | import subprocess
 26 | import tempfile
 27 | import unicodedata
 28 | 
 29 | import os.path as p
 30 | from collections import namedtuple, OrderedDict
 31 | 
# Alias range as xrange in Python 3:
# (referencing the bare name raises NameError where xrange does not exist)
try:
    xrange
except NameError:
    xrange = range

# Create a Python 2/3 Unicode string literal:
# `u(s)` turns a source-code string literal into a text (Unicode) string.
# Where the `unicode` builtin exists (Python 2), literals are byte strings
# and get decoded as UTF-8; elsewhere `u` is simply `str`.
try:
    unicode
except NameError:
    u = str
else:
    u = lambda s: s.decode('UTF-8')
 45 | 
 46 | 
 47 | # pathlib doesn't exist in Python 2, so make a small version of it:
 48 | class Path(str):
 49 |     def __div__(self, other):
 50 |         return Path(p.join(self, other))
 51 | 
 52 |     def exists(self):
 53 |         return p.exists(self)
 54 | 
 55 |     def create_file(self, filename, contents):
 56 |         with open(self / filename, 'w') as fp:
 57 |             fp.write(contents.encode('UTF-8'))
 58 |             if not contents.endswith('\n'):
 59 |                 fp.write(b'\x0A')
 60 | 
 61 | 
# Path to accuracy program
# The binaries are looked up in the `bin` directory that sits next to the
# directory containing this script (two dirname() steps up from the resolved
# script path).  Both must exist, otherwise the script stops right here with
# an assertion naming the missing file.
BIN_DIR = Path(p.dirname(p.dirname(p.realpath(__file__)))) / 'bin'
ACCURACY_BIN = BIN_DIR / 'accuracy'
ACCSUM_BIN = BIN_DIR / 'accsum'
assert ACCURACY_BIN.exists(), 'Could not find ' + ACCURACY_BIN
assert ACCSUM_BIN.exists(), 'Could not find ' + ACCSUM_BIN
 68 | 
 69 | 
 70 | class FilePair(namedtuple('FilePairBase', 'correct generated')):
 71 |     """
 72 |     Pair of tests that are written as documents. Then an accuracy report may
 73 |     be produced.
 74 |     """
 75 | 
 76 |     @property
 77 |     def prefix(self):
 78 |         return str(hash(self.correct)).replace('-', '_')
 79 | 
 80 |     def write_to_dir(self, directory):
 81 |         directory.create_file(self.correct_filename, self.correct)
 82 |         directory.create_file(self.generated_filename, self.generated)
 83 | 
 84 |     @property
 85 |     def correct_filename(self):
 86 |         return '%s_correct' % self.prefix
 87 | 
 88 |     @property
 89 |     def generated_filename(self):
 90 |         return '%s_generated' % self.prefix
 91 | 
 92 |     @property
 93 |     def report_filename(self):
 94 |         return '%s_report' % self.prefix
 95 | 
 96 |     def write_accuracy_report(self, directory):
 97 |         self.write_to_dir(directory)
 98 | 
 99 |         # Return name of the report?
100 |         subprocess.check_call([
101 |             ACCURACY_BIN,
102 |             directory / self.correct_filename,
103 |             directory / self.generated_filename,
104 |             directory / self.report_filename
105 |         ])
106 | 
107 | 
class ClassResult(namedtuple('ResultBase', 'count missed right character')):
    """
    One row of a report's character table: occurrence count, number missed,
    the %Right column (kept as text) and the character itself.
    """
110 | 
111 | 
def extract_bracketed_char(text):
    """
    Return what stands between the outer braces of ``text``, e.g. ``'a'``
    for ``'{a}'`` and ``'{'`` for ``'{{}'``.

    Raises ValueError when ``text`` is not a non-empty brace-wrapped string,
    instead of failing with an AttributeError on a missing match object.
    """
    # Braces are escaped so they can never be read as a repetition count.
    match = re.match(r'^\{(.+)\}$', text)
    if match is None:
        raise ValueError('not a bracketed character: %r' % (text,))
    return match.group(1)
115 | 
116 | 
def nfc(text):
    """
    Convert ``text`` to a Unicode string and return its NFC
    (canonical composition) form.
    """
    as_unicode = u(text)
    return unicodedata.normalize('NFC', as_unicode)
122 | 
123 | 
def nfd(text):
    """
    Convert ``text`` to a Unicode string and return its NFD
    (canonical decomposition) form.
    """
    as_unicode = u(text)
    return unicodedata.normalize('NFD', as_unicode)
129 | 
130 | 
class ClassReport(object):
    """
    The per-character table of an accuracy report, addressable by
    character.  Iteration yields the characters in the order their rows
    were supplied.
    """
    def __init__(self, *results):
        by_character = OrderedDict()
        for result in results:
            # A repeated character keeps its first position but takes the
            # most recent row.
            by_character[result.character] = result
        self._results = by_character

    def __getitem__(self, key):
        return self._results[key]

    def __iter__(self):
        return iter(self._results)

    def __contains__(self, key):
        return key in self._results

    @classmethod
    def from_accuracy_report(cls, report_text):
        """
        Build a ClassReport from the text of a report.  Only the last
        blank-line-separated section is read; it must start with the
        ``Count  Missed  %Right`` header.
        """
        final_section = report_text.split('\n\n')[-1]
        rows = final_section.rstrip('\n').split('\n')

        # Assert we've got the right header
        count, missed, right = rows.pop(0).split()
        assert count == u('Count')
        assert missed == u('Missed')
        assert right == u('%Right')

        parsed = []
        for row in rows:
            # The character column may itself contain spaces, so split off
            # at most three leading fields.
            count, missed, right, char = row.lstrip().split(None, 3)
            char = extract_bracketed_char(char)
            parsed.append(ClassResult(int(count), int(missed), right, char))

        return cls(*parsed)
166 | 
167 | 
class TemporaryDirectory(object):
    """
    Context manager around tempfile.mkdtemp(): entering yields the new
    directory as a Path, leaving deletes the directory tree again, whether
    or not the body raised.
    """
    def __enter__(self):
        created = tempfile.mkdtemp()
        self._name = Path(created)
        return self._name

    def __exit__(self, *exc_info):
        # Nothing truthy is returned, so an exception from the body
        # continues to propagate after the cleanup.
        shutil.rmtree(self._name)
179 | 
180 | 
def accsum(reports):
    """
    Run accsum over the report files listed in ``reports`` and return the
    final section of its output parsed into a ClassReport.

    Standard error is merged into the captured output.  Raises
    subprocess.CalledProcessError if accsum exits with a non-zero status.
    """
    command = [ACCSUM_BIN] + reports
    raw_output = subprocess.check_output(command, stderr=subprocess.STDOUT)
    return ClassReport.from_accuracy_report(raw_output.decode('UTF-8'))
192 | 
193 | 
# Input fixtures.  Every FilePair holds a ground-truth string and the text
# scored against it; main() writes each pair to disk and produces one
# accuracy report per pair, so the list length is also the expected number
# of report files.
tests = [
    # Test some delimiting and special characters
    FilePair(correct=  nfc("{{"),
             generated=nfc("{<")),
    FilePair(correct=  nfc("<<"),
             generated=nfc("<{")),
    FilePair(correct=  nfc("q\\z"),
             generated=nfc("q|z")),

    # Latin scripts
    FilePair(correct=  nfc("Mirosław"),
             generated=nfc("Miroslaw")),
    # From: https://fi.wikipedia.org/w/index.php?title=Tekstintunnistus&oldid=15178566
    FilePair(correct=  nfc("""käsin kirjoittamalla"""),
             generated=nfc("""kasin kirjoittämalla""")),
    FilePair(correct=  nfc("""sähköisesti muokattavaan muotoon"""),
             generated=nfc("""sähköisesti muökattavaan muotoon""")),

    # Combining characters. Notice the use of NFD (decomposed)
    FilePair(correct  =nfd("q̃◌q̃"),
             generated=nfd("q̃◌q̂")),

    # Hiragana
    FilePair(correct=  nfc("""びょおいん"""),
             generated=nfc("""びよおいん""")),

    # Emoji
    FilePair(correct=  nfc("""💩"""),
             generated=nfc("""👜""")),
]
224 | 
# TODO: Change this for an ACTUAL expected report (this one is incomplete)
# Expected summary table, parsed with the same code path as the real accsum
# output.  main() compares only the Count and Missed columns per character;
# the %Right column is parsed but never checked.
# NOTE(review): several %Right values below do not agree with their
# Count/Missed columns (e.g. the rows for {\} and {e}); this has no effect
# as long as that column stays unchecked.
expected_report = ClassReport.from_accuracy_report(u(
r"""ocreval Accuracy Report Version 7.0
-----------------------------------

   Count   Missed   %Right
       9        0   100.00   {<\n>}
       3        0   100.00   { }
       2        1    50.00   {<}
       2        1    50.00   {{}
       1        1    50.00   {\}
       1        0   100.00   {M}
       8        1    80.00   {a}
       1        0    80.00   {e}
       1        0   100.00   {h}
       6        0   100.00   {i}
       1        0   100.00   {j}
       4        0   100.00   {k}
       2        0   100.00   {l}
       3        0   100.00   {m}
       3        0   100.00   {n}
       6        1    83.33   {o}
       3        0   100.00   {q}
       2        0   100.00   {r}
       5        0   100.00   {s}
       6        0   100.00   {t}
       2        0   100.00   {u}
       1        0   100.00   {v}
       1        0   100.00   {w}
       1        0   100.00   {z}
       1        1     0.00   {ł}
       2        1    50.00   {ä}
       1        0   100.00   {ö}
       1        0   100.00   {び}
       1        1     0.00   {ょ}
       1        0   100.00   {お}
       1        0   100.00   {い}
       1        0   100.00   {ん}
       1        1     0.00   {💩}
       2        1    50.00   {◌̃}
       1        0   100.00   {◌}
"""))
267 | 
268 | 
def main(temp_dir):
    """
    Generate one accuracy report per fixture inside ``temp_dir``, summarize
    them with accsum and compare the summary against ``expected_report``.

    Fails with AssertionError when a report file is missing, when an
    expected character is absent or has a different count / missed value,
    or when the summary lists characters that were not expected.
    """
    # Create each individual accuracy report:
    for pair in tests:
        pair.write_accuracy_report(temp_dir)

    report_files = glob.glob(temp_dir / '*_report')
    assert len(report_files) == len(tests)

    # Create the accuracy summary!
    summary = accsum(report_files)

    for char in expected_report:
        # Check if the character is even in the report.
        assert char in summary, (
            '{%s} not in report: %r' % (char, set(summary))
        )

        want = expected_report[char]
        got = summary[char]

        # Check that the counts match
        assert want.count == got.count, (
            '{%s}: counts does not match expected: %d; actual: %d' % (
                char, want.count, got.count
            )
        )
        assert want.missed == got.missed, (
            '{%s}: #missed does not match expected: %d; actual: %d' % (
                char, want.missed, got.missed
            )
        )

    # Anything left over appears in the summary but was never expected.
    unexpected = set(summary) - set(expected_report)
    assert not unexpected, (
        'Actual report has extra characters: %r' % (unexpected,)
    )
303 | 
304 | 
# Script entry point.  Runs the comparison inside a throw-away directory and
# exits with a failure status when a helper program fails or an expectation
# is not met.  Passing `--debug` as the only argument opens pdb at the point
# of failure.
if __name__ == '__main__':
    import sys

    # Exactly one extra argument is accepted.  Any other argument count makes
    # the unpacking raise ValueError, which simply leaves debugging off.
    try:
        _, flag = sys.argv
    except ValueError:
        debug = False
    else:
        debug = flag == '--debug'

    # Create temporary files for each...
    with TemporaryDirectory() as temp_dir:
        try:
            main(temp_dir)
        except subprocess.CalledProcessError as error:
            sys.stderr.write("Error %d running command: %s" % (
                error.returncode,
                ' '.join(error.cmd)
            ))
            sys.stderr.write("\n")

            if error.output is not None:
                output = error.output
                # Captured output is bytes on Python 3, which a text stream
                # will not accept; on Python 2 it already is a `str`.
                if not isinstance(output, str):
                    output = output.decode('UTF-8', 'replace')
                sys.stderr.write("\n--- stdout ---\n")
                sys.stderr.write(output)

            if debug:
                import pdb
                pdb.set_trace()
            sys.exit(-1)
        except AssertionError as error:
            if debug:
                import pdb
                pdb.set_trace()
            # Exceptions have no `.message` attribute on Python 3; the
            # assertion text is the first (and only) argument, if any.
            print(error.args[0] if error.args else '')
            sys.exit(-1)
339 | 


--------------------------------------------------------------------------------