├── service
    ├── root
    │   └── index.html
    ├── src
    │   ├── data
    │   ├── lang
    │   ├── lib
    │   ├── models
    │   ├── view
    │   │   ├── rules
    │   │   │   ├── empty.slp
    │   │   │   ├── nom.slp
    │   │   │   ├── redundant.slp
    │   │   │   ├── passive.slp
    │   │   │   ├── avoid.slp
    │   │   │   ├── bias.slp
    │   │   │   ├── complex.slp
    │   │   │   ├── ruleview.slp
    │   │   │   ├── homophone.slp
    │   │   │   ├── homophone2.slp
    │   │   │   ├── category.slp
    │   │   │   └── nomit.slp
    │   │   ├── problem.slp
    │   │   ├── metric.slp
    │   │   ├── suggestions.slp
    │   │   ├── rule.slp
    │   │   ├── service.slp
    │   │   ├── wordpress_gen.slp
    │   │   ├── tinymce.slp
    │   │   ├── error.slp
    │   │   ├── quality.slp
    │   │   ├── wordpress26.slp
    │   │   └── wordpress.slp
    │   └── local.sl
    └── code
    │   ├── compile.txt
    │   ├── build.xml
    │   └── src
    │       └── org
    │           └── dashnine
    │               └── preditor
    │                   ├── GuessLanguage.java
    │                   ├── SortFromHash.java
    │                   └── LanguageModelSmall.java
├── data
    └── rules
    │   ├── nohomophone.txt
    │   ├── grammar
    │       ├── indef_uncount
    │       ├── aux_modals
    │       ├── personal_pronoun_case
    │       ├── infinitives
    │       ├── det_agreement_plural
    │       ├── weare
    │       ├── were
    │       ├── an
    │       ├── whose
    │       ├── subject_verb_agreement
    │       ├── contractedformnot
    │       ├── dneg2
    │       ├── possessive
    │       ├── count
    │       ├── lay
    │       ├── its
    │       ├── aux_noparticiple
    │       ├── separate
    │       ├── your
    │       ├── too
    │       ├── apostrophes
    │       ├── their
    │       ├── its2
    │       ├── det_agreement
    │       ├── repeats
    │       ├── determiners
    │       ├── combine
    │       ├── aux_been_was
    │       ├── comprised
    │       └── aux_wrong_verb
    │   ├── abbr.txt
    │   ├── agreement
    │       ├── plural.r
    │       ├── single.r
    │       ├── chunk_single.r
    │       └── chunk_plural.r
    │   ├── pronouns.txt
    │   ├── complex
    │       ├── been
    │       └── misc
    │   ├── prepositions.txt
    │   ├── nomdb.txt
    │   ├── avoiddb.txt
    │   ├── hyphens.txt
    │   ├── foreigndb.txt
    │   ├── irregular_nouns.txt
    │   ├── biasdb.txt
    │   └── diacritic
    │       └── diaeresis
├── lib
    ├── sleep.jar
    ├── cngram.jar
    ├── moconti.jar
    ├── spellutils.jar
    ├── object.sl
    └── quality.sl
├── bin
    ├── quality.sh
    ├── dictgrep.sh
    ├── tagit.sh
    ├── testr.sh
    ├── corpuswp.sh
    ├── fixdata.sh
    ├── buildrules.sh
    ├── agreement.sh
    ├── transr.sh
    ├── amigo.sh
    ├── compilespelltools.sh
    ├── make3.sh
    ├── testgr.sh
    ├── corpus-lex-diff.sh
    ├── trainhomophones.sh
    ├── traintagger.sh
    ├── inspect.sh
    ├── smallmodel.sh
    ├── buildedits.sh
    ├── prepositions.sh
    ├── buildgrammarsets.sh
    ├── all.sh
    ├── trainspellnocontext.sh
    ├── buildmodel.sh
    ├── buildspelldata.sh
    ├── trainspellcontext.sh
    ├── buildhomodata.sh
    └── buildtaggersets.sh
├── atdconfig.sl
├── README.txt
├── utils
    ├── bigrams
    │   ├── printcorpus.sl
    │   ├── contextprob.sl
    │   ├── builddict.sl
    │   ├── corpuswp.sl
    │   ├── buildsmallmodel.sl
    │   ├── fixgutenberg.sl
    │   ├── inspect.sl
    │   ├── qscore.sl
    │   ├── amigo.sl
    │   ├── corpus-lex-diff.sl
    │   └── buildunigrams.sl
    ├── spelldata
    │   ├── makesrc.sl
    │   ├── process.sl
    │   ├── maker.sl
    │   ├── torules.sl
    │   ├── bootstrapspell.sl
    │   ├── gen2.sl
    │   ├── gen3.sl
    │   ├── gen.sl
    │   └── gen4.sl
    ├── tagger
    │   ├── tagit.sl
    │   ├── fixtags.sl
    │   ├── makebootstrap.sl
    │   ├── makesentences.sl
    │   └── postest.sl
    ├── common
    │   ├── score.sl
    │   ├── bywords.sl
    │   ├── utils.sl
    │   ├── hotest.sl
    │   ├── homo.sl
    │   ├── exp.sl
    │   ├── spellcontext.sl
    │   └── spelltests.sl
    ├── rules
    │   ├── agreement.sl
    │   ├── findprepositions.sl
    │   ├── makespecial.sl
    │   ├── transr.sl
    │   ├── testr.sl
    │   ├── makeprepositions.sl
    │   └── testgr.sl
    └── spell
    │   ├── seededits.sl
    │   └── definitions.sl
├── run-lowmem.bat
├── run.sh
├── run-lowmem.sh
├── CREDITS.rules.txt
├── CREDITS.txt
└── models
    └── get_model_binaries.sh


/service/root/index.html:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/service/src/data:
--------------------------------------------------------------------------------
1 | ../../data


--------------------------------------------------------------------------------
/service/src/lang:
--------------------------------------------------------------------------------
1 | ../../lang


--------------------------------------------------------------------------------
/service/src/lib:
--------------------------------------------------------------------------------
1 | ../../lib


--------------------------------------------------------------------------------
/service/src/models:
--------------------------------------------------------------------------------
1 | ../../models


--------------------------------------------------------------------------------
/service/src/view/rules/empty.slp:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/service/src/view/rules/nom.slp:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/service/src/view/rules/redundant.slp:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/data/rules/nohomophone.txt:
--------------------------------------------------------------------------------
1 | me
2 | based
3 | we
4 | 


--------------------------------------------------------------------------------
/service/src/local.sl:
--------------------------------------------------------------------------------
1 | # put local modifications to service here
2 | 


--------------------------------------------------------------------------------
/lib/sleep.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Automattic/atd-server-next/HEAD/lib/sleep.jar


--------------------------------------------------------------------------------
/lib/cngram.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Automattic/atd-server-next/HEAD/lib/cngram.jar


--------------------------------------------------------------------------------
/lib/moconti.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Automattic/atd-server-next/HEAD/lib/moconti.jar


--------------------------------------------------------------------------------
/service/src/view/problem.slp:
--------------------------------------------------------------------------------
1 | <results>
2 |   <message><% $1 %></message>
3 | </results>
4 | 


--------------------------------------------------------------------------------
/lib/spellutils.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Automattic/atd-server-next/HEAD/lib/spellutils.jar


--------------------------------------------------------------------------------
/service/src/view/rules/passive.slp:
--------------------------------------------------------------------------------
1 |   <p>
2 |    <% $1["rule"] %> - <em><% $2 %></em>
3 |   </p>
4 | 


--------------------------------------------------------------------------------
/bin/quality.sh:
--------------------------------------------------------------------------------
1 | java -Xmx3328M -XX:+AggressiveHeap -XX:+UseParallelGC -jar lib/sleep.jar utils/bigrams/qscore.sl $1
2 | 


--------------------------------------------------------------------------------
/service/src/view/rules/avoid.slp:
--------------------------------------------------------------------------------
1 |   <p>Translation:</p>
2 |   <ul>
3 |    <li><% $1["word"] %></li>
4 |   </ul>
5 | 


--------------------------------------------------------------------------------
/bin/dictgrep.sh:
--------------------------------------------------------------------------------
1 | java -Xmx3384M -XX:+AggressiveHeap -XX:+UseParallelGC -jar lib/sleep.jar utils/bigrams/dictgrep.sl $1
2 | 


--------------------------------------------------------------------------------
/bin/tagit.sh:
--------------------------------------------------------------------------------
1 | #
2 | java -Datd.lowmem=true -Xmx3384M -XX:+AggressiveHeap -XX:+UseParallelGC -jar lib/sleep.jar utils/tagger/tagit.sl $1
3 | 


--------------------------------------------------------------------------------
/bin/testr.sh:
--------------------------------------------------------------------------------
1 | java -Datd.lowmem=true -Xmx3384M -XX:+AggressiveHeap -XX:+UseParallelGC -jar lib/sleep.jar utils/rules/testr.sl $1 $2
2 | 


--------------------------------------------------------------------------------
/data/rules/grammar/indef_uncount:
--------------------------------------------------------------------------------
1 | a|an &uncountable .*/RP|VBZ|IN::word=\1 \2::filter=none
2 | a|an &uncountable 0END.0::word=\1::filter=none 
3 | 
4 | 


--------------------------------------------------------------------------------
/service/src/view/metric.slp:
--------------------------------------------------------------------------------
1 |    <metric>
2 |       <type><% $1 %></type>
3 |       <key><% $2 %></key>
4 |       <value><% $3 %></value>
5 |    </metric>
6 | 


--------------------------------------------------------------------------------
/service/src/view/suggestions.slp:
--------------------------------------------------------------------------------
1 |     <suggestions>
2 | <?sleep 
3 |         map({ println((' ' x 8) . "<option> $+ $1 $+ </option>"); }, $1); 
4 | ?>    </suggestions>
5 | 


--------------------------------------------------------------------------------
/bin/corpuswp.sh:
--------------------------------------------------------------------------------
1 | #
2 | # convert a WordPress WXR file to raw data suitable for use in the AtD corpus
3 | #
4 | 
5 | java -Xmx3584M -jar lib/sleep.jar utils/bigrams/corpuswp.sl $1
6 | 


--------------------------------------------------------------------------------
/bin/fixdata.sh:
--------------------------------------------------------------------------------
1 | #
2 | # do this once!
3 | #
4 | 
5 | cd data
6 | tar zxf corpora.tgz
7 | cd ..
8 | java -Xmx1024M -jar lib/sleep.jar utils/bigrams/fixgutenberg.sl data/corpus_gutenberg
9 | 


--------------------------------------------------------------------------------
/data/rules/grammar/aux_modals:
--------------------------------------------------------------------------------
1 | may|might|could|would .*/VBN|VBG::word=\0 \1:base::pivots=\1,\1:base
2 | may|might|could|would .*/VBZ::word=\0 \1:singular::pivots=\1,\1:singular
3 | 
4 | 


--------------------------------------------------------------------------------
/service/src/view/rules/bias.slp:
--------------------------------------------------------------------------------
1 |    <p>Replace <em><% $2 %></em> with</p>
2 | 
3 |    <ul>
4 |    <?sleep map({ println('<li>'.$1.'</li>'); }, split(', ', $1["word"])) ?>
5 |    </ul>
6 | 


--------------------------------------------------------------------------------
/bin/buildrules.sh:
--------------------------------------------------------------------------------
1 | #
2 | # This script creates the AtD rules
3 | #
4 | 
5 | java -Datd.lowmem=true -Xmx3536M -XX:+AggressiveHeap -XX:+UseParallelGC -jar lib/sleep.jar utils/rules/rules.sl
6 | 


--------------------------------------------------------------------------------
/bin/agreement.sh:
--------------------------------------------------------------------------------
1 | cd data/rules/agreement
2 | java -jar ../../../lib/sleep.jar ../../../utils/rules/agreement.sl chunk_single.r single.r chunk_plural.r plural.r >../grammar/agreement
3 | cd ../../..
4 | 


--------------------------------------------------------------------------------
/data/rules/grammar/personal_pronoun_case:
--------------------------------------------------------------------------------
 1 | #
 2 | # personal pronoun I is always uppercase.
 3 | #
 4 | 
 5 | i::word=I
 6 | i'll::word=I'll
 7 | i'm::word=I'm
 8 | i've::word=I've
 9 | i'd::word=I'd
10 | 


--------------------------------------------------------------------------------
/service/src/view/rules/complex.slp:
--------------------------------------------------------------------------------
1 |    <p>Replace <em><% $2 %></em> with</p>
2 | 
3 |    <ul>
4 |    <?sleep map({ println('<li>'.$1.'</li>'); }, suggestions2(split(", ", $1["word"]), $2)); ?>
5 |    </ul>
6 | 


--------------------------------------------------------------------------------
/bin/transr.sh:
--------------------------------------------------------------------------------
1 | #
2 | # run through a corpus and transform matching sentences using the specified rules.
3 | #
4 | 
5 | java -Xmx3384M -XX:+AggressiveHeap -XX:+UseParallelGC -jar lib/sleep.jar utils/rules/transr.sl $1 $2
6 | 


--------------------------------------------------------------------------------
/service/code/compile.txt:
--------------------------------------------------------------------------------
 1 | To compile this code:
 2 | 
 3 | 1. Create a symbolic link to 
 4 |    ln -s ../../lib lib
 5 | 
 6 | 2. Use Apache Ant to build everything
 7 |    ant clean
 8 |    ant
 9 |    cp spellutils.jar to lib
10 | 


--------------------------------------------------------------------------------
/bin/amigo.sh:
--------------------------------------------------------------------------------
1 | # find homophones in corpus for a language
2 | # ./bin/amigo.sh [language]
3 | 
4 | java -Xmx3840M -XX:+AggressiveHeap -XX:+UseParallelGC -Datd.lang=$1 -classpath lib/\* sleep.console.TextConsole utils/bigrams/amigo.sl
5 | 


--------------------------------------------------------------------------------
/bin/compilespelltools.sh:
--------------------------------------------------------------------------------
 1 | #
 2 | # Compiles the Sleep methods ported to Java contained in service/code
 3 | # 
 4 | 
 5 | cd service/code
 6 | ln -s ../../lib/ lib
 7 | ant clean
 8 | ant
 9 | mv spellutils.jar lib/spellutils.jar
10 | rm -f lib
11 | ant clean
12 | 


--------------------------------------------------------------------------------
/atdconfig.sl:
--------------------------------------------------------------------------------
 1 | #
 2 | # configuration file for the Moconti app server
 3 | #
 4 | 
 5 | [$server addSite: "service.afterthedeadline.com",
 6 |                   "service/src/site.sl",
 7 |                   "service/root",
 8 |                   ".",
 9 |                   "key"];
10 | 


--------------------------------------------------------------------------------
/README.txt:
--------------------------------------------------------------------------------
1 | After the Deadline - Open Source Language Checking Technology README
2 | ------------------
3 | 
4 | Documentation on this code is at http://open.afterthedeadline.com
5 | 
6 | See LICENSE.txt for license information.  Enjoy the software.
7 | 
8 | -- Raphael Mudge (rsmudge@gmail.com)
9 | 


--------------------------------------------------------------------------------
/bin/make3.sh:
--------------------------------------------------------------------------------
1 | java -Xmx1024M -jar sleep.jar gen3.sl corpus2 homophones.txt ho_test_gutenberg_context.txt
2 | java -Xmx1024M -jar sleep.jar gen2.sl corpus2 homophones.txt ho_train_gutenberg_context.txt
3 | java -Xmx1024M -jar sleep.jar gen3.sl /home/raffi/spell/corpus homophones.txt ho_test_wp_context.txt
4 | 


--------------------------------------------------------------------------------
/bin/testgr.sh:
--------------------------------------------------------------------------------
1 | java -Datd.lowmem=true -Xmx4048M -XX:+AggressiveHeap -XX:+UseParallelGC -jar lib/sleep.jar utils/rules/testgr.sl data/tests/grammar_wikipedia.txt
2 | java -Datd.lowmem=true -Xmx4048M -XX:+AggressiveHeap -XX:+UseParallelGC -jar lib/sleep.jar utils/rules/testgr.sl data/tests/grammar_gutenberg.txt
3 | 


--------------------------------------------------------------------------------
/service/src/view/rule.slp:
--------------------------------------------------------------------------------
1 | <h3><% invoke($1["recommendation"], @_) %></h3>
2 | 
3 | <p><% $1["description"] %> <% iff($1["source"] ne "", '<small>(<a href="' . $1["sourceurl"] . '" target="_new">' . $1["source"] . '</a>)</small>') %></p>
4 | 
5 | <?sleep display(getFileProper("service", "src", $1["view"]), $1, $2); ?>
6 | 


--------------------------------------------------------------------------------
/data/rules/grammar/infinitives:
--------------------------------------------------------------------------------
1 | # infinitive phrases
2 | # http://www.chompchomp.com/terms/infinitivephrase.htm
3 | 
4 | to is::filter=kill
5 | to .*/VBZ .*/DT|NN::word=\0 \1:base \2::pivots=\1,\1:base
6 | To .*/VBZ .*/DT|NN::word=\0 \1:base \2::pivots=\1,\1:base
7 | 
8 | need|going|have|ought to .*/VBG::word=\0 \1 \2:base::pivots=\2,\2:base
9 | 


--------------------------------------------------------------------------------
/bin/corpus-lex-diff.sh:
--------------------------------------------------------------------------------
 1 | #
 2 | # compare a corpus text file to the current wordlists and see what needs to be added 
 3 | #
 4 | 
 5 | # to generate a wordlist suitable for the AtD wordlists directory:
 6 | #
 7 | # ./bin/corpus-lex-diff.sh filename.txt 50 wordlist 
 8 | 
 9 | java -Xmx3072M -jar lib/sleep.jar utils/bigrams/corpus-lex-diff.sl $1 $2 $3
10 | 


--------------------------------------------------------------------------------
/utils/bigrams/printcorpus.sl:
--------------------------------------------------------------------------------
 1 | include("lib/nlp.sl");
 2 | 
 3 | $handle = openf(@ARGV[0]);
 4 | $data = readb($handle, -1);
 5 | closef($handle);
 6 | 
 7 | foreach $paragraph (splitByParagraph($data))
 8 | {
 9 |    println("PARAGRAPH BEGIN!");
10 | 
11 |    foreach $sentence ($paragraph)
12 |    {
13 |       println("     $sentence");
14 |    }
15 | }
16 | 


--------------------------------------------------------------------------------
/bin/trainhomophones.sh:
--------------------------------------------------------------------------------
1 | #
2 | # train and test the homophone misuse detection models
3 | 
4 | java -Datd.lowmem=true -Xmx3840M -XX:+AggressiveHeap -XX:+UseParallelGC -jar lib/sleep.jar utils/spell/trainspell.sl trainHomophoneModels
5 | java -Datd.lowmem=true -Xmx3840M -XX:+AggressiveHeap -XX:+UseParallelGC -jar lib/sleep.jar utils/spell/test.sl runHomophoneTests
6 | 


--------------------------------------------------------------------------------
/bin/traintagger.sh:
--------------------------------------------------------------------------------
1 | #
2 | # code to generate and evaluate the tagger models.
3 | #
4 | 
5 | java -Xmx3072M -XX:+AggressiveHeap -XX:+UseParallelGC -jar lib/sleep.jar utils/tagger/postrain.sl wikipedia_sentences_tagged_f.txt
6 | java -Xmx3072M -XX:+AggressiveHeap -XX:+UseParallelGC -jar lib/sleep.jar utils/tagger/postest.sl  data/gutenberg_sentences_tagged_f.txt
7 | 


--------------------------------------------------------------------------------
/run-lowmem.bat:
--------------------------------------------------------------------------------
1 | @echo off
2 | REM
3 | REM startup script for AtD web service
4 | REM
5 | java -Dfile.encoding=UTF-8 -XX:+AggressiveHeap -XX:+UseParallelGC -Datd.lowmem=true -Dbind.interface=127.0.0.1 -Dserver.port=1049 -Dsleep.classpath=$ATD_HOME/lib:$ATD_HOME/service/code -Dsleep.debug=24 -classpath .\lib\sleep.jar;.\lib\moconti.jar;.\lib\spellutils.jar httpd.Moconti atdconfig.sl
6 | 


--------------------------------------------------------------------------------
/data/rules/abbr.txt:
--------------------------------------------------------------------------------
 1 | Mr
 2 | Mrs
 3 | No
 4 | pp
 5 | St
 6 | no
 7 | Dr
 8 | Prof
 9 | Sr
10 | Bros
11 | etc
12 | vs
13 | esp
14 | Fig
15 | fig
16 | Jan
17 | Feb
18 | Mar
19 | Apr
20 | Jun
21 | Jul
22 | Aug
23 | Sep
24 | Sept
25 | Oct
26 | Nov
27 | Dec
28 | Ph.D
29 | PhD
30 | Lt
31 | LT
32 | 2Lt
33 | 1Lt
34 | Capt
35 | Maj
36 | Col
37 | Gen
38 | Brig
39 | Sgt
40 | Esq
41 | i.e
42 | e.g
43 | 


--------------------------------------------------------------------------------
/data/rules/agreement/plural.r:
--------------------------------------------------------------------------------
1 | *prefix* is a|the term|field::filter=kill::avoid=live, rest
2 | *prefix* is::word=*text* are, *transform*::filter=sane::avoid=live, rest
3 | *prefix* was::word=*text* were, *transform*::filter=sane::avoid=live, rest
4 | *prefix* doesn't::word=*text* don't, *transform*::filter=sane::avoid=live, rest
5 | *prefix* [a-z]+/VBZ::word=*text* \X:base, *transform*::filter=sane::avoid=live, rest
6 | 


--------------------------------------------------------------------------------
/bin/inspect.sh:
--------------------------------------------------------------------------------
 1 | #
 2 | # startup script for AtD web service
 3 | #
 4 | 
 5 | #!/bin/sh
 6 | 
 7 | export PRODUCTION=/home/atd
 8 | export ATD_HOME=/home/atd/atd
 9 | export LOG_DIR=$ATD_HOME/logs
10 | 
11 | export LC_CTYPE=en_US.UTF-8
12 | export LANG=en_US.UTF-8
13 | 
14 | java -Datd.lowmem=true -Dfile.encoding=UTF-8 -Xmx3512M -XX:+AggressiveHeap -XX:+UseParallelGC -jar lib/sleep.jar utils/bigrams/inspect.sl
15 | 
16 | 


--------------------------------------------------------------------------------
/data/rules/grammar/det_agreement_plural:
--------------------------------------------------------------------------------
1 | #
2 | # determiner agreement rules for determiners expecting a plural noun
3 | #
4 | 
5 | Both|Many|Several|Many|Few|Fewer|Two|Three|Four|Five|Six|Seven|Eight|Nine|Ten [a-z].*/NN [a-z].*ed/VBD::word=\0 \1:plural \2::pivots=\1,\1:plural
6 | both|these|those|us|many|several|few|fewer|two|three|four|five|six|seven|eight|nine|ten [a-z].*/NN [a-z].*ed/VBD::word=\0 \1:plural::pivots=\1,\1:plural
7 | 


--------------------------------------------------------------------------------
/run.sh:
--------------------------------------------------------------------------------
 1 | #
 2 | # startup script for AtD web service
 3 | #
 4 | 
 5 | #!/bin/sh
 6 | 
 7 | export ATD_HOME=.
 8 | export LOG_DIR=$ATD_HOME/logs
 9 | 
10 | java -server -Datd.lowmem=true -Dsleep.pattern_cache_size=8192 -Dbind.interface=127.0.0.1 -Dserver.port=1049 -Xmx3840M -XX:+AggressiveHeap -XX:+UseParallelGC -Dsleep.classpath=$ATD_HOME/lib:$ATD_HOME/service/code -Dsleep.debug=24 -classpath "$ATD_HOME/lib/*" httpd.Moconti atdconfig.sl
11 | 


--------------------------------------------------------------------------------
/data/rules/grammar/weare:
--------------------------------------------------------------------------------
1 | if were|where .*/VBN|RB|VBG::word=\0 we're \2::pivots=\1,we're
2 | what were|where .*/VBN::word=\0 we're \2::filter=none
3 | what were|where .*/RB|VBG::word=\0 we're \2::pivots=\1,we're
4 | since were|where .*/RB|VBN|VBG::word=\0 we're \2::pivots=\1,we're
5 | that were|where .*/VBG::word=\0 we're \2::pivots=\1,we're
6 | where were::word=where we're::pivots=were,we're
7 | we're are::word=we are, where are::pivots=we're,we,where
8 | 


--------------------------------------------------------------------------------
/run-lowmem.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | #
 3 | # startup script for AtD web service
 4 | #
 5 | 
 6 | export LC_CTYPE=en_US.UTF-8
 7 | export LANG=en_US.UTF-8
 8 | 
 9 | java -Dfile.encoding=UTF-8 -XX:+AggressiveHeap -XX:+UseParallelGC -Datd.lowmem=true -Dbind.interface=127.0.0.1 -Dserver.port=1049 -Dsleep.classpath=$ATD_HOME/lib:$ATD_HOME/service/code -Dsleep.debug=24 -classpath ./lib/sleep.jar:./lib/moconti.jar:./lib/spellutils.jar:./lib/* httpd.Moconti atdconfig.sl
10 | 


--------------------------------------------------------------------------------
/service/src/view/service.slp:
--------------------------------------------------------------------------------
 1 | <results>
 2 | <?sleep
 3 |     local('$error $rule $text $path $context @suggestions $unk $tags $index');
 4 |     foreach $error ($1)
 5 |     {
 6 |        ($rule, $text, $path, $context, @suggestions, $unk, $tags, $index) = $error;
 7 |        if ($rule !is $null)
 8 |        {
 9 |           display("service/src/view/error.slp", $text, $rule, $context, @suggestions, $tags, $index);
10 |        }
11 |     }
12 | ?></results>
13 | 


--------------------------------------------------------------------------------
/data/rules/pronouns.txt:
--------------------------------------------------------------------------------
 1 | # Personal Pronouns: 
 2 | # subjective, objective, reflective, possessive pronoun, possessive determiner
 3 | #
 4 | # http://wapedia.mobi/en/English_personal_pronouns
 5 | 
 6 | I, me, myself, mine, mine, my
 7 | we, us, ourselves, ours, our
 8 | you, you, yourselves, yours, your
 9 | he, him, himself, his, his
10 | she, her, herself, hers, her
11 | it, it, itself, its, its
12 | they, them, themselves, theirs, their
13 | who, whom, whose, whose
14 | 


--------------------------------------------------------------------------------
/data/rules/grammar/were:
--------------------------------------------------------------------------------
1 | were are|is|did|will::word=where \1::pivots=were,where
2 | is also were::word=is also where::pivots=were,where
3 | were .*/EX .*/VBZ::word=where \1 \2::pivots=were,where
4 | is were::word=is where::pivots=were,where
5 | where .*/VBN::word=were \1::pivot=where,were
6 | were .*/VB|VBP::word=where \1::pivot=were,where
7 | we|they|I|he|she where .*/NNP|VBN::word=\0 were \2::pivots=where,were
8 | who where::word=who were::pivots=where,were::options=where,were 
9 | 


--------------------------------------------------------------------------------
/bin/smallmodel.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # 
 3 | # Create a language model for low-memory AtD
 4 | #
 5 | rm -f models/model.zip
 6 | rm -rf tmp
 7 | mkdir tmp
 8 | java -Dfile.encoding=UTF-8 -Xmx3840M -XX:+AggressiveHeap -XX:+UseParallelGC -jar lib/sleep.jar utils/bigrams/buildsmallmodel.sl
 9 | cd tmp
10 | 
11 | # we're using this instead of zip because zip on some systems creates corrupt
12 | # zip files when dealing with as many files as we have... get the JDK out.
13 | jar -cf ../models/model.zip . 1>/dev/null
14 | cd ..
15 | 


--------------------------------------------------------------------------------
/data/rules/complex/been:
--------------------------------------------------------------------------------
 1 | has been .*ing and .*ing::word=\0 \2:participle and \4:participle
 2 | have been .*ing::word=\0 \2:participle
 3 | has been .*ing::word=\0 \2:participle
 4 | had been .*ing::word=\0 \2:participle
 5 | They've|they've been .*ing::word=\0 \2:participle
 6 | You've|you've been .*ing::word=\0 \2:participle
 7 | I've been .*ing::word=\0 \2:participle
 8 | We've|we've been .*ing::word=\0 \2:participle
 9 | should've been .*ing::word=\0 \2:participle
10 | could've been .*ing::word=\0 \2:participle
11 | would've been .*ing::word=\0 \2:participle
12 | 


--------------------------------------------------------------------------------
/data/rules/grammar/an:
--------------------------------------------------------------------------------
 1 | #
 2 | # these rules pick up when a/an are misused
 3 | #
 4 | 
 5 | # killing errors related to an indef article with a number 
 6 | # have to solve the problem with hundreds, teens, etc.
 7 | An|A|a|an [\d+]\w+::filter=kill
 8 | a|an|A|An RPG|RSS|XSS|SEC::filter=kill
 9 | 
10 | a/.* [aeiouyhAEIOUYH18]\w+/.*::filter=indefarticle::word=an \1
11 | an/.* [^aeiAEIMNRSX8]\w+/.*::filter=indefarticle::word=a \1
12 | 0BEGIN.0 A/.* [aeiouyhAEIOUYH18]\w+/.*::filter=indefarticle::word=An \1
13 | 0BEGIN.0 An/.* [^aeiAEIMNRSX8]\w+/.*::filter=indefarticle::word=A \1
14 | 


--------------------------------------------------------------------------------
/data/rules/grammar/whose:
--------------------------------------------------------------------------------
 1 | Who's|who's .*ing::filter=kill
 2 | 
 3 | who's .*/NN::word=whose \1::pivots=who's,whose
 4 | whose .*/DT::word=who's \1::pivots=whose,who's
 5 | Who's .*/NN::word=Whose \1::pivots=Who's,Whose
 6 | Whose .*/DT::word=Who's \1::pivots=Whose,Who's
 7 | 
 8 | about who's::word=about whose::pivots=who's,whose::options=who's,whose
 9 | who's actual::word=whose actual::pivots=who's,whose::options=who's,whose
10 | who's name::word=whose name::pivots=who's,whose::options=who's,whose
11 | who's previous::word=whose previous::pivots=who's,whose::options=who's,whose  
12 | 


--------------------------------------------------------------------------------
/lib/object.sl:
--------------------------------------------------------------------------------
 1 | # everything you need for Sleep OO 
 2 | sub object 
 3 | { 
 4 |    local('$function'); 
 5 |    $function = function("& $+ $type $+ :: $+ $0"); 
 6 |    if ($function !is $null) 
 7 |    { 
 8 |       return invoke($function, @_, $0, $this => $this); 
 9 |    } 
10 |    throw "$type $+ :: $+ $0 - no such method"; 
11 | } 
12 | 
13 | sub newObject 
14 | { 
15 |    local('$object'); 
16 |    $object = lambda(&object, $type => $1); 
17 |    # invoke the constructor 
18 |    invoke($object, sublist(@_, 1), "init", $this => $object); 
19 |    return $object; 
20 | } 
21 | 


--------------------------------------------------------------------------------
/bin/buildedits.sh:
--------------------------------------------------------------------------------
1 | #
2 | # seed the edits model
3 | # This model is nothing more than a cache of potential edits for common word mispellings.  The purpose is to speed up processing.  AtD uses an LRU cache
4 | # when running to track and grow this information.  The seeding is done because the edits operation is so expensive that have this information available
5 | # makes training, testing, and warm up time significantly faster.
6 | #
7 | java  -Xmx3840M -XX:+AggressiveHeap -XX:+UseParallelGC -jar lib/sleep.jar utils/spell/seededits.sl sp_test_aspell_nocontext.txt sp_test_wpcm_nocontext.txt
8 | 


--------------------------------------------------------------------------------
/utils/bigrams/contextprob.sl:
--------------------------------------------------------------------------------
 1 | #
 2 | # a tool to inspect the language model
 3 | #
 4 | 
 5 | import org.dashnine.preditor.* from: lib/spellutils.jar;
 6 | use(^SpellingUtils);
 7 | 
 8 | # misc junk
 9 | include("lib/dictionary.sl");
10 | global('$__SCRIPT__ $model $rules $dictionary $network $dsize %edits $hnetwork $account $usage $endings $lexdb $trigrams $verbs');
11 | $model      = get_language_model();
12 | $dictionary = dictionary();
13 | $dsize      = size($dictionary);
14 | 
15 | $total = 0L;
16 | foreach $word ($dictionary) {
17 | 	$total += count($word);
18 | }
19 | 
20 | println($total);
21 | 


--------------------------------------------------------------------------------
/data/rules/grammar/subject_verb_agreement:
--------------------------------------------------------------------------------
 1 | # I rules with correctons
 2 | 
 3 | 0BEGIN.0 I is|be|are::word=I am::pivots=\1,am
 4 | 
 5 | # You rules with corrections
 6 | 
 7 | We|They|You is|am::word=\0 are::pivots=\1,are
 8 | We|They|You was::word=\0 were::pivots=\1,were
 9 | 0BEGIN.0 I has::word=\0 have::pivots=\1,have
10 | We|They|You has::word=\0 have::pivots=\1,have
11 | 
12 | # He/She rules with corrections
13 | 
14 | 0BEGIN.0 I were::word=\0 was::pivots=\1,was
15 | He|She were::word=\0 was::pivots=\1,was
16 | He|She have::word=\0 has::pivots=\1,has
17 | He|She be|am|are::word=\0 is::pivots=\1,is
18 | 
19 | 


--------------------------------------------------------------------------------
/service/src/view/rules/ruleview.slp:
--------------------------------------------------------------------------------
 1 | <div class="complex" name="<% unpack("H*", digest($1["path"]))[0] %>">
 2 |    <div style="float: right">
 3 |       <a class="button" href="javascript:display('<% unpack("H*", digest($1["path"]))[0] %>')" onclick="this.blur();">Display Rule</a>
 4 |       <a class="button" href="javascript:unignore('<% unpack("H*", digest($1["path"]))[0] %>')" onclick="this.blur();">Add Rule</a>
 5 |    </div>
 6 |    <h4><% $1["rule"] %>: <% $1["text"] %></h4>
 7 | 
 8 |    <p><% $1["description"] %> <small>(<a href="<% $1["sourceurl"] %>"><% $1["source"] %></a>)</small></p>
 9 | </div>
10 | 


--------------------------------------------------------------------------------
/data/rules/grammar/contractedformnot:
--------------------------------------------------------------------------------
 1 | ain't not::word=\0::filter=none
 2 | aren't not::word=\0::filter=none
 3 | can't not::word=\0::filter=none
 4 | couldn't not::word=\0::filter=none
 5 | didn't not::word=\0::filter=none
 6 | doesn't not::word=\0::filter=none
 7 | don't not::word=\0::filter=none
 8 | hasn't not::word=\0::filter=none
 9 | isn't not::word=\0::filter=none
10 | mightn't not::word=\0::filter=none
11 | mustn't not::word=\0::filter=none
12 | shan't not::word=\0::filter=none
13 | shouldn't not::word=\0::filter=none
14 | weren't not::word=\0::filter=none
15 | won't not::word=\0::filter=none
16 | wouldn't not::word=\0::filter=none
17 | 


--------------------------------------------------------------------------------
/service/src/view/rules/homophone.slp:
--------------------------------------------------------------------------------
 1 | <?sleep
 2 |    global('%homodict');
 3 |    if (size(%homodict) == 0)
 4 |    {
 5 |       local('$handle $text $word $def');
 6 |       $handle = openf("data/rules/definitions.txt");
 7 |       while $text (readln($handle))
 8 |       {
 9 |          ($word, $def) = split('\t+', $text);
10 |          %homodict[$word] = $def;
11 |       }
12 |    }
13 | ?>
14 | 
15 |    <p>Review definitions:</p>
16 | 
17 |    <ul>
18 |    <?sleep map({ println('<li><b>'.$1.'</b><br>' . %homodict[$1] . '</li>'); }, map({ return iff($1 in %homodict, $1, baseVerb($1)); }, split(', ', $1["word"]))); ?>
19 |    </ul>
20 | 
21 | 


--------------------------------------------------------------------------------
/bin/prepositions.sh:
--------------------------------------------------------------------------------
 1 | #
 2 | # code to generate rules for prepositions
 3 | #
 4 | 
 5 | echo '#' >data/rules/grammar/prepositions
 6 | echo '# This file is automatically generated by ./bin/prepositions.sh - do not edit' >> data/rules/grammar/prepositions
 7 | echo '#' >> data/rules/grammar/prepositions
 8 | 
 9 | java -Dfile.encoding=UTF-8 -Xmx3840M -XX:+AggressiveHeap -XX:+UseParallelGC -jar lib/sleep.jar utils/rules/findprepositions.sl >preps.tmp
10 | java -Dfile.encoding=UTF-8 -Xmx3840M -XX:+AggressiveHeap -XX:+UseParallelGC -jar lib/sleep.jar utils/rules/makeprepositions.sl preps.tmp >>data/rules/grammar/prepositions
11 | rm -f preps.tmp
12 | 


--------------------------------------------------------------------------------
/service/src/view/rules/homophone2.slp:
--------------------------------------------------------------------------------
 1 | <?sleep
 2 |    global('%homodict');
 3 |    if (size(%homodict) == 0)
 4 |    {
 5 |       local('$handle $text $word $def');
 6 |       $handle = openf("data/rules/definitions.txt");
 7 |       while $text (readln($handle))
 8 |       {
 9 |          ($word, $def) = split('\t+', $text);
10 |          %homodict[$word] = $def;
11 |       }
12 |    }
13 | ?>
14 | 
15 |    <p>Review definitions:</p>
16 | 
17 |    <ul>
18 |    <?sleep map({ println('<li><b>'.$1.'</b><br>' . %homodict[$1] . '</li>'); }, map({ return iff($1 in %homodict, $1, baseVerb($1)); }, split(',', $1["options"]))); ?>
19 |    </ul>
20 | 
21 | 


--------------------------------------------------------------------------------
/bin/buildgrammarsets.sh:
--------------------------------------------------------------------------------
 1 | #
 2 | # build grammar corpora
 3 | #
 4 | 
 5 | if [ -f wp.txt ]
 6 | then
 7 | 
 8 |   java -jar lib/sleep.jar utils/spelldata/torules.sl wrong >rules.out
 9 | 
10 |   # make the grammar rules files
11 | 
12 |   java -jar lib/sleep.jar utils/spelldata/maker.sl rules.out data/wikipedia_sentences.txt >data/tests/grammar_wikipedia.txt
13 |   java -jar lib/sleep.jar utils/spelldata/maker.sl rules.out data/gutenberg_sentences.txt >data/tests/grammar_gutenberg.txt
14 | 
15 |   rm -f rules.out
16 | 
17 | else
18 |   echo "No wp.txt file is present, cut and paste Wikipedia Common Errors List to wp.txt and try again"
19 | 
20 | fi
21 | 


--------------------------------------------------------------------------------
/data/rules/grammar/dneg2:
--------------------------------------------------------------------------------
 1 | #
 2 | # Style Double Negatives
 3 | #
 4 | 
 5 | not a|an unifable|unified|uniformed|unifying|united|undulated|undulating|universalized|universalised|unrest|(.*?der)|university|understood|understanding::filter=kill
 6 | not unifable|unified|uniformed|unifying|united|undulated|undulating|universalized|universalised|unrest|(.*?der)|university|understood|understanding::filter=kill
 7 | 
 8 | not a|an un[aeiouy].*::word=an \2:positive
 9 | not a|an un[^aeiouy].*::word=a \2:positive
10 | not un.*::word=\1:positive
11 | 
12 | # another double negative rule.  Changes the meaning of the sentence but is easier to understand
13 | dont have|need no::word=\0 \1 any::pivots=no,any
14 | 


--------------------------------------------------------------------------------
/bin/all.sh:
--------------------------------------------------------------------------------
 1 | ./bin/compilespelltools.sh # don't do this as the build box doesn't have ant on it (yet)
 2 | 
 3 | #
 4 | # set some vars that may help the cause.
 5 | #
 6 | export LC_CTYPE=en_US.UTF-8
 7 | export LANG=en_US.UTF-8
 8 | 
 9 | #
10 | # build the foundational NLP models
11 | #
12 | ./bin/buildmodel.sh
13 | #./bin/buildtaggersets.sh  # do not uncomment this
14 | 
15 | #
16 | # intermediate stuff
17 | #
18 | ./bin/buildrules.sh
19 | ./bin/testgr.sh
20 | ./bin/buildedits.sh
21 | 
22 | #
23 | # train various models
24 | #
25 | #./bin/traintagger.sh     # no good reason to do this unless tagger data changes
26 | ./bin/trainspellcontext.sh
27 | ./bin/trainspellnocontext.sh
28 | ./bin/trainhomophones.sh
29 | 


--------------------------------------------------------------------------------
/service/src/view/rules/category.slp:
--------------------------------------------------------------------------------
 1 | <div class="<% $2["style"] %>box" name="<% $1 %>">
 2 |    <h4><% $2["rule"] %></h4>
 3 | 
 4 |    <p><% $2["description"] %> <% iff($2["source"] ne "", '<small>(<a href="'.$2["sourceurl"].'">'.$2["source"].'</a>)</small>') %></p>
 5 | 
 6 |    <ul name="list.<% $1 %>">
 7 |    <?sleep
 8 |       local('$ex');
 9 | 
10 |       foreach $ex (filter(let({ return iff($1["category"] eq $cat, $1); }, $cat => $1), @exclude))
11 |       {
12 |          println('<li name="'. $ex["id"] .'">');
13 |          println('   <a href="javascript:unignore(\''. $ex["id"] .'\',\''.$1.'\')" onclick="this.blur();">'.$ex["text"].'</a>');
14 |          println('</li>');
15 |       }
16 |    ?>
17 |    </ul>
18 | </div>
19 | 


--------------------------------------------------------------------------------
/data/rules/grammar/possessive:
--------------------------------------------------------------------------------
 1 | #
 2 | # errors related to possession vs. plural 
 3 | #
 4 | 
 5 | Your|your|My|my|Their|their|Her|her|His|his|That|The|the|that [a-z].*/NNS .*/NN::word=\0 \1:possessive \2::pivots=\1,\1:possessive::filter=nextonly
 6 | with|a|an|With|A|an [a-z].*/NNS .*/NN::word=\0 \1:possessive \2::pivots=\1,\1:possessive::filter=nextonly
 7 | 
 8 | before|after|in|before|during|at [a-z].*s/NNS [a-z].*/NN::word=\0 \1:possessive \2::pivots=\1,\1:possessive::filter=nextonly
 9 | Before|After|In|Before|During|At [a-z].*s/NNS [a-z].*/NN::word=\0 \1:possessive \2::pivots=\1,\1:possessive::filter=nextonly
10 | about|in|for|on|with [a-z].*s/NNS [a-z].*/NN::word=\0 \1:possessive \2::pivots=\1,\1:possessive::filter=nextonly
11 | 
12 | 


--------------------------------------------------------------------------------
/utils/bigrams/builddict.sl:
--------------------------------------------------------------------------------
 1 | # 
 2 | # This is a script to generate a spellchecker dictionary using the specified threshold.  It's fun stuff.
 3 | #
 4 | # java -jar sleep.jar builddict.sl threshold models/model.bin models/dictionary.txt
 5 | #
 6 | 
 7 | debug(7 | 34);
 8 | 
 9 | import org.dashnine.preditor.* from: lib/spellutils.jar;
10 | use(^SpellingUtils);
11 | 
12 | include("lib/dictionary.sl");
13 | 
14 | sub main
15 | {
16 |    global('$model $threshold $handle $index $1 $2');
17 |    $model   = get_language_model($2);
18 | 
19 |    $handle = openf(iff($2 is $null, ">models/dictionary.txt", "> $+ $3"));
20 | 
21 |    printAll($handle, [SleepUtils getArrayWrapper: [$model harvest: int($1)]]);
22 | 
23 |    closef($handle);
24 | }
25 | 
26 | invoke(&main, @ARGV);
27 | 


--------------------------------------------------------------------------------
/CREDITS.rules.txt:
--------------------------------------------------------------------------------
 1 | The AtD rule set was inspired from many resources and projects around the web.  
 2 | The following resources were particularly helpful:
 3 | 
 4 | LanguageTool Open Source Language Checker
 5 | http://www.languagetool.org
 6 | 
 7 | PlainLanguage.gov
 8 | http://www.plainlanguage.gov
 9 | 
10 | GNU Style and Diction
11 | http://www.gnu.org/software/diction/diction.html
12 | 
13 | Wikipedia
14 | http://en.wikipedia.org/wiki/Category:Wikipedia_style_guidelines
15 | http://en.wikipedia.org/wiki/Wikipedia:Lists_of_common_misspellings
16 |   (and many other lists...)
17 | 
18 | Graviax Grammar Checker
19 | http://graviax.sourceforge.net/
20 | 
21 | Cliches: Avoid Them Like the Plague
22 | http://suspense.net/whitefish/cliche.htm
23 | 
24 | WordNet - Lexical Database for English
25 | http://wordnet.princeton.edu/
26 | 


--------------------------------------------------------------------------------
/utils/spelldata/makesrc.sl:
--------------------------------------------------------------------------------
 1 | #
 2 | # transform the homophonesdb file into something our other scripts can handle
 3 | # using the bad\ngood format.
 4 | 
 5 | ($inh, $outh) = @ARGV;
 6 | 
 7 | $handle = openf("models/dictionary.txt");
 8 | putAll(%dictionary, readAll($handle), { return 1; });
 9 | closef($handle);
10 | 
11 | $handle = openf($inh);
12 | @data = readAll($handle);
13 | closef($handle);
14 | 
15 | $handle = openf("> $+ $outh");
16 | foreach $d (@data)
17 | {
18 |    @words = split(',\s*', $d);
19 |    foreach $w1 (@words)
20 |    {
21 |       foreach $w2 (@words)
22 |       {
23 |          if ($w1 ne $w2 && $w1 in %dictionary && $w2 in %dictionary)
24 |          {
25 |             println($handle, "$w2");
26 |             println($handle, "$w1");
27 |          }
28 |       }
29 |    }
30 | }
31 | closef($handle);
32 | 


--------------------------------------------------------------------------------
/utils/bigrams/corpuswp.sl:
--------------------------------------------------------------------------------
 1 | #
 2 | # Export posts (only!) from a WordPress WXR file and make the content as plain text as possible.
 3 | # use this to preprocess a file for adding to data/corpus_extra
 4 | #
 5 | 
 6 | $handle = openf(@ARGV[0]);
 7 | $data = readb($handle, -1);
 8 | closef($handle);
 9 | 
10 | $data = join(' ', split("\n|\r", $data));
11 | @data = matches($data, '\<content\:encoded\>\<\!\[CDATA\[(.*?)\]\]\>\</content\:encoded\>');
12 | 
13 | foreach $index => $data (@data)
14 | {
15 |    if (strlen($data) > 0)
16 |    {
17 |       $data = strrep($data, '&amp;', '&', '&nbsp;', ' ', '<br>', "\n", '<p>', "\n", '&quote;', '"', '&#8220;', "'", '&#8221;', "'", '&#8217;', "'", '&laquo;', '"', '&raquo;', '"', '&rsquo;', "'");
18 |       $data = replace($data, '(<[^>]*?>)', '');
19 |       println($data);
20 |    }
21 | }
22 | 


--------------------------------------------------------------------------------
/bin/trainspellnocontext.sh:
--------------------------------------------------------------------------------
 1 | #
 2 | # train and test the spellchecker models
 3 | #
 4 | 
 5 | java -Datd.lowmem=true -Xmx3536M -XX:+AggressiveHeap -XX:+UseParallelGC -jar lib/sleep.jar utils/spell/trainspell.sl trainNoContext
 6 | 
 7 | echo "=== NON-CONTEXTUAL DATA ======================================================================="
 8 | 
 9 | java -Datd.lowmem=true -Xmx3536M -XX:+AggressiveHeap -XX:+UseParallelGC -jar lib/sleep.jar utils/spell/test.sl runSpellingTest sp_test_aspell_nocontext.txt
10 | java -Datd.lowmem=true -Xmx3536M -XX:+AggressiveHeap -XX:+UseParallelGC -jar lib/sleep.jar utils/spell/test.sl runSpellingTest sp_test_wpcm_nocontext.txt
11 | 
12 | # normal spelling test
13 | #java -Xmx1024M -jar lib/sleep.jar utils/spell/test.sl runSpellingTest tests1.txt
14 | #java -Xmx1024M -jar lib/sleep.jar utils/spell/test.sl runSpellingTest tests2.txt
15 | 


--------------------------------------------------------------------------------
/utils/tagger/tagit.sl:
--------------------------------------------------------------------------------
 1 | # this script simply tags sentences in a file.  it assumes each setence is on a line by itself.
 2 | 
 3 | include("lib/engine.sl");
 4 | include("utils/rules/rules.sl");
 5 | 
 6 | sub initAll
 7 | {
 8 |    global('$__SCRIPT__ $model $rules $dictionary $network $dsize %edits $hnetwork $account $usage $endings $lexdb $trigrams $verbs');
 9 |    $model      = get_language_model();
10 |    $dsize      = size($dictionary);
11 |    $hnetwork   = get_network("hnetwork.bin");
12 |    $verbs      = loadVerbData();
13 |    initTaggerModels();
14 | }
15 | 
16 | sub main
17 | {
18 |    local('$handle $sentence @results @past');
19 | 
20 |    initAll();
21 | 
22 |    $handle = openf($1);
23 |    while $sentence (readln($handle))
24 |    {
25 |       println(taggerToString(taggerWithTrigrams(splitIntoWords($sentence))));
26 |    }
27 | } 
28 | 
29 | invoke(&main, @ARGV);
30 | 


--------------------------------------------------------------------------------
/utils/tagger/fixtags.sl:
--------------------------------------------------------------------------------
 1 | sub main
 2 | {
 3 |    local('$handle $sentence $entry $word $tag $previous @s');
 4 |    $handle = openf($1);
 5 |    while $sentence (readln($handle))
 6 |    {
 7 |       @s = @();
 8 | 
 9 |       foreach $entry (split(' ', $sentence))
10 |       {
11 |          ($word, $tag) = split('/', $entry);
12 |          if ("'" isin $word && size(@s) > 0)
13 |          {
14 |             if ($tag eq "''")
15 |             {
16 |                @s[-1] = @(@s[-1][0] . $word, @s[-1][1]);
17 |             }
18 |             else
19 |             {
20 |                @s[-1] = @(@s[-1][0] . $word, @s[-1][1] . ',' . $tag);
21 |             }
22 |          }
23 |          else
24 |          {
25 |             push(@s, @(lc($word), $tag));
26 |          }
27 | 
28 |       }
29 |       println(  join(" ", map({ return join('/', $1); }, @s)) );
30 |    }
31 | }
32 | 
33 | invoke(&main, @ARGV);
34 | 


--------------------------------------------------------------------------------
/data/rules/prepositions.txt:
--------------------------------------------------------------------------------
 1 | about
 2 | above
 3 | according to
 4 | across
 5 | after
 6 | against
 7 | along
 8 | along with
 9 | among
10 | apart from
11 | around
12 | as
13 | as for
14 | at
15 | because of
16 | before
17 | behind
18 | below
19 | beneath
20 | beside
21 | between
22 | beyond
23 | but*
24 | by
25 | by means of
26 | concerning
27 | despite
28 | down
29 | during
30 | except
31 | except for
32 | excepting
33 | for
34 | from
35 | in
36 | in addition to
37 | in back of
38 | in case of
39 | in front of
40 | in place of
41 | inside
42 | in spite of
43 | instead of
44 | into
45 | like
46 | near
47 | next
48 | of
49 | off
50 | on
51 | onto
52 | on top of
53 | out
54 | out of
55 | outside
56 | over
57 | past
58 | regarding
59 | round
60 | since
61 | through
62 | throughout
63 | till
64 | to
65 | toward
66 | under
67 | underneath
68 | unlike
69 | until
70 | up
71 | upon
72 | up to
73 | with
74 | within
75 | without
76 | 


--------------------------------------------------------------------------------
/utils/spelldata/process.sl:
--------------------------------------------------------------------------------
 1 | $handle = openf("spelling.txt");
 2 | 
 3 | global('%dataset');
 4 | 
 5 | while $bad (readln($handle))
 6 | {
 7 |    $good = readln($handle);
 8 |    %dataset[$bad] = $good;
 9 | }
10 | 
11 | closef($handle);
12 | 
13 | 
14 | $handle = openf("batch0.tab");
15 | while $text (readln($handle))
16 | {
17 |    ($bad, $good) = split('\s+', $text);
18 |    %dataset[$bad] = $good;
19 | }
20 | 
21 | closef($handle);
22 | 
23 | $handle = openf("batch0.tab.1");
24 | while $text (readln($handle))
25 | {
26 |    ($bad, $good) = split('\s+', $text);
27 |    %dataset[$bad] = $good;
28 | }
29 | 
30 | closef($handle);
31 | 
32 | $handle = openf(">output.txt");
33 | $handle2 = openf(">output2.txt");
34 | 
35 | @bads = sorta(keys(%dataset));
36 | foreach $bword (@bads)
37 | {
38 |    println($handle, $bword);
39 |    println($handle2, $bword);
40 |    println($handle, %dataset[$bword]);
41 | }
42 | 
43 | closef($handle);
44 | closef($handle2);
45 | 


--------------------------------------------------------------------------------
/bin/buildmodel.sh:
--------------------------------------------------------------------------------
 1 | #
 2 | # This script creates the AtD bigram model (corpus.zip)
 3 | #
 4 | 
 5 | java -version
 6 | 
 7 | rm -f models/model.bin
 8 | 
 9 | java -Dfile.encoding=UTF-8 -Xmx6840M -XX:+AggressiveHeap -XX:+UseParallelGC -jar lib/sleep.jar utils/bigrams/buildcorpus.sl data/corpus_gutenberg
10 | java -Dfile.encoding=UTF-8 -Xmx6840M -XX:+AggressiveHeap -XX:+UseParallelGC -jar lib/sleep.jar utils/bigrams/buildcorpus.sl data/corpus_wikipedia
11 | java -Dfile.encoding=UTF-8 -Xmx6840M -XX:+AggressiveHeap -XX:+UseParallelGC -jar lib/sleep.jar utils/bigrams/buildcorpus.sl data/corpus_extra
12 | 
13 | # build dictionary (make sure it's done *after* zipping)
14 | 
15 | java -Dfile.encoding=UTF-8 -Xmx6840M -XX:NewSize=512M -jar lib/sleep.jar utils/bigrams/builddict.sl 2
16 | 
17 | # create the not misspelled dictionary...
18 | 
19 | cp data/wordlists/accented.txt models/not_misspelled.txt
20 | 
21 | # create LM for low-memory AtD
22 | ./bin/smallmodel.sh
23 | 


--------------------------------------------------------------------------------
/service/src/view/wordpress_gen.slp:
--------------------------------------------------------------------------------
 1 | <html>
 2 |  <head>
 3 |    <META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8">
 4 |    <title><% $1["rule"] %></title>
 5 |    <style>
 6 |    body
 7 |    {
 8 |       background: #f1f1f1;
 9 |       font-family:"Lucida Grande",Verdana,Arial,sans-serif;
10 |       font-size:12px;
11 |       font-size-adjust:none;
12 |    }
13 | 
14 |    h1, h2, h3, h4
15 |    {
16 |       padding: 0;
17 |       margin: 0;
18 |    }
19 | 
20 |    #content
21 |    {
22 |       background: #ffffff;
23 |       border: solid 1px #dfdfdf;
24 |       margin: 0.25em;
25 |       padding: 1.5em;
26 |       height: 292px;
27 |    }
28 | 
29 |    a
30 |    {
31 |       color: #259ac8;
32 |    }
33 | 
34 |    </style>
35 |  </head>
36 |  <body>
37 | 
38 | <h1><% $1["rule"] %></h1>
39 | 
40 | <div id="content">
41 |   <?sleep display("service/src/view/rule.slp", $1, $2); ?>
42 | </div>
43 |  </body>
44 | </html>
45 | 
46 | 
47 | 
48 | 
49 | 
50 | 


--------------------------------------------------------------------------------
/bin/buildspelldata.sh:
--------------------------------------------------------------------------------
 1 | java -Xmx2536M -XX:NewSize=512M -jar lib/sleep.jar utils/spelldata/gen2.sl data/corpus_gutenberg data/tests/sp_test_wpcm_nocontext.txt data/tests/sp_test_gutenberg_context1.txt
 2 | java -Xmx2536M -XX:NewSize=512M -jar lib/sleep.jar utils/spelldata/gen2.sl data/corpus_gutenberg data/tests/sp_test_aspell_nocontext.txt data/tests/sp_test_gutenberg_context2.txt
 3 | 
 4 | java -Xmx2536M -XX:NewSize=512M -jar lib/sleep.jar utils/spelldata/gen2.sl data/corpus_wikipedia data/tests/sp_test_wpcm_nocontext.txt data/tests/sp_test_wp_context1.txt
 5 | java -Xmx2536M -XX:NewSize=512M -jar lib/sleep.jar utils/spelldata/gen2.sl data/corpus_wikipedia data/tests/sp_test_aspell_nocontext.txt data/tests/sp_test_wp_context2.txt
 6 | 
 7 | #java -Xmx2536M -XX:NewSize=512M -jar lib/sleep.jar utils/spelldata/gen2.sl data/corpus_gutenberg data/tests/train.txt data/tests/sp_train_gutenberg_context.txt
 8 | #echo "are * blind|you, oyu" >>data/tests/sp_train_gutenberg_context.txt
 9 | 
10 | 


--------------------------------------------------------------------------------
/data/rules/grammar/count:
--------------------------------------------------------------------------------
 1 | # lowercase
 2 | fewer &uncountable::word=less \1::filter=indefarticle
 3 | &uncountable or fewer::word=\0 or less::filter=none
 4 | few &uncountable::word=little \1::filter=indefarticle
 5 | 
 6 | the less::filter=die
 7 | less &uncountable::filter=kill
 8 | less .*/NNS::word=fewer \1::filter=indefarticle
 9 | 
10 | little people::filter=kill
11 | little &uncountable::word=few \1::filter=indefarticle
12 | 
13 | # uppercase
14 | Fewer &uncountable::word=Less \1::filter=indefarticle
15 | Few &uncountable::word=Little \1::filter=indefarticle
16 | 
17 | The less::filter=die
18 | Less &uncountable::filter=kill
19 | Less .*/NNS::word=Fewer \1::filter=indefarticle
20 | 
21 | Little people::filter=kill
22 | Little &uncountable::word=Few \1::filter=indefarticle
23 | 
24 | # hide situations where the uncountable noun is used as an adjective
25 | # (e.g., water snails)
26 | few|fewer|Few|Fewer &uncountable .*/NNS::filter=kill
27 | little|Little &uncountable .*/NNS::filter=kill
28 | 
29 | 


--------------------------------------------------------------------------------
/bin/trainspellcontext.sh:
--------------------------------------------------------------------------------
 1 | #
 2 | # train and test the spellchecker models
 3 | #
 4 | 
 5 | java -Datd.lowmem=true -Xmx3536M -XX:+AggressiveHeap -XX:+UseParallelGC -jar lib/sleep.jar utils/spell/trainspell.sl trainWithContext
 6 | 
 7 | echo "=== CONTEXTUAL DATA ==========================================================================="
 8 | 
 9 | java -Datd.lowmem=true -Xmx3536M -XX:+AggressiveHeap -XX:+UseParallelGC -jar lib/sleep.jar utils/spell/test.sl runSpellingContextTest sp_test_wp_context1.txt
10 | java -Datd.lowmem=true -Xmx3536M -XX:+AggressiveHeap -XX:+UseParallelGC -jar lib/sleep.jar utils/spell/test.sl runSpellingContextTest sp_test_wp_context2.txt
11 | java -Datd.lowmem=true -Xmx3536M -XX:+AggressiveHeap -XX:+UseParallelGC -jar lib/sleep.jar utils/spell/test.sl runSpellingContextTest sp_test_gutenberg_context1.txt
12 | java -Datd.lowmem=true -Xmx3536M -XX:+AggressiveHeap -XX:+UseParallelGC -jar lib/sleep.jar utils/spell/test.sl runSpellingContextTest sp_test_gutenberg_context2.txt
13 | 
14 | 


--------------------------------------------------------------------------------
/CREDITS.txt:
--------------------------------------------------------------------------------
 1 | After the Deadline uses the following libraries:
 2 | 
 3 | lib/cngramj.jar 
 4 | http://ngramj.sourceforge.net/
 5 | 
 6 |   ngramj is a language guessing library for Java. It's licensed under the LGPL.
 7 |   I modified it by packaging a language profile for Indonesian:
 8 |   http://blog.afterthedeadline.com/2010/02/08/n-gram-language-guessing-with-ngramj/
 9 | 
10 | lang/lib/languagetool
11 | http://www.languagetool.org
12 | 
13 |   Language Tool is a rule-based language checking program. It's licensed under the LGPL.
14 |   No modifications to Language Tool were made.
15 | 
16 | lang/*/wordlists/*.utf8.txt
17 | 
18 |   Several dictionaries were extracted from the Open Office dictionaries page and converted 
19 |   to their normal form using unmunch and then converted to UTF8 by me.
20 | 
21 |   The licenses for the original source files range from GPL, LGPL, MPL (Mozilla Public 
22 |   License) and Creative Commons ShareAlike licenses.
23 | 
24 |   http://wiki.services.openoffice.org/wiki/Dictionaries
25 | 


--------------------------------------------------------------------------------
/utils/tagger/makebootstrap.sl:
--------------------------------------------------------------------------------
 1 | debug(7 | 34);
 2 | 
 3 | import java.util.List;
 4 | import java.io.BufferedReader;
 5 | import java.io.FileReader;
 6 | 
 7 | import edu.stanford.nlp.ling.Sentence from: stanford-postagger-2008-09-28.jar;
 8 | import edu.stanford.nlp.ling.TaggedWord from: stanford-postagger-2008-09-28.jar;
 9 | import edu.stanford.nlp.ling.HasWord from: stanford-postagger-2008-09-28.jar;
10 | import edu.stanford.nlp.tagger.maxent.MaxentTagger from: stanford-postagger-2008-09-28.jar;
11 | 
12 | global('$x $semaphore $handle $file @array');
13 | 
14 | $semaphore = semaphore();
15 | $handle = openf(@ARGV[1]);
16 | $file = @ARGV[0];
17 | 
18 | sub doit
19 | {
20 |    local('$taggedLine $tagger $text $sentence');
21 | 
22 |    $tagger = [new MaxentTagger: $file];
23 | 
24 |    while $text (readln($handle))
25 |    {
26 |       $sentence = [Sentence toSentence: cast(split('\s+', strrep($text, "'", " '")), ^String)];
27 |       $taggedLine = [$tagger tagSentence: $sentence];
28 |       println([$taggedLine toString: 0]);
29 |    }
30 | }
31 | 
32 | doit();
33 | 


--------------------------------------------------------------------------------
/service/src/view/tinymce.slp:
--------------------------------------------------------------------------------
 1 | <html xmlns="http://www.w3.org/1999/xhtml">
 2 | <head>
 3 |         <META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8">
 4 | 	<title><% $1["rule"] %></title>
 5 | 	<script type="text/javascript" src="tiny_mce/tiny_mce_popup.js"></script>
 6 |         <script type="text/javascript" src="tiny_mce/utils/mctabs.js"></script>
 7 | 	<base target="_self" />
 8 | </head>
 9 | <body style="overflow:hidden; display: hidden">
10 |   <div class="tabs">
11 |     <ul>
12 |       <li id="general_tab" class="current"><span><a href="javascript:mcTabs.displayTab('general_tab','general_panel');" onmousedown="return false;"><% $1["rule"] %></a></span></li>
13 |     </ul>
14 |   </div>
15 |   <div class="panel_wrapper">
16 |     <div id="general_panel" class="panel current">
17 |       <?sleep display("service/src/view/rule.slp", $1, $2); ?>
18 |     </div>
19 |   </div>
20 | 
21 |   <div class="mceActionPanel">
22 |    <div style="float: right">
23 |      <input type="button" id="cancel" name="cancel" value="{#close}" onclick="tinyMCEPopup.close();" />
24 |    </div>
25 |   </div>
26 | 
27 | </body>
28 | </html>
29 | 


--------------------------------------------------------------------------------
/data/rules/grammar/lay:
--------------------------------------------------------------------------------
 1 | # http://grammar.quickanddirtytips.com/lay-versus-lie.aspx
 2 | 
 3 | #
 4 | # Confused word: laid
 5 | #
 6 | 
 7 | laid ahead::word=lay ahead::pivots=laid,lay::options=laid,lay
 8 | 
 9 | #
10 | # Confused word: lay
11 | #
12 | 
13 | lay around::word=lie around::pivots=lay,lie::options=lay,lie
14 | lay low::word=lie low::pivots=lay,lie::options=lay,lie
15 | 
16 | #
17 | # Confused word: laying
18 | #
19 | 
20 | laying around::word=lying around::pivots=laying,lying::options=laying,lying
21 | laying low::word=lying low::pivots=laying,lying::options=laying,lying
22 | 
23 | #
24 | # Confused word: lays
25 | #
26 | 
27 | lays atop::word=lies atop::pivots=lays,lies::options=lays,lies
28 | lays beside::word=lies beside::pivots=lays,lies::options=lays,lies
29 | lays low::word=lies low::pivots=lays,lies::options=lays,lies
30 | lays near::word=lies near::pivots=lays,lies::options=lays,lies
31 | lays on::word=lies on::pivots=lays,lies::options=lays,lies
32 | 
33 | #
34 | # Confused word: lain
35 | #
36 | 
37 | was lain::word=was laid::pivots=lain,laid::options=lain,laid    
38 | were lain::word=were laid::pivots=lain,laid::options=lain,laid
39 | 


--------------------------------------------------------------------------------
/data/rules/grammar/its:
--------------------------------------------------------------------------------
 1 | # yes, I know some parts of this rule are redundant with others--why mess with a working formula
 2 | Its .*/JJ|JJS .*/NN .*/TO|PRP|NNP::word=It's \1 \2 \3::filter=none
 3 | Its .*/JJ|JJS .*/TO|PRP|NNP::word=It's \1 \2::filter=none
 4 | Its .*/JJ|JJS .*/NN a|an|that|because|as::word=It's \1 \2 \3::filter=none
 5 | Its .*/JJ|JJS a|an|that|because|as::word=It's \1 \2::filter=none
 6 | Its .*/JJ for::word=It's \1 \2::filter=none
 7 | 
 8 | Its .*/RB|DT::word=It's \1::filter=none
 9 | 
10 | its .*/DT|IN|MD|POS|PP|WDT|WP|WRB::name=its rule::word=it's \1::filter=none
11 | its .*/CC|RB::name=its rule::word=it's \1::pivots=its,it's
12 | Its .*/DT|IN|MD|POS|PP|WDT|WP|WRB::name=its rule::word=It's \1::filter=none
13 | Its .*/CC|RB::name=its rule::word=It's \1::pivots=Its,It's
14 | its .*ed/VBN|VBD .*/NN|NNS::word=it's \1 \2::filter=kill
15 | its .*ed/VBN|VBD::word=it's \1::pivots=its,it's
16 | Its .*ed/VBN|VBD .*/NN|NNS::word=it's \1 \2::filter=kill
17 | Its .*ed/VBN|VBD::word=it's \1::pivots=its,it's
18 | 
19 | its not::word=it's not::pivots=its,it's
20 | 
21 | 
22 | its .*/VBG .*/NN|NNS::filter=kill
23 | its .*/VBG::word=it's \1::pivots=\0,it's
24 | 
25 | 
26 | 


--------------------------------------------------------------------------------
/service/src/view/rules/nomit.slp:
--------------------------------------------------------------------------------
 1 | <?sleep
 2 |     local('$option $word $words $examples $o');
 3 |     $word   = split('\s+', $2)[$1["index"]];
 4 |     $option = denom($word);   
 5 | 
 6 |     if ($option ne "")
 7 |     {
 8 |        print("Try to revise <b> $+ $2 $+ </b> with ");
 9 | 
10 |        $o = map({ return "<b> $+ $1 $+ </b>"; }, split(', ', $option));
11 |        $o = filter(lambda({ if ($1 !in %nodupes) { %nodupes[$1] = 1; return $1; } }, %nodupes => %()), $o);
12 | 
13 |        if (size($o) == 1)
14 |        {
15 |           print($o[0]);
16 |        }
17 |        else
18 |        {
19 |           print([(join(",", sublist($o, 0, -1)) . " or " . $o[-1]) trim]);
20 |        }
21 | 
22 |        println(".");
23 |     }
24 |     else
25 |     {
26 |        println("You should revise <b> $+ $2 $+ </b> to bring out the verb.");
27 |     }
28 | ?>
29 | 
30 | <br>
31 | <br><b>Revision Examples</b> 
32 | <br>
33 | <br>Before: Bonuses are based on the <u>performance</u> of the company.
34 | <br>After: Bonuses are based on how the company <u>performs</u>.
35 | <br>
36 | <br>Before: An <u>Explanation</u> of Hidden Verbs.
37 | <br>After: Hidden Verbs <u>Explained</u>.
38 | 


--------------------------------------------------------------------------------
/utils/bigrams/buildsmallmodel.sl:
--------------------------------------------------------------------------------
 1 | #
 2 | # convert the large language model to pieces that we can load as needed
 3 | #
 4 | debug(7 | 34);
 5 | 
 6 | import org.dashnine.preditor.* from: lib/spellutils.jar;
 7 | use(^SpellingUtils);
 8 | 
 9 | # misc junk
10 | include("lib/dictionary.sl");
11 | global('$__SCRIPT__ $model $rules $dictionary $network $dsize %edits $hnetwork $account $usage $endings $lexdb $trigrams $verbs');
12 | $model      = get_language_model();
13 | 
14 | sub main {
15 |         local('$handle $x $entry $wid $file');
16 | 	$handle = openf(">models/stringpool.bin");
17 | 	writeObject($handle, [$model getStringPool]);
18 | 	writeObject($handle, [$model count]);
19 | 	closef($handle);
20 | 
21 | 	# make the necessary directories
22 | 	mkdir("tmp");
23 | 	for ($x = 0; $x < 512; $x++) {
24 | 		mkdir("tmp/ $+ $x");
25 | 	}
26 | 	
27 | 	# create each individual entry
28 | 	foreach $entry ([[[$model getStringPool] entrySet] iterator]) {
29 | 		$wid = [$entry getValue];
30 | 		$file = getFileProper("tmp", $wid % 512, $wid);
31 | 		$handle = openf("> $+ $file"); 
32 | 		writeAsObject($handle, [[$model getLanguageModel] get: $wid]);
33 | 		closef($handle);
34 | 	}
35 | }
36 | 
37 | invoke(&main, @ARGV);
38 | 


--------------------------------------------------------------------------------
/utils/common/score.sl:
--------------------------------------------------------------------------------
 1 | #
 2 | # code for the score object
 3 | #
 4 | 
 5 | sub sortScores
 6 | {
 7 |    return [$1 value] <=> [$2 value];
 8 | }
 9 | 
10 | sub score::init
11 | {
12 |    this('$desc $count $fneg $fpos $correct $sugg');
13 |    ($desc) = @_;
14 | }
15 | 
16 | sub score::record
17 | {
18 |    $count++;
19 | }
20 | 
21 | sub score::falseNegative
22 | {
23 |    $fneg++;
24 | }
25 | 
26 | sub score::falsePositive
27 | {
28 |    $fpos++;
29 | }
30 | 
31 | sub score::correct
32 | {
33 |    $correct++;
34 | }
35 | 
36 | sub score::correctSugg
37 | {
38 |    $sugg++;
39 | }
40 | 
41 | sub score::value
42 | {
43 |    return (double($correct) / $count);
44 | }
45 | 
46 | sub score::print
47 | {
48 |    println("Report for $desc");
49 |    println("Correct:        " . ((double($correct) / $count) * 100.0));
50 | 
51 |    if ($sugg != 0)
52 |    {
53 |    println("Suggestion Acc: " . ((double($sugg) / $count) * 100.0));
54 |    println("-" x 20);
55 |    }
56 |    if ($fneg != 0)
57 |    {
58 |    println("False Negative: " . ((double($fneg) / $count) * 100.0));
59 |    }
60 |    if ($fpos != 0)
61 |    {
62 |    println("False Positive: " . ((double($fpos) / $count) * 100.0));
63 |    }
64 | }
65 | 
66 | 
67 | 


--------------------------------------------------------------------------------
/bin/buildhomodata.sh:
--------------------------------------------------------------------------------
 1 | # generate the source data
 2 | rm -rf tmp
 3 | mkdir tmp
 4 | java -jar lib/sleep.jar utils/spelldata/makesrc.sl data/rules/homophonedb.txt tmp/homophones.txt
 5 | 
 6 | #
 7 | # build with parts-of-speech
 8 | #
 9 | java -jar lib/sleep.jar utils/spelldata/gen4.sl tmp/homophones.txt data/wikipedia_sentences.txt data/tests/ho_test_wp_pos_context.txt 15
10 | java -jar lib/sleep.jar utils/spelldata/gen4.sl tmp/homophones.txt data/gutenberg_sentences.txt data/tests/ho_test_gutenberg_pos_context.txt 15
11 | 
12 | # was 8
13 | java -jar lib/sleep.jar utils/spelldata/gen4.sl tmp/homophones.txt data/gutenberg_sentences.txt data/tests/ho_train_gutenberg_pos_context.txt 6
14 | 
15 | #
16 | # build without parts-of-speech
17 | #
18 | java -Xmx2536M -XX:NewSize=512M -jar lib/sleep.jar utils/spelldata/gen3.sl data/corpus_gutenberg tmp/homophones.txt data/tests/ho_test_gutenberg_context.txt
19 | java -Xmx2536M -XX:NewSize=512M -jar lib/sleep.jar utils/spelldata/gen2.sl data/corpus_gutenberg tmp/homophones.txt data/tests/ho_train_gutenberg_context.txt
20 | java -Xmx2536M -XX:NewSize=512M -jar lib/sleep.jar utils/spelldata/gen3.sl data/corpus_wikipedia tmp/homophones.txt data/tests/ho_test_wp_context.txt
21 | rm -rf tmp
22 | 


--------------------------------------------------------------------------------
/data/rules/agreement/single.r:
--------------------------------------------------------------------------------
 1 | *prefix* are::word=*text* is, *transform*::avoid=police, sheep, will, cannot, i, read, majority, half, might, let, let's::filter=sane
 2 | *prefix* were::word=*text* was, *transform*::avoid=police, sheep, will, cannot, i, read, majority, half, might, let, let's::filter=sane
 3 | *prefix* don't::word=*text* doesn't, *transform*::avoid=police, sheep, will, cannot, i, read, majority, half, might, let, let's::filter=sane
 4 | *prefix* [a-z]+/VBP::word=*text* \X:plural, *transform*::avoid=police, sheep, will, cannot, i, read, majority, half, might, let::filter=sane
 5 | *prefix* be::filter=kill
 6 | *prefix* by::filter=kill
 7 | *prefix* [a-z]+/VB is::filter=kill
 8 | *prefix* [a-z]+/VB of|for::filter=kill
 9 | *prefix* [a-z]+/VB [a-z]+/VBD|VBZ::filter=kill
10 | *prefix* [a-z]+/VB::word=*text* \X:plural, *transform*::avoid=police, sheep, will, cannot, i, read, majority, half, might, let, let's::filter=sane
11 | *prefix* are [a-z]+/VBN::filter=kill
12 | *prefix* [a-z]+/MD::filter=kill
13 | One|Two|Three|Four|Five|Six|Seven|Eight|Nine|Ten|Eleven|Twelve|Thirteen|Fourteen|Fifteen|Sixteen|Seventeen|Eighteen|Nineteen|Twenty|Thirty|Fourty|Fifty|Sixty|Seventy|Eighty|Ninenty dollars|pounds|points|feet|inches|meters::filter=kill
14 | 


--------------------------------------------------------------------------------
/utils/bigrams/fixgutenberg.sl:
--------------------------------------------------------------------------------
 1 | #
 2 | # this program fixes the gutenberg corpus by looping through each file and collapsing paragraphs onto a single line.
 3 | # this will lead to a more accurate language model which is a really good thing.
 4 | #
 5 | # do not do this twice or bad things will happen!!!!
 6 | #
 7 | 
 8 | sub fixFile
 9 | {
10 |    local('$handle $buffer $text $data');
11 | 
12 |    # read the file and populate our buffer please
13 | 
14 |    $buffer = allocate(lof($1));
15 |    $handle = openf($1);
16 |    while $text (readln($handle))
17 |    {
18 |       if ($text eq "")
19 |       {
20 |          print($buffer, "\n");
21 |       }
22 |       else
23 |       {
24 |          print($buffer, "$text ");
25 |       }
26 |    }
27 |    closef($handle);
28 |    closef($buffer);
29 | 
30 |    # read the contents of the buffer in
31 | 
32 |    $data = readb($buffer, -1);
33 |    closef($buffer);
34 | 
35 |    # transfer the contents of the buffer to 
36 | 
37 |    $handle = openf("> $+ $1");
38 |    writeb($handle, $data);
39 |    closef($handle);
40 | }
41 | 
42 | 
43 | map({
44 |    if (-isDir $1)
45 |    {
46 |       map($this, ls($1));
47 |    }  
48 |    else
49 |    {
50 |       fixFile($1);
51 |    }
52 | }, @ARGV);
53 | 
54 | println("Corpus Prepared");
55 | 


--------------------------------------------------------------------------------
/utils/rules/agreement.sl:
--------------------------------------------------------------------------------
 1 | #
 2 | # make a super rule file based on the chunker
 3 | #
 4 | 
 5 | sub fix {
 6 | 	local('$s $c $t');
 7 | 	$s = split('\s+', $1);
 8 | 	foreach $c => $t ($s) {
 9 | 		$t = "\\ $+ $c";
10 | 	}
11 | 	return join(" ", $s);
12 | }
13 | 
14 | sub count {
15 | 	local('$s $c $t');
16 | 	$s = split('\s+', $1);
17 | 	return "\\" . (size($s) + $2);
18 | }
19 | 
20 | sub noempties {
21 | 	return iff(strlen([$1 trim]) > 0, $1);
22 | }
23 | 
24 | sub makeData {
25 | 	local('$a $b');
26 | 	($a, $b) = split('::', $1);
27 | 	if (strlen($b) > 0) { $b = ", $b " . count($a); }
28 | 	return @($a, fix($a), count($a, 0), $b, count($a, 1));
29 | }
30 | 
31 | sub main {
32 | 	local('$handle @prefixes @rules $rule');
33 | 	$handle = openf($1);
34 | 	@prefixes = map(&makeData, filter(&noempties, readAll($handle)));
35 | 	closef($handle);
36 | 
37 | 	$handle = openf($2);
38 | 	@rules = readAll($handle);
39 | 	closef($handle);
40 | 
41 | 	foreach $rule (@rules) {
42 | 		printAll(map(lambda({ return '0BEGIN.0 ' . strrep($rule, '*prefix*', $1[0], '*text*', $1[1], '\\X', $1[2], '\\Y', $1[4], ', *transform*', $1[3]); }, \$rule), @prefixes));
43 | 	}
44 | 
45 | 	printAll(map({ return '0BEGIN.0 ' . $1[0] . "::filter=kill"; }, @prefixes));
46 | }
47 | 
48 | invoke(&main, sublist(@ARGV, 2));
49 | invoke(&main, @ARGV);
50 | 
51 | 


--------------------------------------------------------------------------------
/utils/common/bywords.sl:
--------------------------------------------------------------------------------
 1 | #
 2 | # this class looks at how often the trigram tagger guesses a word's correctness by the confused word
 3 | # used to generate the homobias class to
 4 | #
 5 | 
 6 | sub byword::init
 7 | {
 8 |    this('%data');
 9 | 
10 |    %data = ohash();
11 |    setMissPolicy(%data,
12 |    {
13 |       return newObject("score", "$2");
14 |    });
15 | }
16 | 
17 | sub byword::process
18 | {
19 |    local('$correct $wrong $wrongs $pre2 $pre1 $next @temp $nbase $tbase $solution $all %scores');
20 |    ($correct, $wrong, $wrongs, $pre2, $pre1, $next) = @_;
21 | 
22 |    $all = tagAll($pre2[1], $pre1[1], $pre1[0], $wrongs);
23 | 
24 |    if (isDifferent($all))
25 |    {
26 |       $solution = getBest($all)[0];
27 |       if ($solution eq $correct)
28 |       {
29 |          [%data[$solution] correct];
30 |       }
31 |       [%data[$solution] record];
32 |    }
33 | }
34 | 
35 | sub byword::finish
36 | {
37 |    map({ [$1 print]; }, sort(&sortScores, values(%data)));
38 | }
39 | 
40 | sub byword::save
41 | {
42 |    local('$key $value $handle');
43 |    foreach $key => $value (%data)
44 |    {
45 |       $value = [$value value];
46 |    #   warn("$key -> $value");
47 |    }
48 | 
49 |    $handle = openf(">models/bywords.bin");
50 |    writeObject($handle, %data);
51 |    closef($handle);
52 |    println("Model saved");
53 | }
54 | 


--------------------------------------------------------------------------------
/service/code/build.xml:
--------------------------------------------------------------------------------
 1 | <project name="Moconti" default="all" basedir=".">
 2 |   <property name="project.src"   location="src" />
 3 |   <property name="project.build" location="bin" />
 4 |   <property name="project.compiler" value="" />
 5 | 
 6 |   <property name="project.path"  value="rero" />
 7 | 
 8 |   <target name="all" depends="init, compile, jar" />
 9 | 
10 |   <target name="init">
11 |     <tstamp />
12 |     <mkdir dir="${project.build}" />
13 |   </target>
14 | 
15 |   <target name="compile" description="compile the source " >
16 |     <javac srcdir="${project.src}/" 
17 |            destdir="${project.build}"
18 |            nowarn="yes"
19 |            depend="yes"
20 |            source="1.4"
21 |            target="1.4"
22 |            debug="yes"
23 |            optimize="yes"
24 |     >
25 |        <classpath path="./lib/sleep.jar" />
26 |        <classpath path="./lib/cngram.jar" />
27 |     </javac>
28 |   </target>
29 | 
30 |   <target name="clean" description="clean up" >
31 |     <delete dir="${project.build}"/>
32 |   </target>
33 | 
34 |   <target name="jar" depends="compile">
35 |      <jar destfile="spellutils.jar"
36 |          basedir="bin"
37 |          includes="**/*">
38 |        <manifest>
39 |          <attribute name="Built-By" value="${user.name}"/>
40 |       </manifest>
41 |      </jar>
42 |    </target>
43 | </project>
44 | 


--------------------------------------------------------------------------------
/service/src/view/error.slp:
--------------------------------------------------------------------------------
 1 |   <error>
 2 |     <string><% $1 %></string>
 3 |     <description><% $2["rule"] %></description>
 4 |     <precontext><% iff($3 ne "0BEGIN.0" && $3 !isin ',()-[];:/--', $3) %></precontext>
 5 | <?sleep 
 6 |        if (size($4) > 0)
 7 |        {
 8 |           display("service/src/view/suggestions.slp", $4);
 9 |        }
10 | ?>
11 |     <type><?sleep
12 |        if ($2["style"] eq "green")
13 |        {
14 |           print("grammar");
15 |        } 
16 |        else if ($2["style"] eq "red")
17 |        {
18 |           print("spelling");
19 |        }
20 |        else
21 |        {
22 |           print("suggestion");
23 |        }
24 |     ?></type>
25 |     <?sleep 
26 |        if ($2["rule"] ne "Spelling" && $2["info"] ne "none")
27 |        {
28 |           local('@tokens @tags');
29 |           @tokens = split(' ', $1);
30 | 
31 |           if ($5 !is $null)
32 |           {
33 |              @tags   = sublist($5, 0, size(@tokens));
34 |           }
35 | 
36 | 	  
37 |     #      println('<url>http://service.afterthedeadline.com/info.slp?text='.[java.net.URLEncoder encode: $1].'</url>'); 
38 |           println('<url>' . $INFOURL . '/info.slp?text='.[java.net.URLEncoder encode: $1].'&amp;tags='.[java.net.URLEncoder encode: join('/', map({ return $1[1]; }, @tags))].'&amp;engine='.$6.'</url>'); 
39 |        }
40 |    ?>
41 |   </error>
42 | 
43 | 


--------------------------------------------------------------------------------
/utils/rules/findprepositions.sl:
--------------------------------------------------------------------------------
 1 | #
 2 | # a tool to inspect the language model
 3 | #
 4 | 
 5 | import org.dashnine.preditor.* from: lib/spellutils.jar;
 6 | use(^SpellingUtils);
 7 | 
 8 | # misc junk
 9 | include("lib/dictionary.sl");
10 | global('$__SCRIPT__ $model $rules $dictionary $network $dsize %edits $hnetwork $account $usage $endings $lexdb $trigrams $verbs');
11 | $model      = get_language_model();
12 | $dictionary = dictionary();
13 | $dsize      = size($dictionary);
14 | 
15 | global('@prepositions');
16 | @prepositions = filter({ return iff(indexOf($1, ' ') is $null, $1); }, map({ return [$1 trim]; }, `cat data/rules/prepositions.txt`));
17 | 
18 | foreach $word (sort({ return count($2) <=> count($1); }, keys($dictionary)))
19 | {
20 |    if (count($word) < 100)
21 |    {
22 |       continue;
23 |    }
24 | 
25 |    foreach $preposition (@prepositions)
26 |    {
27 |       # Pnext(preposition|word) 
28 |       if (Pbigram1($word, $preposition) > 0.50)
29 |       {
30 |          println("$word $+ , $preposition : Pbigram1( $+ $word $+ , $preposition $+ ) = " . Pbigram1($word, $preposition));
31 |       }
32 |       # Pprev(preposition|word)
33 |       else if (Pbigram2($preposition, $word) > 0.50)
34 |       {
35 |          println("$word $+ , $preposition : Pbigram2( $+ $preposition $+ , $word $+ ) = " . Pbigram2($preposition, $word));
36 |       }
37 |    }
38 | }
39 | 


--------------------------------------------------------------------------------
/data/rules/nomdb.txt:
--------------------------------------------------------------------------------
 1 | an NOM of|with|from
 2 | a NOM of|with|from
 3 | in the NOM with|of
 4 | in NOM with|of
 5 | the NOM with|of|from
 6 | come to a|an|the NOM
 7 | came to a|an|the NOM
 8 | make a|an|the NOM
 9 | makes a|an|the NOM
10 | making a|an|the NOM
11 | made a|an|the NOM
12 | do a|an|the NOM
13 | did a|an|the NOM
14 | does a|an|the NOM
15 | doesnt a|an|the NOM
16 | give a|an|the NOM
17 | given a|an|the NOM
18 | have a|an|the NOM
19 | has a|an|the NOM
20 | had a|an|the NOM
21 | having a|an|the NOM
22 | have a|an|the NOM
23 | achieve a|an|the NOM
24 | achieved a|an|the NOM
25 | be NOM
26 | provided a|an|the NOM
27 | perform a|an|the NOM
28 | performed a|an|the NOM
29 | conduct a|an|the NOM
30 | conducted a|an|the NOM
31 | accomplish a|an|the NOM
32 | accomplished a|an|the NOM
33 | achieved a|an|the NOM
34 | attained a|an|the NOM
35 | carry out a|an|the NOM
36 | carried out a|an|the NOM
37 | conduct a|an|the NOM
38 | conducted a|an|the NOM
39 | effected a|an|the NOM
40 | experienced a|an|the NOM
41 | experience a|an|the NOM
42 | facilitated a|an|the NOM
43 | given a|an|the NOM
44 | implemented a|an|the NOM
45 | indicate a|an|the NOM
46 | indicated a|an|the NOM
47 | involve a|an|the NOM
48 | involved a|an|the NOM
49 | made a|an|the NOM
50 | obtained a|an|the NOM
51 | occurred a|an|the NOM
52 | performed a|an|the NOM
53 | proceeded a|an|the NOM
54 | produced a|an|the NOM         
55 | required a|an|the NOM
56 | require a|an|the NOM
57 | 


--------------------------------------------------------------------------------
/utils/spell/seededits.sl:
--------------------------------------------------------------------------------
 1 | #
 2 | # this is a script to run unit tests and calculute the effectiveness of the 
 3 | # preditor engine
 4 | #
 5 | 
 6 | debug(debug() | 7 | 34);
 7 | 
 8 | map({ iff('*.sl' iswm $1, include($1)); }, ls("utils/common"));
 9 | 
10 | include("lib/engine.sl");
11 | include("lib/object.sl");
12 | 
13 | global('$dictionary $model $dsize $trie');
14 | $model      = get_language_model();
15 | $dictionary = dictionary();
16 | $trie       = trie($dictionary);
17 | $dsize      = size($dictionary);
18 | 
19 | sub seedFile
20 | {
21 |    local('$score $good $bad $word');
22 |    
23 |    $score = newObject("score", "Word pool accuracy: $1");
24 | 
25 |    while $word (words($1))
26 |    {  
27 |       ($bad, $good) = $word;
28 | 
29 |       if ($bad !in %edits)
30 |       {
31 |          %edits[$bad] = editst($dictionary, $trie, $bad); # filterByDictionary($bad, $dictionary);
32 |       }
33 | 
34 |       if ($good in %edits[$bad])
35 |       {
36 |          [$score correct];
37 |       }
38 |       else
39 |       {
40 | #         println("$bad -> $good ".editDistance($bad, $good)."  is not in " . %edits[$bad]);
41 |       }
42 |       [$score record];
43 |    }
44 | 
45 |    [$score print];
46 | }
47 | 
48 | global('%edits $handle');
49 | %edits = ohasha();
50 | 
51 | map(&seedFile, @ARGV);
52 | 
53 | $handle = openf(">models/edits.bin");
54 | writeObject($handle, %edits);
55 | closef($handle);
56 | 
57 | println("Edits flushed!");
58 | 


--------------------------------------------------------------------------------
/service/code/src/org/dashnine/preditor/GuessLanguage.java:
--------------------------------------------------------------------------------
 1 | package org.dashnine.preditor;
 2 | 
 3 | import sleep.bridges.*;
 4 | import sleep.runtime.*;
 5 | import sleep.interfaces.*;
 6 | 
 7 | import java.util.*;
 8 | 
 9 | import de.spieleck.app.cngram.NGramProfiles;
10 | 
11 | /** Utilities for the Sleep Spellchecker used in AtD */
12 | public class GuessLanguage implements Loadable, Function
13 | {
14 |    private static NGramProfiles profiles = null;
15 |    static
16 |    {
17 |       try
18 |       {
19 |          profiles = new NGramProfiles();
20 |       }
21 |       catch (Exception ex) { ex.printStackTrace(); }
22 |    }
23 | 
24 |    public String guessLanguage(String text)
25 |    {
26 |       if (text.length() > 1024)
27 |           text = text.substring(0, 1024);
28 | 
29 |       NGramProfiles.Ranker ranker = profiles.getRanker();
30 |       ranker.account(text);
31 |       NGramProfiles.RankResult result = ranker.getRankResult();
32 |       return result.getName(0);
33 |    }
34 | 
35 |    public Scalar evaluate(String name, ScriptInstance script, Stack args)
36 |    {
37 |       return SleepUtils.getScalar(guessLanguage(BridgeUtilities.getString(args, "")));
38 |    }
39 | 
40 |    public void scriptLoaded(ScriptInstance script)
41 |    {
42 |       script.getScriptEnvironment().getEnvironment().put("&guessLanguage", this);
43 |    }
44 |     
45 |    public void scriptUnloaded(ScriptInstance script) 
46 |    {
47 |        
48 |    }
49 | }
50 | 


--------------------------------------------------------------------------------
/data/rules/avoiddb.txt:
--------------------------------------------------------------------------------
 1 | All reasonable men think	I believe
 2 | As is well known		I think
 3 | As mentioned earlier		This is superfluous
 4 | As you know			You probably do not know
 5 | Critics claim			I claim
 6 | Experience shows that		My experience shows
 7 | For obvious reasons		I have no evidence
 8 | I don't know if you		You are ignorant
 9 | I don't want to bore you		This statement is boring
10 | I heard that			I don't have a reliable source
11 | I wouldn't hesitate to recommend	I recommend
12 | If you will			Please, pretty please, I'm begging you
13 | It has been decided that	I decided that
14 | It has been mentioned that	I say
15 | It is evident that		I think
16 | It is generally agreed that	Some people think
17 | It is known that		I think
18 | It is likely that		I have not good enough evidence
19 | It is not necessary to stress the fact		I should not need to tell you
20 | It is perhaps true to say	I do not know what to think
21 | People say			I say
22 | Popular wisdom has it that	I think
23 | So far as we know		We could be wrong
24 | Tentative conclusions		Possibilities
25 | The most typical example	The example that best suits my purpose
26 | There is evidence that		I don't have good evidence
27 | There is no doubt that		I am convinced
28 | To be honest with you		Up to this point, I have not told the truth
29 | To tell you the truth		Up to this point, I have not told the truth
30 | Would you object to		Here is my suggestion
31 | You probably never heard of	You are ignorant
32 | if you will			Please, pretty please, I'm begging you
33 | 


--------------------------------------------------------------------------------
/utils/bigrams/inspect.sl:
--------------------------------------------------------------------------------
 1 | #
 2 | # a tool to inspect the language model
 3 | #
 4 | 
 5 | debug(7 | 34);
 6 | 
 7 | import org.dashnine.preditor.* from: lib/spellutils.jar;
 8 | use(^SpellingUtils);
 9 | 
10 | # misc junk
11 | include("lib/dictionary.sl");
12 | global('$__SCRIPT__ $model $rules $dictionary $network $dsize %edits $hnetwork $account $usage $endings $lexdb $trigrams $verbs');
13 | $model      = get_language_model();
14 | $dictionary = dictionary();
15 | $dsize      = size($dictionary);
16 | 
17 | print("> ");
18 | 
19 | while $command (readln())
20 | {
21 |    @temp = split('\s+', $command);
22 |    if (size(@temp) == 5)
23 |    {
24 |       println("Trigram 1: " . sublist(@temp, 0, 3) . " = " . Ptrigram(@temp[0], @temp[1], @temp[2]));
25 |       println("Trigram 2: " . sublist(@temp, 2, 5) . " = " . Ptrigram2(@temp[2], @temp[3], @temp[4]));
26 |    }
27 |    else if (size(@temp) == 3)
28 |    {
29 |       println("Trigram 1: " . @temp . " = " . Ptrigram(@temp[0], @temp[1], @temp[2]));
30 |       println("Trigram 2: " . @temp . " = " . Ptrigram2(@temp[0], @temp[1], @temp[2]));
31 |    }
32 |    else if (size(@temp) == 2)
33 |    {
34 |       println("Bigram b, a->b " . @temp . " = " . Pbigram1(@temp[0], @temp[1]) );
35 |       println("Bigram b, b<-a " . @temp . " = " . Pbigram2(@temp[0], @temp[1]) );
36 |    }
37 |    else if (size(@temp) == 1)
38 |    {
39 |       println("Unigram " . @temp . " = " . Pword(@temp[0]));
40 |       println("Count " . @temp . " = " . count(@temp[0]));
41 |    }
42 | 
43 |    print("> ");
44 | }
45 | 


--------------------------------------------------------------------------------
/service/src/view/quality.slp:
--------------------------------------------------------------------------------
 1 | <scores>
 2 | <?sleep
 3 |     local('$metric $data %metrics $type $name');
 4 | 
 5 |     %metrics = 
 6 |     [{
 7 |         local('%value');
 8 |         %value = ohash();
 9 | 
10 |         %value['grammar.errors']         = $1['Auxiliary Verb Agreement'] + $1['Determiner Agreement'] + $1['Double Negative'] + $1['Subject Verb Agreement'] + $1['Wrong article'] + $1['Wrong Auxiliary Verb'];
11 |         %value['grammar.revise']         = $1['Revise'];
12 |         %value['grammar.repeated words'] = $1['Repeated Word'];
13 |         %value['spell.hyphenate']        = $1['Hyphen Required'];
14 |         %value['spell.misused words']    = $1['Did you mean...'] + $1['Did you mean?'];
15 |         %value['spell.raw']              = $1['Spelling'];
16 |         %value['spell.estimate']         = $1['miss'];
17 |         %value['stats.sentences']        = $1['sentences'];
18 |         %value['stats.words']            = $1['words'];
19 |         %value['stats.bias language']    = $1['Bias Language'];
20 |         %value['style.cliches']          = $1['Cliches'];
21 |         %value['style.complex phrases']  = $1['Complex Expression'];
22 |         %value['style.hidden verbs']     = $1['Hidden Verbs'];
23 |         %value['style.passive voice']    = $1['Passive voice'];
24 | 
25 |         return %value;
26 |      }: $1];
27 | 
28 |     foreach $metric => $data (%metrics)
29 |     {
30 |        if ($data > 0.0)
31 |        {
32 |           ($type, $name) = split('\.', $metric);
33 |           display("service/src/view/metric.slp", $type, $name, $data);
34 |        }
35 |     }
36 | ?></scores>
37 | 


--------------------------------------------------------------------------------
/data/rules/hyphens.txt:
--------------------------------------------------------------------------------
 1 | # seeded from http://en.wikipedia.org/wiki/Wikipedia:Lists_of_common_misspellings/Grammar_and_Misc
 2 | 
 3 | day to day::word=day-to-day
 4 | out and out::word=out-and-out
 5 | out of door::word=out-of-door
 6 | out of doors::word=out-of-doors
 7 | out of the way::word=out-of-the-way
 8 | out of band::word=out-of-band
 9 | out of bounds::word=out-of-bounds
10 | out of town::word=out-of-town
11 | out of state::word=out-of-state
12 | out of wedlock::word=out-of-wedlock
13 | out of pocket::word=out-of-pocket
14 | out of order::word=out-of-order
15 | out of place::word=out-of-place
16 | part time::word=part-time
17 | full time::word=full-time
18 | 1|2|3|4|5|6|7|8|9|10|11|12|13|14|15|16|17|18|19|20|21|22|23|24|25|26|27|28|29|30|31|32|33|34|35|36|37|38|39|40|41|42|43|44|45|46|47|48|49|50|51|52|53|54|55|56|57|58|59|60|61|62|63|64|65|66|67|68|69|70|71|72|73|74|75|76|77|78|79|80|81|82|83|84|85|86|87|88|89|90|91|92|93|94|95|96|97|98|99 year old::word=\0-year-old
19 | 100|200|250|500|100 year old::word=\0-year-old
20 | right|left handed::word=\0-handed
21 | case sensitive::word=case-sensitive
22 | case insensitive::word=case-insensitive
23 | award winning::word=award-winning
24 | out of body::word=out-of-body
25 | runner up::word=runner-up
26 | commander in chief::word=commander-in-chief
27 | win win::word=win-win
28 | win lose::word=win-lose
29 | lose lose::word=lose-lose
30 | built in::word=built-in
31 | ebook::word=e-book
32 | ereader::word=e-reader
33 | click throughs::word=click-throughs
34 | click through::word=click-through
35 | high five::word=high-five
36 | high fived::word=high-fived
37 | flu like::word=flu-like
38 | 


--------------------------------------------------------------------------------
/utils/common/utils.sl:
--------------------------------------------------------------------------------
 1 | sub toTaggerForm
 2 | {
 3 |    return map({ return split('/', $1); }, $1);
 4 | }
 5 | 
 6 | sub sentences
 7 | {
 8 |    local('$handle $sentence $candidates $line');
 9 | 
10 |    $handle = openf("data/tests/ $+ $1");
11 | 
12 |    while $line (readln($handle))
13 |    {
14 |       ($sentence, $candidates) = split('\\|', $line);
15 |       $candidates = split('[,;] ', $candidates);
16 |       yield @($sentence, $candidates[0], sublist($candidates, 1));
17 |    }
18 | 
19 |    closef($handle);
20 | }
21 | 
22 | sub words
23 | {
24 |    local('$handle $bad $good');
25 |    $handle = openf("data/tests/ $+ $1");
26 |    while $bad (readln($handle))
27 |    {
28 |       $good = readln($handle);
29 |       yield @($bad, $good);
30 |    }
31 |    closef($handle);
32 | }
33 | 
34 | sub loopHomophones
35 | {
36 |    local('$entry $sentence $correct $wrongs $previous $next $wrong');
37 | 
38 |    while $entry (sentences($1))
39 |    {
40 |       ($sentence, $correct, $wrongs) = $entry;
41 |       ($previous, $next) = split('\\*', $sentence);
42 |       $previous = split('\\s+', [$previous trim])[-1];
43 |       $previous = iff($previous eq "", '0BEGIN.0', $previous);
44 |       $next     = split('\\s+', [$next trim])[0];
45 |       $next     = iff($next eq "" || $next ismatch '[\\.!?]', '0END.0', $next);
46 |       $next     = iff(charAt($next, -1) ismatch '[\\.!?]', substr($next, 0, -1), $next);
47 | 
48 |       push($wrongs, $correct);
49 | 
50 |       foreach $wrong ($wrongs)
51 |       {
52 |          [$2 process: $correct, $wrong, $wrongs, $previous, $next];
53 |       }
54 |    }
55 | 
56 |    [$2 finish];
57 | }
58 | 


--------------------------------------------------------------------------------
/models/get_model_binaries.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | svn export https://openatd.svn.wordpress.org/atd-server/models/cnetwork.bin ./models/cnetwork.bin
 3 | svn export https://openatd.svn.wordpress.org/atd-server/models/cnetwork2.bin ./models/cnetwork2.bin
 4 | svn export https://openatd.svn.wordpress.org/atd-server/models/dictionary.txt ./models/dictionary.txt
 5 | svn export https://openatd.svn.wordpress.org/atd-server/models/edits.bin ./models/edits.bin
 6 | svn export https://openatd.svn.wordpress.org/atd-server/models/endings.bin ./models/endings.bin
 7 | svn export https://openatd.svn.wordpress.org/atd-server/models/hnetwork.bin ./models/hnetwork.bin
 8 | svn export https://openatd.svn.wordpress.org/atd-server/models/hnetwork2.bin ./models/hnetwork2.bin
 9 | svn export https://openatd.svn.wordpress.org/atd-server/models/hnetwork4.bin ./models/hnetwork4.bin
10 | svn export https://openatd.svn.wordpress.org/atd-server/models/lexicon.bin ./models/lexicon.bin
11 | svn export https://openatd.svn.wordpress.org/atd-server/models/model.bin ./models/model.bin
12 | svn export https://openatd.svn.wordpress.org/atd-server/models/model.zip ./models/model.zip
13 | svn export https://openatd.svn.wordpress.org/atd-server/models/network3f.bin ./models/network3f.bin
14 | svn export https://openatd.svn.wordpress.org/atd-server/models/network3p.bin ./models/network3p.bin
15 | svn export https://openatd.svn.wordpress.org/atd-server/models/not_misspelled.txt ./models/not_misspelled.txt
16 | svn export https://openatd.svn.wordpress.org/atd-server/models/stringpool.bin ./models/stringpool.bin
17 | svn export https://openatd.svn.wordpress.org/atd-server/models/trigrams.bin ./models/trigrams.bin
18 | ./bin/buildrules.sh
19 | 


--------------------------------------------------------------------------------
/data/rules/grammar/aux_noparticiple:
--------------------------------------------------------------------------------
 1 | has &irregular_verb::word=\0 \1:participle::pivots=\1,\1:participle
 2 | hasn't &irregular_verb::word=\0 \1:participle::pivots=\1,\1:participle
 3 | has not &irregular_verb::word=\0 \1 \2:participle::pivots=\2,\2:participle
 4 | 
 5 | have &irregular_verb::word=\0 \1:participle::pivots=\1,\1:participle
 6 | haven't &irregular_verb::word=\0 \1:participle::pivots=\1,\1:participle
 7 | have not &irregular_verb::word=\0 \1 \2:participle::pivots=\2,\2:participle
 8 | 
 9 | had &irregular_verb::word=\0 \1:participle::pivots=\1,\1:participle
10 | hadn't &irregular_verb::word=\0 \1:participle::pivots=\1,\1:participle
11 | had not &irregular_verb::word=\0 \1 \2:participle::pivots=\2,\2:participle
12 | 
13 | were &irregular_verb::word=\0 \1:participle::pivots=\1,\1:participle
14 | weren't &irregular_verb::word=\0 \1:participle::pivots=\1,\1:participle
15 | were not &irregular_verb::word=\0 \1 \2:participle::pivots=\2,\2:participle
16 | 
17 | could've &irregular_verb::word=\0 \1:participle::pivots=\1,\1:participle
18 | would've &irregular_verb::word=\0 \1:participle::pivots=\1,\1:participle
19 | should've &irregular_verb::word=\0 \1:participle::pivots=\1,\1:participle
20 | you've &irregular_verb::word=\0 \1:participle::pivots=\1,\1:participle
21 | You've &irregular_verb::word=\0 \1:participle::pivots=\1,\1:participle
22 | I've &irregular_verb::word=\0 \1:participle::pivots=\1,\1:participle
23 | we've &irregular_verb::word=\0 \1:participle::pivots=\1,\1:participle
24 | We've &irregular_verb::word=\0 \1:participle::pivots=\1,\1:participle
25 | they've &irregular_verb::word=\0 \1:participle::pivots=\1,\1:participle
26 | They've &irregular_verb::word=\0 \1:participle::pivots=\1,\1:participle
27 | 


--------------------------------------------------------------------------------
/data/rules/foreigndb.txt:
--------------------------------------------------------------------------------
 1 | a fortiori  	with even stronger reason
 2 | a posteriori 	from effects to causes; reasoning based on past experience
 3 | a priori 	from causes to effects; conclusions drawn from assumptions; from what comes before; deductive reasoning
 4 | ab initio 	from the beginning
 5 | ad hoc		improvised
 6 | ad infinitum 	never ending
 7 | ad lib		at will, off the top of the head
 8 | bona fide	in good faith
 9 | caveat		caution, warning
10 | curricula vitae 	the courses of one's life, resumes
11 | curriculum vitae 	the course of one's life, resume
12 | de facto 	from the fact
13 | de jure 	from the law
14 | ex officio 	out of one's duty, out of one's office
15 | ex post facto 	after the fact, retrospectively
16 | hors d'oeuvre	appetizer
17 | hors d'oeuvres	appetizers
18 | hors de combat	out of the battle, out of service
19 | in situ 	in its original place
20 | in toto 	in its entirety
21 | infra 	below
22 | inter alia 	among other things
23 | ipso facto 	by the fact itself
24 | locus classicus 	standard or most authoritative source
25 | non sequitur 	it does not follow
26 | passim		here and there, throughout, in several places
27 | per capita 	per head
28 | prima facie 	at first sight, on the face of it
29 | pro bono 	for the public good, at no cost
30 | pro rata 	in proportion
31 | quid pro quo 	something in return
32 | raison d'etre	reason for, purpose
33 | scilicet	that is to say, namely
34 | scire licet 	that is to say, namely
35 | sic 		thus used, thus spelt
36 | sine die 	without a day, with no time fixed
37 | sine qua non 	without which not, essential precondition
38 | status quo 	things as they are
39 | stet 	as it was originally
40 | supra 	above
41 | vide 	see
42 | vide supre 	see above
43 | viva		oral examination
44 | voce		oral examination
45 | 


--------------------------------------------------------------------------------
/utils/rules/makespecial.sl:
--------------------------------------------------------------------------------
 1 | #
 2 | # this script extracts relevant irregular verbs from the internal data to allow us to create rules
 3 | #
 4 | 
 5 | 
 6 | include("lib/engine.sl");
 7 | include("utils/rules/rules.sl");
 8 | 
 9 | sub checkSentenceSpelling
10 | {
11 | }
12 | 
13 | sub initAll
14 | {
15 |    global('$__SCRIPT__ $model $rules $dictionary $network $dsize %edits $hnetwork $account $usage $endings $lexdb $trigrams $verbs');
16 |    $model      = get_language_model();
17 |    $dictionary = dictionary();
18 |    $dsize      = size($dictionary);
19 |    $hnetwork   = get_network("hnetwork.bin");
20 |    $verbs      = loadVerbData();
21 |    initTaggerModels();
22 | }
23 | 
24 | sub main
25 | {
26 |    initAll();
27 | 
28 |    local('$key $value $base $past $participle @results @past @base');
29 | 
30 |    foreach $key => $value ($verbs['base'])
31 |    {
32 |       ($base, $past, $participle) = values($value, @("base", "past", "participle"));
33 |       if ($past ne $participle)
34 |       {
35 |          push(@past, $past);
36 |          push(@results, $past);
37 |       }
38 | 
39 |       if ($base ne $participle && $base ne $past)
40 |       {
41 |          push(@base, $base);
42 |          push(@results, $base);
43 |       }
44 |    }
45 | 
46 |    @results = filter({ return iff(count($1) > 2, $1, println("Killed $[20]1 " .  count($1))  ); }, @results);
47 |    @past = filter({ return iff(count($1) > 2, $1); }, @past);
48 |    @base = filter({ return iff(count($1) > 2, $1); }, @base);
49 | 
50 |    println("Total words: " . size(@results));
51 |    println("==== RESULTS ====");
52 |    println(join("|", sorta(@results)));
53 |    println("==== PAST ====");
54 |    println(join("|", sorta(@past)));
55 |    println("==== BASE ====");
56 |    println(join("|", sorta(@base)));
57 | }
58 | 
59 | invoke(&main, @ARGV);
60 | 


--------------------------------------------------------------------------------
/data/rules/grammar/separate:
--------------------------------------------------------------------------------
 1 | #
 2 | # words that should be separated (and in what context)
 3 | #
 4 | 
 5 | # everyone of -> every one of
 6 | 
 7 | everyone of::word=every one of::pivots=\1,one of::rule=Separate everyone
 8 | 
 9 | # flashpoint -> flash point
10 | 
11 | flashpoint::word=flash point
12 | 
13 | # a while vs. awhile (split)
14 | 
15 | after|for|in awhile::word=\0 a while::pivots=awhile,a while
16 | 
17 | can backup::word=can back up
18 | can blackout::word=can black out
19 | can setup::word=can set up
20 | can workout::word=can work out
21 | for along time::word=for a long time
22 | for awhile::word=for a while
23 | for quite awhile::word=for quite a while
24 | got setup::word=got set up
25 | got shutdown::word=got shut down
26 | got shutout::word=got shut out
27 | had comeback::word=had come back
28 | had setup::word=had set up
29 | has setup::word=has set up
30 | have setup::word=have set up
31 | help setup::word=help set up
32 | in along time::word=in a long time
33 | in anyway::word=in any way
34 | in awhile::word=in a while
35 | in quite awhile::word=in quite a while
36 | incase of::word=in case of
37 | is setup::word=is set up
38 | Portland Trailblazers::word=Portland Trail Blazers
39 | take awhile::word=take a while
40 | to backout::word=to back out
41 | to backup::word=to back up
42 | to blackout::word=to black out
43 | to comeback::word=to come back
44 | to setup::word=to set up
45 | to shutdown::word=to shut down
46 | after along time::word=after a long time
47 | after awhile::word=after a while
48 | after quite awhile::word=after quite a while
49 | allot of::word=a lot of
50 | along time::word=a long time
51 | downpayment::word=down payment
52 | smartphone::word=smart phone
53 | ala mode::word=à la mode::filter=none
54 | afterall::word=after all
55 | to bailout::word=\0 bail out::pivots=bailout,bail out
56 | 
57 | 


--------------------------------------------------------------------------------
/utils/tagger/makesentences.sl:
--------------------------------------------------------------------------------
 1 | debug(7 | 34);
 2 | 
 3 | sub process
 4 | {
 5 |    local('@words $entry $previous $current $next');
 6 | 
 7 |    $1 = [$1 trim];
 8 |    if ($1 !ismatch '[A-Z][A-Za-z\'\,0-9 ]*?[\.\?\!]')
 9 |    {
10 |       return;
11 |    }
12 | 
13 |    @words = splitIntoWords($1);
14 | 
15 |    if (size(@words) < 3)
16 |    {
17 |       return;
18 |    }
19 | 
20 | #   foreach $entry (@words)
21 | #   {
22 | #      if (%dictionary[$entry] is $null)
23 | #      {
24 | #         return;
25 | #      }
26 | #   }
27 | 
28 | #    println($output, lc(join(" ", @words)) );
29 |     println($output, join(" ", @words) );
30 | }
31 | 
32 | sub processFile
33 | {
34 |    local('$handle $key $data $text @paragraphs');
35 | 
36 |    # read in our corpus.
37 |    $handle = openf($1);
38 |    $text   = replace(readb($handle, -1), '<[^>]*?>', '');
39 |    closef($handle);
40 | 
41 |    # start processing it?!?
42 |    @paragraphs = splitByParagraph($text);
43 |    map({ map(&process, $1); }, @paragraphs);
44 | }
45 | 
46 | sub main
47 | {
48 |    # setup our file that we're going to dump the output to.
49 |    global('$output');
50 |    $output = openf("> $+ $2");
51 |    
52 |    # ok go through all the junk parsing through the files.
53 | 
54 |    include("lib/nlp.sl");
55 |    include("lib/dictionary.sl");
56 | 
57 |    global('%dictionary');
58 |    %dictionary = dictionary();
59 |    %dictionary["0BEGIN.0"] = 1;
60 |    %dictionary["0END.0"] = 1;
61 | 
62 |    # collect list of files.
63 |    [{
64 |       if (-isDir $1)
65 |       {
66 |          map($this, ls($1));
67 |       }
68 |       else if ("*Image*.html" !iswm $1 && "*User*.html" !iswm $1)
69 |       {
70 |          processFile($1);
71 |       }
72 |     }: $1];
73 | 
74 | 
75 |    closef($output);
76 |    println("Done!");
77 | }
78 | 
79 | invoke(&main, @ARGV);
80 | 


--------------------------------------------------------------------------------
/data/rules/grammar/your:
--------------------------------------------------------------------------------
 1 | your .*ing/VBG::word=you're \1::pivots=your,you're
 2 | if your .*/DT::word=if you're \2::pivots=your,you're
 3 | your the|a|an::word=you're \1::filter=none
 4 | Your .*ing/VBG::word=You're \1::pivots=Your,You're
 5 | If your .*/DT::word=If you're \2::pivots=your,you're
 6 | Your the|a|an::word=You're \1::filter=none
 7 | 
 8 | about|around|at|by|for|from|in|near|of|on|over|through|to|towards|under|with|without you're::word=\0 your::pivots=you're,your
 9 | 
10 | you're [a-z].*/NN|NNS are|is::word=your \1 \2::pivots=you're,your
11 | to .*/VB you're .*/NN::word=\0 \1 your \3::pivots=you're,your
12 | Your right::word=You're right::pivots=Your,You're::options=your,you're
13 | 
14 | you're .* could|would|should|did|may|will|has|have|can|couldn't|wouldn't|shouldn't|didn't|won't|hasn't|haven't|can't::word=your \1 \2::pivots=you're,your
15 | 
16 | to you're::word=to your::pivots=you're, your
17 | 
18 | your welcome::word=you're welcome::pivots=your,you're
19 | Your welcome::word=You're welcome::pivots=Your,You're::options=your,you're
20 | Your welcome 0END.0::word=You're welcome::filter=none
21 | 
22 | you're are::word=you are::filter=none
23 | your are::word=you're::filter=none           
24 | your are .*ing::word=you are \2::filter=none
25 | 
26 | Your not::word=You're not::pivots=Your,You're
27 | your not::word=you're not::pivots=your,you're
28 | your in|at::word=you're \1::filter=none
29 | Your in|at::word=You're \1::filter=none
30 | 
31 | has|is you're::word=\0 your::pivots=you're,your::options=you're,your
32 | your so|as|gonna::word=you're \1::pivots=your,you're::options=your,you're
33 | Your so|as|gonna::word=You're \1::pivots=Your,You're::options=Your,You're
34 | 
35 | as you're .*/NN::word=\0 your \2::pivots=you're,your::options=you're,your
36 | As you're .*/NN::word=\0 your \2::pivots=you're,your::options=you're,your
37 | 


--------------------------------------------------------------------------------
/utils/common/hotest.sl:
--------------------------------------------------------------------------------
 1 | sub hotest::init
 2 | {
 3 |    this('$score1 $score2 $score $criterf $network $criteria');
 4 | 
 5 |    $criterf  = criteria($2);
 6 |    $network  = get_network($1);
 7 |    $criteria = $2;
 8 | 
 9 |    $score1  = newObject("score", "Correct   $4");
10 |    $score2  = newObject("score", "Wrong     $4");
11 |    $score   = newObject("score", "Composite $4");
12 | }
13 | 
14 | sub hotest::process
15 | {
16 |    local('$correct $wrong $wrongs $pre2 $pre1 $next $next2 @temp');
17 |    ($correct, $wrong, $wrongs, $pre2, $pre1, $next, $next2) = @_;
18 | 
19 |    if (size($criteria) == 0)
20 |    {
21 |       @temp[0] = rand($wrongs);
22 |    }
23 |    else
24 |    {
25 |       @temp = checkAnyHomophone($network, $wrong, copy($wrongs), $pre1[0], $next[0], @($pre2[1], $pre1[1]), $pre2[0], $next2[0], $criteriaf => $criterf);
26 |    #   println(join(', ', @($network, $wrong, copy($wrongs), $pre1[0], $next[0], @($pre2[1], $pre1[1]))) . ' = ' . @temp);
27 |    }
28 | 
29 |    if (size(@temp) == 0)
30 |    {
31 |       @temp[0] = $wrong;
32 |    }
33 | 
34 |    if (@temp[0] eq $correct)
35 |    {
36 |       [iff($wrong eq $correct, $score1, $score2) correct];
37 |       [$score correct];
38 |   #          warn("Correct!");
39 |    }
40 |    else
41 |    {
42 |        if ($wrong eq $correct)
43 |        {
44 |           [$score1 falsePositive];
45 |           [$score falsePositive];
46 |  #              warn("FP!");
47 |         }
48 |         else
49 |         {
50 |            [$score2 falseNegative];
51 |            [$score falseNegative];
52 | #           warn("FN!");
53 |         }
54 |    }
55 | 
56 |    [$score record];
57 |    [iff($wrong eq $correct, $score1, $score2) record];
58 | }
59 | 
60 | sub hotest::finish
61 | {
62 |    [$score1 print];
63 |    [$score2 print];
64 |    [$score print];
65 |    println("-" x 30);
66 | }
67 | 
68 | 


--------------------------------------------------------------------------------
/utils/bigrams/qscore.sl:
--------------------------------------------------------------------------------
 1 | #
 2 | # generate statistics about a datset to evaluate writing quality
 3 | #
 4 | debug(7 | 34);
 5 | 
 6 | include("lib/quality.sl");
 7 | include("lib/engine.sl");
 8 | 
 9 | global('$__SCRIPT__ $model $rules $dictionary $network $dsize %edits $hnetwork $account $usage $endings $lexdb $trigrams $verbs $locks $trie %common');
10 | 
11 | $model      = get_language_model();
12 | $dictionary = dictionary();
13 | $rules      = get_rules();
14 | $network    = get_network("cnetwork.bin");
15 | $hnetwork   = get_network("hnetwork.bin");
16 | %edits      = initEdits();
17 | $dsize      = size($dictionary);
18 | $verbs      = loadVerbData();
19 | %common     = loadCommonWords();
20 | initTaggerModels();
21 | 
22 | sub report
23 | {
24 |    local('@keys $metric $words $sentences $a $b $key');
25 | 
26 |    @keys = sort({ return lc($1) cmp lc($2); }, keys($2));
27 | 
28 |    $words = double($2['words']);
29 |    $sentences = double($2['sentences']);
30 |   
31 |    foreach $key (@keys)
32 |    {
33 |       $metric = double($2[$key]);
34 |       $a      = ($metric / $words) * 100.0;
35 |       $b      = ($metric / $sentences) * 100.0;
36 |       println("$[20]1 : $[30]key : $[10]metric $[25]a $[25]b");
37 |    }
38 | }
39 | 
40 | sub checkDocument
41 | {
42 |    local('$data %stats $start');
43 |    
44 |    $start = ticks();
45 | 
46 |    # strip HTML please
47 |    $data = strrep($2, '&nbsp;', ' ', '<br>', "\n", '<p>', "\n", '<span>', "\n", '&quote;', '"', '&amp;', '&');
48 |    $data = replace($data, '(<[^>]*?>)', '');
49 |     
50 |    %stats = processDocumentQuality($data);
51 |    report(getFileName($1), %stats);
52 | 
53 |    println("Time: " . (ticks() - $start) . "ms");
54 | }
55 | 
56 | sub main
57 | {
58 |    local('$handle $data');
59 |    $handle = openf($1);
60 |    $data = readb($handle, -1);
61 |    closef($handle);
62 | 
63 |    checkDocument($1, $data);   
64 | }
65 | 
66 | invoke(&main, @ARGV)
67 | 


--------------------------------------------------------------------------------
/data/rules/grammar/too:
--------------------------------------------------------------------------------
 1 | too niche::filter=kill
 2 | too .*/NN|VB .*/VB.*::word=too \1 \2::                  # ruling out a false positive
 3 | too .*/NN|VB::word=to \1::pivots=too,to
 4 | too do::word=to \1::pivots=too,to
 5 | too the::word=to \1::pivots=too,to
 6 | to much|few of::filter=kill
 7 | to much|few::word=too \1::pivots=to,too
 8 | two many::words=to many,too many::pivots=two,to,too
 9 | is to|two late|easy::word=\0 too \2::pivots=\1,too
10 | was to|two late|easy::word=\0 too \2::pivots=\1,too
11 | be to|two late|easy::word=\0 too \2::pivots=\1,too
12 | were to|two late|easy::word=\0 too \2::pivots=\1,too
13 | are to|two late|easy::word=\0 too \2::pivots=\1,too
14 | been to|two late|easy::word=\0 too \2::pivots=\1,too
15 | comes to|two soon::word=\0 too \2::pivots=\1,too
16 | came to|two soon::word=\0 too \2::pivots=\1,too
17 | much to|two soon|late|early|easy::word=\0 too \2::pivots=\1,too
18 | is to|two soon::word=\0 too \2::pivots=\1,too
19 | was to|two soon::word=\0 too \2::pivots=\1,too
20 | were to|two soon::word=\0 too \2::pivots=\1,too
21 | are to|two soon::word=\0 too \2::pivots=\1,too
22 | been to|two soon::word=\0 too \2::pivots=\1,too
23 | is to .*/JJ.* 0END.0::word=\0 too \2::filter=none                
24 | was to .*/JJ.* 0END.0::word=\0 too \2::filter=none               
25 | be to .*/JJ.* 0END.0::word=\0 too \2::filter=none                 
26 | were to .*/JJ.* 0END.0::word=\0 too \2::filter=none               
27 | are to .*/JJ.* 0END.0::word=\0 too \2::filter=none                
28 | been to .*/JJ.* 0END.0::word=\0 too \2::filter=none
29 | not to .*/JJ 0END.0::word=too \1::filter=none
30 | is to .*/JJ.* to .*/VB::word=\0 too \2 \3 \4::filter=none
31 | was to .*/JJ.* to .*/VB::word=\0 too \2 \3 \4::filter=none
32 | be to .*/JJ.* to .*/VB::word=\0 too \2 \3 \4::filter=none
33 | were to .*/JJ.* to .*/VB::word=\0 too \2 \3 \4::filter=none
34 | are to .*/JJ.* to .*/VB::word=\0 too \2 \3 \4::filter=none
35 | been to .*/JJ.* to .*/VB::word=\0 too \2 \3 \4::filter=none
36 | 


--------------------------------------------------------------------------------
/utils/common/homo.sl:
--------------------------------------------------------------------------------
 1 | #
 2 | # test out spelling with associated context information
 3 | #
 4 | 
 5 | sub suggestTest
 6 | {
 7 |    local('$suspect $dict $previous $next @suggestions $f');
 8 |    ($suspect, $dict, $previous, $next) = @_;
 9 | 
10 |    @suggestions = %edits[$suspect];
11 | 
12 |    if ($correct in @suggestions)
13 |    {
14 |       foreach $f (@functions)
15 |       {
16 |          [$f : $suspect, $correct, copy(@suggestions), $previous, $next];
17 |       }
18 |     #  warn("Done for $previous $suspect $next -> $correct");
19 |    }
20 | 
21 |    return @();
22 | }
23 | 
24 | sub testCorrectionsContext
25 | {
26 |    local('$score $entry $sentence $correct $wrongs @results @words $rule $wrong $previous $next $func');
27 | 
28 |    while $entry (sentences($1))
29 |    {
30 |       ($sentence, $correct, $wrongs) = $entry;
31 |       ($previous, $next) = split(' \\* ', $sentence);
32 |       $func = lambda(&suggestTest, \$score, \$correct, @functions => sublist(@_, 1));
33 | 
34 |       #
35 |       # check for a false negative
36 |       #
37 |       foreach $wrong ($wrongs)
38 |       {
39 |          [$func: $wrong, $dictionary, $previous, $next]
40 |       } 
41 |    }
42 | }
43 | 
44 | sub loopHomophonesPOS
45 | {
46 |    local('$entry $sentence $correct $wrongs $pre2 $pre1 $next $object $wrong $next2'); 
47 | 
48 |    while $entry (sentences($1))
49 |    {
50 |       ($sentence, $correct, $wrongs) = $entry;
51 |       ($pre2, $pre1, $null, $next, $next2) = toTaggerForm(split(' ', $sentence));
52 | 
53 |       if ($pre2[1] eq "UNK") { $pre2[1] = ""; }
54 |       if ($pre1[1] eq "UNK") { $pre1[1] = ""; }
55 | 
56 |       $correct = split('/', $correct)[0];
57 | 
58 |       push($wrongs, $correct);
59 | 
60 |       foreach $wrong ($wrongs)
61 |       {
62 |          [$2 process: $correct, $wrong, $wrongs, $pre2, $pre1, $next, $next2];
63 |       }
64 | 
65 | #      [$2 process: $correct, $correct, $wrongs, $pre2, $pre1, $next];
66 |    }
67 | 
68 |    [$2 finish];
69 | }
70 | 


--------------------------------------------------------------------------------
/lib/quality.sl:
--------------------------------------------------------------------------------
 1 | #
 2 | # calculate quality score for a dataset
 3 | #
 4 | 
 5 | sub loadCommonWords
 6 | {
 7 |    this('$common');
 8 |    if ($common is $null)
 9 |    {
10 |       $common = %();  
11 |       local('$handle $bad $good $foo');
12 | 
13 |       #  function to load file data and add it to our hash
14 |       $foo = lambda(
15 |       {
16 |          local('$handle $bad');
17 |          $handle = openf($1);
18 |          while $bad (readln($handle))
19 |          {
20 |             if ($bad !in $dictionary)
21 |             {
22 |                $common[$bad] = 1;
23 |             }
24 |          }
25 |          closef($handle);
26 |       }, \$common);
27 | 
28 |       [$foo : 'data/tests/tests1.txt'];
29 |       [$foo : 'data/tests/tests2.txt'];
30 |    }
31 | 
32 |    return $common;
33 | }
34 | 
35 | sub generateStatistics
36 | {
37 |    local('$error $rule');
38 | 
39 |    foreach $error ($1)
40 |    {
41 |       $rule = $error[0];
42 |       $2[$rule['rule']] += 1;
43 |    }
44 | }
45 | 
46 | sub processDocumentQuality
47 | {
48 |    local('@paragraphs $paragraph $sentence @results @words $count $word %common $suggest %stats');
49 | 
50 |    %common     = loadCommonWords();
51 |    @paragraphs = splitByParagraph($1);
52 | 
53 |    $suggest    = function('&suggest');
54 |    setf('&suggest', { return @(); });
55 | 
56 |    foreach $count => $paragraph (@paragraphs)
57 |    {
58 |       foreach $sentence ($paragraph)
59 |       {
60 |          if ($sentence eq "")
61 |          {
62 |             continue;
63 |          }
64 | 
65 |          @words = splitIntoWords($sentence);
66 |          %stats['words'] += size(@words);
67 |          %stats['sentences'] += 1;
68 | 
69 |          foreach $word (@words) { if ($word in %common) { %stats['miss'] += 1; } }
70 | 
71 |          processSentence(\$sentence, \@results);
72 |       }
73 | 
74 |       generateStatistics(@results, %stats);
75 |       @results = @();
76 |    }
77 | 
78 |    setf('&suggest', $suggest);
79 |    return %stats;
80 | }
81 | 
82 | 


--------------------------------------------------------------------------------
/utils/spelldata/maker.sl:
--------------------------------------------------------------------------------
 1 | # 
 2 | # This is a script to generate an AtD test corpus from a rule file (assumes you used torules.sl or something similar to generate the file)
 3 | #
 4 | # java -jar utils/rules/maker.sl <rule file> <sentences file>
 5 | #
 6 | # <rule file> format:
 7 | #
 8 | # correct text|word=wrong text
 9 | #
10 | 
11 | include("lib/engine.sl");
12 | include("utils/rules/rules.sl");
13 | 
14 | sub checkSentenceSpelling
15 | {
16 | }
17 | 
18 | sub initAll
19 | {
20 |    global('$__SCRIPT__ $model $rules $dictionary $network $dsize %edits $hnetwork $account $usage $endings $lexdb $trigrams $verbs');
21 |    $model      = get_language_model();
22 |    $dictionary = dictionary();
23 |    $dsize      = size($dictionary);
24 |    $hnetwork   = get_network("hnetwork.bin");
25 |    $verbs      = loadVerbData();
26 |    initTaggerModels();
27 | }
28 | 
29 | sub main
30 | {
31 |    local('$handle $sentence @results @past');
32 | 
33 |    initAll();
34 | 
35 |    if (function("& $+ $1") !is $null)
36 |    {
37 |       $rules = machine();
38 |       invoke(function("& $+ $1"));
39 |    }
40 |    else
41 |    {
42 |       $rules = loadRules(machine(), $1, %());
43 |    }
44 | 
45 |    $handle = openf($2);
46 |    while $sentence (readln($handle))
47 |    {
48 |       @results = @();
49 |       processSentence(\$sentence, \@results);
50 | 
51 |       @past = copy(@results);
52 | 
53 |       if (size(@past) == 1)
54 |       {
55 |           foreach $index => $r (@past)
56 |           {
57 |              local('$rule $text $path $context @suggestions');
58 |              ($rule, $text, $path, $context, @suggestions) = $r;
59 |             
60 |              %count[$rule['word']] += 1;
61 | 
62 |              if (%count[$rule['word']] < 5)
63 |              {
64 |                 println(strrep($sentence, " $text ", ' * ') . '|' . $rule['word'] . ', ' . iff($rule['options'] ne "", $rule['options'], $text) . '|' . $text);
65 |              }
66 |           }
67 |       }
68 |    }
69 | } 
70 | 
71 | invoke(&main, @ARGV);
72 | 


--------------------------------------------------------------------------------
/data/rules/grammar/apostrophes:
--------------------------------------------------------------------------------
  1 | #
  2 | # missing apostrophes
  3 | #
  4 | 
  5 | # Verbs with not contracted:
  6 | 
  7 | arent::word=aren't
  8 | didnt::word=didn't
  9 | dont::word=don't
 10 | isnt::word=isn't
 11 | #cant::word=can't
 12 | werent::word=weren't
 13 | wouldnt::word=wouldn't
 14 | doesnt::word=doesn't
 15 | hasnt::word=hasn't
 16 | couldnt::word=couldn't
 17 | hadnt::word=hadn't
 18 | 
 19 | Arent::word=Aren't
 20 | Didnt::word=Didn't
 21 | Dont::word=Don't
 22 | Isnt::word=Isn't
 23 | #cant::word=Can't
 24 | Werent::word=Weren't
 25 | Wouldnt::word=Wouldn't
 26 | Doesnt::word=Doesn't
 27 | Hasnt::word=Hasn't
 28 | Couldnt::word=Couldn't
 29 | Hadnt::word=Hadn't
 30 | 
 31 | # Pronouns with will
 32 | 
 33 | Ill::word=I'll
 34 | 
 35 | youll::word=you'll
 36 | #hell::word=he'll
 37 | #shell::word=she'll
 38 | theyll::word=they'll
 39 | 
 40 | Youll::word=You'll
 41 | #hell::word=he'll
 42 | #shell::word=she'll
 43 | Theyll::word=Yhey'll
 44 | 
 45 | # pronouns with the verb to be
 46 | 
 47 | Im::word=I'm
 48 | 
 49 | youre::word=you're
 50 | whos::word=who's
 51 | hes::word=he's
 52 | shes::word=she's
 53 | #its::word=it's
 54 | #were::word=we're
 55 | theyre::word=they're
 56 | thats::word=that's::filter=none
 57 | 
 58 | Youre::word=You're
 59 | Whos::word=Who's
 60 | Hes::word=He's
 61 | Shes::word=She's
 62 | #its::word=it's
 63 | #were::word=we're
 64 | Theyre::word=They're
 65 | Thats::word=That's
 66 | 
 67 | # to have
 68 | 
 69 | Ive::word=I've
 70 | 
 71 | youve::word=you've
 72 | weve::word=we've
 73 | theyve::word=they've
 74 | 
 75 | Youve::word=You've
 76 | Weve::word=We've
 77 | Theyve::word=They've
 78 | 
 79 | # would or had
 80 | 
 81 | #Id::word=I'd
 82 | 
 83 | hed::word=he'd
 84 | #shed::word=she'd
 85 | youd::word=you'd
 86 | #wed::word=we'd
 87 | theyd::word=they'd
 88 | 
 89 | Hed::word=He'd
 90 | #shed::word=she'd
 91 | Youd::word=You'd
 92 | #wed::word=we'd
 93 | Theyd::word=They'd
 94 | 
 95 | # 
 96 | 
 97 | Theres::word=There's
 98 | theres::word=there's
 99 | 
100 | oclock::word=o'clock
101 | 
102 | heres::word=here's
103 | 


--------------------------------------------------------------------------------
/data/rules/grammar/their:
--------------------------------------------------------------------------------
 1 | their is|are|a|an::word=there \1::pivots=their,there
 2 | there to::filter=kill
 3 | there .*/JJ.* .*/NN::word=their \1 \2::pivots=there,their
 4 | there .*ing/NN::word=their \1, they're \1::pivots=there,their,they're
 5 | there .*/NN::word=their \1::pivots=there,their
 6 | Their is|are|a|an::word=There \1::pivots=their,there
 7 | There .*/JJ.* .*/NN::word=Their \1 \2::pivots=there,their
 8 | There .*ing/NN::word=Their \1, They're \1::pivots=there,their,they're
 9 | There .*/NN::word=Their \1::pivots=there,their
10 | is there .*/NN::word=\0 \1 \2
11 | is there .*/JJ .*/NN::word=\0 \1 \2 \3
12 | isn't there .*/NN::word=\0 \1 \2
13 | isn't there .*/JJ .*/NN::word=\0 \1 \2 \3
14 | was there .*/NN::word=\0 \1 \2
15 | was there .*/JJ .*/NN::word=\0 \1 \2 \3
16 | are there .*/NN::word=\0 \1 \2
17 | are there .*/JJ .*/NN::word=\0 \1 \2 \3
18 | if their .*ing::word=\0 they're \2::pivots=\1,they're
19 | to .*/VB there .*/NN::word=\0 \1 their \3::pivots=\2,their
20 | in there|they're .*/NN|JJ .*/NN::word=\0 their \2 \3::pivots=\1,their
21 | in there|they're .*/NN::word=\0 their \2::pivots=\1,their
22 | they're are::word=there are, they are::pivots=they're,there,they
23 | They're are::word=There are, They are::pivots=They're,There,They
24 | .*/VB there .*/NNS::word=\0 their \2::pivots=\1,their
25 | .*/VB there .*/JJ .*/NNS::word=\0 their \2 \3::pivots=\1,their
26 | .*/IN there .*/NNS::word=\0 their \2::pivots=\1,their
27 | .*/IN there .*/JJ .*/NNS::word=\0 their \2 3::pivots=\1,their
28 | 
29 | has|is they're::word=\0 their::pivots=they're,their::options=they're,their
30 | their so|as|gonna::word=they're \1::pivots=their,they're::options=their,They're
31 | Their so|as|gonna::word=They're \1::pivots=Their,They're::options=Their,They're
32 | 
33 | #
34 | # some rules to map their|there -> they're
35 | #
36 | their doing so::filter=kill
37 | there being .*/IN|DT::filter=kill
38 | there|their .*/VBG .*/IN::word=they're \1 \2::pivots=\0,they're
39 | there|their .*/VBG .*/DT::word=they're \1 \2::pivots=\0,they're
40 | there|their .*/VBG 0END.0::word=they're \1 \2::pivots=\0,they're
41 | 


--------------------------------------------------------------------------------
/service/code/src/org/dashnine/preditor/SortFromHash.java:
--------------------------------------------------------------------------------
 1 | package org.dashnine.preditor;
 2 | 
 3 | import sleep.runtime.*;
 4 | import sleep.bridges.*;
 5 | import sleep.interfaces.*;
 6 | 
 7 | import java.util.*;
 8 | 
 9 | /* Code to implement a sort function that sorts values by their corresponding Double values in a hashtable. This class exists to replace
10 |    <code>sort(lambda({ return %hash[$1] <=> %hash[$2]; }, \%hash)</code>.  This snippet was identified by the profiler as consuming more time
11 |    than any other function */
12 | public class SortFromHash implements Loadable
13 | {
14 |     private static class CompareHashItems implements Comparator
15 |     {
16 |         protected ScalarHash     hash;
17 | 
18 |         public CompareHashItems(ScalarHash _hash)
19 |         {
20 |            hash = _hash;
21 |         }
22 | 
23 |         public int compare(Object a, Object b)
24 |         {
25 |            double aa, bb;
26 |            aa = hash.getAt((Scalar)a).doubleValue();
27 |            bb = hash.getAt((Scalar)b).doubleValue();           
28 | 
29 |            if (aa > bb)
30 |            {
31 |               return -1;
32 |            }
33 |            else if (aa < bb)
34 |            {
35 |               return 1;
36 |            }
37 |            else
38 |            {
39 |               return 0;
40 |            }
41 |         }
42 |     }
43 | 
44 |     private static class func_sortFromHash implements Function
45 |     {
46 |         public Scalar evaluate(String n, ScriptInstance i, Stack l)
47 |         {
48 |            ScalarArray  array     = BridgeUtilities.getWorkableArray(l);
49 |            ScalarHash   hash      = BridgeUtilities.getHash(l);
50 | 
51 |            array.sort(new CompareHashItems(hash));
52 | 
53 |            return SleepUtils.getArrayScalar(array);
54 |         }
55 |     }
56 | 
57 |    public void scriptLoaded(ScriptInstance script)
58 |    {
59 |       script.getScriptEnvironment().getEnvironment().put("&sortHash", new func_sortFromHash());
60 |    }
61 | 
62 |    public void scriptUnloaded(ScriptInstance script)
63 |    {
64 |    }
65 | }
66 | 


--------------------------------------------------------------------------------
/bin/buildtaggersets.sh:
--------------------------------------------------------------------------------
 1 | #
 2 | # code to generate the data used to bootstrap the tagger
 3 | #
 4 | 
 5 | mkdir tmp
 6 | 
 7 | java -Xmx1024M -jar lib/sleep.jar utils/tagger/makesentences.sl data/corpus_wikipedia tmp/wikipedia_sentences.txt
 8 | java -Xmx1024M -jar lib/sleep.jar utils/tagger/makesentences.sl data/corpus_gutenberg tmp/gutenberg_sentences.txt
 9 | 
10 | #
11 | # You *must* download the Stanford POS Tagger (GPL) from: http://nlp.stanford.edu/software/tagger.shtml
12 | # and extract it into your AtD directory.
13 | #
14 | # This tagger will take 3 days to run / file
15 | #                       ------
16 | 
17 | cd stanford-postagger-2008-09-28
18 | java -Xmx1024M -XX:+AggressiveHeap -XX:+UseParallelGC -jar ../lib/sleep.jar ../utils/tagger/makebootstrap.sl models/bidirectional-wsj-0-18.tagger ../data/gutenberg_sentences.txt >../tmp/gutenberg_sentences_tagged.txt &
19 | java -Xmx1024M -XX:+AggressiveHeap -XX:+UseParallelGC -jar ../lib/sleep.jar ../utils/tagger/makebootstrap.sl models/bidirectional-wsj-0-18.tagger ../data/wikipedia_sentences.txt >../tmp/wikipedia_sentences_tagged.txt &
20 | 
21 | #
22 | # Or, optionally, you can use this Tagger which includes source but use is allowed for non-commercial research purposes only
23 | #  
24 | # http://www-tsujii.is.s.u-tokyo.ac.jp/~tsuruoka/postagger/
25 | #
26 | # This tagger will execute in 5 minutes / file
27 | #                             ---------
28 | 
29 | # Oh, irony of ironies-- this tagger and the Stanford tagger produce nearly identical data (AtD bootstraps from the Stanford data though)
30 | 
31 | #
32 | #cd postagger-1.0
33 | #./tagger <../tmp/wikipedia_sentences.txt >../tmp/wikipedia_sentences_tagged.txt 
34 | #./tagger <../tmp/gutenberg_sentences.txt >../tmp/gutenberg_sentences_tagged.txt 
35 | #
36 | cd ..
37 | 
38 | java -jar lib/sleep.jar utils/tagger/fixtags.sl tmp/wikipedia_sentences_tagged.txt >data/wikipedia_sentences_tagged_f.txt
39 | java -jar lib/sleep.jar utils/tagger/fixtags.sl tmp/gutenberg_sentences_tagged.txt >data/gutenberg_sentences_tagged_f.txt
40 | 
41 | mv tmp/wikipedia_sentences.txt data/wikipedia_sentences.txt
42 | mv tmp/gutenberg_sentences.txt data/gutenberg_sentences.txt
43 | 
44 | rm -rf tmp
45 | 


--------------------------------------------------------------------------------
/data/rules/agreement/chunk_single.r:
--------------------------------------------------------------------------------
 1 | .*/NNP [a-z]+/NN or [a-z]+/PRP.* [a-z]+/NN::\0 \1 and \3 \4
 2 | .*/NNP [a-z]+/NNS or [a-z]+/PRP.* [a-z]+/NN::\0 \1 and \3 \4
 3 | A [a-z]+/NN or [a-z]+/NN::\0 \1 and \3
 4 | An [a-z]+/NN or [a-z]+/NN::\0 \1 and \3
 5 | .*/NNP or [a-z]+/NNP::\0 and \2
 6 | Every one of [a-z]+/DT [a-z]+/NNS::\3:upper \4
 7 | One of [a-z]+/PRP.* [a-z]+/NNS::\2:upper \3
 8 | Each one of [a-z]+/PRP.* [a-z]+/NNS::\3:upper \4
 9 | The [a-z]+/NN [a-z]+/IN::\0 \1:plural \2
10 | The [a-z]+/NN::\0 \1:plural
11 | This [a-z]+/NN [a-z]+/IN::These \1:plural \2
12 | This [a-z]+/NN::These \1:plural
13 | One of [a-z]+/DT [a-z]+/NNS::\2:upper \3
14 | .*/NNP,POS [a-z]+/JJ [a-z]+/NN::\0 \1 \2:plural
15 | .*/NNP,POS [a-z]+/NN::\0 \1:plural
16 | The [a-z]+/NN [a-z]+/IN [a-z]+/DT [a-z]+/NN::\0 \1:plural \2 \3 \4
17 | This [a-z]+/NN [a-z]+/IN [a-z]+/DT [a-z]+/NN::These \1:plural \2 \3 \4
18 | .*/RB one
19 | The [a-z]+/JJ [a-z]+/NN::\0 \1 \2:plural
20 | This [a-z]+/JJ [a-z]+/NN::\0 \1 \2:plural
21 | Their [a-z]+/NN::\0 \1:plural
22 | Their [a-z]+/JJ [a-z]+/NN::\0 \1 \2:plural
23 | Their [a-z]+/JJ [a-z]+/NN [a-z]+/VB::\0 \1 \2 \3:plural
24 | Your [a-z]+/NN::\0 \1:plural
25 | Your [a-z]+/JJ [a-z]+/NN::\0 \1 \2:plural
26 | Your [a-z]+/JJ [a-z]+/NN [a-z]+/VB::\0 \1 \2 \3:plural
27 | His [a-z]+/NN::\0 \1:plural
28 | His [a-z]+/JJ [a-z]+/NN::\0 \1 \2:plural
29 | His [a-z]+/JJ [a-z]+/NN [a-z]+/VB::\0 \1 \2 \3:plural
30 | Her [a-z]+/NN::\0 \1:plural
31 | Her [a-z]+/JJ [a-z]+/NN::\0 \1 \2:plural
32 | Her [a-z]+/JJ [a-z]+/NN [a-z]+/VB::\0 \1 \2 \3:plural
33 | My [a-z]+/NN::\0 \1:plural
34 | My [a-z]+/JJ [a-z]+/NN::\0 \1 \2:plural
35 | My [a-z]+/JJ [a-z]+/NN [a-z]+/VB::\0 \1 \2 \3:plural
36 | The [a-z]+/VBN [a-z]+/NN::\0 \1 \2:plural
37 | This [a-z]+/VBN [a-z]+/NN::\0 \1 \2:plural
38 | .*/CD dollars|pounds|points|feet|inches|meters
39 | The [a-z]+/NN [a-z]+/VB [a-z]+/NN::\0 \1 \2 \3:plural
40 | The [a-z]+/NN [a-z]+/VB [a-z]+/NN [a-z]+/VBP [a-z]+/JJ [a-z]+/NN
41 | The [a-z]+/JJ [a-z]+/NN [a-z]+/VBP [a-z]+/JJ [a-z]+/NN
42 | The [a-z]+/NN of [a-z]+/VB [a-z]+/NN::\0 \1 \2 \3 \4:plural
43 | The [a-z]+/NN [a-z]+/VB
44 | The [a-z]+/NN of [a-z]+/VB [a-z]+/NNS::\0 \1:plural of \3 \4
45 | Either [a-z]+/NN
46 | .*/NN::\0:plural
47 | Either [a-z]+/NNP [a-z]+/NNS or [a-z]+/PRP.* [a-z]+/NN::\1:upper \2 and \4 \5
48 | 


--------------------------------------------------------------------------------
/utils/common/exp.sl:
--------------------------------------------------------------------------------
 1 | sub exp::init
 2 | {
 3 |    this('$score1 $score2 $score $criterf $network $criteria %dpoints $tscores $nscores $oscores $criterf2 $network2 $criteria2');
 4 | 
 5 |    $criterf  = criteria($2);
 6 |    $network  = get_network($1);
 7 |    $criteria = $2;
 8 | 
 9 |    $nscores = newObject("score", "network  total");
10 |    $tscores = newObject("score", "trigrams total");
11 |    $oscores = newObject("score", "best score");
12 | }
13 | 
14 | sub exp::process
15 | {
16 |    local('$correct $wrong $wrongs $pre2 $pre1 $next @temp $nbase $tbase $solution $all %scores');
17 |    ($correct, $wrong, $wrongs, $pre2, $pre1, $next) = @_;
18 | 
19 |    # do a trigram check?
20 |    if ($wrong eq $correct)
21 |    {
22 |       $all = tagAll($pre2[1], $pre1[1], $pre1[0], $wrongs);
23 | 
24 |       if (isDifferent($all))
25 |       {
26 |          $solution = getBest($all)[0];
27 |          if ($solution eq $correct)
28 |          {
29 |             [$tscores correct];
30 |          }
31 |          else
32 |          {
33 |             if ($bywords[$solution] == 1.0)
34 |             {
35 |   #             warn("$solution is wrong, correct is $correct : " . $bywords[$correct]);
36 |             }
37 |          }
38 |          [$tscores record];
39 |       }
40 |    }
41 | 
42 |    if ($wrong eq $correct)
43 |    {
44 |       (@temp, %scores) = checkAnyHomophone2($network, $wrong, copy($wrongs), $pre1[0], $next[0], @($pre2[1], $pre1[1]), 
45 |              $criteriaf => $criterf);
46 | 
47 |       if (size(@temp) == 0)
48 |       {
49 |           @temp[0] = $wrong;
50 |       }
51 | 
52 |       if ($bywords[$solution] >= 1.0)  #&& $solution eq $correct)
53 |       {
54 |           @temp[0] = $solution;
55 |       }
56 | 
57 |       if (@temp[0] eq $correct)
58 |       {
59 |          [$nscores correct];
60 |       }
61 |       [$nscores record];
62 | 
63 |       if (@temp[0] eq $correct || $solution eq $correct)
64 |       {
65 |          [$oscores correct];
66 |       }
67 |       [$oscores record];
68 | 
69 |       if ($solution ne $correct && $bywords[$solution] == 1.0)
70 |       {
71 | #          warn("$solution - " . $bywords[$solution] . " vs. $correct " . $bywords[$correct]);
72 |       }
73 |    }
74 | }
75 | 
76 | sub exp::finish
77 | {
78 |    [$nscores print];
79 |    [$tscores print];
80 |    [$oscores print];
81 | }
82 | 


--------------------------------------------------------------------------------
/data/rules/irregular_nouns.txt:
--------------------------------------------------------------------------------
  1 | addendum	addenda
  2 | alga	algae
  3 | alumna	alumnae
  4 | alumnus	alumni
  5 | analysis	analyses
  6 | antenna	antennas,antennae
  7 | apparatus	apparatuses
  8 | appendix	appendices,appendixes
  9 | axis	axes
 10 | bacillus	bacilli
 11 | bacterium	bacteria
 12 | basis	bases
 13 | beau	beaux
 14 | bison	bison
 15 | buffalo	buffalos,buffaloes
 16 | bureau	bureaus
 17 | bus	busses,buses
 18 | cactus	cactuses,cacti
 19 | calf	calves
 20 | child	children
 21 | corps	corps
 22 | corpus	corpora,corpuses
 23 | crisis	crises
 24 | criterion	criteria
 25 | curriculum	curricula
 26 | datum	data
 27 | deer	deer
 28 | die	dice
 29 | dwarf	dwarfs,dwarves
 30 | diagnosis	diagnoses
 31 | echo	echoes
 32 | elf	elves
 33 | ellipsis	ellipses
 34 | embargo	embargoes
 35 | emphasis	emphases
 36 | erratum	errata
 37 | fireman	firemen
 38 | fish	fish,fishes
 39 | focus	focuses
 40 | foot	feet
 41 | formula	formulas
 42 | fungus	fungi,funguses
 43 | genus	genera
 44 | goose	geese
 45 | half	halves
 46 | hero	heroes
 47 | hippopotamus	hippopotami,hippopotamuses
 48 | hoof	hoofs,hooves
 49 | hypothesis	hypotheses
 50 | index	indices,indexes
 51 | knife	knives
 52 | leaf	leaves
 53 | life	lives
 54 | loaf	loaves
 55 | louse	lice
 56 | man	men
 57 | matrix	matrices
 58 | means	means
 59 | medium	media
 60 | memorandum	memoranda
 61 | millennium	millenniums,milennia
 62 | moose	moose
 63 | mosquito	mosquitoes
 64 | mouse	mice
 65 | nebula	nebulae,nebulas
 66 | neurosis	neuroses
 67 | nucleus	nuclei
 68 | oasis	oases
 69 | octopus	octopi,octopuses
 70 | ovum	ova
 71 | ox	oxen
 72 | paralysis	paralyses
 73 | parenthesis	parentheses
 74 | person	people
 75 | phenomenon	phenomena
 76 | potato	potatoes
 77 | radius	radii,radiuses
 78 | scarf	scarfs,scarves
 79 | self	selves
 80 | series	series
 81 | sheep	sheep
 82 | shelf	shelves
 83 | scissors	scissors
 84 | species	species
 85 | stimulus	stimuli
 86 | stratum	strata
 87 | syllabus	syllabi,syllabuses
 88 | symposium	symposia,symposiums
 89 | synthesis	syntheses
 90 | synopsis	synopses
 91 | tableau	tableaux
 92 | that	those
 93 | thesis	theses
 94 | thief	thieves
 95 | this	these
 96 | tomato	tomatoes
 97 | tooth	teeth
 98 | torpedo	torpedoes
 99 | vertebra	vertebrae
100 | veto	vetoes
101 | vita	vitae
102 | watch	watches
103 | wife	wives
104 | wolf	wolves
105 | woman	women
106 | zero	zeros,zeroes
107 | 


--------------------------------------------------------------------------------
/data/rules/grammar/its2:
--------------------------------------------------------------------------------
 1 | on it's own::name=it's rule::word=on its own::filter=none
 2 | of it's own::name=it's rule::word=of its own::filter=none
 3 | such as it's::name=it's rule::word=such as its::filter=none
 4 | from all it's::name=it's rule::word=from all its::filter=none
 5 | by all it's::name=it's rule::word=by all its::filter=none
 6 | it's approach::name=it's rule::word=its approach::filter=none
 7 | by it's::name=it's rule::word=by its::filter=none
 8 | By it's::name=it's rule::word=By its::filter=none
 9 | with it's::name=it's rule::word=\0 its::pivots=it's,its
10 | with/.* it's/.* .*/JJ.*|NN.*::name=it's rule::word=\0 its \2::filter=none
11 | With/.* it's/.* .*/JJ.*|NN.*::name=it's rule::word=\0 its \2::filter=none
12 | in/.* it's/.* .*/JJ.*|NN.*::name=it's rule::word=\0 its \2::filter=none
13 | In/.* it's/.* .*/JJ.*|NN.*::name=it's rule::word=\0 its \2::filter=none
14 | without/.* it's/.* .*/JJ.*|NN.*::name=it's rule::word=\0 its \2::filter=none
15 | Without/.* it's/.* .*/JJ.*|NN.*::name=it's rule::word=\0 its \2::filter=none
16 | from/.* it's/.* .*/JJ.*|NN.*::name=it's rule::word=\0 its \2::filter=none
17 | From/.* it's/.* .*/JJ.*|NN.*::name=it's rule::word=\0 its \2::filter=none
18 | Under/.* it's/.* .*/JJ.*|NN.*::name=it's rule::word=\0 its \2::filter=none
19 | under/.* it's/.* .*/JJ.*|NN.*::name=it's rule::word=\0 its \2::filter=none
20 | over/.* it's/.* .*/JJ.*|NN.*::name=it's rule::word=\0 its \2::filter=none
21 | Over/.* it's/.* .*/JJ.*|NN.*::name=it's rule::word=\0 its \2::filter=none
22 | above/.* it's/.* .*/JJ.*|NN.*::name=it's rule::word=\0 its \2::filter=none
23 | Above/.* it's/.* .*/JJ.*|NN.*::name=it's rule::word=\0 its \2::filter=none
24 | for it's .*/JJ|NN|NNS::word=for its::pivots=\1,its
25 | 
26 | it's class|color|current|end|first|former|fourth|goal|highest|history|inital|junction|lack|last|lead|lowest|maximum|minimum|money|name|northern|original|own|peak|previous|primary|second|third|timeslot|toll|way::word=its \1::pivots=it's,its::options=it's,its
27 | 
28 | at|be|about|above|across|against|along|among|around|at|behind|by|for|from|had|in|near|of|on|over|through|to|towards|under|upon|with|without it's .*/JJ|NN|NNS::word=\0 its \2::pivots=it's,its
29 | 
30 | it's you::filter=kill
31 | it's [a-z].*/NNP::word=its \1::pivots=\0,its
32 | 
33 | to .*/VB it's .*/NN|NNS::word=\0 \1 its \3::pivots=it's,its
34 | 
35 | it's .*/JJ .*/NNS|NN::word=its \1 \2::pivots=it's,its
36 | 


--------------------------------------------------------------------------------
/data/rules/agreement/chunk_plural.r:
--------------------------------------------------------------------------------
 1 | The [a-z]+/JJ two|three|four|five|six|seven|eight|nine|ten|hundred|thousand|million|billion|trillion
 2 | My|Your|His|Her|Their pants
 3 | .*/NNP [a-z]+/NN and [a-z]+/PRP.* [a-z]+/NN::\0 \1 or \3 \4
 4 | .*/NNP [a-z]+/NNS and [a-z]+/PRP.* [a-z]+/NN::\0 \1 or \3 \4
 5 | .*/NNP and [a-z]+/NNP::\0 or \2
 6 | .*/NNP and [a-z]+/PRP.* [a-z]+/NNS::\0 or \2 \3:singular
 7 | The [a-z]+/NN and [a-z]+/DT [a-z]+/NNS::\0 \1 or \3 \4:singular 
 8 | The [a-z]+/NN or [a-z]+/DT [a-z]+/NNS::\0 \1 \2 \3 \4:singular
 9 | The [a-z]+/NN and [a-z]+/NN::The \1 or \3
10 | The [a-z]+/NNS::\0 \1:singular
11 | The [a-z]+/NNS::\0 \1:singular
12 | These [a-z]+/NN and [a-z]+/DT [a-z]+/NNS::The \1 or the \4:singular
13 | These [a-z]+/NN or [a-z]+/DT [a-z]+/NNS::word=The \1 \2 the \4:singular
14 | These [a-z]+/NNS::The \1:singular
15 | All||all of [a-z]+/DT [a-z]+/NNS::\2:upper \3:singular
16 | The [a-z]+/NNS of|for [a-z]+/NN::\0 \1:singular \2 \3
17 | These [a-z]+/NNS of|for [a-z]+/NN::Each \1:singular \2 \3
18 | The [a-z]+/NNS of|for [a-z]+/JJ [a-z]+/NN::\0 \1:singular \2 \3 \4
19 | These [a-z]+/NNS of|for [a-z]+/JJ [a-z]+/NN::Each \1:singular \2 \3 \4
20 | .*/NNP,POS [a-z]+/NNS::\0 \1:singular
21 | .*/NNP,POS [a-z]+/NNS in [a-z]+/DT [a-z]+/NN::\0 \1:singular \2 \3 \4
22 | The [a-z]+/JJS [a-z]+/JJ [a-z]+/NNS of|for|from [a-z]+/NN [a-z]+/NN::\0 \1 \2 \3:singular \4 \5 \6
23 | The [a-z]+/JJS [a-z]+/JJ [a-z]+/NNS::\0 \1 \2 \3:singular
24 | .*/NNS of|for|from [a-z]+/NNS::\0:singular \1 \2:singular
25 | .*/NNP,POS [a-z]+/NNS in [a-z]+/DT [a-z]+/NN::\0 \1:singular \2 \3 \4
26 | .*/CD [a-z]+/NNS
27 | The series of [a-z]+ [a-z]+/NNS::\0 \1 \2 \3 \4:singular
28 | The series of [a-z]+/NNS::\0 \1 \2 \3:singular
29 | The/DT [a-z]+/NN [a-z]+/IN [a-z]+/VB [a-z]+/NNS::\0 \1 \2 \3 \4:singular
30 | The [a-z]+/NN [a-z]+/VB [a-z]+/NNS::\0 \1 \2 \3:singular
31 | My [a-z]+/NN [a-z]+/VB [a-z]+/NNS::\0 \1 \2 \3:singular
32 | Your [a-z]+/NN [a-z]+/VB [a-z]+/NNS::\0 \1 \2 \3:singular
33 | His [a-z]+/NN [a-z]+/VB [a-z]+/NNS::\0 \1 \2 \3:singular
34 | Her [a-z]+/NN [a-z]+/VB [a-z]+/NNS::\0 \1 \2 \3:singular
35 | My [a-z]+/NNS::\0 \1:singular
36 | Your [a-z]+/NNS::\0 \1:singular
37 | Their [a-z]+/NNS::\0 \1:singular
38 | His [a-z]+/NNS::\0 \1:singular
39 | Her [a-z]+/NNS::\0 \1:singular
40 | .*/JJ [a-z]+/NNS::\0 \1:singular
41 | The [a-z]+/NN [a-z]+/IN [a-z]+/VB [a-z]+/NNS
42 | My [a-z]+/NNS and I
43 | My [a-z]+/NN and I
44 | 


--------------------------------------------------------------------------------
/utils/rules/transr.sl:
--------------------------------------------------------------------------------
 1 | # 
 2 | # this is a script to transform sentences in a corpus using rules from an AtD rule file
 3 | #
 4 | # java -jar utils/rules/testr.sl <rule file> <sentences file>
 5 | #
 6 | # <rule file> format:
 7 | #
 8 | # rule..|[key=value|...]
 9 | #
10 | # note that key=value are parsed and dumped into a hash.  This information is used by the system to 
11 | # filter out false positives and stuff.
12 | #
13 | 
14 | include("lib/engine.sl");
15 | include("utils/rules/rules.sl");
16 | 
17 | sub checkSentenceSpelling
18 | {
19 | }
20 | 
21 | setf('&score', let({
22 |   local('$value');
23 |   $value = invoke($oldf, @_);
24 |   warn("Looking at: " . join("|", @_) . " = " . $value);
25 |   return $value;
26 | }, $oldf => &score));
27 | 
28 | sub initAll
29 | {
30 |    global('$__SCRIPT__ $model $rules $dictionary $network $dsize %edits $hnetwork $account $usage $endings $lexdb $trigrams $verbs');
31 |    $model      = get_language_model();
32 |    $dictionary = dictionary();
33 |    $dsize      = size($dictionary);
34 |    $hnetwork   = get_network("hnetwork4.bin");
35 |    $verbs      = loadVerbData();
36 |    initTaggerModels();
37 | }
38 | 
39 | sub main
40 | {
41 |    local('$handle $sentence @results @past');
42 | 
43 |    initAll();
44 | 
45 |    if (function("& $+ $1") !is $null)
46 |    {
47 |       $rules = machine();
48 |       invoke(function("& $+ $1"));
49 |    }
50 |    else
51 |    {
52 |       $rules = loadRules(machine(), $1, %());
53 |    }
54 | 
55 |    $handle = openf($2);
56 |    while $sentence (readln($handle))
57 |    {
58 |       @results = @();
59 |       processSentence(\$sentence, \@results);
60 | 
61 |       @past = copy(@results);
62 | 
63 |       if (size(@past) > 0)
64 |       {
65 | #          println($sentence);
66 | #          println(taggerToString(taggerWithTrigrams(splitIntoWords($sentence))));
67 |           foreach $index => $r (@past)
68 |           {
69 |              local('$rule $text $path $context @suggestions');
70 |              ($rule, $text, $path, $context, @suggestions) = $r;
71 | 
72 |              if ($r in @results)
73 |              {
74 |                 $n = strrep($sentence, $text, @suggestions[0]);
75 |                 println($n);
76 | 
77 |                 if ($n eq $sentence)
78 |                 {
79 |                     println("===> $context $text => " . @suggestions);
80 |                 }
81 | 
82 |                 break;
83 |              }
84 | 
85 |              
86 |          }
87 | 
88 |       }
89 |    }
90 | } 
91 | 
92 | invoke(&main, @ARGV);
93 | 


--------------------------------------------------------------------------------
/utils/spelldata/torules.sl:
--------------------------------------------------------------------------------
 1 | #
 2 | # Generate a rule file from cut and paste Wikipedia rules data
 3 | # http://en.wikipedia.org/wiki/Wikipedia:Lists_of_common_misspellings/Grammar_and_Misc
 4 | #
 5 | # use java -jar lib/sleep.jar torules.sl wrong to generate a reverse rules file suitable for error corpus generation
 6 | #
 7 | # paste the contents into a text editor, then paste into a text file and process with this program
 8 | #
 9 | 
10 | $handle = openf("wp.txt");
11 | 
12 | %sections = ohash();
13 | setMissPolicy(%sections, { return @(); });
14 | 
15 | while $text (readln($handle))
16 | {
17 |    if ($text ismatch '.*?[\*\#] (.*?) \((.*?)\).*')
18 |    {
19 |       ($wrong, $correct) = matched();
20 | 
21 |       if (',' !isin $correct)
22 |       {
23 |          @a = split(' ', $wrong);
24 |          @b = split(' ', $correct);
25 |  
26 |          if (size(@a) == size(@b))
27 |          {
28 |             foreach $index => $word (@a)
29 |             {
30 |                if ($word !in @b) { $special = $word; $replace = @b[$index]; }
31 |             }
32 | 
33 |             if (@ARGV[0] eq 'wrong')
34 |             {
35 |                push(%sections["Confused word: $special"], "$correct $+ ::word= $+ $wrong");
36 |             }
37 |             else
38 |             {
39 |                push(%sections["Confused word: $special"], "$wrong $+ ::word= $+ $correct $+ ::pivots= $+ $special $+ , $+ $replace $+ ::options= $+ $special $+ , $+ $replace");
40 |             }
41 |          }
42 |          else
43 |          {
44 |             if (@ARGV[0] eq 'wrong')
45 |             {
46 |                push(%sections["Multiple Options"], "$correct $+ ::word= $+ $wrong");
47 |             }
48 |             else
49 |             {
50 |                push(%sections["Multiple Options"], "$wrong $+ ::word= $+ $correct");
51 |             }
52 |          }
53 |       }
54 |       else
55 |       {
56 |          if (@ARGV[0] ne 'wrong')
57 |          {
58 |             push(%sections["Misc"], "$wrong $+ ::word= $+ $correct");
59 |             #push(%sections["Misc"], "$correct $+ ::word= $+ $wrong");
60 |          }
61 |          else
62 |          {
63 |             @temp = split(', ', $correct);
64 |             map(lambda({ push(%sections["Misc"], "$1 $+ ::word= $+ $wrong $+ ::options= $+ $correct"); }, \$wrong, \$correct), @temp);
65 |          }
66 |       }
67 |    }
68 |    else
69 |    {
70 |   #    push(%sections["__Rejects__"], $text);
71 |    }
72 | }
73 | 
74 | foreach $key => $value (%sections)
75 | {
76 |    println("\n#\n# $key \n#\n");
77 |    printAll($value);
78 | }
79 | 


--------------------------------------------------------------------------------
/service/code/src/org/dashnine/preditor/LanguageModelSmall.java:
--------------------------------------------------------------------------------
 1 | package org.dashnine.preditor;
 2 | 
 3 | import java.io.*;
 4 | import java.util.*;
 5 | import java.util.zip.*;
 6 | 
 7 | /** This class holds the (minified) AtD language model */
 8 | public class LanguageModelSmall extends LanguageModel implements Serializable
 9 | {
10 |    protected ZipFile entries;
11 | 
12 |    private static long lowMemoryThreshold = 256 * 1024 * 1024;
13 | 
14 |    protected class CacheMap extends LinkedHashMap 
15 |    {
16 |        protected boolean removeEldestEntry(Map.Entry eldest) 
17 |        {
18 |           long memory = Runtime.getRuntime().freeMemory() + (Runtime.getRuntime().maxMemory() - Runtime.getRuntime().totalMemory());
19 |           return (size() > 16384 || memory < lowMemoryThreshold);
20 |        }
21 |    }
22 | 
23 |    /* read a string value from the specified map... adds the string if it doesn't exist */
24 |    protected Value getStringValue(Map map, String word, boolean makeAsNecessary)
25 |    {
26 |       Object sid = getStringId(word, false);
27 | 
28 |       if (sid != null)
29 |       {
30 |          synchronized (this)   
31 |          {
32 |             Value val = (Value)map.get(sid);
33 |             if (val == null && map == model)
34 |             {
35 |                try 
36 |                {
37 |                   int sid_i = ((Integer)sid).intValue();
38 | 	
39 |                   ZipEntry entry = entries.getEntry((sid_i % 512) + "/" + sid_i);
40 |                   if (entry != null)
41 |                   {
42 |                      ObjectInputStream stream = new ObjectInputStream(entries.getInputStream(entry));
43 |                      val = (Value)stream.readObject();
44 |                      map.put(sid, val);
45 |                   }
46 |                }
47 |                catch (Exception ex) 
48 |                {
49 |                   System.err.println("Could not load: " + word + "(" + sid + ")");
50 |                   ex.printStackTrace();
51 |                }
52 |             }
53 |             return val;
54 |          }
55 |       }
56 | 
57 |       return null;
58 |    }
59 | 
60 |    public LanguageModelSmall(Map _string_pool, long _count, File entries_file)
61 |    {
62 |       string_pool = _string_pool;
63 |       count = _count;
64 |       model = new CacheMap();
65 |       try 
66 |       {
67 |          entries = new ZipFile(entries_file);      
68 |       }
69 |       catch (Exception ex) 
70 |       {
71 |          System.err.println("Could not load zipfile: " + entries_file);
72 |          ex.printStackTrace();
73 |       }
74 |    }
75 | }
76 | 


--------------------------------------------------------------------------------
/utils/bigrams/amigo.sl:
--------------------------------------------------------------------------------
  1 | #
  2 | # test spanish homophones against spanish corpora.
  3 | #
  4 | 
  5 | debug(7 | 24);
  6 | 
  7 | include("lib/quality.sl");
  8 | include("lib/engine.sl");
  9 | 
 10 | #
 11 | # load AtD models
 12 | #
 13 | global('$lang');
 14 | 
 15 | $lang = systemProperties()["atd.lang"];
 16 | if ($lang ne "" && -exists "lang/ $+ $lang $+ /load.sl") {
 17 | 	include("lang/ $+ $lang $+ /load.sl");
 18 | 	initAllModels();
 19 | }
 20 | 
 21 | #
 22 | # load homophones
 23 | #
 24 | sub homophones {
 25 | 	local('$handle $text %h @candidates');
 26 | 	$handle = openf("lang/ $+ $lang $+ /homophonedb.txt");
 27 | 	while $text (readln($handle)) {
 28 | 		if ('-*' iswm $text) {
 29 | 			%h[substr($text, 1)] = $null;
 30 | 		}
 31 | 		else {
 32 | 			@candidates = split(',\s+', $text);
 33 | 			map(lambda({ %h[$1] = @candidates; }, \%h, \@candidates), @candidates);
 34 | 		}
 35 | 	}
 36 | 	return %h;
 37 | }
 38 | 
 39 | sub isHomophone {
 40 | 	local('$sentence $pre2 $pre1 $current $next @results');
 41 | 	($sentence, $pre2, $pre1, $current, $next) = @_;
 42 | 
 43 | 	@results = checkHomophone($hnetwork, $current, %homophones[$current], $pre1, $next, @(), $pre2, $bias1 => 30.0, $bias2 => 10.0);
 44 | 
 45 | 	if (size(@results) > 0) {
 46 | 		println("\t $+ $sentence");
 47 | 		println("\t $+ $pre2 $pre1 | $current | $next or: " . @results . "\n");
 48 | 	}
 49 | }
 50 | 
 51 | #
 52 | # check a sentence for homophones
 53 | #
 54 | sub checkSentenceForHomophones {
 55 | 	local('$pre2 $pre1 $current $next $word');
 56 | 
 57 | 	$current = '0BEGIN.0';
 58 | 	
 59 | 	foreach $next (splitIntoWords($1)) {
 60 | 		if ($current ne '0BEGIN.0' && $current in %homophones) {
 61 | 			isHomophone($1, $pre2, $pre1, $current, $next);
 62 | 		}
 63 | 		$pre2 = $pre1;
 64 | 		$pre1 = $current;
 65 | 		$current = $next;
 66 | 	}
 67 | 
 68 | 	$next = '0END.0';
 69 | 
 70 | 	if ($current in %homophones) {
 71 | 		isHomophone($1, $pre2, $pre1, $current, $next);
 72 | 	}
 73 | }
 74 | 
 75 | #
 76 | # loop through the file, look for homophones... report them!
 77 | #
 78 | sub checkForHomophones {
 79 | 	local('$handle $contents');
 80 | 	$handle = openf($1);
 81 | 	$contents = splitIntoSentences(join("\n", readAll($handle, -1)));
 82 | 	map(&checkSentenceForHomophones, $contents);
 83 | 	closef($handle);
 84 | }
 85 | 
 86 | sub main {
 87 | 	global('%homophones');
 88 | 	%homophones = homophones();
 89 | 	[{
 90 | 		if (-isDir $1) {
 91 | 			map($this, ls($1));
 92 | 		}
 93 | 		else {
 94 | 			if ('*.txt' iswm $1) {
 95 | 				println($1);
 96 | 				checkForHomophones($1);
 97 | 			}
 98 | 		}
 99 | 	}: "lang/ $+ $lang $+ /corpus"];
100 | }
101 | 
102 | invoke(&main, @ARGV);
103 | 


--------------------------------------------------------------------------------
/utils/spelldata/bootstrapspell.sl:
--------------------------------------------------------------------------------
 1 | # 
 2 | # Walk through a corpus and find spelling errors and their corrections
 3 | #
 4 | # java [all the memory junk here] -jar lib/sleep.jar utils/spelldata/bootstrapspell.sl data/corpus_wikipedia
 5 | #
 6 | 
 7 | debug(7 | 34);
 8 | 
 9 | include("lib/engine.sl");
10 | 
11 | global('$model $dictionary $trie $rules $network $hnetwork %edits $dsize $old_suggest %words');
12 | 
13 | $model      = get_language_model();
14 | $dictionary = dictionary();
15 | $rules      = get_rules();
16 | $trie       = trie($dictionary);
17 | $network    = get_network("cnetwork.bin");
18 | $hnetwork   = get_network("hnetwork2.bin");
19 | %edits      = initEdits();
20 | setRemovalPolicy(%edits, { return 1; });
21 | $dsize      = size($dictionary);
22 | initTaggerModels();
23 | 
24 | $old_suggest = function('&getSuggestionPool');
25 | 
26 | sub getSuggestionPool
27 | {
28 |    local('$error $dict $pre $next @suggests %scores'); 
29 |    ($error, $dict, $pre, $next) = @_;
30 | 
31 |    if ($error ismatch '[a-z]+\'{0,1}[a-z]+' && $pre ne "" && $next ne "" && ($pre ne '0BEGIN.0' || $next ne '0END.0') && $pre ismatch '[a-zA-Z0-9\\.,]+' && $next ismatch '[a-zA-Z0-9\\.,]+')
32 | #   if ($error in %words && $pre ne "" && $next ne "" && ($pre ne '0BEGIN.0' || $next ne '0END.0') && $pre ismatch '[a-zA-Z0-9\\.,]+' && $next ismatch '[a-zA-Z0-9\\.,]+')
33 |    {
34 |       (@suggests, %scores) = invoke($old_suggest, @_);
35 | 
36 |       if (size(@suggests) > 0 && %seen[@_] is $null)
37 |       {
38 |          println("$pre * $next $+ |" . @suggests[0] . ", $error $+ |" . %scores[@suggests[0]]);
39 |          %seen[@_] = 1;
40 |       }
41 | 
42 |       return @(@suggests, %scores);
43 |    }
44 | 
45 |    return @(@(), %());
46 | }
47 | 
48 | sub checkIt
49 | {
50 |    local('$handle $data');
51 |    $handle = openf($1);
52 |    $data = readb($handle, -1);
53 |    closef($handle);
54 |  
55 |    $data = stripHTML($data);
56 |   
57 |    processDocument($data)
58 | 
59 |    local('@paragraphs $paragraph $sentence');
60 |    @paragraphs = splitByParagraph($data);
61 |    
62 |    foreach $paragraph (@paragraphs)
63 |    {
64 |       foreach $sentence ($paragraph)
65 |       {
66 |          if ($sentence eq "")
67 |          {
68 |             continue;
69 |          }
70 | 
71 |          checkSentenceSpelling(splitIntoWords($sentence), @results => @());
72 |       }
73 |    }
74 | 
75 |    [System gc];
76 | }
77 | 
78 | sub main
79 | {
80 |   # collect list of files.
81 |    [{    
82 |       if (-isDir $1)
83 |       {
84 |          map($this, ls($1));
85 |       }
86 |       else if ("*Image*.html" !iswm $1 && "*User*.html" !iswm $1)
87 |       {
88 |          checkIt($1);
89 |       }
90 |     }: $1];
91 | }
92 | 
93 | invoke(&main, @ARGV);
94 | 


--------------------------------------------------------------------------------
/utils/bigrams/corpus-lex-diff.sl:
--------------------------------------------------------------------------------
  1 | #
  2 | # Analyze a text file containing raw text data and show the top words not in the current wordlist data
  3 | #
  4 | #
  5 | 
  6 | sub loadWordlists
  7 | {
  8 |    if (-isDir $1)
  9 |    {
 10 |       map($this, ls($1));
 11 |    }
 12 |    else
 13 |    {
 14 |       loadWordlist($1, \%wordlist);
 15 |    }
 16 | }
 17 | 
 18 | sub loadWordlist
 19 | {
 20 |    local('$handle $word');
 21 |    $handle = openf($1);
 22 |    map(lambda({ %wordlist[$1] = 1; }, \%wordlist), split("\n", readb($handle, -1)));
 23 |    closef($handle);
 24 | }
 25 | 
 26 | sub wordlists
 27 | {
 28 |    this('$dictionary');
 29 |    if ($dictionary is $null)
 30 |    {
 31 |       $dictionary = %();
 32 |       [lambda(&loadWordlists, %wordlist => $dictionary) : "data/wordlists"];
 33 | 
 34 |       # add punctuation chars here
 35 | 
 36 | #      warn("Loaded: " . size($dictionary) . " words");
 37 | 
 38 |       $dictionary[","] = 1; # make sure commas are in the wordlist
 39 |    }
 40 |    return $dictionary;
 41 | }
 42 | 
 43 | #
 44 | # tool to build a corpus.  <3
 45 | #
 46 | 
 47 | debug(7 | 34);
 48 | 
 49 | sub process
 50 | {
 51 |    local('@words $head $next');
 52 | 
 53 |    @words = splitIntoWords($1);
 54 | 
 55 |    while (size(@words) > 1)
 56 |    {
 57 |       ($next) = @words;
 58 |  
 59 |       if ($next !in %wordlists && lc($next) !in %wordlists && !-isnumber $next)
 60 |       {
 61 |          %nots[$next] += 1;
 62 |       }
 63 | 
 64 |       @words = sublist(@words, 1);
 65 |    }
 66 | }
 67 | 
 68 | sub processFile
 69 | {
 70 |    local('$handle $key $data $text @paragraphs');
 71 | 
 72 |    # read in our corpus.
 73 |    $handle = openf($1);
 74 |    $text   = replace(readb($handle, -1), '<[^>]*?>', '');
 75 |    closef($handle);
 76 | 
 77 |    # start processing it?!?
 78 |    @paragraphs = splitByParagraph($text);
 79 |    map({ map({ map(&process, splitIntoClauses($1)); }, $1); }, @paragraphs);
 80 | }
 81 | 
 82 | sub main
 83 | {
 84 |    global('%wordlists %dictionary @files %current %nots');
 85 | 
 86 |    include("lib/nlp.sl");
 87 |    include("lib/dictionary.sl");
 88 | 
 89 |    %wordlists  = wordlists();
 90 | 
 91 |    processFile(@ARGV[0]);
 92 | 
 93 |    local('@words $word');
 94 | 
 95 |    # sort everything...
 96 | 
 97 |    @words = sort({ return %nots[$2] <=> %nots[$1]; }, filter(lambda({ return iff($min == 0 || %nots[$1] > $min, $1); }, $min => $2), keys(%nots)));
 98 | 
 99 |    foreach $word (@words)
100 |    {
101 |       if (($2 == 0 || %nots[$word] > $2))
102 |       {
103 |          if ($3 eq "")
104 |          {
105 |              println("$[50]word ... " . %nots[$word]);
106 |          }
107 |          else
108 |          {
109 |              println($word);
110 |          }
111 |       }
112 |    }
113 | }
114 | 
115 | invoke(&main, @ARGV);
116 | 


--------------------------------------------------------------------------------
/utils/common/spellcontext.sl:
--------------------------------------------------------------------------------
 1 | #
 2 | # test out spelling with associated context information
 3 | #
 4 | 
 5 | sub suggestTest
 6 | {
 7 |    local('$suspect $dict $previous $next @suggestions $f');
 8 |    ($suspect, $dict, $previous, $next) = @_;
 9 | 
10 |    @suggestions = %edits[$suspect];
11 | 
12 |    if ($correct in @suggestions)
13 |    {
14 |       foreach $f (@functions)
15 |       {
16 |          [$f : $suspect, $correct, copy(@suggestions), $previous, $next];
17 |       }
18 |     #  warn("Done for $previous $suspect $next -> $correct");
19 |    }
20 | 
21 |    return @();
22 | }
23 | 
24 | sub testCorrectionsContext
25 | {
26 |    local('$score $entry $sentence $correct $wrongs @results @words $rule $wrong $previous $next $func');
27 | 
28 |    while $entry (sentences($1))
29 |    {
30 |       ($sentence, $correct, $wrongs) = $entry;
31 |       ($previous, $next) = split(' \\* ', $sentence);
32 |       $func = lambda(&suggestTest, \$score, \$correct, @functions => sublist(@_, 1));
33 | 
34 |       #
35 |       # check for a false negative
36 |       #
37 |       foreach $wrong ($wrongs)
38 |       {
39 |          [$func: $wrong, $dictionary, $previous, $next]
40 |       } 
41 |    }
42 | }
43 | 
44 | sub checkAnyHomophone
45 | {
46 |    return invoke(&checkAnyHomophone2, @_, parameters => %(\$criteriaf))[0];
47 | }
48 | 
49 | sub checkAnyHomophone2
50 | {
51 |    local('$current $options $pre $next %scores $criteriaf @results $option $hnetwork $tags $pre2 $next2');
52 |    ($hnetwork, $current, $options, $pre, $next, $tags, $pre2, $next2) = @_;
53 | 
54 |    # setup the criteria function
55 | #   $criteriaf = criteria(@("pref", "postf", "probability"));
56 | 
57 | #   $options = filter(lambda({ return iff(Pbigram1($pre, $1) > 0.0 || Pbigram2($1, $next) > 0.0, $1); }, \$pre, \$next), $options);
58 | 
59 |    # score the options
60 |    foreach $option ($options)    
61 |    {
62 | #      warn(@_ . " -> " . [$criteriaf: $current, $option, $options, $pre, $next, $tags]);
63 |       %scores[$option] = [$hnetwork getresult: [$criteriaf: $current, $option, $options, $pre, $next, $tags, $pre2, $next2]]["result"];
64 |       if ($option eq $current)              
65 |       {
66 | #         warn(Pword($current));
67 |          %scores[$option] *= 10.0; # * (1.0 - (Pword($current) * 2500));
68 |       }
69 |    }   
70 | 
71 |    # filter out any unacceptable words
72 |    @results = filter(lambda({ return iff(%scores[$1] >= %scores[$current] && $1 ne $current && %scores[$1] > 0.0, $1, $null); }, \%scores, \$current), $options);
73 | 
74 |    # sort the remaining results (probably only one left at this point)
75 |    @results = sort(lambda({ return %scores[$2] <=> %scores[$1]; }, \%scores), @results);
76 | 
77 |    if (size(@results) > 0)
78 |    {
79 |      # warn("checkHomophone: " . @_ . " -> " . @results);
80 |      # warn("                " . %scores);
81 |    }
82 | 
83 |    # return the results
84 |    return @(@results, %scores);
85 | }
86 | 


--------------------------------------------------------------------------------
/utils/rules/testr.sl:
--------------------------------------------------------------------------------
  1 | # 
  2 | # This is a script to test the rules out.  It's fun stuff.
  3 | #
  4 | # java -jar utils/rules/testr.sl <rule file> <sentences file>
  5 | #
  6 | # <rule file> format:
  7 | #
  8 | # rule..|[key=value|...]
  9 | #
 10 | # note that key=value are parsed and dumped into a hash.  This information is used by the system to 
 11 | # filter out false positives and stuff.
 12 | #
 13 | 
 14 | include("lib/engine.sl");
 15 | include("utils/rules/rules.sl");
 16 | 
 17 | sub checkSentenceSpelling
 18 | {
 19 | }
 20 | 
 21 | setf('&score', let({
 22 |   local('$value');
 23 |   $value = invoke($oldf, @_);
 24 |   warn("Looking at: " . join("|", @_) . " = " . $value);
 25 |   return $value;
 26 | }, $oldf => &score));
 27 | 
 28 | sub initAll
 29 | {
 30 |    global('$__SCRIPT__ $model $rules $dictionary $network $dsize %edits $hnetwork $account $usage $endings $lexdb $trigrams $verbs');
 31 |    $model      = get_language_model();
 32 |    $dictionary = dictionary();
 33 |    $dsize      = size($dictionary);
 34 |    $hnetwork   = get_network("hnetwork4.bin");
 35 |    $verbs      = loadVerbData();
 36 |    initTaggerModels();
 37 | }
 38 | 
 39 | sub main
 40 | {
 41 |    local('$handle $sentence @results @past');
 42 | 
 43 |    initAll();
 44 | 
 45 |    if (function("& $+ $1") !is $null)
 46 |    {
 47 |       $rules = machine();
 48 |       invoke(function("& $+ $1"));
 49 |    }
 50 |    else
 51 |    {
 52 |       $rules = loadRules(machine(), $1, %());
 53 |    }
 54 | 
 55 |    # processSentence now expects $rules to be an array of rule packages
 56 |    $rules = @( $rules );
 57 | 
 58 |    $handle = openf($2);
 59 |    while $sentence (readln($handle))
 60 |    {
 61 |       @results = @();
 62 |       processSentence(\$sentence, \@results);
 63 | 
 64 |       @past = copy(@results);
 65 | 
 66 |       if (size(@past) > 0)
 67 |       {
 68 |           println($sentence);
 69 |           println(taggerToString(taggerWithTrigrams(splitIntoWords($sentence))));
 70 |           foreach $index => $r (@past)
 71 |           {
 72 |              local('$rule $text $path $context @suggestions');
 73 |              ($rule, $text, $path, $context, @suggestions) = $r;
 74 | 
 75 |              if ($r in @results)
 76 |              {
 77 |                 println("   $index $+ ) [ACCEPT] $context $+ , $text -> " . @suggestions);
 78 |              }
 79 |              else
 80 |              {
 81 |                 println("   $index $+ ) [REJECT] $context $+ , $text -> " . @suggestions);
 82 |              }
 83 | 
 84 |              foreach $key => $value ($rule)
 85 |              {
 86 |                  println("        $[10]key => $value");
 87 |              }
 88 |           }
 89 |       }
 90 |       else
 91 |       {
 92 | #          println("NOT FOUND");
 93 | #          println($sentence);
 94 | #          println(taggerToString(taggerWithTrigrams(splitIntoWords($sentence))));
 95 |       }
 96 |    }
 97 | } 
 98 | 
 99 | invoke(&main, @ARGV);
100 | 


--------------------------------------------------------------------------------
/data/rules/grammar/det_agreement:
--------------------------------------------------------------------------------
 1 | These|Those is::word=This \1::filter=none    
 2 | These|Those was::word=This \1::filter=none   
 3 | These|Those is .*/NNS::word=\0 are \2::filter=none
 4 | These|Those was .*/NNS::word=\0 were \2::filter=none
 5 | These|Those is .*/JJ .*/NNS::word=\0 are \2 \3::filter=none
 6 | These|Those was .*/JJ .*/NNS::word=\0 were \2 \3::filter=none
 7 | 
 8 | This are .*/NNS::word=These \1 \2::filter=none     
 9 | This were .*/NNS::word=Those \1 \2::filter=none    
10 | This are .*/JJ .*/NNS::word=These \1 \2 \3::filter=none     
11 | This were .*/JJ .*/NNS::word=Those \1 \2 \3::filter=none    
12 | This are::word=This is::filter=none      
13 | This were::word=This was::filter=none  
14 | 
15 | # rules for there
16 | 
17 | there|There is none::filter=kill
18 | there|There are none|but|today|plenty|way::filter=kill
19 | 
20 | there|There is .*/NNS of .*/NN|VBG::filter=kill
21 | there|There are .*/NN of .*/NNS|VBG|JJ::filter=kill
22 | there|There are .*/NN of .*/NN .*/NNS|VBG::filter=kill
23 | there|There are .*/NN .*/NNS::filter=kill
24 | there|There are .*/NN .*/NN .*/NNS::filter=kill
25 | there|There are .*/NN too many::filter=kill
26 | 
27 | #there/EX are/VBP plenty/NN of/IN advantages/NNS to/TO
28 | 
29 | # according to http://ask.metafilter.com/84536/There-is-or-There-are
30 | # I should use the closest noun to determine is/are.  So these rules are not
31 | # needed.  Just the same I'm commenting them out for future reference.
32 | #there|there is .*/NN and .*/NN::word=\0 are \2 \3 \4::filter=none
33 | #there|There are .*/NN and .*/NN::filter=kill
34 | 
35 | there|There are .*/NN::word=\0 is \2::pivots=\1,is
36 | there|There is .*/NNS::word=\0 are \2::pivots=\1,are
37 | there|There is .*/NN .*/NNS::word=\0 are \2 \3::pivots=\1,are
38 | 
39 | 
40 | 
41 | 
42 | 
43 | 
44 | 
45 | 
46 | 
47 | 
48 | there's|There's none::filter=kill
49 | there's|There's none|but|today|plenty::filter=kill
50 | 
51 | there's|There's .*/NNS of .*/NN|VBG::filter=kill
52 | there're|There're .*/NN of .*/NNS|VBG|JJ::filter=kill
53 | there're|There're .*/NN of .*/NN .*/NNS|VBG::filter=kill
54 | there're|There're .*/NN .*/NNS::filter=kill
55 | there're|There're .*/NN .*/NN .*/NNS::filter=kill
56 | there're|There're .*/NN too many::filter=kill
57 | 
58 | #there/EX are/VBP plenty/NN of/IN advantages/NNS to/TO
59 | 
60 | # according to http://ask.metafilter.com/84536/There-is-or-There-are
61 | # I should use the closest noun to determine is/are.  So these rules are not
62 | # needed.  Just the same I'm commenting them out for future reference.
63 | #there|there is .*/NN and .*/NN::word=\0 are \2 \3 \4::filter=none
64 | #there|There are .*/NN and .*/NN::filter=kill
65 | 
66 | There're .*/NN::word=There's \1::pivots=\0,There's
67 | there're .*/NN::word=there's \1::pivots=\0,there's
68 | 
69 | There's .*/NNS::word=There are \1::pivots=\0,There are
70 | there's .*/NNS::word=there are \1::pivots=\0,there are
71 | 
72 | There's .*/NN .*/NNS::word=There are \1 \2::pivots=\0,There are
73 | there's .*/NN .*/NNS::word=there are \1 \2::pivots=\0,there are
74 | 


--------------------------------------------------------------------------------
/data/rules/grammar/repeats:
--------------------------------------------------------------------------------
 1 | #
 2 | # some repeated words, makes no sense.
 3 | #
 4 | 
 5 | you'll will::word=you will::filter=none
 6 | You'll will::word=You will::filter=none
 7 | I'll will::word=I will::filter=none
 8 | we'll will::word=we will::filter=none
 9 | We'll will::word=We will::filter=none
10 | they'll will::word=they will::filter=none
11 | They'll will::word=They will::filter=none
12 | She'll will::word=She will::filter=none
13 | she'll will::word=she will::filter=none
14 | He'll will::word=He will::filter=none
15 | he'll will::word=he will::filter=none
16 | 
17 | aren't not::word=are not::filter=none
18 | didn't not::word=did not::filter=none
19 | don't not::word=do not::filter=none
20 | isn't not::word=is not::filter=none
21 | can't not::word=can not::filter=none
22 | weren't not::word=were not::filter=none
23 | wouldn't not::word=would not::filter=none
24 | doesn't not::word=does not::filter=none
25 | hasn't not::word=has not::filter=none
26 | couldn't not::word=could not::filter=none
27 | 
28 | Aren't not::word=Are not::filter=none
29 | Didn't not::word=Did not::filter=none
30 | Don't not::word=Do not::filter=none
31 | Isn't not::word=Is not::filter=none
32 | Can't not::word=Can not::filter=none
33 | Weren't not::word=Were not::filter=none
34 | Wouldn't not::word=Would not::filter=none
35 | Doesn't not::word=Does not::filter=none
36 | Hasn't not::word=Has not::filter=none
37 | Couldn't not::word=Could not::filter=none
38 | 
39 | it's is::word=it is::filter=none
40 | It's is::word=It is::filter=none
41 | That's is::word=That is::filter=none
42 | that's is::word=that is::filter=none
43 | there's is::word=there's is::filter=none
44 | There's is::word=There's is::filter=none
45 | he's is::word=he is::filter=none
46 | He's is::word=He is::filter=none
47 | she's is::word=she is::filter=none
48 | She's is::word=She is::filter=none
49 | who's is::word=who is::filter=none
50 | Who's is::word=Who is::filter=none
51 | 
52 | we're are::word=We are::filter=none
53 | you're are::word=You are::filter=none
54 | they're are::word=They are::filter=none
55 | We're are::word=We are::filter=none
56 | You're are::word=You are::filter=none
57 | They're are::word=They are::filter=none
58 | Who're are::word=Who are::filter=none
59 | who're are::word=who are::filter=none
60 | 
61 | I'm am::word=I am::filter=none
62 | I've have::word=I have::filter=none
63 | 
64 | you've have::word=you have::filter=none
65 | we've have::word=we have::filter=none
66 | they've have::word=they have::filter=none
67 | 
68 | You've have::word=You have::filter=none
69 | We've have::word=We have::filter=none
70 | They've have::word=They have::filter=none
71 | 
72 | I'd would::word=I would::filter=none
73 | 
74 | he'd would::word=he would::filter=none
75 | she'd would::word=she would::filter=none
76 | you'd would::word=you would::filter=none
77 | we'd would::word=we would::filter=none
78 | they'd would::word=they would::filter=none
79 | 
80 | He'd would::word=He would::filter=none
81 | She'd would::word=She would::filter=none
82 | You'd would::word=You would::filter=none
83 | We'd would::word=We would::filter=none
84 | They'd would::word=They would::filter=none
85 | 


--------------------------------------------------------------------------------
/utils/spelldata/gen2.sl:
--------------------------------------------------------------------------------
  1 | #
  2 | # process through corpus, our goal is to associate all misspelt words with a sentence.
  3 | #
  4 | # java -jar sleep.jar gen.sl corpus_file wordfile filetowriteto
  5 | #
  6 | # wordfile must be in bad\ngood\n order
  7 | #
  8 | 
  9 | debug(7 | 34);
 10 | 
 11 | sub getthree
 12 | {
 13 |    local('@words');
 14 |    @words = copy($1);
 15 |    add(@words, '0BEGIN.0');
 16 |    push(@words, '0END.0');
 17 | 
 18 |    while (size(@words) >= 3)
 19 |    {
 20 |       yield sublist(@words, 0, 3);
 21 |       @words = sublist(@words, 1);
 22 |    }
 23 | }
 24 | 
 25 | sub process
 26 | {
 27 |    local('@words $entry $previous $current $next');
 28 | 
 29 |    $1 = [$1 trim];
 30 |    if ($1 !ismatch '[A-Z][A-Za-z\' ]*?[\.\?\!]')
 31 |    {
 32 |       return;
 33 |    }
 34 | 
 35 |    @words = splitIntoWords($1);
 36 | 
 37 |    while $entry (getthree(@words))
 38 |    {
 39 |       ($previous, $current, $next) = $entry;
 40 | 
 41 |       if (%words[$current] !is $null && %dictionary[$previous] !is $null && %dictionary[$next] !is $null && %counts[$current] < 1)
 42 |       {
 43 |          println($output, "$previous * $next $+ |" . join(", ", @($current, rand(%dataset[$current]))) );
 44 |          %counts[$current] += 1;
 45 |       }
 46 |    }
 47 | }
 48 | 
 49 | sub processFile
 50 | {
 51 |    local('$handle $key $data $text @paragraphs');
 52 | 
 53 |    # read in our corpus.
 54 |    $handle = openf($1);
 55 |    $text   = replace(readb($handle, -1), '<[^>]*?>', '');
 56 |    closef($handle);
 57 | 
 58 |    # start processing it?!?
 59 |    @paragraphs = splitByParagraph($text);
 60 |    map({ map(&process, $1); }, @paragraphs);
 61 | 
 62 |    #warn("Processed $1 $+ !");
 63 | }
 64 | 
 65 | sub main
 66 | {
 67 |    global('%dataset $goal %words %counts');
 68 | 
 69 |    # load the words we're interested in.
 70 |    local('$handle $text $good');
 71 | 
 72 |       $handle = openf($2);
 73 |       while $text (readln($handle))
 74 |       {
 75 |          $good = readln($handle);
 76 | 
 77 |          if (%dataset[$good] is $null) { %dataset[$good] = @(); }
 78 |          push(%dataset[$good], $text); 
 79 |          %words[$good] += 1;
 80 |       }
 81 |       closef($handle);
 82 | 
 83 |    $goal = size(%dataset);
 84 | 
 85 |    # setup our file that we're going to dump the output to.
 86 |    global('$output');
 87 |    $output = openf("> $+ $3");
 88 |    
 89 |    # ok go through all the junk parsing through the files.
 90 | 
 91 |    include("lib/nlp.sl");
 92 |    include("lib/dictionary.sl");
 93 |    global('%dictionary');
 94 |    %dictionary = dictionary();
 95 |    %dictionary["0BEGIN.0"] = 1;
 96 |    %dictionary["0END.0"] = 1;
 97 | 
 98 |    # collect list of files.
 99 |    [{
100 |       if (-isDir $1)
101 |       {
102 |          map($this, ls($1));
103 |       }
104 |       else if ("*Image*.html" !iswm $1 && "*User*.html" !iswm $1)
105 |       {
106 |          processFile($1);
107 |       }
108 |     }: $1];
109 | 
110 | 
111 |    closef($output);
112 |    println("Done!");
113 | }
114 | 
115 | assert size(@ARGV) == 3 : "java -jar sleep.jar corpus_data wordlist outputfile";
116 | 
117 | invoke(&main, @ARGV);
118 | 


--------------------------------------------------------------------------------
/utils/spelldata/gen3.sl:
--------------------------------------------------------------------------------
  1 | #
  2 | # process through corpus, our goal is to associate all misspelt words with a sentence.
  3 | #
  4 | # java -jar sleep.jar gen.sl corpus_file wordfile filetowriteto
  5 | #
  6 | # wordfile must be in bad\ngood\n order
  7 | #
  8 | 
  9 | debug(7 | 34);
 10 | 
 11 | sub getthree
 12 | {
 13 |    local('@words');
 14 |    @words = copy($1);
 15 |    add(@words, '0BEGIN.0');
 16 |    push(@words, '0END.0');
 17 | 
 18 |    while (size(@words) >= 3)
 19 |    {
 20 |       yield sublist(@words, 0, 3);
 21 |       @words = sublist(@words, 1);
 22 |    }
 23 | }
 24 | 
 25 | sub process
 26 | {
 27 |    local('@words $entry $previous $current $next');
 28 | 
 29 |    $1 = [$1 trim];
 30 |    if ($1 !ismatch '[A-Z][A-Za-z\' ]*?[\.\?\!]')
 31 |    {
 32 |       return;
 33 |    }
 34 | 
 35 |    @words = splitIntoWords($1);
 36 | 
 37 |    while $entry (getthree(@words))
 38 |    {
 39 |       ($previous, $current, $next) = $entry;
 40 | 
 41 |       if (%words[$current] !is $null && %dictionary[$previous] !is $null && %dictionary[$next] !is $null && %counts[$current] < 10)
 42 |       {
 43 |          println($output, "$previous * $next $+ |" . join(", ", concat($current, %dataset[$current])) );
 44 |          %counts[$current] += 1;
 45 |       }
 46 |    }
 47 | }
 48 | 
 49 | sub processFile
 50 | {
 51 |    local('$handle $key $data $text @paragraphs');
 52 | 
 53 |    # read in our corpus.
 54 |    $handle = openf($1);
 55 |    $text   = replace(readb($handle, -1), '<[^>]*?>', '');
 56 |    closef($handle);
 57 | 
 58 |    # start processing it?!?
 59 |    @paragraphs = splitByParagraph($text);
 60 |    map({ map(&process, $1); }, @paragraphs);
 61 | 
 62 |    #warn("Processed $1 $+ !");
 63 | }
 64 | 
 65 | sub main
 66 | {
 67 |    global('%dataset $goal %words %counts');
 68 | 
 69 |    # load the words we're interested in.
 70 |    local('$handle $text $good');
 71 | 
 72 |       $handle = openf($2);
 73 |       while $text (readln($handle))
 74 |       {
 75 |          $good = readln($handle);
 76 | 
 77 |          if (%dataset[$good] is $null) { %dataset[$good] = @(); }
 78 |          push(%dataset[$good], $text); 
 79 |          %words[$good] += 1;
 80 |       }
 81 |       closef($handle);
 82 | 
 83 |    $goal = size(%dataset);
 84 | 
 85 |    # setup our file that we're going to dump the output to.
 86 |    global('$output');
 87 |    $output = openf("> $+ $3");
 88 |    
 89 |    # ok go through all the junk parsing through the files.
 90 | 
 91 |    include("lib/nlp.sl");
 92 |    include("lib/dictionary.sl");
 93 |    global('%dictionary');
 94 |    %dictionary = dictionary();
 95 |    %dictionary["0BEGIN.0"] = 1;
 96 |    %dictionary["0END.0"] = 1;
 97 | 
 98 |    # collect list of files.
 99 |    [{
100 |       if (-isDir $1)
101 |       {
102 |          map($this, ls($1));
103 |       }
104 |       else if ("*Image*.html" !iswm $1 && "*User*.html" !iswm $1)
105 |       {
106 |          processFile($1);
107 |       }
108 |     }: $1];
109 | 
110 | 
111 |    closef($output);
112 |    println("Done!");
113 | }
114 | 
115 | assert size(@ARGV) == 3 : "java -jar sleep.jar corpus_data wordlist outputfile";
116 | 
117 | invoke(&main, @ARGV);
118 | 


--------------------------------------------------------------------------------
/utils/tagger/postest.sl:
--------------------------------------------------------------------------------
  1 | #
  2 | # test the tagger
  3 | #
  4 | 
  5 | debug(debug() | 7 | 34);
  6 | 
  7 | include("lib/tagger.sl");
  8 | initTaggerModels();
  9 | 
 10 | sub both
 11 | {
 12 |    local('$a $b');
 13 |    ($a, $b) = @_;
 14 |    while (size($a) > 0 && size($b) > 0)
 15 |    {
 16 |       yield @($a[0], $b[0]);
 17 |       $a = sublist($a, 1);
 18 |       $b = sublist($b, 1);
 19 |    }
 20 | }
 21 | 
 22 | sub tests
 23 | {
 24 |    local('$lexicon $handle $count $score $line $item $word $tag $f $compare $taggit $opt $count $word $tag');
 25 | 
 26 |    $handle = openf(@ARGV[0]);
 27 |    while $line (readln($handle))
 28 |    {
 29 |       $compare = map({ return split('/', $1)[0]; }, split(' ', $line));
 30 | 
 31 |       foreach $f (@_)
 32 |       {
 33 |          $taggit  = taggerToString([$f tag: $compare]);
 34 | 
 35 |          while $opt (both(split(' ', $line), split(' ', $taggit)))
 36 |          {
 37 |             ($word, $tag) = split('/', $opt[0]);
 38 | 
 39 |             if ($word in $lexdb)
 40 |             {
 41 |                if ($opt[0] eq $opt[1])
 42 |                {
 43 |                   [$f scoreK];
 44 |                }
 45 |                [$f countK];
 46 |             }
 47 |             else
 48 |             {
 49 |                if ($opt[0] eq $opt[1])
 50 |                {
 51 |                   [$f scoreU];
 52 |                }
 53 |                [$f countU];
 54 |             }
 55 |          }
 56 |       }        
 57 | 
 58 |       $count++;
 59 | #      if (($count % 2500) == 0 && $count > 0)
 60 | #      {
 61 | #          foreach $f (@_)
 62 | #          {
 63 | #             [$f print];
 64 | #          }
 65 | #          println("$[-20]count");
 66 | #      }
 67 |    }
 68 | 
 69 |    foreach $f (@_)
 70 |    {
 71 |       [$f print];
 72 |    }
 73 | }
 74 | 
 75 | sub test
 76 | {
 77 |    return lambda(
 78 |    {
 79 |       if ($0 eq "tag")
 80 |       {
 81 |          return invoke($function, @_);
 82 |       }
 83 |       else if ($0 eq "scoreK")
 84 |       {
 85 |          $scoreK += 1;
 86 |       }
 87 |       else if ($0 eq "countK")
 88 |       {
 89 |          $countK += 1;
 90 |       }
 91 |       else if ($0 eq "scoreU")
 92 |       {
 93 |          $scoreU += 1;
 94 |       }
 95 |       else if ($0 eq "countU")
 96 |       {
 97 |          $countU += 1;
 98 |       }
 99 |       else if ($0 eq "print")
100 |       {
101 |          println("test: $description = known: " . ($scoreK / $countK) . " unknown: " . ($scoreU / $countU) . " composite: " . (($scoreK + $scoreU) / ($countK + $countU)));
102 |       }
103 |    }, $function => $2, $description => $1, $scoreK => 0.0, $countK => 0.0, $scoreU => 0.0, $countU => 0.0);
104 | }
105 | 
106 | tests(
107 | #  test("pytagger", &taggerPython),
108 | #  test("brill-light", &taggerLikeBrill),
109 |   test("trigrams", &taggerWithTrigrams),
110 |   test("lexprob", &taggerWithLexProb),
111 | #   test("trigrams w/ neural", &taggerWithNeuralTrigrams),
112 | #  test("trigrams w/ fix", &taggerWithTrigramsFix),
113 | #  test("trigrams - no fixes", &taggerWithTrigrams2),
114 | #  test("random", &taggerRandom)
115 | #   test("HMM", &taggerHMM)
116 | );
117 | 


--------------------------------------------------------------------------------
/utils/spell/definitions.sl:
--------------------------------------------------------------------------------
  1 | #
  2 | # this script creates a dictionary definitions file for AtD from the raw text of the public
  3 | # domain OPTED dictionary (Online Plain Text English Dictionary)
  4 | #
  5 | # Available at: http://msowww.anu.edu.au/~ralph/OPTED/
  6 | #
  7 | # Depends on:
  8 | #   data/rules/homophonedb.txt (list of words we want to create def file for)
  9 | #   
 10 | # Outputs to:
 11 | #   data/rules/definitions.txt (a word<tab>definition file)
 12 | 
 13 | debug(7 | 34);
 14 | 
 15 | sub loadWords
 16 | {
 17 |    local('$handle $words $text $word $def');
 18 |    $handle = openf("data/rules/homophonedb.txt");
 19 |    $words = split(',\s+', join(", ",readAll($handle)));
 20 |    closef($handle);
 21 | 
 22 |    $handle = openf("data/rules/homo/definitions.txt");
 23 |    while $text (readln($handle))
 24 |    {
 25 |       ($word, $def) = split('\t+', $text);
 26 |       push($words, $word);
 27 |       %alts[$word] = $def;
 28 |    }
 29 |    closef($handle);
 30 | 
 31 |    map({ $dictionary[$1] = 1; }, sort({ return lc($1) cmp lc($2); }, $words));
 32 | }
 33 | 
 34 | sub suckUpDictFile
 35 | {
 36 |    local('$handle $text $word $pos $definition $check');
 37 |    $handle = openf($1);
 38 |    while $text (readln($handle))
 39 |    {
 40 |       if ($text ismatch '<P><B>(.*?)</B> \(<I>(.*?)</I>\) (.*?)</P>')
 41 |       {
 42 |          ($word, $pos, $definition) = matched();
 43 |          if ("See*" iswm $definition || "Alt. of*" iswm $definition || "pl. of" iswm $definition || "of *" iswm $definition)
 44 |          {
 45 |             continue;
 46 |          }
 47 | 
 48 |          if ($word in $dictionary && strlen($dictionary[$word]) == 1)
 49 |          {
 50 |             $dictionary[$word] = $definition;
 51 |          }
 52 |          if (lc($word) in $dictionary && strlen($dictionary[lc($word)]) == 1)
 53 |          {
 54 |             $dictionary[lc($word)] = $definition;
 55 |          }
 56 | 
 57 |          $check = lc($word) . "s";
 58 |          if ($check in $dictionary && strlen($dictionary[$check]) == 1)
 59 |          {
 60 |             $dictionary[$check] = "Plural of " . lc($word) . ". " . $definition;
 61 |          }
 62 |       }
 63 |    }
 64 | 
 65 |    closef($handle);
 66 | }
 67 | 
 68 | 
 69 | sub main
 70 | {
 71 |   global('$dictionary %alts');
 72 |   $dictionary = ohash();
 73 |   loadWords();
 74 | 
 75 |   [{ 
 76 |      if (-isDir $1)
 77 |      {
 78 |         map($this, ls($1));
 79 |      }
 80 |      else
 81 |      {
 82 |         suckUpDictFile($1);
 83 |      }
 84 |    }: "data/OPTED"];
 85 | 
 86 |    local('$word $definition');
 87 | 
 88 |    foreach $word => $definition ($dictionary)
 89 |    {
 90 |       if ($definition eq "1" || "See*" iswm $definition || "Alt. of*" iswm $definition || "of *" iswm $definition)
 91 |       {
 92 |          [[System err] println: "Substituting: $word = " . %alts[$word]];
 93 |          $definition = uc(charAt(%alts[$word], 0)) . substr(%alts[$word], 1);
 94 |       }
 95 |       else
 96 |       {
 97 |          $definition = split(';', $definition)[0];
 98 |       }
 99 | 
100 |       println("$word $+ \t $+ $definition");
101 |    }
102 | }
103 | 
104 | invoke(&main, @ARGV);
105 | 


--------------------------------------------------------------------------------
/utils/bigrams/buildunigrams.sl:
--------------------------------------------------------------------------------
  1 | #
  2 | # code to load wordlists.
  3 | # we use this here because this code actually builds the corpus.
  4 | #
  5 | # java -jar sleep.jar buildunigrams.sl corpus/ outputfile.bin
  6 | 
  7 | import org.dashnine.preditor.* from: 'lib/spellutils.jar';
  8 | 
  9 | #
 10 | # tool to build a corpus.  <3
 11 | #
 12 | 
 13 | debug(7 | 34);
 14 | 
 15 | sub process
 16 | {
 17 |    local('@words $head $next $previous');
 18 | 
 19 |    @words = splitIntoWords($1);
 20 |    add(@words, '0BEGIN.0', 0);
 21 | 
 22 |    [$model addUnigram: '0BEGIN.0'];
 23 | 
 24 |    while (size(@words) > 1)
 25 |    {
 26 |       ($head, $next) = @words;
 27 |       [$model addUnigram: $next];
 28 |       @words = sublist(@words, 1);
 29 |    }
 30 | 
 31 |    [$model addUnigram: '0END.0'];
 32 | }
 33 | 
 34 | sub processFile
 35 | {
 36 |    local('$handle $key $data $text @paragraphs');
 37 | 
 38 |    # read in our corpus.
 39 |    $handle = openf($1);
 40 |    $text   = stripHTML(join("\n", readAll($handle)));
 41 |    closef($handle);
 42 | 
 43 |    # start processing it?!?
 44 |    @paragraphs = splitByParagraph($text);
 45 |    map({ map(&process, $1); }, @paragraphs);
 46 |    warn("$1 complete");
 47 | }
 48 | 
 49 | sub agent
 50 | {
 51 |    local('$next $key $data $size $ticks $lsize $lang');
 52 | 
 53 |    include("lib/nlp.sl");
 54 | 
 55 |    $lang = systemProperties()["atd.lang"];
 56 |    if ($lang ne "" && -exists "lang/ $+ $lang $+ /load.sl")
 57 |    {
 58 |       include("lang/ $+ $lang $+ /load.sl");
 59 |    }
 60 | 
 61 |    $next = @files[0];
 62 |    removeAt(@files, 0);
 63 |    $size = size(@files);
 64 | 
 65 |    println("ready!");
 66 | 
 67 |    while ($next !is $null)
 68 |    {
 69 |       processFile($next);
 70 |       $next = @files[0];
 71 |       @files = sublist(@files, 1);
 72 |    }
 73 | }
 74 | 
 75 | sub main
 76 | {
 77 |    global('%dictionary @files %homophones $model $lock');
 78 | 
 79 |    local('$handle');
 80 | 
 81 |    if (-exists $2) 
 82 |    {
 83 |       $handle = openf($2);
 84 |       $model = readObject($handle);
 85 |       closef($handle);
 86 |    }
 87 |    else
 88 |    {
 89 |       $model = [new LanguageModel];
 90 |    }
 91 | 
 92 |    # collect list of files.
 93 |    [{
 94 |       if (-isDir $1)
 95 |       {
 96 |          map($this, ls($1));
 97 |       }
 98 |       else if ("*Image*.html" !iswm $1 && "*User*.html" !iswm $1)
 99 |       {
100 |          push(@files, $1);
101 |       }
102 |     }: $1];
103 | 
104 |    local('@agents @store $index $value $threads');
105 | 
106 |    $threads = 8;
107 | 
108 |    @store = @(@(), @(), @(), @(), @(), @(), @(), @());
109 | 
110 |    foreach $index => $value (@files)
111 |    {
112 |       push(@store[$index % $threads], $value);
113 |    }
114 | 
115 |    for ($index = 0; $index < $threads; $index++)
116 |    {
117 |       push(@agents, fork(&agent, @files => copy(@store[$index]), \$model, \%homophones, \%dictionary));
118 |    }
119 | 
120 |    foreach $index => $value (@agents)
121 |    {
122 |       wait($value);
123 |       warn("Agent $index complete");
124 |    }
125 | 
126 |    # save model
127 |    $handle = openf("> $+ $2");
128 |    writeObject($handle, $model);
129 |    closef($handle);
130 | 
131 |    println("Done!");
132 | }
133 | 
134 | invoke(&main, @ARGV);
135 | 


--------------------------------------------------------------------------------
/utils/rules/makeprepositions.sl:
--------------------------------------------------------------------------------
 1 | $handle = openf(@ARGV[0]);
 2 | while $text (readln($handle))
 3 | {
 4 |    ($first, $second, $type) = matches($text, '(\w+), (\w+) : (\w+)\\(.*');
 5 |    if ($type eq 'Pbigram1' && $first ne "wont" && $first ne "continue" && '*ed' !iswm $first && $first ne "attempts")
 6 |    {
 7 |       if ($second eq "to")
 8 |       {
 9 |           if ($first eq "decided")
10 |           {
11 |              println(".*/DT $first stir::filter=kill");
12 |           }
13 |           else if ($first eq "attempt")
14 |           {
15 |              println(".*/DT $first be::filter=kill");
16 |           }
17 |           else if ($first eq "reference")
18 |           {
19 |              println(".*/DT $first have::filter=kill");
20 |           }
21 |           else if ($first eq "wanted" || $first eq "wants" || $first eq "want")
22 |           {
23 |              println(".*/PRP $first help::filter=kill");
24 |              println(".*/NNP $first help::filter=kill");
25 |           }
26 | 
27 |          if (-islower charAt($first, 0))
28 |          {
29 |             println(".*/PRP $first .*/VB::word=\\0 \\1 $second \\2::pivots= $+ $first $+ , $+ $first $second");
30 |             println(".*/NNP $first .*/VB::word=\\0 \\1 $second \\2::pivots= $+ $first $+ , $+ $first $second");
31 |             println(".*/DT $first .*/VB::word=\\0 \\1 $second \\2::pivots= $+ $first $+ , $+ $first $second");
32 |          }
33 |          else
34 |          {
35 |             println("0BEGIN.0 $first .*/VB::word=\\0 $second \\1::pivots= $+ $first $+ , $+ $first $second");
36 |             println("0BEGIN.0 $first .*/VB::word=\\0 $second \\1::pivots= $+ $first $+ , $+ $first $second");
37 |             println("0BEGIN.0 $first .*/VB::word=\\0 $second \\1::pivots= $+ $first $+ , $+ $first $second");
38 |          }
39 |       }
40 |       else if ($second eq "of")
41 |       {
42 |          if ($first eq "couple")
43 |          {
44 |             println(".*/DT $first .*/NN|NNS::word=\\0 \\1 $second \\2::pivots= $+ $first $+ , $+ $first $second");
45 |          }
46 |          else if ($first eq "beware")
47 |          {
48 |             println(".*/DT $first .*/DT .*/NN|NNS::word=\\0 \\1 $second \\2 \\3::pivots= $+ $first $+ , $+ $first $second");
49 |             println(".*/DT $first .*/NN|NNS::word=\\0 \\1 $second \\2::pivots= $+ $first $+ , $+ $first $second");
50 |          }
51 |       }
52 |       else if ($second eq "on" || $second eq "with" || $second eq "in")
53 |       {
54 | #         println("$first .*/DT .*/NN|NNS::word=\\0 $second \\1 \\2::pivots= $+ $first $+ , $+ $first $second");
55 | #         println("$first .*/NN|NNS::word=\\0 $second \\1::pivots= $+ $first $+ , $+ $first $second");
56 |       }
57 |       else if ($second ne "of" && $second ne "to")
58 |       {
59 | #         println("$first $second $+ ::filter=none");
60 |       }
61 |    }
62 |    else if ($type eq 'Pbigram2')
63 |    {
64 | #      println(".*/DT .*/NN $first $+ ::word=\\0 \\1 $second \\2::pivots= $+ $first $+ , $+ $second $first");
65 | #      println(".*/VB $first $+ ::word=\\0 $second \\1::pivots=\\1, $+ $second \\1");
66 | #      println(".*/VBD $first $+ ::word=\\0 $second \\1::pivots=\\1, $+ $second \\1");
67 | #      println(".*/VBD .*/PRP $first $+ ::word=\\0 \\1 $second \\2::pivots= $+ $first $+ , $+ $second $first");
68 |    }
69 | }
70 | 
71 | 


--------------------------------------------------------------------------------
/data/rules/biasdb.txt:
--------------------------------------------------------------------------------
 1 | African people	South Asian peoples
 2 | Dwarves		Dwarfs
 3 | East Indian 	South Asian
 4 | Siamese twins	conjoined twins
 5 | West Indian	Caribbean
 6 | afflicted with	has a disability, has an illness
 7 | amputee			person with an amputation
 8 | black market 	underground economy, deals on the side
 9 | black sheep 	reprobate, backslider
10 | blackball  	ostracize, disapprove, reject
11 | blacklist 	condemn, ostracize, boycott
12 | blackmail 	extort, threaten, demand
13 | businessman 	business person
14 | chairman 	chair, co-ordinator, convenor
15 | chronic mental illness	long-term mental illness, persistent mental illness, psychiatric disability
16 | cleaning woman 	cleaner
17 | clergyman 	clergy, deacon, minister, pastor, priest, rabbi
18 | colored people	Black peoples, people of African descent,
19 | common man 	average person, members of the public
20 | confined to a wheelchair	uses a wheelchair
21 | craftsman 	artisan, craftsperson
22 | crippled	impaired, flawed, disabled
23 | deaf mute	deaf
24 | disabled person		person with a disability
25 | disseminate	broadcast, inform, publicise
26 | dwarves		dwarfs
27 | epileptics		individuals with epilepsy
28 | fair sex 	women
29 | fireman 	firefighter
30 | forefathers 	ancestors
31 | founding fathers	founders
32 | hearing impaired	hard of hearing
33 | housewife 	homemaker
34 | ladies		women
35 | lady		woman
36 | layman		layperson, average person
37 | low man|woman on the totem pole	lowest rung of the ladder
38 | man hours 	working hours
39 | man in the street	public person in the street, public, member of the public
40 | man the \w+s	staff the, handle the
41 | man-made 	synthetic, artificial
42 | mankind 	civilization, humanity, people
43 | manpower 	personnel, staff, staffing requirements, workers, workforce
44 | master copy	top copy, original
45 | master of ceremonies	host, emcee
46 | masterful	domineering, very skilful
47 | mentally ill child|adult|person|boy|girl	person with mental illness, person with psychiatric disability
48 | middleman	wholesaler, go-between, intermediary
49 | mistress of ceremonies	host, emcee
50 | newsman 	journalist, reporter
51 | niggard		miser
52 | niggardly	miserly, stingy
53 | non-whites 	people of colour
54 | old masters	classic art, artists
55 | one man show	one person show
56 | Oriental	Asian
57 | orientals 	Asian peoples, East Asian peoples, Southeast Asian peoples
58 | paraplegics		individuals with paraplegia
59 | physically challenged	physically disabled
60 | policeman 	officer, police officer
61 | postman 	postal worker, mail carrier
62 | primitive societies 	non-industrial societies
63 | retarded adult		adult with mental retardation
64 | right-hand man	assistant
65 | salesman	clerk, sales rep
66 | schizophrenics		people who have schizophrenia
67 | seminal	classical, formative
68 | sexual preference	sexual orientation, gender orientation
69 | spokesman	spokesperson, representative, speaker, official
70 | stewardess	flight attendant
71 | suffering from	has a disability, has an illness
72 | the crippled 	people with a disability
73 | the disabled 	persons with disabiliites, people with disabilities
74 | the handicapped	people with disabilities
75 | the man in the street	people in general
76 | the rights of man	peoples/citizens rights, the rights of the individual
77 | tribes	ethnic groups
78 | wheelchair-bound	uses a wheelchair
79 | wives and children	families, family
80 | workman		worker
81 | 


--------------------------------------------------------------------------------
/data/rules/grammar/determiners:
--------------------------------------------------------------------------------
 1 | # These rules look for missing determiners
 2 | 
 3 | .*/VBP &determiner_wanted for|of|in|to|so|from|with::word=\0 \1:determiner \1 \2, \0 \1:determiner2 \1 \2::pivots=\1,\1:determiner \1,\1:determiner2 \1
 4 | .*/VBP &determiner_wanted 0END.0::word=\0 \1:determiner \1, \0 \1:determiner2 \1::pivots=\1,\1:determiner \1,\1:determiner2 \1
 5 | .*/VBP .*/JJ &determiner_wanted for|of|in|to|so|from|with::word=\0 \1:determiner \1 \2 \3, \0 \1:determiner2 \1 \2 \3::pivots=\1,\1:determiner \1,\1:determiner2 \1
 6 | .*/VBP .*/JJ &determiner_wanted 0END.0::word=\0 \1:determiner \1 \2, \0 \1:determiner2 \1 \2::pivots=\1,\1:determiner \1,\1:determiner2 \1
 7 | 
 8 | .*/VBZ &determiner_wanted for|of|in|to|so|from|with::word=\0 \1:determiner \1 \2, \0 \1:determiner2 \1 \2::pivots=\1,\1:determiner \1,\1:determiner2 \1
 9 | .*/VBZ &determiner_wanted 0END.0::word=\0 \1:determiner \1, \0 \1:determiner2 \1::pivots=\1,\1:determiner \1,\1:determiner2 \1
10 | .*/VBZ .*/JJ &determiner_wanted for|of|in|to|so|from|with::word=\0 \1:determiner \1 \2 \3, \0 \1:determiner2 \1 \2 \3::pivots=\1,\1:determiner \1,\1:determiner2 \1
11 | .*/VBZ .*/JJ &determiner_wanted 0END.0::word=\0 \1:determiner \1 \2, \0 \1:determiner2 \1 \2::pivots=\1,\1:determiner \1,\1:determiner2 \1
12 | 
13 | .*/MD .*/VB &determiner_wanted for|of|in|to|so|from|with::word=\0 \1 \2:determiner \2 \3, \0 \1 \2:determiner2 \2 \3::pivots=\2,\2:determiner \2,\2:determiner2 \2
14 | .*/MD .*/VB &determiner_wanted 0END.0::word=\0 \1 \2:determiner \2, \0 \1 \2:determiner2 \2::pivots=\2,\2:determiner \2,\2:determiner2 \2
15 | .*/MD .*/VB .*/JJ &determiner_wanted for|of|in|to|so|from|with::word=\0 \1 \2:determiner \2 \3 \4, \0 \1 \2:determiner2 \2 \3 \4::pivots=\2,\2:determiner \2,\2:determiner2 \2
16 | .*/MD .*/VB .*/JJ &determiner_wanted 0END.0::word=\0 \1 \2:determiner \2 \3, \0 \1 \2:determiner2 \2 \3::pivots=\2,\2:determiner \2,\2:determiner2 \2
17 | 
18 | .*/PRP .*/VBD &determiner_wanted for|of|in|to|so|from|with::word=\0 \1 \2:determiner \2 \3, \0 \1 \2:determiner2 \2 \3::pivots=\2,\2:determiner \2,\2:determiner2 \2
19 | .*/PRP .*/VBD &determiner_wanted 0END.0::word=\0 \1 \2:determiner \2, \0 \1 \2:determiner2 \2::pivots=\2,\2:determiner \2,\2:determiner2 \2
20 | .*/PRP .*/VBD .*/JJ &determiner_wanted for|of|in|to|so|from|with::word=\0 \1 \2:determiner \2 \3 \4, \0 \1 \2:determiner2 \2 \3 \4::pivots=\2,\2:determiner \2,\2:determiner2 \2
21 | .*/PRP .*/VBD .*/JJ &determiner_wanted 0END.0::word=\0 \1 \2:determiner \2 \3, \0 \1 \2:determiner2 \2 \3::pivots=\2,\2:determiner \2,\2:determiner2 \2
22 | 
23 | .*/PRP be &determiner_wanted::filter=kill
24 | .*/PRP .*/VB &determiner_wanted for|of|in|to|so|from|with::word=\0 \1 \2:determiner \2 \3, \0 \1 \2:determiner2 \2 \3::pivots=\2,\2:determiner \2,\2:determiner2 \2
25 | .*/PRP .*/VB &determiner_wanted 0END.0::word=\0 \1 \2:determiner \2, \0 \1 \2:determiner2 \2::pivots=\2,\2:determiner \2,\2:determiner2 \2
26 | .*/PRP .*/VBP\,RB .*/VB &determiner_wanted for|of|in|to|so|from|with::word=\0 \1 \2 \3:determiner \3 \4, \0 \1 \2 \3:determiner2 \3 \4::pivots=\3,\3:determiner \3,\3:determiner2 \3
27 | .*/PRP .*/VBP\,RB .*/VB &determiner_wanted 0END.0::word=\0 \1 \2 \3:determiner \3, \0 \1 \2 \3:determiner2 \3::pivots=\3,\3:determiner \3,\3:determiner2 \3
28 | .*/PRP .*/VBP &determiner_wanted .*ing::word=\0 \1 \2:determiner \2 \3, \0 \1 \2:determiner2 \2 \3::pivots=\2,\2:determiner \2,\2:determiner2 \2
29 | 


--------------------------------------------------------------------------------
/utils/spelldata/gen.sl:
--------------------------------------------------------------------------------
  1 | #
  2 | # process through corpus, our goal is to associate all misspelt words with a sentence.
  3 | #
  4 | # java -jar sleep.jar gen.sl corpus_file wordfile filetowriteto
  5 | #
  6 | # wordfile must be in bad\ngood\n order
  7 | #
  8 | 
  9 | debug(7 | 34);
 10 | 
 11 | sub process
 12 | {
 13 |    local('@words $head $next $count $candidate $prev $indict');
 14 | 
 15 |    $1 = [$1 trim];
 16 |    if ($1 !ismatch '[A-Z][A-Za-z\' ]*?[\.\?\!]')
 17 |    {
 18 |       return;
 19 |    }
 20 | 
 21 |    if ("we're" isin $1 || "they're" isin $1 || "it's" isin $1)
 22 |    {
 23 |       warn("Could be? $1");
 24 |    }
 25 | 
 26 |    @words = splitIntoWords($1);
 27 |    $count = 0;
 28 | 
 29 |    # make sure there is only one misspelling in this sentence.
 30 |    foreach $word (@words)
 31 |    {
 32 |       if (%words[$word] !is $null)
 33 |       {
 34 |          $candidate = $word;
 35 |          $count++;
 36 |       }
 37 | 
 38 |       if (%dictionary[$word] is $null)
 39 |       {
 40 |          $indict++;
 41 |       }
 42 |    }
 43 | 
 44 |    if ($count == 1 && size(@words) >= 3 && %counts[$candidate] < 10 && $indict == 0)
 45 |    {
 46 |       $change = replace($1, "\\b $+ $candidate $+ \\b", '*');   
 47 | 
 48 |       println($output, "$change $+ |" . join(", ", concat(@($candidate), %dataset[$candidate]) ));
 49 |       %counts[$candidate] += 1;
 50 |    }
 51 |    else if ("we're" isin $1 || "they're" isin $1 || "it's" isin $1)
 52 |    {
 53 |       warn("Could be? $1 - Nope: $count and " . %counts[$candidate] . " and $indict");
 54 |    }
 55 | }
 56 | 
 57 | sub processFile
 58 | {
 59 |    local('$handle $key $data $text @paragraphs');
 60 | 
 61 |    # read in our corpus.
 62 |    $handle = openf($1);
 63 |    $text   = replace(readb($handle, -1), '<[^>]*?>', '');
 64 |    closef($handle);
 65 | 
 66 |    # start processing it?!?
 67 |    @paragraphs = splitByParagraph($text);
 68 |    map({ map(&process, $1); }, @paragraphs);
 69 | 
 70 |    #warn("Processed $1 $+ !");
 71 | }
 72 | 
 73 | sub main
 74 | {
 75 |    global('%dataset $goal %words %counts');
 76 | 
 77 |    # load the words we're interested in.
 78 |    local('$handle $text $good');
 79 | 
 80 |       $handle = openf($2);
 81 |       while $text (readln($handle))
 82 |       {
 83 |          $good = readln($handle);
 84 | 
 85 |          if (%dataset[$good] is $null) { %dataset[$good] = @(); }
 86 |          push(%dataset[$good], $text); 
 87 |          %words[$good] += 1;
 88 |       }
 89 |       closef($handle);
 90 | 
 91 |    $goal = size(%dataset);
 92 | 
 93 |    # setup our file that we're going to dump the output to.
 94 |    global('$output');
 95 |    $output = openf("> $+ $3");
 96 |    
 97 |    # ok go through all the junk parsing through the files.
 98 | 
 99 |    include("nlp.sl");
100 |    include("dictionary.sl");
101 |    global('%dictionary');
102 |    %dictionary = dictionary();
103 | 
104 |    # collect list of files.
105 |    [{
106 |       if (-isDir $1)
107 |       {
108 |          map($this, ls($1));
109 |       }
110 |       else if ("*Image*.html" !iswm $1 && "*User*.html" !iswm $1)
111 |       {
112 |          processFile($1);
113 |       }
114 |     }: $1];
115 | 
116 | 
117 |    closef($output);
118 |    println("Done!");
119 | }
120 | 
121 | assert size(@ARGV) == 3 : "java -jar sleep.jar corpus_data wordlist outputfile";
122 | 
123 | invoke(&main, @ARGV);
124 | 


--------------------------------------------------------------------------------
/data/rules/grammar/combine:
--------------------------------------------------------------------------------
  1 | #
  2 | # words that should be combined
  3 | #
  4 | 
  5 | # every day (daily) vs. everyday (common)
  6 | 
  7 | an|in|for|the|to every day::word=\0 everyday::pivots=\1 \2,everyday
  8 | 
  9 | # before hand -> beforehand
 10 | 
 11 | before hand::word=beforehand
 12 | 
 13 | # an other -> another
 14 | 
 15 | an other::word=another
 16 | 
 17 | # all ways -> always (unless refering to everything) like she is better than him in all ways. <- this is ok
 18 | in all ways::filter=kill
 19 | all ways::word=always
 20 | 
 21 | # every where -> everywhere
 22 | 
 23 | every where::word=everywhere
 24 | 
 25 | #
 26 | # more words to combine
 27 | #
 28 | eye sight::word=eyesight
 29 | eye sore::word=eyesore
 30 | figure head::word=figurehead
 31 | flag ship::word=flagship
 32 | head gear::word=headgear
 33 | head quarters::word=headquarters
 34 | head stone::word=headstone
 35 | head wear::word=headwear
 36 | how ever::word=however
 37 | in stead of::word=instead of
 38 | in tact::word=intact
 39 | it self::word=itself
 40 | key note::word=keynote
 41 | laughing stock::word=laughingstock
 42 | life time::word=lifetime
 43 | mean while::word=meanwhile
 44 | nation wide::word=nationwide
 45 | near by::word=nearby
 46 | new comer::word=newcomer
 47 | no where to::word=nowhere to
 48 | note worthy::word=noteworthy
 49 | now a days::word=nowadays
 50 | on going::word=ongoing
 51 | out grow::word=outgrow
 52 | out side::word=outside
 53 | over looked::word=overlooked
 54 | over looking::word=overlooking
 55 | over rated::word=overrated
 56 | over seas::word=overseas
 57 | short coming::word=shortcoming
 58 | short cut::word=shortcut
 59 | side kick::word=sidekick
 60 | sky diving::word=skydiving
 61 | some how::word=somehow
 62 | some what::word=somewhat
 63 | stale mate::word=stalemate
 64 | them selves::word=themselves
 65 | back fire::word=backfire
 66 | world wide::word=worldwide
 67 | worth while::word=worthwhile
 68 | where as::word=whereas
 69 | where by::word=whereby
 70 | where upon::word=whereupon
 71 | #with in an|a|the second|minute|hour|year|decade|century|day::word=within \2 \3::filter=none
 72 | with in::word=within
 73 | with out::word=without
 74 | way side::word=wayside
 75 | along side::word=alongside
 76 | be cause::word=because
 77 | be ware::word=beware
 78 | before hand::word=beforehand
 79 | down side::word=downside
 80 | eye brow::word=eyebrow
 81 | eye lash::word=eyelash
 82 | eye lid::word=eyelid
 83 | through out::word=throughout
 84 | on-going::word=ongoing
 85 | light weight::word=lightweight
 86 | heavy weight::word=heavyweight
 87 | free lance::word=freelance
 88 | free lancer::word=freelancer
 89 | free lances::word=freelances
 90 | free lancing::word=freelancing
 91 | 
 92 | # awhile is an adverb, should be used after a verb
 93 | .*/VB a while::word=\0 awhile::pivots=a while,awhile
 94 | 
 95 | # join web site into website
 96 | web site::word=website
 97 | Web Site|site::word=Website
 98 | 
 99 | head scarf::word=headscarf
100 | head scarves::word=headscarves
101 | 
102 | key words::word=keywords
103 | crowd sourcing::word=crowdsourcing
104 | meta data::word=metadata
105 | mis .*::word=\0\1::filter=sane
106 | 
107 | stand alone::word=standalone
108 | past time::word=pastime
109 | any where::word=anywhere
110 | some where::word=somewhere
111 | no where::word=nowhere
112 | .*/DT bail out::word=\0 bailout::pivots=bail out,bailout
113 | 
114 | out come::word=outcome
115 | 
116 | 


--------------------------------------------------------------------------------
/utils/spelldata/gen4.sl:
--------------------------------------------------------------------------------
  1 | #
  2 | # process through corpus, our goal is to associate all misspelt words with a sentence.
  3 | #
  4 | # java -jar sleep.jar gen.sl corpus_file wordfile filetowriteto
  5 | #
  6 | # wordfile must be in bad\ngood\n order
  7 | #
  8 | 
  9 | debug(7 | 34);
 10 | 
 11 | sub getnext
 12 | {
 13 |    local('@words');
 14 |    @words = copy($1);
 15 |    add(@words, @('0BEGIN.0', 'UNK'));
 16 |    push(@words, @('0END.0', 'UNK'));
 17 | 
 18 |    while (size(@words) >= 5)
 19 |    {
 20 |       yield sublist(@words, 0, 5);
 21 |       @words = sublist(@words, 1);
 22 |    }
 23 | }
 24 | 
 25 | sub process
 26 | {
 27 |    local('@words $entry $previous $current $next $pre2 $pre1 $next1 $next2');
 28 | 
 29 |    $1 = [$1 trim];
 30 |    if ($1 !ismatch '[A-Z][A-Za-z\'\,\- ]*?[\.\?\!]{0,1}')
 31 |    {
 32 |       return;
 33 |    }
 34 | 
 35 |    @words = taggerWithTrigrams(splitIntoWords($1));
 36 | 
 37 |    while $entry (getnext(@words))
 38 |    {
 39 |       ($pre2, $pre1, $current, $next1, $next2) = map({ return $1[0]; }, $entry);
 40 | 
 41 |       if (%words[$current] !is $null && %dictionary[$pre2] !is $null && %dictionary[$pre1] !is $null && %dictionary[$next1] !is $null && %dictionary[$next2] !is $null && %counts[$current] < $max)
 42 |       {
 43 |          ($pre2, $pre1, $current, $next1, $next2) = map({ return join('/', $1); }, $entry);
 44 | 
 45 |          println($output, "$pre2 $pre1 * $next1 $next2 $+ |" . join("; ", concat($current, %dataset[$entry[2][0]])) );
 46 |          %counts[$entry[2][0]] += 1;
 47 |       }
 48 |    }
 49 | }
 50 | 
 51 | sub processFile
 52 | {
 53 |    local('$handle $key $data $text @paragraphs');
 54 | 
 55 |    # read in our corpus.
 56 |    $handle = openf($1);
 57 |    $text   = replace(readb($handle, -1), '<[^>]*?>', '');
 58 |    closef($handle);
 59 | 
 60 |    # start processing it?!?
 61 |    @paragraphs = splitByParagraph($text);
 62 |    map(lambda({ map(lambda(&process, \$max), $1); }, \$max), @paragraphs);
 63 | 
 64 |    #warn("Processed $1 $+ !");
 65 | }
 66 | 
 67 | sub main
 68 | {
 69 |    global('%dataset $goal %words %counts');
 70 | 
 71 |    # load the words we're interested in.
 72 |    local('$handle $text $good');
 73 | 
 74 |       $handle = openf($1);
 75 |       while $text (readln($handle))
 76 |       {
 77 |          $good = readln($handle);
 78 | 
 79 |          if (%dataset[$good] is $null) { %dataset[$good] = @(); }
 80 |          push(%dataset[$good], $text); 
 81 |          %words[$good] += 1;
 82 |       }
 83 |       closef($handle);
 84 | 
 85 |    $goal = size(%dataset);
 86 | 
 87 |    # setup our file that we're going to dump the output to.
 88 |    global('$output');
 89 |    $output = openf("> $+ $3");
 90 |    
 91 |    # ok go through all the junk parsing through the files.
 92 | 
 93 |    include("lib/nlp.sl");
 94 |    include("lib/dictionary.sl");
 95 |    include("lib/tagger.sl");
 96 | 
 97 |    global('%dictionary');
 98 |    %dictionary = dictionary();
 99 |    %dictionary["0BEGIN.0"] = 1;
100 |    %dictionary["0END.0"] = 1;
101 | 
102 |    initTaggerModels();
103 | 
104 |    # collect list of files.
105 |    [{
106 |       if (-isDir $1)
107 |       {
108 |          map($this, ls($1));
109 |       }
110 |       else if ("*Image*.html" !iswm $1 && "*User*.html" !iswm $1)
111 |       {
112 |          processFile($1, \$max);
113 |       }
114 |     }: $2, $max => $4];
115 | 
116 | 
117 |    closef($output);
118 |    println("Done!");
119 | }
120 | 
121 | assert size(@ARGV) == 4 : "java -jar sleep.jar corpus_data wordlist outputfile max_entries_per_word";
122 | 
123 | invoke(&main, @ARGV);
124 | 


--------------------------------------------------------------------------------
/utils/rules/testgr.sl:
--------------------------------------------------------------------------------
  1 | # 
  2 | # This is a script to test grammar rules.  It's fun stuff.
  3 | #
  4 | # java -jar utils/rules/testgr.sl <sentences file> [missing|wrong]
  5 | #
  6 | 
  7 | debug(7 | 34);
  8 | 
  9 | include("lib/engine.sl");
 10 | include("utils/rules/rules.sl");
 11 | include("utils/common/score.sl");
 12 | 
 13 | sub checkSentenceSpelling
 14 | {
 15 | }
 16 | 
 17 | sub initAll
 18 | {
 19 |    global('$__SCRIPT__ $model $rules $dictionary $network $dsize %edits $hnetwork $account $usage $endings $lexdb $trigrams $verbs');
 20 |    $model      = get_language_model();
 21 |    $dictionary = dictionary();
 22 |    $rules      = get_rules(); 
 23 |    $dsize      = size($dictionary);
 24 |    $hnetwork   = get_network("hnetwork4.bin");
 25 |    $verbs      = loadVerbData();
 26 |    initTaggerModels();
 27 | }
 28 | 
 29 | sub measure
 30 | {
 31 |    local('@results $options $correct $score $s_score $good $index $r @suggs $debug');
 32 |    (@results, $options, $correct, $score, $s_score, $debug) = @_;
 33 | 
 34 |    if (size(@results) > 0)
 35 |    {
 36 |       foreach $index => $r (@results)
 37 |       {
 38 |          local('$rule $text $path $context @suggestions');
 39 |          ($rule, $text, $path, $context, @suggestions) = $r;
 40 | 
 41 |          if (!-isarray @suggestions) { @suggestions = split(', ', @suggestions); }
 42 | 
 43 |          if ($text eq $options[0])
 44 |          {
 45 |             @suggs = filter(lambda({ return iff($1 in $options, 1); }, $options => sublist($options, 1)), @suggestions); 
 46 | 
 47 |             if (size(@suggs) > 0)
 48 |             {
 49 |                [$score correctSugg];
 50 |                [$s_score correctSugg];
 51 | 
 52 |                if ($correct in @suggestions)
 53 |                {
 54 |                   [$score correct];
 55 |                   [$s_score correct];
 56 |                }
 57 |             }
 58 |             else if ('wrong' isin $debug)
 59 |             {
 60 |                println("$wrong => $text");
 61 |                println("  - entry:   " . $entry);
 62 |                println("  - expect:  " . sublist($options, 1));
 63 |                println("  - options: " . @suggestions);
 64 |                println("  - " . $rule['category'] . ' = ' . $rule['rule'] );
 65 |             }
 66 |             $good = 1;
 67 |    
 68 |             [$s_score record];
 69 |          }
 70 |       }
 71 |    }
 72 | 
 73 |    if (!$good)
 74 |    {
 75 |       [$score falseNegative]; # move if $text eq options[1] never happens
 76 | 
 77 |        if ('missing' isin $debug)
 78 |        {
 79 |           println("$wrong => $text");
 80 |           println("  - entry:   " . $entry);
 81 |           println("  - expect:  " . sublist($options, 1));
 82 |        }
 83 |    }
 84 | 
 85 |    [$score record];
 86 | }
 87 | 
 88 | sub main
 89 | {
 90 |    local('$handle $sentence $entry @results $options $correct $wrong $score1 $score2 $2');
 91 | 
 92 |    $score1 = newObject('score', "Suggestion score for $1");
 93 |    $score2 = newObject('score', "Grammar score for $1");
 94 | 
 95 |    initAll();
 96 | 
 97 |    $handle = openf($1);
 98 |    while $entry (readln($handle))
 99 |    {
100 |       ($sentence, $options, $correct) = split('\|', $entry);
101 |       $options = split(', ', $options);
102 | 
103 |       $wrong = strrep($sentence, ' * ', " " . $options[0] . " ");
104 | 
105 |       @results = @();
106 |       processSentence($sentence => $wrong, \@results);
107 | 
108 |       measure(@results, $options, $correct, $score2, $score1, $2, \$entry, \$wrong);
109 |    }
110 | 
111 |    [$score1 print];
112 |    [$score2 print];
113 | } 
114 | 
115 | invoke(&main, @ARGV);
116 | 


--------------------------------------------------------------------------------
/data/rules/diacritic/diaeresis:
--------------------------------------------------------------------------------
 1 | #
 2 | # http://en.wikipedia.org/wiki/Diaeresis
 3 | #
 4 | 
 5 | achroodextrin::word=achroödextrin::filter=none
 6 | aedes::word=aëdes::filter=none
 7 | Ajie::word=Ajië::filter=none
 8 | Bootes::word=Boötes::filter=none
 9 | chiliaedron::word=chiliaëdron::filter=none
10 | Chloe::word=Chloë::filter=none
11 | cooperate::word=coöperate::filter=none
12 | cooperation::word=coöperation::filter=none
13 | coopt::word=coöpt::filter=none
14 | coordinate::word=coördinate::filter=none
15 | coordinated::word=coördinated::filter=none
16 | coordinately::word=coördinately::filter=none
17 | coordinateness::word=coördinateness::filter=none
18 | coordinates::word=coördinates::filter=none
19 | coordination::word=coördination::filter=none
20 | coordinative::word=coördinative::filter=none
21 | coordinator::word=coördinator::filter=none
22 | diploe::word=diploë::filter=none
23 | eleemosynary::word=eleëmosynary::filter=none
24 | naive::word=naïve::filter=none
25 | naively::word=naïvely::filter=none
26 | noel::word=noël::filter=none
27 | Noel::word=Noël::filter=none
28 | oogone::word=oögone::filter=none
29 | ooidal::word=oöidal::filter=none
30 | oology::word=oölogy::filter=none
31 | preempt::word=preëmpt::filter=none
32 | preempted::word=preëmpted::filter=none
33 | preemptible::word=preëmptible::filter=none
34 | preemption::word=preëmption::filter=none
35 | preemptioner::word=preëmptioner::filter=none
36 | preemptive::word=preëmptive::filter=none
37 | preemptively::word=preëmptively::filter=none
38 | preemptor::word=preëmptor::filter=none
39 | preemptory::word=preëmptory::filter=none
40 | preexisting::word=preëxisting::filter=none
41 | reeducate::word=reëducate::filter=none
42 | reelect::word=reëlect::filter=none
43 | reenter::word=reënter::filter=none
44 | reentry::word=reëntry::filter=none
45 | reexamination::word=reëxamination::filter=none
46 | reexamine::word=reëxamine::filter=none
47 | reextend::word=reëxtend::filter=none
48 | uncoordinate::word=uncoördinate::filter=none
49 | uncoordinated::word=uncoördinated::filter=none
50 | vacuum::word=vacuüm::filter=none
51 | zoea::word=zoëa::filter=none
52 | zoochemistry::word=zoöchemistry::filter=none
53 | zoochemy::word=zoöchemy::filter=none
54 | zoochlorella::word=zoöchlorella::filter=none
55 | zoocyst::word=zoöcyst::filter=none
56 | zoocytium::word=zoöcytium::filter=none
57 | zooerythrine::word=zoöerythrine::filter=none
58 | zoogeography::word=zoögeography::filter=none
59 | zooglaea::word=zoöglœa::filter=none
60 | zoographer::word=zoögrapher::filter=none
61 | zoography::word=zoögraphy::filter=none
62 | zoolatry::word=zoölatry::filter=none
63 | zoology::word=zoölogy::filter=none
64 | zoomelanin::word=zoömelanin::filter=none
65 | zoomorphism::word=zoömorphism::filter=none
66 | zoon::word=zoön::filter=none
67 | zoonite::word=zoönite::filter=none
68 | zoonomy::word=zoönomy::filter=none
69 | zoonule::word=zoönule::filter=none
70 | zoopathology::word=zoöpathology::filter=none
71 | zoophaga::word=zoöphaga::filter=none
72 | zoophagan::word=zoöphagan::filter=none
73 | zoophagous::word=zoöphagous::filter=none
74 | zoophilist::word=zoöphilist::filter=none
75 | zoophily::word=zoöphily::filter=none
76 | zoophite::word=zoöphite::filter=none
77 | zoophorous::word=zoöphorous::filter=none
78 | Zoophyta::word=Zoöphyta::filter=none
79 | zoophyte::word=zoöphyte::filter=none
80 | zoophytic::word=zoöphytic::filter=none
81 | zoophytology::word=zoöphytology::filter=none
82 | zoopraxiscope::word=zoöpraxiscope::filter=none
83 | zoopsychology::word=zoöpsychology::filter=none
84 | zoosperm::word=zoösperm::filter=none
85 | zoosporangium::word=zoösporangium::filter=none
86 | zoospore::word=zoöspore::filter=none
87 | zoospores::word=zoöspores::filter=none
88 | zootic::word=zoötic::filter=none
89 | zootomist::word=zoötomist::filter=none
90 | zootomy::word=zoötomy::filter=none
91 | zootrophic::word=zoötrophic::filter=none
92 | 


--------------------------------------------------------------------------------
/data/rules/grammar/aux_been_was:
--------------------------------------------------------------------------------
 1 | been .*/VB|VBP .*/IN::word=\0 \1:participle \2::pivots=\1,\1:participle
 2 | been .*/RB .*/VB|VBP .*/IN::word=\0 \1 \2:participle \3::pivots=\2,\2:participle
 3 | been .*/VB|VBP|VBD .*/VBN|VB::word=\0 \1:present \2::pivots=\1,\1:present
 4 | been .*/RB .*/VB|VBP|VBD .*/VBN|VB::word=\0 \1 \2:present \3::pivots=\2,\2:present
 5 | was .*/VB 0END.0::word=\0 \1:participle::pivots=\1,\1:participle
 6 | was .*/VB|VBP .*/IN::word=\0 \1:participle \2::pivots=\1,\1:participle
 7 | was .*/RB .*/VB|VBP .*/IN::word=\0 \1 \2:participle \3::pivots=\2,\2:participle
 8 | was .*/VB|VBP|VBD .*/VBN|VB::word=\0 \1:present \2::pivots=\1,\1:present
 9 | was .*/RB .*/VB|VBP|VBD .*/VBN|VB::word=\0 \1 \2:present \3::pivots=\2,\2:present
10 | were .*/VB|VBP .*/IN::word=\0 \1:participle \2::pivots=\1,\1:participle
11 | were .*/RB .*/VB|VBP .*/IN::word=\0 \1 \2:participle \3::pivots=\2,\2:participle
12 | were .*/VB|VBP|VBD .*/VBN|VB::word=\0 \1:present \2::pivots=\1,\1:present
13 | were .*/RB .*/VB|VBP|VBD .*/VBN|VB::word=\0 \1 \2:present \3::pivots=\2,\2:present
14 | are .*/VB|VBP .*/IN::word=\0 \1:participle \2::pivots=\1,\1:participle
15 | are .*/RB .*/VB|VBP .*/IN::word=\0 \1 \2:participle \3::pivots=\2,\2:participle
16 | are .*/VB|VBP|VBD .*/VBN|VB::word=\0 \1:present \2::pivots=\1,\1:present
17 | are .*/RB .*/VB|VBP|VBD .*/VBN|VB::word=\0 \1 \2:present \3::pivots=\2,\2:present
18 | am .*/VB|VBP .*/IN::word=\0 \1:participle \2::pivots=\1,\1:participle
19 | am .*/RB .*/VB|VBP .*/IN::word=\0 \1 \2:participle \3::pivots=\2,\2:participle
20 | am .*/VB|VBP|VBD .*/VBN|VB::word=\0 \1:present \2::pivots=\1,\1:present
21 | am .*/RB .*/VB|VBP|VBD .*/VBN|VB::word=\0 \1 \2:present \3::pivots=\2,\2:present
22 | is .*/VB|VBP for|by|as::word=\0 \1:participle \2::pivots=\1,\1:participle
23 | is .*/RB .*/VB|VBP for|by|as::word=\0 \1 \2:participle \3::pivots=\2,\2:participle
24 | is .*/VB|VBP|VBD .*/VBN|VB::word=\0 \1:present \2::pivots=\1,\1:present
25 | is .*/RB .*/VB|VBP|VBD .*/VBN|VB::word=\0 \1 \2:present \3::pivots=\2,\2:present
26 | do is .*/VB::filter=kill
27 | been .*/VB|VBP|VBD .*/TO|DT::word=\0 \1:participle \2, \0 \1:present \2::pivots=\1,\1:participle,\1:present
28 | been .*/RB .*/VB|VBP|VBD .*/TO|DT::word=\0 \1 \2:participle \3, \0 \1 \2:present \3::pivots=\2,\2:participle,\2:present
29 | was .*/VB|VBP|VBD .*/TO::word=\0 \1:participle \2, \0 \1:present \2::pivots=\1,\1:participle,\1:present
30 | was .*/RB .*/VB|VBP|VBD .*/TO::word=\0 \1 \2:participle \3, \0 \1 \2:present \3::pivots=\2,\2:participle,\2:present
31 | were .*/VB|VBP|VBD .*/TO|DT::word=\0 \1:participle \2, \0 \1:present \2::pivots=\1,\1:participle,\1:present
32 | were .*/RB .*/VB|VBP|VBD .*/TO|DT::word=\0 \1 \2:participle \3, \0 \1 \2:present \3::pivots=\2,\2:participle,\2:present
33 | are .*/VB|VBP|VBD .*/TO::word=\0 \1:participle \2, \0 \1:present \2::pivots=\1,\1:participle,\1:present
34 | are .*/RB .*/VB|VBP|VBD .*/TO::word=\0 \1 \2:participle \3, \0 \1 \2:present \3::pivots=\2,\2:participle,\2:present
35 | am .*/VB|VBP|VBD .*/TO|DT::word=\0 \1:participle \2, \0 \1:present \2::pivots=\1,\1:participle,\1:present
36 | am .*/RB .*/VB|VBP|VBD .*/TO|DT::word=\0 \1 \2:participle \3, \0 \1 \2:present \3::pivots=\2,\2:participle,\2:present
37 | is .*/VB|VBP|VBD .*/TO::word=\0 \1:participle \2, \0 \1:present \2::pivots=\1,\1:participle,\1:present
38 | is .*/RB .*/VB|VBP|VBD .*/TO::word=\0 \1 \2:participle \3, \0 \1 \2:present \3::pivots=\2,\2:participle,\2:present
39 | it is .*/VBP|VB|VBD::word=\0 \1 \2:participle::pivots=\2,\2:participle
40 | It is .*/VBP|VB|VBD::word=\0 \1 \2:participle::pivots=\2,\2:participle
41 | is .*/VBP|VBD|VB as|for|to::word=\0 \1:participle \2::pivots=\1,\1:participle
42 | 
43 | # are [base verb mistagged as a noun] to -> are [past tense] to
44 | are .*(?<!ing|kin)/NN to::word=\0 \1:past \2::pivots=\1,\1:past
45 | are .*(?<!ing|kin|part)/NN .*/IN::word=\0 \1:past \2::pivots=\1,\1:past
46 | 


--------------------------------------------------------------------------------
/data/rules/complex/misc:
--------------------------------------------------------------------------------
  1 | corporation organized and existing under the laws of the|The state|State of New|North|South .*::word=\11 \12 corporation
  2 | corporation organized and existing under the laws of the|The state|State of .*::word=\11 corporation
  3 | data cited::word=facts \1, results \1, observations \1
  4 | data to::word=facts \1, results \1, observations \1
  5 | to|doesn't|didn't|won't function::word=\0 act, \0 work
  6 | its function::word=\0 role
  7 | utilized::word=used
  8 | 
  9 | # objective
 10 | are|less|more|be|is|was objective::word=\0 balanced, \0 fair::pivots=objective,balanced,fair
 11 | objective view|opinion|experiment|results|data::word=balanced \1, fair \1::pivots=objective,balanced,fair
 12 | 
 13 | # rules for immediately (some neat stuff happening here)
 14 | immediately .*/VBN to|by::word=\1 \2
 15 | immediately to|by::word=\1
 16 | immediately .*(able|ible)::word=ready to \1:nosuffix::filter=none
 17 | 
 18 | #individual::word=person, personal
 19 | 
 20 | substantial::word=real
 21 | your|his|her|their feedback::word=\0 opinion, \0 reaction
 22 | a wide range|variety of::word=a \2 of, a lot of::filter=none
 23 | 
 24 | currently::word=(omit), now
 25 | 
 26 | whereas::word=because, since, where
 27 | 
 28 | attains the age of .*::word=becomes \4 years old::filter=none
 29 | individual .*/NNS::word=\1, each \1:singular
 30 | 
 31 | # rules for submit
 32 | submit::word=give, send
 33 | submit to::filter=kill
 34 | 
 35 | # who are|was|were
 36 | of .* who are|was|were .*::word=\0 \1 \4::pivots=\1 \2 \3,\1
 37 | 
 38 | # in which rules
 39 | in which::word=where
 40 | in which to::word=to, (omit)
 41 | 
 42 | therefore::word=(omit)
 43 | is therefore::word=so
 44 | that is|are .*/VBN::word=\2
 45 | 
 46 | # for example
 47 | just for example::word=such as, (omit)
 48 | for example::word=such as, (omit)
 49 | 
 50 | # rather than
 51 | rather than::word=and not, not, than::pivots=\0 \1,and not,not,than
 52 | 
 53 | # some kind of
 54 | some kind of [aeiouAEIOU].*::word=an \3
 55 | some kind of [^aeiouAEIOU].*::word=a \3
 56 | some kind of a|an::word=\3
 57 | 
 58 | # get rid of the gerund
 59 | #0BEGIN.0 I am .*/VBG::word=I \2:base, I \2:past::pivots=\1 \2,\2:base,\2:past
 60 | #0BEGIN.0 I'm .*/VBG::word=I \1:base, I \1:past::pivots=\0 \1,I \1:base,I \1:past
 61 | 
 62 | # review
 63 | review/VB::word=check
 64 | 
 65 | #
 66 | # complex expression
 67 | #
 68 | due to the fact::word=because
 69 | due to the fact that::word=because
 70 | more optimal::word=better
 71 | 
 72 | reticent to::word=reluctant to::pivots=reticent,reluctant
 73 | reticence to::word=reluctance to::pivots=reticence,reluctance::options=reticence,reluctance
 74 | 
 75 | first and foremost::word=first, foremost
 76 | flow of current::word=current, flow of charge
 77 | 
 78 | most optimal::word=best, optimal
 79 | very minimal::word=very little, very small
 80 | very optimal::word=optimal, very good
 81 | 
 82 | #
 83 | a broad swath of::word=many
 84 | swath::word=group, part, segment
 85 | a wide swath of::word=a wide
 86 | 
 87 | advocates to::word=works \1::pivots=\0,works
 88 | advocated to::word=worked \1::pivots=\0,works
 89 | 
 90 | 
 91 | provide .*/PRP with::word=give \1, offer \1::pivots=provide,give,offer
 92 | 
 93 | nowadays::word=today
 94 | Nowadays::word=Today
 95 | 
 96 | .*/MD reference::word=\0 refer to::pivots=reference,refer to
 97 | .*/JJ reference::word=\0 note::pivots=reference,note
 98 | .*/VB reference::word=\0 cite, \0 consider, \0 endorse::pivots=\1,cite,consider,endorse
 99 | 
100 | # some more fun words...
101 | actionable::word=doable, feasible::filter=none
102 | in actual fact::word=actually::filter=none
103 | administrate::administer::filter=none
104 | second of all::word=second::filter=none
105 | 
106 | .*/MD access::word=\0 use, \0 reach, \0 get access to::pivots=\1,use,reach,get access to
107 | to access::word=\0 use, \0 reach, \0 get access to::pivots=\1,use,reach,get access to
108 | 
109 | 


--------------------------------------------------------------------------------
/data/rules/grammar/comprised:
--------------------------------------------------------------------------------
 1 | #
 2 | # comprise means to contain or embrace
 3 | # compose means to put together
 4 | #
 5 | 
 6 | 
 7 | #
 8 | # comprised vs. (composed, consisting, comprising, consisted)
 9 | #                  
10 | 
11 | album comprised of::word=album composed of::pivots=comprised,composed::options=comprised,composed
12 | area comprised of::word=area consisting of::pivots=comprised,consisting::options=comprised,consisting
13 | army comprised of::word=army composed of::pivots=comprised,composed::options=comprised,composed
14 | band comprised of::word=band composed of, band comprising::pivots=comprised,composed,comprising::options=comprised,composed,comprising
15 | comprised primarily of::word=composed primarily of::pivots=comprised,composed::options=comprised,composed
16 | comprised principally of::word=composed principally of::pivots=comprised,composed::options=comprised,composed
17 | entirely comprised of::word=entirely composed of::pivots=comprised,composed::options=comprised,composed
18 | group comprised of::word=group composed of::pivots=comprised,composed::options=comprised,composed
19 | mainly comprised of::word=mainly composed of::pivots=comprised,composed::options=comprised,composed
20 | mostly comprised of::word=mostly composed of::pivots=comprised,composed::options=comprised,composed  
21 | 
22 | comprised chiefly of::word=composed chiefly of, comprised chiefly, consisted chiefly of::options=comprised,composed,consisted                                      
23 | comprised entirely of::word=composed entirely of, comprised entirely, consisted entirely of::options=comprised,composed,consisted
24 | comprised generally of::word=composed generally of, comprised generally, consisted generally of::options=comprised,composed,consisted
25 | comprised largely of::word=composed largely of, comprised largely, consisted largely of::options=comprised,composed,consisted
26 | comprised mainly of::word=composed mainly of, comprised mainly, consisted mainly of::options=comprised,composed,consisted
27 | comprised mostly of::word=composed mostly of, comprised mostly, consisted mostly of::options=comprised,composed,consisted
28 | comprised of::word=composed of, comprised, consisted of::options=comprised,composed,consisted
29 | comprised only of::word=composed only of, comprised only, consisted only of::options=comprised,composed,consisted
30 | comprised wholly of::word=composed wholly of, comprised wholly, consisted wholly of::options=comprised,composed,consisted
31 | 
32 | comprises of::word=comprises, consists of::options=comprises,consists
33 | comprises mostly of::word=comprises mostly, consists mostly of::options=comprises,consists    
34 | comprising of::word=comprising, consisting of::options=comprising,consisting
35 | 
36 | comprise of::word=comprise, consist of::options=comprise,consist
37 | 
38 | was comprised of::word=comprised, consisted of, was composed of::options=comprised,consisted,composed
39 | were comprised of::word=comprised, were composed of::options=comprised,composed
40 | which comprised of::word=which comprised, which consisted of, which was composed of::options=comprised,consisted,composed
41 | will be comprised of::word=will be composed of, will comprise, will consist of::options=comprised,composed,comprise
42 | will comprise of::word=will be composed of, will comprise, will consist of::options=comprise,composed,comprise,consist
43 | would comprise of::word=would comprise, would consist of::options=comprise,consist
44 | 
45 | government comprised of::word=government composed of, government consisting of::options=comprised,composed,consisting
46 | 
47 | is comprised mostly of::word=comprises mostly, consists mostly of, is composed mostly of::options=comprised,comprises,consists,composed::pivots=comprised mostly of,comprises mostly,consists mostly of,composed mostly of
48 | is comprised of::word=comprises, consists of, is composed of::options=comprised,comprises,consists,composed::pivots=comprised of,comprises,consists of,composed of
49 | 


--------------------------------------------------------------------------------
/data/rules/grammar/aux_wrong_verb:
--------------------------------------------------------------------------------
 1 | #
 2 | # determiner agreement rules for determiners expecting a singular noun
 3 | #
 4 | 
 5 | 
 6 | # determiner agreement rules for this/the/these
 7 | 
 8 | This [a-z].*/NNS was::word=These \1 were, \0 \1:singular \2::pivots=\1,\1,\1:singular
 9 | This [a-z].*/NNS is::word=These \1 are, \0 \1:singular \2::pivots=\1,\1,\1:singular
10 | This [a-z].*/NNS has::word=These \1 have, \0 \1:singular \2::pivots=\1,\1,\1:singular
11 | The [a-z].*/NNS is::word=\0 \1 are, \0 \1:singular \2::pivots=\1,\1,\1:singular
12 | The [a-z].*/NNS was::word=\0 \1 were, \0 \1:singular \2::pivots=\1,\1,\1:singular
13 | The [a-z].*/NNS has::word=\0 \1 have, \0 \1:singular \2::pivots=\1,\1,\1:singular
14 | This [a-z].*/NNS [a-z].*/NNS is::word=These \1 \2 are, These \1:singular \2 are, \0 \1 \2:singular \3::pivots=\2,\2,\2,\2:singular
15 | This [a-z].*/NNS [a-z].*/NNS was::word=These \1 \2 were, These \1:singular \2 were, \0 \1 \2:singular \3::pivots=\2,\2,\2,\2:singular
16 | This [a-z].*/NNS [a-z].*/NNS has::word=These \1 \2 have, These \1:singular \2 has, \0 \1 \2:singular \3::pivots=\2,\2,\2,\2:singular
17 | The [a-z].*/NNS [a-z].*/NNS is::word=\0 \1 \2 are, \0 \1:singular \2 are, \0 \1 \2:singular \3::pivots=\2,\2,\2,\2:singular
18 | The [a-z].*/NNS [a-z].*/NNS was::word=\0 \1 \2 were, \0 \1:singular \2 were, \0 \1 \2:singular \3::pivots=\2,\2,\2,\2:singular
19 | The [a-z].*/NNS [a-z].*/NNS has::word=\0 \1 \2 have, \0 \1:singular \2 has, \0 \1 \2:singular \3::pivots=\2,\2,\2,\2:singular
20 | The|These [a-z].*/NNS [a-z].*/NNS are|were|have::word=\0 \1:singular \2 \3::pivots=\1,\1:singular
21 | The [a-z].*/NNS of [a-z].*/NN is::word=\0 \1 \2 \3 are, \0 \1:singular \2 \3 \4::pivots=\1 \2 \3,\1 \2 \3,\1:singular \2 \3
22 | The [a-z].*/NNS of [a-z].*/NN was::word=\0 \1 \2 \3 were, \0 \1:singular \2 \3 \4::pivots=\1 \2 \3,\1 \2 \3,\1:singular \2 \3
23 | The [a-z].*/NNS of [a-z].*/NN has::word=\0 \1 \2 \3 have, \0 \1:singular \2 \3 \4::pivots=\1 \2 \3,\1 \2 \3,\1:singular \2 \3
24 | This [a-z].*/NNS of [a-z].*/NN is::word=These \1 \2 \3 are, \0 \1:singular \2 \3 \4::pivots=\1 \2 \3,\1 \2 \3,\1:singular \2 \3
25 | This [a-z].*/NNS of [a-z].*/NN was::word=These \1 \2 \3 were, \0 \1:singular \2 \3 \4::pivots=\1 \2 \3,\1 \2 \3,\1:singular \2 \3
26 | This [a-z].*/NNS of [a-z].*/NN has::word=These \1 \2 \3 have, \0 \1:singular \2 \3 \4::pivots=\1 \2 \3,\1 \2 \3,\1:singular \2 \3
27 | 
28 | # determiner agreement rules for a/an
29 | an eagerness to::filter=kill
30 | 
31 | a .*/NNS has::word=\1 have, \0 \1:singular has::filter=none
32 | an .*/NNS has::word=\1 have, \0 \1:singular has::filter=none
33 | a .*/NNS is::word=\1 are, \0 \1:singular is::filter=none
34 | an .*/NNS is::word=\1 are, \0 \1:singular is::filter=none
35 | A .*/NNS has::word=\1 have, \0 \1:singular has::filter=none
36 | An .*/NNS has::word=\1 have, \0 \1:singular has::filter=none
37 | A .*/NNS is::word=\1 are, \0 \1:singular is::filter=none
38 | An .*/NNS is::word=\1 are, \0 \1:singular is::filter=none
39 | a .*/NNS would|will|to|can|should|at|is|as|has|have|are::word=\1 \2, \0 \1:singular \2::filter=none
40 | an .*/NNS would|will|to|can|should|at|is|as|has|have|are::word=\1 \2, \0 \1:singular \2::filter=none
41 | A .*/NNS would|will|to|can|should|at|is|as|has|have|are::word=\1 \2, \0 \1:singular \2::filter=none
42 | An .*/NNS would|will|to|can|should|at|is|as|has|have|are::word=\1 \2, \0 \1:singular \2::filter=none
43 | 
44 | An|an alumni of::word=\0 alumnus of::filter=none
45 | 
46 | # determiner agreement rules for each/every
47 | 
48 | each .*/NNS is::word=all \1 are, \0 \1:singular \2::filter=none
49 | Each .*/NNS is::word=All \1 are, \0 \1:singular \2::filter=none
50 | every .*/NNS is::word=all \1 are, \0 \1:singular \2::filter=none
51 | Every .*/NNS is::word=All \1 are, \0 \1:singular \2::filter=none
52 | each .*/NNS has::word=all \1 have, \0 \1:singular \2::filter=none
53 | Each .*/NNS has::word=All \1 have, \0 \1:singular \2::filter=none
54 | every .*/NNS has::word=all \1 have, \0 \1:singular \2::filter=none
55 | Every .*/NNS has::word=All \1 have, \0 \1:singular \2::filter=none
56 | 


--------------------------------------------------------------------------------
/service/src/view/wordpress26.slp:
--------------------------------------------------------------------------------
  1 | <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
  2 | <html xmlns="http://www.w3.org/1999/xhtml" dir="ltr" lang="en-US">
  3 | <head>
  4 | <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
  5 | <title>Rich Editor Help</title>
  6 | <script type="text/javascript" src="http://killall.dashnine.org/wp-content/plugins/tinymce-advanced/tiny_mce_popup.js"></script>
  7 | <link rel='stylesheet' href='http://killall.dashnine.org/wp-admin/css/global.css' type='text/css' media='all' />
  8 | <link rel='stylesheet' href='http://killall.dashnine.org/wp-admin/wp-admin.css' type='text/css' media='all' />
  9 | <style type="text/css">
 10 | 	#wphead {
 11 | 		font-size: 80%;
 12 | 		border-top: 0;
 13 | 		color:#555;
 14 | 		background-color: #e4f2fd;
 15 | 	}
 16 | 	#wphead h1 {
 17 | 		font-size: 32px;
 18 | 		color: #555;
 19 | 		margin: 0;
 20 | 		padding: 10px;
 21 | 	}
 22 | 	#adminmenu {
 23 | 		padding-top: 2px;
 24 | 		padding-left: 15px;
 25 | 		background-color: #e4f2fd;
 26 | 		border-color: #C6D9E9;
 27 | 	}
 28 | 	#adminmenu a.current {
 29 | 		background-color: #fff;
 30 | 		border-color: #c6d9e9;
 31 | 		border-bottom-color: #fff;
 32 | 		color: #d54e21;
 33 | 	}
 34 | 	#adminmenu a {
 35 | 		color: #2583AD;
 36 | 		padding: 6px;
 37 | 		border-width: 1px;
 38 | 		border-style: solid solid none;
 39 | 		border-color: #E4F2FD;
 40 | 	}
 41 | 	#adminmenu a:hover {
 42 | 		color: #d54e21;
 43 | 	}
 44 | 	.wrap h2 {
 45 | 		border-bottom-color:#DADADA;
 46 | 		color:#666666;
 47 | 		margin: 12px 0;
 48 | 		padding: 0;
 49 | 	}
 50 | 	#user_info {
 51 | 		right: 5%;
 52 | 		top: 5px;
 53 | 	}
 54 | 	h3 {
 55 | 		font-size: 1.1em;
 56 | 		margin-top: 20px;
 57 | 		margin-bottom: 0px;
 58 | 	}
 59 | 	#flipper {
 60 | 		margin: 0;
 61 | 		padding: 5px 20px 10px;
 62 | 		background-color: #fff;
 63 | 		border-left: 1px solid #c6d9e9;
 64 | 		border-bottom: 1px solid #c6d9e9;
 65 | 	}
 66 | 	* html {
 67 |         overflow-x: hidden;
 68 |         overflow-y: scroll;
 69 |     }
 70 | 	#flipper div p {
 71 | 		margin-top: 0.4em;
 72 | 		margin-bottom: 0.8em;
 73 | 		text-align: justify;
 74 | 	}
 75 | 	th {
 76 | 		text-align: center;
 77 | 	}
 78 | 	.top th {
 79 | 		text-decoration: underline;
 80 | 	}
 81 | 	.top .key {
 82 | 		text-align: center;
 83 | 		width: 36px;
 84 | 	}
 85 | 	.top .action {
 86 | 		text-align: left;
 87 | 	}
 88 | 	.align {
 89 | 		border-left: 3px double #333;
 90 | 		border-right: 3px double #333;
 91 | 	}
 92 | 	.keys {
 93 | 		margin-bottom: 15px;
 94 | 	}
 95 | 	.keys p {
 96 | 		display: inline-block;
 97 | 		margin: 0px;
 98 | 		padding: 0px;
 99 | 	}
100 | 	.keys .left { text-align: left; }
101 | 	.keys .center { text-align: center; }
102 | 	.keys .right { text-align: right; }
103 | 	td b {
104 | 		font-family: "Times New Roman" Times serif;
105 | 	}
106 | 	#buttoncontainer {
107 | 		text-align: center;
108 | 		margin-bottom: 20px;
109 | 	}
110 | 	#buttoncontainer a, #buttoncontainer a:hover {
111 | 		border-bottom: 0px;
112 | 	}
113 | </style>
114 | <script type="text/javascript">
115 | 	function d(id) { return document.getElementById(id); }
116 | 
117 | 	function flipTab(n) {
118 | 		for (i=1;i<=4;i++) {
119 | 			c = d('content'+i.toString());
120 | 			t = d('tab'+i.toString());
121 | 			if ( n == i ) {
122 | 				c.className = '';
123 | 				t.className = 'current';
124 | 			} else {
125 | 				c.className = 'hidden';
126 | 				t.className = '';
127 | 			}
128 | 		}
129 | 	}
130 | 
131 |     function init() {
132 |         document.getElementById('version').innerHTML = tinymce.majorVersion + "." + tinymce.minorVersion;
133 |         document.getElementById('date').innerHTML = tinymce.releaseDate;
134 |     }
135 |     tinyMCEPopup.onInit.add(init);
136 | </script>
137 | </head>
138 | <body>
139 | 
140 | <div class="zerosize"></div>
141 | 
142 | <div id="wphead"><h1>After the Deadline</h1></div>
143 | 
144 | <ul id="adminmenu">
145 |    <li><a id="tab1" href="javascript:flipTab(1)" title="<% $1["rule"] %>" accesskey="1" tabindex="1" class="current"><% $1["rule"] %></a></li>
146 | <!--   <li><a id="tab2" href="javascript:flipTab(2)" title="About After the Deadline" accesskey="2" tabindex="2">About</a></li> -->
147 | </ul>
148 | 
149 | <div id="flipper" class="wrap">
150 |   <div id="content1">
151 |      <?sleep display("service/src/view/rule.slp", $1, $2); ?>
152 |   </div>
153 | <!--  <div id="content2" style="display: none">
154 |      <h1>About</h1>
155 |      <p>Some cool about text will go here later on...</p>
156 |   </div> -->
157 | </div>
158 | 
159 | <!-- <div class="mceActionPanel">
160 | 	<div style="margin: 8px auto; text-align: center;padding-bottom: 10px;">
161 | 		<input type="button" id="cancel" name="cancel" value="Close" title="Close" onclick="tinyMCEPopup.close();" />
162 | 	</div>
163 | </div> -->
164 | 
165 | 
166 | </body>
167 | </html>
168 | 


--------------------------------------------------------------------------------
/utils/common/spelltests.sl:
--------------------------------------------------------------------------------
  1 | #
  2 | # this is a script to run unit tests and calculute the effectiveness of the 
  3 | # preditor engine
  4 | #
  5 | 
  6 | sub testSpellingNoContext
  7 | {
  8 |    local('$handle $score $bad $good');
  9 |    $handle = openf("tests/tests2.txt");
 10 | 
 11 |    $score = newObject("score", "Spellchecker w/ No Context");
 12 | 
 13 |    while $bad (readln($handle))
 14 |    {
 15 |       $good = readln($handle);
 16 |       if ($dictionary[$bad] !is $null)
 17 |       {
 18 |          local('$source $size');
 19 |          [$score falseNegative];
 20 |       }
 21 |       else
 22 |       {
 23 |          [$score correct];
 24 |       }
 25 | 
 26 |       if ($dictionary[$good] is $null)
 27 |       {
 28 |          [$score falsePositive];
 29 |       }
 30 | 
 31 |       [$score record];
 32 |    }
 33 | 
 34 |    [$score print];
 35 | }
 36 | 
 37 | sub testSoundEx
 38 | {
 39 |    local('$score $entry $bad $good');
 40 |    $score = newObject("score", "Test of SoundEx");
 41 |    while $entry (words("tests2.txt"))
 42 |    {
 43 |       ($bad, $good) = $entry;
 44 |       if (soundex($bad) eq soundex($good))
 45 |       {
 46 |          [$score correct];
 47 |       }
 48 |       else
 49 |       {
 50 |          warn("$[25]bad " . soundex($bad) . " $[25]good " . soundex($good));
 51 |       }
 52 | 
 53 |       [$score record];
 54 |    }
 55 | 
 56 |    [$score print];
 57 | }
 58 | 
 59 | sub testSoundExEditDistance
 60 | {
 61 |    local('%distance %totals $count $entry $bad $good $key $value $p $t');
 62 | 
 63 |    while $entry (words("tests2.txt"))
 64 |    {
 65 |       ($bad, $good) = $entry;
 66 |       if (soundex($bad) eq soundex($good))
 67 |       {
 68 |          %distance[editDistance($good, $bad)] += 1;
 69 |       }
 70 |  
 71 |       if (editDistance($good, $bad) == 0)
 72 |       {
 73 |          warn("$good -> $bad has an edit distance of 0?!?");
 74 |       }
 75 |       
 76 |       %totals[editDistance($good, $bad)] += 1;
 77 |       $count++;
 78 |    }
 79 | 
 80 |    foreach $key => $value (%distance)
 81 |    {
 82 |       $p = double($value) / $count;
 83 |       $t = double($value) / %totals[$key];
 84 | 
 85 |       println("$[5]key $[20]t $p"); 
 86 |    }
 87 | }
 88 | 
 89 | sub testCorrectionsNoContext
 90 | {
 91 |    local('$good $bad $entry $score @suggestions $f $c');
 92 | 
 93 |    $score = newObject("score", "Test of Corrections w/o Context");
 94 |    $c = 0;
 95 | 
 96 | 
 97 |    while $entry (words(@_[0]))
 98 |    {
 99 |       ($bad, $good) = $entry;
100 | 
101 |       if ($dictionary[$bad] is $null && $dictionary[$good] !is $null)
102 |       {
103 |          @suggestions = %edits[$bad]; # filterByDictionary($bad, $dictionary);
104 | 
105 |          if ($good in @suggestions)
106 |          {
107 |             foreach $f (sublist(@_, 1))
108 |             {
109 |                [$f : $bad, $good, copy(@suggestions), $null, $null];
110 |             }
111 |             [$score correct];
112 |          }
113 |          else
114 |          {
115 |        #     println("$bad -> $good : " . editDistance($bad, $good));
116 |          }
117 | 
118 |          [$score record];
119 |       }
120 |       else
121 |       {
122 |          if ($dictionary[$bad] !is $null)
123 |          {
124 |             [$score falseNegative];
125 |             $c++;
126 |          }
127 | 
128 |          if ($dictionary[$good] is $null)
129 |          {
130 |             [$score falsePositive];
131 |          }
132 |       }
133 |    }
134 | 
135 |    println("Present words: $c");
136 |    [$score print];
137 | }
138 | 
139 | sub RandomGuess
140 | {
141 |    [$score record];
142 |    if (rand($3) eq $2)
143 |    {
144 |       [$score correct];
145 |    }
146 | }
147 | 
148 | sub FrequencyCount
149 | { 
150 |    local('@suggs');
151 | 
152 |    [$score record];
153 |    @suggs = sort({ return Pword($2) <=> Pword($1); }, $3);  
154 |    if (@suggs[0] eq $2)
155 |    {
156 |       [$score correct];
157 |    }
158 | }
159 | 
160 | sub scoreIt
161 | {
162 |    return (         ( 0.75 / (  editDistance($word, $1) + 1  ) )         ) + 
163 |           (  0.25 *  Pword($1) ) ;
164 | }
165 | sub scoreIt2
166 | {
167 |    return (         ( 0.75 / (  editDistance($word, $1) + 1  ) )         ) + 
168 |           (  0.25 * Pword($1)  ) ;
169 | }
170 | 
171 | sub CombineFreqEdit
172 | {
173 |    local('@suggs');
174 | 
175 |    let(&scoreIt, $word => $1);
176 |    let(&scoreIt2, $word => $1);
177 | 
178 |    [$score record];
179 |    @suggs = sort({ return scoreIt2($2) <=> scoreIt2($1); }, $3);  
180 | 
181 |    if (@suggs[0] eq $2)
182 |    {
183 |       [$score correct];
184 |    }
185 | }
186 | 
187 | sub NeuralNetworkScore
188 | {
189 |    local('@suggs $4 $5 $cs');
190 | 
191 |    [$score record];
192 |    @suggs = sortHash($3, CompareSuggestions($network, $criteriaf, $1, $pool => $3, $pre => $4, $next => $5));
193 | 
194 |    if (@suggs[0] eq $2)
195 |    {
196 |       [$score correct];
197 |    }
198 | }
199 | 


--------------------------------------------------------------------------------
/service/src/view/wordpress.slp:
--------------------------------------------------------------------------------
  1 | <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
  2 | <html xmlns="http://www.w3.org/1999/xhtml" dir="ltr" lang="en-US">
  3 | <head>
  4 | <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
  5 | <title>Rich Editor Help</title>
  6 | <script type="text/javascript" src="tiny_mce_popup.js?ver=321"></script>
  7 | <link rel='stylesheet' href='http://killall.dashnine.org/wp-admin/css/global.css?ver=20081210' type='text/css' media='all' />
  8 | <link rel='stylesheet' href='http://killall.dashnine.org/wp-admin/wp-admin.css?ver=20081210' type='text/css' media='all' />
  9 | <style type="text/css">
 10 | 	#wphead {
 11 | 		font-size: 80%;
 12 | 		border-top: 0;
 13 | 		color: #555;
 14 | 		background-color: #f1f1f1;
 15 | 	}
 16 | 	#wphead h1 {
 17 | 		font-size: 24px;
 18 | 		color: #555;
 19 | 		margin: 0;
 20 | 		padding: 10px;
 21 | 	}
 22 | 	#tabs {
 23 | 		padding: 15px 15px 3px;
 24 | 		background-color: #f1f1f1;
 25 | 		border-bottom: 1px solid #dfdfdf;
 26 | 	}
 27 | 	#tabs li {
 28 | 		display: inline;
 29 | 	}
 30 | 	#tabs a.current {
 31 | 		background-color: #fff;
 32 | 		border-color: #dfdfdf;
 33 | 		border-bottom-color: #fff;
 34 | 		color: #d54e21;
 35 | 	}
 36 | 	#tabs a {
 37 | 		color: #2583AD;
 38 | 		padding: 6px;
 39 | 		border-width: 1px 1px 0;
 40 | 		border-style: solid solid none;
 41 | 		border-color: #f1f1f1;
 42 | 		text-decoration: none;
 43 | 	}
 44 | 	#tabs a:hover {
 45 | 		color: #d54e21;
 46 | 	}
 47 | 	.wrap h2 {
 48 | 		border-bottom-color: #dfdfdf;
 49 | 		color: #555;
 50 | 		margin: 5px 0;
 51 | 		padding: 0;
 52 | 		font-size: 18px;
 53 | 	}
 54 | 	#user_info {
 55 | 		right: 5%;
 56 | 		top: 5px;
 57 | 	}
 58 | 	h3 {
 59 | 		font-size: 1.1em;
 60 | 		margin-top: 10px;
 61 | 		margin-bottom: 0px;
 62 | 	}
 63 | 	#flipper {
 64 | 		margin: 0;
 65 | 		padding: 5px 20px 10px;
 66 | 		background-color: #fff;
 67 | 		border-left: 1px solid #dfdfdf;
 68 | 		border-bottom: 1px solid #dfdfdf;
 69 | 	}
 70 | 	* html {
 71 |         overflow-x: hidden;
 72 |         overflow-y: scroll;
 73 |     }
 74 | 	#flipper div p {
 75 | 		margin-top: 0.4em;
 76 | 		margin-bottom: 0.8em;
 77 | 		text-align: justify;
 78 | 	}
 79 | 	th {
 80 | 		text-align: center;
 81 | 	}
 82 | 	.top th {
 83 | 		text-decoration: underline;
 84 | 	}
 85 | 	.top .key {
 86 | 		text-align: center;
 87 | 		width: 5em;
 88 | 	}
 89 | 	.top .action {
 90 | 		text-align: left;
 91 | 	}
 92 | 	.align {
 93 | 		border-left: 3px double #333;
 94 | 		border-right: 3px double #333;
 95 | 	}
 96 | 	.keys {
 97 | 		margin-bottom: 15px;
 98 | 	}
 99 | 	.keys p {
100 | 		display: inline-block;
101 | 		margin: 0px;
102 | 		padding: 0px;
103 | 	}
104 | 	.keys .left { text-align: left; }
105 | 	.keys .center { text-align: center; }
106 | 	.keys .right { text-align: right; }
107 | 	td b {
108 | 		font-family: "Times New Roman" Times serif;
109 | 	}
110 | 	#buttoncontainer {
111 | 		text-align: center;
112 | 		margin-bottom: 20px;
113 | 	}
114 | 	#buttoncontainer a, #buttoncontainer a:hover {
115 | 		border-bottom: 0px;
116 | 	}
117 | </style>
118 | <script type="text/javascript">
119 | 	function d(id) { return document.getElementById(id); }
120 | 
121 | 	function flipTab(n) {
122 | 		for (i=1;i<=4;i++) {
123 | 			c = d('content'+i.toString());
124 | 			t = d('tab'+i.toString());
125 | 			if ( n == i ) {
126 | 				c.className = '';
127 | 				t.className = 'current';
128 | 			} else {
129 | 				c.className = 'hidden';
130 | 				t.className = '';
131 | 			}
132 | 		}
133 | 	}
134 | 
135 |     function init() {
136 |         document.getElementById('version').innerHTML = tinymce.majorVersion + "." + tinymce.minorVersion;
137 |         document.getElementById('date').innerHTML = tinymce.releaseDate;
138 |     }
139 |     tinyMCEPopup.onInit.add(init);
140 | </script>
141 | </head>
142 | <body>
143 | 
144 | <div class="zerosize"></div>
145 | 
146 | <div id="wphead"><h1>After the Deadline</h1></div>
147 | 
148 | <ul id="tabs">
149 |    <li><a id="tab1" href="javascript:flipTab(1)" title="<% $1["rule"] %>" accesskey="1" tabindex="1" class="current"><% $1["rule"] %></a></li>
150 | <!--   <li><a id="tab2" href="javascript:flipTab(2)" title="About After the Deadline" accesskey="2" tabindex="2">About</a></li> -->
151 | </ul>
152 | 
153 | <div id="flipper" class="wrap">
154 |   <div id="content1">
155 |      <?sleep display("service/src/view/rule.slp", $1, $2); ?>
156 |   </div>
157 | <!--  <div id="content2" style="display: none">
158 |      <h1>About</h1>
159 |      <p>Some cool about text will go here later on...</p>
160 |   </div> -->
161 | </div>
162 | 
163 | <!-- <div class="mceActionPanel">
164 |         <div style="margin: 8px auto; text-align: center;padding-bottom: 10px;">
165 |                 <input type="button" id="cancel" name="cancel" value="Close" title="Close" onclick="tinyMCEPopup.close();" />
166 |         </div>
167 | </div> -->
168 | 
169 | 
170 | </body>
171 | </html>
172 | 
173 | 
174 | 
175 | 
176 | 
177 | 


--------------------------------------------------------------------------------