├── service
├── root
│ └── index.html
├── src
│ ├── data
│ ├── lang
│ ├── lib
│ ├── models
│ ├── view
│ │ ├── rules
│ │ │ ├── empty.slp
│ │ │ ├── nom.slp
│ │ │ ├── redundant.slp
│ │ │ ├── passive.slp
│ │ │ ├── avoid.slp
│ │ │ ├── bias.slp
│ │ │ ├── complex.slp
│ │ │ ├── ruleview.slp
│ │ │ ├── homophone.slp
│ │ │ ├── homophone2.slp
│ │ │ ├── category.slp
│ │ │ └── nomit.slp
│ │ ├── problem.slp
│ │ ├── metric.slp
│ │ ├── suggestions.slp
│ │ ├── rule.slp
│ │ ├── service.slp
│ │ ├── wordpress_gen.slp
│ │ ├── tinymce.slp
│ │ ├── error.slp
│ │ ├── quality.slp
│ │ ├── wordpress26.slp
│ │ └── wordpress.slp
│ └── local.sl
└── code
│ ├── compile.txt
│ ├── build.xml
│ └── src
│ └── org
│ └── dashnine
│ └── preditor
│ ├── GuessLanguage.java
│ ├── SortFromHash.java
│ └── LanguageModelSmall.java
├── data
└── rules
│ ├── nohomophone.txt
│ ├── grammar
│ ├── indef_uncount
│ ├── aux_modals
│ ├── personal_pronoun_case
│ ├── infinitives
│ ├── det_agreement_plural
│ ├── weare
│ ├── were
│ ├── an
│ ├── whose
│ ├── subject_verb_agreement
│ ├── contractedformnot
│ ├── dneg2
│ ├── possessive
│ ├── count
│ ├── lay
│ ├── its
│ ├── aux_noparticiple
│ ├── separate
│ ├── your
│ ├── too
│ ├── apostrophes
│ ├── their
│ ├── its2
│ ├── det_agreement
│ ├── repeats
│ ├── determiners
│ ├── combine
│ ├── aux_been_was
│ ├── comprised
│ └── aux_wrong_verb
│ ├── abbr.txt
│ ├── agreement
│ ├── plural.r
│ ├── single.r
│ ├── chunk_single.r
│ └── chunk_plural.r
│ ├── pronouns.txt
│ ├── complex
│ ├── been
│ └── misc
│ ├── prepositions.txt
│ ├── nomdb.txt
│ ├── avoiddb.txt
│ ├── hyphens.txt
│ ├── foreigndb.txt
│ ├── irregular_nouns.txt
│ ├── biasdb.txt
│ └── diacritic
│ └── diaeresis
├── lib
├── sleep.jar
├── cngram.jar
├── moconti.jar
├── spellutils.jar
├── object.sl
└── quality.sl
├── bin
├── quality.sh
├── dictgrep.sh
├── tagit.sh
├── testr.sh
├── corpuswp.sh
├── fixdata.sh
├── buildrules.sh
├── agreement.sh
├── transr.sh
├── amigo.sh
├── compilespelltools.sh
├── make3.sh
├── testgr.sh
├── corpus-lex-diff.sh
├── trainhomophones.sh
├── traintagger.sh
├── inspect.sh
├── smallmodel.sh
├── buildedits.sh
├── prepositions.sh
├── buildgrammarsets.sh
├── all.sh
├── trainspellnocontext.sh
├── buildmodel.sh
├── buildspelldata.sh
├── trainspellcontext.sh
├── buildhomodata.sh
└── buildtaggersets.sh
├── atdconfig.sl
├── README.txt
├── utils
├── bigrams
│ ├── printcorpus.sl
│ ├── contextprob.sl
│ ├── builddict.sl
│ ├── corpuswp.sl
│ ├── buildsmallmodel.sl
│ ├── fixgutenberg.sl
│ ├── inspect.sl
│ ├── qscore.sl
│ ├── amigo.sl
│ ├── corpus-lex-diff.sl
│ └── buildunigrams.sl
├── spelldata
│ ├── makesrc.sl
│ ├── process.sl
│ ├── maker.sl
│ ├── torules.sl
│ ├── bootstrapspell.sl
│ ├── gen2.sl
│ ├── gen3.sl
│ ├── gen.sl
│ └── gen4.sl
├── tagger
│ ├── tagit.sl
│ ├── fixtags.sl
│ ├── makebootstrap.sl
│ ├── makesentences.sl
│ └── postest.sl
├── common
│ ├── score.sl
│ ├── bywords.sl
│ ├── utils.sl
│ ├── hotest.sl
│ ├── homo.sl
│ ├── exp.sl
│ ├── spellcontext.sl
│ └── spelltests.sl
├── rules
│ ├── agreement.sl
│ ├── findprepositions.sl
│ ├── makespecial.sl
│ ├── transr.sl
│ ├── testr.sl
│ ├── makeprepositions.sl
│ └── testgr.sl
└── spell
│ ├── seededits.sl
│ └── definitions.sl
├── run-lowmem.bat
├── run.sh
├── run-lowmem.sh
├── CREDITS.rules.txt
├── CREDITS.txt
└── models
└── get_model_binaries.sh
/service/root/index.html:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/service/src/data:
--------------------------------------------------------------------------------
1 | ../../data
--------------------------------------------------------------------------------
/service/src/lang:
--------------------------------------------------------------------------------
1 | ../../lang
--------------------------------------------------------------------------------
/service/src/lib:
--------------------------------------------------------------------------------
1 | ../../lib
--------------------------------------------------------------------------------
/service/src/models:
--------------------------------------------------------------------------------
1 | ../../models
--------------------------------------------------------------------------------
/service/src/view/rules/empty.slp:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/service/src/view/rules/nom.slp:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/service/src/view/rules/redundant.slp:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/data/rules/nohomophone.txt:
--------------------------------------------------------------------------------
1 | me
2 | based
3 | we
4 |
--------------------------------------------------------------------------------
/service/src/local.sl:
--------------------------------------------------------------------------------
1 | # put local modifications to service here
2 |
--------------------------------------------------------------------------------
/lib/sleep.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Automattic/atd-server-next/HEAD/lib/sleep.jar
--------------------------------------------------------------------------------
/lib/cngram.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Automattic/atd-server-next/HEAD/lib/cngram.jar
--------------------------------------------------------------------------------
/lib/moconti.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Automattic/atd-server-next/HEAD/lib/moconti.jar
--------------------------------------------------------------------------------
/service/src/view/problem.slp:
--------------------------------------------------------------------------------
1 |
2 | <% $1 %>
3 |
4 |
--------------------------------------------------------------------------------
/lib/spellutils.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Automattic/atd-server-next/HEAD/lib/spellutils.jar
--------------------------------------------------------------------------------
/service/src/view/rules/passive.slp:
--------------------------------------------------------------------------------
1 |
2 | <% $1["rule"] %> - <% $2 %>
3 |
4 |
--------------------------------------------------------------------------------
/bin/quality.sh:
--------------------------------------------------------------------------------
1 | java -Xmx3328M -XX:+AggressiveHeap -XX:+UseParallelGC -jar lib/sleep.jar utils/bigrams/qscore.sl $1
2 |
--------------------------------------------------------------------------------
/service/src/view/rules/avoid.slp:
--------------------------------------------------------------------------------
1 | Translation:
2 |
5 |
--------------------------------------------------------------------------------
/bin/dictgrep.sh:
--------------------------------------------------------------------------------
1 | java -Xmx3384M -XX:+AggressiveHeap -XX:+UseParallelGC -jar lib/sleep.jar utils/bigrams/dictgrep.sl $1
2 |
--------------------------------------------------------------------------------
/bin/tagit.sh:
--------------------------------------------------------------------------------
1 | #
2 | java -Datd.lowmem=true -Xmx3384M -XX:+AggressiveHeap -XX:+UseParallelGC -jar lib/sleep.jar utils/tagger/tagit.sl $1
3 |
--------------------------------------------------------------------------------
/bin/testr.sh:
--------------------------------------------------------------------------------
1 | java -Datd.lowmem=true -Xmx3384M -XX:+AggressiveHeap -XX:+UseParallelGC -jar lib/sleep.jar utils/rules/testr.sl $1 $2
2 |
--------------------------------------------------------------------------------
/data/rules/grammar/indef_uncount:
--------------------------------------------------------------------------------
1 | a|an &uncountable .*/RP|VBZ|IN::word=\1 \2::filter=none
2 | a|an &uncountable 0END.0::word=\1::filter=none
3 |
4 |
--------------------------------------------------------------------------------
/service/src/view/metric.slp:
--------------------------------------------------------------------------------
1 |
2 | <% $1 %>
3 | <% $2 %>
4 | <% $3 %>
5 |
6 |
--------------------------------------------------------------------------------
/service/src/view/suggestions.slp:
--------------------------------------------------------------------------------
1 |
2 | $+ $1 $+ "); }, $1);
4 | ?>
5 |
--------------------------------------------------------------------------------
/bin/corpuswp.sh:
--------------------------------------------------------------------------------
1 | #
2 | # convert a WordPress WXR file to raw data suitable for use in the AtD corpus
3 | #
4 |
5 | java -Xmx3584M -jar lib/sleep.jar utils/bigrams/corpuswp.sl $1
6 |
--------------------------------------------------------------------------------
/bin/fixdata.sh:
--------------------------------------------------------------------------------
1 | #
2 | # do this once!
3 | #
4 |
5 | cd data
6 | tar zxf corpora.tgz
7 | cd ..
8 | java -Xmx1024M -jar lib/sleep.jar utils/bigrams/fixgutenberg.sl data/corpus_gutenberg
9 |
--------------------------------------------------------------------------------
/data/rules/grammar/aux_modals:
--------------------------------------------------------------------------------
1 | may|might|could|would .*/VBN|VBG::word=\0 \1:base::pivots=\1,\1:base
2 | may|might|could|would .*/VBZ::word=\0 \1:singular::pivots=\1,\1:singular
3 |
4 |
--------------------------------------------------------------------------------
/service/src/view/rules/bias.slp:
--------------------------------------------------------------------------------
1 | Replace <% $2 %> with
2 |
3 |
4 | '.$1.''); }, split(', ', $1["word"])) ?>
5 |
6 |
--------------------------------------------------------------------------------
/bin/buildrules.sh:
--------------------------------------------------------------------------------
1 | #
2 | # This script creates the AtD rules
3 | #
4 |
5 | java -Datd.lowmem=true -Xmx3536M -XX:+AggressiveHeap -XX:+UseParallelGC -jar lib/sleep.jar utils/rules/rules.sl
6 |
--------------------------------------------------------------------------------
/bin/agreement.sh:
--------------------------------------------------------------------------------
1 | cd data/rules/agreement
2 | java -jar ../../../lib/sleep.jar ../../../utils/rules/agreement.sl chunk_single.r single.r chunk_plural.r plural.r >../grammar/agreement
3 | cd ../../..
4 |
--------------------------------------------------------------------------------
/data/rules/grammar/personal_pronoun_case:
--------------------------------------------------------------------------------
1 | #
2 | # personal pronoun I is always uppercase.
3 | #
4 |
5 | i::word=I
6 | i'll::word=I'll
7 | i'm::word=I'm
8 | i've::word=I've
9 | i'd::word=I'd
10 |
--------------------------------------------------------------------------------
/service/src/view/rules/complex.slp:
--------------------------------------------------------------------------------
1 | Replace <% $2 %> with
2 |
3 |
4 | '.$1.''); }, suggestions2(split(", ", $1["word"]), $2)); ?>
5 |
6 |
--------------------------------------------------------------------------------
/bin/transr.sh:
--------------------------------------------------------------------------------
1 | #
2 | # run through a corpus and transform matching sentences using the specified rules.
3 | #
4 |
5 | java -Xmx3384M -XX:+AggressiveHeap -XX:+UseParallelGC -jar lib/sleep.jar utils/rules/transr.sl $1 $2
6 |
--------------------------------------------------------------------------------
/service/code/compile.txt:
--------------------------------------------------------------------------------
1 | To compile this code:
2 |
3 | 1. Create a symbolic link to the lib directory:
4 | ln -s ../../lib lib
5 |
6 | 2. Use Apache Ant to build everything
7 | ant clean
8 | ant
9 | cp spellutils.jar to lib
10 |
--------------------------------------------------------------------------------
/bin/amigo.sh:
--------------------------------------------------------------------------------
1 | # find homophones in corpus for a language
2 | # ./bin/amigo.sh [language]
3 |
4 | java -Xmx3840M -XX:+AggressiveHeap -XX:+UseParallelGC -Datd.lang=$1 -classpath lib/\* sleep.console.TextConsole utils/bigrams/amigo.sl
5 |
--------------------------------------------------------------------------------
/bin/compilespelltools.sh:
--------------------------------------------------------------------------------
1 | #
2 | # Compiles the Sleep methods ported to Java contained in service/code
3 | #
4 |
5 | cd service/code
6 | ln -s ../../lib/ lib
7 | ant clean
8 | ant
9 | mv spellutils.jar lib/spellutils.jar
10 | rm -f lib
11 | ant clean
12 |
--------------------------------------------------------------------------------
/atdconfig.sl:
--------------------------------------------------------------------------------
1 | #
2 | # configuration file for the Moconti app server
3 | #
4 |
5 | [$server addSite: "service.afterthedeadline.com",
6 | "service/src/site.sl",
7 | "service/root",
8 | ".",
9 | "key"];
10 |
--------------------------------------------------------------------------------
/README.txt:
--------------------------------------------------------------------------------
1 | After the Deadline - Open Source Language Checking Technology README
2 | ------------------
3 |
4 | Documentation on this code is at http://open.afterthedeadline.com
5 |
6 | See LICENSE.txt for license information. Enjoy the software.
7 |
8 | -- Raphael Mudge (rsmudge@gmail.com)
9 |
--------------------------------------------------------------------------------
/bin/make3.sh:
--------------------------------------------------------------------------------
1 | java -Xmx1024M -jar sleep.jar gen3.sl corpus2 homophones.txt ho_test_gutenberg_context.txt
2 | java -Xmx1024M -jar sleep.jar gen2.sl corpus2 homophones.txt ho_train_gutenberg_context.txt
3 | java -Xmx1024M -jar sleep.jar gen3.sl /home/raffi/spell/corpus homophones.txt ho_test_wp_context.txt
4 |
--------------------------------------------------------------------------------
/bin/testgr.sh:
--------------------------------------------------------------------------------
1 | java -Datd.lowmem=true -Xmx4048M -XX:+AggressiveHeap -XX:+UseParallelGC -jar lib/sleep.jar utils/rules/testgr.sl data/tests/grammar_wikipedia.txt
2 | java -Datd.lowmem=true -Xmx4048M -XX:+AggressiveHeap -XX:+UseParallelGC -jar lib/sleep.jar utils/rules/testgr.sl data/tests/grammar_gutenberg.txt
3 |
--------------------------------------------------------------------------------
/service/src/view/rule.slp:
--------------------------------------------------------------------------------
1 | <% invoke($1["recommendation"], @_) %>
2 |
3 | <% $1["description"] %> <% iff($1["source"] ne "", '(' . $1["source"] . ')') %>
4 |
5 |
6 |
--------------------------------------------------------------------------------
/data/rules/grammar/infinitives:
--------------------------------------------------------------------------------
1 | # infinitive phrases
2 | # http://www.chompchomp.com/terms/infinitivephrase.htm
3 |
4 | to is::filter=kill
5 | to .*/VBZ .*/DT|NN::word=\0 \1:base \2::pivots=\1,\1:base
6 | To .*/VBZ .*/DT|NN::word=\0 \1:base \2::pivots=\1,\1:base
7 |
8 | need|going|have|ought to .*/VBG::word=\0 \1 \2:base::pivots=\2,\2:base
9 |
--------------------------------------------------------------------------------
/bin/corpus-lex-diff.sh:
--------------------------------------------------------------------------------
1 | #
2 | # compare a corpus text file to the current wordlists and see what needs to be added
3 | #
4 |
5 | # to generate a wordlist suitable for the AtD wordlists directory:
6 | #
7 | # ./bin/corpus-lex-diff.sh filename.txt 50 wordlist
8 |
9 | java -Xmx3072M -jar lib/sleep.jar utils/bigrams/corpus-lex-diff.sl $1 $2 $3
10 |
--------------------------------------------------------------------------------
/utils/bigrams/printcorpus.sl:
--------------------------------------------------------------------------------
1 | include("lib/nlp.sl");
2 |
3 | $handle = openf(@ARGV[0]);
4 | $data = readb($handle, -1);
5 | closef($handle);
6 |
7 | foreach $paragraph (splitByParagraph($data))
8 | {
9 | println("PARAGRAPH BEGIN!");
10 |
11 | foreach $sentence ($paragraph)
12 | {
13 | println(" $sentence");
14 | }
15 | }
16 |
--------------------------------------------------------------------------------
/bin/trainhomophones.sh:
--------------------------------------------------------------------------------
1 | #
2 | # train and test the homophone misuse detection models
3 |
4 | java -Datd.lowmem=true -Xmx3840M -XX:+AggressiveHeap -XX:+UseParallelGC -jar lib/sleep.jar utils/spell/trainspell.sl trainHomophoneModels
5 | java -Datd.lowmem=true -Xmx3840M -XX:+AggressiveHeap -XX:+UseParallelGC -jar lib/sleep.jar utils/spell/test.sl runHomophoneTests
6 |
--------------------------------------------------------------------------------
/bin/traintagger.sh:
--------------------------------------------------------------------------------
1 | #
2 | # code to generate and evaluate the tagger models.
3 | #
4 |
5 | java -Xmx3072M -XX:+AggressiveHeap -XX:+UseParallelGC -jar lib/sleep.jar utils/tagger/postrain.sl wikipedia_sentences_tagged_f.txt
6 | java -Xmx3072M -XX:+AggressiveHeap -XX:+UseParallelGC -jar lib/sleep.jar utils/tagger/postest.sl data/gutenberg_sentences_tagged_f.txt
7 |
--------------------------------------------------------------------------------
/run-lowmem.bat:
--------------------------------------------------------------------------------
1 | @echo off
2 | REM
3 | REM startup script for AtD web service
4 | REM
5 | REM NOTE(review): assumes ATD_HOME is set in the environment before launch -- confirm
6 | java -Dfile.encoding=UTF-8 -XX:+AggressiveHeap -XX:+UseParallelGC -Datd.lowmem=true -Dbind.interface=127.0.0.1 -Dserver.port=1049 -Dsleep.classpath=%ATD_HOME%\lib;%ATD_HOME%\service\code -Dsleep.debug=24 -classpath .\lib\sleep.jar;.\lib\moconti.jar;.\lib\spellutils.jar httpd.Moconti atdconfig.sl
7 | 
--------------------------------------------------------------------------------
/data/rules/abbr.txt:
--------------------------------------------------------------------------------
1 | Mr
2 | Mrs
3 | No
4 | pp
5 | St
6 | no
7 | Dr
8 | Prof
9 | Sr
10 | Bros
11 | etc
12 | vs
13 | esp
14 | Fig
15 | fig
16 | Jan
17 | Feb
18 | Mar
19 | Apr
20 | Jun
21 | Jul
22 | Aug
23 | Sep
24 | Sept
25 | Oct
26 | Nov
27 | Dec
28 | Ph.D
29 | PhD
30 | Lt
31 | LT
32 | 2Lt
33 | 1Lt
34 | Capt
35 | Maj
36 | Col
37 | Gen
38 | Brig
39 | Sgt
40 | Esq
41 | i.e
42 | e.g
43 |
--------------------------------------------------------------------------------
/data/rules/agreement/plural.r:
--------------------------------------------------------------------------------
1 | *prefix* is a|the term|field::filter=kill::avoid=live, rest
2 | *prefix* is::word=*text* are, *transform*::filter=sane::avoid=live, rest
3 | *prefix* was::word=*text* were, *transform*::filter=sane::avoid=live, rest
4 | *prefix* doesn't::word=*text* don't, *transform*::filter=sane::avoid=live, rest
5 | *prefix* [a-z]+/VBZ::word=*text* \X:base, *transform*::filter=sane::avoid=live, rest
6 |
--------------------------------------------------------------------------------
/bin/inspect.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | #
3 | # run the AtD language-model inspection tool (utils/bigrams/inspect.sl)
4 | #
5 | 
6 | export PRODUCTION=/home/atd
7 | export ATD_HOME=/home/atd/atd
8 | export LOG_DIR=$ATD_HOME/logs
9 | 
10 | export LC_CTYPE=en_US.UTF-8
11 | export LANG=en_US.UTF-8
12 | 
13 | java -Datd.lowmem=true -Dfile.encoding=UTF-8 -Xmx3512M -XX:+AggressiveHeap -XX:+UseParallelGC -jar lib/sleep.jar utils/bigrams/inspect.sl
14 | 
15 | 
--------------------------------------------------------------------------------
/data/rules/grammar/det_agreement_plural:
--------------------------------------------------------------------------------
1 | #
2 | # determiner agreement rules for determiners expecting a plural noun
3 | #
4 |
5 | Both|Many|Several|Many|Few|Fewer|Two|Three|Four|Five|Six|Seven|Eight|Nine|Ten [a-z].*/NN [a-z].*ed/VBD::word=\0 \1:plural \2::pivots=\1,\1:plural
6 | both|these|those|us|many|several|few|fewer|two|three|four|five|six|seven|eight|nine|ten [a-z].*/NN [a-z].*ed/VBD::word=\0 \1:plural::pivots=\1,\1:plural
7 |
--------------------------------------------------------------------------------
/run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | #
3 | # startup script for AtD web service
4 | #
5 | 
6 | export ATD_HOME=.
7 | export LOG_DIR=$ATD_HOME/logs
8 | 
9 | java -server -Datd.lowmem=true -Dsleep.pattern_cache_size=8192 -Dbind.interface=127.0.0.1 -Dserver.port=1049 -Xmx3840M -XX:+AggressiveHeap -XX:+UseParallelGC -Dsleep.classpath=$ATD_HOME/lib:$ATD_HOME/service/code -Dsleep.debug=24 -classpath "$ATD_HOME/lib/*" httpd.Moconti atdconfig.sl
10 | 
--------------------------------------------------------------------------------
/data/rules/grammar/weare:
--------------------------------------------------------------------------------
1 | if were|where .*/VBN|RB|VBG::word=\0 we're \2::pivots=\1,we're
2 | what were|where .*/VBN::word=\0 we're \2::filter=none
3 | what were|where .*/RB|VBG::word=\0 we're \2::pivots=\1,we're
4 | since were|where .*/RB|VBN|VBG::word=\0 we're \2::pivots=\1,we're
5 | that were|where .*/VBG::word=\0 we're \2::pivots=\1,we're
6 | where were::word=where we're::pivots=were,we're
7 | we're are::word=we are, where are::pivots=we're,we,where
8 |
--------------------------------------------------------------------------------
/run-lowmem.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | #
3 | # startup script for AtD web service
4 | #
5 |
6 | export LC_CTYPE=en_US.UTF-8
7 | export LANG=en_US.UTF-8
8 |
9 | java -Dfile.encoding=UTF-8 -XX:+AggressiveHeap -XX:+UseParallelGC -Datd.lowmem=true -Dbind.interface=127.0.0.1 -Dserver.port=1049 -Dsleep.classpath=$ATD_HOME/lib:$ATD_HOME/service/code -Dsleep.debug=24 -classpath ./lib/sleep.jar:./lib/moconti.jar:./lib/spellutils.jar:./lib/* httpd.Moconti atdconfig.sl
10 |
--------------------------------------------------------------------------------
/service/src/view/service.slp:
--------------------------------------------------------------------------------
1 |
2 |
13 |
--------------------------------------------------------------------------------
/data/rules/pronouns.txt:
--------------------------------------------------------------------------------
1 | # Personal Pronouns:
2 | # subjective, objective, reflective, possessive pronoun, possessive determiner
3 | #
4 | # http://wapedia.mobi/en/English_personal_pronouns
5 |
6 | I, me, myself, mine, my
7 | we, us, ourselves, ours, our
8 | you, you, yourselves, yours, your
9 | he, him, himself, his, his
10 | she, her, herself, hers, her
11 | it, it, itself, its, its
12 | they, them, themselves, theirs, their
13 | who, whom, whose, whose
14 |
--------------------------------------------------------------------------------
/data/rules/grammar/were:
--------------------------------------------------------------------------------
1 | were are|is|did|will::word=where \1::pivots=were,where
2 | is also were::word=is also where::pivots=were,where
3 | were .*/EX .*/VBZ::word=where \1 \2::pivots=were,where
4 | is were::word=is where::pivots=were,where
5 | where .*/VBN::word=were \1::pivots=where,were
6 | were .*/VB|VBP::word=where \1::pivots=were,where
7 | we|they|I|he|she where .*/NNP|VBN::word=\0 were \2::pivots=where,were
8 | who where::word=who were::pivots=where,were::options=where,were
9 |
--------------------------------------------------------------------------------
/bin/smallmodel.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | # Create a language model for low-memory AtD
4 | #
5 | rm -f models/model.zip
6 | rm -rf tmp
7 | mkdir tmp
8 | java -Dfile.encoding=UTF-8 -Xmx3840M -XX:+AggressiveHeap -XX:+UseParallelGC -jar lib/sleep.jar utils/bigrams/buildsmallmodel.sl
9 | cd tmp
10 |
11 | # we're using this instead of zip because zip on some systems creates corrupt
12 | # zip files when dealing with as many files as we have... get the JDK out.
13 | jar -cf ../models/model.zip . 1>/dev/null
14 | cd ..
15 |
--------------------------------------------------------------------------------
/data/rules/complex/been:
--------------------------------------------------------------------------------
1 | has been .*ing and .*ing::word=\0 \2:participle and \4:participle
2 | have been .*ing::word=\0 \2:participle
3 | has been .*ing::word=\0 \2:participle
4 | had been .*ing::word=\0 \2:participle
5 | They've|they've been .*ing::word=\0 \2:participle
6 | You've|you've been .*ing::word=\0 \2:participle
7 | I've been .*ing::word=\0 \2:participle
8 | We've|we've been .*ing::word=\0 \2:participle
9 | should've been .*ing::word=\0 \2:participle
10 | could've been .*ing::word=\0 \2:participle
11 | would've been .*ing::word=\0 \2:participle
12 |
--------------------------------------------------------------------------------
/data/rules/grammar/an:
--------------------------------------------------------------------------------
1 | #
2 | # these rules pick up when a/an are misused
3 | #
4 |
5 | # killing errors related to an indef article with a number
6 | # have to solve the problem with hundreds, teens, etc.
7 | An|A|a|an [\d+]\w+::filter=kill
8 | a|an|A|An RPG|RSS|XSS|SEC::filter=kill
9 |
10 | a/.* [aeiouyhAEIOUYH18]\w+/.*::filter=indefarticle::word=an \1
11 | an/.* [^aeiAEIMNRSX8]\w+/.*::filter=indefarticle::word=a \1
12 | 0BEGIN.0 A/.* [aeiouyhAEIOUYH18]\w+/.*::filter=indefarticle::word=An \1
13 | 0BEGIN.0 An/.* [^aeiAEIMNRSX8]\w+/.*::filter=indefarticle::word=A \1
14 |
--------------------------------------------------------------------------------
/data/rules/grammar/whose:
--------------------------------------------------------------------------------
1 | Who's|who's .*ing::filter=kill
2 |
3 | who's .*/NN::word=whose \1::pivots=who's,whose
4 | whose .*/DT::word=who's \1::pivots=whose,who's
5 | Who's .*/NN::word=Whose \1::pivots=Who's,Whose
6 | Whose .*/DT::word=Who's \1::pivots=Whose,Who's
7 |
8 | about who's::word=about whose::pivots=who's,whose::options=who's,whose
9 | who's actual::word=whose actual::pivots=who's,whose::options=who's,whose
10 | who's name::word=whose name::pivots=who's,whose::options=who's,whose
11 | who's previous::word=whose previous::pivots=who's,whose::options=who's,whose
12 |
--------------------------------------------------------------------------------
/lib/object.sl:
--------------------------------------------------------------------------------
1 | # everything you need for Sleep OO
2 | sub object
3 | {
4 | local('$function');
5 | $function = function("& $+ $type $+ :: $+ $0");
6 | if ($function !is $null)
7 | {
8 | return invoke($function, @_, $0, $this => $this);
9 | }
10 | throw "$type $+ :: $+ $0 - no such method";
11 | }
12 |
13 | sub newObject
14 | {
15 | local('$object');
16 | $object = lambda(&object, $type => $1);
17 | # invoke the constructor
18 | invoke($object, sublist(@_, 1), "init", $this => $object);
19 | return $object;
20 | }
21 |
--------------------------------------------------------------------------------
/bin/buildedits.sh:
--------------------------------------------------------------------------------
1 | #
2 | # seed the edits model
3 | # This model is nothing more than a cache of potential edits for common word misspellings. The purpose is to speed up processing. AtD uses an LRU cache
4 | # when running to track and grow this information. The seeding is done because the edits operation is so expensive that having this information available
5 | # makes training, testing, and warm up time significantly faster.
6 | #
7 | java -Xmx3840M -XX:+AggressiveHeap -XX:+UseParallelGC -jar lib/sleep.jar utils/spell/seededits.sl sp_test_aspell_nocontext.txt sp_test_wpcm_nocontext.txt
8 |
--------------------------------------------------------------------------------
/utils/bigrams/contextprob.sl:
--------------------------------------------------------------------------------
1 | #
2 | # a tool to inspect the language model
3 | #
4 |
5 | import org.dashnine.preditor.* from: lib/spellutils.jar;
6 | use(^SpellingUtils);
7 |
8 | # misc junk
9 | include("lib/dictionary.sl");
10 | global('$__SCRIPT__ $model $rules $dictionary $network $dsize %edits $hnetwork $account $usage $endings $lexdb $trigrams $verbs');
11 | $model = get_language_model();
12 | $dictionary = dictionary();
13 | $dsize = size($dictionary);
14 |
15 | $total = 0L;
16 | foreach $word ($dictionary) {
17 | $total += count($word);
18 | }
19 |
20 | println($total);
21 |
--------------------------------------------------------------------------------
/data/rules/grammar/subject_verb_agreement:
--------------------------------------------------------------------------------
1 | # I rules with corrections
2 |
3 | 0BEGIN.0 I is|be|are::word=I am::pivots=\1,am
4 |
5 | # You rules with corrections
6 |
7 | We|They|You is|am::word=\0 are::pivots=\1,are
8 | We|They|You was::word=\0 were::pivots=\1,were
9 | 0BEGIN.0 I has::word=\0 have::pivots=\1,have
10 | We|They|You has::word=\0 have::pivots=\1,have
11 |
12 | # He/She rules with corrections
13 |
14 | 0BEGIN.0 I were::word=\0 was::pivots=\1,was
15 | He|She were::word=\0 was::pivots=\1,was
16 | He|She have::word=\0 has::pivots=\1,has
17 | He|She be|am|are::word=\0 is::pivots=\1,is
18 |
19 |
--------------------------------------------------------------------------------
/service/src/view/rules/ruleview.slp:
--------------------------------------------------------------------------------
1 | ">
2 |
6 |
<% $1["rule"] %>: <% $1["text"] %>
7 |
8 |
<% $1["description"] %> ("><% $1["source"] %>)
9 |
10 |
--------------------------------------------------------------------------------
/data/rules/grammar/contractedformnot:
--------------------------------------------------------------------------------
1 | ain't not::word=\0::filter=none
2 | aren't not::word=\0::filter=none
3 | can't not::word=\0::filter=none
4 | couldn't not::word=\0::filter=none
5 | didn't not::word=\0::filter=none
6 | doesn't not::word=\0::filter=none
7 | don't not::word=\0::filter=none
8 | hasn't not::word=\0::filter=none
9 | isn't not::word=\0::filter=none
10 | mightn't not::word=\0::filter=none
11 | mustn't not::word=\0::filter=none
12 | shan't not::word=\0::filter=none
13 | shouldn't not::word=\0::filter=none
14 | weren't not::word=\0::filter=none
15 | won't not::word=\0::filter=none
16 | wouldn't not::word=\0::filter=none
17 |
--------------------------------------------------------------------------------
/service/src/view/rules/homophone.slp:
--------------------------------------------------------------------------------
1 |
14 |
15 | Review definitions:
16 |
17 |
18 | '.$1.'
' . %homodict[$1] . ''); }, map({ return iff($1 in %homodict, $1, baseVerb($1)); }, split(', ', $1["word"]))); ?>
19 |
20 |
21 |
--------------------------------------------------------------------------------
/bin/prepositions.sh:
--------------------------------------------------------------------------------
1 | #
2 | # code to generate rules for prepositions
3 | #
4 |
5 | echo '#' >data/rules/grammar/prepositions
6 | echo '# This file is automatically generated by ./bin/prepositions.sh - do not edit' >> data/rules/grammar/prepositions
7 | echo '#' >> data/rules/grammar/prepositions
8 |
9 | java -Dfile.encoding=UTF-8 -Xmx3840M -XX:+AggressiveHeap -XX:+UseParallelGC -jar lib/sleep.jar utils/rules/findprepositions.sl >preps.tmp
10 | java -Dfile.encoding=UTF-8 -Xmx3840M -XX:+AggressiveHeap -XX:+UseParallelGC -jar lib/sleep.jar utils/rules/makeprepositions.sl preps.tmp >>data/rules/grammar/prepositions
11 | rm -f preps.tmp
12 |
--------------------------------------------------------------------------------
/service/src/view/rules/homophone2.slp:
--------------------------------------------------------------------------------
1 |
14 |
15 | Review definitions:
16 |
17 |
18 | '.$1.'
' . %homodict[$1] . ''); }, map({ return iff($1 in %homodict, $1, baseVerb($1)); }, split(',', $1["options"]))); ?>
19 |
20 |
21 |
--------------------------------------------------------------------------------
/bin/buildgrammarsets.sh:
--------------------------------------------------------------------------------
1 | #
2 | # build grammar corpora
3 | #
4 |
5 | if [ -f wp.txt ]
6 | then
7 |
8 | java -jar lib/sleep.jar utils/spelldata/torules.sl wrong >rules.out
9 |
10 | # make the grammar rules files
11 |
12 | java -jar lib/sleep.jar utils/spelldata/maker.sl rules.out data/wikipedia_sentences.txt >data/tests/grammar_wikipedia.txt
13 | java -jar lib/sleep.jar utils/spelldata/maker.sl rules.out data/gutenberg_sentences.txt >data/tests/grammar_gutenberg.txt
14 |
15 | rm -f rules.out
16 |
17 | else
18 | echo "No wp.txt file is present, cut and paste Wikipedia Common Errors List to wp.txt and try again"
19 |
20 | fi
21 |
--------------------------------------------------------------------------------
/data/rules/grammar/dneg2:
--------------------------------------------------------------------------------
1 | #
2 | # Style Double Negatives
3 | #
4 |
5 | not a|an unifable|unified|uniformed|unifying|united|undulated|undulating|universalized|universalised|unrest|(.*?der)|university|understood|understanding::filter=kill
6 | not unifable|unified|uniformed|unifying|united|undulated|undulating|universalized|universalised|unrest|(.*?der)|university|understood|understanding::filter=kill
7 |
8 | not a|an un[aeiouy].*::word=an \2:positive
9 | not a|an un[^aeiouy].*::word=a \2:positive
10 | not un.*::word=\1:positive
11 |
12 | # another double negative rule. Changes the meaning of the sentence but is easier to understand
13 | dont have|need no::word=\0 \1 any::pivots=no,any
14 |
--------------------------------------------------------------------------------
/bin/all.sh:
--------------------------------------------------------------------------------
#
# all.sh: master build script -- compiles the spell tools, builds the NLP
# models and rules, then trains the spelling and homophone models.
# Steps are order-dependent; do not reorder them.
#

# NOTE(review): the inline comment below says "don't do this" yet the command
# runs unconditionally -- the comment may be stale; confirm before changing.
./bin/compilespelltools.sh # don't do this as the build box doesn't have ant on it (yet)

#
# set some vars that may help the cause.
#
export LC_CTYPE=en_US.UTF-8
export LANG=en_US.UTF-8

#
# build the foundational NLP models
#
./bin/buildmodel.sh
#./bin/buildtaggersets.sh # do not uncomment this

#
# intermediate stuff
#
./bin/buildrules.sh
./bin/testgr.sh
./bin/buildedits.sh

#
# train various models
#
#./bin/traintagger.sh # no good reason to do this unless tagger data changes
./bin/trainspellcontext.sh
./bin/trainspellnocontext.sh
./bin/trainhomophones.sh
29 |
--------------------------------------------------------------------------------
/service/src/view/rules/category.slp:
--------------------------------------------------------------------------------
1 | box" name="<% $1 %>">
2 |
<% $2["rule"] %>
3 |
4 |
<% $2["description"] %> <% iff($2["source"] ne "", '('.$2["source"].')') %>
5 |
6 |
7 | $1), @exclude))
11 | {
12 | println('- ');
13 | println(' '.$ex["text"].'');
14 | println('
');
15 | }
16 | ?>
17 |
18 |
19 |
--------------------------------------------------------------------------------
/data/rules/grammar/possessive:
--------------------------------------------------------------------------------
1 | #
2 | # errors related to possession vs. plural
3 | #
4 |
5 | Your|your|My|my|Their|their|Her|her|His|his|That|The|the|that [a-z].*/NNS .*/NN::word=\0 \1:possessive \2::pivots=\1,\1:possessive::filter=nextonly
6 | with|a|an|With|A|an [a-z].*/NNS .*/NN::word=\0 \1:possessive \2::pivots=\1,\1:possessive::filter=nextonly
7 |
8 | before|after|in|before|during|at [a-z].*s/NNS [a-z].*/NN::word=\0 \1:possessive \2::pivots=\1,\1:possessive::filter=nextonly
9 | Before|After|In|Before|During|At [a-z].*s/NNS [a-z].*/NN::word=\0 \1:possessive \2::pivots=\1,\1:possessive::filter=nextonly
10 | about|in|for|on|with [a-z].*s/NNS [a-z].*/NN::word=\0 \1:possessive \2::pivots=\1,\1:possessive::filter=nextonly
11 |
12 |
--------------------------------------------------------------------------------
/utils/bigrams/builddict.sl:
--------------------------------------------------------------------------------
1 | #
2 | # This is a script to generate a spellchecker dictionary using the specified threshold. It's fun stuff.
3 | #
4 | # java -jar sleep.jar builddict.sl threshold models/model.bin models/dictionary.txt
5 | #
6 |
7 | debug(7 | 34);
8 |
9 | import org.dashnine.preditor.* from: lib/spellutils.jar;
10 | use(^SpellingUtils);
11 |
12 | include("lib/dictionary.sl");
13 |
sub main
{
    # $1 = harvest threshold, $2 = model file (optional), $3 = output file (optional)
    global('$model $threshold $handle $index $1 $2 $3');
    $model = get_language_model($2);

    # write to the default dictionary unless an explicit output file was given.
    # BUGFIX: the original tested "$2 is $null" here, which ignored the output
    # argument ($3) documented in the usage line whenever a model was specified.
    $handle = openf(iff($3 is $null, ">models/dictionary.txt", "> $+ $3"));

    # harvest every word meeting the threshold and dump one per line
    printAll($handle, [SleepUtils getArrayWrapper: [$model harvest: int($1)]]);

    closef($handle);
}
25 |
26 | invoke(&main, @ARGV);
27 |
--------------------------------------------------------------------------------
/CREDITS.rules.txt:
--------------------------------------------------------------------------------
The AtD rule set was inspired by many resources and projects around the web.
2 | The following resources were particularly helpful:
3 |
4 | LanguageTool Open Source Language Checker
5 | http://www.languagetool.org
6 |
7 | PlainLanguage.gov
8 | http://www.plainlanguage.gov
9 |
10 | GNU Style and Diction
11 | http://www.gnu.org/software/diction/diction.html
12 |
13 | Wikipedia
14 | http://en.wikipedia.org/wiki/Category:Wikipedia_style_guidelines
15 | http://en.wikipedia.org/wiki/Wikipedia:Lists_of_common_misspellings
16 | (and many other lists...)
17 |
18 | Graviax Grammar Checker
19 | http://graviax.sourceforge.net/
20 |
21 | Cliches: Avoid Them Like the Plague
22 | http://suspense.net/whitefish/cliche.htm
23 |
24 | WordNet - Lexical Database for English
25 | http://wordnet.princeton.edu/
26 |
--------------------------------------------------------------------------------
/utils/spelldata/makesrc.sl:
--------------------------------------------------------------------------------
1 | #
2 | # transform the homophonesdb file into something our other scripts can handle
3 | # using the bad\ngood format.
4 |
5 | ($inh, $outh) = @ARGV;
6 |
7 | $handle = openf("models/dictionary.txt");
8 | putAll(%dictionary, readAll($handle), { return 1; });
9 | closef($handle);
10 |
11 | $handle = openf($inh);
12 | @data = readAll($handle);
13 | closef($handle);
14 |
15 | $handle = openf("> $+ $outh");
16 | foreach $d (@data)
17 | {
18 | @words = split(',\s*', $d);
19 | foreach $w1 (@words)
20 | {
21 | foreach $w2 (@words)
22 | {
23 | if ($w1 ne $w2 && $w1 in %dictionary && $w2 in %dictionary)
24 | {
25 | println($handle, "$w2");
26 | println($handle, "$w1");
27 | }
28 | }
29 | }
30 | }
31 | closef($handle);
32 |
--------------------------------------------------------------------------------
/utils/bigrams/corpuswp.sl:
--------------------------------------------------------------------------------
1 | #
2 | # Export posts (only!) from a WordPress WXR file and make the content as plain text as possible.
3 | # use this to preprocess a file for adding to data/corpus_extra
4 | #
5 |
6 | $handle = openf(@ARGV[0]);
7 | $data = readb($handle, -1);
8 | closef($handle);
9 |
10 | $data = join(' ', split("\n|\r", $data));
11 | @data = matches($data, '\\<\!\[CDATA\[(.*?)\]\]\>\');
12 |
13 | foreach $index => $data (@data)
14 | {
15 | if (strlen($data) > 0)
16 | {
17 | $data = strrep($data, '&', '&', ' ', ' ', '
', "\n", '', "\n", '"e;', '"', '“', "'", '”', "'", '’', "'", '«', '"', '»', '"', '’', "'");
18 | $data = replace($data, '(<[^>]*?>)', '');
19 | println($data);
20 | }
21 | }
22 |
--------------------------------------------------------------------------------
/bin/trainspellnocontext.sh:
--------------------------------------------------------------------------------
1 | #
2 | # train and test the spellchecker models
3 | #
4 |
5 | java -Datd.lowmem=true -Xmx3536M -XX:+AggressiveHeap -XX:+UseParallelGC -jar lib/sleep.jar utils/spell/trainspell.sl trainNoContext
6 |
7 | echo "=== NON-CONTEXTUAL DATA ======================================================================="
8 |
9 | java -Datd.lowmem=true -Xmx3536M -XX:+AggressiveHeap -XX:+UseParallelGC -jar lib/sleep.jar utils/spell/test.sl runSpellingTest sp_test_aspell_nocontext.txt
10 | java -Datd.lowmem=true -Xmx3536M -XX:+AggressiveHeap -XX:+UseParallelGC -jar lib/sleep.jar utils/spell/test.sl runSpellingTest sp_test_wpcm_nocontext.txt
11 |
12 | # normal spelling test
13 | #java -Xmx1024M -jar lib/sleep.jar utils/spell/test.sl runSpellingTest tests1.txt
14 | #java -Xmx1024M -jar lib/sleep.jar utils/spell/test.sl runSpellingTest tests2.txt
15 |
--------------------------------------------------------------------------------
/utils/tagger/tagit.sl:
--------------------------------------------------------------------------------
1 | # this script simply tags sentences in a file. it assumes each setence is on a line by itself.
2 |
3 | include("lib/engine.sl");
4 | include("utils/rules/rules.sl");
5 |
# set up the language model, homophone network, verb data, and tagger models.
# Results are shared through the globals declared here with the engine code
# pulled in by the include()s at the top of this script.
sub initAll
{
    global('$__SCRIPT__ $model $rules $dictionary $network $dsize %edits $hnetwork $account $usage $endings $lexdb $trigrams $verbs');
    $model = get_language_model();
    $dsize = size($dictionary);
    $hnetwork = get_network("hnetwork.bin");
    $verbs = loadVerbData();
    initTaggerModels();
}
15 |
# tag every sentence in the file named by $1 (one sentence per line) and
# print each tagged sentence to stdout.
sub main
{
    local('$handle $sentence @results @past');

    initAll();

    $handle = openf($1);
    while $sentence (readln($handle))
    {
        println(taggerToString(taggerWithTrigrams(splitIntoWords($sentence))));
    }

    # BUGFIX: the original never closed the input file handle
    closef($handle);
}
28 |
29 | invoke(&main, @ARGV);
30 |
--------------------------------------------------------------------------------
/utils/tagger/fixtags.sl:
--------------------------------------------------------------------------------
# read a tagged corpus file (word/TAG tokens, one sentence per line) named by
# $1, lowercase each word, and re-attach contraction fragments (tokens
# containing an apostrophe) to the previous token so split contractions become
# a single entry. The fixed sentences are printed to stdout.
sub main
{
    local('$handle $sentence $entry $word $tag @s');
    $handle = openf($1);
    while $sentence (readln($handle))
    {
        @s = @();

        foreach $entry (split(' ', $sentence))
        {
            ($word, $tag) = split('/', $entry);
            if ("'" isin $word && size(@s) > 0)
            {
                if ($tag eq "''")
                {
                    # a bare quote tag carries no information; glue the text on only
                    @s[-1] = @(@s[-1][0] . $word, @s[-1][1]);
                }
                else
                {
                    # merge the fragment's tag into the previous token's tag list
                    @s[-1] = @(@s[-1][0] . $word, @s[-1][1] . ',' . $tag);
                }
            }
            else
            {
                push(@s, @(lc($word), $tag));
            }
        }
        println( join(" ", map({ return join('/', $1); }, @s)) );
    }

    # BUGFIX: the original never closed the input file handle
    # (also dropped the unused local $previous)
    closef($handle);
}
32 |
33 | invoke(&main, @ARGV);
34 |
--------------------------------------------------------------------------------
/data/rules/prepositions.txt:
--------------------------------------------------------------------------------
1 | about
2 | above
3 | according to
4 | across
5 | after
6 | against
7 | along
8 | along with
9 | among
10 | apart from
11 | around
12 | as
13 | as for
14 | at
15 | because of
16 | before
17 | behind
18 | below
19 | beneath
20 | beside
21 | between
22 | beyond
23 | but*
24 | by
25 | by means of
26 | concerning
27 | despite
28 | down
29 | during
30 | except
31 | except for
32 | excepting
33 | for
34 | from
35 | in
36 | in addition to
37 | in back of
38 | in case of
39 | in front of
40 | in place of
41 | inside
42 | in spite of
43 | instead of
44 | into
45 | like
46 | near
47 | next
48 | of
49 | off
50 | on
51 | onto
52 | on top of
53 | out
54 | out of
55 | outside
56 | over
57 | past
58 | regarding
59 | round
60 | since
61 | through
62 | throughout
63 | till
64 | to
65 | toward
66 | under
67 | underneath
68 | unlike
69 | until
70 | up
71 | upon
72 | up to
73 | with
74 | within
75 | without
76 |
--------------------------------------------------------------------------------
/utils/spelldata/process.sl:
--------------------------------------------------------------------------------
#
# merge several spelling-error data sources into a single sorted dataset.
# output.txt gets bad/good pairs on alternating lines; output2.txt gets only
# the bad words.
#

global('%dataset');

# read a file of alternating lines (bad word, then its correction) into %dataset
sub loadPairFile
{
    local('$handle $bad $good');
    $handle = openf($1);
    while $bad (readln($handle))
    {
        $good = readln($handle);
        %dataset[$bad] = $good;
    }
    closef($handle);
}

# read a whitespace-separated "bad good" file into %dataset
# (refactored: this logic was duplicated for each batch file in the original)
sub loadTabFile
{
    local('$handle $text $bad $good');
    $handle = openf($1);
    while $text (readln($handle))
    {
        ($bad, $good) = split('\s+', $text);
        %dataset[$bad] = $good;
    }
    closef($handle);
}

loadPairFile("spelling.txt");
loadTabFile("batch0.tab");
loadTabFile("batch0.tab.1");

$handle = openf(">output.txt");
$handle2 = openf(">output2.txt");

# emit the merged dataset in sorted order by bad word
@bads = sorta(keys(%dataset));
foreach $bword (@bads)
{
    println($handle, $bword);
    println($handle2, $bword);
    println($handle, %dataset[$bword]);
}

closef($handle);
closef($handle2);
45 |
--------------------------------------------------------------------------------
/bin/buildmodel.sh:
--------------------------------------------------------------------------------
1 | #
2 | # This script creates the AtD bigram model (corpus.zip)
3 | #
4 |
5 | java -version
6 |
7 | rm -f models/model.bin
8 |
9 | java -Dfile.encoding=UTF-8 -Xmx6840M -XX:+AggressiveHeap -XX:+UseParallelGC -jar lib/sleep.jar utils/bigrams/buildcorpus.sl data/corpus_gutenberg
10 | java -Dfile.encoding=UTF-8 -Xmx6840M -XX:+AggressiveHeap -XX:+UseParallelGC -jar lib/sleep.jar utils/bigrams/buildcorpus.sl data/corpus_wikipedia
11 | java -Dfile.encoding=UTF-8 -Xmx6840M -XX:+AggressiveHeap -XX:+UseParallelGC -jar lib/sleep.jar utils/bigrams/buildcorpus.sl data/corpus_extra
12 |
13 | # build dictionary (make sure it's done *after* zipping)
14 |
15 | java -Dfile.encoding=UTF-8 -Xmx6840M -XX:NewSize=512M -jar lib/sleep.jar utils/bigrams/builddict.sl 2
16 |
17 | # create the not misspelled dictionary...
18 |
19 | cp data/wordlists/accented.txt models/not_misspelled.txt
20 |
21 | # create LM for low-memory AtD
22 | ./bin/smallmodel.sh
23 |
--------------------------------------------------------------------------------
/service/src/view/wordpress_gen.slp:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | <% $1["rule"] %>
5 |
35 |
36 |
37 |
38 | <% $1["rule"] %>
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
--------------------------------------------------------------------------------
/bin/buildspelldata.sh:
--------------------------------------------------------------------------------
#
# generate contextual spelling test data sets from the corpora
#

# gen2 <corpus dir> <non-context test file> <output file>
gen2() {
	java -Xmx2536M -XX:NewSize=512M -jar lib/sleep.jar utils/spelldata/gen2.sl "$@"
}

gen2 data/corpus_gutenberg data/tests/sp_test_wpcm_nocontext.txt data/tests/sp_test_gutenberg_context1.txt
gen2 data/corpus_gutenberg data/tests/sp_test_aspell_nocontext.txt data/tests/sp_test_gutenberg_context2.txt

gen2 data/corpus_wikipedia data/tests/sp_test_wpcm_nocontext.txt data/tests/sp_test_wp_context1.txt
gen2 data/corpus_wikipedia data/tests/sp_test_aspell_nocontext.txt data/tests/sp_test_wp_context2.txt

#java -Xmx2536M -XX:NewSize=512M -jar lib/sleep.jar utils/spelldata/gen2.sl data/corpus_gutenberg data/tests/train.txt data/tests/sp_train_gutenberg_context.txt
#echo "are * blind|you, oyu" >>data/tests/sp_train_gutenberg_context.txt
10 |
--------------------------------------------------------------------------------
/data/rules/grammar/count:
--------------------------------------------------------------------------------
1 | # lowercase
2 | fewer &uncountable::word=less \1::filter=indefarticle
3 | &uncountable or fewer::word=\0 or less::filter=none
4 | few &uncountable::word=little \1::filter=indefarticle
5 |
6 | the less::filter=die
7 | less &uncountable::filter=kill
8 | less .*/NNS::word=fewer \1::filter=indefarticle
9 |
10 | little people::filter=kill
11 | little &uncountable::word=few \1::filter=indefarticle
12 |
13 | # uppercase
14 | Fewer &uncountable::word=Less \1::filter=indefarticle
15 | Few &uncountable::word=Little \1::filter=indefarticle
16 |
17 | The less::filter=die
18 | Less &uncountable::filter=kill
19 | Less .*/NNS::word=Fewer \1::filter=indefarticle
20 |
21 | Little people::filter=kill
22 | Little &uncountable::word=Few \1::filter=indefarticle
23 |
24 | # hide situations where the uncountable noun is used as an adjective
25 | # (e.g., water snails)
26 | few|fewer|Few|Fewer &uncountable .*/NNS::filter=kill
27 | little|Little &uncountable .*/NNS::filter=kill
28 |
29 |
--------------------------------------------------------------------------------
/bin/trainspellcontext.sh:
--------------------------------------------------------------------------------
1 | #
2 | # train and test the spellchecker models
3 | #
4 |
5 | java -Datd.lowmem=true -Xmx3536M -XX:+AggressiveHeap -XX:+UseParallelGC -jar lib/sleep.jar utils/spell/trainspell.sl trainWithContext
6 |
7 | echo "=== CONTEXTUAL DATA ==========================================================================="
8 |
9 | java -Datd.lowmem=true -Xmx3536M -XX:+AggressiveHeap -XX:+UseParallelGC -jar lib/sleep.jar utils/spell/test.sl runSpellingContextTest sp_test_wp_context1.txt
10 | java -Datd.lowmem=true -Xmx3536M -XX:+AggressiveHeap -XX:+UseParallelGC -jar lib/sleep.jar utils/spell/test.sl runSpellingContextTest sp_test_wp_context2.txt
11 | java -Datd.lowmem=true -Xmx3536M -XX:+AggressiveHeap -XX:+UseParallelGC -jar lib/sleep.jar utils/spell/test.sl runSpellingContextTest sp_test_gutenberg_context1.txt
12 | java -Datd.lowmem=true -Xmx3536M -XX:+AggressiveHeap -XX:+UseParallelGC -jar lib/sleep.jar utils/spell/test.sl runSpellingContextTest sp_test_gutenberg_context2.txt
13 |
14 |
--------------------------------------------------------------------------------
/CREDITS.txt:
--------------------------------------------------------------------------------
1 | After the Deadline uses the following libraries:
2 |
lib/cngram.jar
4 | http://ngramj.sourceforge.net/
5 |
6 | ngramj is a language guessing library for Java. It's licensed under the LGPL.
7 | I modified it by packaging a language profile for Indonesian:
8 | http://blog.afterthedeadline.com/2010/02/08/n-gram-language-guessing-with-ngramj/
9 |
10 | lang/lib/languagetool
11 | http://www.languagetool.org
12 |
13 | Language Tool is a rule-based language checking program. It's licensed under the LGPL.
14 | No modifications to Language Tool were made.
15 |
16 | lang/*/wordlists/*.utf8.txt
17 |
18 | Several dictionaries were extracted from the Open Office dictionaries page and converted
19 | to their normal form using unmunch and then converted to UTF8 by me.
20 |
The licenses for the original source files include GPL, LGPL, MPL (Mozilla Public
License), and Creative Commons ShareAlike licenses.
23 |
24 | http://wiki.services.openoffice.org/wiki/Dictionaries
25 |
--------------------------------------------------------------------------------
/utils/tagger/makebootstrap.sl:
--------------------------------------------------------------------------------
# bootstrap tagger training data: run the Stanford POS tagger over a corpus
# file (one sentence per line) and print each tagged sentence to stdout.
#
# usage: java ... makebootstrap.sl <maxent tagger model> <sentence file>

debug(7 | 34);

import java.util.List;
import java.io.BufferedReader;
import java.io.FileReader;

import edu.stanford.nlp.ling.Sentence from: stanford-postagger-2008-09-28.jar;
import edu.stanford.nlp.ling.TaggedWord from: stanford-postagger-2008-09-28.jar;
import edu.stanford.nlp.ling.HasWord from: stanford-postagger-2008-09-28.jar;
import edu.stanford.nlp.tagger.maxent.MaxentTagger from: stanford-postagger-2008-09-28.jar;

# NOTE(review): $x, $semaphore, and @array appear unused in this script
global('$x $semaphore $handle $file @array');

$semaphore = semaphore();
$handle = openf(@ARGV[1]);
$file = @ARGV[0];

sub doit
{
    local('$taggedLine $tagger $text $sentence');

    # load the maxent model named by the first argument
    $tagger = [new MaxentTagger: $file];

    while $text (readln($handle))
    {
        # insert a space before each apostrophe so contraction fragments
        # become separate tokens, then split on whitespace for the tagger
        $sentence = [Sentence toSentence: cast(split('\s+', strrep($text, "'", " '")), ^String)];
        $taggedLine = [$tagger tagSentence: $sentence];
        println([$taggedLine toString: 0]);
    }
}

doit();
33 |
--------------------------------------------------------------------------------
/service/src/view/tinymce.slp:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | <% $1["rule"] %>
5 |
6 |
7 |
8 |
9 |
10 |
15 |
20 |
21 |
26 |
27 |
28 |
29 |
--------------------------------------------------------------------------------
/data/rules/grammar/lay:
--------------------------------------------------------------------------------
1 | # http://grammar.quickanddirtytips.com/lay-versus-lie.aspx
2 |
3 | #
4 | # Confused word: laid
5 | #
6 |
7 | laid ahead::word=lay ahead::pivots=laid,lay::options=laid,lay
8 |
9 | #
10 | # Confused word: lay
11 | #
12 |
13 | lay around::word=lie around::pivots=lay,lie::options=lay,lie
14 | lay low::word=lie low::pivots=lay,lie::options=lay,lie
15 |
16 | #
17 | # Confused word: laying
18 | #
19 |
20 | laying around::word=lying around::pivots=laying,lying::options=laying,lying
21 | laying low::word=lying low::pivots=laying,lying::options=laying,lying
22 |
23 | #
24 | # Confused word: lays
25 | #
26 |
27 | lays atop::word=lies atop::pivots=lays,lies::options=lays,lies
28 | lays beside::word=lies beside::pivots=lays,lies::options=lays,lies
29 | lays low::word=lies low::pivots=lays,lies::options=lays,lies
30 | lays near::word=lies near::pivots=lays,lies::options=lays,lies
31 | lays on::word=lies on::pivots=lays,lies::options=lays,lies
32 |
33 | #
34 | # Confused word: lain
35 | #
36 |
37 | was lain::word=was laid::pivots=lain,laid::options=lain,laid
38 | were lain::word=were laid::pivots=lain,laid::options=lain,laid
39 |
--------------------------------------------------------------------------------
/data/rules/grammar/its:
--------------------------------------------------------------------------------
1 | # yes, I know some parts of this rule are redundant with others--why mess with a working formula
2 | Its .*/JJ|JJS .*/NN .*/TO|PRP|NNP::word=It's \1 \2 \3::filter=none
3 | Its .*/JJ|JJS .*/TO|PRP|NNP::word=It's \1 \2::filter=none
4 | Its .*/JJ|JJS .*/NN a|an|that|because|as::word=It's \1 \2 \3::filter=none
5 | Its .*/JJ|JJS a|an|that|because|as::word=It's \1 \2::filter=none
6 | Its .*/JJ for::word=It's \1 \2::filter=none
7 |
8 | Its .*/RB|DT::word=It's \1::filter=none
9 |
10 | its .*/DT|IN|MD|POS|PP|WDT|WP|WRB::name=its rule::word=it's \1::filter=none
11 | its .*/CC|RB::name=its rule::word=it's \1::pivots=its,it's
12 | Its .*/DT|IN|MD|POS|PP|WDT|WP|WRB::name=its rule::word=It's \1::filter=none
13 | Its .*/CC|RB::name=its rule::word=It's \1::pivots=Its,It's
14 | its .*ed/VBN|VBD .*/NN|NNS::word=it's \1 \2::filter=kill
15 | its .*ed/VBN|VBD::word=it's \1::pivots=its,it's
16 | Its .*ed/VBN|VBD .*/NN|NNS::word=it's \1 \2::filter=kill
17 | Its .*ed/VBN|VBD::word=it's \1::pivots=its,it's
18 |
19 | its not::word=it's not::pivots=its,it's
20 |
21 |
22 | its .*/VBG .*/NN|NNS::filter=kill
23 | its .*/VBG::word=it's \1::pivots=\0,it's
24 |
25 |
26 |
--------------------------------------------------------------------------------
/service/src/view/rules/nomit.slp:
--------------------------------------------------------------------------------
1 | $+ $2 $+ with ");
9 |
10 | $o = map({ return " $+ $1 $+ "; }, split(', ', $option));
11 | $o = filter(lambda({ if ($1 !in %nodupes) { %nodupes[$1] = 1; return $1; } }, %nodupes => %()), $o);
12 |
13 | if (size($o) == 1)
14 | {
15 | print($o[0]);
16 | }
17 | else
18 | {
19 | print([(join(",", sublist($o, 0, -1)) . " or " . $o[-1]) trim]);
20 | }
21 |
22 | println(".");
23 | }
24 | else
25 | {
26 | println("You should revise $+ $2 $+ to bring out the verb.");
27 | }
28 | ?>
29 |
30 |
31 |
Revision Examples
32 |
33 |
Before: Bonuses are based on the performance of the company.
34 |
After: Bonuses are based on how the company performs.
35 |
36 |
Before: An Explanation of Hidden Verbs.
37 |
After: Hidden Verbs Explained.
38 |
--------------------------------------------------------------------------------
/utils/bigrams/buildsmallmodel.sl:
--------------------------------------------------------------------------------
1 | #
2 | # convert the large language model to pieces that we can load as needed
3 | #
4 | debug(7 | 34);
5 |
6 | import org.dashnine.preditor.* from: lib/spellutils.jar;
7 | use(^SpellingUtils);
8 |
9 | # misc junk
10 | include("lib/dictionary.sl");
11 | global('$__SCRIPT__ $model $rules $dictionary $network $dsize %edits $hnetwork $account $usage $endings $lexdb $trigrams $verbs');
12 | $model = get_language_model();
13 |
14 | sub main {
15 | local('$handle $x $entry $wid $file');
16 | $handle = openf(">models/stringpool.bin");
17 | writeObject($handle, [$model getStringPool]);
18 | writeObject($handle, [$model count]);
19 | closef($handle);
20 |
21 | # make the necessary directories
22 | mkdir("tmp");
23 | for ($x = 0; $x < 512; $x++) {
24 | mkdir("tmp/ $+ $x");
25 | }
26 |
27 | # create each individual entry
28 | foreach $entry ([[[$model getStringPool] entrySet] iterator]) {
29 | $wid = [$entry getValue];
30 | $file = getFileProper("tmp", $wid % 512, $wid);
31 | $handle = openf("> $+ $file");
32 | writeAsObject($handle, [[$model getLanguageModel] get: $wid]);
33 | closef($handle);
34 | }
35 | }
36 |
37 | invoke(&main, @ARGV);
38 |
--------------------------------------------------------------------------------
/utils/common/score.sl:
--------------------------------------------------------------------------------
1 | #
2 | # code for the score object
3 | #
4 |
5 | sub sortScores
6 | {
7 | return [$1 value] <=> [$2 value];
8 | }
9 |
10 | sub score::init
11 | {
12 | this('$desc $count $fneg $fpos $correct $sugg');
13 | ($desc) = @_;
14 | }
15 |
16 | sub score::record
17 | {
18 | $count++;
19 | }
20 |
21 | sub score::falseNegative
22 | {
23 | $fneg++;
24 | }
25 |
26 | sub score::falsePositive
27 | {
28 | $fpos++;
29 | }
30 |
31 | sub score::correct
32 | {
33 | $correct++;
34 | }
35 |
36 | sub score::correctSugg
37 | {
38 | $sugg++;
39 | }
40 |
41 | sub score::value
42 | {
43 | return (double($correct) / $count);
44 | }
45 |
46 | sub score::print
47 | {
48 | println("Report for $desc");
49 | println("Correct: " . ((double($correct) / $count) * 100.0));
50 |
51 | if ($sugg != 0)
52 | {
53 | println("Suggestion Acc: " . ((double($sugg) / $count) * 100.0));
54 | println("-" x 20);
55 | }
56 | if ($fneg != 0)
57 | {
58 | println("False Negative: " . ((double($fneg) / $count) * 100.0));
59 | }
60 | if ($fpos != 0)
61 | {
62 | println("False Positive: " . ((double($fpos) / $count) * 100.0));
63 | }
64 | }
65 |
66 |
67 |
--------------------------------------------------------------------------------
/bin/buildhomodata.sh:
--------------------------------------------------------------------------------
#
# build the homophone training and test data sets
#

# generate the source data (confusion pairs restricted to dictionary words)
rm -rf tmp
mkdir tmp
java -jar lib/sleep.jar utils/spelldata/makesrc.sl data/rules/homophonedb.txt tmp/homophones.txt

#
# build with parts-of-speech (trailing number is the sampling parameter passed to gen4.sl)
#
java -jar lib/sleep.jar utils/spelldata/gen4.sl tmp/homophones.txt data/wikipedia_sentences.txt data/tests/ho_test_wp_pos_context.txt 15
java -jar lib/sleep.jar utils/spelldata/gen4.sl tmp/homophones.txt data/gutenberg_sentences.txt data/tests/ho_test_gutenberg_pos_context.txt 15

# was 8
java -jar lib/sleep.jar utils/spelldata/gen4.sl tmp/homophones.txt data/gutenberg_sentences.txt data/tests/ho_train_gutenberg_pos_context.txt 6

#
# build without parts-of-speech
#
java -Xmx2536M -XX:NewSize=512M -jar lib/sleep.jar utils/spelldata/gen3.sl data/corpus_gutenberg tmp/homophones.txt data/tests/ho_test_gutenberg_context.txt
java -Xmx2536M -XX:NewSize=512M -jar lib/sleep.jar utils/spelldata/gen2.sl data/corpus_gutenberg tmp/homophones.txt data/tests/ho_train_gutenberg_context.txt
java -Xmx2536M -XX:NewSize=512M -jar lib/sleep.jar utils/spelldata/gen3.sl data/corpus_wikipedia tmp/homophones.txt data/tests/ho_test_wp_context.txt

# clean up the intermediate files
rm -rf tmp
22 |
--------------------------------------------------------------------------------
/data/rules/agreement/single.r:
--------------------------------------------------------------------------------
*prefix* are::word=*text* is, *transform*::avoid=police, sheep, will, cannot, i, read, majority, half, might, let, let's::filter=sane
*prefix* were::word=*text* was, *transform*::avoid=police, sheep, will, cannot, i, read, majority, half, might, let, let's::filter=sane
*prefix* don't::word=*text* doesn't, *transform*::avoid=police, sheep, will, cannot, i, read, majority, half, might, let, let's::filter=sane
*prefix* [a-z]+/VBP::word=*text* \X:plural, *transform*::avoid=police, sheep, will, cannot, i, read, majority, half, might, let::filter=sane
*prefix* be::filter=kill
*prefix* by::filter=kill
*prefix* [a-z]+/VB is::filter=kill
*prefix* [a-z]+/VB of|for::filter=kill
*prefix* [a-z]+/VB [a-z]+/VBD|VBZ::filter=kill
*prefix* [a-z]+/VB::word=*text* \X:plural, *transform*::avoid=police, sheep, will, cannot, i, read, majority, half, might, let, let's::filter=sane
*prefix* are [a-z]+/VBN::filter=kill
*prefix* [a-z]+/MD::filter=kill
One|Two|Three|Four|Five|Six|Seven|Eight|Nine|Ten|Eleven|Twelve|Thirteen|Fourteen|Fifteen|Sixteen|Seventeen|Eighteen|Nineteen|Twenty|Thirty|Forty|Fourty|Fifty|Sixty|Seventy|Eighty|Ninety|Ninenty dollars|pounds|points|feet|inches|meters::filter=kill
14 |
--------------------------------------------------------------------------------
/utils/bigrams/fixgutenberg.sl:
--------------------------------------------------------------------------------
1 | #
2 | # this program fixes the gutenberg corpus by looping through each file and collapsing paragraphs onto a single line.
3 | # this will lead to a more accurate language model which is a really good thing.
4 | #
5 | # do not do this twice or bad things will happen!!!!
6 | #
7 |
8 | sub fixFile
9 | {
10 | local('$handle $buffer $text $data');
11 |
12 | # read the file and populate our buffer please
13 |
14 | $buffer = allocate(lof($1));
15 | $handle = openf($1);
16 | while $text (readln($handle))
17 | {
18 | if ($text eq "")
19 | {
20 | print($buffer, "\n");
21 | }
22 | else
23 | {
24 | print($buffer, "$text ");
25 | }
26 | }
27 | closef($handle);
28 | closef($buffer);
29 |
30 | # read the contents of the buffer in
31 |
32 | $data = readb($buffer, -1);
33 | closef($buffer);
34 |
35 | # transfer the contents of the buffer to
36 |
37 | $handle = openf("> $+ $1");
38 | writeb($handle, $data);
39 | closef($handle);
40 | }
41 |
42 |
43 | map({
44 | if (-isDir $1)
45 | {
46 | map($this, ls($1));
47 | }
48 | else
49 | {
50 | fixFile($1);
51 | }
52 | }, @ARGV);
53 |
54 | println("Corpus Prepared");
55 |
--------------------------------------------------------------------------------
/utils/rules/agreement.sl:
--------------------------------------------------------------------------------
1 | #
2 | # make a super rule file based on the chunker
3 | #
4 |
5 | sub fix {
6 | local('$s $c $t');
7 | $s = split('\s+', $1);
8 | foreach $c => $t ($s) {
9 | $t = "\\ $+ $c";
10 | }
11 | return join(" ", $s);
12 | }
13 |
14 | sub count {
  | # Return the backreference token ("\N") for the slot after pattern $1:
  | # N is the number of whitespace-delimited tokens in $1 plus offset $2.
  | # (Dropped the unused $c/$t locals the original declared.)
15 | local('$s');
16 | $s = split('\s+', $1);
17 | return "\\" . (size($s) + $2);
18 | }
19 |
20 | sub noempties {
  | # filter() predicate: keep line $1 only when it is non-blank after trimming.
21 | return iff(strlen([$1 trim]) > 0, $1);
22 | }
23 |
24 | sub makeData {
  | # Parse a "prefix::transform" line into the tuple consumed by main():
  | # @(prefix, backreference pattern, \N for the next slot, transform
  | # suffix (or ""), \N+1 for the slot after that).
25 | local('$a $b');
26 | ($a, $b) = split('::', $1);
  | # pass the offset explicitly - count() takes (pattern, offset), and every
  | # other call site supplies both arguments
27 | if (strlen($b) > 0) { $b = ", $b " . count($a, 0); }
28 | return @($a, fix($a), count($a, 0), $b, count($a, 1));
29 | }
30 |
31 | sub main {
  | # $1: prefix file, one "pattern::transform" entry per non-blank line.
  | # $2: rule template file; each template line is expanded once per prefix
  | #     entry, substituting *prefix*, *text*, \X, \Y and *transform*.
  | # Expanded rules are written to standard output via printAll().
32 | local('$handle @prefixes @rules $rule');
33 | $handle = openf($1);
34 | @prefixes = map(&makeData, filter(&noempties, readAll($handle)));
35 | closef($handle);
36 |
37 | $handle = openf($2);
38 | @rules = readAll($handle);
39 | closef($handle);
40 |
41 | foreach $rule (@rules) {
42 | printAll(map(lambda({ return '0BEGIN.0 ' . strrep($rule, '*prefix*', $1[0], '*text*', $1[1], '\\X', $1[2], '\\Y', $1[4], ', *transform*', $1[3]); }, \$rule), @prefixes));
43 | }
44 |
  | # also emit a kill rule for each bare prefix so the base pattern itself
  | # is suppressed by the rule engine
45 | printAll(map({ return '0BEGIN.0 ' . $1[0] . "::filter=kill"; }, @prefixes));
46 | }
47 |
  | # main() runs twice: once with the 3rd/4th arguments, once with the
  | # 1st/2nd, so one run processes two (prefix file, template file) pairs.
  | # NOTE(review): if fewer than four arguments are supplied the first call
  | # sees missing file names - confirm callers always pass two pairs.
48 | invoke(&main, sublist(@ARGV, 2));
49 | invoke(&main, @ARGV);
50 |
51 |
--------------------------------------------------------------------------------
/utils/common/bywords.sl:
--------------------------------------------------------------------------------
1 | #
2 | # this class measures how often the trigram tagger picks the correct word,
3 | # broken down by confused word; used to generate the homobias data
4 | #
5 |
6 | sub byword::init
  | # Set up the per-word accuracy table.  %data maps a word to a "score"
  | # object; the miss policy lazily creates one, labeled with the key,
  | # the first time an unseen word is indexed.
7 | {
8 | this('%data');
9 |
10 | %data = ohash();
11 | setMissPolicy(%data,
12 | {
  | # $2 is the missing key (the word) - use it as the score's label
13 | return newObject("score", "$2");
14 | });
15 | }
16 |
17 | sub byword::process
  | # Arguments: the correct word, the word as written, the candidate set,
  | # two preceding context entries (word/tag pairs), and the next word.
  | # Tags the candidate set in context; when the tagger actually
  | # distinguishes the candidates, records under its best guess whether
  | # that guess matched the correct word.
18 | {
19 | local('$correct $wrong $wrongs $pre2 $pre1 $next @temp $nbase $tbase $solution $all %scores');
20 | ($correct, $wrong, $wrongs, $pre2, $pre1, $next) = @_;
21 |
22 | $all = tagAll($pre2[1], $pre1[1], $pre1[0], $wrongs);
23 |
24 | if (isDifferent($all))
25 | {
26 | $solution = getBest($all)[0];
27 | if ($solution eq $correct)
28 | {
29 | [%data[$solution] correct];
30 | }
  | # always count the attempt, right or wrong, against the chosen word
31 | [%data[$solution] record];
32 | }
33 | }
34 |
35 | sub byword::finish
  | # Print every per-word score, ordered by sortScores.
36 | {
37 | map({ [$1 print]; }, sort(&sortScores, values(%data)));
38 | }
39 |
40 | sub byword::save
  | # Reduce each score object to its numeric value (in place, via the $value
  | # alias) and serialize the resulting word -> value table to
  | # models/bywords.bin.
41 | {
42 | local('$key $value $handle');
43 | foreach $key => $value (%data)
44 | {
45 | $value = [$value value];
46 | # warn("$key -> $value");
47 | }
48 |
49 | $handle = openf(">models/bywords.bin");
50 | writeObject($handle, %data);
51 | closef($handle);
52 | println("Model saved");
53 | }
54 |
--------------------------------------------------------------------------------
/service/code/build.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
--------------------------------------------------------------------------------
/service/src/view/error.slp:
--------------------------------------------------------------------------------
1 |
2 | <% $1 %>
3 | <% $2["rule"] %>
4 | <% iff($3 ne "0BEGIN.0" && $3 !isin ',()-[];:/--', $3) %>
5 | 0)
7 | {
8 | display("service/src/view/suggestions.slp", $4);
9 | }
10 | ?>
11 |
25 | http://service.afterthedeadline.com/info.slp?text='.[java.net.URLEncoder encode: $1].'');
38 | println('' . $INFOURL . '/info.slp?text='.[java.net.URLEncoder encode: $1].'&tags='.[java.net.URLEncoder encode: join('/', map({ return $1[1]; }, @tags))].'&engine='.$6.'');
39 | }
40 | ?>
41 |
42 |
43 |
--------------------------------------------------------------------------------
/utils/rules/findprepositions.sl:
--------------------------------------------------------------------------------
1 | #
2 | # find (word, preposition) pairs whose bigram association in the language
3 | # model is strong (probability above 0.50), in either direction
4 | #
5 | import org.dashnine.preditor.* from: lib/spellutils.jar;
6 | use(^SpellingUtils);
7 |
8 | # misc junk
9 | include("lib/dictionary.sl");
10 | global('$__SCRIPT__ $model $rules $dictionary $network $dsize %edits $hnetwork $account $usage $endings $lexdb $trigrams $verbs');
11 | $model = get_language_model();
12 | $dictionary = dictionary();
13 | $dsize = size($dictionary);
14 |
15 | global('@prepositions');
  | # keep only single-word prepositions (drop any entry containing a space)
16 | @prepositions = filter({ return iff(indexOf($1, ' ') is $null, $1); }, map({ return [$1 trim]; }, `cat data/rules/prepositions.txt`));
17 |
  | # walk dictionary words by descending corpus frequency
18 | foreach $word (sort({ return count($2) <=> count($1); }, keys($dictionary)))
19 | {
  | # ignore rare words - their bigram estimates are unreliable
20 | if (count($word) < 100)
21 | {
22 | continue;
23 | }
24 |
25 | foreach $preposition (@prepositions)
26 | {
27 | # Pnext(preposition|word)
28 | if (Pbigram1($word, $preposition) > 0.50)
29 | {
30 | println("$word $+ , $preposition : Pbigram1( $+ $word $+ , $preposition $+ ) = " . Pbigram1($word, $preposition));
31 | }
32 | # Pprev(preposition|word)
33 | else if (Pbigram2($preposition, $word) > 0.50)
34 | {
35 | println("$word $+ , $preposition : Pbigram2( $+ $preposition $+ , $word $+ ) = " . Pbigram2($preposition, $word));
36 | }
37 | }
38 | }
39 |
--------------------------------------------------------------------------------
/data/rules/nomdb.txt:
--------------------------------------------------------------------------------
1 | an NOM of|with|from
2 | a NOM of|with|from
3 | in the NOM with|of
4 | in NOM with|of
5 | the NOM with|of|from
6 | come to a|an|the NOM
7 | came to a|an|the NOM
8 | make a|an|the NOM
9 | makes a|an|the NOM
10 | making a|an|the NOM
11 | made a|an|the NOM
12 | do a|an|the NOM
13 | did a|an|the NOM
14 | does a|an|the NOM
15 | doesn't a|an|the NOM
16 | give a|an|the NOM
17 | given a|an|the NOM
18 | have a|an|the NOM
19 | has a|an|the NOM
20 | had a|an|the NOM
21 | having a|an|the NOM
22 | have a|an|the NOM
23 | achieve a|an|the NOM
24 | achieved a|an|the NOM
25 | be NOM
26 | provided a|an|the NOM
27 | perform a|an|the NOM
28 | performed a|an|the NOM
29 | conduct a|an|the NOM
30 | conducted a|an|the NOM
31 | accomplish a|an|the NOM
32 | accomplished a|an|the NOM
33 | achieved a|an|the NOM
34 | attained a|an|the NOM
35 | carry out a|an|the NOM
36 | carried out a|an|the NOM
37 | conduct a|an|the NOM
38 | conducted a|an|the NOM
39 | effected a|an|the NOM
40 | experienced a|an|the NOM
41 | experience a|an|the NOM
42 | facilitated a|an|the NOM
43 | given a|an|the NOM
44 | implemented a|an|the NOM
45 | indicate a|an|the NOM
46 | indicated a|an|the NOM
47 | involve a|an|the NOM
48 | involved a|an|the NOM
49 | made a|an|the NOM
50 | obtained a|an|the NOM
51 | occurred a|an|the NOM
52 | performed a|an|the NOM
53 | proceeded a|an|the NOM
54 | produced a|an|the NOM
55 | required a|an|the NOM
56 | require a|an|the NOM
57 |
--------------------------------------------------------------------------------
/utils/spell/seededits.sl:
--------------------------------------------------------------------------------
1 | #
2 | # this is a script to run unit tests and calculate the effectiveness of the
3 | # preditor engine
4 | #
5 |
  | # raise debug verbosity (trace flags OR'd together)
6 | debug(debug() | 7 | 34);
7 |
  | # pull in every helper script under utils/common
8 | map({ iff('*.sl' iswm $1, include($1)); }, ls("utils/common"));
9 |
10 | include("lib/engine.sl");
11 | include("lib/object.sl");
12 |
  | # load the language model, dictionary and its trie once, up front
13 | global('$dictionary $model $dsize $trie');
14 | $model = get_language_model();
15 | $dictionary = dictionary();
16 | $trie = trie($dictionary);
17 | $dsize = size($dictionary);
18 |
19 | sub seedFile
  | # For each (bad, good) word pair in test file $1: generate the edit
  | # candidates for the bad word (cached in %edits) and score whether the
  | # good word appears in that candidate pool.  Prints the accuracy score.
20 | {
21 | local('$score $good $bad $word');
22 |
23 | $score = newObject("score", "Word pool accuracy: $1");
24 |
25 | while $word (words($1))
26 | {
27 | ($bad, $good) = $word;
28 |
  | # compute candidates once per misspelling; %edits doubles as the cache
  | # that gets serialized to models/edits.bin below
29 | if ($bad !in %edits)
30 | {
31 | %edits[$bad] = editst($dictionary, $trie, $bad); # filterByDictionary($bad, $dictionary);
32 | }
33 |
34 | if ($good in %edits[$bad])
35 | {
36 | [$score correct];
37 | }
38 | else
39 | {
40 | # println("$bad -> $good ".editDistance($bad, $good)." is not in " . %edits[$bad]);
41 | }
42 | [$score record];
43 | }
44 |
45 | [$score print];
46 | }
47 |
  | # seed the edit cache from every test file on the command line, then
  | # persist it so the engine can reuse the precomputed candidate pools
48 | global('%edits $handle');
49 | %edits = ohasha();
50 |
51 | map(&seedFile, @ARGV);
52 |
53 | $handle = openf(">models/edits.bin");
54 | writeObject($handle, %edits);
55 | closef($handle);
56 |
57 | println("Edits flushed!");
58 |
--------------------------------------------------------------------------------
/service/code/src/org/dashnine/preditor/GuessLanguage.java:
--------------------------------------------------------------------------------
1 | package org.dashnine.preditor;
2 |
3 | import sleep.bridges.*;
4 | import sleep.runtime.*;
5 | import sleep.interfaces.*;
6 |
7 | import java.util.*;
8 |
9 | import de.spieleck.app.cngram.NGramProfiles;
10 |
11 | /** Utilities for the Sleep Spellchecker used in AtD: guesses the language
  |  *  of a text sample using character n-gram profiles and exposes the
  |  *  check to Sleep scripts as the &guessLanguage function. */
12 | public class GuessLanguage implements Loadable, Function
13 | {
  | // loaded once per JVM; stays null only if profile loading failed
14 | private static NGramProfiles profiles = null;
15 | static
16 | {
17 | try
18 | {
19 | profiles = new NGramProfiles();
20 | }
  | // swallow so the class still loads; guessLanguage() reports the
  | // failure explicitly instead of dying with a NullPointerException
21 | catch (Exception ex) { ex.printStackTrace(); }
22 | }
23 |
  | /** Guess the language of the given text.  Only the first 1024
  |  *  characters are considered, bounding the work per request.
  |  *  @param text the sample to classify
  |  *  @return the name of the best-ranked language profile
  |  *  @throws IllegalStateException if the n-gram profiles failed to load */
24 | public String guessLanguage(String text)
25 | {
  | if (profiles == null)
  | throw new IllegalStateException("NGramProfiles failed to load; cannot guess language");
  |
26 | if (text.length() > 1024)
27 | text = text.substring(0, 1024);
28 |
29 | NGramProfiles.Ranker ranker = profiles.getRanker();
30 | ranker.account(text);
31 | NGramProfiles.RankResult result = ranker.getRankResult();
32 | return result.getName(0);
33 | }
34 |
  | /** Sleep bridge: &guessLanguage("text"); defaults to "" with no argument. */
35 | public Scalar evaluate(String name, ScriptInstance script, Stack args)
36 | {
37 | return SleepUtils.getScalar(guessLanguage(BridgeUtilities.getString(args, "")));
38 | }
39 |
  | /** Registers &guessLanguage in the loading script's environment. */
40 | public void scriptLoaded(ScriptInstance script)
41 | {
42 | script.getScriptEnvironment().getEnvironment().put("&guessLanguage", this);
43 | }
44 |
45 | public void scriptUnloaded(ScriptInstance script)
46 | {
47 |
48 | }
49 | }
50 |
--------------------------------------------------------------------------------
/data/rules/avoiddb.txt:
--------------------------------------------------------------------------------
1 | All reasonable men think I believe
2 | As is well known I think
3 | As mentioned earlier This is superfluous
4 | As you know You probably do not know
5 | Critics claim I claim
6 | Experience shows that My experience shows
7 | For obvious reasons I have no evidence
8 | I don't know if you You are ignorant
9 | I don't want to bore you This statement is boring
10 | I heard that I don't have a reliable source
11 | I wouldn't hesitate to recommend I recommend
12 | If you will Please, pretty please, I'm begging you
13 | It has been decided that I decided that
14 | It has been mentioned that I say
15 | It is evident that I think
16 | It is generally agreed that Some people think
17 | It is known that I think
18 | It is likely that I do not have good enough evidence
19 | It is not necessary to stress the fact I should not need to tell you
20 | It is perhaps true to say I do not know what to think
21 | People say I say
22 | Popular wisdom has it that I think
23 | So far as we know We could be wrong
24 | Tentative conclusions Possibilities
25 | The most typical example The example that best suits my purpose
26 | There is evidence that I don't have good evidence
27 | There is no doubt that I am convinced
28 | To be honest with you Up to this point, I have not told the truth
29 | To tell you the truth Up to this point, I have not told the truth
30 | Would you object to Here is my suggestion
31 | You probably never heard of You are ignorant
32 | if you will Please, pretty please, I'm begging you
33 |
--------------------------------------------------------------------------------
/utils/bigrams/inspect.sl:
--------------------------------------------------------------------------------
1 | #
2 | # a tool to inspect the language model
3 | #
  | # interactive REPL: type 1, 2, 3 or 5 whitespace-separated words and get
  | # the corresponding n-gram statistics from the loaded language model
4 |
5 | debug(7 | 34);
6 |
7 | import org.dashnine.preditor.* from: lib/spellutils.jar;
8 | use(^SpellingUtils);
9 |
10 | # misc junk
11 | include("lib/dictionary.sl");
12 | global('$__SCRIPT__ $model $rules $dictionary $network $dsize %edits $hnetwork $account $usage $endings $lexdb $trigrams $verbs');
13 | $model = get_language_model();
14 | $dictionary = dictionary();
15 | $dsize = size($dictionary);
16 |
17 | print("> ");
18 |
19 | while $command (readln())
20 | {
21 | @temp = split('\s+', $command);
  | # five words: score the two overlapping trigrams around the middle word
22 | if (size(@temp) == 5)
23 | {
24 | println("Trigram 1: " . sublist(@temp, 0, 3) . " = " . Ptrigram(@temp[0], @temp[1], @temp[2]));
25 | println("Trigram 2: " . sublist(@temp, 2, 5) . " = " . Ptrigram2(@temp[2], @temp[3], @temp[4]));
26 | }
  | # three words: score the trigram with both estimators
27 | else if (size(@temp) == 3)
28 | {
29 | println("Trigram 1: " . @temp . " = " . Ptrigram(@temp[0], @temp[1], @temp[2]));
30 | println("Trigram 2: " . @temp . " = " . Ptrigram2(@temp[0], @temp[1], @temp[2]));
31 | }
  | # two words: bigram probability in both directions
32 | else if (size(@temp) == 2)
33 | {
34 | println("Bigram b, a->b " . @temp . " = " . Pbigram1(@temp[0], @temp[1]) );
35 | println("Bigram b, b<-a " . @temp . " = " . Pbigram2(@temp[0], @temp[1]) );
36 | }
  | # one word: unigram probability and raw corpus count
37 | else if (size(@temp) == 1)
38 | {
39 | println("Unigram " . @temp . " = " . Pword(@temp[0]));
40 | println("Count " . @temp . " = " . count(@temp[0]));
41 | }
42 |
43 | print("> ");
44 | }
45 |
--------------------------------------------------------------------------------
/service/src/view/quality.slp:
--------------------------------------------------------------------------------
1 |
2 | $data (%metrics)
29 | {
30 | if ($data > 0.0)
31 | {
32 | ($type, $name) = split('\.', $metric);
33 | display("service/src/view/metric.slp", $type, $name, $data);
34 | }
35 | }
36 | ?>
37 |
--------------------------------------------------------------------------------
/data/rules/hyphens.txt:
--------------------------------------------------------------------------------
1 | # seeded from http://en.wikipedia.org/wiki/Wikipedia:Lists_of_common_misspellings/Grammar_and_Misc
2 |
3 | day to day::word=day-to-day
4 | out and out::word=out-and-out
5 | out of door::word=out-of-door
6 | out of doors::word=out-of-doors
7 | out of the way::word=out-of-the-way
8 | out of band::word=out-of-band
9 | out of bounds::word=out-of-bounds
10 | out of town::word=out-of-town
11 | out of state::word=out-of-state
12 | out of wedlock::word=out-of-wedlock
13 | out of pocket::word=out-of-pocket
14 | out of order::word=out-of-order
15 | out of place::word=out-of-place
16 | part time::word=part-time
17 | full time::word=full-time
18 | 1|2|3|4|5|6|7|8|9|10|11|12|13|14|15|16|17|18|19|20|21|22|23|24|25|26|27|28|29|30|31|32|33|34|35|36|37|38|39|40|41|42|43|44|45|46|47|48|49|50|51|52|53|54|55|56|57|58|59|60|61|62|63|64|65|66|67|68|69|70|71|72|73|74|75|76|77|78|79|80|81|82|83|84|85|86|87|88|89|90|91|92|93|94|95|96|97|98|99 year old::word=\0-year-old
19 | 100|200|250|500 year old::word=\0-year-old
20 | right|left handed::word=\0-handed
21 | case sensitive::word=case-sensitive
22 | case insensitive::word=case-insensitive
23 | award winning::word=award-winning
24 | out of body::word=out-of-body
25 | runner up::word=runner-up
26 | commander in chief::word=commander-in-chief
27 | win win::word=win-win
28 | win lose::word=win-lose
29 | lose lose::word=lose-lose
30 | built in::word=built-in
31 | ebook::word=e-book
32 | ereader::word=e-reader
33 | click throughs::word=click-throughs
34 | click through::word=click-through
35 | high five::word=high-five
36 | high fived::word=high-fived
37 | flu like::word=flu-like
38 |
--------------------------------------------------------------------------------
/utils/common/utils.sl:
--------------------------------------------------------------------------------
1 | sub toTaggerForm
  | # Convert a list of "word/TAG" strings into @(word, tag) pairs.
2 | {
3 | return map({ return split('/', $1); }, $1);
4 | }
5 |
6 | sub sentences
  | # Coroutine over test file data/tests/$1.  Each line has the form
  | # "sentence|correct, wrong1; wrong2"; yields @(sentence, correct word,
  | # list of remaining candidates) per line.
7 | {
8 | local('$handle $sentence $candidates $line');
9 |
10 | $handle = openf("data/tests/ $+ $1");
11 |
12 | while $line (readln($handle))
13 | {
14 | ($sentence, $candidates) = split('\\|', $line);
15 | $candidates = split('[,;] ', $candidates);
16 | yield @($sentence, $candidates[0], sublist($candidates, 1));
17 | }
18 |
19 | closef($handle);
20 | }
21 |
22 | sub words
  | # Coroutine over test file data/tests/$1 where misspelled and correct
  | # words alternate line by line; yields @(bad, good) pairs.
23 | {
24 | local('$handle $bad $good');
25 | $handle = openf("data/tests/ $+ $1");
26 | while $bad (readln($handle))
27 | {
28 | $good = readln($handle);
29 | yield @($bad, $good);
30 | }
31 | closef($handle);
32 | }
33 |
34 | sub loopHomophones
  | # Drive a test object ($2) over every sentence in test file $1.  The
  | # sentence contains a "*" marking the confused-word slot; the words
  | # immediately before and after it become the context, padded with the
  | # 0BEGIN.0 / 0END.0 sentinels at sentence boundaries.  Calls
  | # [$2 process: ...] once per candidate and [$2 finish] at the end.
35 | {
36 | local('$entry $sentence $correct $wrongs $previous $next $wrong');
37 |
38 | while $entry (sentences($1))
39 | {
40 | ($sentence, $correct, $wrongs) = $entry;
41 | ($previous, $next) = split('\\*', $sentence);
  | # last word before the slot, or the begin sentinel
42 | $previous = split('\\s+', [$previous trim])[-1];
43 | $previous = iff($previous eq "", '0BEGIN.0', $previous);
  | # first word after the slot, or the end sentinel; strip trailing
  | # sentence punctuation from it
44 | $next = split('\\s+', [$next trim])[0];
45 | $next = iff($next eq "" || $next ismatch '[\\.!?]', '0END.0', $next);
46 | $next = iff(charAt($next, -1) ismatch '[\\.!?]', substr($next, 0, -1), $next);
47 |
  | # the correct word is itself a candidate
48 | push($wrongs, $correct);
49 |
50 | foreach $wrong ($wrongs)
51 | {
52 | [$2 process: $correct, $wrong, $wrongs, $previous, $next];
53 | }
54 | }
55 |
56 | [$2 finish];
57 | }
58 |
--------------------------------------------------------------------------------
/models/get_model_binaries.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | svn export https://openatd.svn.wordpress.org/atd-server/models/cnetwork.bin ./models/cnetwork.bin
3 | svn export https://openatd.svn.wordpress.org/atd-server/models/cnetwork2.bin ./models/cnetwork2.bin
4 | svn export https://openatd.svn.wordpress.org/atd-server/models/dictionary.txt ./models/dictionary.txt
5 | svn export https://openatd.svn.wordpress.org/atd-server/models/edits.bin ./models/edits.bin
6 | svn export https://openatd.svn.wordpress.org/atd-server/models/endings.bin ./models/endings.bin
7 | svn export https://openatd.svn.wordpress.org/atd-server/models/hnetwork.bin ./models/hnetwork.bin
8 | svn export https://openatd.svn.wordpress.org/atd-server/models/hnetwork2.bin ./models/hnetwork2.bin
9 | svn export https://openatd.svn.wordpress.org/atd-server/models/hnetwork4.bin ./models/hnetwork4.bin
10 | svn export https://openatd.svn.wordpress.org/atd-server/models/lexicon.bin ./models/lexicon.bin
11 | svn export https://openatd.svn.wordpress.org/atd-server/models/model.bin ./models/model.bin
12 | svn export https://openatd.svn.wordpress.org/atd-server/models/model.zip ./models/model.zip
13 | svn export https://openatd.svn.wordpress.org/atd-server/models/network3f.bin ./models/network3f.bin
14 | svn export https://openatd.svn.wordpress.org/atd-server/models/network3p.bin ./models/network3p.bin
15 | svn export https://openatd.svn.wordpress.org/atd-server/models/not_misspelled.txt ./models/not_misspelled.txt
16 | svn export https://openatd.svn.wordpress.org/atd-server/models/stringpool.bin ./models/stringpool.bin
17 | svn export https://openatd.svn.wordpress.org/atd-server/models/trigrams.bin ./models/trigrams.bin
18 | ./bin/buildrules.sh
19 |
--------------------------------------------------------------------------------
/data/rules/grammar/aux_noparticiple:
--------------------------------------------------------------------------------
1 | has &irregular_verb::word=\0 \1:participle::pivots=\1,\1:participle
2 | hasn't &irregular_verb::word=\0 \1:participle::pivots=\1,\1:participle
3 | has not &irregular_verb::word=\0 \1 \2:participle::pivots=\2,\2:participle
4 |
5 | have &irregular_verb::word=\0 \1:participle::pivots=\1,\1:participle
6 | haven't &irregular_verb::word=\0 \1:participle::pivots=\1,\1:participle
7 | have not &irregular_verb::word=\0 \1 \2:participle::pivots=\2,\2:participle
8 |
9 | had &irregular_verb::word=\0 \1:participle::pivots=\1,\1:participle
10 | hadn't &irregular_verb::word=\0 \1:participle::pivots=\1,\1:participle
11 | had not &irregular_verb::word=\0 \1 \2:participle::pivots=\2,\2:participle
12 |
13 | were &irregular_verb::word=\0 \1:participle::pivots=\1,\1:participle
14 | weren't &irregular_verb::word=\0 \1:participle::pivots=\1,\1:participle
15 | were not &irregular_verb::word=\0 \1 \2:participle::pivots=\2,\2:participle
16 |
17 | could've &irregular_verb::word=\0 \1:participle::pivots=\1,\1:participle
18 | would've &irregular_verb::word=\0 \1:participle::pivots=\1,\1:participle
19 | should've &irregular_verb::word=\0 \1:participle::pivots=\1,\1:participle
20 | you've &irregular_verb::word=\0 \1:participle::pivots=\1,\1:participle
21 | You've &irregular_verb::word=\0 \1:participle::pivots=\1,\1:participle
22 | I've &irregular_verb::word=\0 \1:participle::pivots=\1,\1:participle
23 | we've &irregular_verb::word=\0 \1:participle::pivots=\1,\1:participle
24 | We've &irregular_verb::word=\0 \1:participle::pivots=\1,\1:participle
25 | they've &irregular_verb::word=\0 \1:participle::pivots=\1,\1:participle
26 | They've &irregular_verb::word=\0 \1:participle::pivots=\1,\1:participle
27 |
--------------------------------------------------------------------------------
/data/rules/foreigndb.txt:
--------------------------------------------------------------------------------
1 | a fortiori with even stronger reason
2 | a posteriori from effects to causes; reasoning based on past experience
3 | a priori from causes to effects; conclusions drawn from assumptions; from what comes before; deductive reasoning
4 | ab initio from the beginning
5 | ad hoc improvised
6 | ad infinitum never ending
7 | ad lib at will, off the top of the head
8 | bona fide in good faith
9 | caveat caution, warning
10 | curricula vitae the courses of one's life, resumes
11 | curriculum vitae the course of one's life, resume
12 | de facto from the fact
13 | de jure from the law
14 | ex officio out of one's duty, out of one's office
15 | ex post facto after the fact, retrospectively
16 | hors d'oeuvre appetizer
17 | hors d'oeuvres appetizers
18 | hors de combat out of the battle, out of service
19 | in situ in its original place
20 | in toto in its entirety
21 | infra below
22 | inter alia among other things
23 | ipso facto by the fact itself
24 | locus classicus standard or most authoritative source
25 | non sequitur it does not follow
26 | passim here and there, throughout, in several places
27 | per capita per head
28 | prima facie at first sight, on the face of it
29 | pro bono for the public good, at no cost
30 | pro rata in proportion
31 | quid pro quo something in return
32 | raison d'etre reason for, purpose
33 | scilicet that is to say, namely
34 | scire licet that is to say, namely
35 | sic thus used, thus spelt
36 | sine die without a day, with no time fixed
37 | sine qua non without which not, essential precondition
38 | status quo things as they are
39 | stet as it was originally
40 | supra above
41 | vide see
42 | vide supra see above
43 | viva oral examination
44 | voce oral examination
45 |
--------------------------------------------------------------------------------
/utils/rules/makespecial.sl:
--------------------------------------------------------------------------------
1 | #
2 | # this script extracts relevant irregular verbs from the internal data to allow us to create rules
3 | #
4 |
5 |
6 | include("lib/engine.sl");
7 | include("utils/rules/rules.sl");
8 |
9 | sub checkSentenceSpelling
  | # Deliberately empty: overrides the engine's sentence spellcheck with a
  | # no-op so the verb extraction below runs without it.  NOTE(review):
  | # presumed to shadow a hook defined in lib/engine.sl - confirm.
10 | {
11 | }
12 |
13 | sub initAll
  | # Load the shared globals this script needs: language model, dictionary,
  | # homophone network, verb data and the tagger models.
14 | {
15 | global('$__SCRIPT__ $model $rules $dictionary $network $dsize %edits $hnetwork $account $usage $endings $lexdb $trigrams $verbs');
16 | $model = get_language_model();
17 | $dictionary = dictionary();
18 | $dsize = size($dictionary);
19 | $hnetwork = get_network("hnetwork.bin");
20 | $verbs = loadVerbData();
21 | initTaggerModels();
22 | }
23 |
24 | sub main
  | # Collect irregular verb forms whose past tense and/or base form differ
  | # from the participle, drop rare ones (corpus count <= 2), and print the
  | # surviving forms as "|"-joined alternation lists for use in rule files.
25 | {
26 | initAll();
27 |
28 | local('$key $value $base $past $participle @results @past @base');
29 |
30 | foreach $key => $value ($verbs['base'])
31 | {
32 | ($base, $past, $participle) = values($value, @("base", "past", "participle"));
  | # past tense differs from participle -> worth a rule
33 | if ($past ne $participle)
34 | {
35 | push(@past, $past);
36 | push(@results, $past);
37 | }
38 |
  | # base form differs from both other forms -> worth a rule
39 | if ($base ne $participle && $base ne $past)
40 | {
41 | push(@base, $base);
42 | push(@results, $base);
43 | }
44 | }
45 |
  | # drop forms too rare in the corpus to trigger reliably, logging each kill
46 | @results = filter({ return iff(count($1) > 2, $1, println("Killed $[20]1 " . count($1)) ); }, @results);
47 | @past = filter({ return iff(count($1) > 2, $1); }, @past);
48 | @base = filter({ return iff(count($1) > 2, $1); }, @base);
49 |
50 | println("Total words: " . size(@results));
51 | println("==== RESULTS ====");
52 | println(join("|", sorta(@results)));
53 | println("==== PAST ====");
54 | println(join("|", sorta(@past)));
55 | println("==== BASE ====");
56 | println(join("|", sorta(@base)));
57 | }
58 |
59 | invoke(&main, @ARGV);
60 |
--------------------------------------------------------------------------------
/data/rules/grammar/separate:
--------------------------------------------------------------------------------
1 | #
2 | # words that should be separated (and in what context)
3 | #
4 |
5 | # everyone of -> every one of
6 |
7 | everyone of::word=every one of::pivots=\1,one of::rule=Separate everyone
8 |
9 | # flashpoint -> flash point
10 |
11 | flashpoint::word=flash point
12 |
13 | # a while vs. awhile (split)
14 |
15 | after|for|in awhile::word=\0 a while::pivots=awhile,a while
16 |
17 | can backup::word=can back up
18 | can blackout::word=can black out
19 | can setup::word=can set up
20 | can workout::word=can work out
21 | for along time::word=for a long time
22 | for awhile::word=for a while
23 | for quite awhile::word=for quite a while
24 | got setup::word=got set up
25 | got shutdown::word=got shut down
26 | got shutout::word=got shut out
27 | had comeback::word=had come back
28 | had setup::word=had set up
29 | has setup::word=has set up
30 | have setup::word=have set up
31 | help setup::word=help set up
32 | in along time::word=in a long time
33 | in anyway::word=in any way
34 | in awhile::word=in a while
35 | in quite awhile::word=in quite a while
36 | incase of::word=in case of
37 | is setup::word=is set up
38 | Portland Trailblazers::word=Portland Trail Blazers
39 | take awhile::word=take a while
40 | to backout::word=to back out
41 | to backup::word=to back up
42 | to blackout::word=to black out
43 | to comeback::word=to come back
44 | to setup::word=to set up
45 | to shutdown::word=to shut down
46 | after along time::word=after a long time
47 | after awhile::word=after a while
48 | after quite awhile::word=after quite a while
49 | allot of::word=a lot of
50 | along time::word=a long time
51 | downpayment::word=down payment
52 | smartphone::word=smart phone
53 | ala mode::word=à la mode::filter=none
54 | afterall::word=after all
55 | to bailout::word=\0 bail out::pivots=bailout,bail out
56 |
57 |
--------------------------------------------------------------------------------
/utils/tagger/makesentences.sl:
--------------------------------------------------------------------------------
1 | debug(7 | 34);
2 |
3 | sub process
  | # Accept a candidate sentence ($1) only if it starts with a capital,
  | # contains simple word characters, ends with sentence punctuation, and
  | # has at least three words; write accepted sentences to $output.
4 | {
5 | local('@words $entry $previous $current $next');
6 |
7 | $1 = [$1 trim];
8 | if ($1 !ismatch '[A-Z][A-Za-z\'\,0-9 ]*?[\.\?\!]')
9 | {
10 | return;
11 | }
12 |
13 | @words = splitIntoWords($1);
14 |
  | # too short to be useful tagger training data
15 | if (size(@words) < 3)
16 | {
17 | return;
18 | }
19 |
20 | # foreach $entry (@words)
21 | # {
22 | # if (%dictionary[$entry] is $null)
23 | # {
24 | # return;
25 | # }
26 | # }
27 |
28 | # println($output, lc(join(" ", @words)) );
29 | println($output, join(" ", @words) );
30 | }
31 |
32 | sub processFile
  | # Read file $1, strip anything that looks like a markup tag, split the
  | # text into paragraphs and run process() over every sentence.
33 | {
34 | local('$handle $key $data $text @paragraphs');
35 |
36 | # read in our corpus.
37 | $handle = openf($1);
38 | $text = replace(readb($handle, -1), '<[^>]*?>', '');
39 | closef($handle);
40 |
41 | # start processing it?!?
42 | @paragraphs = splitByParagraph($text);
43 | map({ map(&process, $1); }, @paragraphs);
44 | }
45 |
46 | sub main
  | # $1: root directory (or file) of the corpus; $2: output file.  Walks
  | # the tree, skipping MediaWiki Image/User pages, and writes every
  | # acceptable sentence to the output file.
47 | {
48 | # setup our file that we're going to dump the output to.
49 | global('$output');
50 | $output = openf("> $+ $2");
51 |
52 | # ok go through all the junk parsing through the files.
53 |
54 | include("lib/nlp.sl");
55 | include("lib/dictionary.sl");
56 |
57 | global('%dictionary');
58 | %dictionary = dictionary();
59 | %dictionary["0BEGIN.0"] = 1;
60 | %dictionary["0END.0"] = 1;
61 |
62 | # collect list of files.
  | # anonymous closure invoked with $1; recurses into directories via $this
63 | [{
64 | if (-isDir $1)
65 | {
66 | map($this, ls($1));
67 | }
68 | else if ("*Image*.html" !iswm $1 && "*User*.html" !iswm $1)
69 | {
70 | processFile($1);
71 | }
72 | }: $1];
73 |
74 |
75 | closef($output);
76 | println("Done!");
77 | }
78 |
79 | invoke(&main, @ARGV);
80 |
--------------------------------------------------------------------------------
/data/rules/grammar/your:
--------------------------------------------------------------------------------
1 | your .*ing/VBG::word=you're \1::pivots=your,you're
2 | if your .*/DT::word=if you're \2::pivots=your,you're
3 | your the|a|an::word=you're \1::filter=none
4 | Your .*ing/VBG::word=You're \1::pivots=Your,You're
5 | If your .*/DT::word=If you're \2::pivots=your,you're
6 | Your the|a|an::word=You're \1::filter=none
7 |
8 | about|around|at|by|for|from|in|near|of|on|over|through|to|towards|under|with|without you're::word=\0 your::pivots=you're,your
9 |
10 | you're [a-z].*/NN|NNS are|is::word=your \1 \2::pivots=you're,your
11 | to .*/VB you're .*/NN::word=\0 \1 your \3::pivots=you're,your
12 | Your right::word=You're right::pivots=Your,You're::options=your,you're
13 |
14 | you're .* could|would|should|did|may|will|has|have|can|couldn't|wouldn't|shouldn't|didn't|won't|hasn't|haven't|can't::word=your \1 \2::pivots=you're,your
15 |
16 | to you're::word=to your::pivots=you're,your
17 |
18 | your welcome::word=you're welcome::pivots=your,you're
19 | Your welcome::word=You're welcome::pivots=Your,You're::options=your,you're
20 | Your welcome 0END.0::word=You're welcome::filter=none
21 |
22 | you're are::word=you are::filter=none
23 | your are::word=you're::filter=none
24 | your are .*ing::word=you are \2::filter=none
25 |
26 | Your not::word=You're not::pivots=Your,You're
27 | your not::word=you're not::pivots=your,you're
28 | your in|at::word=you're \1::filter=none
29 | Your in|at::word=You're \1::filter=none
30 |
31 | has|is you're::word=\0 your::pivots=you're,your::options=you're,your
32 | your so|as|gonna::word=you're \1::pivots=your,you're::options=your,you're
33 | Your so|as|gonna::word=You're \1::pivots=Your,You're::options=Your,You're
34 |
35 | as you're .*/NN::word=\0 your \2::pivots=you're,your::options=you're,your
36 | As you're .*/NN::word=\0 your \2::pivots=you're,your::options=you're,your
37 |
--------------------------------------------------------------------------------
/utils/common/hotest.sl:
--------------------------------------------------------------------------------
1 | sub hotest::init
  | # $1: network file name, $2: criteria list, $4: label suffix for the
  | # three score objects (words that were already correct, words that were
  | # wrong, and the composite of both).
2 | {
3 | this('$score1 $score2 $score $criterf $network $criteria');
4 |
5 | $criterf = criteria($2);
6 | $network = get_network($1);
7 | $criteria = $2;
8 |
9 | $score1 = newObject("score", "Correct $4");
10 | $score2 = newObject("score", "Wrong $4");
11 | $score = newObject("score", "Composite $4");
12 | }
13 |
14 | sub hotest::process
  | # Score one homophone decision.  With an empty criteria list the choice
  | # is random (baseline); otherwise checkAnyHomophone picks from the
  | # candidate set using the network and surrounding context.  Tallies
  | # correct / false positive / false negative per outcome.
15 | {
16 | local('$correct $wrong $wrongs $pre2 $pre1 $next $next2 @temp');
17 | ($correct, $wrong, $wrongs, $pre2, $pre1, $next, $next2) = @_;
18 |
19 | if (size($criteria) == 0)
20 | {
  | # baseline: pick a random candidate
21 | @temp[0] = rand($wrongs);
22 | }
23 | else
24 | {
  | # NOTE(review): named argument key is $criteriaf while the local is
  | # $criterf - presumably the callee's parameter name; confirm against
  | # checkAnyHomophone's definition.
25 | @temp = checkAnyHomophone($network, $wrong, copy($wrongs), $pre1[0], $next[0], @($pre2[1], $pre1[1]), $pre2[0], $next2[0], $criteriaf => $criterf);
26 | # println(join(', ', @($network, $wrong, copy($wrongs), $pre1[0], $next[0], @($pre2[1], $pre1[1]))) . ' = ' . @temp);
27 | }
28 |
  | # no suggestion means "leave the word alone"
29 | if (size(@temp) == 0)
30 | {
31 | @temp[0] = $wrong;
32 | }
33 |
34 | if (@temp[0] eq $correct)
35 | {
36 | [iff($wrong eq $correct, $score1, $score2) correct];
37 | [$score correct];
38 | # warn("Correct!");
39 | }
40 | else
41 | {
42 | if ($wrong eq $correct)
43 | {
  | # the word was right but we changed it
44 | [$score1 falsePositive];
45 | [$score falsePositive];
46 | # warn("FP!");
47 | }
48 | else
49 | {
  | # the word was wrong and we failed to fix it
50 | [$score2 falseNegative];
51 | [$score falseNegative];
52 | # warn("FN!");
53 | }
54 | }
55 |
56 | [$score record];
57 | [iff($wrong eq $correct, $score1, $score2) record];
58 | }
59 |
60 | sub hotest::finish
  | # Print all three scores followed by a separator line.
61 | {
62 | [$score1 print];
63 | [$score2 print];
64 | [$score print];
65 | println("-" x 30);
66 | }
67 |
68 |
--------------------------------------------------------------------------------
/utils/bigrams/qscore.sl:
--------------------------------------------------------------------------------
1 | #
2 | # generate statistics about a dataset to evaluate writing quality
3 | #
# enable Sleep debug/trace flags (7 | 34) -- see the Sleep debug() documentation
4 | debug(7 | 34);
5 |
6 | include("lib/quality.sl");
7 | include("lib/engine.sl");
8 |
# shared engine state consumed by lib/engine.sl and lib/quality.sl
9 | global('$__SCRIPT__ $model $rules $dictionary $network $dsize %edits $hnetwork $account $usage $endings $lexdb $trigrams $verbs $locks $trie %common');
10 |
# load the language model, dictionaries, rule machine, and both confusion
# networks before any document is processed
11 | $model = get_language_model();
12 | $dictionary = dictionary();
13 | $rules = get_rules();
14 | $network = get_network("cnetwork.bin");
15 | $hnetwork = get_network("hnetwork.bin");
16 | %edits = initEdits();
17 | $dsize = size($dictionary);
18 | $verbs = loadVerbData();
19 | %common = loadCommonWords();
20 | initTaggerModels();
21 |
22 | sub report
# Print one table of quality metrics. $1 = a label for the first column
# (e.g. the file name), $2 = the stats hash from processDocumentQuality.
# Each metric is shown raw and normalized per 100 words and per 100 sentences.
23 | {
24 | local('@keys $metric $words $sentences $a $b $key');
25 |
# case-insensitive sort of the metric names for stable output
26 | @keys = sort({ return lc($1) cmp lc($2); }, keys($2));
27 |
28 | $words = double($2['words']);
29 | $sentences = double($2['sentences']);
30 |
31 | foreach $key (@keys)
32 | {
33 | $metric = double($2[$key]);
34 | $a = ($metric / $words) * 100.0;
35 | $b = ($metric / $sentences) * 100.0;
# $[N]var is Sleep's fixed-width column formatting
36 | println("$[20]1 : $[30]key : $[10]metric $[25]a $[25]b");
37 | }
38 | }
39 |
40 | sub checkDocument
# Run the quality metrics over one document and print a report.
# $1 = file name (used as the report label), $2 = raw document text.
41 | {
42 | local('$data %stats $start');
43 |
44 | $start = ticks();
45 |
# strip HTML please
# NOTE(review): the entity strings below look mangled (likely originally
# &nbsp;/&quot;/&amp; replacements) -- verify against version-control history
46 | $data = strrep($2, ' ', ' ', '
', "\n", '', "\n", '', "\n", '"e;', '"', '&', '&');
47 | $data = replace($data, '(<[^>]*?>)', '');
48 |
49 | %stats = processDocumentQuality($data);
50 | report(getFileName($1), %stats);
51 |
52 | println("Time: " . (ticks() - $start) . "ms");
53 | }
55 |
56 | sub main
# Entry point: $1 = path to the document to score.
57 | {
58 | local('$handle $data');
59 | $handle = openf($1);
# readb with -1 reads the entire file as one binary string
60 | $data = readb($handle, -1);
61 | closef($handle);
62 |
63 | checkDocument($1, $data);
64 | }
65 |
66 | invoke(&main, @ARGV)
67 |
--------------------------------------------------------------------------------
/data/rules/grammar/too:
--------------------------------------------------------------------------------
1 | too niche::filter=kill
2 | too .*/NN|VB .*/VB.*::word=too \1 \2:: # ruling out a false positive
3 | too .*/NN|VB::word=to \1::pivots=too,to
4 | too do::word=to \1::pivots=too,to
5 | too the::word=to \1::pivots=too,to
6 | to much|few of::filter=kill
7 | to much|few::word=too \1::pivots=to,too
8 | two many::words=to many,too many::pivots=two,to,too
9 | is to|two late|easy::word=\0 too \2::pivots=\1,too
10 | was to|two late|easy::word=\0 too \2::pivots=\1,too
11 | be to|two late|easy::word=\0 too \2::pivots=\1,too
12 | were to|two late|easy::word=\0 too \2::pivots=\1,too
13 | are to|two late|easy::word=\0 too \2::pivots=\1,too
14 | been to|two late|easy::word=\0 too \2::pivots=\1,too
15 | comes to|two soon::word=\0 too \2::pivots=\1,too
16 | came to|two soon::word=\0 too \2::pivots=\1,too
17 | much to|two soon|late|early|easy::word=\0 too \2::pivots=\1,too
18 | is to|two soon::word=\0 too \2::pivots=\1,too
19 | was to|two soon::word=\0 too \2::pivots=\1,too
20 | were to|two soon::word=\0 too \2::pivots=\1,too
21 | are to|two soon::word=\0 too \2::pivots=\1,too
22 | been to|two soon::word=\0 too \2::pivots=\1,too
23 | is to .*/JJ.* 0END.0::word=\0 too \2::filter=none
24 | was to .*/JJ.* 0END.0::word=\0 too \2::filter=none
25 | be to .*/JJ.* 0END.0::word=\0 too \2::filter=none
26 | were to .*/JJ.* 0END.0::word=\0 too \2::filter=none
27 | are to .*/JJ.* 0END.0::word=\0 too \2::filter=none
28 | been to .*/JJ.* 0END.0::word=\0 too \2::filter=none
29 | not to .*/JJ 0END.0::word=too \1::filter=none
30 | is to .*/JJ.* to .*/VB::word=\0 too \2 \3 \4::filter=none
31 | was to .*/JJ.* to .*/VB::word=\0 too \2 \3 \4::filter=none
32 | be to .*/JJ.* to .*/VB::word=\0 too \2 \3 \4::filter=none
33 | were to .*/JJ.* to .*/VB::word=\0 too \2 \3 \4::filter=none
34 | are to .*/JJ.* to .*/VB::word=\0 too \2 \3 \4::filter=none
35 | been to .*/JJ.* to .*/VB::word=\0 too \2 \3 \4::filter=none
36 |
--------------------------------------------------------------------------------
/utils/common/homo.sl:
--------------------------------------------------------------------------------
1 | #
2 | # test out spelling with associated context information
3 | #
4 |
5 | sub suggestTest
# Check one suspect word against the precomputed edit table. $correct and
# @functions are injected via the lambda built in testCorrectionsContext.
# When the known-correct word is among the suggestions, each scoring
# function is invoked with the full context; always returns an empty array
# (this sub stands in for the real suggestion engine during testing).
6 | {
7 | local('$suspect $dict $previous $next @suggestions $f');
8 | ($suspect, $dict, $previous, $next) = @_;
9 |
# %edits maps a word to its candidate corrections (global, built by initEdits)
10 | @suggestions = %edits[$suspect];
11 |
12 | if ($correct in @suggestions)
13 | {
14 | foreach $f (@functions)
15 | {
16 | [$f : $suspect, $correct, copy(@suggestions), $previous, $next];
17 | }
18 | # warn("Done for $previous $suspect $next -> $correct");
19 | }
20 |
21 | return @();
22 | }
23 |
24 | sub testCorrectionsContext
# Drive suggestTest over a test corpus. $1 = corpus source for sentences();
# each entry is (sentence with ' * ' marking the target slot, correct word,
# list of wrong alternatives). Remaining arguments are the scoring functions
# handed to suggestTest via @functions.
25 | {
26 | local('$score $entry $sentence $correct $wrongs @results @words $rule $wrong $previous $next $func');
27 |
28 | while $entry (sentences($1))
29 | {
30 | ($sentence, $correct, $wrongs) = $entry;
# the ' * ' marker splits the sentence into left and right context
31 | ($previous, $next) = split(' \\* ', $sentence);
# bind $correct and the scoring functions into the per-sentence test closure
32 | $func = lambda(&suggestTest, \$score, \$correct, @functions => sublist(@_, 1));
33 |
34 | #
35 | # check for a false negative
36 | #
37 | foreach $wrong ($wrongs)
38 | {
39 | [$func: $wrong, $dictionary, $previous, $next]
40 | }
41 | }
42 | }
43 |
44 | sub loopHomophonesPOS
# Feed a homophone test corpus through a test-harness object.
# $1 = corpus source for sentences(); $2 = an object implementing
# process/finish (e.g. hotest or exp). Context words are POS-tagged and
# every candidate (including the correct word itself) is run through
# [$2 process].
45 | {
46 | local('$entry $sentence $correct $wrongs $pre2 $pre1 $next $object $wrong $next2');
47 |
48 | while $entry (sentences($1))
49 | {
50 | ($sentence, $correct, $wrongs) = $entry;
# toTaggerForm yields (word, tag) pairs; the middle slot is the target word
51 | ($pre2, $pre1, $null, $next, $next2) = toTaggerForm(split(' ', $sentence));
52 |
# normalize unknown POS tags to the empty string
53 | if ($pre2[1] eq "UNK") { $pre2[1] = ""; }
54 | if ($pre1[1] eq "UNK") { $pre1[1] = ""; }
55 |
# drop the /TAG suffix from the correct word
56 | $correct = split('/', $correct)[0];
57 |
# the correct word is itself a candidate (false-positive measurement)
58 | push($wrongs, $correct);
59 |
60 | foreach $wrong ($wrongs)
61 | {
62 | [$2 process: $correct, $wrong, $wrongs, $pre2, $pre1, $next, $next2];
63 | }
64 |
65 | # [$2 process: $correct, $correct, $wrongs, $pre2, $pre1, $next];
66 | }
67 |
68 | [$2 finish];
69 | }
70 |
--------------------------------------------------------------------------------
/lib/quality.sl:
--------------------------------------------------------------------------------
1 | #
2 | # calculate quality score for a dataset
3 | #
4 |
5 | sub loadCommonWords
# Build (once) and return a hash of commonly-misspelled words: every word in
# the two test files that is NOT in $dictionary. Memoized via this(), so
# repeated calls reuse the same hash. Reads the global $dictionary.
6 | {
7 | this('$common');
8 | if ($common is $null)
9 | {
10 | $common = %();
# NOTE(review): $good is declared but never used here
11 | local('$handle $bad $good $foo');
12 |
13 | # function to load file data and add it to our hash
# the closure captures $common by reference (\$common); $dictionary is global
14 | $foo = lambda(
15 | {
16 | local('$handle $bad');
17 | $handle = openf($1);
18 | while $bad (readln($handle))
19 | {
20 | if ($bad !in $dictionary)
21 | {
22 | $common[$bad] = 1;
23 | }
24 | }
25 | closef($handle);
26 | }, \$common);
27 |
28 | [$foo : 'data/tests/tests1.txt'];
29 | [$foo : 'data/tests/tests2.txt'];
30 | }
31 |
32 | return $common;
33 | }
34 |
35 | sub generateStatistics
# Tally rule hits: $1 = list of error tuples (rule hash first), $2 = stats
# hash to increment, keyed by each rule's 'rule' name.
36 | {
37 | local('$error $rule');
38 |
39 | foreach $error ($1)
40 | {
41 | $rule = $error[0];
42 | $2[$rule['rule']] += 1;
43 | }
44 | }
45 |
46 | sub processDocumentQuality
# Compute quality statistics for a document. $1 = plain text. Returns a hash
# with 'words', 'sentences', 'miss' (words found in the common-misspelling
# set), plus one count per triggered rule name.
47 | {
48 | local('@paragraphs $paragraph $sentence @results @words $count $word %common $suggest %stats');
49 |
50 | %common = loadCommonWords();
51 | @paragraphs = splitByParagraph($1);
52 |
# temporarily replace &suggest with a no-op so processSentence only detects
# problems without spending time generating suggestions; restored below
53 | $suggest = function('&suggest');
54 | setf('&suggest', { return @(); });
55 |
56 | foreach $count => $paragraph (@paragraphs)
57 | {
58 | foreach $sentence ($paragraph)
59 | {
60 | if ($sentence eq "")
61 | {
62 | continue;
63 | }
64 |
65 | @words = splitIntoWords($sentence);
66 | %stats['words'] += size(@words);
67 | %stats['sentences'] += 1;
68 |
# a word in %common is a known frequently-misspelled form not in the dictionary
69 | foreach $word (@words) { if ($word in %common) { %stats['miss'] += 1; } }
70 |
71 | processSentence(\$sentence, \@results);
72 | }
73 |
# fold this paragraph's rule hits into the totals and reset the buffer
74 | generateStatistics(@results, %stats);
75 | @results = @();
76 | }
77 |
# restore the real suggestion engine
78 | setf('&suggest', $suggest);
79 | return %stats;
80 | }
81 |
82 |
--------------------------------------------------------------------------------
/utils/spelldata/maker.sl:
--------------------------------------------------------------------------------
1 | #
2 | # This is a script to generate an AtD test corpus from a rule file (assumes you used torules.sl or something similar to generate the file)
3 | #
4 | # java -jar lib/sleep.jar utils/spelldata/maker.sl <rule file or rule function> <corpus file>
5 | #
6 | # format:
7 | #
8 | # correct text|word=wrong text
9 | #
10 |
11 | include("lib/engine.sl");
12 | include("utils/rules/rules.sl");
13 |
# empty stub: overrides the engine's spell check so corpus generation only
# exercises the grammar rules
14 | sub checkSentenceSpelling
15 | {
16 | }
17 |
18 | sub initAll
# Load the shared engine state (language model, dictionary, homophone
# network, verb data, tagger models) into globals used by lib/engine.sl.
19 | {
20 | global('$__SCRIPT__ $model $rules $dictionary $network $dsize %edits $hnetwork $account $usage $endings $lexdb $trigrams $verbs');
21 | $model = get_language_model();
22 | $dictionary = dictionary();
23 | $dsize = size($dictionary);
24 | $hnetwork = get_network("hnetwork.bin");
25 | $verbs = loadVerbData();
26 | initTaggerModels();
27 | }
28 |
29 | sub main
# Generate an error corpus. $1 = a rule-set function name or a rule file,
# $2 = a sentence corpus. For each sentence that triggers exactly one rule,
# print "sentence-with-*|wrong options|matched text" (capped at 5 lines per
# rule word).
30 | {
31 | local('$handle $sentence @results @past');
32 |
33 | initAll();
34 |
# "& $+ $1" builds the function name &<$1>; if such a function exists it is
# expected to populate $rules itself, otherwise $1 is treated as a rule file
35 | if (function("& $+ $1") !is $null)
36 | {
37 | $rules = machine();
38 | invoke(function("& $+ $1"));
39 | }
40 | else
41 | {
42 | $rules = loadRules(machine(), $1, %());
43 | }
44 |
45 | $handle = openf($2);
46 | while $sentence (readln($handle))
47 | {
48 | @results = @();
49 | processSentence(\$sentence, \@results);
50 |
51 | @past = copy(@results);
52 |
# only keep sentences with exactly one rule hit (unambiguous examples)
53 | if (size(@past) == 1)
54 | {
# NOTE(review): %count, $index, and $r are not declared with local() and so
# are global in Sleep -- %count persisting across calls appears intended
# (it caps output per rule word), but confirm
55 | foreach $index => $r (@past)
56 | {
57 | local('$rule $text $path $context @suggestions');
58 | ($rule, $text, $path, $context, @suggestions) = $r;
59 |
60 | %count[$rule['word']] += 1;
61 |
# emit at most 5 examples per rule word
62 | if (%count[$rule['word']] < 5)
63 | {
64 | println(strrep($sentence, " $text ", ' * ') . '|' . $rule['word'] . ', ' . iff($rule['options'] ne "", $rule['options'], $text) . '|' . $text);
65 | }
66 | }
67 | }
68 | }
69 | }
70 |
71 | invoke(&main, @ARGV);
72 |
--------------------------------------------------------------------------------
/data/rules/grammar/apostrophes:
--------------------------------------------------------------------------------
1 | #
2 | # missing apostrophes
3 | #
4 |
5 | # Verbs with not contracted:
6 |
7 | arent::word=aren't
8 | didnt::word=didn't
9 | dont::word=don't
10 | isnt::word=isn't
11 | #cant::word=can't
12 | werent::word=weren't
13 | wouldnt::word=wouldn't
14 | doesnt::word=doesn't
15 | hasnt::word=hasn't
16 | couldnt::word=couldn't
17 | hadnt::word=hadn't
18 |
19 | Arent::word=Aren't
20 | Didnt::word=Didn't
21 | Dont::word=Don't
22 | Isnt::word=Isn't
23 | #cant::word=Can't
24 | Werent::word=Weren't
25 | Wouldnt::word=Wouldn't
26 | Doesnt::word=Doesn't
27 | Hasnt::word=Hasn't
28 | Couldnt::word=Couldn't
29 | Hadnt::word=Hadn't
30 |
31 | # Pronouns with will
32 |
33 | Ill::word=I'll
34 |
35 | youll::word=you'll
36 | #hell::word=he'll
37 | #shell::word=she'll
38 | theyll::word=they'll
39 |
40 | Youll::word=You'll
41 | #hell::word=he'll
42 | #shell::word=she'll
43 | Theyll::word=They'll
44 |
45 | # pronouns with the verb to be
46 |
47 | Im::word=I'm
48 |
49 | youre::word=you're
50 | whos::word=who's
51 | hes::word=he's
52 | shes::word=she's
53 | #its::word=it's
54 | #were::word=we're
55 | theyre::word=they're
56 | thats::word=that's::filter=none
57 |
58 | Youre::word=You're
59 | Whos::word=Who's
60 | Hes::word=He's
61 | Shes::word=She's
62 | #its::word=it's
63 | #were::word=we're
64 | Theyre::word=They're
65 | Thats::word=That's
66 |
67 | # to have
68 |
69 | Ive::word=I've
70 |
71 | youve::word=you've
72 | weve::word=we've
73 | theyve::word=they've
74 |
75 | Youve::word=You've
76 | Weve::word=We've
77 | Theyve::word=They've
78 |
79 | # would or had
80 |
81 | #Id::word=I'd
82 |
83 | hed::word=he'd
84 | #shed::word=she'd
85 | youd::word=you'd
86 | #wed::word=we'd
87 | theyd::word=they'd
88 |
89 | Hed::word=He'd
90 | #shed::word=she'd
91 | Youd::word=You'd
92 | #wed::word=we'd
93 | Theyd::word=They'd
94 |
95 | #
96 |
97 | Theres::word=There's
98 | theres::word=there's
99 |
100 | oclock::word=o'clock
101 |
102 | heres::word=here's
103 |
--------------------------------------------------------------------------------
/data/rules/grammar/their:
--------------------------------------------------------------------------------
1 | their is|are|a|an::word=there \1::pivots=their,there
2 | there to::filter=kill
3 | there .*/JJ.* .*/NN::word=their \1 \2::pivots=there,their
4 | there .*ing/NN::word=their \1, they're \1::pivots=there,their,they're
5 | there .*/NN::word=their \1::pivots=there,their
6 | Their is|are|a|an::word=There \1::pivots=their,there
7 | There .*/JJ.* .*/NN::word=Their \1 \2::pivots=there,their
8 | There .*ing/NN::word=Their \1, They're \1::pivots=there,their,they're
9 | There .*/NN::word=Their \1::pivots=there,their
10 | is there .*/NN::word=\0 \1 \2
11 | is there .*/JJ .*/NN::word=\0 \1 \2 \3
12 | isn't there .*/NN::word=\0 \1 \2
13 | isn't there .*/JJ .*/NN::word=\0 \1 \2 \3
14 | was there .*/NN::word=\0 \1 \2
15 | was there .*/JJ .*/NN::word=\0 \1 \2 \3
16 | are there .*/NN::word=\0 \1 \2
17 | are there .*/JJ .*/NN::word=\0 \1 \2 \3
18 | if their .*ing::word=\0 they're \2::pivots=\1,they're
19 | to .*/VB there .*/NN::word=\0 \1 their \3::pivots=\2,their
20 | in there|they're .*/NN|JJ .*/NN::word=\0 their \2 \3::pivots=\1,their
21 | in there|they're .*/NN::word=\0 their \2::pivots=\1,their
22 | they're are::word=there are, they are::pivots=they're,there,they
23 | They're are::word=There are, They are::pivots=They're,There,They
24 | .*/VB there .*/NNS::word=\0 their \2::pivots=\1,their
25 | .*/VB there .*/JJ .*/NNS::word=\0 their \2 \3::pivots=\1,their
26 | .*/IN there .*/NNS::word=\0 their \2::pivots=\1,their
27 | .*/IN there .*/JJ .*/NNS::word=\0 their \2 \3::pivots=\1,their
28 |
29 | has|is they're::word=\0 their::pivots=they're,their::options=they're,their
30 | their so|as|gonna::word=they're \1::pivots=their,they're::options=their,they're
31 | Their so|as|gonna::word=They're \1::pivots=Their,They're::options=Their,They're
32 |
33 | #
34 | # some rules to map their|there -> they're
35 | #
36 | their doing so::filter=kill
37 | there being .*/IN|DT::filter=kill
38 | there|their .*/VBG .*/IN::word=they're \1 \2::pivots=\0,they're
39 | there|their .*/VBG .*/DT::word=they're \1 \2::pivots=\0,they're
40 | there|their .*/VBG 0END.0::word=they're \1 \2::pivots=\0,they're
41 |
--------------------------------------------------------------------------------
/service/code/src/org/dashnine/preditor/SortFromHash.java:
--------------------------------------------------------------------------------
1 | package org.dashnine.preditor;
2 |
3 | import sleep.runtime.*;
4 | import sleep.bridges.*;
5 | import sleep.interfaces.*;
6 |
7 | import java.util.*;
8 |
9 | /* Code to implement a sort function that sorts values by their corresponding Double values in a hashtable. This class exists to replace
10 | sort(lambda({ return %hash[$1] <=> %hash[$2]; }, \%hash). This snippet was identified by the profiler as consuming more time
11 | than any other function */
12 | public class SortFromHash implements Loadable
13 | {
// Comparator that orders two Sleep scalars by the double value each maps to
// in the backing hash. Note the inverted signs: larger values sort FIRST
// (descending order).
14 | private static class CompareHashItems implements Comparator
15 | {
16 | protected ScalarHash hash;
17 |
18 | public CompareHashItems(ScalarHash _hash)
19 | {
20 | hash = _hash;
21 | }
22 |
// returns -1 when a's value is larger so the sort is descending
// NOTE(review): if a value is NaN both comparisons are false and 0 is
// returned, which can make the ordering inconsistent -- confirm values
// are never NaN
23 | public int compare(Object a, Object b)
24 | {
25 | double aa, bb;
26 | aa = hash.getAt((Scalar)a).doubleValue();
27 | bb = hash.getAt((Scalar)b).doubleValue();
28 |
29 | if (aa > bb)
30 | {
31 | return -1;
32 | }
33 | else if (aa < bb)
34 | {
35 | return 1;
36 | }
37 | else
38 | {
39 | return 0;
40 | }
41 | }
42 | }
43 |
// Sleep bridge: &sortHash(@array, %hash) sorts @array in place by the
// hash values (descending) and returns the array scalar.
44 | private static class func_sortFromHash implements Function
45 | {
46 | public Scalar evaluate(String n, ScriptInstance i, Stack l)
47 | {
48 | ScalarArray array = BridgeUtilities.getWorkableArray(l);
49 | ScalarHash hash = BridgeUtilities.getHash(l);
50 |
51 | array.sort(new CompareHashItems(hash));
52 |
53 | return SleepUtils.getArrayScalar(array);
54 | }
55 | }
56 |
// register &sortHash in the script environment when this bridge is loaded
57 | public void scriptLoaded(ScriptInstance script)
58 | {
59 | script.getScriptEnvironment().getEnvironment().put("&sortHash", new func_sortFromHash());
60 | }
61 |
62 | public void scriptUnloaded(ScriptInstance script)
63 | {
64 | }
65 | }
66 |
--------------------------------------------------------------------------------
/bin/buildtaggersets.sh:
--------------------------------------------------------------------------------
1 | #
2 | # code to generate the data used to bootstrap the tagger
3 | #
4 |
5 | mkdir tmp
6 |
7 | java -Xmx1024M -jar lib/sleep.jar utils/tagger/makesentences.sl data/corpus_wikipedia tmp/wikipedia_sentences.txt
8 | java -Xmx1024M -jar lib/sleep.jar utils/tagger/makesentences.sl data/corpus_gutenberg tmp/gutenberg_sentences.txt
9 |
10 | #
11 | # You *must* download the Stanford POS Tagger (GPL) from: http://nlp.stanford.edu/software/tagger.shtml
12 | # and extract it into your AtD directory.
13 | #
14 | # This tagger will take 3 days to run / file
15 | # ------
16 |
17 | cd stanford-postagger-2008-09-28
# FIX: the sentence files still live in tmp/ at this point (they are only
# moved to data/ at the end of this script), so read them from ../tmp/
18 | java -Xmx1024M -XX:+AggressiveHeap -XX:+UseParallelGC -jar ../lib/sleep.jar ../utils/tagger/makebootstrap.sl models/bidirectional-wsj-0-18.tagger ../tmp/gutenberg_sentences.txt >../tmp/gutenberg_sentences_tagged.txt &
19 | java -Xmx1024M -XX:+AggressiveHeap -XX:+UseParallelGC -jar ../lib/sleep.jar ../utils/tagger/makebootstrap.sl models/bidirectional-wsj-0-18.tagger ../tmp/wikipedia_sentences.txt >../tmp/wikipedia_sentences_tagged.txt &
# FIX: both tagger jobs run in the background; block until they finish,
# otherwise fixtags.sl below reads partial output files
wait
20 |
21 | #
22 | # Or, optionally, you can use this Tagger which includes source but use is allowed for non-commercial research purposes only
23 | #
24 | # http://www-tsujii.is.s.u-tokyo.ac.jp/~tsuruoka/postagger/
25 | #
26 | # This tagger will execute in 5 minutes / file
27 | # ---------
28 |
29 | # Oh, irony of ironies-- this tagger and the Stanford tagger produce nearly identical data (AtD bootstraps from the Stanford data though)
30 |
31 | #
32 | #cd postagger-1.0
33 | #./tagger <../tmp/wikipedia_sentences.txt >../tmp/wikipedia_sentences_tagged.txt
34 | #./tagger <../tmp/gutenberg_sentences.txt >../tmp/gutenberg_sentences_tagged.txt
35 | #
36 | cd ..
37 |
38 | java -jar lib/sleep.jar utils/tagger/fixtags.sl tmp/wikipedia_sentences_tagged.txt >data/wikipedia_sentences_tagged_f.txt
39 | java -jar lib/sleep.jar utils/tagger/fixtags.sl tmp/gutenberg_sentences_tagged.txt >data/gutenberg_sentences_tagged_f.txt
40 |
41 | mv tmp/wikipedia_sentences.txt data/wikipedia_sentences.txt
42 | mv tmp/gutenberg_sentences.txt data/gutenberg_sentences.txt
43 |
44 | rm -rf tmp
45 |
--------------------------------------------------------------------------------
/data/rules/agreement/chunk_single.r:
--------------------------------------------------------------------------------
1 | .*/NNP [a-z]+/NN or [a-z]+/PRP.* [a-z]+/NN::\0 \1 and \3 \4
2 | .*/NNP [a-z]+/NNS or [a-z]+/PRP.* [a-z]+/NN::\0 \1 and \3 \4
3 | A [a-z]+/NN or [a-z]+/NN::\0 \1 and \3
4 | An [a-z]+/NN or [a-z]+/NN::\0 \1 and \3
5 | .*/NNP or [a-z]+/NNP::\0 and \2
6 | Every one of [a-z]+/DT [a-z]+/NNS::\3:upper \4
7 | One of [a-z]+/PRP.* [a-z]+/NNS::\2:upper \3
8 | Each one of [a-z]+/PRP.* [a-z]+/NNS::\3:upper \4
9 | The [a-z]+/NN [a-z]+/IN::\0 \1:plural \2
10 | The [a-z]+/NN::\0 \1:plural
11 | This [a-z]+/NN [a-z]+/IN::These \1:plural \2
12 | This [a-z]+/NN::These \1:plural
13 | One of [a-z]+/DT [a-z]+/NNS::\2:upper \3
14 | .*/NNP,POS [a-z]+/JJ [a-z]+/NN::\0 \1 \2:plural
15 | .*/NNP,POS [a-z]+/NN::\0 \1:plural
16 | The [a-z]+/NN [a-z]+/IN [a-z]+/DT [a-z]+/NN::\0 \1:plural \2 \3 \4
17 | This [a-z]+/NN [a-z]+/IN [a-z]+/DT [a-z]+/NN::These \1:plural \2 \3 \4
18 | .*/RB one
19 | The [a-z]+/JJ [a-z]+/NN::\0 \1 \2:plural
20 | This [a-z]+/JJ [a-z]+/NN::\0 \1 \2:plural
21 | Their [a-z]+/NN::\0 \1:plural
22 | Their [a-z]+/JJ [a-z]+/NN::\0 \1 \2:plural
23 | Their [a-z]+/JJ [a-z]+/NN [a-z]+/VB::\0 \1 \2 \3:plural
24 | Your [a-z]+/NN::\0 \1:plural
25 | Your [a-z]+/JJ [a-z]+/NN::\0 \1 \2:plural
26 | Your [a-z]+/JJ [a-z]+/NN [a-z]+/VB::\0 \1 \2 \3:plural
27 | His [a-z]+/NN::\0 \1:plural
28 | His [a-z]+/JJ [a-z]+/NN::\0 \1 \2:plural
29 | His [a-z]+/JJ [a-z]+/NN [a-z]+/VB::\0 \1 \2 \3:plural
30 | Her [a-z]+/NN::\0 \1:plural
31 | Her [a-z]+/JJ [a-z]+/NN::\0 \1 \2:plural
32 | Her [a-z]+/JJ [a-z]+/NN [a-z]+/VB::\0 \1 \2 \3:plural
33 | My [a-z]+/NN::\0 \1:plural
34 | My [a-z]+/JJ [a-z]+/NN::\0 \1 \2:plural
35 | My [a-z]+/JJ [a-z]+/NN [a-z]+/VB::\0 \1 \2 \3:plural
36 | The [a-z]+/VBN [a-z]+/NN::\0 \1 \2:plural
37 | This [a-z]+/VBN [a-z]+/NN::\0 \1 \2:plural
38 | .*/CD dollars|pounds|points|feet|inches|meters
39 | The [a-z]+/NN [a-z]+/VB [a-z]+/NN::\0 \1 \2 \3:plural
40 | The [a-z]+/NN [a-z]+/VB [a-z]+/NN [a-z]+/VBP [a-z]+/JJ [a-z]+/NN
41 | The [a-z]+/JJ [a-z]+/NN [a-z]+/VBP [a-z]+/JJ [a-z]+/NN
42 | The [a-z]+/NN of [a-z]+/VB [a-z]+/NN::\0 \1 \2 \3 \4:plural
43 | The [a-z]+/NN [a-z]+/VB
44 | The [a-z]+/NN of [a-z]+/VB [a-z]+/NNS::\0 \1:plural of \3 \4
45 | Either [a-z]+/NN
46 | .*/NN::\0:plural
47 | Either [a-z]+/NNP [a-z]+/NNS or [a-z]+/PRP.* [a-z]+/NN::\1:upper \2 and \4 \5
48 |
--------------------------------------------------------------------------------
/utils/common/exp.sl:
--------------------------------------------------------------------------------
1 | sub exp::init
# Constructor for the experimental homophone harness. $1 = network file,
# $2 = criteria spec. Builds score objects for the network check, the
# trigram check, and the best-of-both ("oracle") combination.
2 | {
# NOTE(review): $score1/$score2/$score and the *2 variants are declared here
# but never assigned in this file -- possibly leftovers; confirm before removing
3 | this('$score1 $score2 $score $criterf $network $criteria %dpoints $tscores $nscores $oscores $criterf2 $network2 $criteria2');
4 |
5 | $criterf = criteria($2);
6 | $network = get_network($1);
7 | $criteria = $2;
8 |
9 | $nscores = newObject("score", "network total");
10 | $tscores = newObject("score", "trigrams total");
11 | $oscores = newObject("score", "best score");
12 | }
13 |
14 | sub exp::process
# Evaluate one trial, comparing the trigram tagger and the homophone
# network (and their combination). Both branches only run when the written
# word equals the correct word, i.e. this measures behavior on correct input.
15 | {
16 | local('$correct $wrong $wrongs $pre2 $pre1 $next @temp $nbase $tbase $solution $all %scores');
17 | ($correct, $wrong, $wrongs, $pre2, $pre1, $next) = @_;
18 |
19 | # do a trigram check?
20 | if ($wrong eq $correct)
21 | {
22 | $all = tagAll($pre2[1], $pre1[1], $pre1[0], $wrongs);
23 |
# only score when the tagger actually distinguishes the candidates
24 | if (isDifferent($all))
25 | {
26 | $solution = getBest($all)[0];
27 | if ($solution eq $correct)
28 | {
29 | [$tscores correct];
30 | }
31 | else
32 | {
# NOTE(review): $bywords is not declared in local() above -- presumably a
# global populated elsewhere (e.g. by tagAll/getBest); confirm
33 | if ($bywords[$solution] == 1.0)
34 | {
35 | # warn("$solution is wrong, correct is $correct : " . $bywords[$correct]);
36 | }
37 | }
38 | [$tscores record];
39 | }
40 | }
41 |
42 | if ($wrong eq $correct)
43 | {
44 | (@temp, %scores) = checkAnyHomophone2($network, $wrong, copy($wrongs), $pre1[0], $next[0], @($pre2[1], $pre1[1]),
45 | $criteriaf => $criterf);
46 |
# empty result means "keep the word as written"
47 | if (size(@temp) == 0)
48 | {
49 | @temp[0] = $wrong;
50 | }
51 |
# a fully-confident trigram solution overrides the network's pick
# NOTE(review): $solution may still be $null here if the trigram branch was
# skipped by isDifferent() -- confirm that case is benign
52 | if ($bywords[$solution] >= 1.0) #&& $solution eq $correct)
53 | {
54 | @temp[0] = $solution;
55 | }
56 |
57 | if (@temp[0] eq $correct)
58 | {
59 | [$nscores correct];
60 | }
61 | [$nscores record];
62 |
# oracle score: correct if either method got it right
63 | if (@temp[0] eq $correct || $solution eq $correct)
64 | {
65 | [$oscores correct];
66 | }
67 | [$oscores record];
68 |
69 | if ($solution ne $correct && $bywords[$solution] == 1.0)
70 | {
71 | # warn("$solution - " . $bywords[$solution] . " vs. $correct " . $bywords[$correct]);
72 | }
73 | }
74 | }
75 |
76 | sub exp::finish
# Print the three accumulated score summaries.
77 | {
local('$summary');
foreach $summary (@($nscores, $tscores, $oscores))
{
[$summary print];
}
81 | }
82 |
--------------------------------------------------------------------------------
/data/rules/irregular_nouns.txt:
--------------------------------------------------------------------------------
1 | addendum addenda
2 | alga algae
3 | alumna alumnae
4 | alumnus alumni
5 | analysis analyses
6 | antenna antennas,antennae
7 | apparatus apparatuses
8 | appendix appendices,appendixes
9 | axis axes
10 | bacillus bacilli
11 | bacterium bacteria
12 | basis bases
13 | beau beaux
14 | bison bison
15 | buffalo buffalos,buffaloes
16 | bureau bureaus
17 | bus busses,buses
18 | cactus cactuses,cacti
19 | calf calves
20 | child children
21 | corps corps
22 | corpus corpora,corpuses
23 | crisis crises
24 | criterion criteria
25 | curriculum curricula
26 | datum data
27 | deer deer
28 | die dice
29 | dwarf dwarfs,dwarves
30 | diagnosis diagnoses
31 | echo echoes
32 | elf elves
33 | ellipsis ellipses
34 | embargo embargoes
35 | emphasis emphases
36 | erratum errata
37 | fireman firemen
38 | fish fish,fishes
39 | focus focuses
40 | foot feet
41 | formula formulas
42 | fungus fungi,funguses
43 | genus genera
44 | goose geese
45 | half halves
46 | hero heroes
47 | hippopotamus hippopotami,hippopotamuses
48 | hoof hoofs,hooves
49 | hypothesis hypotheses
50 | index indices,indexes
51 | knife knives
52 | leaf leaves
53 | life lives
54 | loaf loaves
55 | louse lice
56 | man men
57 | matrix matrices
58 | means means
59 | medium media
60 | memorandum memoranda
61 | millennium millenniums,millennia
62 | moose moose
63 | mosquito mosquitoes
64 | mouse mice
65 | nebula nebulae,nebulas
66 | neurosis neuroses
67 | nucleus nuclei
68 | oasis oases
69 | octopus octopi,octopuses
70 | ovum ova
71 | ox oxen
72 | paralysis paralyses
73 | parenthesis parentheses
74 | person people
75 | phenomenon phenomena
76 | potato potatoes
77 | radius radii,radiuses
78 | scarf scarfs,scarves
79 | self selves
80 | series series
81 | sheep sheep
82 | shelf shelves
83 | scissors scissors
84 | species species
85 | stimulus stimuli
86 | stratum strata
87 | syllabus syllabi,syllabuses
88 | symposium symposia,symposiums
89 | synthesis syntheses
90 | synopsis synopses
91 | tableau tableaux
92 | that those
93 | thesis theses
94 | thief thieves
95 | this these
96 | tomato tomatoes
97 | tooth teeth
98 | torpedo torpedoes
99 | vertebra vertebrae
100 | veto vetoes
101 | vita vitae
102 | watch watches
103 | wife wives
104 | wolf wolves
105 | woman women
106 | zero zeros,zeroes
107 |
--------------------------------------------------------------------------------
/data/rules/grammar/its2:
--------------------------------------------------------------------------------
1 | on it's own::name=it's rule::word=on its own::filter=none
2 | of it's own::name=it's rule::word=of its own::filter=none
3 | such as it's::name=it's rule::word=such as its::filter=none
4 | from all it's::name=it's rule::word=from all its::filter=none
5 | by all it's::name=it's rule::word=by all its::filter=none
6 | it's approach::name=it's rule::word=its approach::filter=none
7 | by it's::name=it's rule::word=by its::filter=none
8 | By it's::name=it's rule::word=By its::filter=none
9 | with it's::name=it's rule::word=\0 its::pivots=it's,its
10 | with/.* it's/.* .*/JJ.*|NN.*::name=it's rule::word=\0 its \2::filter=none
11 | With/.* it's/.* .*/JJ.*|NN.*::name=it's rule::word=\0 its \2::filter=none
12 | in/.* it's/.* .*/JJ.*|NN.*::name=it's rule::word=\0 its \2::filter=none
13 | In/.* it's/.* .*/JJ.*|NN.*::name=it's rule::word=\0 its \2::filter=none
14 | without/.* it's/.* .*/JJ.*|NN.*::name=it's rule::word=\0 its \2::filter=none
15 | Without/.* it's/.* .*/JJ.*|NN.*::name=it's rule::word=\0 its \2::filter=none
16 | from/.* it's/.* .*/JJ.*|NN.*::name=it's rule::word=\0 its \2::filter=none
17 | From/.* it's/.* .*/JJ.*|NN.*::name=it's rule::word=\0 its \2::filter=none
18 | Under/.* it's/.* .*/JJ.*|NN.*::name=it's rule::word=\0 its \2::filter=none
19 | under/.* it's/.* .*/JJ.*|NN.*::name=it's rule::word=\0 its \2::filter=none
20 | over/.* it's/.* .*/JJ.*|NN.*::name=it's rule::word=\0 its \2::filter=none
21 | Over/.* it's/.* .*/JJ.*|NN.*::name=it's rule::word=\0 its \2::filter=none
22 | above/.* it's/.* .*/JJ.*|NN.*::name=it's rule::word=\0 its \2::filter=none
23 | Above/.* it's/.* .*/JJ.*|NN.*::name=it's rule::word=\0 its \2::filter=none
24 | for it's .*/JJ|NN|NNS::word=for its::pivots=\1,its
25 |
26 | it's class|color|current|end|first|former|fourth|goal|highest|history|inital|junction|lack|last|lead|lowest|maximum|minimum|money|name|northern|original|own|peak|previous|primary|second|third|timeslot|toll|way::word=its \1::pivots=it's,its::options=it's,its
27 |
28 | at|be|about|above|across|against|along|among|around|at|behind|by|for|from|had|in|near|of|on|over|through|to|towards|under|upon|with|without it's .*/JJ|NN|NNS::word=\0 its \2::pivots=it's,its
29 |
30 | it's you::filter=kill
31 | it's [a-z].*/NNP::word=its \1::pivots=\0,its
32 |
33 | to .*/VB it's .*/NN|NNS::word=\0 \1 its \3::pivots=it's,its
34 |
35 | it's .*/JJ .*/NNS|NN::word=its \1 \2::pivots=it's,its
36 |
--------------------------------------------------------------------------------
/data/rules/agreement/chunk_plural.r:
--------------------------------------------------------------------------------
1 | The [a-z]+/JJ two|three|four|five|six|seven|eight|nine|ten|hundred|thousand|million|billion|trillion
2 | My|Your|His|Her|Their pants
3 | .*/NNP [a-z]+/NN and [a-z]+/PRP.* [a-z]+/NN::\0 \1 or \3 \4
4 | .*/NNP [a-z]+/NNS and [a-z]+/PRP.* [a-z]+/NN::\0 \1 or \3 \4
5 | .*/NNP and [a-z]+/NNP::\0 or \2
6 | .*/NNP and [a-z]+/PRP.* [a-z]+/NNS::\0 or \2 \3:singular
7 | The [a-z]+/NN and [a-z]+/DT [a-z]+/NNS::\0 \1 or \3 \4:singular
8 | The [a-z]+/NN or [a-z]+/DT [a-z]+/NNS::\0 \1 \2 \3 \4:singular
9 | The [a-z]+/NN and [a-z]+/NN::The \1 or \3
10 | The [a-z]+/NNS::\0 \1:singular
11 | The [a-z]+/NNS::\0 \1:singular
12 | These [a-z]+/NN and [a-z]+/DT [a-z]+/NNS::The \1 or the \4:singular
13 | These [a-z]+/NN or [a-z]+/DT [a-z]+/NNS::word=The \1 \2 the \4:singular
14 | These [a-z]+/NNS::The \1:singular
15 | All|all of [a-z]+/DT [a-z]+/NNS::\2:upper \3:singular
16 | The [a-z]+/NNS of|for [a-z]+/NN::\0 \1:singular \2 \3
17 | These [a-z]+/NNS of|for [a-z]+/NN::Each \1:singular \2 \3
18 | The [a-z]+/NNS of|for [a-z]+/JJ [a-z]+/NN::\0 \1:singular \2 \3 \4
19 | These [a-z]+/NNS of|for [a-z]+/JJ [a-z]+/NN::Each \1:singular \2 \3 \4
20 | .*/NNP,POS [a-z]+/NNS::\0 \1:singular
21 | .*/NNP,POS [a-z]+/NNS in [a-z]+/DT [a-z]+/NN::\0 \1:singular \2 \3 \4
22 | The [a-z]+/JJS [a-z]+/JJ [a-z]+/NNS of|for|from [a-z]+/NN [a-z]+/NN::\0 \1 \2 \3:singular \4 \5 \6
23 | The [a-z]+/JJS [a-z]+/JJ [a-z]+/NNS::\0 \1 \2 \3:singular
24 | .*/NNS of|for|from [a-z]+/NNS::\0:singular \1 \2:singular
25 | .*/NNP,POS [a-z]+/NNS in [a-z]+/DT [a-z]+/NN::\0 \1:singular \2 \3 \4
26 | .*/CD [a-z]+/NNS
27 | The series of [a-z]+ [a-z]+/NNS::\0 \1 \2 \3 \4:singular
28 | The series of [a-z]+/NNS::\0 \1 \2 \3:singular
29 | The/DT [a-z]+/NN [a-z]+/IN [a-z]+/VB [a-z]+/NNS::\0 \1 \2 \3 \4:singular
30 | The [a-z]+/NN [a-z]+/VB [a-z]+/NNS::\0 \1 \2 \3:singular
31 | My [a-z]+/NN [a-z]+/VB [a-z]+/NNS::\0 \1 \2 \3:singular
32 | Your [a-z]+/NN [a-z]+/VB [a-z]+/NNS::\0 \1 \2 \3:singular
33 | His [a-z]+/NN [a-z]+/VB [a-z]+/NNS::\0 \1 \2 \3:singular
34 | Her [a-z]+/NN [a-z]+/VB [a-z]+/NNS::\0 \1 \2 \3:singular
35 | My [a-z]+/NNS::\0 \1:singular
36 | Your [a-z]+/NNS::\0 \1:singular
37 | Their [a-z]+/NNS::\0 \1:singular
38 | His [a-z]+/NNS::\0 \1:singular
39 | Her [a-z]+/NNS::\0 \1:singular
40 | .*/JJ [a-z]+/NNS::\0 \1:singular
41 | The [a-z]+/NN [a-z]+/IN [a-z]+/VB [a-z]+/NNS
42 | My [a-z]+/NNS and I
43 | My [a-z]+/NN and I
44 |
--------------------------------------------------------------------------------
/utils/rules/transr.sl:
--------------------------------------------------------------------------------
1 | #
2 | # this is a script to transform sentences in a corpus using rules from an AtD rule file
3 | #
4 | # java -jar lib/sleep.jar utils/rules/transr.sl <rule file or rule function> <corpus file>
5 | #
6 | # format:
7 | #
8 | # rule..|[key=value|...]
9 | #
10 | # note that key=value are parsed and dumped into a hash. This information is used by the system to
11 | # filter out false positives and stuff.
12 | #
13 |
14 | include("lib/engine.sl");
15 | include("utils/rules/rules.sl");
16 |
# empty stub: disables the engine's spell check so only the loaded rules fire
17 | sub checkSentenceSpelling
18 | {
19 | }
20 |
# decorate &score with a logging wrapper: every call is passed through to the
# original (captured as $oldf via let) and its arguments/result are warn()ed
21 | setf('&score', let({
22 | local('$value');
23 | $value = invoke($oldf, @_);
24 | warn("Looking at: " . join("|", @_) . " = " . $value);
25 | return $value;
26 | }, $oldf => &score));
27 |
28 | sub initAll
# Load the shared engine state into globals used by lib/engine.sl.
29 | {
30 | global('$__SCRIPT__ $model $rules $dictionary $network $dsize %edits $hnetwork $account $usage $endings $lexdb $trigrams $verbs');
31 | $model = get_language_model();
32 | $dictionary = dictionary();
33 | $dsize = size($dictionary);
# NOTE(review): this script loads hnetwork4.bin while the sibling scripts use
# hnetwork.bin -- confirm the difference is intentional
34 | $hnetwork = get_network("hnetwork4.bin");
35 | $verbs = loadVerbData();
36 | initTaggerModels();
37 | }
38 |
39 | sub main
# Transform a corpus with an AtD rule set. $1 = rule-set function name or
# rule file, $2 = sentence corpus. For each sentence, the first rule match
# still present in @results is applied and the rewritten sentence printed.
40 | {
41 | local('$handle $sentence @results @past');
42 |
43 | initAll();
44 |
# "& $+ $1" builds the function name &<$1>; if it exists it populates $rules,
# otherwise $1 is loaded as a rule file
45 | if (function("& $+ $1") !is $null)
46 | {
47 | $rules = machine();
48 | invoke(function("& $+ $1"));
49 | }
50 | else
51 | {
52 | $rules = loadRules(machine(), $1, %());
53 | }
54 |
55 | $handle = openf($2);
56 | while $sentence (readln($handle))
57 | {
58 | @results = @();
59 | processSentence(\$sentence, \@results);
60 |
61 | @past = copy(@results);
62 |
63 | if (size(@past) > 0)
64 | {
65 | # println($sentence);
66 | # println(taggerToString(taggerWithTrigrams(splitIntoWords($sentence))));
67 | foreach $index => $r (@past)
68 | {
69 | local('$rule $text $path $context @suggestions');
70 | ($rule, $text, $path, $context, @suggestions) = $r;
71 |
72 | if ($r in @results)
73 | {
# apply the first suggestion to the sentence and print it
# NOTE(review): $n (and $index) are not declared with local() and are
# therefore global in Sleep -- harmless here but worth tidying
74 | $n = strrep($sentence, $text, @suggestions[0]);
75 | println($n);
76 |
# if the replacement changed nothing, surface the rule for debugging
77 | if ($n eq $sentence)
78 | {
79 | println("===> $context $text => " . @suggestions);
80 | }
81 |
82 | break;
83 | }
84 |
85 |
86 | }
87 |
88 | }
89 | }
90 | }
91 |
92 | invoke(&main, @ARGV);
93 |
--------------------------------------------------------------------------------
/utils/spelldata/torules.sl:
--------------------------------------------------------------------------------
1 | #
2 | # Generate a rule file from cut and paste Wikipedia rules data
3 | # http://en.wikipedia.org/wiki/Wikipedia:Lists_of_common_misspellings/Grammar_and_Misc
4 | #
5 | # use java -jar lib/sleep.jar torules.sl wrong to generate a reverse rules file suitable for error corpus generation
6 | #
7 | # paste the contents into a text editor, then paste into a text file and process with this program
8 | #
9 |
10 | $handle = openf("wp.txt");
11 |
12 | %sections = ohash();
13 | setMissPolicy(%sections, { return @(); });
14 |
15 | while $text (readln($handle))
16 | {
17 | if ($text ismatch '.*?[\*\#] (.*?) \((.*?)\).*')
18 | {
19 | ($wrong, $correct) = matched();
20 |
21 | if (',' !isin $correct)
22 | {
23 | @a = split(' ', $wrong);
24 | @b = split(' ', $correct);
25 |
26 | if (size(@a) == size(@b))
27 | {
28 | foreach $index => $word (@a)
29 | {
30 | if ($word !in @b) { $special = $word; $replace = @b[$index]; }
31 | }
32 |
33 | if (@ARGV[0] eq 'wrong')
34 | {
35 | push(%sections["Confused word: $special"], "$correct $+ ::word= $+ $wrong");
36 | }
37 | else
38 | {
39 | push(%sections["Confused word: $special"], "$wrong $+ ::word= $+ $correct $+ ::pivots= $+ $special $+ , $+ $replace $+ ::options= $+ $special $+ , $+ $replace");
40 | }
41 | }
42 | else
43 | {
44 | if (@ARGV[0] eq 'wrong')
45 | {
46 | push(%sections["Multiple Options"], "$correct $+ ::word= $+ $wrong");
47 | }
48 | else
49 | {
50 | push(%sections["Multiple Options"], "$wrong $+ ::word= $+ $correct");
51 | }
52 | }
53 | }
54 | else
55 | {
56 | if (@ARGV[0] ne 'wrong')
57 | {
58 | push(%sections["Misc"], "$wrong $+ ::word= $+ $correct");
59 | #push(%sections["Misc"], "$correct $+ ::word= $+ $wrong");
60 | }
61 | else
62 | {
63 | @temp = split(', ', $correct);
64 | map(lambda({ push(%sections["Misc"], "$1 $+ ::word= $+ $wrong $+ ::options= $+ $correct"); }, \$wrong, \$correct), @temp);
65 | }
66 | }
67 | }
68 | else
69 | {
70 | # push(%sections["__Rejects__"], $text);
71 | }
72 | }
73 |
74 | foreach $key => $value (%sections)
75 | {
76 | println("\n#\n# $key \n#\n");
77 | printAll($value);
78 | }
79 |
--------------------------------------------------------------------------------
/service/code/src/org/dashnine/preditor/LanguageModelSmall.java:
--------------------------------------------------------------------------------
1 | package org.dashnine.preditor;
2 |
3 | import java.io.*;
4 | import java.util.*;
5 | import java.util.zip.*;
6 |
7 | /** This class holds the (minified) AtD language model */
8 | public class LanguageModelSmall extends LanguageModel implements Serializable
9 | {
10 | protected ZipFile entries;
11 |
12 | private static final long lowMemoryThreshold = 256 * 1024 * 1024;
13 |
14 | protected class CacheMap extends LinkedHashMap
15 | {
16 | protected boolean removeEldestEntry(Map.Entry eldest)
17 | {
18 | long memory = Runtime.getRuntime().freeMemory() + (Runtime.getRuntime().maxMemory() - Runtime.getRuntime().totalMemory());
19 | return (size() > 16384 || memory < lowMemoryThreshold);
20 | }
21 | }
22 |
23 | /* read a string value from the specified map... adds the string if it doesn't exist */
24 | protected Value getStringValue(Map map, String word, boolean makeAsNecessary)
25 | {
26 | Object sid = getStringId(word, false);
27 |
28 | if (sid != null)
29 | {
30 | synchronized (this)
31 | {
32 | Value val = (Value)map.get(sid);
33 | if (val == null && map == model)
34 | {
35 | try
36 | {
37 | int sid_i = ((Integer)sid).intValue();
38 |
39 | ZipEntry entry = entries.getEntry((sid_i % 512) + "/" + sid_i);
40 | if (entry != null)
41 | {
42 | ObjectInputStream stream = new ObjectInputStream(entries.getInputStream(entry));
43 | try { val = (Value)stream.readObject(); } finally { stream.close(); }
44 | map.put(sid, val);
45 | }
46 | }
47 | catch (Exception ex)
48 | {
49 | System.err.println("Could not load: " + word + " (" + sid + ")");
50 | ex.printStackTrace();
51 | }
52 | }
53 | return val;
54 | }
55 | }
56 |
57 | return null;
58 | }
59 |
60 | public LanguageModelSmall(Map _string_pool, long _count, File entries_file)
61 | {
62 | string_pool = _string_pool;
63 | count = _count;
64 | model = new CacheMap();
65 | try
66 | {
67 | entries = new ZipFile(entries_file);
68 | }
69 | catch (Exception ex)
70 | {
71 | System.err.println("Could not load zipfile: " + entries_file);
72 | ex.printStackTrace();
73 | }
74 | }
75 | }
76 |
--------------------------------------------------------------------------------
/utils/bigrams/amigo.sl:
--------------------------------------------------------------------------------
1 | #
2 | # test spanish homophones against spanish corpora.
3 | #
4 |
5 | debug(7 | 24);
6 |
7 | include("lib/quality.sl");
8 | include("lib/engine.sl");
9 |
10 | #
11 | # load AtD models
12 | #
13 | global('$lang');
14 |
15 | $lang = systemProperties()["atd.lang"];
16 | if ($lang ne "" && -exists "lang/ $+ $lang $+ /load.sl") {
17 | include("lang/ $+ $lang $+ /load.sl");
18 | initAllModels();
19 | }
20 |
21 | #
22 | # load homophones
23 | #
24 | sub homophones {
25 | local('$handle $text %h @candidates');
26 | $handle = openf("lang/ $+ $lang $+ /homophonedb.txt");
27 | while $text (readln($handle)) {
28 | if ('-*' iswm $text) {
29 | %h[substr($text, 1)] = $null;
30 | }
31 | else {
32 | @candidates = split(',\s+', $text);
33 | map(lambda({ %h[$1] = @candidates; }, \%h, \@candidates), @candidates);
34 | }
35 | }
36 | return %h;
37 | }
38 |
39 | sub isHomophone {
40 | local('$sentence $pre2 $pre1 $current $next @results');
41 | ($sentence, $pre2, $pre1, $current, $next) = @_;
42 |
43 | @results = checkHomophone($hnetwork, $current, %homophones[$current], $pre1, $next, @(), $pre2, $bias1 => 30.0, $bias2 => 10.0);
44 |
45 | if (size(@results) > 0) {
46 | println("\t $+ $sentence");
47 | println("\t $+ $pre2 $pre1 | $current | $next or: " . @results . "\n");
48 | }
49 | }
50 |
51 | #
52 | # check a sentence for homophones
53 | #
54 | sub checkSentenceForHomophones {
55 | local('$pre2 $pre1 $current $next $word');
56 |
57 | $current = '0BEGIN.0';
58 |
59 | foreach $next (splitIntoWords($1)) {
60 | if ($current ne '0BEGIN.0' && $current in %homophones) {
61 | isHomophone($1, $pre2, $pre1, $current, $next);
62 | }
63 | $pre2 = $pre1;
64 | $pre1 = $current;
65 | $current = $next;
66 | }
67 |
68 | $next = '0END.0';
69 |
70 | if ($current in %homophones) {
71 | isHomophone($1, $pre2, $pre1, $current, $next);
72 | }
73 | }
74 |
75 | #
76 | # loop through the file, look for homophones... report them!
77 | #
78 | sub checkForHomophones {
79 | local('$handle $contents');
80 | $handle = openf($1);
81 | $contents = splitIntoSentences(join("\n", readAll($handle, -1)));
82 | map(&checkSentenceForHomophones, $contents);
83 | closef($handle);
84 | }
85 |
86 | sub main {
87 | global('%homophones');
88 | %homophones = homophones();
89 | [{
90 | if (-isDir $1) {
91 | map($this, ls($1));
92 | }
93 | else {
94 | if ('*.txt' iswm $1) {
95 | println($1);
96 | checkForHomophones($1);
97 | }
98 | }
99 | }: "lang/ $+ $lang $+ /corpus"];
100 | }
101 |
102 | invoke(&main, @ARGV);
103 |
--------------------------------------------------------------------------------
/utils/spelldata/bootstrapspell.sl:
--------------------------------------------------------------------------------
1 | #
2 | # Walk through a corpus and find spelling errors and their corrections
3 | #
4 | # java [all the memory junk here] -jar lib/sleep.jar utils/spelldata/bootstrapspell.sl data/corpus_wikipedia
5 | #
6 |
7 | debug(7 | 34);
8 |
9 | include("lib/engine.sl");
10 |
11 | global('$model $dictionary $trie $rules $network $hnetwork %edits $dsize $old_suggest %words');
12 |
13 | $model = get_language_model();
14 | $dictionary = dictionary();
15 | $rules = get_rules();
16 | $trie = trie($dictionary);
17 | $network = get_network("cnetwork.bin");
18 | $hnetwork = get_network("hnetwork2.bin");
19 | %edits = initEdits();
20 | setRemovalPolicy(%edits, { return 1; });
21 | $dsize = size($dictionary);
22 | initTaggerModels();
23 |
24 | $old_suggest = function('&getSuggestionPool');
25 |
26 | sub getSuggestionPool
27 | {
28 | local('$error $dict $pre $next @suggests %scores');
29 | ($error, $dict, $pre, $next) = @_;
30 |
31 | if ($error ismatch '[a-z]+\'{0,1}[a-z]+' && $pre ne "" && $next ne "" && ($pre ne '0BEGIN.0' || $next ne '0END.0') && $pre ismatch '[a-zA-Z0-9\\.,]+' && $next ismatch '[a-zA-Z0-9\\.,]+')
32 | # if ($error in %words && $pre ne "" && $next ne "" && ($pre ne '0BEGIN.0' || $next ne '0END.0') && $pre ismatch '[a-zA-Z0-9\\.,]+' && $next ismatch '[a-zA-Z0-9\\.,]+')
33 | {
34 | (@suggests, %scores) = invoke($old_suggest, @_);
35 |
36 | if (size(@suggests) > 0 && %seen[@_] is $null)
37 | {
38 | println("$pre * $next $+ |" . @suggests[0] . ", $error $+ |" . %scores[@suggests[0]]);
39 | %seen[@_] = 1;
40 | }
41 |
42 | return @(@suggests, %scores);
43 | }
44 |
45 | return @(@(), %());
46 | }
47 |
48 | sub checkIt
49 | {
50 | local('$handle $data');
51 | $handle = openf($1);
52 | $data = readb($handle, -1);
53 | closef($handle);
54 |
55 | $data = stripHTML($data);
56 |
57 | processDocument($data);
58 |
59 | local('@paragraphs $paragraph $sentence');
60 | @paragraphs = splitByParagraph($data);
61 |
62 | foreach $paragraph (@paragraphs)
63 | {
64 | foreach $sentence ($paragraph)
65 | {
66 | if ($sentence eq "")
67 | {
68 | continue;
69 | }
70 |
71 | checkSentenceSpelling(splitIntoWords($sentence), @results => @());
72 | }
73 | }
74 |
75 | [System gc];
76 | }
77 |
78 | sub main
79 | {
80 | # collect list of files.
81 | [{
82 | if (-isDir $1)
83 | {
84 | map($this, ls($1));
85 | }
86 | else if ("*Image*.html" !iswm $1 && "*User*.html" !iswm $1)
87 | {
88 | checkIt($1);
89 | }
90 | }: $1];
91 | }
92 |
93 | invoke(&main, @ARGV);
94 |
--------------------------------------------------------------------------------
/utils/bigrams/corpus-lex-diff.sl:
--------------------------------------------------------------------------------
1 | #
2 | # Analyze a text file containing raw text data and show the top words not in the current wordlist data
3 | #
4 | #
5 |
6 | sub loadWordlists
7 | {
8 | if (-isDir $1)
9 | {
10 | map($this, ls($1));
11 | }
12 | else
13 | {
14 | loadWordlist($1, \%wordlist);
15 | }
16 | }
17 |
18 | sub loadWordlist
19 | {
20 | local('$handle $word');
21 | $handle = openf($1);
22 | map(lambda({ %wordlist[$1] = 1; }, \%wordlist), split("\n", readb($handle, -1)));
23 | closef($handle);
24 | }
25 |
26 | sub wordlists
27 | {
28 | this('$dictionary');
29 | if ($dictionary is $null)
30 | {
31 | $dictionary = %();
32 | [lambda(&loadWordlists, %wordlist => $dictionary) : "data/wordlists"];
33 |
34 | # add punctuation chars here
35 |
36 | # warn("Loaded: " . size($dictionary) . " words");
37 |
38 | $dictionary[","] = 1; # make sure commas are in the wordlist
39 | }
40 | return $dictionary;
41 | }
42 |
43 | #
44 | # tool to build a corpus. <3
45 | #
46 |
47 | debug(7 | 34);
48 |
49 | sub process
50 | {
51 | local('@words $head $next');
52 |
53 | @words = splitIntoWords($1);
54 |
55 | while (size(@words) > 1)
56 | {
57 | ($next) = @words;
58 |
59 | if ($next !in %wordlists && lc($next) !in %wordlists && !-isnumber $next)
60 | {
61 | %nots[$next] += 1;
62 | }
63 |
64 | @words = sublist(@words, 1);
65 | }
66 | }
67 |
68 | sub processFile
69 | {
70 | local('$handle $key $data $text @paragraphs');
71 |
72 | # read in our corpus.
73 | $handle = openf($1);
74 | $text = replace(readb($handle, -1), '<[^>]*?>', '');
75 | closef($handle);
76 |
77 | # start processing it?!?
78 | @paragraphs = splitByParagraph($text);
79 | map({ map({ map(&process, splitIntoClauses($1)); }, $1); }, @paragraphs);
80 | }
81 |
82 | sub main
83 | {
84 | global('%wordlists %dictionary @files %current %nots');
85 |
86 | include("lib/nlp.sl");
87 | include("lib/dictionary.sl");
88 |
89 | %wordlists = wordlists();
90 |
91 | processFile(@ARGV[0]);
92 |
93 | local('@words $word');
94 |
95 | # sort everything...
96 |
97 | @words = sort({ return %nots[$2] <=> %nots[$1]; }, filter(lambda({ return iff($min == 0 || %nots[$1] > $min, $1); }, $min => $2), keys(%nots)));
98 |
99 | foreach $word (@words)
100 | {
101 | if (($2 == 0 || %nots[$word] > $2))
102 | {
103 | if ($3 eq "")
104 | {
105 | println("$[50]word ... " . %nots[$word]);
106 | }
107 | else
108 | {
109 | println($word);
110 | }
111 | }
112 | }
113 | }
114 |
115 | invoke(&main, @ARGV);
116 |
--------------------------------------------------------------------------------
/utils/common/spellcontext.sl:
--------------------------------------------------------------------------------
1 | #
2 | # test out spelling with associated context information
3 | #
4 |
5 | sub suggestTest
6 | {
7 | local('$suspect $dict $previous $next @suggestions $f');
8 | ($suspect, $dict, $previous, $next) = @_;
9 |
10 | @suggestions = %edits[$suspect];
11 |
12 | if ($correct in @suggestions)
13 | {
14 | foreach $f (@functions)
15 | {
16 | [$f : $suspect, $correct, copy(@suggestions), $previous, $next];
17 | }
18 | # warn("Done for $previous $suspect $next -> $correct");
19 | }
20 |
21 | return @();
22 | }
23 |
24 | sub testCorrectionsContext
25 | {
26 | local('$score $entry $sentence $correct $wrongs @results @words $rule $wrong $previous $next $func');
27 |
28 | while $entry (sentences($1))
29 | {
30 | ($sentence, $correct, $wrongs) = $entry;
31 | ($previous, $next) = split(' \\* ', $sentence);
32 | $func = lambda(&suggestTest, \$score, \$correct, @functions => sublist(@_, 1));
33 |
34 | #
35 | # check for a false negative
36 | #
37 | foreach $wrong ($wrongs)
38 | {
39 | [$func: $wrong, $dictionary, $previous, $next]
40 | }
41 | }
42 | }
43 |
44 | sub checkAnyHomophone
45 | {
46 | return invoke(&checkAnyHomophone2, @_, parameters => %(\$criteriaf))[0];
47 | }
48 |
49 | sub checkAnyHomophone2
50 | {
51 | local('$current $options $pre $next %scores $criteriaf @results $option $hnetwork $tags $pre2 $next2');
52 | ($hnetwork, $current, $options, $pre, $next, $tags, $pre2, $next2) = @_;
53 |
54 | # setup the criteria function
55 | # $criteriaf = criteria(@("pref", "postf", "probability"));
56 |
57 | # $options = filter(lambda({ return iff(Pbigram1($pre, $1) > 0.0 || Pbigram2($1, $next) > 0.0, $1); }, \$pre, \$next), $options);
58 |
59 | # score the options
60 | foreach $option ($options)
61 | {
62 | # warn(@_ . " -> " . [$criteriaf: $current, $option, $options, $pre, $next, $tags]);
63 | %scores[$option] = [$hnetwork getresult: [$criteriaf: $current, $option, $options, $pre, $next, $tags, $pre2, $next2]]["result"];
64 | if ($option eq $current)
65 | {
66 | # warn(Pword($current));
67 | %scores[$option] *= 10.0; # * (1.0 - (Pword($current) * 2500));
68 | }
69 | }
70 |
71 | # filter out any unacceptable words
72 | @results = filter(lambda({ return iff(%scores[$1] >= %scores[$current] && $1 ne $current && %scores[$1] > 0.0, $1, $null); }, \%scores, \$current), $options);
73 |
74 | # sort the remaining results (probably only one left at this point)
75 | @results = sort(lambda({ return %scores[$2] <=> %scores[$1]; }, \%scores), @results);
76 |
77 | if (size(@results) > 0)
78 | {
79 | # warn("checkHomophone: " . @_ . " -> " . @results);
80 | # warn(" " . %scores);
81 | }
82 |
83 | # return the results
84 | return @(@results, %scores);
85 | }
86 |
--------------------------------------------------------------------------------
/utils/rules/testr.sl:
--------------------------------------------------------------------------------
1 | #
2 | # This is a script to test the rules out. It's fun stuff.
3 | #
4 | # java -jar utils/rules/testr.sl
5 | #
6 | # format:
7 | #
8 | # rule..|[key=value|...]
9 | #
10 | # note that key=value are parsed and dumped into a hash. This information is used by the system to
11 | # filter out false positives and stuff.
12 | #
13 |
14 | include("lib/engine.sl");
15 | include("utils/rules/rules.sl");
16 |
17 | sub checkSentenceSpelling
18 | {
19 | }
20 |
21 | setf('&score', let({
22 | local('$value');
23 | $value = invoke($oldf, @_);
24 | warn("Looking at: " . join("|", @_) . " = " . $value);
25 | return $value;
26 | }, $oldf => &score));
27 |
28 | sub initAll
29 | {
30 | global('$__SCRIPT__ $model $rules $dictionary $network $dsize %edits $hnetwork $account $usage $endings $lexdb $trigrams $verbs');
31 | $model = get_language_model();
32 | $dictionary = dictionary();
33 | $dsize = size($dictionary);
34 | $hnetwork = get_network("hnetwork4.bin");
35 | $verbs = loadVerbData();
36 | initTaggerModels();
37 | }
38 |
39 | sub main
40 | {
41 | local('$handle $sentence @results @past $index $r $key $value');
42 |
43 | initAll();
44 |
45 | if (function("& $+ $1") !is $null)
46 | {
47 | $rules = machine();
48 | invoke(function("& $+ $1"));
49 | }
50 | else
51 | {
52 | $rules = loadRules(machine(), $1, %());
53 | }
54 |
55 | # processSentence now expects $rules to be an array of rule packages
56 | $rules = @( $rules );
57 |
58 | $handle = openf($2);
59 | while $sentence (readln($handle))
60 | {
61 | @results = @();
62 | processSentence(\$sentence, \@results);
63 |
64 | @past = copy(@results);
65 |
66 | if (size(@past) > 0)
67 | {
68 | println($sentence);
69 | println(taggerToString(taggerWithTrigrams(splitIntoWords($sentence))));
70 | foreach $index => $r (@past)
71 | {
72 | local('$rule $text $path $context @suggestions');
73 | ($rule, $text, $path, $context, @suggestions) = $r;
74 |
75 | if ($r in @results)
76 | {
77 | println(" $index $+ ) [ACCEPT] $context $+ , $text -> " . @suggestions);
78 | }
79 | else
80 | {
81 | println(" $index $+ ) [REJECT] $context $+ , $text -> " . @suggestions);
82 | }
83 |
84 | foreach $key => $value ($rule)
85 | {
86 | println(" $[10]key => $value");
87 | }
88 | }
89 | }
90 | else
91 | {
92 | # println("NOT FOUND");
93 | # println($sentence);
94 | # println(taggerToString(taggerWithTrigrams(splitIntoWords($sentence))));
95 | }
96 | }
97 | }
98 |
99 | invoke(&main, @ARGV);
100 |
--------------------------------------------------------------------------------
/data/rules/grammar/det_agreement:
--------------------------------------------------------------------------------
1 | These|Those is::word=This \1::filter=none
2 | These|Those was::word=This \1::filter=none
3 | These|Those is .*/NNS::word=\0 are \2::filter=none
4 | These|Those was .*/NNS::word=\0 were \2::filter=none
5 | These|Those is .*/JJ .*/NNS::word=\0 are \2 \3::filter=none
6 | These|Those was .*/JJ .*/NNS::word=\0 were \2 \3::filter=none
7 |
8 | This are .*/NNS::word=These \1 \2::filter=none
9 | This were .*/NNS::word=Those \1 \2::filter=none
10 | This are .*/JJ .*/NNS::word=These \1 \2 \3::filter=none
11 | This were .*/JJ .*/NNS::word=Those \1 \2 \3::filter=none
12 | This are::word=This is::filter=none
13 | This were::word=This was::filter=none
14 |
15 | # rules for there
16 |
17 | there|There is none::filter=kill
18 | there|There are none|but|today|plenty|way::filter=kill
19 |
20 | there|There is .*/NNS of .*/NN|VBG::filter=kill
21 | there|There are .*/NN of .*/NNS|VBG|JJ::filter=kill
22 | there|There are .*/NN of .*/NN .*/NNS|VBG::filter=kill
23 | there|There are .*/NN .*/NNS::filter=kill
24 | there|There are .*/NN .*/NN .*/NNS::filter=kill
25 | there|There are .*/NN too many::filter=kill
26 |
27 | #there/EX are/VBP plenty/NN of/IN advantages/NNS to/TO
28 |
29 | # according to http://ask.metafilter.com/84536/There-is-or-There-are
30 | # I should use the closest noun to determine is/are. So these rules are not
31 | # needed. Just the same I'm commenting them out for future reference.
32 | #there|there is .*/NN and .*/NN::word=\0 are \2 \3 \4::filter=none
33 | #there|There are .*/NN and .*/NN::filter=kill
34 |
35 | there|There are .*/NN::word=\0 is \2::pivots=\1,is
36 | there|There is .*/NNS::word=\0 are \2::pivots=\1,are
37 | there|There is .*/NN .*/NNS::word=\0 are \2 \3::pivots=\1,are
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 | there's|There's none::filter=kill
49 | there's|There's none|but|today|plenty::filter=kill
50 |
51 | there's|There's .*/NNS of .*/NN|VBG::filter=kill
52 | there're|There're .*/NN of .*/NNS|VBG|JJ::filter=kill
53 | there're|There're .*/NN of .*/NN .*/NNS|VBG::filter=kill
54 | there're|There're .*/NN .*/NNS::filter=kill
55 | there're|There're .*/NN .*/NN .*/NNS::filter=kill
56 | there're|There're .*/NN too many::filter=kill
57 |
58 | #there/EX are/VBP plenty/NN of/IN advantages/NNS to/TO
59 |
60 | # according to http://ask.metafilter.com/84536/There-is-or-There-are
61 | # I should use the closest noun to determine is/are. So these rules are not
62 | # needed. Just the same I'm commenting them out for future reference.
63 | #there|there is .*/NN and .*/NN::word=\0 are \2 \3 \4::filter=none
64 | #there|There are .*/NN and .*/NN::filter=kill
65 |
66 | There're .*/NN::word=There's \1::pivots=\0,There's
67 | there're .*/NN::word=there's \1::pivots=\0,there's
68 |
69 | There's .*/NNS::word=There are \1::pivots=\0,There are
70 | there's .*/NNS::word=there are \1::pivots=\0,there are
71 |
72 | There's .*/NN .*/NNS::word=There are \1 \2::pivots=\0,There are
73 | there's .*/NN .*/NNS::word=there are \1 \2::pivots=\0,there are
74 |
--------------------------------------------------------------------------------
/data/rules/grammar/repeats:
--------------------------------------------------------------------------------
1 | #
2 | # some repeated words, makes no sense.
3 | #
4 |
5 | you'll will::word=you will::filter=none
6 | You'll will::word=You will::filter=none
7 | I'll will::word=I will::filter=none
8 | we'll will::word=we will::filter=none
9 | We'll will::word=We will::filter=none
10 | they'll will::word=they will::filter=none
11 | They'll will::word=They will::filter=none
12 | She'll will::word=She will::filter=none
13 | she'll will::word=she will::filter=none
14 | He'll will::word=He will::filter=none
15 | he'll will::word=he will::filter=none
16 |
17 | aren't not::word=are not::filter=none
18 | didn't not::word=did not::filter=none
19 | don't not::word=do not::filter=none
20 | isn't not::word=is not::filter=none
21 | can't not::word=can not::filter=none
22 | weren't not::word=were not::filter=none
23 | wouldn't not::word=would not::filter=none
24 | doesn't not::word=does not::filter=none
25 | hasn't not::word=has not::filter=none
26 | couldn't not::word=could not::filter=none
27 |
28 | Aren't not::word=Are not::filter=none
29 | Didn't not::word=Did not::filter=none
30 | Don't not::word=Do not::filter=none
31 | Isn't not::word=Is not::filter=none
32 | Can't not::word=Can not::filter=none
33 | Weren't not::word=Were not::filter=none
34 | Wouldn't not::word=Would not::filter=none
35 | Doesn't not::word=Does not::filter=none
36 | Hasn't not::word=Has not::filter=none
37 | Couldn't not::word=Could not::filter=none
38 |
39 | it's is::word=it is::filter=none
40 | It's is::word=It is::filter=none
41 | That's is::word=That is::filter=none
42 | that's is::word=that is::filter=none
43 | there's is::word=there is::filter=none
44 | There's is::word=There is::filter=none
45 | he's is::word=he is::filter=none
46 | He's is::word=He is::filter=none
47 | she's is::word=she is::filter=none
48 | She's is::word=She is::filter=none
49 | who's is::word=who is::filter=none
50 | Who's is::word=Who is::filter=none
51 |
52 | we're are::word=we are::filter=none
53 | you're are::word=you are::filter=none
54 | they're are::word=they are::filter=none
55 | We're are::word=We are::filter=none
56 | You're are::word=You are::filter=none
57 | They're are::word=They are::filter=none
58 | Who're are::word=Who are::filter=none
59 | who're are::word=who are::filter=none
60 |
61 | I'm am::word=I am::filter=none
62 | I've have::word=I have::filter=none
63 |
64 | you've have::word=you have::filter=none
65 | we've have::word=we have::filter=none
66 | they've have::word=they have::filter=none
67 |
68 | You've have::word=You have::filter=none
69 | We've have::word=We have::filter=none
70 | They've have::word=They have::filter=none
71 |
72 | I'd would::word=I would::filter=none
73 |
74 | he'd would::word=he would::filter=none
75 | she'd would::word=she would::filter=none
76 | you'd would::word=you would::filter=none
77 | we'd would::word=we would::filter=none
78 | they'd would::word=they would::filter=none
79 |
80 | He'd would::word=He would::filter=none
81 | She'd would::word=She would::filter=none
82 | You'd would::word=You would::filter=none
83 | We'd would::word=We would::filter=none
84 | They'd would::word=They would::filter=none
85 |
--------------------------------------------------------------------------------
/utils/spelldata/gen2.sl:
--------------------------------------------------------------------------------
1 | #
2 | # process through corpus, our goal is to associate all misspelt words with a sentence.
3 | #
4 | # java -jar sleep.jar gen.sl corpus_file wordfile filetowriteto
5 | #
6 | # wordfile must be in bad\ngood\n order
7 | #
8 |
9 | debug(7 | 34);
10 |
11 | sub getthree
12 | {
13 | local('@words');
14 | @words = copy($1);
15 | add(@words, '0BEGIN.0');
16 | push(@words, '0END.0');
17 |
18 | while (size(@words) >= 3)
19 | {
20 | yield sublist(@words, 0, 3);
21 | @words = sublist(@words, 1);
22 | }
23 | }
24 |
25 | sub process
26 | {
27 | local('@words $entry $previous $current $next');
28 |
29 | $1 = [$1 trim];
30 | if ($1 !ismatch '[A-Z][A-Za-z\' ]*?[\.\?\!]')
31 | {
32 | return;
33 | }
34 |
35 | @words = splitIntoWords($1);
36 |
37 | while $entry (getthree(@words))
38 | {
39 | ($previous, $current, $next) = $entry;
40 |
41 | if (%words[$current] !is $null && %dictionary[$previous] !is $null && %dictionary[$next] !is $null && %counts[$current] < 1)
42 | {
43 | println($output, "$previous * $next $+ |" . join(", ", @($current, rand(%dataset[$current]))) );
44 | %counts[$current] += 1;
45 | }
46 | }
47 | }
48 |
49 | sub processFile
50 | {
51 | local('$handle $key $data $text @paragraphs');
52 |
53 | # read in our corpus.
54 | $handle = openf($1);
55 | $text = replace(readb($handle, -1), '<[^>]*?>', '');
56 | closef($handle);
57 |
58 | # start processing it?!?
59 | @paragraphs = splitByParagraph($text);
60 | map({ map(&process, $1); }, @paragraphs);
61 |
62 | #warn("Processed $1 $+ !");
63 | }
64 |
65 | sub main
66 | {
67 | global('%dataset $goal %words %counts');
68 |
69 | # load the words we're interested in.
70 | local('$handle $text $good');
71 |
72 | $handle = openf($2);
73 | while $text (readln($handle))
74 | {
75 | $good = readln($handle);
76 |
77 | if (%dataset[$good] is $null) { %dataset[$good] = @(); }
78 | push(%dataset[$good], $text);
79 | %words[$good] += 1;
80 | }
81 | closef($handle);
82 |
83 | $goal = size(%dataset);
84 |
85 | # setup our file that we're going to dump the output to.
86 | global('$output');
87 | $output = openf("> $+ $3");
88 |
89 | # ok go through all the junk parsing through the files.
90 |
91 | include("lib/nlp.sl");
92 | include("lib/dictionary.sl");
93 | global('%dictionary');
94 | %dictionary = dictionary();
95 | %dictionary["0BEGIN.0"] = 1;
96 | %dictionary["0END.0"] = 1;
97 |
98 | # collect list of files.
99 | [{
100 | if (-isDir $1)
101 | {
102 | map($this, ls($1));
103 | }
104 | else if ("*Image*.html" !iswm $1 && "*User*.html" !iswm $1)
105 | {
106 | processFile($1);
107 | }
108 | }: $1];
109 |
110 |
111 | closef($output);
112 | println("Done!");
113 | }
114 |
115 | assert size(@ARGV) == 3 : "java -jar sleep.jar corpus_data wordlist outputfile";
116 |
117 | invoke(&main, @ARGV);
118 |
--------------------------------------------------------------------------------
/utils/spelldata/gen3.sl:
--------------------------------------------------------------------------------
1 | #
2 | # process through corpus, our goal is to associate all misspelt words with a sentence.
3 | #
4 | # java -jar sleep.jar gen.sl corpus_file wordfile filetowriteto
5 | #
6 | # wordfile must be in bad\ngood\n order
7 | #
8 |
9 | debug(7 | 34);
10 |
11 | sub getthree
12 | {
13 | local('@words');
14 | @words = copy($1);
15 | add(@words, '0BEGIN.0');
16 | push(@words, '0END.0');
17 |
18 | while (size(@words) >= 3)
19 | {
20 | yield sublist(@words, 0, 3);
21 | @words = sublist(@words, 1);
22 | }
23 | }
24 |
25 | sub process
26 | {
27 | local('@words $entry $previous $current $next');
28 |
29 | $1 = [$1 trim];
30 | if ($1 !ismatch '[A-Z][A-Za-z\' ]*?[\.\?\!]')
31 | {
32 | return;
33 | }
34 |
35 | @words = splitIntoWords($1);
36 |
37 | while $entry (getthree(@words))
38 | {
39 | ($previous, $current, $next) = $entry;
40 |
41 | if (%words[$current] !is $null && %dictionary[$previous] !is $null && %dictionary[$next] !is $null && %counts[$current] < 10)
42 | {
43 | println($output, "$previous * $next $+ |" . join(", ", concat($current, %dataset[$current])) );
44 | %counts[$current] += 1;
45 | }
46 | }
47 | }
48 |
49 | sub processFile
50 | {
51 | local('$handle $key $data $text @paragraphs');
52 |
53 | # read in our corpus.
54 | $handle = openf($1);
55 | $text = replace(readb($handle, -1), '<[^>]*?>', '');
56 | closef($handle);
57 |
58 | # start processing it?!?
59 | @paragraphs = splitByParagraph($text);
60 | map({ map(&process, $1); }, @paragraphs);
61 |
62 | #warn("Processed $1 $+ !");
63 | }
64 |
65 | sub main
66 | {
67 | global('%dataset $goal %words %counts');
68 |
69 | # load the words we're interested in.
70 | local('$handle $text $good');
71 |
72 | $handle = openf($2);
73 | while $text (readln($handle))
74 | {
75 | $good = readln($handle);
76 |
77 | if (%dataset[$good] is $null) { %dataset[$good] = @(); }
78 | push(%dataset[$good], $text);
79 | %words[$good] += 1;
80 | }
81 | closef($handle);
82 |
83 | $goal = size(%dataset);
84 |
85 | # setup our file that we're going to dump the output to.
86 | global('$output');
87 | $output = openf("> $+ $3");
88 |
89 | # ok go through all the junk parsing through the files.
90 |
91 | include("lib/nlp.sl");
92 | include("lib/dictionary.sl");
93 | global('%dictionary');
94 | %dictionary = dictionary();
95 | %dictionary["0BEGIN.0"] = 1;
96 | %dictionary["0END.0"] = 1;
97 |
98 | # collect list of files.
99 | [{
100 | if (-isDir $1)
101 | {
102 | map($this, ls($1));
103 | }
104 | else if ("*Image*.html" !iswm $1 && "*User*.html" !iswm $1)
105 | {
106 | processFile($1);
107 | }
108 | }: $1];
109 |
110 |
111 | closef($output);
112 | println("Done!");
113 | }
114 |
115 | assert size(@ARGV) == 3 : "java -jar sleep.jar corpus_data wordlist outputfile";
116 |
117 | invoke(&main, @ARGV);
118 |
--------------------------------------------------------------------------------
/utils/tagger/postest.sl:
--------------------------------------------------------------------------------
1 | #
2 | # test the tagger
3 | #
4 |
5 | debug(debug() | 7 | 34);
6 |
7 | include("lib/tagger.sl");
8 | initTaggerModels();
9 |
10 | sub both
11 | {
12 | local('$a $b');
13 | ($a, $b) = @_;
14 | while (size($a) > 0 && size($b) > 0)
15 | {
16 | yield @($a[0], $b[0]);
17 | $a = sublist($a, 1);
18 | $b = sublist($b, 1);
19 | }
20 | }
21 |
22 | sub tests
23 | {
24 | local('$handle $count $line $word $tag $f $compare $taggit $opt');
25 |
26 | $handle = openf(@ARGV[0]);
27 | while $line (readln($handle))
28 | {
29 | $compare = map({ return split('/', $1)[0]; }, split(' ', $line));
30 |
31 | foreach $f (@_)
32 | {
33 | $taggit = taggerToString([$f tag: $compare]);
34 |
35 | while $opt (both(split(' ', $line), split(' ', $taggit)))
36 | {
37 | ($word, $tag) = split('/', $opt[0]);
38 |
39 | if ($word in $lexdb)
40 | {
41 | if ($opt[0] eq $opt[1])
42 | {
43 | [$f scoreK];
44 | }
45 | [$f countK];
46 | }
47 | else
48 | {
49 | if ($opt[0] eq $opt[1])
50 | {
51 | [$f scoreU];
52 | }
53 | [$f countU];
54 | }
55 | }
56 | }
57 |
58 | $count++;
59 | # if (($count % 2500) == 0 && $count > 0)
60 | # {
61 | # foreach $f (@_)
62 | # {
63 | # [$f print];
64 | # }
65 | # println("$[-20]count");
66 | # }
67 | }
68 |
69 | foreach $f (@_)
70 | {
71 | [$f print];
72 | }
73 | }
74 |
75 | sub test
76 | {
77 | return lambda(
78 | {
79 | if ($0 eq "tag")
80 | {
81 | return invoke($function, @_);
82 | }
83 | else if ($0 eq "scoreK")
84 | {
85 | $scoreK += 1;
86 | }
87 | else if ($0 eq "countK")
88 | {
89 | $countK += 1;
90 | }
91 | else if ($0 eq "scoreU")
92 | {
93 | $scoreU += 1;
94 | }
95 | else if ($0 eq "countU")
96 | {
97 | $countU += 1;
98 | }
99 | else if ($0 eq "print")
100 | {
101 | println("test: $description = known: " . ($scoreK / $countK) . " unknown: " . ($scoreU / $countU) . " composite: " . (($scoreK + $scoreU) / ($countK + $countU)));
102 | }
103 | }, $function => $2, $description => $1, $scoreK => 0.0, $countK => 0.0, $scoreU => 0.0, $countU => 0.0);
104 | }
105 |
106 | tests(
107 | # test("pytagger", &taggerPython),
108 | # test("brill-light", &taggerLikeBrill),
109 | test("trigrams", &taggerWithTrigrams),
110 | test("lexprob", &taggerWithLexProb),
111 | # test("trigrams w/ neural", &taggerWithNeuralTrigrams),
112 | # test("trigrams w/ fix", &taggerWithTrigramsFix),
113 | # test("trigrams - no fixes", &taggerWithTrigrams2),
114 | # test("random", &taggerRandom)
115 | # test("HMM", &taggerHMM)
116 | );
117 |
--------------------------------------------------------------------------------
/utils/spell/definitions.sl:
--------------------------------------------------------------------------------
1 | #
2 | # this script creates a dictionary definitions file for AtD from the raw text of the public
3 | # domain OPTED dictionary (Online Plain Text English Dictionary)
4 | #
5 | # Available at: http://msowww.anu.edu.au/~ralph/OPTED/
6 | #
7 | # Depends on:
8 | # data/rules/homophonedb.txt (list of words we want to create def file for)
9 | #
10 | # Outputs to:
11 | # data/rules/definitions.txt (a worddefinition file)
12 |
13 | debug(7 | 34);
14 |
15 | sub loadWords
16 | {
17 | local('$handle $words $text $word $def');
18 | $handle = openf("data/rules/homophonedb.txt");
19 | $words = split(',\s+', join(", ",readAll($handle)));
20 | closef($handle);
21 |
22 | $handle = openf("data/rules/homo/definitions.txt");
23 | while $text (readln($handle))
24 | {
25 | ($word, $def) = split('\t+', $text);
26 | push($words, $word);
27 | %alts[$word] = $def;
28 | }
29 | closef($handle);
30 |
31 | map({ $dictionary[$1] = 1; }, sort({ return lc($1) cmp lc($2); }, $words));
32 | }
33 |
34 | sub suckUpDictFile
35 | {
36 | local('$handle $text $word $pos $definition $check');
37 | $handle = openf($1);
38 | while $text (readln($handle))
39 | {
40 | if ($text ismatch '(.*?) \((.*?)\) (.*?)
')
41 | {
42 | ($word, $pos, $definition) = matched();
43 | if ("See*" iswm $definition || "Alt. of*" iswm $definition || "pl. of" iswm $definition || "of *" iswm $definition)
44 | {
45 | continue;
46 | }
47 |
48 | if ($word in $dictionary && strlen($dictionary[$word]) == 1)
49 | {
50 | $dictionary[$word] = $definition;
51 | }
52 | if (lc($word) in $dictionary && strlen($dictionary[lc($word)]) == 1)
53 | {
54 | $dictionary[lc($word)] = $definition;
55 | }
56 |
57 | $check = lc($word) . "s";
58 | if ($check in $dictionary && strlen($dictionary[$check]) == 1)
59 | {
60 | $dictionary[$check] = "Plural of " . lc($word) . ". " . $definition;
61 | }
62 | }
63 | }
64 |
65 | closef($handle);
66 | }
67 |
68 |
69 | sub main
70 | {
71 | global('$dictionary %alts');
72 | $dictionary = ohash();
73 | loadWords();
74 |
75 | [{
76 | if (-isDir $1)
77 | {
78 | map($this, ls($1));
79 | }
80 | else
81 | {
82 | suckUpDictFile($1);
83 | }
84 | }: "data/OPTED"];
85 |
86 | local('$word $definition');
87 |
88 | foreach $word => $definition ($dictionary)
89 | {
90 | if ($definition eq "1" || "See*" iswm $definition || "Alt. of*" iswm $definition || "of *" iswm $definition)
91 | {
92 | [[System err] println: "Substituting: $word = " . %alts[$word]];
93 | $definition = uc(charAt(%alts[$word], 0)) . substr(%alts[$word], 1);
94 | }
95 | else
96 | {
97 | $definition = split(';', $definition)[0];
98 | }
99 |
100 | println("$word $+ \t $+ $definition");
101 | }
102 | }
103 |
104 | invoke(&main, @ARGV);
105 |
--------------------------------------------------------------------------------
/utils/bigrams/buildunigrams.sl:
--------------------------------------------------------------------------------
1 | #
2 | # code to load wordlists.
3 | # we use this here because this code actually builds the corpus.
4 | #
5 | # java -jar sleep.jar buildunigrams.sl corpus/ outputfile.bin
6 |
7 | import org.dashnine.preditor.* from: 'lib/spellutils.jar';
8 |
9 | #
10 | # tool to build a corpus. <3
11 | #
12 |
13 | debug(7 | 34);
14 |
15 | sub process
16 | {
17 | local('@words $head $next $previous');
18 |
19 | @words = splitIntoWords($1);
20 | add(@words, '0BEGIN.0', 0);
21 |
22 | [$model addUnigram: '0BEGIN.0'];
23 |
24 | while (size(@words) > 1)
25 | {
26 | ($head, $next) = @words;
27 | [$model addUnigram: $next];
28 | @words = sublist(@words, 1);
29 | }
30 |
31 | [$model addUnigram: '0END.0'];
32 | }
33 |
34 | sub processFile
35 | {
36 | local('$handle $key $data $text @paragraphs');
37 |
38 | # read in our corpus.
39 | $handle = openf($1);
40 | $text = stripHTML(join("\n", readAll($handle)));
41 | closef($handle);
42 |
43 | # start processing it?!?
44 | @paragraphs = splitByParagraph($text);
45 | map({ map(&process, $1); }, @paragraphs);
46 | warn("$1 complete");
47 | }
48 |
49 | sub agent
50 | {
51 | local('$next $key $data $size $ticks $lsize $lang');
52 |
53 | include("lib/nlp.sl");
54 |
55 | $lang = systemProperties()["atd.lang"];
56 | if ($lang ne "" && -exists "lang/ $+ $lang $+ /load.sl")
57 | {
58 | include("lang/ $+ $lang $+ /load.sl");
59 | }
60 |
61 | $next = @files[0];
62 | removeAt(@files, 0);
63 | $size = size(@files);
64 |
65 | println("ready!");
66 |
67 | while ($next !is $null)
68 | {
69 | processFile($next);
70 | $next = @files[0];
71 | @files = sublist(@files, 1);
72 | }
73 | }
74 |
75 | sub main
76 | {
77 | global('%dictionary @files %homophones $model $lock');
78 |
79 | local('$handle');
80 |
81 | if (-exists $2)
82 | {
83 | $handle = openf($2);
84 | $model = readObject($handle);
85 | closef($handle);
86 | }
87 | else
88 | {
89 | $model = [new LanguageModel];
90 | }
91 |
92 | # collect list of files.
93 | [{
94 | if (-isDir $1)
95 | {
96 | map($this, ls($1));
97 | }
98 | else if ("*Image*.html" !iswm $1 && "*User*.html" !iswm $1)
99 | {
100 | push(@files, $1);
101 | }
102 | }: $1];
103 |
104 | local('@agents @store $index $value $threads');
105 |
106 | $threads = 8;
107 |
108 | @store = @(@(), @(), @(), @(), @(), @(), @(), @());
109 |
110 | foreach $index => $value (@files)
111 | {
112 | push(@store[$index % $threads], $value);
113 | }
114 |
115 | for ($index = 0; $index < $threads; $index++)
116 | {
117 | push(@agents, fork(&agent, @files => copy(@store[$index]), \$model, \%homophones, \%dictionary));
118 | }
119 |
120 | foreach $index => $value (@agents)
121 | {
122 | wait($value);
123 | warn("Agent $index complete");
124 | }
125 |
126 | # save model
127 | $handle = openf("> $+ $2");
128 | writeObject($handle, $model);
129 | closef($handle);
130 |
131 | println("Done!");
132 | }
133 |
134 | invoke(&main, @ARGV);
135 |
--------------------------------------------------------------------------------
/utils/rules/makeprepositions.sl:
--------------------------------------------------------------------------------
1 | $handle = openf(@ARGV[0]);
2 | while $text (readln($handle))
3 | {
4 | ($first, $second, $type) = matches($text, '(\w+), (\w+) : (\w+)\\(.*');
5 | if ($type eq 'Pbigram1' && $first ne "wont" && $first ne "continue" && '*ed' !iswm $first && $first ne "attempts")
6 | {
7 | if ($second eq "to")
8 | {
9 | if ($first eq "decided")
10 | {
11 | println(".*/DT $first stir::filter=kill");
12 | }
13 | else if ($first eq "attempt")
14 | {
15 | println(".*/DT $first be::filter=kill");
16 | }
17 | else if ($first eq "reference")
18 | {
19 | println(".*/DT $first have::filter=kill");
20 | }
21 | else if ($first eq "wanted" || $first eq "wants" || $first eq "want")
22 | {
23 | println(".*/PRP $first help::filter=kill");
24 | println(".*/NNP $first help::filter=kill");
25 | }
26 |
27 | if (-islower charAt($first, 0))
28 | {
29 | println(".*/PRP $first .*/VB::word=\\0 \\1 $second \\2::pivots= $+ $first $+ , $+ $first $second");
30 | println(".*/NNP $first .*/VB::word=\\0 \\1 $second \\2::pivots= $+ $first $+ , $+ $first $second");
31 | println(".*/DT $first .*/VB::word=\\0 \\1 $second \\2::pivots= $+ $first $+ , $+ $first $second");
32 | }
33 | else
34 | {
35 | println("0BEGIN.0 $first .*/VB::word=\\0 $second \\1::pivots= $+ $first $+ , $+ $first $second");
36 | println("0BEGIN.0 $first .*/VB::word=\\0 $second \\1::pivots= $+ $first $+ , $+ $first $second");
37 | println("0BEGIN.0 $first .*/VB::word=\\0 $second \\1::pivots= $+ $first $+ , $+ $first $second");
38 | }
39 | }
40 | else if ($second eq "of")
41 | {
42 | if ($first eq "couple")
43 | {
44 | println(".*/DT $first .*/NN|NNS::word=\\0 \\1 $second \\2::pivots= $+ $first $+ , $+ $first $second");
45 | }
46 | else if ($first eq "beware")
47 | {
48 | println(".*/DT $first .*/DT .*/NN|NNS::word=\\0 \\1 $second \\2 \\3::pivots= $+ $first $+ , $+ $first $second");
49 | println(".*/DT $first .*/NN|NNS::word=\\0 \\1 $second \\2::pivots= $+ $first $+ , $+ $first $second");
50 | }
51 | }
52 | else if ($second eq "on" || $second eq "with" || $second eq "in")
53 | {
54 | # println("$first .*/DT .*/NN|NNS::word=\\0 $second \\1 \\2::pivots= $+ $first $+ , $+ $first $second");
55 | # println("$first .*/NN|NNS::word=\\0 $second \\1::pivots= $+ $first $+ , $+ $first $second");
56 | }
57 | else if ($second ne "of" && $second ne "to")
58 | {
59 | # println("$first $second $+ ::filter=none");
60 | }
61 | }
62 | else if ($type eq 'Pbigram2')
63 | {
64 | # println(".*/DT .*/NN $first $+ ::word=\\0 \\1 $second \\2::pivots= $+ $first $+ , $+ $second $first");
65 | # println(".*/VB $first $+ ::word=\\0 $second \\1::pivots=\\1, $+ $second \\1");
66 | # println(".*/VBD $first $+ ::word=\\0 $second \\1::pivots=\\1, $+ $second \\1");
67 | # println(".*/VBD .*/PRP $first $+ ::word=\\0 \\1 $second \\2::pivots= $+ $first $+ , $+ $second $first");
68 | }
69 | }
70 |
71 |
--------------------------------------------------------------------------------
/data/rules/biasdb.txt:
--------------------------------------------------------------------------------
1 | African people South Asian peoples
2 | Dwarves Dwarfs
3 | East Indian South Asian
4 | Siamese twins conjoined twins
5 | West Indian Caribbean
6 | afflicted with has a disability, has an illness
7 | amputee person with an amputation
8 | black market underground economy, deals on the side
9 | black sheep reprobate, backslider
10 | blackball ostracize, disapprove, reject
11 | blacklist condemn, ostracize, boycott
12 | blackmail extort, threaten, demand
13 | businessman business person
14 | chairman chair, co-ordinator, convenor
15 | chronic mental illness long-term mental illness, persistent mental illness, psychiatric disability
16 | cleaning woman cleaner
17 | clergyman clergy, deacon, minister, pastor, priest, rabbi
18 | colored people Black peoples, people of African descent,
19 | common man average person, members of the public
20 | confined to a wheelchair uses a wheelchair
21 | craftsman artisan, craftsperson
22 | crippled impaired, flawed, disabled
23 | deaf mute deaf
24 | disabled person person with a disability
25 | disseminate broadcast, inform, publicise
26 | dwarves dwarfs
27 | epileptics individuals with epilepsy
28 | fair sex women
29 | fireman firefighter
30 | forefathers ancestors
31 | founding fathers founders
32 | hearing impaired hard of hearing
33 | housewife homemaker
34 | ladies women
35 | lady woman
36 | layman layperson, average person
37 | low man|woman on the totem pole lowest rung of the ladder
38 | man hours working hours
39 | man in the street public person in the street, public, member of the public
40 | man the \w+s staff the, handle the
41 | man-made synthetic, artificial
42 | mankind civilization, humanity, people
43 | manpower personnel, staff, staffing requirements, workers, workforce
44 | master copy top copy, original
45 | master of ceremonies host, emcee
46 | masterful domineering, very skilful
47 | mentally ill child|adult|person|boy|girl person with mental illness, person with psychiatric disability
48 | middleman wholesaler, go-between, intermediary
49 | mistress of ceremonies host, emcee
50 | newsman journalist, reporter
51 | niggard miser
52 | niggardly miserly, stingy
53 | non-whites people of colour
54 | old masters classic art, artists
55 | one man show one person show
56 | Oriental Asian
57 | orientals Asian peoples, East Asian peoples, Southeast Asian peoples
58 | paraplegics individuals with paraplegia
59 | physically challenged physically disabled
60 | policeman officer, police officer
61 | postman postal worker, mail carrier
62 | primitive societies non-industrial societies
63 | retarded adult adult with mental retardation
64 | right-hand man assistant
65 | salesman clerk, sales rep
66 | schizophrenics people who have schizophrenia
67 | seminal classical, formative
68 | sexual preference sexual orientation, gender orientation
69 | spokesman spokesperson, representative, speaker, official
70 | stewardess flight attendant
71 | suffering from has a disability, has an illness
72 | the crippled people with a disability
73 | the disabled persons with disabiliites, people with disabilities
74 | the handicapped people with disabilities
75 | the man in the street people in general
76 | the rights of man peoples/citizens rights, the rights of the individual
77 | tribes ethnic groups
78 | wheelchair-bound uses a wheelchair
79 | wives and children families, family
80 | workman worker
81 |
--------------------------------------------------------------------------------
/data/rules/grammar/determiners:
--------------------------------------------------------------------------------
1 | # These rules look for missing determiners
2 |
3 | .*/VBP &determiner_wanted for|of|in|to|so|from|with::word=\0 \1:determiner \1 \2, \0 \1:determiner2 \1 \2::pivots=\1,\1:determiner \1,\1:determiner2 \1
4 | .*/VBP &determiner_wanted 0END.0::word=\0 \1:determiner \1, \0 \1:determiner2 \1::pivots=\1,\1:determiner \1,\1:determiner2 \1
5 | .*/VBP .*/JJ &determiner_wanted for|of|in|to|so|from|with::word=\0 \1:determiner \1 \2 \3, \0 \1:determiner2 \1 \2 \3::pivots=\1,\1:determiner \1,\1:determiner2 \1
6 | .*/VBP .*/JJ &determiner_wanted 0END.0::word=\0 \1:determiner \1 \2, \0 \1:determiner2 \1 \2::pivots=\1,\1:determiner \1,\1:determiner2 \1
7 |
8 | .*/VBZ &determiner_wanted for|of|in|to|so|from|with::word=\0 \1:determiner \1 \2, \0 \1:determiner2 \1 \2::pivots=\1,\1:determiner \1,\1:determiner2 \1
9 | .*/VBZ &determiner_wanted 0END.0::word=\0 \1:determiner \1, \0 \1:determiner2 \1::pivots=\1,\1:determiner \1,\1:determiner2 \1
10 | .*/VBZ .*/JJ &determiner_wanted for|of|in|to|so|from|with::word=\0 \1:determiner \1 \2 \3, \0 \1:determiner2 \1 \2 \3::pivots=\1,\1:determiner \1,\1:determiner2 \1
11 | .*/VBZ .*/JJ &determiner_wanted 0END.0::word=\0 \1:determiner \1 \2, \0 \1:determiner2 \1 \2::pivots=\1,\1:determiner \1,\1:determiner2 \1
12 |
13 | .*/MD .*/VB &determiner_wanted for|of|in|to|so|from|with::word=\0 \1 \2:determiner \2 \3, \0 \1 \2:determiner2 \2 \3::pivots=\2,\2:determiner \2,\2:determiner2 \2
14 | .*/MD .*/VB &determiner_wanted 0END.0::word=\0 \1 \2:determiner \2, \0 \1 \2:determiner2 \2::pivots=\2,\2:determiner \2,\2:determiner2 \2
15 | .*/MD .*/VB .*/JJ &determiner_wanted for|of|in|to|so|from|with::word=\0 \1 \2:determiner \2 \3 \4, \0 \1 \2:determiner2 \2 \3 \4::pivots=\2,\2:determiner \2,\2:determiner2 \2
16 | .*/MD .*/VB .*/JJ &determiner_wanted 0END.0::word=\0 \1 \2:determiner \2 \3, \0 \1 \2:determiner2 \2 \3::pivots=\2,\2:determiner \2,\2:determiner2 \2
17 |
18 | .*/PRP .*/VBD &determiner_wanted for|of|in|to|so|from|with::word=\0 \1 \2:determiner \2 \3, \0 \1 \2:determiner2 \2 \3::pivots=\2,\2:determiner \2,\2:determiner2 \2
19 | .*/PRP .*/VBD &determiner_wanted 0END.0::word=\0 \1 \2:determiner \2, \0 \1 \2:determiner2 \2::pivots=\2,\2:determiner \2,\2:determiner2 \2
20 | .*/PRP .*/VBD .*/JJ &determiner_wanted for|of|in|to|so|from|with::word=\0 \1 \2:determiner \2 \3 \4, \0 \1 \2:determiner2 \2 \3 \4::pivots=\2,\2:determiner \2,\2:determiner2 \2
21 | .*/PRP .*/VBD .*/JJ &determiner_wanted 0END.0::word=\0 \1 \2:determiner \2 \3, \0 \1 \2:determiner2 \2 \3::pivots=\2,\2:determiner \2,\2:determiner2 \2
22 |
23 | .*/PRP be &determiner_wanted::filter=kill
24 | .*/PRP .*/VB &determiner_wanted for|of|in|to|so|from|with::word=\0 \1 \2:determiner \2 \3, \0 \1 \2:determiner2 \2 \3::pivots=\2,\2:determiner \2,\2:determiner2 \2
25 | .*/PRP .*/VB &determiner_wanted 0END.0::word=\0 \1 \2:determiner \2, \0 \1 \2:determiner2 \2::pivots=\2,\2:determiner \2,\2:determiner2 \2
26 | .*/PRP .*/VBP\,RB .*/VB &determiner_wanted for|of|in|to|so|from|with::word=\0 \1 \2 \3:determiner \3 \4, \0 \1 \2 \3:determiner2 \3 \4::pivots=\3,\3:determiner \3,\3:determiner2 \3
27 | .*/PRP .*/VBP\,RB .*/VB &determiner_wanted 0END.0::word=\0 \1 \2 \3:determiner \3, \0 \1 \2 \3:determiner2 \3::pivots=\3,\3:determiner \3,\3:determiner2 \3
28 | .*/PRP .*/VBP &determiner_wanted .*ing::word=\0 \1 \2:determiner \2 \3, \0 \1 \2:determiner2 \2 \3::pivots=\2,\2:determiner \2,\2:determiner2 \2
29 |
--------------------------------------------------------------------------------
/utils/spelldata/gen.sl:
--------------------------------------------------------------------------------
1 | #
2 | # process through corpus, our goal is to associate all misspelt words with a sentence.
3 | #
4 | # java -jar sleep.jar gen.sl corpus_file wordfile filetowriteto
5 | #
6 | # wordfile must be in bad\ngood\n order
7 | #
8 |
9 | debug(7 | 34);
10 |
11 | sub process
12 | {
13 | local('@words $head $next $count $candidate $prev $indict');
14 |
15 | $1 = [$1 trim];
16 | if ($1 !ismatch '[A-Z][A-Za-z\' ]*?[\.\?\!]')
17 | {
18 | return;
19 | }
20 |
21 | if ("we're" isin $1 || "they're" isin $1 || "it's" isin $1)
22 | {
23 | warn("Could be? $1");
24 | }
25 |
26 | @words = splitIntoWords($1);
27 | $count = 0;
28 |
29 | # make sure there is only one misspelling in this sentence.
30 | foreach $word (@words)
31 | {
32 | if (%words[$word] !is $null)
33 | {
34 | $candidate = $word;
35 | $count++;
36 | }
37 |
38 | if (%dictionary[$word] is $null)
39 | {
40 | $indict++;
41 | }
42 | }
43 |
44 | if ($count == 1 && size(@words) >= 3 && %counts[$candidate] < 10 && $indict == 0)
45 | {
46 | $change = replace($1, "\\b $+ $candidate $+ \\b", '*');
47 |
48 | println($output, "$change $+ |" . join(", ", concat(@($candidate), %dataset[$candidate]) ));
49 | %counts[$candidate] += 1;
50 | }
51 | else if ("we're" isin $1 || "they're" isin $1 || "it's" isin $1)
52 | {
53 | warn("Could be? $1 - Nope: $count and " . %counts[$candidate] . " and $indict");
54 | }
55 | }
56 |
57 | sub processFile
58 | {
59 | local('$handle $key $data $text @paragraphs');
60 |
61 | # read in our corpus.
62 | $handle = openf($1);
63 | $text = replace(readb($handle, -1), '<[^>]*?>', '');
64 | closef($handle);
65 |
66 | # start processing it?!?
67 | @paragraphs = splitByParagraph($text);
68 | map({ map(&process, $1); }, @paragraphs);
69 |
70 | #warn("Processed $1 $+ !");
71 | }
72 |
73 | sub main
74 | {
75 | global('%dataset $goal %words %counts');
76 |
77 | # load the words we're interested in.
78 | local('$handle $text $good');
79 |
80 | $handle = openf($2);
81 | while $text (readln($handle))
82 | {
83 | $good = readln($handle);
84 |
85 | if (%dataset[$good] is $null) { %dataset[$good] = @(); }
86 | push(%dataset[$good], $text);
87 | %words[$good] += 1;
88 | }
89 | closef($handle);
90 |
91 | $goal = size(%dataset);
92 |
93 | # setup our file that we're going to dump the output to.
94 | global('$output');
95 | $output = openf("> $+ $3");
96 |
97 | # ok go through all the junk parsing through the files.
98 |
99 | include("nlp.sl");
100 | include("dictionary.sl");
101 | global('%dictionary');
102 | %dictionary = dictionary();
103 |
104 | # collect list of files.
105 | [{
106 | if (-isDir $1)
107 | {
108 | map($this, ls($1));
109 | }
110 | else if ("*Image*.html" !iswm $1 && "*User*.html" !iswm $1)
111 | {
112 | processFile($1);
113 | }
114 | }: $1];
115 |
116 |
117 | closef($output);
118 | println("Done!");
119 | }
120 |
121 | assert size(@ARGV) == 3 : "java -jar sleep.jar corpus_data wordlist outputfile";
122 |
123 | invoke(&main, @ARGV);
124 |
--------------------------------------------------------------------------------
/data/rules/grammar/combine:
--------------------------------------------------------------------------------
1 | #
2 | # words that should be combined
3 | #
4 |
5 | # every day (daily) vs. everyday (common)
6 |
7 | an|in|for|the|to every day::word=\0 everyday::pivots=\1 \2,everyday
8 |
9 | # before hand -> beforehand
10 |
11 | before hand::word=beforehand
12 |
13 | # an other -> another
14 |
15 | an other::word=another
16 |
17 | # all ways -> always (unless refering to everything) like she is better than him in all ways. <- this is ok
18 | in all ways::filter=kill
19 | all ways::word=always
20 |
21 | # every where -> everywhere
22 |
23 | every where::word=everywhere
24 |
25 | #
26 | # more words to combine
27 | #
28 | eye sight::word=eyesight
29 | eye sore::word=eyesore
30 | figure head::word=figurehead
31 | flag ship::word=flagship
32 | head gear::word=headgear
33 | head quarters::word=headquarters
34 | head stone::word=headstone
35 | head wear::word=headwear
36 | how ever::word=however
37 | in stead of::word=instead of
38 | in tact::word=intact
39 | it self::word=itself
40 | key note::word=keynote
41 | laughing stock::word=laughingstock
42 | life time::word=lifetime
43 | mean while::word=meanwhile
44 | nation wide::word=nationwide
45 | near by::word=nearby
46 | new comer::word=newcomer
47 | no where to::word=nowhere to
48 | note worthy::word=noteworthy
49 | now a days::word=nowadays
50 | on going::word=ongoing
51 | out grow::word=outgrow
52 | out side::word=outside
53 | over looked::word=overlooked
54 | over looking::word=overlooking
55 | over rated::word=overrated
56 | over seas::word=overseas
57 | short coming::word=shortcoming
58 | short cut::word=shortcut
59 | side kick::word=sidekick
60 | sky diving::word=skydiving
61 | some how::word=somehow
62 | some what::word=somewhat
63 | stale mate::word=stalemate
64 | them selves::word=themselves
65 | back fire::word=backfire
66 | world wide::word=worldwide
67 | worth while::word=worthwhile
68 | where as::word=whereas
69 | where by::word=whereby
70 | where upon::word=whereupon
71 | #with in an|a|the second|minute|hour|year|decade|century|day::word=within \2 \3::filter=none
72 | with in::word=within
73 | with out::word=without
74 | way side::word=wayside
75 | along side::word=alongside
76 | be cause::word=because
77 | be ware::word=beware
78 | before hand::word=beforehand
79 | down side::word=downside
80 | eye brow::word=eyebrow
81 | eye lash::word=eyelash
82 | eye lid::word=eyelid
83 | through out::word=throughout
84 | on-going::word=ongoing
85 | light weight::word=lightweight
86 | heavy weight::word=heavyweight
87 | free lance::word=freelance
88 | free lancer::word=freelancer
89 | free lances::word=freelances
90 | free lancing::word=freelancing
91 |
92 | # awhile is an adverb, should be used after a verb
93 | .*/VB a while::word=\0 awhile::pivots=a while,awhile
94 |
95 | # join web site into website
96 | web site::word=website
97 | Web Site|site::word=Website
98 |
99 | head scarf::word=headscarf
100 | head scarves::word=headscarves
101 |
102 | key words::word=keywords
103 | crowd sourcing::word=crowdsourcing
104 | meta data::word=metadata
105 | mis .*::word=\0\1::filter=sane
106 |
107 | stand alone::word=standalone
108 | past time::word=pastime
109 | any where::word=anywhere
110 | some where::word=somewhere
111 | no where::word=nowhere
112 | .*/DT bail out::word=\0 bailout::pivots=bail out,bailout
113 |
114 | out come::word=outcome
115 |
116 |
--------------------------------------------------------------------------------
/utils/spelldata/gen4.sl:
--------------------------------------------------------------------------------
1 | #
2 | # process through corpus, our goal is to associate all misspelt words with a sentence.
3 | #
4 | # java -jar sleep.jar gen.sl corpus_file wordfile filetowriteto
5 | #
6 | # wordfile must be in bad\ngood\n order
7 | #
8 |
9 | debug(7 | 34);
10 |
11 | sub getnext
12 | {
13 | local('@words');
14 | @words = copy($1);
15 | add(@words, @('0BEGIN.0', 'UNK'));
16 | push(@words, @('0END.0', 'UNK'));
17 |
18 | while (size(@words) >= 5)
19 | {
20 | yield sublist(@words, 0, 5);
21 | @words = sublist(@words, 1);
22 | }
23 | }
24 |
25 | sub process
26 | {
27 | local('@words $entry $previous $current $next $pre2 $pre1 $next1 $next2');
28 |
29 | $1 = [$1 trim];
30 | if ($1 !ismatch '[A-Z][A-Za-z\'\,\- ]*?[\.\?\!]{0,1}')
31 | {
32 | return;
33 | }
34 |
35 | @words = taggerWithTrigrams(splitIntoWords($1));
36 |
37 | while $entry (getnext(@words))
38 | {
39 | ($pre2, $pre1, $current, $next1, $next2) = map({ return $1[0]; }, $entry);
40 |
41 | if (%words[$current] !is $null && %dictionary[$pre2] !is $null && %dictionary[$pre1] !is $null && %dictionary[$next1] !is $null && %dictionary[$next2] !is $null && %counts[$current] < $max)
42 | {
43 | ($pre2, $pre1, $current, $next1, $next2) = map({ return join('/', $1); }, $entry);
44 |
45 | println($output, "$pre2 $pre1 * $next1 $next2 $+ |" . join("; ", concat($current, %dataset[$entry[2][0]])) );
46 | %counts[$entry[2][0]] += 1;
47 | }
48 | }
49 | }
50 |
51 | sub processFile
52 | {
53 | local('$handle $key $data $text @paragraphs');
54 |
55 | # read in our corpus.
56 | $handle = openf($1);
57 | $text = replace(readb($handle, -1), '<[^>]*?>', '');
58 | closef($handle);
59 |
60 | # start processing it?!?
61 | @paragraphs = splitByParagraph($text);
62 | map(lambda({ map(lambda(&process, \$max), $1); }, \$max), @paragraphs);
63 |
64 | #warn("Processed $1 $+ !");
65 | }
66 |
67 | sub main
68 | {
69 | global('%dataset $goal %words %counts');
70 |
71 | # load the words we're interested in.
72 | local('$handle $text $good');
73 |
74 | $handle = openf($1);
75 | while $text (readln($handle))
76 | {
77 | $good = readln($handle);
78 |
79 | if (%dataset[$good] is $null) { %dataset[$good] = @(); }
80 | push(%dataset[$good], $text);
81 | %words[$good] += 1;
82 | }
83 | closef($handle);
84 |
85 | $goal = size(%dataset);
86 |
87 | # setup our file that we're going to dump the output to.
88 | global('$output');
89 | $output = openf("> $+ $3");
90 |
91 | # ok go through all the junk parsing through the files.
92 |
93 | include("lib/nlp.sl");
94 | include("lib/dictionary.sl");
95 | include("lib/tagger.sl");
96 |
97 | global('%dictionary');
98 | %dictionary = dictionary();
99 | %dictionary["0BEGIN.0"] = 1;
100 | %dictionary["0END.0"] = 1;
101 |
102 | initTaggerModels();
103 |
104 | # collect list of files.
105 | [{
106 | if (-isDir $1)
107 | {
108 | map($this, ls($1));
109 | }
110 | else if ("*Image*.html" !iswm $1 && "*User*.html" !iswm $1)
111 | {
112 | processFile($1, \$max);
113 | }
114 | }: $2, $max => $4];
115 |
116 |
117 | closef($output);
118 | println("Done!");
119 | }
120 |
121 | assert size(@ARGV) == 4 : "java -jar sleep.jar corpus_data wordlist outputfile max_entries_per_word";
122 |
123 | invoke(&main, @ARGV);
124 |
--------------------------------------------------------------------------------
/utils/rules/testgr.sl:
--------------------------------------------------------------------------------
1 | #
2 | # This is a script to test grammar rules. It's fun stuff.
3 | #
4 | # java -jar utils/rules/testgr.sl [missing|wrong]
5 | #
6 |
7 | debug(7 | 34);
8 |
9 | include("lib/engine.sl");
10 | include("utils/rules/rules.sl");
11 | include("utils/common/score.sl");
12 |
13 | sub checkSentenceSpelling
14 | {
15 | }
16 |
17 | sub initAll
18 | {
19 | global('$__SCRIPT__ $model $rules $dictionary $network $dsize %edits $hnetwork $account $usage $endings $lexdb $trigrams $verbs');
20 | $model = get_language_model();
21 | $dictionary = dictionary();
22 | $rules = get_rules();
23 | $dsize = size($dictionary);
24 | $hnetwork = get_network("hnetwork4.bin");
25 | $verbs = loadVerbData();
26 | initTaggerModels();
27 | }
28 |
29 | sub measure
30 | {
31 | local('@results $options $correct $score $s_score $good $index $r @suggs $debug');
32 | (@results, $options, $correct, $score, $s_score, $debug) = @_;
33 |
34 | if (size(@results) > 0)
35 | {
36 | foreach $index => $r (@results)
37 | {
38 | local('$rule $text $path $context @suggestions');
39 | ($rule, $text, $path, $context, @suggestions) = $r;
40 |
41 | if (!-isarray @suggestions) { @suggestions = split(', ', @suggestions); }
42 |
43 | if ($text eq $options[0])
44 | {
45 | @suggs = filter(lambda({ return iff($1 in $options, 1); }, $options => sublist($options, 1)), @suggestions);
46 |
47 | if (size(@suggs) > 0)
48 | {
49 | [$score correctSugg];
50 | [$s_score correctSugg];
51 |
52 | if ($correct in @suggestions)
53 | {
54 | [$score correct];
55 | [$s_score correct];
56 | }
57 | }
58 | else if ('wrong' isin $debug)
59 | {
60 | println("$wrong => $text");
61 | println(" - entry: " . $entry);
62 | println(" - expect: " . sublist($options, 1));
63 | println(" - options: " . @suggestions);
64 | println(" - " . $rule['category'] . ' = ' . $rule['rule'] );
65 | }
66 | $good = 1;
67 |
68 | [$s_score record];
69 | }
70 | }
71 | }
72 |
73 | if (!$good)
74 | {
75 | [$score falseNegative]; # move if $text eq options[1] never happens
76 |
77 | if ('missing' isin $debug)
78 | {
79 | println("$wrong => $text");
80 | println(" - entry: " . $entry);
81 | println(" - expect: " . sublist($options, 1));
82 | }
83 | }
84 |
85 | [$score record];
86 | }
87 |
88 | sub main
89 | {
90 | local('$handle $sentence $entry @results $options $correct $wrong $score1 $score2 $2');
91 |
92 | $score1 = newObject('score', "Suggestion score for $1");
93 | $score2 = newObject('score', "Grammar score for $1");
94 |
95 | initAll();
96 |
97 | $handle = openf($1);
98 | while $entry (readln($handle))
99 | {
100 | ($sentence, $options, $correct) = split('\|', $entry);
101 | $options = split(', ', $options);
102 |
103 | $wrong = strrep($sentence, ' * ', " " . $options[0] . " ");
104 |
105 | @results = @();
106 | processSentence($sentence => $wrong, \@results);
107 |
108 | measure(@results, $options, $correct, $score2, $score1, $2, \$entry, \$wrong);
109 | }
110 |
111 | [$score1 print];
112 | [$score2 print];
113 | }
114 |
115 | invoke(&main, @ARGV);
116 |
--------------------------------------------------------------------------------
/data/rules/diacritic/diaeresis:
--------------------------------------------------------------------------------
1 | #
2 | # http://en.wikipedia.org/wiki/Diaeresis
3 | #
4 |
5 | achroodextrin::word=achroödextrin::filter=none
6 | aedes::word=aëdes::filter=none
7 | Ajie::word=Ajië::filter=none
8 | Bootes::word=Boötes::filter=none
9 | chiliaedron::word=chiliaëdron::filter=none
10 | Chloe::word=Chloë::filter=none
11 | cooperate::word=coöperate::filter=none
12 | cooperation::word=coöperation::filter=none
13 | coopt::word=coöpt::filter=none
14 | coordinate::word=coördinate::filter=none
15 | coordinated::word=coördinated::filter=none
16 | coordinately::word=coördinately::filter=none
17 | coordinateness::word=coördinateness::filter=none
18 | coordinates::word=coördinates::filter=none
19 | coordination::word=coördination::filter=none
20 | coordinative::word=coördinative::filter=none
21 | coordinator::word=coördinator::filter=none
22 | diploe::word=diploë::filter=none
23 | eleemosynary::word=eleëmosynary::filter=none
24 | naive::word=naïve::filter=none
25 | naively::word=naïvely::filter=none
26 | noel::word=noël::filter=none
27 | Noel::word=Noël::filter=none
28 | oogone::word=oögone::filter=none
29 | ooidal::word=oöidal::filter=none
30 | oology::word=oölogy::filter=none
31 | preempt::word=preëmpt::filter=none
32 | preempted::word=preëmpted::filter=none
33 | preemptible::word=preëmptible::filter=none
34 | preemption::word=preëmption::filter=none
35 | preemptioner::word=preëmptioner::filter=none
36 | preemptive::word=preëmptive::filter=none
37 | preemptively::word=preëmptively::filter=none
38 | preemptor::word=preëmptor::filter=none
39 | preemptory::word=preëmptory::filter=none
40 | preexisting::word=preëxisting::filter=none
41 | reeducate::word=reëducate::filter=none
42 | reelect::word=reëlect::filter=none
43 | reenter::word=reënter::filter=none
44 | reentry::word=reëntry::filter=none
45 | reexamination::word=reëxamination::filter=none
46 | reexamine::word=reëxamine::filter=none
47 | reextend::word=reëxtend::filter=none
48 | uncoordinate::word=uncoördinate::filter=none
49 | uncoordinated::word=uncoördinated::filter=none
50 | vacuum::word=vacuüm::filter=none
51 | zoea::word=zoëa::filter=none
52 | zoochemistry::word=zoöchemistry::filter=none
53 | zoochemy::word=zoöchemy::filter=none
54 | zoochlorella::word=zoöchlorella::filter=none
55 | zoocyst::word=zoöcyst::filter=none
56 | zoocytium::word=zoöcytium::filter=none
57 | zooerythrine::word=zoöerythrine::filter=none
58 | zoogeography::word=zoögeography::filter=none
59 | zooglaea::word=zoöglœa::filter=none
60 | zoographer::word=zoögrapher::filter=none
61 | zoography::word=zoögraphy::filter=none
62 | zoolatry::word=zoölatry::filter=none
63 | zoology::word=zoölogy::filter=none
64 | zoomelanin::word=zoömelanin::filter=none
65 | zoomorphism::word=zoömorphism::filter=none
66 | zoon::word=zoön::filter=none
67 | zoonite::word=zoönite::filter=none
68 | zoonomy::word=zoönomy::filter=none
69 | zoonule::word=zoönule::filter=none
70 | zoopathology::word=zoöpathology::filter=none
71 | zoophaga::word=zoöphaga::filter=none
72 | zoophagan::word=zoöphagan::filter=none
73 | zoophagous::word=zoöphagous::filter=none
74 | zoophilist::word=zoöphilist::filter=none
75 | zoophily::word=zoöphily::filter=none
76 | zoophite::word=zoöphite::filter=none
77 | zoophorous::word=zoöphorous::filter=none
78 | Zoophyta::word=Zoöphyta::filter=none
79 | zoophyte::word=zoöphyte::filter=none
80 | zoophytic::word=zoöphytic::filter=none
81 | zoophytology::word=zoöphytology::filter=none
82 | zoopraxiscope::word=zoöpraxiscope::filter=none
83 | zoopsychology::word=zoöpsychology::filter=none
84 | zoosperm::word=zoösperm::filter=none
85 | zoosporangium::word=zoösporangium::filter=none
86 | zoospore::word=zoöspore::filter=none
87 | zoospores::word=zoöspores::filter=none
88 | zootic::word=zoötic::filter=none
89 | zootomist::word=zoötomist::filter=none
90 | zootomy::word=zoötomy::filter=none
91 | zootrophic::word=zoötrophic::filter=none
92 |
--------------------------------------------------------------------------------
/data/rules/grammar/aux_been_was:
--------------------------------------------------------------------------------
1 | been .*/VB|VBP .*/IN::word=\0 \1:participle \2::pivots=\1,\1:participle
2 | been .*/RB .*/VB|VBP .*/IN::word=\0 \1 \2:participle \3::pivots=\2,\2:participle
3 | been .*/VB|VBP|VBD .*/VBN|VB::word=\0 \1:present \2::pivots=\1,\1:present
4 | been .*/RB .*/VB|VBP|VBD .*/VBN|VB::word=\0 \1 \2:present \3::pivots=\2,\2:present
5 | was .*/VB 0END.0::word=\0 \1:participle::pivots=\1,\1:participle
6 | was .*/VB|VBP .*/IN::word=\0 \1:participle \2::pivots=\1,\1:participle
7 | was .*/RB .*/VB|VBP .*/IN::word=\0 \1 \2:participle \3::pivots=\2,\2:participle
8 | was .*/VB|VBP|VBD .*/VBN|VB::word=\0 \1:present \2::pivots=\1,\1:present
9 | was .*/RB .*/VB|VBP|VBD .*/VBN|VB::word=\0 \1 \2:present \3::pivots=\2,\2:present
10 | were .*/VB|VBP .*/IN::word=\0 \1:participle \2::pivots=\1,\1:participle
11 | were .*/RB .*/VB|VBP .*/IN::word=\0 \1 \2:participle \3::pivots=\2,\2:participle
12 | were .*/VB|VBP|VBD .*/VBN|VB::word=\0 \1:present \2::pivots=\1,\1:present
13 | were .*/RB .*/VB|VBP|VBD .*/VBN|VB::word=\0 \1 \2:present \3::pivots=\2,\2:present
14 | are .*/VB|VBP .*/IN::word=\0 \1:participle \2::pivots=\1,\1:participle
15 | are .*/RB .*/VB|VBP .*/IN::word=\0 \1 \2:participle \3::pivots=\2,\2:participle
16 | are .*/VB|VBP|VBD .*/VBN|VB::word=\0 \1:present \2::pivots=\1,\1:present
17 | are .*/RB .*/VB|VBP|VBD .*/VBN|VB::word=\0 \1 \2:present \3::pivots=\2,\2:present
18 | am .*/VB|VBP .*/IN::word=\0 \1:participle \2::pivots=\1,\1:participle
19 | am .*/RB .*/VB|VBP .*/IN::word=\0 \1 \2:participle \3::pivots=\2,\2:participle
20 | am .*/VB|VBP|VBD .*/VBN|VB::word=\0 \1:present \2::pivots=\1,\1:present
21 | am .*/RB .*/VB|VBP|VBD .*/VBN|VB::word=\0 \1 \2:present \3::pivots=\2,\2:present
22 | is .*/VB|VBP for|by|as::word=\0 \1:participle \2::pivots=\1,\1:participle
23 | is .*/RB .*/VB|VBP for|by|as::word=\0 \1 \2:participle \3::pivots=\2,\2:participle
24 | is .*/VB|VBP|VBD .*/VBN|VB::word=\0 \1:present \2::pivots=\1,\1:present
25 | is .*/RB .*/VB|VBP|VBD .*/VBN|VB::word=\0 \1 \2:present \3::pivots=\2,\2:present
26 | do is .*/VB::filter=kill
27 | been .*/VB|VBP|VBD .*/TO|DT::word=\0 \1:participle \2, \0 \1:present \2::pivots=\1,\1:participle,\1:present
28 | been .*/RB .*/VB|VBP|VBD .*/TO|DT::word=\0 \1 \2:participle \3, \0 \1 \2:present \3::pivots=\2,\2:participle,\2:present
29 | was .*/VB|VBP|VBD .*/TO::word=\0 \1:participle \2, \0 \1:present \2::pivots=\1,\1:participle,\1:present
30 | was .*/RB .*/VB|VBP|VBD .*/TO::word=\0 \1 \2:participle \3, \0 \1 \2:present \3::pivots=\2,\2:participle,\2:present
31 | were .*/VB|VBP|VBD .*/TO|DT::word=\0 \1:participle \2, \0 \1:present \2::pivots=\1,\1:participle,\1:present
32 | were .*/RB .*/VB|VBP|VBD .*/TO|DT::word=\0 \1 \2:participle \3, \0 \1 \2:present \3::pivots=\2,\2:participle,\2:present
33 | are .*/VB|VBP|VBD .*/TO::word=\0 \1:participle \2, \0 \1:present \2::pivots=\1,\1:participle,\1:present
34 | are .*/RB .*/VB|VBP|VBD .*/TO::word=\0 \1 \2:participle \3, \0 \1 \2:present \3::pivots=\2,\2:participle,\2:present
35 | am .*/VB|VBP|VBD .*/TO|DT::word=\0 \1:participle \2, \0 \1:present \2::pivots=\1,\1:participle,\1:present
36 | am .*/RB .*/VB|VBP|VBD .*/TO|DT::word=\0 \1 \2:participle \3, \0 \1 \2:present \3::pivots=\2,\2:participle,\2:present
37 | is .*/VB|VBP|VBD .*/TO::word=\0 \1:participle \2, \0 \1:present \2::pivots=\1,\1:participle,\1:present
38 | is .*/RB .*/VB|VBP|VBD .*/TO::word=\0 \1 \2:participle \3, \0 \1 \2:present \3::pivots=\2,\2:participle,\2:present
39 | it is .*/VBP|VB|VBD::word=\0 \1 \2:participle::pivots=\2,\2:participle
40 | It is .*/VBP|VB|VBD::word=\0 \1 \2:participle::pivots=\2,\2:participle
41 | is .*/VBP|VBD|VB as|for|to::word=\0 \1:participle \2::pivots=\1,\1:participle
42 |
43 | # are [base verb mistagged as a noun] to -> are [past tense] to
44 | are .*(?
2 |
3 |
4 |
5 | Rich Editor Help
6 |
7 |
8 |
9 |
114 |
137 |
138 |
139 |
140 |
141 |
142 | After the Deadline
143 |
144 |
148 |
149 |
150 |
151 |
152 |
153 |
157 |
158 |
159 |
164 |
165 |
166 |
167 |
168 |
--------------------------------------------------------------------------------
/utils/common/spelltests.sl:
--------------------------------------------------------------------------------
1 | #
# this is a script to run unit tests and calculate the effectiveness of the
3 | # preditor engine
4 | #
5 |
# Measures the raw dictionary with no context: tests/tests2.txt alternates
# lines (misspelled word, then its correction). A misspelling present in
# $dictionary is a false negative (the checker would miss it); a correction
# absent from $dictionary is a false positive (a good word would be flagged).
sub testSpellingNoContext
{
	local('$handle $score $bad $good');
	$handle = openf("tests/tests2.txt");

	$score = newObject("score", "Spellchecker w/ No Context");

	while $bad (readln($handle))
	{
		$good = readln($handle);
		if ($dictionary[$bad] !is $null)
		{
			# the misspelling exists in the dictionary -> it would go undetected
			# (removed an unused local('$source $size') declaration here)
			[$score falseNegative];
		}
		else
		{
			[$score correct];
		}

		if ($dictionary[$good] is $null)
		{
			# the correct word is missing -> it would be wrongly flagged
			[$score falsePositive];
		}

		[$score record];
	}

	[$score print];
}
36 |
# Checks how often the soundex code of a misspelling agrees with the soundex
# code of its correction; pairs whose sound keys disagree are logged so the
# misses can be inspected by hand.
sub testSoundEx
{
	local('$score $pair $bad $good');

	$score = newObject("score", "Test of SoundEx");

	while $pair (words("tests2.txt"))
	{
		($bad, $good) = $pair;

		if (soundex($bad) ne soundex($good))
		{
			# sound keys differ: dump both codes, padded for alignment
			warn("$[25]bad " . soundex($bad) . " $[25]good " . soundex($good));
		}
		else
		{
			[$score correct];
		}

		[$score record];
	}

	[$score print];
}
58 |
# For each (misspelled, correct) pair, histograms the edit distance between
# the two words when their soundex codes agree (%distance) against all pairs
# at that distance (%totals), then prints for each distance: the fraction of
# soundex-matching pairs at that distance, and their share of all pairs.
sub testSoundExEditDistance
{
	local('%distance %totals $count $entry $bad $good $d $key $value $p $t');

	while $entry (words("tests2.txt"))
	{
		($bad, $good) = $entry;

		# hoisted: the original recomputed this distance three times per pair
		$d = editDistance($good, $bad);

		if (soundex($bad) eq soundex($good))
		{
			%distance[$d] += 1;
		}

		if ($d == 0)
		{
			# distance 0 means the "misspelling" equals the correction: bad data
			warn("$good -> $bad has an edit distance of 0?!?");
		}

		%totals[$d] += 1;
		$count++;
	}

	foreach $key => $value (%distance)
	{
		$p = double($value) / $count;           # share of all test pairs
		$t = double($value) / %totals[$key];    # recall among pairs at this distance

		println("$[5]key $[20]t $p");
	}
}
88 |
# Scores the no-context suggestion generator. Arguments: @_[0] is the test
# word-pair file; any further arguments are scoring callbacks (e.g.
# &RandomGuess, &FrequencyCount) invoked for every pair the generator can
# solve. A pair only counts toward the suggestion score when the misspelling
# is out-of-dictionary AND the correction is in-dictionary; otherwise it is
# tallied as a false negative / false positive instead.
sub testCorrectionsNoContext
{
	local('$good $bad $entry $score @suggestions $f $c');

	$score = newObject("score", "Test of Corrections w/o Context");
	$c = 0;


	while $entry (words(@_[0]))
	{
		($bad, $good) = $entry;

		if ($dictionary[$bad] is $null && $dictionary[$good] !is $null)
		{
			# candidate pool from the precomputed edits table (global %edits)
			@suggestions = %edits[$bad]; # filterByDictionary($bad, $dictionary);

			if ($good in @suggestions)
			{
				# fan the pair out to each scoring callback:
				# args are (bad, good, pool copy, pre-context, post-context)
				foreach $f (sublist(@_, 1))
				{
					[$f : $bad, $good, copy(@suggestions), $null, $null];
				}
				[$score correct];
			}
			else
			{
				# println("$bad -> $good : " . editDistance($bad, $good));
			}

			[$score record];
		}
		else
		{
			# misspelling already in the dictionary: the checker can't flag it
			if ($dictionary[$bad] !is $null)
			{
				[$score falseNegative];
				$c++;
			}

			# correction missing from the dictionary: a good word gets flagged
			if ($dictionary[$good] is $null)
			{
				[$score falsePositive];
			}
		}
	}

	println("Present words: $c");
	[$score print];
}
138 |
# Scoring callback (see testCorrectionsNoContext): $1 = misspelled word,
# $2 = correct word, $3 = suggestion pool. Baseline that picks a random
# suggestion; tallies into the global $score object.
sub RandomGuess
{
	[$score record];
	# rand() applied to an array returns a random element of it
	if (rand($3) eq $2)
	{
		[$score correct];
	}
}
147 |
# Scoring callback: ranks the suggestion pool ($3) by unigram word
# probability alone and counts a hit when the most frequent candidate is the
# correct word ($2). Tallies into the global $score object.
sub FrequencyCount
{
	local('@suggs');

	[$score record];
	# inside the comparator, $1/$2 are the two elements being compared;
	# Pword($2) first gives a descending (most frequent first) sort
	@suggs = sort({ return Pword($2) <=> Pword($1); }, $3);
	if (@suggs[0] eq $2)
	{
		[$score correct];
	}
}
159 |
# Ranks a candidate $1 with a fixed blend: closeness to the misspelled word
# (edit distance, +1 so the divisor is never zero) weighted 0.75, and word
# frequency weighted 0.25. $word is injected into this closure with let()
# by CombineFreqEdit before use.
sub scoreIt
{
	return ( ( 0.75 / ( editDistance($word, $1) + 1 ) ) ) +
	( 0.25 * Pword($1) ) ;
}
# Identical twin of scoreIt (same 0.75 edit-distance / 0.25 frequency blend);
# kept as a separate sub because let() binds $word per closure. This is the
# copy CombineFreqEdit actually sorts with.
sub scoreIt2
{
	return ( ( 0.75 / ( editDistance($word, $1) + 1 ) ) ) +
	( 0.25 * Pword($1) ) ;
}
170 |
# Scoring callback: ranks the suggestion pool ($3) by the combined
# edit-distance + frequency score and counts a hit when the top-ranked
# candidate is the correct word ($2). $1 is the misspelled word.
sub CombineFreqEdit
{
	local('@suggs');

	# let() installs $word = $1 into each scoring closure's environment
	let(&scoreIt, $word => $1);
	# NOTE(review): only scoreIt2 is used below -- the scoreIt binding looks
	# redundant here; confirm nothing else relies on it before removing
	let(&scoreIt2, $word => $1);

	[$score record];
	# descending sort: higher combined score first
	@suggs = sort({ return scoreIt2($2) <=> scoreIt2($1); }, $3);

	if (@suggs[0] eq $2)
	{
		[$score correct];
	}
}
186 |
# Scoring callback: ranks the suggestion pool with the neural-network
# comparator and counts a hit when the top-ranked candidate is the correct
# word ($2). $1 = misspelled word, $3 = pool, $4/$5 = previous/next context
# words; listing $4 and $5 in local() makes them default to $null when the
# caller omits them (testCorrectionsNoContext passes $null for both).
sub NeuralNetworkScore
{
	local('@suggs $4 $5 $cs');

	[$score record];
	# relies on globals $network and $criteriaf configured elsewhere
	@suggs = sortHash($3, CompareSuggestions($network, $criteriaf, $1, $pool => $3, $pre => $4, $next => $5));

	if (@suggs[0] eq $2)
	{
		[$score correct];
	}
}
199 |
--------------------------------------------------------------------------------
/service/src/view/wordpress.slp:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | Rich Editor Help
6 |
7 |
8 |
9 |
118 |
141 |
142 |
143 |
144 |
145 |
146 | After the Deadline
147 |
148 |
152 |
153 |
154 |
155 |
156 |
157 |
161 |
162 |
163 |
168 |
169 |
170 |
171 |
172 |
173 |
174 |
175 |
176 |
177 |
--------------------------------------------------------------------------------