├── service
│   ├── root
│   │   └── index.html
│   ├── src
│   │   ├── data
│   │   ├── lang
│   │   ├── lib
│   │   ├── models
│   │   ├── view
│   │   │   ├── rules
│   │   │   │   ├── empty.slp
│   │   │   │   ├── nom.slp
│   │   │   │   ├── redundant.slp
│   │   │   │   ├── passive.slp
│   │   │   │   ├── avoid.slp
│   │   │   │   ├── bias.slp
│   │   │   │   ├── complex.slp
│   │   │   │   ├── ruleview.slp
│   │   │   │   ├── homophone.slp
│   │   │   │   ├── homophone2.slp
│   │   │   │   ├── category.slp
│   │   │   │   └── nomit.slp
│   │   │   ├── problem.slp
│   │   │   ├── metric.slp
│   │   │   ├── suggestions.slp
│   │   │   ├── rule.slp
│   │   │   ├── service.slp
│   │   │   ├── wordpress_gen.slp
│   │   │   ├── tinymce.slp
│   │   │   ├── error.slp
│   │   │   ├── quality.slp
│   │   │   ├── wordpress26.slp
│   │   │   └── wordpress.slp
│   │   └── local.sl
│   └── code
│       ├── compile.txt
│       ├── build.xml
│       └── src
│           └── org
│               └── dashnine
│                   └── preditor
│                       ├── GuessLanguage.java
│                       ├── SortFromHash.java
│                       └── LanguageModelSmall.java
├── data
│   └── rules
│       ├── nohomophone.txt
│       ├── grammar
│       │   ├── indef_uncount
│       │   ├── aux_modals
│       │   ├── personal_pronoun_case
│       │   ├── infinitives
│       │   ├── det_agreement_plural
│       │   ├── weare
│       │   ├── were
│       │   ├── an
│       │   ├── whose
│       │   ├── subject_verb_agreement
│       │   ├── contractedformnot
│       │   ├── dneg2
│       │   ├── possessive
│       │   ├── count
│       │   ├── lay
│       │   ├── its
│       │   ├── aux_noparticiple
│       │   ├── separate
│       │   ├── your
│       │   ├── too
│       │   ├── apostrophes
│       │   ├── their
│       │   ├── its2
│       │   ├── det_agreement
│       │   ├── repeats
│       │   ├── determiners
│       │   ├── combine
│       │   ├── aux_been_was
│       │   ├── comprised
│       │   └── aux_wrong_verb
│       ├── abbr.txt
│       ├── agreement
│       │   ├── plural.r
│       │   ├── single.r
│       │   ├── chunk_single.r
│       │   └── chunk_plural.r
│       ├── pronouns.txt
│       ├── complex
│       │   ├── been
│       │   └── misc
│       ├── prepositions.txt
│       ├── nomdb.txt
│       ├── avoiddb.txt
│       ├── hyphens.txt
│       ├── foreigndb.txt
│       ├── irregular_nouns.txt
│       ├── biasdb.txt
│       └── diacritic
│           └── diaeresis
├── lib
│   ├── sleep.jar
│   ├── cngram.jar
│   ├── moconti.jar
│   ├── spellutils.jar
│   ├── object.sl
│   └── quality.sl
├── bin
│   ├── quality.sh
│   ├── dictgrep.sh
│   ├── tagit.sh
│   ├── testr.sh
│   ├── corpuswp.sh
│   ├── fixdata.sh
│   ├── buildrules.sh
│   ├── agreement.sh
│   ├── transr.sh
│   ├── amigo.sh
│   ├── compilespelltools.sh
│   ├── make3.sh
│   ├── testgr.sh
│   ├── corpus-lex-diff.sh
│   ├── trainhomophones.sh
│   ├── traintagger.sh
│   ├── inspect.sh
│   ├── smallmodel.sh
│   ├── buildedits.sh
│   ├── prepositions.sh
│   ├── buildgrammarsets.sh
│   ├── all.sh
│   ├── trainspellnocontext.sh
│   ├── buildmodel.sh
│   ├── buildspelldata.sh
│   ├── trainspellcontext.sh
│   ├── buildhomodata.sh
│   └── buildtaggersets.sh
├── atdconfig.sl
├── README.txt
├── utils
│   ├── bigrams
│   │   ├── printcorpus.sl
│   │   ├── contextprob.sl
│   │   ├── builddict.sl
│   │   ├── corpuswp.sl
│   │   ├── buildsmallmodel.sl
│   │   ├── fixgutenberg.sl
│   │   ├── inspect.sl
│   │   ├── qscore.sl
│   │   ├── amigo.sl
│   │   ├── corpus-lex-diff.sl
│   │   └── buildunigrams.sl
│   ├── spelldata
│   │   ├── makesrc.sl
│   │   ├── process.sl
│   │   ├── maker.sl
│   │   ├── torules.sl
│   │   ├── bootstrapspell.sl
│   │   ├── gen2.sl
│   │   ├── gen3.sl
│   │   ├── gen.sl
│   │   └── gen4.sl
│   ├── tagger
│   │   ├── tagit.sl
│   │   ├── fixtags.sl
│   │   ├── makebootstrap.sl
│   │   ├── makesentences.sl
│   │   └── postest.sl
│   ├── common
│   │   ├── score.sl
│   │   ├── bywords.sl
│   │   ├── utils.sl
│   │   ├── hotest.sl
│   │   ├── homo.sl
│   │   ├── exp.sl
│   │   ├── spellcontext.sl
│   │   └── spelltests.sl
│   ├── rules
│   │   ├── agreement.sl
│   │   ├── findprepositions.sl
│   │   ├── makespecial.sl
│   │   ├── transr.sl
│   │   ├── testr.sl
│   │   ├── makeprepositions.sl
│   │   └── testgr.sl
│   └── spell
│       ├── seededits.sl
│       └── definitions.sl
├── run-lowmem.bat
├── run.sh
├── run-lowmem.sh
├── CREDITS.rules.txt
├── CREDITS.txt
└── models
    └── get_model_binaries.sh
/service/root/index.html: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /service/src/data:
-------------------------------------------------------------------------------- 1 | ../../data -------------------------------------------------------------------------------- /service/src/lang: -------------------------------------------------------------------------------- 1 | ../../lang -------------------------------------------------------------------------------- /service/src/lib: -------------------------------------------------------------------------------- 1 | ../../lib -------------------------------------------------------------------------------- /service/src/models: -------------------------------------------------------------------------------- 1 | ../../models -------------------------------------------------------------------------------- /service/src/view/rules/empty.slp: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /service/src/view/rules/nom.slp: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /service/src/view/rules/redundant.slp: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/rules/nohomophone.txt: -------------------------------------------------------------------------------- 1 | me 2 | based 3 | we 4 | -------------------------------------------------------------------------------- /service/src/local.sl: -------------------------------------------------------------------------------- 1 | # put local modifications to service here 2 | -------------------------------------------------------------------------------- /lib/sleep.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Automattic/atd-server-next/HEAD/lib/sleep.jar -------------------------------------------------------------------------------- /lib/cngram.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Automattic/atd-server-next/HEAD/lib/cngram.jar -------------------------------------------------------------------------------- /lib/moconti.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Automattic/atd-server-next/HEAD/lib/moconti.jar -------------------------------------------------------------------------------- /service/src/view/problem.slp: -------------------------------------------------------------------------------- 1 | 2 | <% $1 %> 3 | 4 | -------------------------------------------------------------------------------- /lib/spellutils.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Automattic/atd-server-next/HEAD/lib/spellutils.jar -------------------------------------------------------------------------------- /service/src/view/rules/passive.slp: -------------------------------------------------------------------------------- 1 |

2 | <% $1["rule"] %> - <% $2 %> 3 |

4 | -------------------------------------------------------------------------------- /bin/quality.sh: -------------------------------------------------------------------------------- 1 | java -Xmx3328M -XX:+AggressiveHeap -XX:+UseParallelGC -jar lib/sleep.jar utils/bigrams/qscore.sl $1 2 | -------------------------------------------------------------------------------- /service/src/view/rules/avoid.slp: -------------------------------------------------------------------------------- 1 |

Translation:

2 | 5 | -------------------------------------------------------------------------------- /bin/dictgrep.sh: -------------------------------------------------------------------------------- 1 | java -Xmx3384M -XX:+AggressiveHeap -XX:+UseParallelGC -jar lib/sleep.jar utils/bigrams/dictgrep.sl $1 2 | -------------------------------------------------------------------------------- /bin/tagit.sh: -------------------------------------------------------------------------------- 1 | # 2 | java -Datd.lowmem=true -Xmx3384M -XX:+AggressiveHeap -XX:+UseParallelGC -jar lib/sleep.jar utils/tagger/tagit.sl $1 3 | -------------------------------------------------------------------------------- /bin/testr.sh: -------------------------------------------------------------------------------- 1 | java -Datd.lowmem=true -Xmx3384M -XX:+AggressiveHeap -XX:+UseParallelGC -jar lib/sleep.jar utils/rules/testr.sl $1 $2 2 | -------------------------------------------------------------------------------- /data/rules/grammar/indef_uncount: -------------------------------------------------------------------------------- 1 | a|an &uncountable .*/RP|VBZ|IN::word=\1 \2::filter=none 2 | a|an &uncountable 0END.0::word=\1::filter=none 3 | 4 | -------------------------------------------------------------------------------- /service/src/view/metric.slp: -------------------------------------------------------------------------------- 1 | 2 | <% $1 %> 3 | <% $2 %> 4 | <% $3 %> 5 | 6 | -------------------------------------------------------------------------------- /service/src/view/suggestions.slp: -------------------------------------------------------------------------------- 1 | 2 | $+ $1 $+ "); }, $1); 4 | ?> 5 | -------------------------------------------------------------------------------- /bin/corpuswp.sh: -------------------------------------------------------------------------------- 1 | # 2 | # convert a WordPress WXR file to raw data suitable for use in the AtD corpus 3 | # 4 | 5 | java -Xmx3584M -jar lib/sleep.jar utils/bigrams/corpuswp.sl $1 6 | -------------------------------------------------------------------------------- /bin/fixdata.sh: -------------------------------------------------------------------------------- 1 | # 2 | # do this once! 3 | # 4 | 5 | cd data 6 | tar zxf corpora.tgz 7 | cd .. 8 | java -Xmx1024M -jar lib/sleep.jar utils/bigrams/fixgutenberg.sl data/corpus_gutenberg 9 | -------------------------------------------------------------------------------- /data/rules/grammar/aux_modals: -------------------------------------------------------------------------------- 1 | may|might|could|would .*/VBN|VBG::word=\0 \1:base::pivots=\1,\1:base 2 | may|might|could|would .*/VBZ::word=\0 \1:singular::pivots=\1,\1:singular 3 | 4 | -------------------------------------------------------------------------------- /service/src/view/rules/bias.slp: -------------------------------------------------------------------------------- 1 |

Replace <% $2 %> with

2 | 3 | 6 | -------------------------------------------------------------------------------- /bin/buildrules.sh: -------------------------------------------------------------------------------- 1 | # 2 | # This script creates the AtD rules 3 | # 4 | 5 | java -Datd.lowmem=true -Xmx3536M -XX:+AggressiveHeap -XX:+UseParallelGC -jar lib/sleep.jar utils/rules/rules.sl 6 | -------------------------------------------------------------------------------- /bin/agreement.sh: -------------------------------------------------------------------------------- 1 | cd data/rules/agreement 2 | java -jar ../../../lib/sleep.jar ../../../utils/rules/agreement.sl chunk_single.r single.r chunk_plural.r plural.r >../grammar/agreement 3 | cd ../../.. 4 | -------------------------------------------------------------------------------- /data/rules/grammar/personal_pronoun_case: -------------------------------------------------------------------------------- 1 | # 2 | # personal pronoun I is always uppercase. 3 | # 4 | 5 | i::word=I 6 | i'll::word=I'll 7 | i'm::word=I'm 8 | i've::word=I've 9 | i'd::word=I'd 10 | -------------------------------------------------------------------------------- /service/src/view/rules/complex.slp: -------------------------------------------------------------------------------- 1 |

Replace <% $2 %> with

2 | 3 | 6 | -------------------------------------------------------------------------------- /bin/transr.sh: -------------------------------------------------------------------------------- 1 | # 2 | # run through a corpus and transform matching sentences using the specified rules. 3 | # 4 | 5 | java -Xmx3384M -XX:+AggressiveHeap -XX:+UseParallelGC -jar lib/sleep.jar utils/rules/transr.sl $1 $2 6 | -------------------------------------------------------------------------------- /service/code/compile.txt: -------------------------------------------------------------------------------- 1 | To compile this code: 2 | 3 | 1. Create a symbolic link to 4 | ln -s ../../lib lib 5 | 6 | 2. Use Apache Ant to build everything 7 | ant clean 8 | ant 9 | cp spellutils.jar to lib 10 | -------------------------------------------------------------------------------- /bin/amigo.sh: -------------------------------------------------------------------------------- 1 | # find homophones in corpus for a language 2 | # ./bin/amigo.sh [language] 3 | 4 | java -Xmx3840M -XX:+AggressiveHeap -XX:+UseParallelGC -Datd.lang=$1 -classpath lib/\* sleep.console.TextConsole utils/bigrams/amigo.sl 5 | -------------------------------------------------------------------------------- /bin/compilespelltools.sh: -------------------------------------------------------------------------------- 1 | # 2 | # Compiles the Sleep methods ported to Java contained in service/code 3 | # 4 | 5 | cd service/code 6 | ln -s ../../lib/ lib 7 | ant clean 8 | ant 9 | mv spellutils.jar lib/spellutils.jar 10 | rm -f lib 11 | ant clean 12 | -------------------------------------------------------------------------------- /atdconfig.sl: -------------------------------------------------------------------------------- 1 | # 2 | # configuration file for the Moconti app server 3 | # 4 | 5 | [$server addSite: "service.afterthedeadline.com", 6 | "service/src/site.sl", 7 | "service/root", 8 | ".", 9 | "key"]; 10 | -------------------------------------------------------------------------------- /README.txt: -------------------------------------------------------------------------------- 1 | After the Deadline - Open Source Language Checking Technology README 2 | ------------------ 3 | 4 | Documentation on this code is at http://open.afterthedeadline.com 5 | 6 | See LICENSE.txt for license information. Enjoy the software. 
7 | 8 | -- Raphael Mudge (rsmudge@gmail.com) 9 | -------------------------------------------------------------------------------- /bin/make3.sh: -------------------------------------------------------------------------------- 1 | java -Xmx1024M -jar sleep.jar gen3.sl corpus2 homophones.txt ho_test_gutenberg_context.txt 2 | java -Xmx1024M -jar sleep.jar gen2.sl corpus2 homophones.txt ho_train_gutenberg_context.txt 3 | java -Xmx1024M -jar sleep.jar gen3.sl /home/raffi/spell/corpus homophones.txt ho_test_wp_context.txt 4 | -------------------------------------------------------------------------------- /bin/testgr.sh: -------------------------------------------------------------------------------- 1 | java -Datd.lowmem=true -Xmx4048M -XX:+AggressiveHeap -XX:+UseParallelGC -jar lib/sleep.jar utils/rules/testgr.sl data/tests/grammar_wikipedia.txt 2 | java -Datd.lowmem=true -Xmx4048M -XX:+AggressiveHeap -XX:+UseParallelGC -jar lib/sleep.jar utils/rules/testgr.sl data/tests/grammar_gutenberg.txt 3 | -------------------------------------------------------------------------------- /service/src/view/rule.slp: -------------------------------------------------------------------------------- 1 |

<% invoke($1["recommendation"], @_) %>

2 | 3 |

<% $1["description"] %> <% iff($1["source"] ne "", '(' . $1["source"] . ')') %>

4 | 5 | 6 | -------------------------------------------------------------------------------- /data/rules/grammar/infinitives: -------------------------------------------------------------------------------- 1 | # infinitive phrases 2 | # http://www.chompchomp.com/terms/infinitivephrase.htm 3 | 4 | to is::filter=kill 5 | to .*/VBZ .*/DT|NN::word=\0 \1:base \2::pivots=\1,\1:base 6 | To .*/VBZ .*/DT|NN::word=\0 \1:base \2::pivots=\1,\1:base 7 | 8 | need|going|have|ought to .*/VBG::word=\0 \1 \2:base::pivots=\2,\2:base 9 | -------------------------------------------------------------------------------- /bin/corpus-lex-diff.sh: -------------------------------------------------------------------------------- 1 | # 2 | # compare a corpus text file to the current wordlists and see what needs to be added 3 | # 4 | 5 | # to generate a wordlist suitable for the AtD wordlists directory: 6 | # 7 | # ./bin/corpus-lex-diff.sh filename.txt 50 wordlist 8 | 9 | java -Xmx3072M -jar lib/sleep.jar utils/bigrams/corpus-lex-diff.sl $1 $2 $3 10 | -------------------------------------------------------------------------------- /utils/bigrams/printcorpus.sl: -------------------------------------------------------------------------------- 1 | include("lib/nlp.sl"); 2 | 3 | $handle = openf(@ARGV[0]); 4 | $data = readb($handle, -1); 5 | closef($handle); 6 | 7 | foreach $paragraph (splitByParagraph($data)) 8 | { 9 | println("PARAGRAPH BEGIN!"); 10 | 11 | foreach $sentence ($paragraph) 12 | { 13 | println(" $sentence"); 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /bin/trainhomophones.sh: -------------------------------------------------------------------------------- 1 | # 2 | # train and test the homophone misuse detection models 3 | 4 | java -Datd.lowmem=true -Xmx3840M -XX:+AggressiveHeap -XX:+UseParallelGC -jar lib/sleep.jar utils/spell/trainspell.sl trainHomophoneModels 5 | java -Datd.lowmem=true -Xmx3840M -XX:+AggressiveHeap -XX:+UseParallelGC -jar lib/sleep.jar utils/spell/test.sl runHomophoneTests 6 | -------------------------------------------------------------------------------- /bin/traintagger.sh: -------------------------------------------------------------------------------- 1 | # 2 | # code to generate and evaluate the tagger models. 
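# postrain.sl trains the tagger on the tagged Wikipedia sentences; postest.sl then evaluates the result against the tagged Gutenberg sentences.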
3 | # 4 | 5 | java -Xmx3072M -XX:+AggressiveHeap -XX:+UseParallelGC -jar lib/sleep.jar utils/tagger/postrain.sl wikipedia_sentences_tagged_f.txt 6 | java -Xmx3072M -XX:+AggressiveHeap -XX:+UseParallelGC -jar lib/sleep.jar utils/tagger/postest.sl data/gutenberg_sentences_tagged_f.txt 7 | -------------------------------------------------------------------------------- /run-lowmem.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | REM 3 | REM startup script for AtD web service 4 | REM 5 | java -Dfile.encoding=UTF-8 -XX:+AggressiveHeap -XX:+UseParallelGC -Datd.lowmem=true -Dbind.interface=127.0.0.1 -Dserver.port=1049 -Dsleep.classpath=$ATD_HOME/lib:$ATD_HOME/service/code -Dsleep.debug=24 -classpath .\lib\sleep.jar;.\lib\moconti.jar;.\lib\spellutils.jar httpd.Moconti atdconfig.sl 6 | -------------------------------------------------------------------------------- /data/rules/abbr.txt: -------------------------------------------------------------------------------- 1 | Mr 2 | Mrs 3 | No 4 | pp 5 | St 6 | no 7 | Dr 8 | Prof 9 | Sr 10 | Bros 11 | etc 12 | vs 13 | esp 14 | Fig 15 | fig 16 | Jan 17 | Feb 18 | Mar 19 | Apr 20 | Jun 21 | Jul 22 | Aug 23 | Sep 24 | Sept 25 | Oct 26 | Nov 27 | Dec 28 | Ph.D 29 | PhD 30 | Lt 31 | LT 32 | 2Lt 33 | 1Lt 34 | Capt 35 | Maj 36 | Col 37 | Gen 38 | Brig 39 | Sgt 40 | Esq 41 | i.e 42 | e.g 43 | -------------------------------------------------------------------------------- /data/rules/agreement/plural.r: -------------------------------------------------------------------------------- 1 | *prefix* is a|the term|field::filter=kill::avoid=live, rest 2 | *prefix* is::word=*text* are, *transform*::filter=sane::avoid=live, rest 3 | *prefix* was::word=*text* were, *transform*::filter=sane::avoid=live, rest 4 | *prefix* doesn't::word=*text* don't, *transform*::filter=sane::avoid=live, rest 5 | *prefix* [a-z]+/VBZ::word=*text* \X:base, *transform*::filter=sane::avoid=live, rest -------------------------------------------------------------------------------- /bin/inspect.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # 3 | # run the AtD language model inspection tool 4 | # 5 | 6 | 7 | export PRODUCTION=/home/atd 8 | export ATD_HOME=/home/atd/atd 9 | export LOG_DIR=$ATD_HOME/logs 10 | 11 | export LC_CTYPE=en_US.UTF-8 12 | export LANG=en_US.UTF-8 13 | 14 | java -Datd.lowmem=true -Dfile.encoding=UTF-8 -Xmx3512M -XX:+AggressiveHeap -XX:+UseParallelGC -jar lib/sleep.jar utils/bigrams/inspect.sl 15 | 16 | -------------------------------------------------------------------------------- /data/rules/grammar/det_agreement_plural: -------------------------------------------------------------------------------- 1 | # 2 | # determiner agreement rules for determiners expecting a plural noun 3 | # 4 | 5 | Both|Many|Several|Few|Fewer|Two|Three|Four|Five|Six|Seven|Eight|Nine|Ten [a-z].*/NN [a-z].*ed/VBD::word=\0 \1:plural \2::pivots=\1,\1:plural 6 | both|these|those|us|many|several|few|fewer|two|three|four|five|six|seven|eight|nine|ten [a-z].*/NN [a-z].*ed/VBD::word=\0 \1:plural::pivots=\1,\1:plural -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # 3 | # startup script for AtD web service 4 | # 5 | 6 | 7 | export ATD_HOME=.
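# ATD_HOME defaults to the current directory; point it at your AtD install if you launch this script from somewhere else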
8 | export LOG_DIR=$ATD_HOME/logs 9 | 10 | java -server -Datd.lowmem=true -Dsleep.pattern_cache_size=8192 -Dbind.interface=127.0.0.1 -Dserver.port=1049 -Xmx3840M -XX:+AggressiveHeap -XX:+UseParallelGC -Dsleep.classpath=$ATD_HOME/lib:$ATD_HOME/service/code -Dsleep.debug=24 -classpath "$ATD_HOME/lib/*" httpd.Moconti atdconfig.sl 11 | -------------------------------------------------------------------------------- /data/rules/grammar/weare: -------------------------------------------------------------------------------- 1 | if were|where .*/VBN|RB|VBG::word=\0 we're \2::pivots=\1,we're 2 | what were|where .*/VBN::word=\0 we're \2::filter=none 3 | what were|where .*/RB|VBG::word=\0 we're \2::pivots=\1,we're 4 | since were|where .*/RB|VBN|VBG::word=\0 we're \2::pivots=\1,we're 5 | that were|where .*/VBG::word=\0 we're \2::pivots=\1,we're 6 | where were::word=where we're::pivots=were,we're 7 | we're are::word=we are, where are::pivots=we're,we,where 8 | -------------------------------------------------------------------------------- /run-lowmem.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # 3 | # startup script for AtD web service 4 | # 5 | 6 | export LC_CTYPE=en_US.UTF-8 7 | export LANG=en_US.UTF-8 8 | 9 | java -Dfile.encoding=UTF-8 -XX:+AggressiveHeap -XX:+UseParallelGC -Datd.lowmem=true -Dbind.interface=127.0.0.1 -Dserver.port=1049 -Dsleep.classpath=$ATD_HOME/lib:$ATD_HOME/service/code -Dsleep.debug=24 -classpath ./lib/sleep.jar:./lib/moconti.jar:./lib/spellutils.jar:./lib/* httpd.Moconti atdconfig.sl 10 | -------------------------------------------------------------------------------- /service/src/view/service.slp: -------------------------------------------------------------------------------- 1 | 2 | 13 | -------------------------------------------------------------------------------- /data/rules/pronouns.txt: -------------------------------------------------------------------------------- 1 | # Personal Pronouns: 2 | # subjective, objective, reflexive, possessive pronoun, possessive determiner 3 | # 4 | # http://wapedia.mobi/en/English_personal_pronouns 5 | 6 | I, me, myself, mine, my 7 | we, us, ourselves, ours, our 8 | you, you, yourselves, yours, your 9 | he, him, himself, his, his 10 | she, her, herself, hers, her 11 | it, it, itself, its, its 12 | they, them, themselves, theirs, their 13 | who, whom, whose, whose 14 | -------------------------------------------------------------------------------- /data/rules/grammar/were: -------------------------------------------------------------------------------- 1 | were are|is|did|will::word=where \1::pivots=were,where 2 | is also were::word=is also where::pivots=were,where 3 | were .*/EX .*/VBZ::word=where \1 \2::pivots=were,where 4 | is were::word=is where::pivots=were,where 5 | where .*/VBN::word=were \1::pivots=where,were 6 | were .*/VB|VBP::word=where \1::pivots=were,where 7 | we|they|I|he|she where .*/NNP|VBN::word=\0 were \2::pivots=where,were 8 | who where::word=who were::pivots=where,were::options=where,were 9 | -------------------------------------------------------------------------------- /bin/smallmodel.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Create a language model for low-memory AtD 4 | # 5 | rm -f models/model.zip 6 | rm -rf tmp 7 | mkdir tmp 8 | java -Dfile.encoding=UTF-8 -Xmx3840M -XX:+AggressiveHeap -XX:+UseParallelGC -jar lib/sleep.jar utils/bigrams/buildsmallmodel.sl 9 | cd tmp 10 | 11 | 
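# tmp now holds one serialized entry per word id, sharded by buildsmallmodel.sl into tmp/0 through tmp/511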
# we're using this instead of zip because zip on some systems creates corrupt 12 | # zip files when dealing with as many files as we have... get the JDK out. 13 | jar -cf ../models/model.zip . 1>/dev/null 14 | cd .. 15 | -------------------------------------------------------------------------------- /data/rules/complex/been: -------------------------------------------------------------------------------- 1 | has been .*ing and .*ing::word=\0 \2:participle and \4:participle 2 | have been .*ing::word=\0 \2:participle 3 | has been .*ing::word=\0 \2:participle 4 | had been .*ing::word=\0 \2:participle 5 | They've|they've been .*ing::word=\0 \2:participle 6 | You've|you've been .*ing::word=\0 \2:participle 7 | I've been .*ing::word=\0 \2:participle 8 | We've|we've been .*ing::word=\0 \2:participle 9 | should've been .*ing::word=\0 \2:participle 10 | could've been .*ing::word=\0 \2:participle 11 | would've been .*ing::word=\0 \2:participle -------------------------------------------------------------------------------- /data/rules/grammar/an: -------------------------------------------------------------------------------- 1 | # 2 | # these rules pick up when a/an are misused 3 | # 4 | 5 | # killing errors related to an indef article with a number 6 | # have to solve the problem with hundreds, teens, etc. 7 | An|A|a|an [\d+]\w+::filter=kill 8 | a|an|A|An RPG|RSS|XSS|SEC::filter=kill 9 | 10 | a/.* [aeiouyhAEIOUYH18]\w+/.*::filter=indefarticle::word=an \1 11 | an/.* [^aeiAEIMNRSX8]\w+/.*::filter=indefarticle::word=a \1 12 | 0BEGIN.0 A/.* [aeiouyhAEIOUYH18]\w+/.*::filter=indefarticle::word=An \1 13 | 0BEGIN.0 An/.* [^aeiAEIMNRSX8]\w+/.*::filter=indefarticle::word=A \1 -------------------------------------------------------------------------------- /data/rules/grammar/whose: -------------------------------------------------------------------------------- 1 | Who's|who's .*ing::filter=kill 2 | 3 | who's .*/NN::word=whose \1::pivots=who's,whose 4 | whose .*/DT::word=who's \1::pivots=whose,who's 5 | Who's .*/NN::word=Whose \1::pivots=Who's,Whose 6 | Whose .*/DT::word=Who's \1::pivots=Whose,Who's 7 | 8 | about who's::word=about whose::pivots=who's,whose::options=who's,whose 9 | who's actual::word=whose actual::pivots=who's,whose::options=who's,whose 10 | who's name::word=whose name::pivots=who's,whose::options=who's,whose 11 | who's previous::word=whose previous::pivots=who's,whose::options=who's,whose -------------------------------------------------------------------------------- /lib/object.sl: -------------------------------------------------------------------------------- 1 | # everything you need for Sleep OO 2 | sub object 3 | { 4 | local('$function'); 5 | $function = function("& $+ $type $+ :: $+ $0"); 6 | if ($function !is $null) 7 | { 8 | return invoke($function, @_, $0, $this => $this); 9 | } 10 | throw "$type $+ :: $+ $0 - no such method"; 11 | } 12 | 13 | sub newObject 14 | { 15 | local('$object'); 16 | $object = lambda(&object, $type => $1); 17 | # invoke the constructor 18 | invoke($object, sublist(@_, 1), "init", $this => $object); 19 | return $object; 20 | } 21 | -------------------------------------------------------------------------------- /bin/buildedits.sh: -------------------------------------------------------------------------------- 1 | # 2 | # seed the edits model 3 | # This model is nothing more than a cache of potential edits for common word misspellings. The purpose is to speed up processing. 
AtD uses an LRU cache 4 | # when running to track and grow this information. The seeding is done because the edits operation is so expensive that having this information available 5 | # makes training, testing, and warm-up significantly faster. 6 | # 7 | java -Xmx3840M -XX:+AggressiveHeap -XX:+UseParallelGC -jar lib/sleep.jar utils/spell/seededits.sl sp_test_aspell_nocontext.txt sp_test_wpcm_nocontext.txt 8 | -------------------------------------------------------------------------------- /utils/bigrams/contextprob.sl: -------------------------------------------------------------------------------- 1 | # 2 | # a tool to inspect the language model 3 | # 4 | 5 | import org.dashnine.preditor.* from: lib/spellutils.jar; 6 | use(^SpellingUtils); 7 | 8 | # misc junk 9 | include("lib/dictionary.sl"); 10 | global('$__SCRIPT__ $model $rules $dictionary $network $dsize %edits $hnetwork $account $usage $endings $lexdb $trigrams $verbs'); 11 | $model = get_language_model(); 12 | $dictionary = dictionary(); 13 | $dsize = size($dictionary); 14 | 15 | $total = 0L; 16 | foreach $word ($dictionary) { 17 | $total += count($word); 18 | } 19 | 20 | println($total); 21 | -------------------------------------------------------------------------------- /data/rules/grammar/subject_verb_agreement: -------------------------------------------------------------------------------- 1 | # I rules with corrections 2 | 3 | 0BEGIN.0 I is|be|are::word=I am::pivots=\1,am 4 | 5 | # You rules with corrections 6 | 7 | We|They|You is|am::word=\0 are::pivots=\1,are 8 | We|They|You was::word=\0 were::pivots=\1,were 9 | 0BEGIN.0 I has::word=\0 have::pivots=\1,have 10 | We|They|You has::word=\0 have::pivots=\1,have 11 | 12 | # He/She rules with corrections 13 | 14 | 0BEGIN.0 I were::word=\0 was::pivots=\1,was 15 | He|She were::word=\0 was::pivots=\1,was 16 | He|She have::word=\0 has::pivots=\1,has 17 | He|She be|am|are::word=\0 is::pivots=\1,is 18 | 19 | -------------------------------------------------------------------------------- /service/src/view/rules/ruleview.slp: -------------------------------------------------------------------------------- 1 | 
"> 2 |
3 | ')" onclick="this.blur();">Display Rule 4 | ')" onclick="this.blur();">Add Rule 5 |
6 |

<% $1["rule"] %>: <% $1["text"] %>

7 | 8 |

<% $1["description"] %> ("><% $1["source"] %>)

9 |
10 | -------------------------------------------------------------------------------- /data/rules/grammar/contractedformnot: -------------------------------------------------------------------------------- 1 | ain't not::word=\0::filter=none 2 | aren't not::word=\0::filter=none 3 | can't not::word=\0::filter=none 4 | couldn't not::word=\0::filter=none 5 | didn't not::word=\0::filter=none 6 | doesn't not::word=\0::filter=none 7 | don't not::word=\0::filter=none 8 | hasn't not::word=\0::filter=none 9 | isn't not::word=\0::filter=none 10 | mightn't not::word=\0::filter=none 11 | mustn't not::word=\0::filter=none 12 | shan't not::word=\0::filter=none 13 | shouldn't not::word=\0::filter=none 14 | weren't not::word=\0::filter=none 15 | won't not::word=\0::filter=none 16 | wouldn't not::word=\0::filter=none 17 | -------------------------------------------------------------------------------- /service/src/view/rules/homophone.slp: -------------------------------------------------------------------------------- 1 | 14 | 15 |

Review definitions:

16 | 17 | 20 | 21 | -------------------------------------------------------------------------------- /bin/prepositions.sh: -------------------------------------------------------------------------------- 1 | # 2 | # code to generate rules for prepositions 3 | # 4 | 5 | echo '#' >data/rules/grammar/prepositions 6 | echo '# This file is automatically generated by ./bin/prepositions.sh - do not edit' >> data/rules/grammar/prepositions 7 | echo '#' >> data/rules/grammar/prepositions 8 | 9 | java -Dfile.encoding=UTF-8 -Xmx3840M -XX:+AggressiveHeap -XX:+UseParallelGC -jar lib/sleep.jar utils/rules/findprepositions.sl >preps.tmp 10 | java -Dfile.encoding=UTF-8 -Xmx3840M -XX:+AggressiveHeap -XX:+UseParallelGC -jar lib/sleep.jar utils/rules/makeprepositions.sl preps.tmp >>data/rules/grammar/prepositions 11 | rm -f preps.tmp 12 | -------------------------------------------------------------------------------- /service/src/view/rules/homophone2.slp: -------------------------------------------------------------------------------- 1 | 14 | 15 |

Review definitions:

16 | 17 | 20 | 21 | -------------------------------------------------------------------------------- /bin/buildgrammarsets.sh: -------------------------------------------------------------------------------- 1 | # 2 | # build grammar corpora 3 | # 4 | 5 | if [ -f wp.txt ] 6 | then 7 | 8 | java -jar lib/sleep.jar utils/spelldata/torules.sl wrong >rules.out 9 | 10 | # make the grammar rules files 11 | 12 | java -jar lib/sleep.jar utils/spelldata/maker.sl rules.out data/wikipedia_sentences.txt >data/tests/grammar_wikipedia.txt 13 | java -jar lib/sleep.jar utils/spelldata/maker.sl rules.out data/gutenberg_sentences.txt >data/tests/grammar_gutenberg.txt 14 | 15 | rm -f rules.out 16 | 17 | else 18 | echo "No wp.txt file is present, cut and paste Wikipedia Common Errors List to wp.txt and try again" 19 | 20 | fi 21 | -------------------------------------------------------------------------------- /data/rules/grammar/dneg2: -------------------------------------------------------------------------------- 1 | # 2 | # Style Double Negatives 3 | # 4 | 5 | not a|an unifiable|unified|uniformed|unifying|united|undulated|undulating|universalized|universalised|unrest|(.*?der)|university|understood|understanding::filter=kill 6 | not unifiable|unified|uniformed|unifying|united|undulated|undulating|universalized|universalised|unrest|(.*?der)|university|understood|understanding::filter=kill 7 | 8 | not a|an un[aeiouy].*::word=an \2:positive 9 | not a|an un[^aeiouy].*::word=a \2:positive 10 | not un.*::word=\1:positive 11 | 12 | # another double negative rule. Changes the meaning of the sentence but is easier to understand 13 | don't have|need no::word=\0 \1 any::pivots=no,any 14 | -------------------------------------------------------------------------------- /bin/all.sh: -------------------------------------------------------------------------------- 1 | ./bin/compilespelltools.sh # don't do this as the build box doesn't have ant on it (yet) 2 | 3 | # 4 | # set some vars that may help the cause. 5 | # 6 | export LC_CTYPE=en_US.UTF-8 7 | export LANG=en_US.UTF-8 8 | 9 | # 10 | # build the foundational NLP models 11 | # 12 | ./bin/buildmodel.sh 13 | #./bin/buildtaggersets.sh # do not uncomment this 14 | 15 | # 16 | # intermediate stuff 17 | # 18 | ./bin/buildrules.sh 19 | ./bin/testgr.sh 20 | ./bin/buildedits.sh 21 | 22 | # 23 | # train various models 24 | # 25 | #./bin/traintagger.sh # no good reason to do this unless tagger data changes 26 | ./bin/trainspellcontext.sh 27 | ./bin/trainspellnocontext.sh 28 | ./bin/trainhomophones.sh 29 | -------------------------------------------------------------------------------- /service/src/view/rules/category.slp: -------------------------------------------------------------------------------- 1 | 
box" name="<% $1 %>"> 2 |

<% $2["rule"] %>

3 | 4 |

<% $2["description"] %> <% iff($2["source"] ne "", '('.$2["source"].')') %>

5 | 6 | 18 |
19 | -------------------------------------------------------------------------------- /data/rules/grammar/possessive: -------------------------------------------------------------------------------- 1 | # 2 | # errors related to possession vs. plural 3 | # 4 | 5 | Your|your|My|my|Their|their|Her|her|His|his|That|The|the|that [a-z].*/NNS .*/NN::word=\0 \1:possessive \2::pivots=\1,\1:possessive::filter=nextonly 6 | with|a|an|With|A|an [a-z].*/NNS .*/NN::word=\0 \1:possessive \2::pivots=\1,\1:possessive::filter=nextonly 7 | 8 | before|after|in|before|during|at [a-z].*s/NNS [a-z].*/NN::word=\0 \1:possessive \2::pivots=\1,\1:possessive::filter=nextonly 9 | Before|After|In|Before|During|At [a-z].*s/NNS [a-z].*/NN::word=\0 \1:possessive \2::pivots=\1,\1:possessive::filter=nextonly 10 | about|in|for|on|with [a-z].*s/NNS [a-z].*/NN::word=\0 \1:possessive \2::pivots=\1,\1:possessive::filter=nextonly 11 | 12 | -------------------------------------------------------------------------------- /utils/bigrams/builddict.sl: -------------------------------------------------------------------------------- 1 | # 2 | # This is a script to generate a spellchecker dictionary using the specified threshold. It's fun stuff. 3 | # 4 | # java -jar sleep.jar builddict.sl threshold models/model.bin models/dictionary.txt 5 | # 6 | 7 | debug(7 | 34); 8 | 9 | import org.dashnine.preditor.* from: lib/spellutils.jar; 10 | use(^SpellingUtils); 11 | 12 | include("lib/dictionary.sl"); 13 | 14 | sub main 15 | { 16 | global('$model $threshold $handle $index $1 $2'); 17 | $model = get_language_model($2); 18 | 19 | $handle = openf(iff($2 is $null, ">models/dictionary.txt", "> $+ $3")); 20 | 21 | printAll($handle, [SleepUtils getArrayWrapper: [$model harvest: int($1)]]); 22 | 23 | closef($handle); 24 | } 25 | 26 | invoke(&main, @ARGV); 27 | -------------------------------------------------------------------------------- /CREDITS.rules.txt: -------------------------------------------------------------------------------- 1 | The AtD rule set was inspired from many resources and projects around the web. 2 | The following resources were particularly helpful: 3 | 4 | LanguageTool Open Source Language Checker 5 | http://www.languagetool.org 6 | 7 | PlainLanguage.gov 8 | http://www.plainlanguage.gov 9 | 10 | GNU Style and Diction 11 | http://www.gnu.org/software/diction/diction.html 12 | 13 | Wikipedia 14 | http://en.wikipedia.org/wiki/Category:Wikipedia_style_guidelines 15 | http://en.wikipedia.org/wiki/Wikipedia:Lists_of_common_misspellings 16 | (and many other lists...) 17 | 18 | Graviax Grammar Checker 19 | http://graviax.sourceforge.net/ 20 | 21 | Cliches: Avoid Them Like the Plague 22 | http://suspense.net/whitefish/cliche.htm 23 | 24 | WordNet - Lexical Database for English 25 | http://wordnet.princeton.edu/ 26 | -------------------------------------------------------------------------------- /utils/spelldata/makesrc.sl: -------------------------------------------------------------------------------- 1 | # 2 | # transform the homophonesdb file into something our other scripts can handle 3 | # using the bad\ngood format. 
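# e.g. a homophone set like "their, there" is written out as every ordered pair,
# one word per line: the first word of a pair plays the misspelling, the second
# plays the correction.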
4 | 5 | ($inh, $outh) = @ARGV; 6 | 7 | $handle = openf("models/dictionary.txt"); 8 | putAll(%dictionary, readAll($handle), { return 1; }); 9 | closef($handle); 10 | 11 | $handle = openf($inh); 12 | @data = readAll($handle); 13 | closef($handle); 14 | 15 | $handle = openf("> $+ $outh"); 16 | foreach $d (@data) 17 | { 18 | @words = split(',\s*', $d); 19 | foreach $w1 (@words) 20 | { 21 | foreach $w2 (@words) 22 | { 23 | if ($w1 ne $w2 && $w1 in %dictionary && $w2 in %dictionary) 24 | { 25 | println($handle, "$w2"); 26 | println($handle, "$w1"); 27 | } 28 | } 29 | } 30 | } 31 | closef($handle); 32 | -------------------------------------------------------------------------------- /utils/bigrams/corpuswp.sl: -------------------------------------------------------------------------------- 1 | # 2 | # Export posts (only!) from a WordPress WXR file and make the content as plain text as possible. 3 | # use this to preprocess a file for adding to data/corpus_extra 4 | # 5 | 6 | $handle = openf(@ARGV[0]); 7 | $data = readb($handle, -1); 8 | closef($handle); 9 | 10 | $data = join(' ', split("\n|\r", $data)); 11 | @data = matches($data, '\\<\!\[CDATA\[(.*?)\]\]\>\'); 12 | 13 | foreach $index => $data (@data) 14 | { 15 | if (strlen($data) > 0) 16 | { 17 | $data = strrep($data, '&', '&', ' ', ' ', '
', "\n", '

', "\n", '"e;', '"', '“', "'", '”', "'", '’', "'", '«', '"', '»', '"', '’', "'"); 18 | $data = replace($data, '(<[^>]*?>)', ''); 19 | println($data); 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /bin/trainspellnocontext.sh: -------------------------------------------------------------------------------- 1 | # 2 | # train and test the spellchecker models 3 | # 4 | 5 | java -Datd.lowmem=true -Xmx3536M -XX:+AggressiveHeap -XX:+UseParallelGC -jar lib/sleep.jar utils/spell/trainspell.sl trainNoContext 6 | 7 | echo "=== NON-CONTEXTUAL DATA =======================================================================" 8 | 9 | java -Datd.lowmem=true -Xmx3536M -XX:+AggressiveHeap -XX:+UseParallelGC -jar lib/sleep.jar utils/spell/test.sl runSpellingTest sp_test_aspell_nocontext.txt 10 | java -Datd.lowmem=true -Xmx3536M -XX:+AggressiveHeap -XX:+UseParallelGC -jar lib/sleep.jar utils/spell/test.sl runSpellingTest sp_test_wpcm_nocontext.txt 11 | 12 | # normal spelling test 13 | #java -Xmx1024M -jar lib/sleep.jar utils/spell/test.sl runSpellingTest tests1.txt 14 | #java -Xmx1024M -jar lib/sleep.jar utils/spell/test.sl runSpellingTest tests2.txt 15 | -------------------------------------------------------------------------------- /utils/tagger/tagit.sl: -------------------------------------------------------------------------------- 1 | # this script simply tags sentences in a file. it assumes each sentence is on a line by itself. 2 | 3 | include("lib/engine.sl"); 4 | include("utils/rules/rules.sl"); 5 | 6 | sub initAll 7 | { 8 | global('$__SCRIPT__ $model $rules $dictionary $network $dsize %edits $hnetwork $account $usage $endings $lexdb $trigrams $verbs'); 9 | $model = get_language_model(); 10 | $dsize = size($dictionary); 11 | $hnetwork = get_network("hnetwork.bin"); 12 | $verbs = loadVerbData(); 13 | initTaggerModels(); 14 | } 15 | 16 | sub main 17 | { 18 | local('$handle $sentence @results @past'); 19 | 20 | initAll(); 21 | 22 | $handle = openf($1); 23 | while $sentence (readln($handle)) 24 | { 25 | println(taggerToString(taggerWithTrigrams(splitIntoWords($sentence)))); 26 | } 27 | } 28 | 29 | invoke(&main, @ARGV); 30 | -------------------------------------------------------------------------------- /utils/tagger/fixtags.sl: -------------------------------------------------------------------------------- 1 | sub main 2 | { 3 | local('$handle $sentence $entry $word $tag $previous @s'); 4 | $handle = openf($1); 5 | while $sentence (readln($handle)) 6 | { 7 | @s = @(); 8 | 9 | foreach $entry (split(' ', $sentence)) 10 | { 11 | ($word, $tag) = split('/', $entry); 12 | if ("'" isin $word && size(@s) > 0) 13 | { 14 | if ($tag eq "''") 15 | { 16 | @s[-1] = @(@s[-1][0] . $word, @s[-1][1]); 17 | } 18 | else 19 | { 20 | @s[-1] = @(@s[-1][0] . $word, @s[-1][1] . ',' . 
$tag); 21 | } 22 | } 23 | else 24 | { 25 | push(@s, @(lc($word), $tag)); 26 | } 27 | 28 | } 29 | println( join(" ", map({ return join('/', $1); }, @s)) ); 30 | } 31 | } 32 | 33 | invoke(&main, @ARGV); 34 | -------------------------------------------------------------------------------- /data/rules/prepositions.txt: -------------------------------------------------------------------------------- 1 | about 2 | above 3 | according to 4 | across 5 | after 6 | against 7 | along 8 | along with 9 | among 10 | apart from 11 | around 12 | as 13 | as for 14 | at 15 | because of 16 | before 17 | behind 18 | below 19 | beneath 20 | beside 21 | between 22 | beyond 23 | but* 24 | by 25 | by means of 26 | concerning 27 | despite 28 | down 29 | during 30 | except 31 | except for 32 | excepting 33 | for 34 | from 35 | in 36 | in addition to 37 | in back of 38 | in case of 39 | in front of 40 | in place of 41 | inside 42 | in spite of 43 | instead of 44 | into 45 | like 46 | near 47 | next 48 | of 49 | off 50 | on 51 | onto 52 | on top of 53 | out 54 | out of 55 | outside 56 | over 57 | past 58 | regarding 59 | round 60 | since 61 | through 62 | throughout 63 | till 64 | to 65 | toward 66 | under 67 | underneath 68 | unlike 69 | until 70 | up 71 | upon 72 | up to 73 | with 74 | within 75 | without 76 | -------------------------------------------------------------------------------- /utils/spelldata/process.sl: -------------------------------------------------------------------------------- 1 | $handle = openf("spelling.txt"); 2 | 3 | global('%dataset'); 4 | 5 | while $bad (readln($handle)) 6 | { 7 | $good = readln($handle); 8 | %dataset[$bad] = $good; 9 | } 10 | 11 | closef($handle); 12 | 13 | 14 | $handle = openf("batch0.tab"); 15 | while $text (readln($handle)) 16 | { 17 | ($bad, $good) = split('\s+', $text); 18 | %dataset[$bad] = $good; 19 | } 20 | 21 | closef($handle); 22 | 23 | $handle = openf("batch0.tab.1"); 24 | while $text (readln($handle)) 25 | { 26 | ($bad, $good) = split('\s+', $text); 27 | %dataset[$bad] = $good; 28 | } 29 | 30 | closef($handle); 31 | 32 | $handle = openf(">output.txt"); 33 | $handle2 = openf(">output2.txt"); 34 | 35 | @bads = sorta(keys(%dataset)); 36 | foreach $bword (@bads) 37 | { 38 | println($handle, $bword); 39 | println($handle2, $bword); 40 | println($handle, %dataset[$bword]); 41 | } 42 | 43 | closef($handle); 44 | closef($handle2); 45 | -------------------------------------------------------------------------------- /bin/buildmodel.sh: -------------------------------------------------------------------------------- 1 | # 2 | # This script creates the AtD bigram model (corpus.zip) 3 | # 4 | 5 | java -version 6 | 7 | rm -f models/model.bin 8 | 9 | java -Dfile.encoding=UTF-8 -Xmx6840M -XX:+AggressiveHeap -XX:+UseParallelGC -jar lib/sleep.jar utils/bigrams/buildcorpus.sl data/corpus_gutenberg 10 | java -Dfile.encoding=UTF-8 -Xmx6840M -XX:+AggressiveHeap -XX:+UseParallelGC -jar lib/sleep.jar utils/bigrams/buildcorpus.sl data/corpus_wikipedia 11 | java -Dfile.encoding=UTF-8 -Xmx6840M -XX:+AggressiveHeap -XX:+UseParallelGC -jar lib/sleep.jar utils/bigrams/buildcorpus.sl data/corpus_extra 12 | 13 | # build dictionary (make sure it's done *after* zipping) 14 | 15 | java -Dfile.encoding=UTF-8 -Xmx6840M -XX:NewSize=512M -jar lib/sleep.jar utils/bigrams/builddict.sl 2 16 | 17 | # create the not misspelled dictionary... 
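# (seeded from the accented wordlist; going by the name, these are words the checker should not flag)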
18 | 19 | cp data/wordlists/accented.txt models/not_misspelled.txt 20 | 21 | # create LM for low-memory AtD 22 | ./bin/smallmodel.sh 23 | -------------------------------------------------------------------------------- /service/src/view/wordpress_gen.slp: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | <% $1["rule"] %> 5 | 35 | 36 | 37 | 38 |

<% $1["rule"] %>

39 | 40 |
41 | 42 |
43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | -------------------------------------------------------------------------------- /bin/buildspelldata.sh: -------------------------------------------------------------------------------- 1 | java -Xmx2536M -XX:NewSize=512M -jar lib/sleep.jar utils/spelldata/gen2.sl data/corpus_gutenberg data/tests/sp_test_wpcm_nocontext.txt data/tests/sp_test_gutenberg_context1.txt 2 | java -Xmx2536M -XX:NewSize=512M -jar lib/sleep.jar utils/spelldata/gen2.sl data/corpus_gutenberg data/tests/sp_test_aspell_nocontext.txt data/tests/sp_test_gutenberg_context2.txt 3 | 4 | java -Xmx2536M -XX:NewSize=512M -jar lib/sleep.jar utils/spelldata/gen2.sl data/corpus_wikipedia data/tests/sp_test_wpcm_nocontext.txt data/tests/sp_test_wp_context1.txt 5 | java -Xmx2536M -XX:NewSize=512M -jar lib/sleep.jar utils/spelldata/gen2.sl data/corpus_wikipedia data/tests/sp_test_aspell_nocontext.txt data/tests/sp_test_wp_context2.txt 6 | 7 | #java -Xmx2536M -XX:NewSize=512M -jar lib/sleep.jar utils/spelldata/gen2.sl data/corpus_gutenberg data/tests/train.txt data/tests/sp_train_gutenberg_context.txt 8 | #echo "are * blind|you, oyu" >>data/tests/sp_train_gutenberg_context.txt 9 | 10 | -------------------------------------------------------------------------------- /data/rules/grammar/count: -------------------------------------------------------------------------------- 1 | # lowercase 2 | fewer &uncountable::word=less \1::filter=indefarticle 3 | &uncountable or fewer::word=\0 or less::filter=none 4 | few &uncountable::word=little \1::filter=indefarticle 5 | 6 | the less::filter=die 7 | less &uncountable::filter=kill 8 | less .*/NNS::word=fewer \1::filter=indefarticle 9 | 10 | little people::filter=kill 11 | little &uncountable::word=few \1::filter=indefarticle 12 | 13 | # uppercase 14 | Fewer &uncountable::word=Less \1::filter=indefarticle 15 | Few &uncountable::word=Little \1::filter=indefarticle 16 | 17 | The less::filter=die 18 | Less &uncountable::filter=kill 19 | Less .*/NNS::word=Fewer \1::filter=indefarticle 20 | 21 | Little people::filter=kill 22 | Little &uncountable::word=Few \1::filter=indefarticle 23 | 24 | # hide situations where the uncountable noun is used as an adjective 25 | # (e.g., water snails) 26 | few|fewer|Few|Fewer &uncountable .*/NNS::filter=kill 27 | little|Little &uncountable .*/NNS::filter=kill 28 | 29 | -------------------------------------------------------------------------------- /bin/trainspellcontext.sh: -------------------------------------------------------------------------------- 1 | # 2 | # train and test the spellchecker models 3 | # 4 | 5 | java -Datd.lowmem=true -Xmx3536M -XX:+AggressiveHeap -XX:+UseParallelGC -jar lib/sleep.jar utils/spell/trainspell.sl trainWithContext 6 | 7 | echo "=== CONTEXTUAL DATA ===========================================================================" 8 | 9 | java -Datd.lowmem=true -Xmx3536M -XX:+AggressiveHeap -XX:+UseParallelGC -jar lib/sleep.jar utils/spell/test.sl runSpellingContextTest sp_test_wp_context1.txt 10 | java -Datd.lowmem=true -Xmx3536M -XX:+AggressiveHeap -XX:+UseParallelGC -jar lib/sleep.jar utils/spell/test.sl runSpellingContextTest sp_test_wp_context2.txt 11 | java -Datd.lowmem=true -Xmx3536M -XX:+AggressiveHeap -XX:+UseParallelGC -jar lib/sleep.jar utils/spell/test.sl runSpellingContextTest sp_test_gutenberg_context1.txt 12 | java -Datd.lowmem=true -Xmx3536M -XX:+AggressiveHeap -XX:+UseParallelGC -jar lib/sleep.jar utils/spell/test.sl runSpellingContextTest sp_test_gutenberg_context2.txt 13 | 14 | 
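A note on reading the test scripts above: they report their results through the score object defined in utils/common/score.sl (listed further down), which is built on the small OO layer in lib/object.sl. The following is a minimal sketch of how the two fit together; it is not part of the AtD scripts, and the tallies are made-up values:

include("lib/object.sl");
include("utils/common/score.sl");

$score = newObject("score", "demo spelling run");

# a test case the checker got right, with the right suggestion
[$score record]; [$score correct]; [$score correctSugg];

# a test case the checker flagged but should have left alone
[$score record]; [$score falsePositive];

[$score print]; # prints "Report for demo spelling run" and the percentages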
-------------------------------------------------------------------------------- /CREDITS.txt: -------------------------------------------------------------------------------- 1 | After the Deadline uses the following libraries: 2 | 3 | lib/cngramj.jar 4 | http://ngramj.sourceforge.net/ 5 | 6 | ngramj is a language guessing library for Java. It's licensed under the LGPL. 7 | I modified it by packaging a language profile for Indonesian: 8 | http://blog.afterthedeadline.com/2010/02/08/n-gram-language-guessing-with-ngramj/ 9 | 10 | lang/lib/languagetool 11 | http://www.languagetool.org 12 | 13 | Language Tool is a rule-based language checking program. It's licensed under the LGPL. 14 | No modifications to Language Tool were made. 15 | 16 | lang/*/wordlists/*.utf8.txt 17 | 18 | Several dictionaries were extracted from the Open Office dictionaries page and converted 19 | to their normal form using unmunch and then converted to UTF8 by me. 20 | 21 | The licenses for the original source files range from GPL, LGPL, MPL (Mozilla Public 22 | License) and Creative Commons ShareAlike licenses. 23 | 24 | http://wiki.services.openoffice.org/wiki/Dictionaries 25 | -------------------------------------------------------------------------------- /utils/tagger/makebootstrap.sl: -------------------------------------------------------------------------------- 1 | debug(7 | 34); 2 | 3 | import java.util.List; 4 | import java.io.BufferedReader; 5 | import java.io.FileReader; 6 | 7 | import edu.stanford.nlp.ling.Sentence from: stanford-postagger-2008-09-28.jar; 8 | import edu.stanford.nlp.ling.TaggedWord from: stanford-postagger-2008-09-28.jar; 9 | import edu.stanford.nlp.ling.HasWord from: stanford-postagger-2008-09-28.jar; 10 | import edu.stanford.nlp.tagger.maxent.MaxentTagger from: stanford-postagger-2008-09-28.jar; 11 | 12 | global('$x $semaphore $handle $file @array'); 13 | 14 | $semaphore = semaphore(); 15 | $handle = openf(@ARGV[1]); 16 | $file = @ARGV[0]; 17 | 18 | sub doit 19 | { 20 | local('$taggedLine $tagger $text $sentence'); 21 | 22 | $tagger = [new MaxentTagger: $file]; 23 | 24 | while $text (readln($handle)) 25 | { 26 | $sentence = [Sentence toSentence: cast(split('\s+', strrep($text, "'", " '")), ^String)]; 27 | $taggedLine = [$tagger tagSentence: $sentence]; 28 | println([$taggedLine toString: 0]); 29 | } 30 | } 31 | 32 | doit(); 33 | -------------------------------------------------------------------------------- /service/src/view/tinymce.slp: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | <% $1["rule"] %> 5 | 6 | 7 | 8 | 9 | 10 |
11 | 14 |
15 |
16 |
17 | 18 |
19 |
20 | 21 |
22 |
23 | 24 |
25 |
26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /data/rules/grammar/lay: -------------------------------------------------------------------------------- 1 | # http://grammar.quickanddirtytips.com/lay-versus-lie.aspx 2 | 3 | # 4 | # Confused word: laid 5 | # 6 | 7 | laid ahead::word=lay ahead::pivots=laid,lay::options=laid,lay 8 | 9 | # 10 | # Confused word: lay 11 | # 12 | 13 | lay around::word=lie around::pivots=lay,lie::options=lay,lie 14 | lay low::word=lie low::pivots=lay,lie::options=lay,lie 15 | 16 | # 17 | # Confused word: laying 18 | # 19 | 20 | laying around::word=lying around::pivots=laying,lying::options=laying,lying 21 | laying low::word=lying low::pivots=laying,lying::options=laying,lying 22 | 23 | # 24 | # Confused word: lays 25 | # 26 | 27 | lays atop::word=lies atop::pivots=lays,lies::options=lays,lies 28 | lays beside::word=lies beside::pivots=lays,lies::options=lays,lies 29 | lays low::word=lies low::pivots=lays,lies::options=lays,lies 30 | lays near::word=lies near::pivots=lays,lies::options=lays,lies 31 | lays on::word=lies on::pivots=lays,lies::options=lays,lies 32 | 33 | # 34 | # Confused word: lain 35 | # 36 | 37 | was lain::word=was laid::pivots=lain,laid::options=lain,laid 38 | were lain::word=were laid::pivots=lain,laid::options=lain,laid 39 | -------------------------------------------------------------------------------- /data/rules/grammar/its: -------------------------------------------------------------------------------- 1 | # yes, I know some parts of this rule are redundant with others--why mess with a working formula 2 | Its .*/JJ|JJS .*/NN .*/TO|PRP|NNP::word=It's \1 \2 \3::filter=none 3 | Its .*/JJ|JJS .*/TO|PRP|NNP::word=It's \1 \2::filter=none 4 | Its .*/JJ|JJS .*/NN a|an|that|because|as::word=It's \1 \2 \3::filter=none 5 | Its .*/JJ|JJS a|an|that|because|as::word=It's \1 \2::filter=none 6 | Its .*/JJ for::word=It's \1 \2::filter=none 7 | 8 | Its .*/RB|DT::word=It's \1::filter=none 9 | 10 | its .*/DT|IN|MD|POS|PP|WDT|WP|WRB::name=its rule::word=it's \1::filter=none 11 | its .*/CC|RB::name=its rule::word=it's \1::pivots=its,it's 12 | Its .*/DT|IN|MD|POS|PP|WDT|WP|WRB::name=its rule::word=It's \1::filter=none 13 | Its .*/CC|RB::name=its rule::word=It's \1::pivots=Its,It's 14 | its .*ed/VBN|VBD .*/NN|NNS::word=it's \1 \2::filter=kill 15 | its .*ed/VBN|VBD::word=it's \1::pivots=its,it's 16 | Its .*ed/VBN|VBD .*/NN|NNS::word=it's \1 \2::filter=kill 17 | Its .*ed/VBN|VBD::word=it's \1::pivots=its,it's 18 | 19 | its not::word=it's not::pivots=its,it's 20 | 21 | 22 | its .*/VBG .*/NN|NNS::filter=kill 23 | its .*/VBG::word=it's \1::pivots=\0,it's 24 | 25 | 26 | -------------------------------------------------------------------------------- /service/src/view/rules/nomit.slp: -------------------------------------------------------------------------------- 1 | $+ $2 $+ with "); 9 | 10 | $o = map({ return " $+ $1 $+ "; }, split(', ', $option)); 11 | $o = filter(lambda({ if ($1 !in %nodupes) { %nodupes[$1] = 1; return $1; } }, %nodupes => %()), $o); 12 | 13 | if (size($o) == 1) 14 | { 15 | print($o[0]); 16 | } 17 | else 18 | { 19 | print([(join(",", sublist($o, 0, -1)) . " or " . $o[-1]) trim]); 20 | } 21 | 22 | println("."); 23 | } 24 | else 25 | { 26 | println("You should revise $+ $2 $+ to bring out the verb."); 27 | } 28 | ?> 29 | 30 |
31 |
Revision Examples 32 |
33 |
Before: Bonuses are based on the performance of the company. 34 |
After: Bonuses are based on how the company performs. 35 |
36 |
Before: An Explanation of Hidden Verbs. 37 |
After: Hidden Verbs Explained. 38 | -------------------------------------------------------------------------------- /utils/bigrams/buildsmallmodel.sl: -------------------------------------------------------------------------------- 1 | # 2 | # convert the large language model to pieces that we can load as needed 3 | # 4 | debug(7 | 34); 5 | 6 | import org.dashnine.preditor.* from: lib/spellutils.jar; 7 | use(^SpellingUtils); 8 | 9 | # misc junk 10 | include("lib/dictionary.sl"); 11 | global('$__SCRIPT__ $model $rules $dictionary $network $dsize %edits $hnetwork $account $usage $endings $lexdb $trigrams $verbs'); 12 | $model = get_language_model(); 13 | 14 | sub main { 15 | local('$handle $x $entry $wid $file'); 16 | $handle = openf(">models/stringpool.bin"); 17 | writeObject($handle, [$model getStringPool]); 18 | writeObject($handle, [$model count]); 19 | closef($handle); 20 | 21 | # make the necessary directories 22 | mkdir("tmp"); 23 | for ($x = 0; $x < 512; $x++) { 24 | mkdir("tmp/ $+ $x"); 25 | } 26 | 27 | # create each individual entry 28 | foreach $entry ([[[$model getStringPool] entrySet] iterator]) { 29 | $wid = [$entry getValue]; 30 | $file = getFileProper("tmp", $wid % 512, $wid); 31 | $handle = openf("> $+ $file"); 32 | writeAsObject($handle, [[$model getLanguageModel] get: $wid]); 33 | closef($handle); 34 | } 35 | } 36 | 37 | invoke(&main, @ARGV); 38 | -------------------------------------------------------------------------------- /utils/common/score.sl: -------------------------------------------------------------------------------- 1 | # 2 | # code for the score object 3 | # 4 | 5 | sub sortScores 6 | { 7 | return [$1 value] <=> [$2 value]; 8 | } 9 | 10 | sub score::init 11 | { 12 | this('$desc $count $fneg $fpos $correct $sugg'); 13 | ($desc) = @_; 14 | } 15 | 16 | sub score::record 17 | { 18 | $count++; 19 | } 20 | 21 | sub score::falseNegative 22 | { 23 | $fneg++; 24 | } 25 | 26 | sub score::falsePositive 27 | { 28 | $fpos++; 29 | } 30 | 31 | sub score::correct 32 | { 33 | $correct++; 34 | } 35 | 36 | sub score::correctSugg 37 | { 38 | $sugg++; 39 | } 40 | 41 | sub score::value 42 | { 43 | return (double($correct) / $count); 44 | } 45 | 46 | sub score::print 47 | { 48 | println("Report for $desc"); 49 | println("Correct: " . ((double($correct) / $count) * 100.0)); 50 | 51 | if ($sugg != 0) 52 | { 53 | println("Suggestion Acc: " . ((double($sugg) / $count) * 100.0)); 54 | println("-" x 20); 55 | } 56 | if ($fneg != 0) 57 | { 58 | println("False Negative: " . ((double($fneg) / $count) * 100.0)); 59 | } 60 | if ($fpos != 0) 61 | { 62 | println("False Positive: " . 
((double($fpos) / $count) * 100.0)); 63 | } 64 | } 65 | 66 | 67 | -------------------------------------------------------------------------------- /bin/buildhomodata.sh: -------------------------------------------------------------------------------- 1 | # generate the source data 2 | rm -rf tmp 3 | mkdir tmp 4 | java -jar lib/sleep.jar utils/spelldata/makesrc.sl data/rules/homophonedb.txt tmp/homophones.txt 5 | 6 | # 7 | # build with parts-of-speech 8 | # 9 | java -jar lib/sleep.jar utils/spelldata/gen4.sl tmp/homophones.txt data/wikipedia_sentences.txt data/tests/ho_test_wp_pos_context.txt 15 10 | java -jar lib/sleep.jar utils/spelldata/gen4.sl tmp/homophones.txt data/gutenberg_sentences.txt data/tests/ho_test_gutenberg_pos_context.txt 15 11 | 12 | # was 8 13 | java -jar lib/sleep.jar utils/spelldata/gen4.sl tmp/homophones.txt data/gutenberg_sentences.txt data/tests/ho_train_gutenberg_pos_context.txt 6 14 | 15 | # 16 | # build without parts-of-speech 17 | # 18 | java -Xmx2536M -XX:NewSize=512M -jar lib/sleep.jar utils/spelldata/gen3.sl data/corpus_gutenberg tmp/homophones.txt data/tests/ho_test_gutenberg_context.txt 19 | java -Xmx2536M -XX:NewSize=512M -jar lib/sleep.jar utils/spelldata/gen2.sl data/corpus_gutenberg tmp/homophones.txt data/tests/ho_train_gutenberg_context.txt 20 | java -Xmx2536M -XX:NewSize=512M -jar lib/sleep.jar utils/spelldata/gen3.sl data/corpus_wikipedia tmp/homophones.txt data/tests/ho_test_wp_context.txt 21 | rm -rf tmp 22 | -------------------------------------------------------------------------------- /data/rules/agreement/single.r: -------------------------------------------------------------------------------- 1 | *prefix* are::word=*text* is, *transform*::avoid=police, sheep, will, cannot, i, read, majority, half, might, let, let's::filter=sane 2 | *prefix* were::word=*text* was, *transform*::avoid=police, sheep, will, cannot, i, read, majority, half, might, let, let's::filter=sane 3 | *prefix* don't::word=*text* doesn't, *transform*::avoid=police, sheep, will, cannot, i, read, majority, half, might, let, let's::filter=sane 4 | *prefix* [a-z]+/VBP::word=*text* \X:plural, *transform*::avoid=police, sheep, will, cannot, i, read, majority, half, might, let::filter=sane 5 | *prefix* be::filter=kill 6 | *prefix* by::filter=kill 7 | *prefix* [a-z]+/VB is::filter=kill 8 | *prefix* [a-z]+/VB of|for::filter=kill 9 | *prefix* [a-z]+/VB [a-z]+/VBD|VBZ::filter=kill 10 | *prefix* [a-z]+/VB::word=*text* \X:plural, *transform*::avoid=police, sheep, will, cannot, i, read, majority, half, might, let, let's::filter=sane 11 | *prefix* are [a-z]+/VBN::filter=kill 12 | *prefix* [a-z]+/MD::filter=kill 13 | One|Two|Three|Four|Five|Six|Seven|Eight|Nine|Ten|Eleven|Twelve|Thirteen|Fourteen|Fifteen|Sixteen|Seventeen|Eighteen|Nineteen|Twenty|Thirty|Forty|Fifty|Sixty|Seventy|Eighty|Ninety dollars|pounds|points|feet|inches|meters::filter=kill 14 | -------------------------------------------------------------------------------- /utils/bigrams/fixgutenberg.sl: -------------------------------------------------------------------------------- 1 | # 2 | # this program fixes the gutenberg corpus by looping through each file and collapsing paragraphs onto a single line. 3 | # this will lead to a more accurate language model which is a really good thing. 4 | # 5 | # do not do this twice or bad things will happen!!!!
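# for example: the two input lines "It was the best of times," and
# "it was the worst of times." come out as the single line
# "It was the best of times, it was the worst of times. ", while an empty
# input line is written out as a lone "\n" to preserve the paragraph break
# (see the print() calls in fixFile below)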
6 | # 7 | 8 | sub fixFile 9 | { 10 | local('$handle $buffer $text $data'); 11 | 12 | # read the file and populate our buffer please 13 | 14 | $buffer = allocate(lof($1)); 15 | $handle = openf($1); 16 | while $text (readln($handle)) 17 | { 18 | if ($text eq "") 19 | { 20 | print($buffer, "\n"); 21 | } 22 | else 23 | { 24 | print($buffer, "$text "); 25 | } 26 | } 27 | closef($handle); 28 | closef($buffer); 29 | 30 | # read the contents of the buffer in 31 | 32 | $data = readb($buffer, -1); 33 | closef($buffer); 34 | 35 | # transfer the contents of the buffer to 36 | 37 | $handle = openf("> $+ $1"); 38 | writeb($handle, $data); 39 | closef($handle); 40 | } 41 | 42 | 43 | map({ 44 | if (-isDir $1) 45 | { 46 | map($this, ls($1)); 47 | } 48 | else 49 | { 50 | fixFile($1); 51 | } 52 | }, @ARGV); 53 | 54 | println("Corpus Prepared"); 55 | -------------------------------------------------------------------------------- /utils/rules/agreement.sl: -------------------------------------------------------------------------------- 1 | # 2 | # make a super rule file based on the chunker 3 | # 4 | 5 | sub fix { 6 | local('$s $c $t'); 7 | $s = split('\s+', $1); 8 | foreach $c => $t ($s) { 9 | $t = "\\ $+ $c"; 10 | } 11 | return join(" ", $s); 12 | } 13 | 14 | sub count { 15 | local('$s $c $t'); 16 | $s = split('\s+', $1); 17 | return "\\" . (size($s) + $2); 18 | } 19 | 20 | sub noempties { 21 | return iff(strlen([$1 trim]) > 0, $1); 22 | } 23 | 24 | sub makeData { 25 | local('$a $b'); 26 | ($a, $b) = split('::', $1); 27 | if (strlen($b) > 0) { $b = ", $b " . count($a); } 28 | return @($a, fix($a), count($a, 0), $b, count($a, 1)); 29 | } 30 | 31 | sub main { 32 | local('$handle @prefixes @rules $rule'); 33 | $handle = openf($1); 34 | @prefixes = map(&makeData, filter(&noempties, readAll($handle))); 35 | closef($handle); 36 | 37 | $handle = openf($2); 38 | @rules = readAll($handle); 39 | closef($handle); 40 | 41 | foreach $rule (@rules) { 42 | printAll(map(lambda({ return '0BEGIN.0 ' . strrep($rule, '*prefix*', $1[0], '*text*', $1[1], '\\X', $1[2], '\\Y', $1[4], ', *transform*', $1[3]); }, \$rule), @prefixes)); 43 | } 44 | 45 | printAll(map({ return '0BEGIN.0 ' . $1[0] . 
"::filter=kill"; }, @prefixes)); 46 | } 47 | 48 | invoke(&main, sublist(@ARGV, 2)); 49 | invoke(&main, @ARGV); 50 | 51 | -------------------------------------------------------------------------------- /utils/common/bywords.sl: -------------------------------------------------------------------------------- 1 | # 2 | # this class looks at how often the trigram tagger guesses a word's correctness by the confused word 3 | # used to generate the homobias class to 4 | # 5 | 6 | sub byword::init 7 | { 8 | this('%data'); 9 | 10 | %data = ohash(); 11 | setMissPolicy(%data, 12 | { 13 | return newObject("score", "$2"); 14 | }); 15 | } 16 | 17 | sub byword::process 18 | { 19 | local('$correct $wrong $wrongs $pre2 $pre1 $next @temp $nbase $tbase $solution $all %scores'); 20 | ($correct, $wrong, $wrongs, $pre2, $pre1, $next) = @_; 21 | 22 | $all = tagAll($pre2[1], $pre1[1], $pre1[0], $wrongs); 23 | 24 | if (isDifferent($all)) 25 | { 26 | $solution = getBest($all)[0]; 27 | if ($solution eq $correct) 28 | { 29 | [%data[$solution] correct]; 30 | } 31 | [%data[$solution] record]; 32 | } 33 | } 34 | 35 | sub byword::finish 36 | { 37 | map({ [$1 print]; }, sort(&sortScores, values(%data))); 38 | } 39 | 40 | sub byword::save 41 | { 42 | local('$key $value $handle'); 43 | foreach $key => $value (%data) 44 | { 45 | $value = [$value value]; 46 | # warn("$key -> $value"); 47 | } 48 | 49 | $handle = openf(">models/bywords.bin"); 50 | writeObject($handle, %data); 51 | closef($handle); 52 | println("Model saved"); 53 | } 54 | -------------------------------------------------------------------------------- /service/code/build.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | -------------------------------------------------------------------------------- /service/src/view/error.slp: -------------------------------------------------------------------------------- 1 | 2 | <% $1 %> 3 | <% $2["rule"] %> 4 | <% iff($3 ne "0BEGIN.0" && $3 !isin ',()-[];:/--', $3) %> 5 | 0) 7 | { 8 | display("service/src/view/suggestions.slp", $4); 9 | } 10 | ?> 11 | 25 | http://service.afterthedeadline.com/info.slp?text='.[java.net.URLEncoder encode: $1].''); 38 | println('' . $INFOURL . 
'/info.slp?text='.[java.net.URLEncoder encode: $1].'&tags='.[java.net.URLEncoder encode: join('/', map({ return $1[1]; }, @tags))].'&engine='.$6.''); 39 | } 40 | ?> 41 | 42 | 43 | -------------------------------------------------------------------------------- /utils/rules/findprepositions.sl: -------------------------------------------------------------------------------- 1 | # 2 | # a tool to inspect the language model 3 | # 4 | 5 | import org.dashnine.preditor.* from: lib/spellutils.jar; 6 | use(^SpellingUtils); 7 | 8 | # misc junk 9 | include("lib/dictionary.sl"); 10 | global('$__SCRIPT__ $model $rules $dictionary $network $dsize %edits $hnetwork $account $usage $endings $lexdb $trigrams $verbs'); 11 | $model = get_language_model(); 12 | $dictionary = dictionary(); 13 | $dsize = size($dictionary); 14 | 15 | global('@prepositions'); 16 | @prepositions = filter({ return iff(indexOf($1, ' ') is $null, $1); }, map({ return [$1 trim]; }, `cat data/rules/prepositions.txt`)); 17 | 18 | foreach $word (sort({ return count($2) <=> count($1); }, keys($dictionary))) 19 | { 20 | if (count($word) < 100) 21 | { 22 | continue; 23 | } 24 | 25 | foreach $preposition (@prepositions) 26 | { 27 | # Pnext(preposition|word) 28 | if (Pbigram1($word, $preposition) > 0.50) 29 | { 30 | println("$word $+ , $preposition : Pbigram1( $+ $word $+ , $preposition $+ ) = " . Pbigram1($word, $preposition)); 31 | } 32 | # Pprev(preposition|word) 33 | else if (Pbigram2($preposition, $word) > 0.50) 34 | { 35 | println("$word $+ , $preposition : Pbigram2( $+ $preposition $+ , $word $+ ) = " . Pbigram2($preposition, $word)); 36 | } 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /data/rules/nomdb.txt: -------------------------------------------------------------------------------- 1 | an NOM of|with|from 2 | a NOM of|with|from 3 | in the NOM with|of 4 | in NOM with|of 5 | the NOM with|of|from 6 | come to a|an|the NOM 7 | came to a|an|the NOM 8 | make a|an|the NOM 9 | makes a|an|the NOM 10 | making a|an|the NOM 11 | made a|an|the NOM 12 | do a|an|the NOM 13 | did a|an|the NOM 14 | does a|an|the NOM 15 | doesnt a|an|the NOM 16 | give a|an|the NOM 17 | given a|an|the NOM 18 | have a|an|the NOM 19 | has a|an|the NOM 20 | had a|an|the NOM 21 | having a|an|the NOM 22 | have a|an|the NOM 23 | achieve a|an|the NOM 24 | achieved a|an|the NOM 25 | be NOM 26 | provided a|an|the NOM 27 | perform a|an|the NOM 28 | performed a|an|the NOM 29 | conduct a|an|the NOM 30 | conducted a|an|the NOM 31 | accomplish a|an|the NOM 32 | accomplished a|an|the NOM 33 | achieved a|an|the NOM 34 | attained a|an|the NOM 35 | carry out a|an|the NOM 36 | carried out a|an|the NOM 37 | conduct a|an|the NOM 38 | conducted a|an|the NOM 39 | effected a|an|the NOM 40 | experienced a|an|the NOM 41 | experience a|an|the NOM 42 | facilitated a|an|the NOM 43 | given a|an|the NOM 44 | implemented a|an|the NOM 45 | indicate a|an|the NOM 46 | indicated a|an|the NOM 47 | involve a|an|the NOM 48 | involved a|an|the NOM 49 | made a|an|the NOM 50 | obtained a|an|the NOM 51 | occurred a|an|the NOM 52 | performed a|an|the NOM 53 | proceeded a|an|the NOM 54 | produced a|an|the NOM 55 | required a|an|the NOM 56 | require a|an|the NOM 57 | -------------------------------------------------------------------------------- /utils/spell/seededits.sl: -------------------------------------------------------------------------------- 1 | # 2 | # this is a script to run unit tests and calculate the effectiveness of the 3 | # preditor
engine 4 | # 5 | 6 | debug(debug() | 7 | 34); 7 | 8 | map({ iff('*.sl' iswm $1, include($1)); }, ls("utils/common")); 9 | 10 | include("lib/engine.sl"); 11 | include("lib/object.sl"); 12 | 13 | global('$dictionary $model $dsize $trie'); 14 | $model = get_language_model(); 15 | $dictionary = dictionary(); 16 | $trie = trie($dictionary); 17 | $dsize = size($dictionary); 18 | 19 | sub seedFile 20 | { 21 | local('$score $good $bad $word'); 22 | 23 | $score = newObject("score", "Word pool accuracy: $1"); 24 | 25 | while $word (words($1)) 26 | { 27 | ($bad, $good) = $word; 28 | 29 | if ($bad !in %edits) 30 | { 31 | %edits[$bad] = editst($dictionary, $trie, $bad); # filterByDictionary($bad, $dictionary); 32 | } 33 | 34 | if ($good in %edits[$bad]) 35 | { 36 | [$score correct]; 37 | } 38 | else 39 | { 40 | # println("$bad -> $good ".editDistance($bad, $good)." is not in " . %edits[$bad]); 41 | } 42 | [$score record]; 43 | } 44 | 45 | [$score print]; 46 | } 47 | 48 | global('%edits $handle'); 49 | %edits = ohasha(); 50 | 51 | map(&seedFile, @ARGV); 52 | 53 | $handle = openf(">models/edits.bin"); 54 | writeObject($handle, %edits); 55 | closef($handle); 56 | 57 | println("Edits flushed!"); 58 | -------------------------------------------------------------------------------- /service/code/src/org/dashnine/preditor/GuessLanguage.java: -------------------------------------------------------------------------------- 1 | package org.dashnine.preditor; 2 | 3 | import sleep.bridges.*; 4 | import sleep.runtime.*; 5 | import sleep.interfaces.*; 6 | 7 | import java.util.*; 8 | 9 | import de.spieleck.app.cngram.NGramProfiles; 10 | 11 | /** Utilities for the Sleep Spellchecker used in AtD */ 12 | public class GuessLanguage implements Loadable, Function 13 | { 14 | private static NGramProfiles profiles = null; 15 | static 16 | { 17 | try 18 | { 19 | profiles = new NGramProfiles(); 20 | } 21 | catch (Exception ex) { ex.printStackTrace(); } 22 | } 23 | 24 | public String guessLanguage(String text) 25 | { 26 | if (text.length() > 1024) 27 | text = text.substring(0, 1024); 28 | 29 | NGramProfiles.Ranker ranker = profiles.getRanker(); 30 | ranker.account(text); 31 | NGramProfiles.RankResult result = ranker.getRankResult(); 32 | return result.getName(0); 33 | } 34 | 35 | public Scalar evaluate(String name, ScriptInstance script, Stack args) 36 | { 37 | return SleepUtils.getScalar(guessLanguage(BridgeUtilities.getString(args, ""))); 38 | } 39 | 40 | public void scriptLoaded(ScriptInstance script) 41 | { 42 | script.getScriptEnvironment().getEnvironment().put("&guessLanguage", this); 43 | } 44 | 45 | public void scriptUnloaded(ScriptInstance script) 46 | { 47 | 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /data/rules/avoiddb.txt: -------------------------------------------------------------------------------- 1 | All reasonable men think I believe 2 | As is well known I think 3 | As mentioned earlier This is superfluous 4 | As you know You probably do not know 5 | Critics claim I claim 6 | Experience shows that My experience shows 7 | For obvious reasons I have no evidence 8 | I don't know if you You are ignorant 9 | I don't want to bore you This statement is boring 10 | I heard that I don't have a reliable source 11 | I wouldn't hesitate to recommend I recommend 12 | If you will Please, pretty please, I'm begging you 13 | It has been decided that I decided that 14 | It has been mentioned that I say 15 | It is evident that I think 16 | It is generally agreed that 
Some people think 17 | It is known that I think 18 | It is likely that I have not good enough evidence 19 | It is not necessary to stress the fact I should not need to tell you 20 | It is perhaps true to say I do not know what to think 21 | People say I say 22 | Popular wisdom has it that I think 23 | So far as we know We could be wrong 24 | Tentative conclusions Possibilities 25 | The most typical example The example that best suits my purpose 26 | There is evidence that I don't have good evidence 27 | There is no doubt that I am convinced 28 | To be honest with you Up to this point, I have not told the truth 29 | To tell you the truth Up to this point, I have not told the truth 30 | Would you object to Here is my suggestion 31 | You probably never heard of You are ignorant 32 | if you will Please, pretty please, I'm begging you 33 | -------------------------------------------------------------------------------- /utils/bigrams/inspect.sl: -------------------------------------------------------------------------------- 1 | # 2 | # a tool to inspect the language model 3 | # 4 | 5 | debug(7 | 34); 6 | 7 | import org.dashnine.preditor.* from: lib/spellutils.jar; 8 | use(^SpellingUtils); 9 | 10 | # misc junk 11 | include("lib/dictionary.sl"); 12 | global('$__SCRIPT__ $model $rules $dictionary $network $dsize %edits $hnetwork $account $usage $endings $lexdb $trigrams $verbs'); 13 | $model = get_language_model(); 14 | $dictionary = dictionary(); 15 | $dsize = size($dictionary); 16 | 17 | print("> "); 18 | 19 | while $command (readln()) 20 | { 21 | @temp = split('\s+', $command); 22 | if (size(@temp) == 5) 23 | { 24 | println("Trigram 1: " . sublist(@temp, 0, 3) . " = " . Ptrigram(@temp[0], @temp[1], @temp[2])); 25 | println("Trigram 2: " . sublist(@temp, 2, 5) . " = " . Ptrigram2(@temp[2], @temp[3], @temp[4])); 26 | } 27 | else if (size(@temp) == 3) 28 | { 29 | println("Trigram 1: " . @temp . " = " . Ptrigram(@temp[0], @temp[1], @temp[2])); 30 | println("Trigram 2: " . @temp . " = " . Ptrigram2(@temp[0], @temp[1], @temp[2])); 31 | } 32 | else if (size(@temp) == 2) 33 | { 34 | println("Bigram b, a->b " . @temp . " = " . Pbigram1(@temp[0], @temp[1]) ); 35 | println("Bigram b, b<-a " . @temp . " = " . Pbigram2(@temp[0], @temp[1]) ); 36 | } 37 | else if (size(@temp) == 1) 38 | { 39 | println("Unigram " . @temp . " = " . Pword(@temp[0])); 40 | println("Count " . @temp . " = " . 
count(@temp[0])); 41 | } 42 | 43 | print("> "); 44 | } 45 | -------------------------------------------------------------------------------- /service/src/view/quality.slp: -------------------------------------------------------------------------------- 1 | 2 | $data (%metrics) 29 | { 30 | if ($data > 0.0) 31 | { 32 | ($type, $name) = split('\.', $metric); 33 | display("service/src/view/metric.slp", $type, $name, $data); 34 | } 35 | } 36 | ?> 37 | -------------------------------------------------------------------------------- /data/rules/hyphens.txt: -------------------------------------------------------------------------------- 1 | # seeded from http://en.wikipedia.org/wiki/Wikipedia:Lists_of_common_misspellings/Grammar_and_Misc 2 | 3 | day to day::word=day-to-day 4 | out and out::word=out-and-out 5 | out of door::word=out-of-door 6 | out of doors::word=out-of-doors 7 | out of the way::word=out-of-the-way 8 | out of band::word=out-of-band 9 | out of bounds::word=out-of-bounds 10 | out of town::word=out-of-town 11 | out of state::word=out-of-state 12 | out of wedlock::word=out-of-wedlock 13 | out of pocket::word=out-of-pocket 14 | out of order::word=out-of-order 15 | out of place::word=out-of-place 16 | part time::word=part-time 17 | full time::word=full-time 18 | 1|2|3|4|5|6|7|8|9|10|11|12|13|14|15|16|17|18|19|20|21|22|23|24|25|26|27|28|29|30|31|32|33|34|35|36|37|38|39|40|41|42|43|44|45|46|47|48|49|50|51|52|53|54|55|56|57|58|59|60|61|62|63|64|65|66|67|68|69|70|71|72|73|74|75|76|77|78|79|80|81|82|83|84|85|86|87|88|89|90|91|92|93|94|95|96|97|98|99 year old::word=\0-year-old 19 | 100|200|250|500|100 year old::word=\0-year-old 20 | right|left handed::word=\0-handed 21 | case sensitive::word=case-sensitive 22 | case insensitive::word=case-insensitive 23 | award winning::word=award-winning 24 | out of body::word=out-of-body 25 | runner up::word=runner-up 26 | commander in chief::word=commander-in-chief 27 | win win::word=win-win 28 | win lose::word=win-lose 29 | lose lose::word=lose-lose 30 | built in::word=built-in 31 | ebook::word=e-book 32 | ereader::word=e-reader 33 | click throughs::word=click-throughs 34 | click through::word=click-through 35 | high five::word=high-five 36 | high fived::word=high-fived 37 | flu like::word=flu-like 38 | -------------------------------------------------------------------------------- /utils/common/utils.sl: -------------------------------------------------------------------------------- 1 | sub toTaggerForm 2 | { 3 | return map({ return split('/', $1); }, $1); 4 | } 5 | 6 | sub sentences 7 | { 8 | local('$handle $sentence $candidates $line'); 9 | 10 | $handle = openf("data/tests/ $+ $1"); 11 | 12 | while $line (readln($handle)) 13 | { 14 | ($sentence, $candidates) = split('\\|', $line); 15 | $candidates = split('[,;] ', $candidates); 16 | yield @($sentence, $candidates[0], sublist($candidates, 1)); 17 | } 18 | 19 | closef($handle); 20 | } 21 | 22 | sub words 23 | { 24 | local('$handle $bad $good'); 25 | $handle = openf("data/tests/ $+ $1"); 26 | while $bad (readln($handle)) 27 | { 28 | $good = readln($handle); 29 | yield @($bad, $good); 30 | } 31 | closef($handle); 32 | } 33 | 34 | sub loopHomophones 35 | { 36 | local('$entry $sentence $correct $wrongs $previous $next $wrong'); 37 | 38 | while $entry (sentences($1)) 39 | { 40 | ($sentence, $correct, $wrongs) = $entry; 41 | ($previous, $next) = split('\\*', $sentence); 42 | $previous = split('\\s+', [$previous trim])[-1]; 43 | $previous = iff($previous eq "", '0BEGIN.0', $previous); 44 | $next = 
split('\\s+', [$next trim])[0]; 45 | $next = iff($next eq "" || $next ismatch '[\\.!?]', '0END.0', $next); 46 | $next = iff(charAt($next, -1) ismatch '[\\.!?]', substr($next, 0, -1), $next); 47 | 48 | push($wrongs, $correct); 49 | 50 | foreach $wrong ($wrongs) 51 | { 52 | [$2 process: $correct, $wrong, $wrongs, $previous, $next]; 53 | } 54 | } 55 | 56 | [$2 finish]; 57 | } 58 | -------------------------------------------------------------------------------- /models/get_model_binaries.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | svn export https://openatd.svn.wordpress.org/atd-server/models/cnetwork.bin ./models/cnetwork.bin 3 | svn export https://openatd.svn.wordpress.org/atd-server/models/cnetwork2.bin ./models/cnetwork2.bin 4 | svn export https://openatd.svn.wordpress.org/atd-server/models/dictionary.txt ./models/dictionary.txt 5 | svn export https://openatd.svn.wordpress.org/atd-server/models/edits.bin ./models/edits.bin 6 | svn export https://openatd.svn.wordpress.org/atd-server/models/endings.bin ./models/endings.bin 7 | svn export https://openatd.svn.wordpress.org/atd-server/models/hnetwork.bin ./models/hnetwork.bin 8 | svn export https://openatd.svn.wordpress.org/atd-server/models/hnetwork2.bin ./models/hnetwork2.bin 9 | svn export https://openatd.svn.wordpress.org/atd-server/models/hnetwork4.bin ./models/hnetwork4.bin 10 | svn export https://openatd.svn.wordpress.org/atd-server/models/lexicon.bin ./models/lexicon.bin 11 | svn export https://openatd.svn.wordpress.org/atd-server/models/model.bin ./models/model.bin 12 | svn export https://openatd.svn.wordpress.org/atd-server/models/model.zip ./models/model.zip 13 | svn export https://openatd.svn.wordpress.org/atd-server/models/network3f.bin ./models/network3f.bin 14 | svn export https://openatd.svn.wordpress.org/atd-server/models/network3p.bin ./models/network3p.bin 15 | svn export https://openatd.svn.wordpress.org/atd-server/models/not_misspelled.txt ./models/not_misspelled.txt 16 | svn export https://openatd.svn.wordpress.org/atd-server/models/stringpool.bin ./models/stringpool.bin 17 | svn export https://openatd.svn.wordpress.org/atd-server/models/trigrams.bin ./models/trigrams.bin 18 | ./bin/buildrules.sh 19 | -------------------------------------------------------------------------------- /data/rules/grammar/aux_noparticiple: -------------------------------------------------------------------------------- 1 | has &irregular_verb::word=\0 \1:participle::pivots=\1,\1:participle 2 | hasn't &irregular_verb::word=\0 \1:participle::pivots=\1,\1:participle 3 | has not &irregular_verb::word=\0 \1 \2:participle::pivots=\2,\2:participle 4 | 5 | have &irregular_verb::word=\0 \1:participle::pivots=\1,\1:participle 6 | haven't &irregular_verb::word=\0 \1:participle::pivots=\1,\1:participle 7 | have not &irregular_verb::word=\0 \1 \2:participle::pivots=\2,\2:participle 8 | 9 | had &irregular_verb::word=\0 \1:participle::pivots=\1,\1:participle 10 | hadn't &irregular_verb::word=\0 \1:participle::pivots=\1,\1:participle 11 | had not &irregular_verb::word=\0 \1 \2:participle::pivots=\2,\2:participle 12 | 13 | were &irregular_verb::word=\0 \1:participle::pivots=\1,\1:participle 14 | weren't &irregular_verb::word=\0 \1:participle::pivots=\1,\1:participle 15 | were not &irregular_verb::word=\0 \1 \2:participle::pivots=\2,\2:participle 16 | 17 | could've &irregular_verb::word=\0 \1:participle::pivots=\1,\1:participle 18 | would've &irregular_verb::word=\0 \1:participle::pivots=\1,\1:participle 
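# how to read these rules (inferred from makespecial.sl and the transform
# syntax, not stated in this file): &irregular_verb matches any irregular verb
# from the verb data, \1 is the verb that matched, and \1:participle rewrites
# it as its past participle; so "have went" matches the "have" rule above,
# word=\0 \1:participle produces the suggestion "have gone", and
# pivots=\1,\1:participle marks went -> gone as the change to highlight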
19 | should've &irregular_verb::word=\0 \1:participle::pivots=\1,\1:participle 20 | you've &irregular_verb::word=\0 \1:participle::pivots=\1,\1:participle 21 | You've &irregular_verb::word=\0 \1:participle::pivots=\1,\1:participle 22 | I've &irregular_verb::word=\0 \1:participle::pivots=\1,\1:participle 23 | we've &irregular_verb::word=\0 \1:participle::pivots=\1,\1:participle 24 | We've &irregular_verb::word=\0 \1:participle::pivots=\1,\1:participle 25 | they've &irregular_verb::word=\0 \1:participle::pivots=\1,\1:participle 26 | They've &irregular_verb::word=\0 \1:participle::pivots=\1,\1:participle 27 | -------------------------------------------------------------------------------- /data/rules/foreigndb.txt: -------------------------------------------------------------------------------- 1 | a fortiori with even stronger reason 2 | a posteriori from effects to causes; reasoning based on past experience 3 | a priori from causes to effects; conclusions drawn from assumptions; from what comes before; deductive reasoning 4 | ab initio from the beginning 5 | ad hoc improvised 6 | ad infinitum never ending 7 | ad lib at will, off the top of the head 8 | bona fide in good faith 9 | caveat caution, warning 10 | curricula vitae the courses of one's life, resumes 11 | curriculum vitae the course of one's life, resume 12 | de facto from the fact 13 | de jure from the law 14 | ex officio out of one's duty, out of one's office 15 | ex post facto after the fact, retrospectively 16 | hors d'oeuvre appetizer 17 | hors d'oeuvres appetizers 18 | hors de combat out of the battle, out of service 19 | in situ in its original place 20 | in toto in its entirety 21 | infra below 22 | inter alia among other things 23 | ipso facto by the fact itself 24 | locus classicus standard or most authoritative source 25 | non sequitur it does not follow 26 | passim here and there, throughout, in several places 27 | per capita per head 28 | prima facie at first sight, on the face of it 29 | pro bono for the public good, at no cost 30 | pro rata in proportion 31 | quid pro quo something in return 32 | raison d'etre reason for, purpose 33 | scilicet that is to say, namely 34 | scire licet that is to say, namely 35 | sic thus used, thus spelt 36 | sine die without a day, with no time fixed 37 | sine qua non without which not, essential precondition 38 | status quo things as they are 39 | stet as it was originally 40 | supra above 41 | vide see 42 | vide supra see above 43 | viva oral examination 44 | voce oral examination 45 | -------------------------------------------------------------------------------- /utils/rules/makespecial.sl: -------------------------------------------------------------------------------- 1 | # 2 | # this script extracts relevant irregular verbs from the internal data to allow us to create rules 3 | # 4 | 5 | 6 | include("lib/engine.sl"); 7 | include("utils/rules/rules.sl"); 8 | 9 | sub checkSentenceSpelling 10 | { 11 | } 12 | 13 | sub initAll 14 | { 15 | global('$__SCRIPT__ $model $rules $dictionary $network $dsize %edits $hnetwork $account $usage $endings $lexdb $trigrams $verbs'); 16 | $model = get_language_model(); 17 | $dictionary = dictionary(); 18 | $dsize = size($dictionary); 19 | $hnetwork = get_network("hnetwork.bin"); 20 | $verbs = loadVerbData(); 21 | initTaggerModels(); 22 | } 23 | 24 | sub main 25 | { 26 | initAll(); 27 | 28 | local('$key $value $base $past $participle @results @past @base'); 29 | 30 | foreach $key => $value ($verbs['base']) 31 | { 32 | ($base, $past, $participle) = 
values($value, @("base", "past", "participle")); 33 | if ($past ne $participle) 34 | { 35 | push(@past, $past); 36 | push(@results, $past); 37 | } 38 | 39 | if ($base ne $participle && $base ne $past) 40 | { 41 | push(@base, $base); 42 | push(@results, $base); 43 | } 44 | } 45 | 46 | @results = filter({ return iff(count($1) > 2, $1, println("Killed $[20]1 " . count($1)) ); }, @results); 47 | @past = filter({ return iff(count($1) > 2, $1); }, @past); 48 | @base = filter({ return iff(count($1) > 2, $1); }, @base); 49 | 50 | println("Total words: " . size(@results)); 51 | println("==== RESULTS ===="); 52 | println(join("|", sorta(@results))); 53 | println("==== PAST ===="); 54 | println(join("|", sorta(@past))); 55 | println("==== BASE ===="); 56 | println(join("|", sorta(@base))); 57 | } 58 | 59 | invoke(&main, @ARGV); 60 | -------------------------------------------------------------------------------- /data/rules/grammar/separate: -------------------------------------------------------------------------------- 1 | # 2 | # words that should be separated (and in what context) 3 | # 4 | 5 | # everyone of -> every one of 6 | 7 | everyone of::word=every one of::pivots=\1,one of::rule=Separate everyone 8 | 9 | # flashpoint -> flash point 10 | 11 | flashpoint::word=flash point 12 | 13 | # a while vs. awhile (split) 14 | 15 | after|for|in awhile::word=\0 a while::pivots=awhile,a while 16 | 17 | can backup::word=can back up 18 | can blackout::word=can black out 19 | can setup::word=can set up 20 | can workout::word=can work out 21 | for along time::word=for a long time 22 | for awhile::word=for a while 23 | for quite awhile::word=for quite a while 24 | got setup::word=got set up 25 | got shutdown::word=got shut down 26 | got shutout::word=got shut out 27 | had comeback::word=had come back 28 | had setup::word=had set up 29 | has setup::word=has set up 30 | have setup::word=have set up 31 | help setup::word=help set up 32 | in along time::word=in a long time 33 | in anyway::word=in any way 34 | in awhile::word=in a while 35 | in quite awhile::word=in quite a while 36 | incase of::word=in case of 37 | is setup::word=is set up 38 | Portland Trailblazers::word=Portland Trail Blazers 39 | take awhile::word=take a while 40 | to backout::word=to back out 41 | to backup::word=to back up 42 | to blackout::word=to black out 43 | to comeback::word=to come back 44 | to setup::word=to set up 45 | to shutdown::word=to shut down 46 | after along time::word=after a long time 47 | after awhile::word=after a while 48 | after quite awhile::word=after quite a while 49 | allot of::word=a lot of 50 | along time::word=a long time 51 | downpayment::word=down payment 52 | smartphone::word=smart phone 53 | ala mode::word=à la mode::filter=none 54 | afterall::word=after all 55 | to bailout::word=\0 bail out::pivots=bailout,bail out 56 | 57 | -------------------------------------------------------------------------------- /utils/tagger/makesentences.sl: -------------------------------------------------------------------------------- 1 | debug(7 | 34); 2 | 3 | sub process 4 | { 5 | local('@words $entry $previous $current $next'); 6 | 7 | $1 = [$1 trim]; 8 | if ($1 !ismatch '[A-Z][A-Za-z\'\,0-9 ]*?[\.\?\!]') 9 | { 10 | return; 11 | } 12 | 13 | @words = splitIntoWords($1); 14 | 15 | if (size(@words) < 3) 16 | { 17 | return; 18 | } 19 | 20 | # foreach $entry (@words) 21 | # { 22 | # if (%dictionary[$entry] is $null) 23 | # { 24 | # return; 25 | # } 26 | # } 27 | 28 | # println($output, lc(join(" ", @words)) ); 29 | 
println($output, join(" ", @words) ); 30 | } 31 | 32 | sub processFile 33 | { 34 | local('$handle $key $data $text @paragraphs'); 35 | 36 | # read in our corpus. 37 | $handle = openf($1); 38 | $text = replace(readb($handle, -1), '<[^>]*?>', ''); 39 | closef($handle); 40 | 41 | # start processing it?!? 42 | @paragraphs = splitByParagraph($text); 43 | map({ map(&process, $1); }, @paragraphs); 44 | } 45 | 46 | sub main 47 | { 48 | # setup our file that we're going to dump the output to. 49 | global('$output'); 50 | $output = openf("> $+ $2"); 51 | 52 | # ok go through all the junk parsing through the files. 53 | 54 | include("lib/nlp.sl"); 55 | include("lib/dictionary.sl"); 56 | 57 | global('%dictionary'); 58 | %dictionary = dictionary(); 59 | %dictionary["0BEGIN.0"] = 1; 60 | %dictionary["0END.0"] = 1; 61 | 62 | # collect list of files. 63 | [{ 64 | if (-isDir $1) 65 | { 66 | map($this, ls($1)); 67 | } 68 | else if ("*Image*.html" !iswm $1 && "*User*.html" !iswm $1) 69 | { 70 | processFile($1); 71 | } 72 | }: $1]; 73 | 74 | 75 | closef($output); 76 | println("Done!"); 77 | } 78 | 79 | invoke(&main, @ARGV); 80 | -------------------------------------------------------------------------------- /data/rules/grammar/your: -------------------------------------------------------------------------------- 1 | your .*ing/VBG::word=you're \1::pivots=your,you're 2 | if your .*/DT::word=if you're \2::pivots=your,you're 3 | your the|a|an::word=you're \1::filter=none 4 | Your .*ing/VBG::word=You're \1::pivots=Your,You're 5 | If your .*/DT::word=If you're \2::pivots=your,you're 6 | Your the|a|an::word=You're \1::filter=none 7 | 8 | about|around|at|by|for|from|in|near|of|on|over|through|to|towards|under|with|without you're::word=\0 your::pivots=you're,your 9 | 10 | you're [a-z].*/NN|NNS are|is::word=your \1 \2::pivots=you're,your 11 | to .*/VB you're .*/NN::word=\0 \1 your \3::pivots=you're,your 12 | Your right::word=You're right::pivots=Your,You're::options=your,you're 13 | 14 | you're .* could|would|should|did|may|will|has|have|can|couldn't|wouldn't|shouldn't|didn't|won't|hasn't|haven't|can't::word=your \1 \2::pivots=you're,your 15 | 16 | to you're::word=to your::pivots=you're, your 17 | 18 | your welcome::word=you're welcome::pivots=your,you're 19 | Your welcome::word=You're welcome::pivots=Your,You're::options=your,you're 20 | Your welcome 0END.0::word=You're welcome::filter=none 21 | 22 | you're are::word=you are::filter=none 23 | your are::word=you're::filter=none 24 | your are .*ing::word=you are \2::filter=none 25 | 26 | Your not::word=You're not::pivots=Your,You're 27 | your not::word=you're not::pivots=your,you're 28 | your in|at::word=you're \1::filter=none 29 | Your in|at::word=You're \1::filter=none 30 | 31 | has|is you're::word=\0 your::pivots=you're,your::options=you're,your 32 | your so|as|gonna::word=you're \1::pivots=your,you're::options=your,you're 33 | Your so|as|gonna::word=You're \1::pivots=Your,You're::options=Your,You're 34 | 35 | as you're .*/NN::word=\0 your \2::pivots=you're,your::options=you're,your 36 | As you're .*/NN::word=\0 your \2::pivots=you're,your::options=you're,your 37 | -------------------------------------------------------------------------------- /utils/common/hotest.sl: -------------------------------------------------------------------------------- 1 | sub hotest::init 2 | { 3 | this('$score1 $score2 $score $criterf $network $criteria'); 4 | 5 | $criterf = criteria($2); 6 | $network = get_network($1); 7 | $criteria = $2; 8 | 9 | $score1 = newObject("score", 
"Correct $4"); 10 | $score2 = newObject("score", "Wrong $4"); 11 | $score = newObject("score", "Composite $4"); 12 | } 13 | 14 | sub hotest::process 15 | { 16 | local('$correct $wrong $wrongs $pre2 $pre1 $next $next2 @temp'); 17 | ($correct, $wrong, $wrongs, $pre2, $pre1, $next, $next2) = @_; 18 | 19 | if (size($criteria) == 0) 20 | { 21 | @temp[0] = rand($wrongs); 22 | } 23 | else 24 | { 25 | @temp = checkAnyHomophone($network, $wrong, copy($wrongs), $pre1[0], $next[0], @($pre2[1], $pre1[1]), $pre2[0], $next2[0], $criteriaf => $criterf); 26 | # println(join(', ', @($network, $wrong, copy($wrongs), $pre1[0], $next[0], @($pre2[1], $pre1[1]))) . ' = ' . @temp); 27 | } 28 | 29 | if (size(@temp) == 0) 30 | { 31 | @temp[0] = $wrong; 32 | } 33 | 34 | if (@temp[0] eq $correct) 35 | { 36 | [iff($wrong eq $correct, $score1, $score2) correct]; 37 | [$score correct]; 38 | # warn("Correct!"); 39 | } 40 | else 41 | { 42 | if ($wrong eq $correct) 43 | { 44 | [$score1 falsePositive]; 45 | [$score falsePositive]; 46 | # warn("FP!"); 47 | } 48 | else 49 | { 50 | [$score2 falseNegative]; 51 | [$score falseNegative]; 52 | # warn("FN!"); 53 | } 54 | } 55 | 56 | [$score record]; 57 | [iff($wrong eq $correct, $score1, $score2) record]; 58 | } 59 | 60 | sub hotest::finish 61 | { 62 | [$score1 print]; 63 | [$score2 print]; 64 | [$score print]; 65 | println("-" x 30); 66 | } 67 | 68 | -------------------------------------------------------------------------------- /utils/bigrams/qscore.sl: -------------------------------------------------------------------------------- 1 | # 2 | # generate statistics about a datset to evaluate writing quality 3 | # 4 | debug(7 | 34); 5 | 6 | include("lib/quality.sl"); 7 | include("lib/engine.sl"); 8 | 9 | global('$__SCRIPT__ $model $rules $dictionary $network $dsize %edits $hnetwork $account $usage $endings $lexdb $trigrams $verbs $locks $trie %common'); 10 | 11 | $model = get_language_model(); 12 | $dictionary = dictionary(); 13 | $rules = get_rules(); 14 | $network = get_network("cnetwork.bin"); 15 | $hnetwork = get_network("hnetwork.bin"); 16 | %edits = initEdits(); 17 | $dsize = size($dictionary); 18 | $verbs = loadVerbData(); 19 | %common = loadCommonWords(); 20 | initTaggerModels(); 21 | 22 | sub report 23 | { 24 | local('@keys $metric $words $sentences $a $b $key'); 25 | 26 | @keys = sort({ return lc($1) cmp lc($2); }, keys($2)); 27 | 28 | $words = double($2['words']); 29 | $sentences = double($2['sentences']); 30 | 31 | foreach $key (@keys) 32 | { 33 | $metric = double($2[$key]); 34 | $a = ($metric / $words) * 100.0; 35 | $b = ($metric / $sentences) * 100.0; 36 | println("$[20]1 : $[30]key : $[10]metric $[25]a $[25]b"); 37 | } 38 | } 39 | 40 | sub checkDocument 41 | { 42 | local('$data %stats $start'); 43 | 44 | $start = ticks(); 45 | 46 | # strip HTML please 47 | $data = strrep($2, ' ', ' ', '
', "\n", '

', "\n", '', "\n", '"e;', '"', '&', '&'); 48 | $data = replace($data, '(<[^>]*?>)', ''); 49 | 50 | %stats = processDocumentQuality($data); 51 | report(getFileName($1), %stats); 52 | 53 | println("Time: " . (ticks() - $start) . "ms"); 54 | } 55 | 56 | sub main 57 | { 58 | local('$handle $data'); 59 | $handle = openf($1); 60 | $data = readb($handle, -1); 61 | closef($handle); 62 | 63 | checkDocument($1, $data); 64 | } 65 | 66 | invoke(&main, @ARGV) 67 | -------------------------------------------------------------------------------- /data/rules/grammar/too: -------------------------------------------------------------------------------- 1 | too niche::filter=kill 2 | too .*/NN|VB .*/VB.*::word=too \1 \2:: # ruling out a false positive 3 | too .*/NN|VB::word=to \1::pivots=too,to 4 | too do::word=to \1::pivots=too,to 5 | too the::word=to \1::pivots=too,to 6 | to much|few of::filter=kill 7 | to much|few::word=too \1::pivots=to,too 8 | two many::words=to many,too many::pivots=two,to,too 9 | is to|two late|easy::word=\0 too \2::pivots=\1,too 10 | was to|two late|easy::word=\0 too \2::pivots=\1,too 11 | be to|two late|easy::word=\0 too \2::pivots=\1,too 12 | were to|two late|easy::word=\0 too \2::pivots=\1,too 13 | are to|two late|easy::word=\0 too \2::pivots=\1,too 14 | been to|two late|easy::word=\0 too \2::pivots=\1,too 15 | comes to|two soon::word=\0 too \2::pivots=\1,too 16 | came to|two soon::word=\0 too \2::pivots=\1,too 17 | much to|two soon|late|early|easy::word=\0 too \2::pivots=\1,too 18 | is to|two soon::word=\0 too \2::pivots=\1,too 19 | was to|two soon::word=\0 too \2::pivots=\1,too 20 | were to|two soon::word=\0 too \2::pivots=\1,too 21 | are to|two soon::word=\0 too \2::pivots=\1,too 22 | been to|two soon::word=\0 too \2::pivots=\1,too 23 | is to .*/JJ.* 0END.0::word=\0 too \2::filter=none 24 | was to .*/JJ.* 0END.0::word=\0 too \2::filter=none 25 | be to .*/JJ.* 0END.0::word=\0 too \2::filter=none 26 | were to .*/JJ.* 0END.0::word=\0 too \2::filter=none 27 | are to .*/JJ.* 0END.0::word=\0 too \2::filter=none 28 | been to .*/JJ.* 0END.0::word=\0 too \2::filter=none 29 | not to .*/JJ 0END.0::word=too \1::filter=none 30 | is to .*/JJ.* to .*/VB::word=\0 too \2 \3 \4::filter=none 31 | was to .*/JJ.* to .*/VB::word=\0 too \2 \3 \4::filter=none 32 | be to .*/JJ.* to .*/VB::word=\0 too \2 \3 \4::filter=none 33 | were to .*/JJ.* to .*/VB::word=\0 too \2 \3 \4::filter=none 34 | are to .*/JJ.* to .*/VB::word=\0 too \2 \3 \4::filter=none 35 | been to .*/JJ.* to .*/VB::word=\0 too \2 \3 \4::filter=none 36 | -------------------------------------------------------------------------------- /utils/common/homo.sl: -------------------------------------------------------------------------------- 1 | # 2 | # test out spelling with associated context information 3 | # 4 | 5 | sub suggestTest 6 | { 7 | local('$suspect $dict $previous $next @suggestions $f'); 8 | ($suspect, $dict, $previous, $next) = @_; 9 | 10 | @suggestions = %edits[$suspect]; 11 | 12 | if ($correct in @suggestions) 13 | { 14 | foreach $f (@functions) 15 | { 16 | [$f : $suspect, $correct, copy(@suggestions), $previous, $next]; 17 | } 18 | # warn("Done for $previous $suspect $next -> $correct"); 19 | } 20 | 21 | return @(); 22 | } 23 | 24 | sub testCorrectionsContext 25 | { 26 | local('$score $entry $sentence $correct $wrongs @results @words $rule $wrong $previous $next $func'); 27 | 28 | while $entry (sentences($1)) 29 | { 30 | ($sentence, $correct, $wrongs) = $entry; 31 | ($previous, $next) = split(' \\* ', $sentence); 32 | $func = 
lambda(&suggestTest, \$score, \$correct, @functions => sublist(@_, 1)); 33 | 34 | # 35 | # check for a false negative 36 | # 37 | foreach $wrong ($wrongs) 38 | { 39 | [$func: $wrong, $dictionary, $previous, $next] 40 | } 41 | } 42 | } 43 | 44 | sub loopHomophonesPOS 45 | { 46 | local('$entry $sentence $correct $wrongs $pre2 $pre1 $next $object $wrong $next2'); 47 | 48 | while $entry (sentences($1)) 49 | { 50 | ($sentence, $correct, $wrongs) = $entry; 51 | ($pre2, $pre1, $null, $next, $next2) = toTaggerForm(split(' ', $sentence)); 52 | 53 | if ($pre2[1] eq "UNK") { $pre2[1] = ""; } 54 | if ($pre1[1] eq "UNK") { $pre1[1] = ""; } 55 | 56 | $correct = split('/', $correct)[0]; 57 | 58 | push($wrongs, $correct); 59 | 60 | foreach $wrong ($wrongs) 61 | { 62 | [$2 process: $correct, $wrong, $wrongs, $pre2, $pre1, $next, $next2]; 63 | } 64 | 65 | # [$2 process: $correct, $correct, $wrongs, $pre2, $pre1, $next]; 66 | } 67 | 68 | [$2 finish]; 69 | } 70 | -------------------------------------------------------------------------------- /lib/quality.sl: -------------------------------------------------------------------------------- 1 | # 2 | # calculate quality score for a dataset 3 | # 4 | 5 | sub loadCommonWords 6 | { 7 | this('$common'); 8 | if ($common is $null) 9 | { 10 | $common = %(); 11 | local('$handle $bad $good $foo'); 12 | 13 | # function to load file data and add it to our hash 14 | $foo = lambda( 15 | { 16 | local('$handle $bad'); 17 | $handle = openf($1); 18 | while $bad (readln($handle)) 19 | { 20 | if ($bad !in $dictionary) 21 | { 22 | $common[$bad] = 1; 23 | } 24 | } 25 | closef($handle); 26 | }, \$common); 27 | 28 | [$foo : 'data/tests/tests1.txt']; 29 | [$foo : 'data/tests/tests2.txt']; 30 | } 31 | 32 | return $common; 33 | } 34 | 35 | sub generateStatistics 36 | { 37 | local('$error $rule'); 38 | 39 | foreach $error ($1) 40 | { 41 | $rule = $error[0]; 42 | $2[$rule['rule']] += 1; 43 | } 44 | } 45 | 46 | sub processDocumentQuality 47 | { 48 | local('@paragraphs $paragraph $sentence @results @words $count $word %common $suggest %stats'); 49 | 50 | %common = loadCommonWords(); 51 | @paragraphs = splitByParagraph($1); 52 | 53 | $suggest = function('&suggest'); 54 | setf('&suggest', { return @(); }); 55 | 56 | foreach $count => $paragraph (@paragraphs) 57 | { 58 | foreach $sentence ($paragraph) 59 | { 60 | if ($sentence eq "") 61 | { 62 | continue; 63 | } 64 | 65 | @words = splitIntoWords($sentence); 66 | %stats['words'] += size(@words); 67 | %stats['sentences'] += 1; 68 | 69 | foreach $word (@words) { if ($word in %common) { %stats['miss'] += 1; } } 70 | 71 | processSentence(\$sentence, \@results); 72 | } 73 | 74 | generateStatistics(@results, %stats); 75 | @results = @(); 76 | } 77 | 78 | setf('&suggest', $suggest); 79 | return %stats; 80 | } 81 | 82 | -------------------------------------------------------------------------------- /utils/spelldata/maker.sl: -------------------------------------------------------------------------------- 1 | # 2 | # This is a script to generate an AtD test corpus from a rule file (assumes you used torules.sl or something similar to generate the file) 3 | # 4 | # java -jar utils/rules/maker.sl 5 | # 6 | # format: 7 | # 8 | # correct text|word=wrong text 9 | # 10 | 11 | include("lib/engine.sl"); 12 | include("utils/rules/rules.sl"); 13 | 14 | sub checkSentenceSpelling 15 | { 16 | } 17 | 18 | sub initAll 19 | { 20 | global('$__SCRIPT__ $model $rules $dictionary $network $dsize %edits $hnetwork $account $usage $endings $lexdb $trigrams $verbs'); 21 | 
$model = get_language_model(); 22 | $dictionary = dictionary(); 23 | $dsize = size($dictionary); 24 | $hnetwork = get_network("hnetwork.bin"); 25 | $verbs = loadVerbData(); 26 | initTaggerModels(); 27 | } 28 | 29 | sub main 30 | { 31 | local('$handle $sentence @results @past'); 32 | 33 | initAll(); 34 | 35 | if (function("& $+ $1") !is $null) 36 | { 37 | $rules = machine(); 38 | invoke(function("& $+ $1")); 39 | } 40 | else 41 | { 42 | $rules = loadRules(machine(), $1, %()); 43 | } 44 | 45 | $handle = openf($2); 46 | while $sentence (readln($handle)) 47 | { 48 | @results = @(); 49 | processSentence(\$sentence, \@results); 50 | 51 | @past = copy(@results); 52 | 53 | if (size(@past) == 1) 54 | { 55 | foreach $index => $r (@past) 56 | { 57 | local('$rule $text $path $context @suggestions'); 58 | ($rule, $text, $path, $context, @suggestions) = $r; 59 | 60 | %count[$rule['word']] += 1; 61 | 62 | if (%count[$rule['word']] < 5) 63 | { 64 | println(strrep($sentence, " $text ", ' * ') . '|' . $rule['word'] . ', ' . iff($rule['options'] ne "", $rule['options'], $text) . '|' . $text); 65 | } 66 | } 67 | } 68 | } 69 | } 70 | 71 | invoke(&main, @ARGV); 72 | -------------------------------------------------------------------------------- /data/rules/grammar/apostrophes: -------------------------------------------------------------------------------- 1 | # 2 | # missing apostrophes 3 | # 4 | 5 | # Verbs with not contracted: 6 | 7 | arent::word=aren't 8 | didnt::word=didn't 9 | dont::word=don't 10 | isnt::word=isn't 11 | #cant::word=can't 12 | werent::word=weren't 13 | wouldnt::word=wouldn't 14 | doesnt::word=doesn't 15 | hasnt::word=hasn't 16 | couldnt::word=couldn't 17 | hadnt::word=hadn't 18 | 19 | Arent::word=Aren't 20 | Didnt::word=Didn't 21 | Dont::word=Don't 22 | Isnt::word=Isn't 23 | #cant::word=Can't 24 | Werent::word=Weren't 25 | Wouldnt::word=Wouldn't 26 | Doesnt::word=Doesn't 27 | Hasnt::word=Hasn't 28 | Couldnt::word=Couldn't 29 | Hadnt::word=Hadn't 30 | 31 | # Pronouns with will 32 | 33 | Ill::word=I'll 34 | 35 | youll::word=you'll 36 | #hell::word=he'll 37 | #shell::word=she'll 38 | theyll::word=they'll 39 | 40 | Youll::word=You'll 41 | #hell::word=he'll 42 | #shell::word=she'll 43 | Theyll::word=They'll 44 | 45 | # pronouns with the verb to be 46 | 47 | Im::word=I'm 48 | 49 | youre::word=you're 50 | whos::word=who's 51 | hes::word=he's 52 | shes::word=she's 53 | #its::word=it's 54 | #were::word=we're 55 | theyre::word=they're 56 | thats::word=that's::filter=none 57 | 58 | Youre::word=You're 59 | Whos::word=Who's 60 | Hes::word=He's 61 | Shes::word=She's 62 | #its::word=it's 63 | #were::word=we're 64 | Theyre::word=They're 65 | Thats::word=That's 66 | 67 | # to have 68 | 69 | Ive::word=I've 70 | 71 | youve::word=you've 72 | weve::word=we've 73 | theyve::word=they've 74 | 75 | Youve::word=You've 76 | Weve::word=We've 77 | Theyve::word=They've 78 | 79 | # would or had 80 | 81 | #Id::word=I'd 82 | 83 | hed::word=he'd 84 | #shed::word=she'd 85 | youd::word=you'd 86 | #wed::word=we'd 87 | theyd::word=they'd 88 | 89 | Hed::word=He'd 90 | #shed::word=she'd 91 | Youd::word=You'd 92 | #wed::word=we'd 93 | Theyd::word=They'd 94 | 95 | # 96 | 97 | Theres::word=There's 98 | theres::word=there's 99 | 100 | oclock::word=o'clock 101 | 102 | heres::word=here's 103 | -------------------------------------------------------------------------------- /data/rules/grammar/their: -------------------------------------------------------------------------------- 1 | their is|are|a|an::word=there \1::pivots=their,there 2 | there to::filter=kill 3 | there .*/JJ.* .*/NN::word=their \1 \2::pivots=there,their 4 | there .*ing/NN::word=their \1, they're \1::pivots=there,their,they're 5 | there .*/NN::word=their \1::pivots=there,their 6 | Their is|are|a|an::word=There \1::pivots=their,there 7 | There .*/JJ.* .*/NN::word=Their \1 \2::pivots=there,their 8 | There .*ing/NN::word=Their \1, They're \1::pivots=there,their,they're 9 | There .*/NN::word=Their \1::pivots=there,their 10 | is there .*/NN::word=\0 \1 \2 11 | is there .*/JJ .*/NN::word=\0 \1 \2 \3 12 | isn't there .*/NN::word=\0 \1 \2 13 | isn't there .*/JJ .*/NN::word=\0 \1 \2 \3 14 | was there .*/NN::word=\0 \1 \2 15 | was there .*/JJ .*/NN::word=\0 \1 \2 \3 16 | are there .*/NN::word=\0 \1 \2 17 | are there .*/JJ .*/NN::word=\0 \1 \2 \3 18 | if their .*ing::word=\0 they're \2::pivots=\1,they're 19 | to .*/VB there .*/NN::word=\0 \1 their \3::pivots=\2,their 20 | in there|they're .*/NN|JJ .*/NN::word=\0 their \2 \3::pivots=\1,their 21 | in there|they're .*/NN::word=\0 their \2::pivots=\1,their 22 | they're are::word=there are, they are::pivots=they're,there,they 23 | They're are::word=There are, They are::pivots=They're,There,They 24 | .*/VB there .*/NNS::word=\0 their \2::pivots=\1,their 25 | .*/VB there .*/JJ .*/NNS::word=\0 their \2 \3::pivots=\1,their 26 | .*/IN there .*/NNS::word=\0 their \2::pivots=\1,their 27 | .*/IN there .*/JJ .*/NNS::word=\0 their \2 \3::pivots=\1,their 28 | 29 | has|is they're::word=\0 their::pivots=they're,their::options=they're,their 30 | their so|as|gonna::word=they're \1::pivots=their,they're::options=their,They're 31 | Their so|as|gonna::word=They're \1::pivots=Their,They're::options=Their,They're 32 | 33 | # 34 | # some rules to map their|there -> they're 35 | # 36 | their doing so::filter=kill 37 | there being .*/IN|DT::filter=kill 38 | there|their .*/VBG .*/IN::word=they're \1 \2::pivots=\0,they're 39 | there|their .*/VBG .*/DT::word=they're \1 \2::pivots=\0,they're 40 | there|their .*/VBG 0END.0::word=they're \1 \2::pivots=\0,they're 41 | -------------------------------------------------------------------------------- /service/code/src/org/dashnine/preditor/SortFromHash.java: -------------------------------------------------------------------------------- 1 | package org.dashnine.preditor; 2 | 3 | import sleep.runtime.*; 4 | import sleep.bridges.*; 5 | import sleep.interfaces.*; 6 | 7 | import java.util.*; 8 | 9 | /* Code to implement a sort function that sorts values by their corresponding Double values in a hashtable. This class exists to replace 10 | sort(lambda({ return %hash[$1] <=> %hash[$2]; }, \%hash). This snippet was identified by the profiler as consuming more time 
This snippet was identified by the profiler as consuming more time 11 | than any other function */ 12 | public class SortFromHash implements Loadable 13 | { 14 | private static class CompareHashItems implements Comparator 15 | { 16 | protected ScalarHash hash; 17 | 18 | public CompareHashItems(ScalarHash _hash) 19 | { 20 | hash = _hash; 21 | } 22 | 23 | public int compare(Object a, Object b) 24 | { 25 | double aa, bb; 26 | aa = hash.getAt((Scalar)a).doubleValue(); 27 | bb = hash.getAt((Scalar)b).doubleValue(); 28 | 29 | if (aa > bb) 30 | { 31 | return -1; 32 | } 33 | else if (aa < bb) 34 | { 35 | return 1; 36 | } 37 | else 38 | { 39 | return 0; 40 | } 41 | } 42 | } 43 | 44 | private static class func_sortFromHash implements Function 45 | { 46 | public Scalar evaluate(String n, ScriptInstance i, Stack l) 47 | { 48 | ScalarArray array = BridgeUtilities.getWorkableArray(l); 49 | ScalarHash hash = BridgeUtilities.getHash(l); 50 | 51 | array.sort(new CompareHashItems(hash)); 52 | 53 | return SleepUtils.getArrayScalar(array); 54 | } 55 | } 56 | 57 | public void scriptLoaded(ScriptInstance script) 58 | { 59 | script.getScriptEnvironment().getEnvironment().put("&sortHash", new func_sortFromHash()); 60 | } 61 | 62 | public void scriptUnloaded(ScriptInstance script) 63 | { 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /bin/buildtaggersets.sh: -------------------------------------------------------------------------------- 1 | # 2 | # code to generate the data used to bootstrap the tagger 3 | # 4 | 5 | mkdir tmp 6 | 7 | java -Xmx1024M -jar lib/sleep.jar utils/tagger/makesentences.sl data/corpus_wikipedia tmp/wikipedia_sentences.txt 8 | java -Xmx1024M -jar lib/sleep.jar utils/tagger/makesentences.sl data/corpus_gutenberg tmp/gutenberg_sentences.txt 9 | 10 | # 11 | # You *must* download the Stanford POS Tagger (GPL) from: http://nlp.stanford.edu/software/tagger.shtml 12 | # and extract it into your AtD directory. 13 | # 14 | # This tagger will take 3 days to run / file 15 | # ------ 16 | 17 | cd stanford-postagger-2008-09-28 18 | java -Xmx1024M -XX:+AggressiveHeap -XX:+UseParallelGC -jar ../lib/sleep.jar ../utils/tagger/makebootstrap.sl models/bidirectional-wsj-0-18.tagger ../data/gutenberg_sentences.txt >../tmp/gutenberg_sentences_tagged.txt & 19 | java -Xmx1024M -XX:+AggressiveHeap -XX:+UseParallelGC -jar ../lib/sleep.jar ../utils/tagger/makebootstrap.sl models/bidirectional-wsj-0-18.tagger ../data/wikipedia_sentences.txt >../tmp/wikipedia_sentences_tagged.txt & 20 | 21 | # 22 | # Or, optionally, you can use this Tagger which includes source but use is allowed for non-commercial research purposes only 23 | # 24 | # http://www-tsujii.is.s.u-tokyo.ac.jp/~tsuruoka/postagger/ 25 | # 26 | # This tagger will execute in 5 minutes / file 27 | # --------- 28 | 29 | # Oh, irony of ironies-- this tagger and the Stanford tagger produce nearly identical data (AtD bootstraps from the Stanford data though) 30 | 31 | # 32 | #cd postagger-1.0 33 | #./tagger <../tmp/wikipedia_sentences.txt >../tmp/wikipedia_sentences_tagged.txt 34 | #./tagger <../tmp/gutenberg_sentences.txt >../tmp/gutenberg_sentences_tagged.txt 35 | # 36 | cd .. 
37 | 38 | java -jar lib/sleep.jar utils/tagger/fixtags.sl tmp/wikipedia_sentences_tagged.txt >data/wikipedia_sentences_tagged_f.txt 39 | java -jar lib/sleep.jar utils/tagger/fixtags.sl tmp/gutenberg_sentences_tagged.txt >data/gutenberg_sentences_tagged_f.txt 40 | 41 | mv tmp/wikipedia_sentences.txt data/wikipedia_sentences.txt 42 | mv tmp/gutenberg_sentences.txt data/gutenberg_sentences.txt 43 | 44 | rm -rf tmp 45 | -------------------------------------------------------------------------------- /data/rules/agreement/chunk_single.r: -------------------------------------------------------------------------------- 1 | .*/NNP [a-z]+/NN or [a-z]+/PRP.* [a-z]+/NN::\0 \1 and \3 \4 2 | .*/NNP [a-z]+/NNS or [a-z]+/PRP.* [a-z]+/NN::\0 \1 and \3 \4 3 | A [a-z]+/NN or [a-z]+/NN::\0 \1 and \3 4 | An [a-z]+/NN or [a-z]+/NN::\0 \1 and \3 5 | .*/NNP or [a-z]+/NNP::\0 and \2 6 | Every one of [a-z]+/DT [a-z]+/NNS::\3:upper \4 7 | One of [a-z]+/PRP.* [a-z]+/NNS::\2:upper \3 8 | Each one of [a-z]+/PRP.* [a-z]+/NNS::\3:upper \4 9 | The [a-z]+/NN [a-z]+/IN::\0 \1:plural \2 10 | The [a-z]+/NN::\0 \1:plural 11 | This [a-z]+/NN [a-z]+/IN::These \1:plural \2 12 | This [a-z]+/NN::These \1:plural 13 | One of [a-z]+/DT [a-z]+/NNS::\2:upper \3 14 | .*/NNP,POS [a-z]+/JJ [a-z]+/NN::\0 \1 \2:plural 15 | .*/NNP,POS [a-z]+/NN::\0 \1:plural 16 | The [a-z]+/NN [a-z]+/IN [a-z]+/DT [a-z]+/NN::\0 \1:plural \2 \3 \4 17 | This [a-z]+/NN [a-z]+/IN [a-z]+/DT [a-z]+/NN::These \1:plural \2 \3 \4 18 | .*/RB one 19 | The [a-z]+/JJ [a-z]+/NN::\0 \1 \2:plural 20 | This [a-z]+/JJ [a-z]+/NN::\0 \1 \2:plural 21 | Their [a-z]+/NN::\0 \1:plural 22 | Their [a-z]+/JJ [a-z]+/NN::\0 \1 \2:plural 23 | Their [a-z]+/JJ [a-z]+/NN [a-z]+/VB::\0 \1 \2 \3:plural 24 | Your [a-z]+/NN::\0 \1:plural 25 | Your [a-z]+/JJ [a-z]+/NN::\0 \1 \2:plural 26 | Your [a-z]+/JJ [a-z]+/NN [a-z]+/VB::\0 \1 \2 \3:plural 27 | His [a-z]+/NN::\0 \1:plural 28 | His [a-z]+/JJ [a-z]+/NN::\0 \1 \2:plural 29 | His [a-z]+/JJ [a-z]+/NN [a-z]+/VB::\0 \1 \2 \3:plural 30 | Her [a-z]+/NN::\0 \1:plural 31 | Her [a-z]+/JJ [a-z]+/NN::\0 \1 \2:plural 32 | Her [a-z]+/JJ [a-z]+/NN [a-z]+/VB::\0 \1 \2 \3:plural 33 | My [a-z]+/NN::\0 \1:plural 34 | My [a-z]+/JJ [a-z]+/NN::\0 \1 \2:plural 35 | My [a-z]+/JJ [a-z]+/NN [a-z]+/VB::\0 \1 \2 \3:plural 36 | The [a-z]+/VBN [a-z]+/NN::\0 \1 \2:plural 37 | This [a-z]+/VBN [a-z]+/NN::\0 \1 \2:plural 38 | .*/CD dollars|pounds|points|feet|inches|meters 39 | The [a-z]+/NN [a-z]+/VB [a-z]+/NN::\0 \1 \2 \3:plural 40 | The [a-z]+/NN [a-z]+/VB [a-z]+/NN [a-z]+/VBP [a-z]+/JJ [a-z]+/NN 41 | The [a-z]+/JJ [a-z]+/NN [a-z]+/VBP [a-z]+/JJ [a-z]+/NN 42 | The [a-z]+/NN of [a-z]+/VB [a-z]+/NN::\0 \1 \2 \3 \4:plural 43 | The [a-z]+/NN [a-z]+/VB 44 | The [a-z]+/NN of [a-z]+/VB [a-z]+/NNS::\0 \1:plural of \3 \4 45 | Either [a-z]+/NN 46 | .*/NN::\0:plural 47 | Either [a-z]+/NNP [a-z]+/NNS or [a-z]+/PRP.* [a-z]+/NN::\1:upper \2 and \4 \5 48 | -------------------------------------------------------------------------------- /utils/common/exp.sl: -------------------------------------------------------------------------------- 1 | sub exp::init 2 | { 3 | this('$score1 $score2 $score $criterf $network $criteria %dpoints $tscores $nscores $oscores $criterf2 $network2 $criteria2'); 4 | 5 | $criterf = criteria($2); 6 | $network = get_network($1); 7 | $criteria = $2; 8 | 9 | $nscores = newObject("score", "network total"); 10 | $tscores = newObject("score", "trigrams total"); 11 | $oscores = newObject("score", "best score"); 12 | } 13 | 14 | sub exp::process 15 | { 16 | local('$correct 
$wrong $wrongs $pre2 $pre1 $next @temp $nbase $tbase $solution $all %scores'); 17 | ($correct, $wrong, $wrongs, $pre2, $pre1, $next) = @_; 18 | 19 | # do a trigram check? 20 | if ($wrong eq $correct) 21 | { 22 | $all = tagAll($pre2[1], $pre1[1], $pre1[0], $wrongs); 23 | 24 | if (isDifferent($all)) 25 | { 26 | $solution = getBest($all)[0]; 27 | if ($solution eq $correct) 28 | { 29 | [$tscores correct]; 30 | } 31 | else 32 | { 33 | if ($bywords[$solution] == 1.0) 34 | { 35 | # warn("$solution is wrong, correct is $correct : " . $bywords[$correct]); 36 | } 37 | } 38 | [$tscores record]; 39 | } 40 | } 41 | 42 | if ($wrong eq $correct) 43 | { 44 | (@temp, %scores) = checkAnyHomophone2($network, $wrong, copy($wrongs), $pre1[0], $next[0], @($pre2[1], $pre1[1]), 45 | $criteriaf => $criterf); 46 | 47 | if (size(@temp) == 0) 48 | { 49 | @temp[0] = $wrong; 50 | } 51 | 52 | if ($bywords[$solution] >= 1.0) #&& $solution eq $correct) 53 | { 54 | @temp[0] = $solution; 55 | } 56 | 57 | if (@temp[0] eq $correct) 58 | { 59 | [$nscores correct]; 60 | } 61 | [$nscores record]; 62 | 63 | if (@temp[0] eq $correct || $solution eq $correct) 64 | { 65 | [$oscores correct]; 66 | } 67 | [$oscores record]; 68 | 69 | if ($solution ne $correct && $bywords[$solution] == 1.0) 70 | { 71 | # warn("$solution - " . $bywords[$solution] . " vs. $correct " . $bywords[$correct]); 72 | } 73 | } 74 | } 75 | 76 | sub exp::finish 77 | { 78 | [$nscores print]; 79 | [$tscores print]; 80 | [$oscores print]; 81 | } 82 | -------------------------------------------------------------------------------- /data/rules/irregular_nouns.txt: -------------------------------------------------------------------------------- 1 | addendum addenda 2 | alga algae 3 | alumna alumnae 4 | alumnus alumni 5 | analysis analyses 6 | antenna antennas,antennae 7 | apparatus apparatuses 8 | appendix appendices,appendixes 9 | axis axes 10 | bacillus bacilli 11 | bacterium bacteria 12 | basis bases 13 | beau beaux 14 | bison bison 15 | buffalo buffalos,buffaloes 16 | bureau bureaus 17 | bus busses,buses 18 | cactus cactuses,cacti 19 | calf calves 20 | child children 21 | corps corps 22 | corpus corpora,corpuses 23 | crisis crises 24 | criterion criteria 25 | curriculum curricula 26 | datum data 27 | deer deer 28 | die dice 29 | dwarf dwarfs,dwarves 30 | diagnosis diagnoses 31 | echo echoes 32 | elf elves 33 | ellipsis ellipses 34 | embargo embargoes 35 | emphasis emphases 36 | erratum errata 37 | fireman firemen 38 | fish fish,fishes 39 | focus focuses 40 | foot feet 41 | formula formulas 42 | fungus fungi,funguses 43 | genus genera 44 | goose geese 45 | half halves 46 | hero heroes 47 | hippopotamus hippopotami,hippopotamuses 48 | hoof hoofs,hooves 49 | hypothesis hypotheses 50 | index indices,indexes 51 | knife knives 52 | leaf leaves 53 | life lives 54 | loaf loaves 55 | louse lice 56 | man men 57 | matrix matrices 58 | means means 59 | medium media 60 | memorandum memoranda 61 | millennium millenniums,milennia 62 | moose moose 63 | mosquito mosquitoes 64 | mouse mice 65 | nebula nebulae,nebulas 66 | neurosis neuroses 67 | nucleus nuclei 68 | oasis oases 69 | octopus octopi,octopuses 70 | ovum ova 71 | ox oxen 72 | paralysis paralyses 73 | parenthesis parentheses 74 | person people 75 | phenomenon phenomena 76 | potato potatoes 77 | radius radii,radiuses 78 | scarf scarfs,scarves 79 | self selves 80 | series series 81 | sheep sheep 82 | shelf shelves 83 | scissors scissors 84 | species species 85 | stimulus stimuli 86 | stratum strata 87 | syllabus 
syllabi,syllabuses 88 | symposium symposia,symposiums 89 | synthesis syntheses 90 | synopsis synopses 91 | tableau tableaux 92 | that those 93 | thesis theses 94 | thief thieves 95 | this these 96 | tomato tomatoes 97 | tooth teeth 98 | torpedo torpedoes 99 | vertebra vertebrae 100 | veto vetoes 101 | vita vitae 102 | watch watches 103 | wife wives 104 | wolf wolves 105 | woman women 106 | zero zeros,zeroes 107 | -------------------------------------------------------------------------------- /data/rules/grammar/its2: -------------------------------------------------------------------------------- 1 | on it's own::name=it's rule::word=on its own::filter=none 2 | of it's own::name=it's rule::word=of its own::filter=none 3 | such as it's::name=it's rule::word=such as its::filter=none 4 | from all it's::name=it's rule::word=from all its::filter=none 5 | by all it's::name=it's rule::word=by all its::filter=none 6 | it's approach::name=it's rule::word=its approach::filter=none 7 | by it's::name=it's rule::word=by its::filter=none 8 | By it's::name=it's rule::word=By its::filter=none 9 | with it's::name=it's rule::word=\0 its::pivots=it's,its 10 | with/.* it's/.* .*/JJ.*|NN.*::name=it's rule::word=\0 its \2::filter=none 11 | With/.* it's/.* .*/JJ.*|NN.*::name=it's rule::word=\0 its \2::filter=none 12 | in/.* it's/.* .*/JJ.*|NN.*::name=it's rule::word=\0 its \2::filter=none 13 | In/.* it's/.* .*/JJ.*|NN.*::name=it's rule::word=\0 its \2::filter=none 14 | without/.* it's/.* .*/JJ.*|NN.*::name=it's rule::word=\0 its \2::filter=none 15 | Without/.* it's/.* .*/JJ.*|NN.*::name=it's rule::word=\0 its \2::filter=none 16 | from/.* it's/.* .*/JJ.*|NN.*::name=it's rule::word=\0 its \2::filter=none 17 | From/.* it's/.* .*/JJ.*|NN.*::name=it's rule::word=\0 its \2::filter=none 18 | Under/.* it's/.* .*/JJ.*|NN.*::name=it's rule::word=\0 its \2::filter=none 19 | under/.* it's/.* .*/JJ.*|NN.*::name=it's rule::word=\0 its \2::filter=none 20 | over/.* it's/.* .*/JJ.*|NN.*::name=it's rule::word=\0 its \2::filter=none 21 | Over/.* it's/.* .*/JJ.*|NN.*::name=it's rule::word=\0 its \2::filter=none 22 | above/.* it's/.* .*/JJ.*|NN.*::name=it's rule::word=\0 its \2::filter=none 23 | Above/.* it's/.* .*/JJ.*|NN.*::name=it's rule::word=\0 its \2::filter=none 24 | for it's .*/JJ|NN|NNS::word=for its::pivots=\1,its 25 | 26 | it's class|color|current|end|first|former|fourth|goal|highest|history|inital|junction|lack|last|lead|lowest|maximum|minimum|money|name|northern|original|own|peak|previous|primary|second|third|timeslot|toll|way::word=its \1::pivots=it's,its::options=it's,its 27 | 28 | at|be|about|above|across|against|along|among|around|at|behind|by|for|from|had|in|near|of|on|over|through|to|towards|under|upon|with|without it's .*/JJ|NN|NNS::word=\0 its \2::pivots=it's,its 29 | 30 | it's you::filter=kill 31 | it's [a-z].*/NNP::word=its \1::pivots=\0,its 32 | 33 | to .*/VB it's .*/NN|NNS::word=\0 \1 its \3::pivots=it's,its 34 | 35 | it's .*/JJ .*/NNS|NN::word=its \1 \2::pivots=it's,its 36 | -------------------------------------------------------------------------------- /data/rules/agreement/chunk_plural.r: -------------------------------------------------------------------------------- 1 | The [a-z]+/JJ two|three|four|five|six|seven|eight|nine|ten|hundred|thousand|million|billion|trillion 2 | My|Your|His|Her|Their pants 3 | .*/NNP [a-z]+/NN and [a-z]+/PRP.* [a-z]+/NN::\0 \1 or \3 \4 4 | .*/NNP [a-z]+/NNS and [a-z]+/PRP.* [a-z]+/NN::\0 \1 or \3 \4 5 | .*/NNP and [a-z]+/NNP::\0 or \2 6 | .*/NNP and [a-z]+/PRP.* 
[a-z]+/NNS::\0 or \2 \3:singular 7 | The [a-z]+/NN and [a-z]+/DT [a-z]+/NNS::\0 \1 or \3 \4:singular 8 | The [a-z]+/NN or [a-z]+/DT [a-z]+/NNS::\0 \1 \2 \3 \4:singular 9 | The [a-z]+/NN and [a-z]+/NN::The \1 or \3 10 | The [a-z]+/NNS::\0 \1:singular 11 | The [a-z]+/NNS::\0 \1:singular 12 | These [a-z]+/NN and [a-z]+/DT [a-z]+/NNS::The \1 or the \4:singular 13 | These [a-z]+/NN or [a-z]+/DT [a-z]+/NNS::word=The \1 \2 the \4:singular 14 | These [a-z]+/NNS::The \1:singular 15 | All||all of [a-z]+/DT [a-z]+/NNS::\2:upper \3:singular 16 | The [a-z]+/NNS of|for [a-z]+/NN::\0 \1:singular \2 \3 17 | These [a-z]+/NNS of|for [a-z]+/NN::Each \1:singular \2 \3 18 | The [a-z]+/NNS of|for [a-z]+/JJ [a-z]+/NN::\0 \1:singular \2 \3 \4 19 | These [a-z]+/NNS of|for [a-z]+/JJ [a-z]+/NN::Each \1:singular \2 \3 \4 20 | .*/NNP,POS [a-z]+/NNS::\0 \1:singular 21 | .*/NNP,POS [a-z]+/NNS in [a-z]+/DT [a-z]+/NN::\0 \1:singular \2 \3 \4 22 | The [a-z]+/JJS [a-z]+/JJ [a-z]+/NNS of|for|from [a-z]+/NN [a-z]+/NN::\0 \1 \2 \3:singular \4 \5 \6 23 | The [a-z]+/JJS [a-z]+/JJ [a-z]+/NNS::\0 \1 \2 \3:singular 24 | .*/NNS of|for|from [a-z]+/NNS::\0:singular \1 \2:singular 25 | .*/NNP,POS [a-z]+/NNS in [a-z]+/DT [a-z]+/NN::\0 \1:singular \2 \3 \4 26 | .*/CD [a-z]+/NNS 27 | The series of [a-z]+ [a-z]+/NNS::\0 \1 \2 \3 \4:singular 28 | The series of [a-z]+/NNS::\0 \1 \2 \3:singular 29 | The/DT [a-z]+/NN [a-z]+/IN [a-z]+/VB [a-z]+/NNS::\0 \1 \2 \3 \4:singular 30 | The [a-z]+/NN [a-z]+/VB [a-z]+/NNS::\0 \1 \2 \3:singular 31 | My [a-z]+/NN [a-z]+/VB [a-z]+/NNS::\0 \1 \2 \3:singular 32 | Your [a-z]+/NN [a-z]+/VB [a-z]+/NNS::\0 \1 \2 \3:singular 33 | His [a-z]+/NN [a-z]+/VB [a-z]+/NNS::\0 \1 \2 \3:singular 34 | Her [a-z]+/NN [a-z]+/VB [a-z]+/NNS::\0 \1 \2 \3:singular 35 | My [a-z]+/NNS::\0 \1:singular 36 | Your [a-z]+/NNS::\0 \1:singular 37 | Their [a-z]+/NNS::\0 \1:singular 38 | His [a-z]+/NNS::\0 \1:singular 39 | Her [a-z]+/NNS::\0 \1:singular 40 | .*/JJ [a-z]+/NNS::\0 \1:singular 41 | The [a-z]+/NN [a-z]+/IN [a-z]+/VB [a-z]+/NNS 42 | My [a-z]+/NNS and I 43 | My [a-z]+/NN and I 44 | -------------------------------------------------------------------------------- /utils/rules/transr.sl: -------------------------------------------------------------------------------- 1 | # 2 | # this is a script to transform sentences in a corpus using rules from an AtD rule file 3 | # 4 | # java -jar utils/rules/testr.sl 5 | # 6 | # format: 7 | # 8 | # rule..|[key=value|...] 9 | # 10 | # note that key=value are parsed and dumped into a hash. This information is used by the system to 11 | # filter out false positives and stuff. 12 | # 13 | 14 | include("lib/engine.sl"); 15 | include("utils/rules/rules.sl"); 16 | 17 | sub checkSentenceSpelling 18 | { 19 | } 20 | 21 | setf('&score', let({ 22 | local('$value'); 23 | $value = invoke($oldf, @_); 24 | warn("Looking at: " . join("|", @_) . " = " . 
$value); 25 | return $value; 26 | }, $oldf => &score)); 27 | 28 | sub initAll 29 | { 30 | global('$__SCRIPT__ $model $rules $dictionary $network $dsize %edits $hnetwork $account $usage $endings $lexdb $trigrams $verbs'); 31 | $model = get_language_model(); 32 | $dictionary = dictionary(); 33 | $dsize = size($dictionary); 34 | $hnetwork = get_network("hnetwork4.bin"); 35 | $verbs = loadVerbData(); 36 | initTaggerModels(); 37 | } 38 | 39 | sub main 40 | { 41 | local('$handle $sentence @results @past'); 42 | 43 | initAll(); 44 | 45 | if (function("& $+ $1") !is $null) 46 | { 47 | $rules = machine(); 48 | invoke(function("& $+ $1")); 49 | } 50 | else 51 | { 52 | $rules = loadRules(machine(), $1, %()); 53 | } 54 | 55 | $handle = openf($2); 56 | while $sentence (readln($handle)) 57 | { 58 | @results = @(); 59 | processSentence(\$sentence, \@results); 60 | 61 | @past = copy(@results); 62 | 63 | if (size(@past) > 0) 64 | { 65 | # println($sentence); 66 | # println(taggerToString(taggerWithTrigrams(splitIntoWords($sentence)))); 67 | foreach $index => $r (@past) 68 | { 69 | local('$rule $text $path $context @suggestions'); 70 | ($rule, $text, $path, $context, @suggestions) = $r; 71 | 72 | if ($r in @results) 73 | { 74 | $n = strrep($sentence, $text, @suggestions[0]); 75 | println($n); 76 | 77 | if ($n eq $sentence) 78 | { 79 | println("===> $context $text => " . @suggestions); 80 | } 81 | 82 | break; 83 | } 84 | 85 | 86 | } 87 | 88 | } 89 | } 90 | } 91 | 92 | invoke(&main, @ARGV); 93 | -------------------------------------------------------------------------------- /utils/spelldata/torules.sl: -------------------------------------------------------------------------------- 1 | # 2 | # Generate a rule file from cut and paste Wikipedia rules data 3 | # http://en.wikipedia.org/wiki/Wikipedia:Lists_of_common_misspellings/Grammar_and_Misc 4 | # 5 | # use java -jar lib/sleep.jar torules.sl wrong to generate a reverse rules file suitable for error corpus generation 6 | # 7 | # paste the contents into a text editor, then paste into a text file and process with this program 8 | # 9 | 10 | $handle = openf("wp.txt"); 11 | 12 | %sections = ohash(); 13 | setMissPolicy(%sections, { return @(); }); 14 | 15 | while $text (readln($handle)) 16 | { 17 | if ($text ismatch '.*?[\*\#] (.*?) 
\((.*?)\).*') 18 | { 19 | ($wrong, $correct) = matched(); 20 | 21 | if (',' !isin $correct) 22 | { 23 | @a = split(' ', $wrong); 24 | @b = split(' ', $correct); 25 | 26 | if (size(@a) == size(@b)) 27 | { 28 | foreach $index => $word (@a) 29 | { 30 | if ($word !in @b) { $special = $word; $replace = @b[$index]; } 31 | } 32 | 33 | if (@ARGV[0] eq 'wrong') 34 | { 35 | push(%sections["Confused word: $special"], "$correct $+ ::word= $+ $wrong"); 36 | } 37 | else 38 | { 39 | push(%sections["Confused word: $special"], "$wrong $+ ::word= $+ $correct $+ ::pivots= $+ $special $+ , $+ $replace $+ ::options= $+ $special $+ , $+ $replace"); 40 | } 41 | } 42 | else 43 | { 44 | if (@ARGV[0] eq 'wrong') 45 | { 46 | push(%sections["Multiple Options"], "$correct $+ ::word= $+ $wrong"); 47 | } 48 | else 49 | { 50 | push(%sections["Multiple Options"], "$wrong $+ ::word= $+ $correct"); 51 | } 52 | } 53 | } 54 | else 55 | { 56 | if (@ARGV[0] ne 'wrong') 57 | { 58 | push(%sections["Misc"], "$wrong $+ ::word= $+ $correct"); 59 | #push(%sections["Misc"], "$correct $+ ::word= $+ $wrong"); 60 | } 61 | else 62 | { 63 | @temp = split(', ', $correct); 64 | map(lambda({ push(%sections["Misc"], "$1 $+ ::word= $+ $wrong $+ ::options= $+ $correct"); }, \$wrong, \$correct), @temp); 65 | } 66 | } 67 | } 68 | else 69 | { 70 | # push(%sections["__Rejects__"], $text); 71 | } 72 | } 73 | 74 | foreach $key => $value (%sections) 75 | { 76 | println("\n#\n# $key \n#\n"); 77 | printAll($value); 78 | } 79 | -------------------------------------------------------------------------------- /service/code/src/org/dashnine/preditor/LanguageModelSmall.java: -------------------------------------------------------------------------------- 1 | package org.dashnine.preditor; 2 | 3 | import java.io.*; 4 | import java.util.*; 5 | import java.util.zip.*; 6 | 7 | /** This class holds the (minified) AtD language model */ 8 | public class LanguageModelSmall extends LanguageModel implements Serializable 9 | { 10 | protected ZipFile entries; 11 | 12 | private static long lowMemoryThreshold = 256 * 1024 * 1024; 13 | 14 | protected class CacheMap extends LinkedHashMap 15 | { 16 | protected boolean removeEldestEntry(Map.Entry eldest) 17 | { 18 | long memory = Runtime.getRuntime().freeMemory() + (Runtime.getRuntime().maxMemory() - Runtime.getRuntime().totalMemory()); 19 | return (size() > 16384 || memory < lowMemoryThreshold); 20 | } 21 | } 22 | 23 | /* read a string value from the specified map... 
adds the string if it doesn't exist */ 24 | protected Value getStringValue(Map map, String word, boolean makeAsNecessary) 25 | { 26 | Object sid = getStringId(word, false); 27 | 28 | if (sid != null) 29 | { 30 | synchronized (this) 31 | { 32 | Value val = (Value)map.get(sid); 33 | if (val == null && map == model) 34 | { 35 | try 36 | { 37 | int sid_i = ((Integer)sid).intValue(); 38 | 39 | ZipEntry entry = entries.getEntry((sid_i % 512) + "/" + sid_i); 40 | if (entry != null) 41 | { 42 | ObjectInputStream stream = new ObjectInputStream(entries.getInputStream(entry)); 43 | val = (Value)stream.readObject(); 44 | map.put(sid, val); 45 | } 46 | } 47 | catch (Exception ex) 48 | { 49 | System.err.println("Could not load: " + word + "(" + sid + ")"); 50 | ex.printStackTrace(); 51 | } 52 | } 53 | return val; 54 | } 55 | } 56 | 57 | return null; 58 | } 59 | 60 | public LanguageModelSmall(Map _string_pool, long _count, File entries_file) 61 | { 62 | string_pool = _string_pool; 63 | count = _count; 64 | model = new CacheMap(); 65 | try 66 | { 67 | entries = new ZipFile(entries_file); 68 | } 69 | catch (Exception ex) 70 | { 71 | System.err.println("Could not load zipfile: " + entries_file); 72 | ex.printStackTrace(); 73 | } 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /utils/bigrams/amigo.sl: -------------------------------------------------------------------------------- 1 | # 2 | # test spanish homophones against spanish corpora. 3 | # 4 | 5 | debug(7 | 24); 6 | 7 | include("lib/quality.sl"); 8 | include("lib/engine.sl"); 9 | 10 | # 11 | # load AtD models 12 | # 13 | global('$lang'); 14 | 15 | $lang = systemProperties()["atd.lang"]; 16 | if ($lang ne "" && -exists "lang/ $+ $lang $+ /load.sl") { 17 | include("lang/ $+ $lang $+ /load.sl"); 18 | initAllModels(); 19 | } 20 | 21 | # 22 | # load homophones 23 | # 24 | sub homophones { 25 | local('$handle $text %h @candidates'); 26 | $handle = openf("lang/ $+ $lang $+ /homophonedb.txt"); 27 | while $text (readln($handle)) { 28 | if ('-*' iswm $text) { 29 | %h[substr($text, 1)] = $null; 30 | } 31 | else { 32 | @candidates = split(',\s+', $text); 33 | map(lambda({ %h[$1] = @candidates; }, \%h, \@candidates), @candidates); 34 | } 35 | } 36 | return %h; 37 | } 38 | 39 | sub isHomophone { 40 | local('$sentence $pre2 $pre1 $current $next @results'); 41 | ($sentence, $pre2, $pre1, $current, $next) = @_; 42 | 43 | @results = checkHomophone($hnetwork, $current, %homophones[$current], $pre1, $next, @(), $pre2, $bias1 => 30.0, $bias2 => 10.0); 44 | 45 | if (size(@results) > 0) { 46 | println("\t $+ $sentence"); 47 | println("\t $+ $pre2 $pre1 | $current | $next or: " . @results . "\n"); 48 | } 49 | } 50 | 51 | # 52 | # check a sentence for homophones 53 | # 54 | sub checkSentenceForHomophones { 55 | local('$pre2 $pre1 $current $next $word'); 56 | 57 | $current = '0BEGIN.0'; 58 | 59 | foreach $next (splitIntoWords($1)) { 60 | if ($current ne '0BEGIN.0' && $current in %homophones) { 61 | isHomophone($1, $pre2, $pre1, $current, $next); 62 | } 63 | $pre2 = $pre1; 64 | $pre1 = $current; 65 | $current = $next; 66 | } 67 | 68 | $next = '0END.0'; 69 | 70 | if ($current in %homophones) { 71 | isHomophone($1, $pre2, $pre1, $current, $next); 72 | } 73 | } 74 | 75 | # 76 | # loop through the file, look for homophones... report them! 
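# (checkForHomophones reads a corpus file whole, splits it into sentences,
# and maps checkSentenceForHomophones over the result; that sub slides a
# window -- $pre2 $pre1 $current $next -- across each sentence, padding the
# edges with the 0BEGIN.0 and 0END.0 sentinels, and calls isHomophone
# whenever $current has an entry in %homophones)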
77 | # 78 | sub checkForHomophones { 79 | local('$handle $contents'); 80 | $handle = openf($1); 81 | $contents = splitIntoSentences(join("\n", readAll($handle, -1))); 82 | map(&checkSentenceForHomophones, $contents); 83 | closef($handle); 84 | } 85 | 86 | sub main { 87 | global('%homophones'); 88 | %homophones = homophones(); 89 | [{ 90 | if (-isDir $1) { 91 | map($this, ls($1)); 92 | } 93 | else { 94 | if ('*.txt' iswm $1) { 95 | println($1); 96 | checkForHomophones($1); 97 | } 98 | } 99 | }: "lang/ $+ $lang $+ /corpus"]; 100 | } 101 | 102 | invoke(&main, @ARGV); 103 | -------------------------------------------------------------------------------- /utils/spelldata/bootstrapspell.sl: -------------------------------------------------------------------------------- 1 | # 2 | # Walk through a corpus and find spelling errors and their corrections 3 | # 4 | # java [all the memory junk here] -jar lib/sleep.jar utils/spelldata/bootstrapspell.sl data/corpus_wikipedia 5 | # 6 | 7 | debug(7 | 34); 8 | 9 | include("lib/engine.sl"); 10 | 11 | global('$model $dictionary $trie $rules $network $hnetwork %edits $dsize $old_suggest %words'); 12 | 13 | $model = get_language_model(); 14 | $dictionary = dictionary(); 15 | $rules = get_rules(); 16 | $trie = trie($dictionary); 17 | $network = get_network("cnetwork.bin"); 18 | $hnetwork = get_network("hnetwork2.bin"); 19 | %edits = initEdits(); 20 | setRemovalPolicy(%edits, { return 1; }); 21 | $dsize = size($dictionary); 22 | initTaggerModels(); 23 | 24 | $old_suggest = function('&getSuggestionPool'); 25 | 26 | sub getSuggestionPool 27 | { 28 | local('$error $dict $pre $next @suggests %scores'); 29 | ($error, $dict, $pre, $next) = @_; 30 | 31 | if ($error ismatch '[a-z]+\'{0,1}[a-z]+' && $pre ne "" && $next ne "" && ($pre ne '0BEGIN.0' || $next ne '0END.0') && $pre ismatch '[a-zA-Z0-9\\.,]+' && $next ismatch '[a-zA-Z0-9\\.,]+') 32 | # if ($error in %words && $pre ne "" && $next ne "" && ($pre ne '0BEGIN.0' || $next ne '0END.0') && $pre ismatch '[a-zA-Z0-9\\.,]+' && $next ismatch '[a-zA-Z0-9\\.,]+') 33 | { 34 | (@suggests, %scores) = invoke($old_suggest, @_); 35 | 36 | if (size(@suggests) > 0 && %seen[@_] is $null) 37 | { 38 | println("$pre * $next $+ |" . @suggests[0] . ", $error $+ |" . %scores[@suggests[0]]); 39 | %seen[@_] = 1; 40 | } 41 | 42 | return @(@suggests, %scores); 43 | } 44 | 45 | return @(@(), %()); 46 | } 47 | 48 | sub checkIt 49 | { 50 | local('$handle $data'); 51 | $handle = openf($1); 52 | $data = readb($handle, -1); 53 | closef($handle); 54 | 55 | $data = stripHTML($data); 56 | 57 | processDocument($data); 58 | 59 | local('@paragraphs $paragraph $sentence'); 60 | @paragraphs = splitByParagraph($data); 61 | 62 | foreach $paragraph (@paragraphs) 63 | { 64 | foreach $sentence ($paragraph) 65 | { 66 | if ($sentence eq "") 67 | { 68 | continue; 69 | } 70 | 71 | checkSentenceSpelling(splitIntoWords($sentence), @results => @()); 72 | } 73 | } 74 | 75 | [System gc]; 76 | } 77 | 78 | sub main 79 | { 80 | # collect list of files. 
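# ([{ ... }: $1] below is the recursive-walk idiom used throughout these
# utilities: it builds an anonymous closure and invokes it at once with $1
# as the argument; inside, map($this, ls($1)) re-applies the same closure
# -- $this -- to each directory entry, so the block descends into
# subdirectories and hands every page that isn't an Image or User wiki
# artifact to checkIt)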
81 | [{ 82 | if (-isDir $1) 83 | { 84 | map($this, ls($1)); 85 | } 86 | else if ("*Image*.html" !iswm $1 && "*User*.html" !iswm $1) 87 | { 88 | checkIt($1); 89 | } 90 | }: $1]; 91 | } 92 | 93 | invoke(&main, @ARGV); 94 | -------------------------------------------------------------------------------- /utils/bigrams/corpus-lex-diff.sl: -------------------------------------------------------------------------------- 1 | # 2 | # Analyze a text file containing raw text data and show the top words not in the current wordlist data 3 | # 4 | # 5 | 6 | sub loadWordlists 7 | { 8 | if (-isDir $1) 9 | { 10 | map($this, ls($1)); 11 | } 12 | else 13 | { 14 | loadWordlist($1, \%wordlist); 15 | } 16 | } 17 | 18 | sub loadWordlist 19 | { 20 | local('$handle $word'); 21 | $handle = openf($1); 22 | map(lambda({ %wordlist[$1] = 1; }, \%wordlist), split("\n", readb($handle, -1))); 23 | closef($handle); 24 | } 25 | 26 | sub wordlists 27 | { 28 | this('$dictionary'); 29 | if ($dictionary is $null) 30 | { 31 | $dictionary = %(); 32 | [lambda(&loadWordlists, %wordlist => $dictionary) : "data/wordlists"]; 33 | 34 | # add punctuation chars here 35 | 36 | # warn("Loaded: " . size($dictionary) . " words"); 37 | 38 | $dictionary[","] = 1; # make sure commas are in the wordlist 39 | } 40 | return $dictionary; 41 | } 42 | 43 | # 44 | # tool to build a corpus. <3 45 | # 46 | 47 | debug(7 | 34); 48 | 49 | sub process 50 | { 51 | local('@words $head $next'); 52 | 53 | @words = splitIntoWords($1); 54 | 55 | while (size(@words) > 1) 56 | { 57 | ($next) = @words; 58 | 59 | if ($next !in %wordlists && lc($next) !in %wordlists && !-isnumber $next) 60 | { 61 | %nots[$next] += 1; 62 | } 63 | 64 | @words = sublist(@words, 1); 65 | } 66 | } 67 | 68 | sub processFile 69 | { 70 | local('$handle $key $data $text @paragraphs'); 71 | 72 | # read in our corpus. 73 | $handle = openf($1); 74 | $text = replace(readb($handle, -1), '<[^>]*?>', ''); 75 | closef($handle); 76 | 77 | # start processing it?!? 78 | @paragraphs = splitByParagraph($text); 79 | map({ map({ map(&process, splitIntoClauses($1)); }, $1); }, @paragraphs); 80 | } 81 | 82 | sub main 83 | { 84 | global('%wordlists %dictionary @files %current %nots'); 85 | 86 | include("lib/nlp.sl"); 87 | include("lib/dictionary.sl"); 88 | 89 | %wordlists = wordlists(); 90 | 91 | processFile(@ARGV[0]); 92 | 93 | local('@words $word'); 94 | 95 | # sort everything... 96 | 97 | @words = sort({ return %nots[$2] <=> %nots[$1]; }, filter(lambda({ return iff($min == 0 || %nots[$1] > $min, $1); }, $min => $2), keys(%nots))); 98 | 99 | foreach $word (@words) 100 | { 101 | if (($2 == 0 || %nots[$word] > $2)) 102 | { 103 | if ($3 eq "") 104 | { 105 | println("$[50]word ... " . 
%nots[$word]); 106 | } 107 | else 108 | { 109 | println($word); 110 | } 111 | } 112 | } 113 | } 114 | 115 | invoke(&main, @ARGV); 116 | -------------------------------------------------------------------------------- /utils/common/spellcontext.sl: -------------------------------------------------------------------------------- 1 | # 2 | # test out spelling with associated context information 3 | # 4 | 5 | sub suggestTest 6 | { 7 | local('$suspect $dict $previous $next @suggestions $f'); 8 | ($suspect, $dict, $previous, $next) = @_; 9 | 10 | @suggestions = %edits[$suspect]; 11 | 12 | if ($correct in @suggestions) 13 | { 14 | foreach $f (@functions) 15 | { 16 | [$f : $suspect, $correct, copy(@suggestions), $previous, $next]; 17 | } 18 | # warn("Done for $previous $suspect $next -> $correct"); 19 | } 20 | 21 | return @(); 22 | } 23 | 24 | sub testCorrectionsContext 25 | { 26 | local('$score $entry $sentence $correct $wrongs @results @words $rule $wrong $previous $next $func'); 27 | 28 | while $entry (sentences($1)) 29 | { 30 | ($sentence, $correct, $wrongs) = $entry; 31 | ($previous, $next) = split(' \\* ', $sentence); 32 | $func = lambda(&suggestTest, \$score, \$correct, @functions => sublist(@_, 1)); 33 | 34 | # 35 | # check for a false negative 36 | # 37 | foreach $wrong ($wrongs) 38 | { 39 | [$func: $wrong, $dictionary, $previous, $next] 40 | } 41 | } 42 | } 43 | 44 | sub checkAnyHomophone 45 | { 46 | return invoke(&checkAnyHomophone2, @_, parameters => %(\$criteriaf))[0]; 47 | } 48 | 49 | sub checkAnyHomophone2 50 | { 51 | local('$current $options $pre $next %scores $criteriaf @results $option $hnetwork $tags $pre2 $next2'); 52 | ($hnetwork, $current, $options, $pre, $next, $tags, $pre2, $next2) = @_; 53 | 54 | # setup the criteria function 55 | # $criteriaf = criteria(@("pref", "postf", "probability")); 56 | 57 | # $options = filter(lambda({ return iff(Pbigram1($pre, $1) > 0.0 || Pbigram2($1, $next) > 0.0, $1); }, \$pre, \$next), $options); 58 | 59 | # score the options 60 | foreach $option ($options) 61 | { 62 | # warn(@_ . " -> " . [$criteriaf: $current, $option, $options, $pre, $next, $tags]); 63 | %scores[$option] = [$hnetwork getresult: [$criteriaf: $current, $option, $options, $pre, $next, $tags, $pre2, $next2]]["result"]; 64 | if ($option eq $current) 65 | { 66 | # warn(Pword($current)); 67 | %scores[$option] *= 10.0; # * (1.0 - (Pword($current) * 2500)); 68 | } 69 | } 70 | 71 | # filter out any unacceptable words 72 | @results = filter(lambda({ return iff(%scores[$1] >= %scores[$current] && $1 ne $current && %scores[$1] > 0.0, $1, $null); }, \%scores, \$current), $options); 73 | 74 | # sort the remaining results (probably only one left at this point) 75 | @results = sort(lambda({ return %scores[$2] <=> %scores[$1]; }, \%scores), @results); 76 | 77 | if (size(@results) > 0) 78 | { 79 | # warn("checkHomophone: " . @_ . " -> " . @results); 80 | # warn(" " . %scores); 81 | } 82 | 83 | # return the results 84 | return @(@results, %scores); 85 | } 86 | -------------------------------------------------------------------------------- /utils/rules/testr.sl: -------------------------------------------------------------------------------- 1 | # 2 | # This is a script to test the rules out. It's fun stuff. 3 | # 4 | # java -jar utils/rules/testr.sl 5 | # 6 | # format: 7 | # 8 | # rule..|[key=value|...] 9 | # 10 | # note that key=value are parsed and dumped into a hash. This information is used by the system to 11 | # filter out false positives and stuff. 
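# in the rule files under data/rules the pattern and its key=value pairs
# are separated by '::', e.g. this rule from data/rules/grammar/repeats:
#
#    it's is::word=it is::filter=none
#
# a minimal sketch of how one such line could be pulled apart (a
# hypothetical helper shown for illustration, not the loader this script
# actually uses):
#
#    sub parseRuleLine {
#       local('@parts $pattern %opts $pair $key $value');
#       @parts   = split('::', $1);    # pattern first, then key=value pairs
#       $pattern = @parts[0];
#       foreach $pair (sublist(@parts, 1)) {
#          ($key, $value) = split('=', $pair);
#          %opts[$key] = $value;
#       }
#       return @($pattern, %opts);
#    }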
12 | # 13 | 14 | include("lib/engine.sl"); 15 | include("utils/rules/rules.sl"); 16 | 17 | sub checkSentenceSpelling 18 | { 19 | } 20 | 21 | setf('&score', let({ 22 | local('$value'); 23 | $value = invoke($oldf, @_); 24 | warn("Looking at: " . join("|", @_) . " = " . $value); 25 | return $value; 26 | }, $oldf => &score)); 27 | 28 | sub initAll 29 | { 30 | global('$__SCRIPT__ $model $rules $dictionary $network $dsize %edits $hnetwork $account $usage $endings $lexdb $trigrams $verbs'); 31 | $model = get_language_model(); 32 | $dictionary = dictionary(); 33 | $dsize = size($dictionary); 34 | $hnetwork = get_network("hnetwork4.bin"); 35 | $verbs = loadVerbData(); 36 | initTaggerModels(); 37 | } 38 | 39 | sub main 40 | { 41 | local('$handle $sentence @results @past'); 42 | 43 | initAll(); 44 | 45 | if (function("& $+ $1") !is $null) 46 | { 47 | $rules = machine(); 48 | invoke(function("& $+ $1")); 49 | } 50 | else 51 | { 52 | $rules = loadRules(machine(), $1, %()); 53 | } 54 | 55 | # processSentence now expects $rules to be an array of rule packages 56 | $rules = @( $rules ); 57 | 58 | $handle = openf($2); 59 | while $sentence (readln($handle)) 60 | { 61 | @results = @(); 62 | processSentence(\$sentence, \@results); 63 | 64 | @past = copy(@results); 65 | 66 | if (size(@past) > 0) 67 | { 68 | println($sentence); 69 | println(taggerToString(taggerWithTrigrams(splitIntoWords($sentence)))); 70 | foreach $index => $r (@past) 71 | { 72 | local('$rule $text $path $context @suggestions'); 73 | ($rule, $text, $path, $context, @suggestions) = $r; 74 | 75 | if ($r in @results) 76 | { 77 | println(" $index $+ ) [ACCEPT] $context $+ , $text -> " . @suggestions); 78 | } 79 | else 80 | { 81 | println(" $index $+ ) [REJECT] $context $+ , $text -> " . @suggestions); 82 | } 83 | 84 | foreach $key => $value ($rule) 85 | { 86 | println(" $[10]key => $value"); 87 | } 88 | } 89 | } 90 | else 91 | { 92 | # println("NOT FOUND"); 93 | # println($sentence); 94 | # println(taggerToString(taggerWithTrigrams(splitIntoWords($sentence)))); 95 | } 96 | } 97 | } 98 | 99 | invoke(&main, @ARGV); 100 | -------------------------------------------------------------------------------- /data/rules/grammar/det_agreement: -------------------------------------------------------------------------------- 1 | These|Those is::word=This \1::filter=none 2 | These|Those was::word=This \1::filter=none 3 | These|Those is .*/NNS::word=\0 are \2::filter=none 4 | These|Those was .*/NNS::word=\0 were \2::filter=none 5 | These|Those is .*/JJ .*/NNS::word=\0 are \2 \3::filter=none 6 | These|Those was .*/JJ .*/NNS::word=\0 were \2 \3::filter=none 7 | 8 | This are .*/NNS::word=These \1 \2::filter=none 9 | This were .*/NNS::word=Those \1 \2::filter=none 10 | This are .*/JJ .*/NNS::word=These \1 \2 \3::filter=none 11 | This were .*/JJ .*/NNS::word=Those \1 \2 \3::filter=none 12 | This are::word=This is::filter=none 13 | This were::word=This was::filter=none 14 | 15 | # rules for there 16 | 17 | there|There is none::filter=kill 18 | there|There are none|but|today|plenty|way::filter=kill 19 | 20 | there|There is .*/NNS of .*/NN|VBG::filter=kill 21 | there|There are .*/NN of .*/NNS|VBG|JJ::filter=kill 22 | there|There are .*/NN of .*/NN .*/NNS|VBG::filter=kill 23 | there|There are .*/NN .*/NNS::filter=kill 24 | there|There are .*/NN .*/NN .*/NNS::filter=kill 25 | there|There are .*/NN too many::filter=kill 26 | 27 | #there/EX are/VBP plenty/NN of/IN advantages/NNS to/TO 28 | 29 | # according to http://ask.metafilter.com/84536/There-is-or-There-are 30 | # I 
should use the closest noun to determine is/are. So these rules are not 31 | # needed. Just the same I'm commenting them out for future reference. 32 | #there|there is .*/NN and .*/NN::word=\0 are \2 \3 \4::filter=none 33 | #there|There are .*/NN and .*/NN::filter=kill 34 | 35 | there|There are .*/NN::word=\0 is \2::pivots=\1,is 36 | there|There is .*/NNS::word=\0 are \2::pivots=\1,are 37 | there|There is .*/NN .*/NNS::word=\0 are \2 \3::pivots=\1,are 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | there's|There's none::filter=kill 49 | there's|There's none|but|today|plenty::filter=kill 50 | 51 | there's|There's .*/NNS of .*/NN|VBG::filter=kill 52 | there're|There're .*/NN of .*/NNS|VBG|JJ::filter=kill 53 | there're|There're .*/NN of .*/NN .*/NNS|VBG::filter=kill 54 | there're|There're .*/NN .*/NNS::filter=kill 55 | there're|There're .*/NN .*/NN .*/NNS::filter=kill 56 | there're|There're .*/NN too many::filter=kill 57 | 58 | #there/EX are/VBP plenty/NN of/IN advantages/NNS to/TO 59 | 60 | # according to http://ask.metafilter.com/84536/There-is-or-There-are 61 | # I should use the closest noun to determine is/are. So these rules are not 62 | # needed. Just the same I'm commenting them out for future reference. 63 | #there|there is .*/NN and .*/NN::word=\0 are \2 \3 \4::filter=none 64 | #there|There are .*/NN and .*/NN::filter=kill 65 | 66 | There're .*/NN::word=There's \1::pivots=\0,There's 67 | there're .*/NN::word=there's \1::pivots=\0,there's 68 | 69 | There's .*/NNS::word=There are \1::pivots=\0,There are 70 | there's .*/NNS::word=there are \1::pivots=\0,there are 71 | 72 | There's .*/NN .*/NNS::word=There are \1 \2::pivots=\0,There are 73 | there's .*/NN .*/NNS::word=there are \1 \2::pivots=\0,there are 74 | -------------------------------------------------------------------------------- /data/rules/grammar/repeats: -------------------------------------------------------------------------------- 1 | # 2 | # some repeated words, makes no sense. 
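# each rule is pattern::word=replacement::filter=none -- the pattern is a
# literal word sequence, word= gives the suggested rewrite, and filter=none
# appears to bypass the statistical false-positive filter (compare the
# filter=kill rules in data/rules/grammar/det_agreement, which suppress a
# match outright)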
3 | # 4 | 5 | you'll will::word=you will::filter=none 6 | You'll will::word=You will::filter=none 7 | I'll will::word=I will::filter=none 8 | we'll will::word=we will::filter=none 9 | We'll will::word=We will::filter=none 10 | they'll will::word=they will::filter=none 11 | They'll will::word=They will::filter=none 12 | She'll will::word=She will::filter=none 13 | she'll will::word=she will::filter=none 14 | He'll will::word=He will::filter=none 15 | he'll will::word=he will::filter=none 16 | 17 | aren't not::word=are not::filter=none 18 | didn't not::word=did not::filter=none 19 | don't not::word=do not::filter=none 20 | isn't not::word=is not::filter=none 21 | can't not::word=can not::filter=none 22 | weren't not::word=were not::filter=none 23 | wouldn't not::word=would not::filter=none 24 | doesn't not::word=does not::filter=none 25 | hasn't not::word=has not::filter=none 26 | couldn't not::word=could not::filter=none 27 | 28 | Aren't not::word=Are not::filter=none 29 | Didn't not::word=Did not::filter=none 30 | Don't not::word=Do not::filter=none 31 | Isn't not::word=Is not::filter=none 32 | Can't not::word=Can not::filter=none 33 | Weren't not::word=Were not::filter=none 34 | Wouldn't not::word=Would not::filter=none 35 | Doesn't not::word=Does not::filter=none 36 | Hasn't not::word=Has not::filter=none 37 | Couldn't not::word=Could not::filter=none 38 | 39 | it's is::word=it is::filter=none 40 | It's is::word=It is::filter=none 41 | That's is::word=That is::filter=none 42 | that's is::word=that is::filter=none 43 | there's is::word=there is::filter=none 44 | There's is::word=There is::filter=none 45 | he's is::word=he is::filter=none 46 | He's is::word=He is::filter=none 47 | she's is::word=she is::filter=none 48 | She's is::word=She is::filter=none 49 | who's is::word=who is::filter=none 50 | Who's is::word=Who is::filter=none 51 | 52 | we're are::word=we are::filter=none 53 | you're are::word=you are::filter=none 54 | they're are::word=they are::filter=none 55 | We're are::word=We are::filter=none 56 | You're are::word=You are::filter=none 57 | They're are::word=They are::filter=none 58 | Who're are::word=Who are::filter=none 59 | who're are::word=who are::filter=none 60 | 61 | I'm am::word=I am::filter=none 62 | I've have::word=I have::filter=none 63 | 64 | you've have::word=you have::filter=none 65 | we've have::word=we have::filter=none 66 | they've have::word=they have::filter=none 67 | 68 | You've have::word=You have::filter=none 69 | We've have::word=We have::filter=none 70 | They've have::word=They have::filter=none 71 | 72 | I'd would::word=I would::filter=none 73 | 74 | he'd would::word=he would::filter=none 75 | she'd would::word=she would::filter=none 76 | you'd would::word=you would::filter=none 77 | we'd would::word=we would::filter=none 78 | they'd would::word=they would::filter=none 79 | 80 | He'd would::word=He would::filter=none 81 | She'd would::word=She would::filter=none 82 | You'd would::word=You would::filter=none 83 | We'd would::word=We would::filter=none 84 | They'd would::word=They would::filter=none 85 | -------------------------------------------------------------------------------- /utils/spelldata/gen2.sl: -------------------------------------------------------------------------------- 1 | # 2 | # process through corpus, our goal is to associate all misspelt words with a sentence. 
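# the wordfile is read two lines at a time (misspelling first, then the
# correct form); getthree below slides a three-word window -- previous,
# current, next -- over each sentence, padded with the 0BEGIN.0/0END.0
# sentinels, and each correct word is emitted at most once
# (%counts[$current] < 1) as a line of the form:
#
#    previous * next|correct, misspelling
#
# with the misspelling drawn at random from the variants collected for that
# word (rand(%dataset[$current]))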
3 | # 4 | # java -jar sleep.jar gen.sl corpus_file wordfile filetowriteto 5 | # 6 | # wordfile must be in bad\ngood\n order 7 | # 8 | 9 | debug(7 | 34); 10 | 11 | sub getthree 12 | { 13 | local('@words'); 14 | @words = copy($1); 15 | add(@words, '0BEGIN.0'); 16 | push(@words, '0END.0'); 17 | 18 | while (size(@words) >= 3) 19 | { 20 | yield sublist(@words, 0, 3); 21 | @words = sublist(@words, 1); 22 | } 23 | } 24 | 25 | sub process 26 | { 27 | local('@words $entry $previous $current $next'); 28 | 29 | $1 = [$1 trim]; 30 | if ($1 !ismatch '[A-Z][A-Za-z\' ]*?[\.\?\!]') 31 | { 32 | return; 33 | } 34 | 35 | @words = splitIntoWords($1); 36 | 37 | while $entry (getthree(@words)) 38 | { 39 | ($previous, $current, $next) = $entry; 40 | 41 | if (%words[$current] !is $null && %dictionary[$previous] !is $null && %dictionary[$next] !is $null && %counts[$current] < 1) 42 | { 43 | println($output, "$previous * $next $+ |" . join(", ", @($current, rand(%dataset[$current]))) ); 44 | %counts[$current] += 1; 45 | } 46 | } 47 | } 48 | 49 | sub processFile 50 | { 51 | local('$handle $key $data $text @paragraphs'); 52 | 53 | # read in our corpus. 54 | $handle = openf($1); 55 | $text = replace(readb($handle, -1), '<[^>]*?>', ''); 56 | closef($handle); 57 | 58 | # start processing it?!? 59 | @paragraphs = splitByParagraph($text); 60 | map({ map(&process, $1); }, @paragraphs); 61 | 62 | #warn("Processed $1 $+ !"); 63 | } 64 | 65 | sub main 66 | { 67 | global('%dataset $goal %words %counts'); 68 | 69 | # load the words we're interested in. 70 | local('$handle $text $good'); 71 | 72 | $handle = openf($2); 73 | while $text (readln($handle)) 74 | { 75 | $good = readln($handle); 76 | 77 | if (%dataset[$good] is $null) { %dataset[$good] = @(); } 78 | push(%dataset[$good], $text); 79 | %words[$good] += 1; 80 | } 81 | closef($handle); 82 | 83 | $goal = size(%dataset); 84 | 85 | # setup our file that we're going to dump the output to. 86 | global('$output'); 87 | $output = openf("> $+ $3"); 88 | 89 | # ok go through all the junk parsing through the files. 90 | 91 | include("lib/nlp.sl"); 92 | include("lib/dictionary.sl"); 93 | global('%dictionary'); 94 | %dictionary = dictionary(); 95 | %dictionary["0BEGIN.0"] = 1; 96 | %dictionary["0END.0"] = 1; 97 | 98 | # collect list of files. 99 | [{ 100 | if (-isDir $1) 101 | { 102 | map($this, ls($1)); 103 | } 104 | else if ("*Image*.html" !iswm $1 && "*User*.html" !iswm $1) 105 | { 106 | processFile($1); 107 | } 108 | }: $1]; 109 | 110 | 111 | closef($output); 112 | println("Done!"); 113 | } 114 | 115 | assert size(@ARGV) == 3 : "java -jar sleep.jar corpus_data wordlist outputfile"; 116 | 117 | invoke(&main, @ARGV); 118 | -------------------------------------------------------------------------------- /utils/spelldata/gen3.sl: -------------------------------------------------------------------------------- 1 | # 2 | # process through corpus, our goal is to associate all misspelt words with a sentence. 
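# gen3.sl is a variant of gen2.sl: it keeps up to ten contexts per word
# (%counts[$current] < 10 instead of < 1) and lists every collected
# misspelling variant (concat($current, %dataset[$current])) rather than
# one picked at random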
3 | # 4 | # java -jar sleep.jar gen.sl corpus_file wordfile filetowriteto 5 | # 6 | # wordfile must be in bad\ngood\n order 7 | # 8 | 9 | debug(7 | 34); 10 | 11 | sub getthree 12 | { 13 | local('@words'); 14 | @words = copy($1); 15 | add(@words, '0BEGIN.0'); 16 | push(@words, '0END.0'); 17 | 18 | while (size(@words) >= 3) 19 | { 20 | yield sublist(@words, 0, 3); 21 | @words = sublist(@words, 1); 22 | } 23 | } 24 | 25 | sub process 26 | { 27 | local('@words $entry $previous $current $next'); 28 | 29 | $1 = [$1 trim]; 30 | if ($1 !ismatch '[A-Z][A-Za-z\' ]*?[\.\?\!]') 31 | { 32 | return; 33 | } 34 | 35 | @words = splitIntoWords($1); 36 | 37 | while $entry (getthree(@words)) 38 | { 39 | ($previous, $current, $next) = $entry; 40 | 41 | if (%words[$current] !is $null && %dictionary[$previous] !is $null && %dictionary[$next] !is $null && %counts[$current] < 10) 42 | { 43 | println($output, "$previous * $next $+ |" . join(", ", concat($current, %dataset[$current])) ); 44 | %counts[$current] += 1; 45 | } 46 | } 47 | } 48 | 49 | sub processFile 50 | { 51 | local('$handle $key $data $text @paragraphs'); 52 | 53 | # read in our corpus. 54 | $handle = openf($1); 55 | $text = replace(readb($handle, -1), '<[^>]*?>', ''); 56 | closef($handle); 57 | 58 | # start processing it?!? 59 | @paragraphs = splitByParagraph($text); 60 | map({ map(&process, $1); }, @paragraphs); 61 | 62 | #warn("Processed $1 $+ !"); 63 | } 64 | 65 | sub main 66 | { 67 | global('%dataset $goal %words %counts'); 68 | 69 | # load the words we're interested in. 70 | local('$handle $text $good'); 71 | 72 | $handle = openf($2); 73 | while $text (readln($handle)) 74 | { 75 | $good = readln($handle); 76 | 77 | if (%dataset[$good] is $null) { %dataset[$good] = @(); } 78 | push(%dataset[$good], $text); 79 | %words[$good] += 1; 80 | } 81 | closef($handle); 82 | 83 | $goal = size(%dataset); 84 | 85 | # setup our file that we're going to dump the output to. 86 | global('$output'); 87 | $output = openf("> $+ $3"); 88 | 89 | # ok go through all the junk parsing through the files. 90 | 91 | include("lib/nlp.sl"); 92 | include("lib/dictionary.sl"); 93 | global('%dictionary'); 94 | %dictionary = dictionary(); 95 | %dictionary["0BEGIN.0"] = 1; 96 | %dictionary["0END.0"] = 1; 97 | 98 | # collect list of files. 
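# (note that 0BEGIN.0 and 0END.0 were registered in %dictionary above, so
# windows that touch a sentence boundary still pass the "neighbours must
# be dictionary words" test in &process)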
99 | [{ 100 | if (-isDir $1) 101 | { 102 | map($this, ls($1)); 103 | } 104 | else if ("*Image*.html" !iswm $1 && "*User*.html" !iswm $1) 105 | { 106 | processFile($1); 107 | } 108 | }: $1]; 109 | 110 | 111 | closef($output); 112 | println("Done!"); 113 | } 114 | 115 | assert size(@ARGV) == 3 : "java -jar sleep.jar corpus_data wordlist outputfile"; 116 | 117 | invoke(&main, @ARGV); 118 | -------------------------------------------------------------------------------- /utils/tagger/postest.sl: -------------------------------------------------------------------------------- 1 | # 2 | # test the tagger 3 | # 4 | 5 | debug(debug() | 7 | 34); 6 | 7 | include("lib/tagger.sl"); 8 | initTaggerModels(); 9 | 10 | sub both 11 | { 12 | local('$a $b'); 13 | ($a, $b) = @_; 14 | while (size($a) > 0 && size($b) > 0) 15 | { 16 | yield @($a[0], $b[0]); 17 | $a = sublist($a, 1); 18 | $b = sublist($b, 1); 19 | } 20 | } 21 | 22 | sub tests 23 | { 24 | local('$lexicon $handle $count $score $line $item $word $tag $f $compare $taggit $opt $count $word $tag'); 25 | 26 | $handle = openf(@ARGV[0]); 27 | while $line (readln($handle)) 28 | { 29 | $compare = map({ return split('/', $1)[0]; }, split(' ', $line)); 30 | 31 | foreach $f (@_) 32 | { 33 | $taggit = taggerToString([$f tag: $compare]); 34 | 35 | while $opt (both(split(' ', $line), split(' ', $taggit))) 36 | { 37 | ($word, $tag) = split('/', $opt[0]); 38 | 39 | if ($word in $lexdb) 40 | { 41 | if ($opt[0] eq $opt[1]) 42 | { 43 | [$f scoreK]; 44 | } 45 | [$f countK]; 46 | } 47 | else 48 | { 49 | if ($opt[0] eq $opt[1]) 50 | { 51 | [$f scoreU]; 52 | } 53 | [$f countU]; 54 | } 55 | } 56 | } 57 | 58 | $count++; 59 | # if (($count % 2500) == 0 && $count > 0) 60 | # { 61 | # foreach $f (@_) 62 | # { 63 | # [$f print]; 64 | # } 65 | # println("$[-20]count"); 66 | # } 67 | } 68 | 69 | foreach $f (@_) 70 | { 71 | [$f print]; 72 | } 73 | } 74 | 75 | sub test 76 | { 77 | return lambda( 78 | { 79 | if ($0 eq "tag") 80 | { 81 | return invoke($function, @_); 82 | } 83 | else if ($0 eq "scoreK") 84 | { 85 | $scoreK += 1; 86 | } 87 | else if ($0 eq "countK") 88 | { 89 | $countK += 1; 90 | } 91 | else if ($0 eq "scoreU") 92 | { 93 | $scoreU += 1; 94 | } 95 | else if ($0 eq "countU") 96 | { 97 | $countU += 1; 98 | } 99 | else if ($0 eq "print") 100 | { 101 | println("test: $description = known: " . ($scoreK / $countK) . " unknown: " . ($scoreU / $countU) . " composite: " . 
(($scoreK + $scoreU) / ($countK + $countU))); 102 | } 103 | }, $function => $2, $description => $1, $scoreK => 0.0, $countK => 0.0, $scoreU => 0.0, $countU => 0.0); 104 | } 105 | 106 | tests( 107 | # test("pytagger", &taggerPython), 108 | # test("brill-light", &taggerLikeBrill), 109 | test("trigrams", &taggerWithTrigrams), 110 | test("lexprob", &taggerWithLexProb), 111 | # test("trigrams w/ neural", &taggerWithNeuralTrigrams), 112 | # test("trigrams w/ fix", &taggerWithTrigramsFix), 113 | # test("trigrams - no fixes", &taggerWithTrigrams2), 114 | # test("random", &taggerRandom) 115 | # test("HMM", &taggerHMM) 116 | ); 117 | -------------------------------------------------------------------------------- /utils/spell/definitions.sl: -------------------------------------------------------------------------------- 1 | # 2 | # this script creates a dictionary definitions file for AtD from the raw text of the public 3 | # domain OPTED dictionary (Online Plain Text English Dictionary) 4 | # 5 | # Available at: http://msowww.anu.edu.au/~ralph/OPTED/ 6 | # 7 | # Depends on: 8 | # data/rules/homophonedb.txt (list of words we want to create def file for) 9 | # 10 | # Outputs to: 11 | # data/rules/definitions.txt (a worddefinition file) 12 | 13 | debug(7 | 34); 14 | 15 | sub loadWords 16 | { 17 | local('$handle $words $text $word $def'); 18 | $handle = openf("data/rules/homophonedb.txt"); 19 | $words = split(',\s+', join(", ",readAll($handle))); 20 | closef($handle); 21 | 22 | $handle = openf("data/rules/homo/definitions.txt"); 23 | while $text (readln($handle)) 24 | { 25 | ($word, $def) = split('\t+', $text); 26 | push($words, $word); 27 | %alts[$word] = $def; 28 | } 29 | closef($handle); 30 | 31 | map({ $dictionary[$1] = 1; }, sort({ return lc($1) cmp lc($2); }, $words)); 32 | } 33 | 34 | sub suckUpDictFile 35 | { 36 | local('$handle $text $word $pos $definition $check'); 37 | $handle = openf($1); 38 | while $text (readln($handle)) 39 | { 40 | if ($text ismatch '
<P><B>(.*?)</B> \((.*?)\) (.*?)</P>
') 41 | { 42 | ($word, $pos, $definition) = matched(); 43 | if ("See*" iswm $definition || "Alt. of*" iswm $definition || "pl. of" iswm $definition || "of *" iswm $definition) 44 | { 45 | continue; 46 | } 47 | 48 | if ($word in $dictionary && strlen($dictionary[$word]) == 1) 49 | { 50 | $dictionary[$word] = $definition; 51 | } 52 | if (lc($word) in $dictionary && strlen($dictionary[lc($word)]) == 1) 53 | { 54 | $dictionary[lc($word)] = $definition; 55 | } 56 | 57 | $check = lc($word) . "s"; 58 | if ($check in $dictionary && strlen($dictionary[$check]) == 1) 59 | { 60 | $dictionary[$check] = "Plural of " . lc($word) . ". " . $definition; 61 | } 62 | } 63 | } 64 | 65 | closef($handle); 66 | } 67 | 68 | 69 | sub main 70 | { 71 | global('$dictionary %alts'); 72 | $dictionary = ohash(); 73 | loadWords(); 74 | 75 | [{ 76 | if (-isDir $1) 77 | { 78 | map($this, ls($1)); 79 | } 80 | else 81 | { 82 | suckUpDictFile($1); 83 | } 84 | }: "data/OPTED"]; 85 | 86 | local('$word $definition'); 87 | 88 | foreach $word => $definition ($dictionary) 89 | { 90 | if ($definition eq "1" || "See*" iswm $definition || "Alt. of*" iswm $definition || "of *" iswm $definition) 91 | { 92 | [[System err] println: "Substituting: $word = " . %alts[$word]]; 93 | $definition = uc(charAt(%alts[$word], 0)) . substr(%alts[$word], 1); 94 | } 95 | else 96 | { 97 | $definition = split(';', $definition)[0]; 98 | } 99 | 100 | println("$word $+ \t $+ $definition"); 101 | } 102 | } 103 | 104 | invoke(&main, @ARGV); 105 | -------------------------------------------------------------------------------- /utils/bigrams/buildunigrams.sl: -------------------------------------------------------------------------------- 1 | # 2 | # code to load wordlists. 3 | # we use this here because this code actually builds the corpus. 4 | # 5 | # java -jar sleep.jar buildunigrams.sl corpus/ outputfile.bin 6 | 7 | import org.dashnine.preditor.* from: 'lib/spellutils.jar'; 8 | 9 | # 10 | # tool to build a corpus. <3 11 | # 12 | 13 | debug(7 | 34); 14 | 15 | sub process 16 | { 17 | local('@words $head $next $previous'); 18 | 19 | @words = splitIntoWords($1); 20 | add(@words, '0BEGIN.0', 0); 21 | 22 | [$model addUnigram: '0BEGIN.0']; 23 | 24 | while (size(@words) > 1) 25 | { 26 | ($head, $next) = @words; 27 | [$model addUnigram: $next]; 28 | @words = sublist(@words, 1); 29 | } 30 | 31 | [$model addUnigram: '0END.0']; 32 | } 33 | 34 | sub processFile 35 | { 36 | local('$handle $key $data $text @paragraphs'); 37 | 38 | # read in our corpus. 39 | $handle = openf($1); 40 | $text = stripHTML(join("\n", readAll($handle))); 41 | closef($handle); 42 | 43 | # start processing it?!? 
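# (each paragraph from splitByParagraph is a list of sentences; &process
# prepends the 0BEGIN.0 sentinel to a sentence's word list, feeds every
# word to [$model addUnigram: ...], and closes with 0END.0, so sentence
# boundaries are counted alongside ordinary words)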
44 | @paragraphs = splitByParagraph($text); 45 | map({ map(&process, $1); }, @paragraphs); 46 | warn("$1 complete"); 47 | } 48 | 49 | sub agent 50 | { 51 | local('$next $key $data $size $ticks $lsize $lang'); 52 | 53 | include("lib/nlp.sl"); 54 | 55 | $lang = systemProperties()["atd.lang"]; 56 | if ($lang ne "" && -exists "lang/ $+ $lang $+ /load.sl") 57 | { 58 | include("lang/ $+ $lang $+ /load.sl"); 59 | } 60 | 61 | $next = @files[0]; 62 | removeAt(@files, 0); 63 | $size = size(@files); 64 | 65 | println("ready!"); 66 | 67 | while ($next !is $null) 68 | { 69 | processFile($next); 70 | $next = @files[0]; 71 | @files = sublist(@files, 1); 72 | } 73 | } 74 | 75 | sub main 76 | { 77 | global('%dictionary @files %homophones $model $lock'); 78 | 79 | local('$handle'); 80 | 81 | if (-exists $2) 82 | { 83 | $handle = openf($2); 84 | $model = readObject($handle); 85 | closef($handle); 86 | } 87 | else 88 | { 89 | $model = [new LanguageModel]; 90 | } 91 | 92 | # collect list of files. 93 | [{ 94 | if (-isDir $1) 95 | { 96 | map($this, ls($1)); 97 | } 98 | else if ("*Image*.html" !iswm $1 && "*User*.html" !iswm $1) 99 | { 100 | push(@files, $1); 101 | } 102 | }: $1]; 103 | 104 | local('@agents @store $index $value $threads'); 105 | 106 | $threads = 8; 107 | 108 | @store = @(@(), @(), @(), @(), @(), @(), @(), @()); 109 | 110 | foreach $index => $value (@files) 111 | { 112 | push(@store[$index % $threads], $value); 113 | } 114 | 115 | for ($index = 0; $index < $threads; $index++) 116 | { 117 | push(@agents, fork(&agent, @files => copy(@store[$index]), \$model, \%homophones, \%dictionary)); 118 | } 119 | 120 | foreach $index => $value (@agents) 121 | { 122 | wait($value); 123 | warn("Agent $index complete"); 124 | } 125 | 126 | # save model 127 | $handle = openf("> $+ $2"); 128 | writeObject($handle, $model); 129 | closef($handle); 130 | 131 | println("Done!"); 132 | } 133 | 134 | invoke(&main, @ARGV); 135 | -------------------------------------------------------------------------------- /utils/rules/makeprepositions.sl: -------------------------------------------------------------------------------- 1 | $handle = openf(@ARGV[0]); 2 | while $text (readln($handle)) 3 | { 4 | ($first, $second, $type) = matches($text, '(\w+), (\w+) : (\w+)\\(.*'); 5 | if ($type eq 'Pbigram1' && $first ne "wont" && $first ne "continue" && '*ed' !iswm $first && $first ne "attempts") 6 | { 7 | if ($second eq "to") 8 | { 9 | if ($first eq "decided") 10 | { 11 | println(".*/DT $first stir::filter=kill"); 12 | } 13 | else if ($first eq "attempt") 14 | { 15 | println(".*/DT $first be::filter=kill"); 16 | } 17 | else if ($first eq "reference") 18 | { 19 | println(".*/DT $first have::filter=kill"); 20 | } 21 | else if ($first eq "wanted" || $first eq "wants" || $first eq "want") 22 | { 23 | println(".*/PRP $first help::filter=kill"); 24 | println(".*/NNP $first help::filter=kill"); 25 | } 26 | 27 | if (-islower charAt($first, 0)) 28 | { 29 | println(".*/PRP $first .*/VB::word=\\0 \\1 $second \\2::pivots= $+ $first $+ , $+ $first $second"); 30 | println(".*/NNP $first .*/VB::word=\\0 \\1 $second \\2::pivots= $+ $first $+ , $+ $first $second"); 31 | println(".*/DT $first .*/VB::word=\\0 \\1 $second \\2::pivots= $+ $first $+ , $+ $first $second"); 32 | } 33 | else 34 | { 35 | println("0BEGIN.0 $first .*/VB::word=\\0 $second \\1::pivots= $+ $first $+ , $+ $first $second"); 36 | println("0BEGIN.0 $first .*/VB::word=\\0 $second \\1::pivots= $+ $first $+ , $+ $first $second"); 37 | println("0BEGIN.0 $first .*/VB::word=\\0 $second 
\\1::pivots= $+ $first $+ , $+ $first $second"); 38 | } 39 | } 40 | else if ($second eq "of") 41 | { 42 | if ($first eq "couple") 43 | { 44 | println(".*/DT $first .*/NN|NNS::word=\\0 \\1 $second \\2::pivots= $+ $first $+ , $+ $first $second"); 45 | } 46 | else if ($first eq "beware") 47 | { 48 | println(".*/DT $first .*/DT .*/NN|NNS::word=\\0 \\1 $second \\2 \\3::pivots= $+ $first $+ , $+ $first $second"); 49 | println(".*/DT $first .*/NN|NNS::word=\\0 \\1 $second \\2::pivots= $+ $first $+ , $+ $first $second"); 50 | } 51 | } 52 | else if ($second eq "on" || $second eq "with" || $second eq "in") 53 | { 54 | # println("$first .*/DT .*/NN|NNS::word=\\0 $second \\1 \\2::pivots= $+ $first $+ , $+ $first $second"); 55 | # println("$first .*/NN|NNS::word=\\0 $second \\1::pivots= $+ $first $+ , $+ $first $second"); 56 | } 57 | else if ($second ne "of" && $second ne "to") 58 | { 59 | # println("$first $second $+ ::filter=none"); 60 | } 61 | } 62 | else if ($type eq 'Pbigram2') 63 | { 64 | # println(".*/DT .*/NN $first $+ ::word=\\0 \\1 $second \\2::pivots= $+ $first $+ , $+ $second $first"); 65 | # println(".*/VB $first $+ ::word=\\0 $second \\1::pivots=\\1, $+ $second \\1"); 66 | # println(".*/VBD $first $+ ::word=\\0 $second \\1::pivots=\\1, $+ $second \\1"); 67 | # println(".*/VBD .*/PRP $first $+ ::word=\\0 \\1 $second \\2::pivots= $+ $first $+ , $+ $second $first"); 68 | } 69 | } 70 | 71 | -------------------------------------------------------------------------------- /data/rules/biasdb.txt: -------------------------------------------------------------------------------- 1 | African people South Asian peoples 2 | Dwarves Dwarfs 3 | East Indian South Asian 4 | Siamese twins conjoined twins 5 | West Indian Caribbean 6 | afflicted with has a disability, has an illness 7 | amputee person with an amputation 8 | black market underground economy, deals on the side 9 | black sheep reprobate, backslider 10 | blackball ostracize, disapprove, reject 11 | blacklist condemn, ostracize, boycott 12 | blackmail extort, threaten, demand 13 | businessman business person 14 | chairman chair, co-ordinator, convenor 15 | chronic mental illness long-term mental illness, persistent mental illness, psychiatric disability 16 | cleaning woman cleaner 17 | clergyman clergy, deacon, minister, pastor, priest, rabbi 18 | colored people Black peoples, people of African descent, 19 | common man average person, members of the public 20 | confined to a wheelchair uses a wheelchair 21 | craftsman artisan, craftsperson 22 | crippled impaired, flawed, disabled 23 | deaf mute deaf 24 | disabled person person with a disability 25 | disseminate broadcast, inform, publicise 26 | dwarves dwarfs 27 | epileptics individuals with epilepsy 28 | fair sex women 29 | fireman firefighter 30 | forefathers ancestors 31 | founding fathers founders 32 | hearing impaired hard of hearing 33 | housewife homemaker 34 | ladies women 35 | lady woman 36 | layman layperson, average person 37 | low man|woman on the totem pole lowest rung of the ladder 38 | man hours working hours 39 | man in the street public person in the street, public, member of the public 40 | man the \w+s staff the, handle the 41 | man-made synthetic, artificial 42 | mankind civilization, humanity, people 43 | manpower personnel, staff, staffing requirements, workers, workforce 44 | master copy top copy, original 45 | master of ceremonies host, emcee 46 | masterful domineering, very skilful 47 | mentally ill child|adult|person|boy|girl person with mental illness, person with 
psychiatric disability 48 | middleman wholesaler, go-between, intermediary 49 | mistress of ceremonies host, emcee 50 | newsman journalist, reporter 51 | niggard miser 52 | niggardly miserly, stingy 53 | non-whites people of colour 54 | old masters classic art, artists 55 | one man show one person show 56 | Oriental Asian 57 | orientals Asian peoples, East Asian peoples, Southeast Asian peoples 58 | paraplegics individuals with paraplegia 59 | physically challenged physically disabled 60 | policeman officer, police officer 61 | postman postal worker, mail carrier 62 | primitive societies non-industrial societies 63 | retarded adult adult with mental retardation 64 | right-hand man assistant 65 | salesman clerk, sales rep 66 | schizophrenics people who have schizophrenia 67 | seminal classical, formative 68 | sexual preference sexual orientation, gender orientation 69 | spokesman spokesperson, representative, speaker, official 70 | stewardess flight attendant 71 | suffering from has a disability, has an illness 72 | the crippled people with a disability 73 | the disabled persons with disabiliites, people with disabilities 74 | the handicapped people with disabilities 75 | the man in the street people in general 76 | the rights of man peoples/citizens rights, the rights of the individual 77 | tribes ethnic groups 78 | wheelchair-bound uses a wheelchair 79 | wives and children families, family 80 | workman worker 81 | -------------------------------------------------------------------------------- /data/rules/grammar/determiners: -------------------------------------------------------------------------------- 1 | # These rules look for missing determiners 2 | 3 | .*/VBP &determiner_wanted for|of|in|to|so|from|with::word=\0 \1:determiner \1 \2, \0 \1:determiner2 \1 \2::pivots=\1,\1:determiner \1,\1:determiner2 \1 4 | .*/VBP &determiner_wanted 0END.0::word=\0 \1:determiner \1, \0 \1:determiner2 \1::pivots=\1,\1:determiner \1,\1:determiner2 \1 5 | .*/VBP .*/JJ &determiner_wanted for|of|in|to|so|from|with::word=\0 \1:determiner \1 \2 \3, \0 \1:determiner2 \1 \2 \3::pivots=\1,\1:determiner \1,\1:determiner2 \1 6 | .*/VBP .*/JJ &determiner_wanted 0END.0::word=\0 \1:determiner \1 \2, \0 \1:determiner2 \1 \2::pivots=\1,\1:determiner \1,\1:determiner2 \1 7 | 8 | .*/VBZ &determiner_wanted for|of|in|to|so|from|with::word=\0 \1:determiner \1 \2, \0 \1:determiner2 \1 \2::pivots=\1,\1:determiner \1,\1:determiner2 \1 9 | .*/VBZ &determiner_wanted 0END.0::word=\0 \1:determiner \1, \0 \1:determiner2 \1::pivots=\1,\1:determiner \1,\1:determiner2 \1 10 | .*/VBZ .*/JJ &determiner_wanted for|of|in|to|so|from|with::word=\0 \1:determiner \1 \2 \3, \0 \1:determiner2 \1 \2 \3::pivots=\1,\1:determiner \1,\1:determiner2 \1 11 | .*/VBZ .*/JJ &determiner_wanted 0END.0::word=\0 \1:determiner \1 \2, \0 \1:determiner2 \1 \2::pivots=\1,\1:determiner \1,\1:determiner2 \1 12 | 13 | .*/MD .*/VB &determiner_wanted for|of|in|to|so|from|with::word=\0 \1 \2:determiner \2 \3, \0 \1 \2:determiner2 \2 \3::pivots=\2,\2:determiner \2,\2:determiner2 \2 14 | .*/MD .*/VB &determiner_wanted 0END.0::word=\0 \1 \2:determiner \2, \0 \1 \2:determiner2 \2::pivots=\2,\2:determiner \2,\2:determiner2 \2 15 | .*/MD .*/VB .*/JJ &determiner_wanted for|of|in|to|so|from|with::word=\0 \1 \2:determiner \2 \3 \4, \0 \1 \2:determiner2 \2 \3 \4::pivots=\2,\2:determiner \2,\2:determiner2 \2 16 | .*/MD .*/VB .*/JJ &determiner_wanted 0END.0::word=\0 \1 \2:determiner \2 \3, \0 \1 \2:determiner2 \2 \3::pivots=\2,\2:determiner \2,\2:determiner2 \2 17 | 18 | .*/PRP 
18 | .*/PRP .*/VBD &determiner_wanted for|of|in|to|so|from|with::word=\0 \1 \2:determiner \2 \3, \0 \1 \2:determiner2 \2 \3::pivots=\2,\2:determiner \2,\2:determiner2 \2
19 | .*/PRP .*/VBD &determiner_wanted 0END.0::word=\0 \1 \2:determiner \2, \0 \1 \2:determiner2 \2::pivots=\2,\2:determiner \2,\2:determiner2 \2
20 | .*/PRP .*/VBD .*/JJ &determiner_wanted for|of|in|to|so|from|with::word=\0 \1 \2:determiner \2 \3 \4, \0 \1 \2:determiner2 \2 \3 \4::pivots=\2,\2:determiner \2,\2:determiner2 \2
21 | .*/PRP .*/VBD .*/JJ &determiner_wanted 0END.0::word=\0 \1 \2:determiner \2 \3, \0 \1 \2:determiner2 \2 \3::pivots=\2,\2:determiner \2,\2:determiner2 \2
22 |
23 | .*/PRP be &determiner_wanted::filter=kill
24 | .*/PRP .*/VB &determiner_wanted for|of|in|to|so|from|with::word=\0 \1 \2:determiner \2 \3, \0 \1 \2:determiner2 \2 \3::pivots=\2,\2:determiner \2,\2:determiner2 \2
25 | .*/PRP .*/VB &determiner_wanted 0END.0::word=\0 \1 \2:determiner \2, \0 \1 \2:determiner2 \2::pivots=\2,\2:determiner \2,\2:determiner2 \2
26 | .*/PRP .*/VBP\,RB .*/VB &determiner_wanted for|of|in|to|so|from|with::word=\0 \1 \2 \3:determiner \3 \4, \0 \1 \2 \3:determiner2 \3 \4::pivots=\3,\3:determiner \3,\3:determiner2 \3
27 | .*/PRP .*/VBP\,RB .*/VB &determiner_wanted 0END.0::word=\0 \1 \2 \3:determiner \3, \0 \1 \2 \3:determiner2 \3::pivots=\3,\3:determiner \3,\3:determiner2 \3
28 | .*/PRP .*/VBP &determiner_wanted .*ing::word=\0 \1 \2:determiner \2 \3, \0 \1 \2:determiner2 \2 \3::pivots=\2,\2:determiner \2,\2:determiner2 \2
29 |
-------------------------------------------------------------------------------- /utils/spelldata/gen.sl: --------------------------------------------------------------------------------
1 | #
2 | # process through corpus, our goal is to associate all misspelt words with a sentence.
3 | #
4 | # java -jar sleep.jar gen.sl corpus_file wordfile filetowriteto
5 | #
6 | # wordfile must be in bad\ngood\n order
7 | #
8 |
9 | debug(7 | 34);
10 |
11 | sub process
12 | {
13 | local('@words $head $next $count $candidate $prev $indict');
14 |
15 | $1 = [$1 trim];
16 | if ($1 !ismatch '[A-Z][A-Za-z\' ]*?[\.\?\!]')
17 | {
18 | return;
19 | }
20 |
21 | if ("we're" isin $1 || "they're" isin $1 || "it's" isin $1)
22 | {
23 | warn("Could be? $1");
24 | }
25 |
26 | @words = splitIntoWords($1);
27 | $count = 0;
28 |
29 | # make sure there is only one misspelling in this sentence.
30 | foreach $word (@words)
31 | {
32 | if (%words[$word] !is $null)
33 | {
34 | $candidate = $word;
35 | $count++;
36 | }
37 |
# note: $indict counts words that are NOT in the dictionary; a sentence is kept only when it stays 0
38 | if (%dictionary[$word] is $null)
39 | {
40 | $indict++;
41 | }
42 | }
43 |
44 | if ($count == 1 && size(@words) >= 3 && %counts[$candidate] < 10 && $indict == 0)
45 | {
46 | $change = replace($1, "\\b $+ $candidate $+ \\b", '*');
47 |
48 | println($output, "$change $+ |" . join(", ", concat(@($candidate), %dataset[$candidate]) ));
49 | %counts[$candidate] += 1;
50 | }
51 | else if ("we're" isin $1 || "they're" isin $1 || "it's" isin $1)
52 | {
53 | warn("Could be? $1 - Nope: $count and " . %counts[$candidate] . " and $indict");
54 | }
55 | }
56 |
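# Each output line pairs a sentence (candidate word replaced by *) with the
# candidate and its known misspellings, e.g. (hypothetical):
#   He drove to the * after work.|store, stoer, sotre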
57 | sub processFile
58 | {
59 | local('$handle $key $data $text @paragraphs');
60 |
61 | # read in our corpus.
62 | $handle = openf($1);
63 | $text = replace(readb($handle, -1), '<[^>]*?>', '');
64 | closef($handle);
65 |
66 | # start processing it?!?
67 | @paragraphs = splitByParagraph($text);
68 | map({ map(&process, $1); }, @paragraphs);
69 |
70 | #warn("Processed $1 $+ !");
71 | }
72 |
73 | sub main
74 | {
75 | global('%dataset $goal %words %counts');
76 |
77 | # load the words we're interested in.
78 | local('$handle $text $good');
79 |
80 | $handle = openf($2);
81 | while $text (readln($handle))
82 | {
83 | $good = readln($handle);
84 |
85 | if (%dataset[$good] is $null) { %dataset[$good] = @(); }
86 | push(%dataset[$good], $text);
87 | %words[$good] += 1;
88 | }
89 | closef($handle);
90 |
91 | $goal = size(%dataset);
92 |
93 | # setup our file that we're going to dump the output to.
94 | global('$output');
95 | $output = openf("> $+ $3");
96 |
97 | # ok go through all the junk parsing through the files.
98 |
99 | include("nlp.sl");
100 | include("dictionary.sl");
101 | global('%dictionary');
102 | %dictionary = dictionary();
103 |
104 | # collect list of files.
105 | [{
106 | if (-isDir $1)
107 | {
108 | map($this, ls($1));
109 | }
110 | else if ("*Image*.html" !iswm $1 && "*User*.html" !iswm $1)
111 | {
112 | processFile($1);
113 | }
114 | }: $1];
115 |
116 |
117 | closef($output);
118 | println("Done!");
119 | }
120 |
121 | assert size(@ARGV) == 3 : "java -jar sleep.jar corpus_data wordlist outputfile";
122 |
123 | invoke(&main, @ARGV);
124 |
-------------------------------------------------------------------------------- /data/rules/grammar/combine: --------------------------------------------------------------------------------
1 | #
2 | # words that should be combined
3 | #
4 |
5 | # every day (daily) vs. everyday (common)
6 |
7 | an|in|for|the|to every day::word=\0 everyday::pivots=\1 \2,everyday
8 |
9 | # before hand -> beforehand
10 |
11 | before hand::word=beforehand
12 |
13 | # an other -> another
14 |
15 | an other::word=another
16 |
17 | # all ways -> always (unless referring to everything) like she is better than him in all ways.
<- this is ok 18 | in all ways::filter=kill 19 | all ways::word=always 20 | 21 | # every where -> everywhere 22 | 23 | every where::word=everywhere 24 | 25 | # 26 | # more words to combine 27 | # 28 | eye sight::word=eyesight 29 | eye sore::word=eyesore 30 | figure head::word=figurehead 31 | flag ship::word=flagship 32 | head gear::word=headgear 33 | head quarters::word=headquarters 34 | head stone::word=headstone 35 | head wear::word=headwear 36 | how ever::word=however 37 | in stead of::word=instead of 38 | in tact::word=intact 39 | it self::word=itself 40 | key note::word=keynote 41 | laughing stock::word=laughingstock 42 | life time::word=lifetime 43 | mean while::word=meanwhile 44 | nation wide::word=nationwide 45 | near by::word=nearby 46 | new comer::word=newcomer 47 | no where to::word=nowhere to 48 | note worthy::word=noteworthy 49 | now a days::word=nowadays 50 | on going::word=ongoing 51 | out grow::word=outgrow 52 | out side::word=outside 53 | over looked::word=overlooked 54 | over looking::word=overlooking 55 | over rated::word=overrated 56 | over seas::word=overseas 57 | short coming::word=shortcoming 58 | short cut::word=shortcut 59 | side kick::word=sidekick 60 | sky diving::word=skydiving 61 | some how::word=somehow 62 | some what::word=somewhat 63 | stale mate::word=stalemate 64 | them selves::word=themselves 65 | back fire::word=backfire 66 | world wide::word=worldwide 67 | worth while::word=worthwhile 68 | where as::word=whereas 69 | where by::word=whereby 70 | where upon::word=whereupon 71 | #with in an|a|the second|minute|hour|year|decade|century|day::word=within \2 \3::filter=none 72 | with in::word=within 73 | with out::word=without 74 | way side::word=wayside 75 | along side::word=alongside 76 | be cause::word=because 77 | be ware::word=beware 78 | before hand::word=beforehand 79 | down side::word=downside 80 | eye brow::word=eyebrow 81 | eye lash::word=eyelash 82 | eye lid::word=eyelid 83 | through out::word=throughout 84 | on-going::word=ongoing 85 | light weight::word=lightweight 86 | heavy weight::word=heavyweight 87 | free lance::word=freelance 88 | free lancer::word=freelancer 89 | free lances::word=freelances 90 | free lancing::word=freelancing 91 | 92 | # awhile is an adverb, should be used after a verb 93 | .*/VB a while::word=\0 awhile::pivots=a while,awhile 94 | 95 | # join web site into website 96 | web site::word=website 97 | Web Site|site::word=Website 98 | 99 | head scarf::word=headscarf 100 | head scarves::word=headscarves 101 | 102 | key words::word=keywords 103 | crowd sourcing::word=crowdsourcing 104 | meta data::word=metadata 105 | mis .*::word=\0\1::filter=sane 106 | 107 | stand alone::word=standalone 108 | past time::word=pastime 109 | any where::word=anywhere 110 | some where::word=somewhere 111 | no where::word=nowhere 112 | .*/DT bail out::word=\0 bailout::pivots=bail out,bailout 113 | 114 | out come::word=outcome 115 | 116 | -------------------------------------------------------------------------------- /utils/spelldata/gen4.sl: -------------------------------------------------------------------------------- 1 | # 2 | # process through corpus, our goal is to associate all misspelt words with a sentence. 
3 | #
4 | # java -jar sleep.jar gen4.sl wordfile corpus_data outputfile max_entries_per_word
5 | #
6 | # wordfile must be in bad\ngood\n order
7 | #
8 |
9 | debug(7 | 34);
10 |
11 | sub getnext
12 | {
13 | local('@words');
14 | @words = copy($1);
15 | add(@words, @('0BEGIN.0', 'UNK'));
16 | push(@words, @('0END.0', 'UNK'));
17 |
18 | while (size(@words) >= 5)
19 | {
20 | yield sublist(@words, 0, 5);
21 | @words = sublist(@words, 1);
22 | }
23 | }
24 |
25 | sub process
26 | {
27 | local('@words $entry $previous $current $next $pre2 $pre1 $next1 $next2');
28 |
29 | $1 = [$1 trim];
30 | if ($1 !ismatch '[A-Z][A-Za-z\'\,\- ]*?[\.\?\!]{0,1}')
31 | {
32 | return;
33 | }
34 |
35 | @words = taggerWithTrigrams(splitIntoWords($1));
36 |
37 | while $entry (getnext(@words))
38 | {
39 | ($pre2, $pre1, $current, $next1, $next2) = map({ return $1[0]; }, $entry);
40 |
41 | if (%words[$current] !is $null && %dictionary[$pre2] !is $null && %dictionary[$pre1] !is $null && %dictionary[$next1] !is $null && %dictionary[$next2] !is $null && %counts[$current] < $max)
42 | {
43 | ($pre2, $pre1, $current, $next1, $next2) = map({ return join('/', $1); }, $entry);
44 |
45 | println($output, "$pre2 $pre1 * $next1 $next2 $+ |" . join("; ", concat($current, %dataset[$entry[2][0]])) );
46 | %counts[$entry[2][0]] += 1;
47 | }
48 | }
49 | }
50 |
51 | sub processFile
52 | {
53 | local('$handle $key $data $text @paragraphs');
54 |
55 | # read in our corpus.
56 | $handle = openf($1);
57 | $text = replace(readb($handle, -1), '<[^>]*?>', '');
58 | closef($handle);
59 |
60 | # start processing it?!?
61 | @paragraphs = splitByParagraph($text);
62 | map(lambda({ map(lambda(&process, \$max), $1); }, \$max), @paragraphs);
63 |
64 | #warn("Processed $1 $+ !");
65 | }
66 |
67 | sub main
68 | {
69 | global('%dataset $goal %words %counts');
70 |
71 | # load the words we're interested in.
72 | local('$handle $text $good');
73 |
74 | $handle = openf($1);
75 | while $text (readln($handle))
76 | {
77 | $good = readln($handle);
78 |
79 | if (%dataset[$good] is $null) { %dataset[$good] = @(); }
80 | push(%dataset[$good], $text);
81 | %words[$good] += 1;
82 | }
83 | closef($handle);
84 |
85 | $goal = size(%dataset);
86 |
87 | # setup our file that we're going to dump the output to.
88 | global('$output');
89 | $output = openf("> $+ $3");
90 |
91 | # ok go through all the junk parsing through the files.
92 |
93 | include("lib/nlp.sl");
94 | include("lib/dictionary.sl");
95 | include("lib/tagger.sl");
96 |
97 | global('%dictionary');
98 | %dictionary = dictionary();
99 | %dictionary["0BEGIN.0"] = 1;
100 | %dictionary["0END.0"] = 1;
101 |
102 | initTaggerModels();
103 |
104 | # collect list of files.
105 | [{
106 | if (-isDir $1)
107 | {
108 | map($this, ls($1));
109 | }
110 | else if ("*Image*.html" !iswm $1 && "*User*.html" !iswm $1)
111 | {
112 | processFile($1, \$max);
113 | }
114 | }: $2, $max => $4];
115 |
116 |
117 | closef($output);
118 | println("Done!");
119 | }
120 |
121 | assert size(@ARGV) == 4 : "java -jar sleep.jar gen4.sl wordfile corpus_data outputfile max_entries_per_word";
122 |
123 | invoke(&main, @ARGV);
124 |
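# usage example (hypothetical paths); note the wordfile comes first here, unlike gen.sl:
#   java -jar sleep.jar utils/spelldata/gen4.sl words.txt corpus/ gen4.out 10
# where words.txt holds bad\ngood\n pairs and 10 caps the entries emitted per word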
-------------------------------------------------------------------------------- /utils/rules/testgr.sl: --------------------------------------------------------------------------------
1 | #
2 | # This is a script to test grammar rules. It's fun stuff.
3 | #
4 | # java -jar sleep.jar utils/rules/testgr.sl testfile [missing|wrong]
5 | #
6 |
7 | debug(7 | 34);
8 |
9 | include("lib/engine.sl");
10 | include("utils/rules/rules.sl");
11 | include("utils/common/score.sl");
12 |
# (empty stub: presumably keeps the engine's spell checker out of the way so only grammar rules are measured)
13 | sub checkSentenceSpelling
14 | {
15 | }
16 |
17 | sub initAll
18 | {
19 | global('$__SCRIPT__ $model $rules $dictionary $network $dsize %edits $hnetwork $account $usage $endings $lexdb $trigrams $verbs');
20 | $model = get_language_model();
21 | $dictionary = dictionary();
22 | $rules = get_rules();
23 | $dsize = size($dictionary);
24 | $hnetwork = get_network("hnetwork4.bin");
25 | $verbs = loadVerbData();
26 | initTaggerModels();
27 | }
28 |
29 | sub measure
30 | {
31 | local('@results $options $correct $score $s_score $good $index $r @suggs $debug');
32 | (@results, $options, $correct, $score, $s_score, $debug) = @_;
33 |
34 | if (size(@results) > 0)
35 | {
36 | foreach $index => $r (@results)
37 | {
38 | local('$rule $text $path $context @suggestions');
39 | ($rule, $text, $path, $context, @suggestions) = $r;
40 |
41 | if (!-isarray @suggestions) { @suggestions = split(', ', @suggestions); }
42 |
43 | if ($text eq $options[0])
44 | {
45 | @suggs = filter(lambda({ return iff($1 in $options, 1); }, $options => sublist($options, 1)), @suggestions);
46 |
47 | if (size(@suggs) > 0)
48 | {
49 | [$score correctSugg];
50 | [$s_score correctSugg];
51 |
52 | if ($correct in @suggestions)
53 | {
54 | [$score correct];
55 | [$s_score correct];
56 | }
57 | }
58 | else if ('wrong' isin $debug)
59 | {
60 | println("$wrong => $text");
61 | println(" - entry: " . $entry);
62 | println(" - expect: " . sublist($options, 1));
63 | println(" - options: " . @suggestions);
64 | println(" - " . $rule['category'] . ' = ' . $rule['rule'] );
65 | }
66 | $good = 1;
67 |
68 | [$s_score record];
69 | }
70 | }
71 | }
72 |
73 | if (!$good)
74 | {
75 | [$score falseNegative]; # move if $text eq options[1] never happens
76 |
77 | if ('missing' isin $debug)
78 | {
79 | println("$wrong => $text");
80 | println(" - entry: " . $entry);
81 | println(" - expect: " . sublist($options, 1));
82 | }
83 | }
84 |
85 | [$score record];
86 | }
87 |
88 | sub main
89 | {
90 | local('$handle $sentence $entry @results $options $correct $wrong $score1 $score2 $2');
91 |
92 | $score1 = newObject('score', "Suggestion score for $1");
93 | $score2 = newObject('score', "Grammar score for $1");
94 |
95 | initAll();
96 |
97 | $handle = openf($1);
98 | while $entry (readln($handle))
99 | {
# each test entry: sentence with a * placeholder|wrong word, expected suggestion(s)|correct word
100 | ($sentence, $options, $correct) = split('\|', $entry);
101 | $options = split(', ', $options);
102 |
103 | $wrong = strrep($sentence, ' * ', " " . $options[0] . 
" "); 104 | 105 | @results = @(); 106 | processSentence($sentence => $wrong, \@results); 107 | 108 | measure(@results, $options, $correct, $score2, $score1, $2, \$entry, \$wrong); 109 | } 110 | 111 | [$score1 print]; 112 | [$score2 print]; 113 | } 114 | 115 | invoke(&main, @ARGV); 116 | -------------------------------------------------------------------------------- /data/rules/diacritic/diaeresis: -------------------------------------------------------------------------------- 1 | # 2 | # http://en.wikipedia.org/wiki/Diaeresis 3 | # 4 | 5 | achroodextrin::word=achroödextrin::filter=none 6 | aedes::word=aëdes::filter=none 7 | Ajie::word=Ajië::filter=none 8 | Bootes::word=Boötes::filter=none 9 | chiliaedron::word=chiliaëdron::filter=none 10 | Chloe::word=Chloë::filter=none 11 | cooperate::word=coöperate::filter=none 12 | cooperation::word=coöperation::filter=none 13 | coopt::word=coöpt::filter=none 14 | coordinate::word=coördinate::filter=none 15 | coordinated::word=coördinated::filter=none 16 | coordinately::word=coördinately::filter=none 17 | coordinateness::word=coördinateness::filter=none 18 | coordinates::word=coördinates::filter=none 19 | coordination::word=coördination::filter=none 20 | coordinative::word=coördinative::filter=none 21 | coordinator::word=coördinator::filter=none 22 | diploe::word=diploë::filter=none 23 | eleemosynary::word=eleëmosynary::filter=none 24 | naive::word=naïve::filter=none 25 | naively::word=naïvely::filter=none 26 | noel::word=noël::filter=none 27 | Noel::word=Noël::filter=none 28 | oogone::word=oögone::filter=none 29 | ooidal::word=oöidal::filter=none 30 | oology::word=oölogy::filter=none 31 | preempt::word=preëmpt::filter=none 32 | preempted::word=preëmpted::filter=none 33 | preemptible::word=preëmptible::filter=none 34 | preemption::word=preëmption::filter=none 35 | preemptioner::word=preëmptioner::filter=none 36 | preemptive::word=preëmptive::filter=none 37 | preemptively::word=preëmptively::filter=none 38 | preemptor::word=preëmptor::filter=none 39 | preemptory::word=preëmptory::filter=none 40 | preexisting::word=preëxisting::filter=none 41 | reeducate::word=reëducate::filter=none 42 | reelect::word=reëlect::filter=none 43 | reenter::word=reënter::filter=none 44 | reentry::word=reëntry::filter=none 45 | reexamination::word=reëxamination::filter=none 46 | reexamine::word=reëxamine::filter=none 47 | reextend::word=reëxtend::filter=none 48 | uncoordinate::word=uncoördinate::filter=none 49 | uncoordinated::word=uncoördinated::filter=none 50 | vacuum::word=vacuüm::filter=none 51 | zoea::word=zoëa::filter=none 52 | zoochemistry::word=zoöchemistry::filter=none 53 | zoochemy::word=zoöchemy::filter=none 54 | zoochlorella::word=zoöchlorella::filter=none 55 | zoocyst::word=zoöcyst::filter=none 56 | zoocytium::word=zoöcytium::filter=none 57 | zooerythrine::word=zoöerythrine::filter=none 58 | zoogeography::word=zoögeography::filter=none 59 | zooglaea::word=zoöglœa::filter=none 60 | zoographer::word=zoögrapher::filter=none 61 | zoography::word=zoögraphy::filter=none 62 | zoolatry::word=zoölatry::filter=none 63 | zoology::word=zoölogy::filter=none 64 | zoomelanin::word=zoömelanin::filter=none 65 | zoomorphism::word=zoömorphism::filter=none 66 | zoon::word=zoön::filter=none 67 | zoonite::word=zoönite::filter=none 68 | zoonomy::word=zoönomy::filter=none 69 | zoonule::word=zoönule::filter=none 70 | zoopathology::word=zoöpathology::filter=none 71 | zoophaga::word=zoöphaga::filter=none 72 | zoophagan::word=zoöphagan::filter=none 73 | 
zoophagous::word=zoöphagous::filter=none 74 | zoophilist::word=zoöphilist::filter=none 75 | zoophily::word=zoöphily::filter=none 76 | zoophite::word=zoöphite::filter=none 77 | zoophorous::word=zoöphorous::filter=none 78 | Zoophyta::word=Zoöphyta::filter=none 79 | zoophyte::word=zoöphyte::filter=none 80 | zoophytic::word=zoöphytic::filter=none 81 | zoophytology::word=zoöphytology::filter=none 82 | zoopraxiscope::word=zoöpraxiscope::filter=none 83 | zoopsychology::word=zoöpsychology::filter=none 84 | zoosperm::word=zoösperm::filter=none 85 | zoosporangium::word=zoösporangium::filter=none 86 | zoospore::word=zoöspore::filter=none 87 | zoospores::word=zoöspores::filter=none 88 | zootic::word=zoötic::filter=none 89 | zootomist::word=zoötomist::filter=none 90 | zootomy::word=zoötomy::filter=none 91 | zootrophic::word=zoötrophic::filter=none 92 | -------------------------------------------------------------------------------- /data/rules/grammar/aux_been_was: -------------------------------------------------------------------------------- 1 | been .*/VB|VBP .*/IN::word=\0 \1:participle \2::pivots=\1,\1:participle 2 | been .*/RB .*/VB|VBP .*/IN::word=\0 \1 \2:participle \3::pivots=\2,\2:participle 3 | been .*/VB|VBP|VBD .*/VBN|VB::word=\0 \1:present \2::pivots=\1,\1:present 4 | been .*/RB .*/VB|VBP|VBD .*/VBN|VB::word=\0 \1 \2:present \3::pivots=\2,\2:present 5 | was .*/VB 0END.0::word=\0 \1:participle::pivots=\1,\1:participle 6 | was .*/VB|VBP .*/IN::word=\0 \1:participle \2::pivots=\1,\1:participle 7 | was .*/RB .*/VB|VBP .*/IN::word=\0 \1 \2:participle \3::pivots=\2,\2:participle 8 | was .*/VB|VBP|VBD .*/VBN|VB::word=\0 \1:present \2::pivots=\1,\1:present 9 | was .*/RB .*/VB|VBP|VBD .*/VBN|VB::word=\0 \1 \2:present \3::pivots=\2,\2:present 10 | were .*/VB|VBP .*/IN::word=\0 \1:participle \2::pivots=\1,\1:participle 11 | were .*/RB .*/VB|VBP .*/IN::word=\0 \1 \2:participle \3::pivots=\2,\2:participle 12 | were .*/VB|VBP|VBD .*/VBN|VB::word=\0 \1:present \2::pivots=\1,\1:present 13 | were .*/RB .*/VB|VBP|VBD .*/VBN|VB::word=\0 \1 \2:present \3::pivots=\2,\2:present 14 | are .*/VB|VBP .*/IN::word=\0 \1:participle \2::pivots=\1,\1:participle 15 | are .*/RB .*/VB|VBP .*/IN::word=\0 \1 \2:participle \3::pivots=\2,\2:participle 16 | are .*/VB|VBP|VBD .*/VBN|VB::word=\0 \1:present \2::pivots=\1,\1:present 17 | are .*/RB .*/VB|VBP|VBD .*/VBN|VB::word=\0 \1 \2:present \3::pivots=\2,\2:present 18 | am .*/VB|VBP .*/IN::word=\0 \1:participle \2::pivots=\1,\1:participle 19 | am .*/RB .*/VB|VBP .*/IN::word=\0 \1 \2:participle \3::pivots=\2,\2:participle 20 | am .*/VB|VBP|VBD .*/VBN|VB::word=\0 \1:present \2::pivots=\1,\1:present 21 | am .*/RB .*/VB|VBP|VBD .*/VBN|VB::word=\0 \1 \2:present \3::pivots=\2,\2:present 22 | is .*/VB|VBP for|by|as::word=\0 \1:participle \2::pivots=\1,\1:participle 23 | is .*/RB .*/VB|VBP for|by|as::word=\0 \1 \2:participle \3::pivots=\2,\2:participle 24 | is .*/VB|VBP|VBD .*/VBN|VB::word=\0 \1:present \2::pivots=\1,\1:present 25 | is .*/RB .*/VB|VBP|VBD .*/VBN|VB::word=\0 \1 \2:present \3::pivots=\2,\2:present 26 | do is .*/VB::filter=kill 27 | been .*/VB|VBP|VBD .*/TO|DT::word=\0 \1:participle \2, \0 \1:present \2::pivots=\1,\1:participle,\1:present 28 | been .*/RB .*/VB|VBP|VBD .*/TO|DT::word=\0 \1 \2:participle \3, \0 \1 \2:present \3::pivots=\2,\2:participle,\2:present 29 | was .*/VB|VBP|VBD .*/TO::word=\0 \1:participle \2, \0 \1:present \2::pivots=\1,\1:participle,\1:present 30 | was .*/RB .*/VB|VBP|VBD .*/TO::word=\0 \1 \2:participle \3, \0 \1 \2:present 
\3::pivots=\2,\2:participle,\2:present
31 | were .*/VB|VBP|VBD .*/TO|DT::word=\0 \1:participle \2, \0 \1:present \2::pivots=\1,\1:participle,\1:present
32 | were .*/RB .*/VB|VBP|VBD .*/TO|DT::word=\0 \1 \2:participle \3, \0 \1 \2:present \3::pivots=\2,\2:participle,\2:present
33 | are .*/VB|VBP|VBD .*/TO::word=\0 \1:participle \2, \0 \1:present \2::pivots=\1,\1:participle,\1:present
34 | are .*/RB .*/VB|VBP|VBD .*/TO::word=\0 \1 \2:participle \3, \0 \1 \2:present \3::pivots=\2,\2:participle,\2:present
35 | am .*/VB|VBP|VBD .*/TO|DT::word=\0 \1:participle \2, \0 \1:present \2::pivots=\1,\1:participle,\1:present
36 | am .*/RB .*/VB|VBP|VBD .*/TO|DT::word=\0 \1 \2:participle \3, \0 \1 \2:present \3::pivots=\2,\2:participle,\2:present
37 | is .*/VB|VBP|VBD .*/TO::word=\0 \1:participle \2, \0 \1:present \2::pivots=\1,\1:participle,\1:present
38 | is .*/RB .*/VB|VBP|VBD .*/TO::word=\0 \1 \2:participle \3, \0 \1 \2:present \3::pivots=\2,\2:participle,\2:present
39 | it is .*/VBP|VB|VBD::word=\0 \1 \2:participle::pivots=\2,\2:participle
40 | It is .*/VBP|VB|VBD::word=\0 \1 \2:participle::pivots=\2,\2:participle
41 | is .*/VBP|VBD|VB as|for|to::word=\0 \1:participle \2::pivots=\1,\1:participle
42 |
43 | # are [base verb mistagged as a noun] to -> are [past tense] to
44 | are .*(?
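# e.g. (hypothetical): "the report was write by him" should match this file's
# 'was .*/VB|VBP .*/IN' pattern, and the :participle transform then suggests
# "the report was written by him".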
["Rich Editor Help" HTML view; its file separator and markup were stripped from this dump]
-------------------------------------------------------------------------------- /utils/common/spelltests.sl: --------------------------------------------------------------------------------
1 | #
2 | # this is a script to run unit tests and calculate the effectiveness of the
3 | # preditor engine
4 | #
5 |
6 | sub testSpellingNoContext
7 | {
8 | local('$handle $score $bad $good');
9 | $handle = openf("tests/tests2.txt");
10 |
11 | $score = newObject("score", "Spellchecker w/ No Context");
12 |
13 | while $bad (readln($handle))
14 | {
15 | $good = readln($handle);
16 | if ($dictionary[$bad] !is $null)
17 | {
18 | local('$source $size');
19 | [$score falseNegative];
20 | }
21 | else
22 | {
23 | [$score correct];
24 | }
25 |
26 | if ($dictionary[$good] is $null)
27 | {
28 | [$score falsePositive];
29 | }
30 |
31 | [$score record];
32 | }
33 |
34 | [$score print];
35 | }
36 |
37 | sub testSoundEx
38 | {
39 | local('$score $entry $bad $good');
40 | $score = newObject("score", "Test of SoundEx");
41 | while $entry (words("tests2.txt"))
42 | {
43 | ($bad, $good) = $entry;
44 | if (soundex($bad) eq soundex($good))
45 | {
46 | [$score correct];
47 | }
48 | else
49 | {
50 | warn("$[25]bad " . soundex($bad) . " $[25]good " . soundex($good));
51 | }
52 |
53 | [$score record];
54 | }
55 |
56 | [$score print];
57 | }
58 |
59 | sub testSoundExEditDistance
60 | {
61 | local('%distance %totals $count $entry $bad $good $key $value $p $t');
62 |
63 | while $entry (words("tests2.txt"))
64 | {
65 | ($bad, $good) = $entry;
66 | if (soundex($bad) eq soundex($good))
67 | {
68 | %distance[editDistance($good, $bad)] += 1;
69 | }
70 |
71 | if (editDistance($good, $bad) == 0)
72 | {
73 | warn("$good -> $bad has an edit distance of 0?!?");
74 | }
75 |
76 | %totals[editDistance($good, $bad)] += 1;
77 | $count++;
78 | }
79 |
80 | foreach $key => $value (%distance)
81 | {
82 | $p = double($value) / $count;
83 | $t = double($value) / %totals[$key];
84 |
85 | println("$[5]key $[20]t $p");
86 | }
87 | }
88 |
89 | sub testCorrectionsNoContext
90 | {
91 | local('$good $bad $entry $score @suggestions $f $c');
92 |
93 | $score = newObject("score", "Test of Corrections w/o Context");
94 | $c = 0;
95 |
96 |
97 | while $entry (words(@_[0]))
98 | {
99 | ($bad, $good) = $entry;
100 |
101 | if ($dictionary[$bad] is $null && $dictionary[$good] !is $null)
102 | {
103 | @suggestions = %edits[$bad]; # filterByDictionary($bad, $dictionary);
104 |
105 | if ($good in @suggestions)
106 | {
107 | foreach $f (sublist(@_, 1))
108 | {
109 | [$f : $bad, $good, copy(@suggestions), $null, $null];
110 | }
111 | [$score correct];
112 | }
113 | else
114 | {
115 | # println("$bad -> $good : " . editDistance($bad, $good));
116 | }
117 |
118 | [$score record];
119 | }
120 | else
121 | {
122 | if ($dictionary[$bad] !is $null)
123 | {
124 | [$score falseNegative];
125 | $c++;
126 | }
127 |
128 | if ($dictionary[$good] is $null)
129 | {
130 | [$score falsePositive];
131 | }
132 | }
133 | }
134 |
135 | println("Present words: $c");
136 | [$score print];
137 | }
138 |
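# The subs below are pluggable candidate rankers; testCorrectionsNoContext invokes
# each extra argument it receives as [$f : $bad, $good, @suggestions, $pre, $next].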
139 | sub RandomGuess
140 | {
141 | [$score record];
142 | if (rand($3) eq $2)
143 | {
144 | [$score correct];
145 | }
146 | }
147 |
148 | sub FrequencyCount
149 | {
150 | local('@suggs');
151 |
152 | [$score record];
153 | @suggs = sort({ return Pword($2) <=> Pword($1); }, $3);
154 | if (@suggs[0] eq $2)
155 | {
156 | [$score correct];
157 | }
158 | }
159 |
160 | sub scoreIt
161 | {
162 | return ( ( 0.75 / ( editDistance($word, $1) + 1 ) ) ) +
163 | ( 0.25 * Pword($1) ) ;
164 | }
# (scoreIt2 is an identical copy of scoreIt; both get $word bound via let() in CombineFreqEdit)
165 | sub scoreIt2
166 | {
167 | return ( ( 0.75 / ( editDistance($word, $1) + 1 ) ) ) +
168 | ( 0.25 * Pword($1) ) ;
169 | }
170 |
171 | sub CombineFreqEdit
172 | {
173 | local('@suggs');
174 |
175 | let(&scoreIt, $word => $1);
176 | let(&scoreIt2, $word => $1);
177 |
178 | [$score record];
179 | @suggs = sort({ return scoreIt2($2) <=> scoreIt2($1); }, $3);
180 |
181 | if (@suggs[0] eq $2)
182 | {
183 | [$score correct];
184 | }
185 | }
186 |
187 | sub NeuralNetworkScore
188 | {
189 | local('@suggs $4 $5 $cs');
190 |
191 | [$score record];
192 | @suggs = sortHash($3, CompareSuggestions($network, $criteriaf, $1, $pool => $3, $pre => $4, $next => $5));
193 |
194 | if (@suggs[0] eq $2)
195 | {
196 | [$score correct];
197 | }
198 | }
199 |
-------------------------------------------------------------------------------- /service/src/view/wordpress.slp: --------------------------------------------------------------------------------
["Rich Editor Help" page; the HTML markup was not preserved in this dump]
145 | 146 | 147 | 148 | 152 | 153 |
154 |
155 | 156 |
157 | 161 |
--------------------------------------------------------------------------------