├── .filetree ├── .github └── FUNDING.yml ├── .gitignore ├── .project ├── KBSnlp.package ├── .filetree ├── ManifestKBSnlp.class │ ├── README.md │ ├── class │ │ ├── ruleRBOnlyReadOrWrittenTemporaryRuleV1FalsePositive.st │ │ └── ruleRBToDoCollectRuleV1FalsePositive.st │ └── properties.json ├── NLPcategories.class │ ├── README.md │ ├── class │ │ ├── classify..st │ │ └── initializeCategoryHash.st │ └── properties.json ├── NLPentities.class │ ├── README.md │ ├── class │ │ ├── entities..st │ │ ├── entityHelper.text..st │ │ ├── fileToDictionary..st │ │ ├── humanNameHelper..st │ │ └── initializeEntities.st │ └── properties.json ├── NLPsentences.class │ ├── README.md │ ├── class │ │ ├── fileToSet..st │ │ ├── loadData.st │ │ ├── sentences..st │ │ └── tokenizeLeavePeriods..st │ └── properties.json ├── NLPsummarizer.class │ ├── README.md │ ├── class │ │ └── summarize..st │ └── properties.json ├── NLPtagger.class │ ├── README.md │ ├── class │ │ ├── initializeLexicon.st │ │ ├── pptag..st │ │ ├── sentences..st │ │ ├── tag..st │ │ └── tokenize..st │ └── properties.json ├── monticello.meta │ ├── categories.st │ ├── initializers.st │ └── package └── properties.json ├── LICENSE.txt ├── README.md ├── company_names.txt ├── firstnames.txt ├── honorifics.txt ├── lastnames.txt ├── lexicon.txt ├── placenames.txt ├── prefixnames.txt ├── product_names.txt ├── src ├── .properties ├── KBSnlp.st └── KBSnlp │ ├── ManifestKBSnlp.class.st │ ├── NLPcategories.class.st │ ├── NLPentities.class.st │ ├── NLPsentences.class.st │ ├── NLPsummarizer.class.st │ ├── NLPtagger.class.st │ └── package.st ├── tags.json ├── tags_2gram.json └── tokensWithPeriods.txt /.filetree: -------------------------------------------------------------------------------- 1 | {"packageExtension" : ".package", 2 | "propertyFileExtension" : ".json", 3 | "Metadata" : "false" } -------------------------------------------------------------------------------- /.github/FUNDING.yml: 
-------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: mark-watson # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2] 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | -------------------------------------------------------------------------------- /.project: -------------------------------------------------------------------------------- 1 | { 2 | 'srcDirectory' : 'src' 3 | } -------------------------------------------------------------------------------- /KBSnlp.package/.filetree: -------------------------------------------------------------------------------- 1 | { 2 | "noMethodMetaData" : true, 3 | "separateMethodMetaAndSource" : false, 4 | "useCypressPropertiesFile" : true } 5 | -------------------------------------------------------------------------------- /KBSnlp.package/ManifestKBSnlp.class/README.md: -------------------------------------------------------------------------------- 1 | Copyright 2005-2017 Mark Watson. All rights reserved. Licensed for use under the MIT license with attribution required. 
2 | 3 | See: https://github.com/mark-watson/nlp_smalltalk 4 | -------------------------------------------------------------------------------- /KBSnlp.package/ManifestKBSnlp.class/class/ruleRBOnlyReadOrWrittenTemporaryRuleV1FalsePositive.st: -------------------------------------------------------------------------------- 1 | code-critics 2 | ruleRBOnlyReadOrWrittenTemporaryRuleV1FalsePositive 3 | ^ #(#(#(#RGMethodDefinition #(#'NLPsummarizer class' #summarize: #true)) #'2017-05-14T21:23:23.063039-07:00') ) -------------------------------------------------------------------------------- /KBSnlp.package/ManifestKBSnlp.class/class/ruleRBToDoCollectRuleV1FalsePositive.st: -------------------------------------------------------------------------------- 1 | code-critics 2 | ruleRBToDoCollectRuleV1FalsePositive 3 | ^ #(#(#(#RGMethodDefinition #(#'NLPsummarizer class' #summarize: #true)) #'2017-05-14T21:25:54.536453-07:00') ) -------------------------------------------------------------------------------- /KBSnlp.package/ManifestKBSnlp.class/properties.json: -------------------------------------------------------------------------------- 1 | { 2 | "category" : "KBSnlp", 3 | "classinstvars" : [ 4 | ], 5 | "classvars" : [ 6 | ], 7 | "commentStamp" : "MarkWatson 5/19/2017 06:24", 8 | "instvars" : [ 9 | ], 10 | "name" : "ManifestKBSnlp", 11 | "pools" : [ 12 | ], 13 | "super" : "PackageManifest", 14 | "type" : "normal" } 15 | -------------------------------------------------------------------------------- /KBSnlp.package/NLPcategories.class/README.md: -------------------------------------------------------------------------------- 1 | A NLPcategories is class to categorize text. 2 | 3 | Copyright 2005-2017 Mark Watson. All rights reserved. Licensed for use under the MIT license with attribution required. 
4 | 5 | See: https://github.com/mark-watson/nlp_smalltalk 6 | -------------------------------------------------------------------------------- /KBSnlp.package/NLPcategories.class/class/classify..st: -------------------------------------------------------------------------------- 1 | classify 2 | classify: text 3 | "classify text in a string" 4 | 5 | | tokens categories scores num hash numTokens results cutoff | 6 | tokens := NLPtagger tokenize: (text , 'XXXXXX'). 7 | categories := (Smalltalk at: #NlpCategoryHash) keys. 8 | num := categories size. 9 | numTokens := tokens size - 1. 10 | scores := Array new: num. 11 | 1 to: num do: [ :i | 12 | scores at: i put: 0. 13 | hash := (Smalltalk at: #NlpCategoryHash) at: (categories at: i). 14 | 1 to: numTokens do: [ :j | 15 | (hash includesKey: (tokens at: j)) 16 | ifTrue: [scores at: i put: ((scores at: i) + (hash at: (tokens at: j)))] ]. 17 | hash := (Smalltalk at: #NlpCategory2gramHash) at: (categories at: i). 18 | 1 to: numTokens do: [ :j | 19 | (hash includesKey: ((tokens at: j) , ' ' , (tokens at: j + 1))) 20 | ifTrue: [scores at: i put: ((scores at: i)+ ((hash at: (tokens at: j) , ' ' , (tokens at: j + 1)) * 8))]]]. 21 | results := SortedCollection sortBlock: [:c1 :c2 | (c1 at:1) > (c2 at:1)]. 22 | 1 to: num do: [ :i | |a| a := (Array new: 2). a at: 1 put: (scores at:i); at: 2 put: (categories at: i). results add: a ]. 23 | cutoff := ((results at: 1) at: 1) / 2. 24 | results := results select: [ :x | (x at: 1) > cutoff ]. 25 | ^results. 26 | -------------------------------------------------------------------------------- /KBSnlp.package/NLPcategories.class/class/initializeCategoryHash.st: -------------------------------------------------------------------------------- 1 | classify 2 | initializeCategoryHash 3 | "requires NeoJSON" 4 | 5 | Smalltalk at: #NlpCategoryHash 6 | put: (NeoJSONReader fromString: (FileStream fileNamed: './nlp_smalltalk/tags.json') contentsOfEntireFile). 
7 | Smalltalk at: #NlpCategory2gramHash 8 | put: (NeoJSONReader fromString: (FileStream fileNamed: './nlp_smalltalk/tags_2gram.json') contentsOfEntireFile) -------------------------------------------------------------------------------- /KBSnlp.package/NLPcategories.class/properties.json: -------------------------------------------------------------------------------- 1 | { 2 | "category" : "KBSnlp", 3 | "classinstvars" : [ 4 | ], 5 | "classvars" : [ 6 | ], 7 | "commentStamp" : "MarkWatson 5/19/2017 06:25", 8 | "instvars" : [ 9 | ], 10 | "name" : "NLPcategories", 11 | "pools" : [ 12 | ], 13 | "super" : "Object", 14 | "type" : "normal" } 15 | -------------------------------------------------------------------------------- /KBSnlp.package/NLPentities.class/README.md: -------------------------------------------------------------------------------- 1 | A NLPentities is a class to find people's names, company names, place names, etc. in text. 2 | 3 | Copyright 2005-2017 Mark Watson. All rights reserved. Licensed for use under the MIT license with attribution required. 4 | 5 | See: https://github.com/mark-watson/nlp_smalltalk 6 | -------------------------------------------------------------------------------- /KBSnlp.package/NLPentities.class/class/entities..st: -------------------------------------------------------------------------------- 1 | entityDetection 2 | entities: aString 3 | "return a Dictionary of entities (keys type, values Sets" 4 | 5 | | temp result | 6 | result := Dictionary new. 7 | temp := NLPentities entityHelper: (Smalltalk at: #NLPcompanyNames) text: aString. 8 | temp size > 0 9 | ifTrue: [ result at: 'companies' put: temp ]. 10 | temp := NLPentities entityHelper: (Smalltalk at: #NLPproductNames) text: aString. 11 | temp size > 0 12 | ifTrue: [ result at: 'products' put: temp ]. 13 | temp := NLPentities entityHelper: (Smalltalk at: #NLPplaceNames) text: aString. 14 | temp size > 0 15 | ifTrue: [ result at: 'places' put: temp ]. 
16 | temp := NLPentities humanNameHelper: aString. 17 | temp size > 0 18 | ifTrue: [ result at: 'places' put: temp ]. 19 | ^ result -------------------------------------------------------------------------------- /KBSnlp.package/NLPentities.class/class/entityHelper.text..st: -------------------------------------------------------------------------------- 1 | entityDetection 2 | entityHelper: entitySet text: aString 3 | "this is a helper method for everything **but** person names" 4 | 5 | | tokens num ngram2 ngram3 results | 6 | results := Set new. 7 | tokens := NLPtagger tokenize: aString , ' xxxxx yyyyy zzzzz'. 8 | num := tokens size - 3. " account for the 3 fake tokens at the end " 9 | 1 to: num do: [ :i | 10 | ngram2 := (tokens at: i) , ' ' , (tokens at: i + 1). 11 | ngram3 := ngram2 , ' ' , (tokens at: i + 2). "Transcript show: ngram2; cr." 12 | (entitySet includes: ngram3) 13 | ifTrue: [ results add: ngram3 ] 14 | ifFalse: [ 15 | (entitySet includes: ngram2) 16 | ifTrue: [ results add: ngram2 ] 17 | ifFalse: [ 18 | (entitySet includes: (tokens at: i)) 19 | ifTrue: [ results add: (tokens at: i) ] ] ] ]. 20 | ^ results -------------------------------------------------------------------------------- /KBSnlp.package/NLPentities.class/class/fileToDictionary..st: -------------------------------------------------------------------------------- 1 | entityDetection 2 | fileToDictionary: filePath 3 | 4 | "Read data/lexicon.txt and build in memory lexicon" 5 | 6 | | read count aLine strm set | 7 | 8 | Transcript show: 'Processing file ' , filePath; cr. 9 | 10 | set := Set new. 11 | read := (MultiByteFileStream fileNamed: filePath) readOnly. 12 | count := 0. 13 | [read atEnd] 14 | whileFalse: [count := count + 1. 15 | aLine := read upTo: Character lf. "Mac: use lf, Windows: use cr ???" 16 | "look for a space character: " 17 | ((aLine indexOf: $:) > 0) 18 | ifTrue: [ 19 | strm := ReadStream on: aLine. 20 | aLine := strm upTo: $:]. 21 | set add: aLine]. 22 | read close. 
23 | ^set 24 | -------------------------------------------------------------------------------- /KBSnlp.package/NLPentities.class/class/humanNameHelper..st: -------------------------------------------------------------------------------- 1 | entityDetection 2 | humanNameHelper: aString 3 | "this is a helper method for everything **but** person names" 4 | 5 | | tokens num results | 6 | results := Set new. 7 | tokens := NLPtagger tokenize: aString , ' xxxxx yyyyy zzzzz'. 8 | num := tokens size - 3. " account for the 3 fake tokens at the end " 9 | 1 to: num do: [ :i | 10 | ((Smalltalk at: #NLPfirstNames) includes: (tokens at: i)) 11 | ifTrue: [ 12 | (((Smalltalk at: #NLPfirstNames) includes: (tokens at: i + 1)) 13 | and: ((Smalltalk at: #NLPlastNames) includes: (tokens at: i + 2))) 14 | ifTrue: [ 15 | results add: (tokens at: i) , ' ' , (tokens at: i + 1) , ' ' , (tokens at: i + 2). 16 | i := i + 2 ] 17 | ifFalse: [ 18 | ((Smalltalk at: #NLPlastNames) includes: (tokens at: i + 1)) 19 | ifTrue: [ 20 | results add: (tokens at: i) , ' ' , (tokens at: i + 1). 21 | i := i + 1 ] ] ] ]. 22 | ^ results -------------------------------------------------------------------------------- /KBSnlp.package/NLPentities.class/class/initializeEntities.st: -------------------------------------------------------------------------------- 1 | entityDetection 2 | initializeEntities 3 | "load entity name data" 4 | 5 | " Note: place name lines of the form: Cairo:country_capital Fixed in fileToDictionary " 6 | 7 | Smalltalk 8 | at: #NLPcompanyNames 9 | put: (NLPentities fileToDictionary: './nlp_smalltalk/company_names.txt'). 10 | Smalltalk 11 | at: #NLPfirstNames 12 | put: (NLPentities fileToDictionary: './nlp_smalltalk/firstnames.txt'). 13 | Smalltalk 14 | at: #NLPlastNames 15 | put: (NLPentities fileToDictionary: './nlp_smalltalk/lastnames.txt'). 16 | Smalltalk 17 | at: #NLPhonorifics 18 | put: (NLPentities fileToDictionary: './nlp_smalltalk/honorifics.txt'). 
19 | Smalltalk 20 | at: #NLPprefixNames 21 | put: (NLPentities fileToDictionary: './nlp_smalltalk/prefixnames.txt'). 22 | Smalltalk 23 | at: #NLPplaceNames 24 | put: (NLPentities fileToDictionary: './nlp_smalltalk/placenames.txt'). 25 | Smalltalk 26 | at: #NLPproductNames 27 | put: (NLPentities fileToDictionary: './nlp_smalltalk/product_names.txt'). 28 | 29 | " also read in data we will need for sentence segmentation: " 30 | Smalltalk 31 | at: #NLPtokensWithPeriods 32 | put: (NLPentities fileToDictionary: './nlp_smalltalk/tokens_with_periods.txt'). -------------------------------------------------------------------------------- /KBSnlp.package/NLPentities.class/properties.json: -------------------------------------------------------------------------------- 1 | { 2 | "category" : "KBSnlp", 3 | "classinstvars" : [ 4 | ], 5 | "classvars" : [ 6 | ], 7 | "commentStamp" : "MarkWatson 5/19/2017 06:25", 8 | "instvars" : [ 9 | ], 10 | "name" : "NLPentities", 11 | "pools" : [ 12 | ], 13 | "super" : "Object", 14 | "type" : "normal" } 15 | -------------------------------------------------------------------------------- /KBSnlp.package/NLPsentences.class/README.md: -------------------------------------------------------------------------------- 1 | A class to segment text into sentences. 2 | 3 | Copyright 2005-2017 Mark Watson. All rights reserved. Licensed for use under the MIT license with attribution required. 4 | 5 | See: https://github.com/mark-watson/nlp_smalltalk 6 | -------------------------------------------------------------------------------- /KBSnlp.package/NLPsentences.class/class/fileToSet..st: -------------------------------------------------------------------------------- 1 | utiities 2 | fileToSet: filePath 3 | "Read file, create Set with elements being each line in file" 4 | 5 | | read aLine set | 6 | Transcript 7 | show: 'Processing file ' , filePath; 8 | cr. 9 | set := Set new. 10 | read := (MultiByteFileStream fileNamed: filePath) readOnly. 
11 | [ read atEnd ] 12 | whileFalse: [ aLine := read upTo: Character lf. "Mac: use lf, Windows: use cr ???" 13 | set add: aLine ]. 14 | read close. 15 | ^ set -------------------------------------------------------------------------------- /KBSnlp.package/NLPsentences.class/class/loadData.st: -------------------------------------------------------------------------------- 1 | initialize 2 | loadData 3 | "Load tokens that normally contain periods" 4 | 5 | | aSet count reverseDictionary forwardDictionary | 6 | count := 0. 7 | reverseDictionary := Dictionary new. 8 | forwardDictionary := Dictionary new. 9 | aSet := NLPsentences fileToSet: './nlp_smalltalk/tokensWithPeriods.txt'. 10 | Smalltalk at: #NLPtokensWithPeriods put: aSet. 11 | ^ 'tokens with periods data loaded' -------------------------------------------------------------------------------- /KBSnlp.package/NLPsentences.class/class/sentences..st: -------------------------------------------------------------------------------- 1 | segment 2 | sentences: someText 3 | "tokenize a string into individual sentences" 4 | 5 | | tokens aSet lastToken currentSentence allSentences | 6 | aSet := Smalltalk at: #NLPtokensWithPeriods. 7 | tokens := OrderedCollection new. 8 | (NLPsentences tokenizeLeavePeriods: someText) 9 | do: [ :token | 10 | (token includesSubstring: '.') not 11 | ifTrue: [ tokens add: token ] 12 | ifFalse: [ (aSet includes: token) 13 | ifFalse: [ tokens add: (token copyWithRegex: '\.' matchesReplacedWith: ''). 14 | tokens add: '.' ] 15 | ifTrue: [ tokens add: token ] ] ]. 16 | currentSentence := OrderedCollection new. 17 | allSentences := OrderedCollection new. 18 | lastToken := ''. 19 | Transcript 20 | show: tokens; 21 | cr. 22 | tokens 23 | do: [ :token | 24 | Transcript 25 | show: token; 26 | cr. 27 | currentSentence add: token. 28 | ((token = '.' and: lastToken isAllDigits not) or: token = '?') 29 | ifTrue: [ allSentences addLast: currentSentence. 30 | currentSentence := OrderedCollection new ]. 
31 | lastToken := token ]. 32 | currentSentence isNotEmpty 33 | ifTrue: [ allSentences addLast: currentSentence ]. 34 | ^ allSentences -------------------------------------------------------------------------------- /KBSnlp.package/NLPsentences.class/class/tokenizeLeavePeriods..st: -------------------------------------------------------------------------------- 1 | utiities 2 | tokenizeLeavePeriods: wordsInAString 3 | "tokenizes a string" 4 | 5 | ^ wordsInAString 6 | findTokens: 7 | ' ;:,<>[]{}! 8 | @#$%^&*()?' 9 | keep: ';:.,<>[]{}!$?' " keep CR in this string!! " -------------------------------------------------------------------------------- /KBSnlp.package/NLPsentences.class/properties.json: -------------------------------------------------------------------------------- 1 | { 2 | "category" : "KBSnlp", 3 | "classinstvars" : [ 4 | ], 5 | "classvars" : [ 6 | ], 7 | "commentStamp" : "MarkWatson 5/19/2017 06:26", 8 | "instvars" : [ 9 | ], 10 | "name" : "NLPsentences", 11 | "pools" : [ 12 | ], 13 | "super" : "Object", 14 | "type" : "normal" } 15 | -------------------------------------------------------------------------------- /KBSnlp.package/NLPsummarizer.class/README.md: -------------------------------------------------------------------------------- 1 | A class to classify English text into categories. 2 | 3 | Copyright 2005-2017 Mark Watson. All rights reserved. Licensed for use under the MIT license with attribution required. 4 | 5 | See: https://github.com/mark-watson/nlp_smalltalk 6 | -------------------------------------------------------------------------------- /KBSnlp.package/NLPsummarizer.class/class/summarize..st: -------------------------------------------------------------------------------- 1 | summary 2 | summarize: text 3 | "extractive summarizer" 4 | 5 | | sentences sentenceScores tokens scoredCategories hash x bestIndices | 6 | scoredCategories := NLPcategories classify: text. 7 | sentences := NLPtagger sentences: text. 
8 | sentenceScores := Array new: sentences size. 9 | 1 to: sentences size do: [ :i | 10 | sentenceScores at: i put: 0. 11 | tokens := sentences at: i. 12 | Transcript 13 | show: (sentences at: i); 14 | cr. 15 | scoredCategories 16 | do: [ :sc | 17 | hash := (Smalltalk at: #NlpCategoryHash) at: (sc at: 2). 18 | tokens 19 | do: [ :token | 20 | (hash includesKey: token) 21 | ifTrue: [ x := hash at: token. 22 | sentenceScores at: i put: (sentenceScores at: i) + (sc at: 1) ] ] ] ]. 23 | bestIndices := sentenceScores 24 | collectWithIndex: [ :score :i | 25 | {score. 26 | i} ]. 27 | Transcript 28 | show: 'sentence scoring: '; 29 | show: bestIndices; 30 | cr. 31 | bestIndices := bestIndices select: [ :p | (p at: 1) > 2 ]. 32 | ^ bestIndices collect: [ :p | Character space join: (sentences at: (p at: 2)) ] -------------------------------------------------------------------------------- /KBSnlp.package/NLPsummarizer.class/properties.json: -------------------------------------------------------------------------------- 1 | { 2 | "category" : "KBSnlp", 3 | "classinstvars" : [ 4 | ], 5 | "classvars" : [ 6 | ], 7 | "commentStamp" : "MarkWatson 5/19/2017 06:26", 8 | "instvars" : [ 9 | ], 10 | "name" : "NLPsummarizer", 11 | "pools" : [ 12 | ], 13 | "super" : "Object", 14 | "type" : "normal" } 15 | -------------------------------------------------------------------------------- /KBSnlp.package/NLPtagger.class/README.md: -------------------------------------------------------------------------------- 1 | NLP tagger converted to Squeak. 2 | A class that implements an NLP tagger. 3 | 4 | Copyright 2005-2017 Mark Watson. All rights reserved. Licensed for use under the MIT license with attribution required. 
5 | 6 | See: https://github.com/mark-watson/nlp_smalltalk 7 | -------------------------------------------------------------------------------- /KBSnlp.package/NLPtagger.class/class/initializeLexicon.st: -------------------------------------------------------------------------------- 1 | tagging 2 | initializeLexicon 3 | "Read data/lexicon.txt and build in memory lexicon" 4 | 5 | | read count strm aLine word taglist token lex | 6 | lex := Dictionary new. 7 | read := (FileStream fileNamed: './nlp_smalltalk/lexicon.txt') readOnly. 8 | count := 0. 9 | [ read atEnd ] 10 | whileFalse: [ count := count + 1. 11 | aLine := read upTo: Character lf. "Mac: use lf, Windows: use cr ???" 12 | strm := ReadStream on: aLine. 13 | word := strm upTo: Character space. 14 | taglist := OrderedCollection new. 15 | [ strm atEnd ] 16 | whileFalse: [ token := strm upTo: Character space. 17 | taglist add: token ]. 18 | "Transcript show: word; cr." 19 | "Transcript show: taglist printString; cr." 20 | lex at: word put: taglist ]. 21 | read close. 22 | Smalltalk at: #NLPlexicon put: lex -------------------------------------------------------------------------------- /KBSnlp.package/NLPtagger.class/class/pptag..st: -------------------------------------------------------------------------------- 1 | tagging 2 | pptag: wordString 3 | "returns a string of word/tag ..." 4 | 5 | | words tags write size count | 6 | words := NLPtagger tokenize: wordString. 7 | tags := NLPtagger tag: words. 8 | write := TextStream on: String new. 9 | size := words size. 10 | count := 1. 11 | [count <= size] 12 | whileTrue: [ 13 | write nextPutAll: (words at: count). 14 | write nextPutAll: '/'. 15 | write nextPutAll: (tags at: count). 16 | write nextPutAll: ' '. 17 | count := count + 1]. 
18 | ^write contents string -------------------------------------------------------------------------------- /KBSnlp.package/NLPtagger.class/class/sentences..st: -------------------------------------------------------------------------------- 1 | segmentation 2 | sentences: data 3 | "Handle either a string or array of tokens. 4 | Limitations: 5 | 1. This code does not currently handle special characters like — 6 | 2. Periods in numbers: only check previous character, not the 7 | next so a sentence ending with e.g., 2. will not be handled correctly. 8 | " 9 | 10 | | tokens lastToken currentSentence allSentences token | 11 | tokens := (data isMemberOf: ByteString) 12 | ifTrue: (NLPtagger tokenize: data) 13 | ifFalse: data. 14 | currentSentence := OrderedCollection new. 15 | allSentences := OrderedCollection new. 16 | lastToken := ''. 17 | tokens 18 | do: [ :token1 | 19 | ((Smalltalk at: #NLPtokensWithPeriods) 20 | includes: token1) 21 | ifTrue: [ token := (Smalltalk 22 | at: #NLPtokensWithPeriods) 23 | get: token1 ] 24 | ifFalse: [ token := token1 ]. 25 | Transcript 26 | show: token; 27 | cr. 28 | currentSentence add: token. 29 | ((token = '.' 30 | and: lastToken isAllDigits not) 31 | or: token = '?') 32 | ifTrue: [ allSentences addLast: currentSentence. 33 | currentSentence := OrderedCollection 34 | new ]. 35 | lastToken := token ]. 36 | currentSentence isNotEmpty 37 | ifTrue: [ allSentences addLast: currentSentence ]. 38 | ^ allSentences -------------------------------------------------------------------------------- /KBSnlp.package/NLPtagger.class/class/tag..st: -------------------------------------------------------------------------------- 1 | tagging 2 | tag: words 3 | "tag an ordered collection of words, returning an ordered collection of corresponding tags" 4 | 5 | | lex tags tag count i word lastWord lastTag | 6 | tags := OrderedCollection new. 7 | lex := Smalltalk at: #NLPlexicon. 8 | words do: 9 | [:aWord | 10 | tag := lex at: aWord ifAbsent: [nil]. 
11 | tag isNil ifFalse: [tag := tag at: 1] ifTrue: [tag := 'NN']. " the default tag " 12 | tags add: tag]. 13 | " Apply transformation rules: " 14 | lastWord := ''. 15 | lastTag := ''. 16 | i := 0. 17 | count := words size. 18 | [i < count] whileTrue: 19 | [i := i + 1. 20 | word := words at: i. 21 | tag := tags at: i. " reuse tag variable " 22 | " First, handle all rules for i > 1 " 23 | i > 1 24 | ifTrue: 25 | [" rule 1: DT, {VBD | VBP} --> DT, NN " 26 | 27 | lastTag = 'DT' & (tag = 'VBD' | (tag = 'VBP') | (tag = 'VB')) 28 | ifTrue: [tags at: i put: 'NN']. 29 | tag size > 1 30 | ifTrue: 31 | [" rule 6: convert a noun to a verb if the preceeding work is 'would' " 32 | (tag at: 1) = $N & ((tag at: 2) = $N) & (lastWord asLowercase = 'would') 33 | ifTrue: [tags at: i put: 'VB']]]. 34 | " Now, handle the remaining rules that are valid for i = 1: " 35 | " rule 2: convert a noun to a number (CD) if '.' appears in the word" 36 | (word findString: '.') > 0 37 | ifTrue: [(tag at: 1) = $N ifTrue: [tags at: i put: 'CD']]. " not working - tokenizer tosses '.' characters " 38 | " rule 3: convert a noun to a past participle if words[i] ends with 'ed' " 39 | (tag at: 1) = $N & (word endsWith: 'ed') ifTrue: [tags at: i put: 'VBN']. 40 | " rule 4: convert any type to adverb if it ends in 'ly' " 41 | (word endsWith: 'ly') ifTrue: [tags at: i put: 'RB']. 42 | " rule 5: convert a common noun (NN or NNS) to a adjective if it ends with 'al' " 43 | (tag at: 1) = $N & (word endsWith: 'al') ifTrue: [tags at: i put: 'JJ']. 44 | " rule 7: if a word has been categorized as a common noun and it ends with 's;, " 45 | " then set its type to plural common noun (NNS) " 46 | tag = 'NN' & (word endsWith: 's') ifTrue: [tags at: i put: 'NNS']. 47 | " rule 8: convert a common noun to a present prticiple verb (i.e., a gerand) " 48 | (tag at: 1) = $N & (word endsWith: 'ing') ifTrue: [tags at: i put: 'VBG']. 49 | lastWord := word. 50 | lastTag := tag]. 
51 | ^tags -------------------------------------------------------------------------------- /KBSnlp.package/NLPtagger.class/class/tokenize..st: -------------------------------------------------------------------------------- 1 | tokenization 2 | tokenize: wordsInAString 3 | "tokenizes a string" 4 | 5 | ^ wordsInAString 6 | findTokens: 7 | ' ;:.,<>[]{}! 8 | @#$%^&*()?' 9 | keep: ';:.,<>[]{}!$?' " keep CR in this string!! " -------------------------------------------------------------------------------- /KBSnlp.package/NLPtagger.class/properties.json: -------------------------------------------------------------------------------- 1 | { 2 | "category" : "KBSnlp", 3 | "classinstvars" : [ 4 | ], 5 | "classvars" : [ 6 | "NLPlexicon" ], 7 | "commentStamp" : "MarkWatson 5/19/2017 06:27", 8 | "instvars" : [ 9 | ], 10 | "name" : "NLPtagger", 11 | "pools" : [ 12 | ], 13 | "super" : "Object", 14 | "type" : "normal" } 15 | -------------------------------------------------------------------------------- /KBSnlp.package/monticello.meta/categories.st: -------------------------------------------------------------------------------- 1 | SystemOrganization addCategory: #KBSnlp! 
2 | -------------------------------------------------------------------------------- /KBSnlp.package/monticello.meta/initializers.st: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mark-watson/nlp_smalltalk/3a6c09aed17bed08f0ee3074ac0f1e578881ab87/KBSnlp.package/monticello.meta/initializers.st -------------------------------------------------------------------------------- /KBSnlp.package/monticello.meta/package: -------------------------------------------------------------------------------- 1 | (name 'KBSnlp') -------------------------------------------------------------------------------- /KBSnlp.package/properties.json: -------------------------------------------------------------------------------- 1 | { 2 | } 3 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright 2004-2017 Mark Watson. All Rights Reserved. 2 | 3 | This software may be used under the conditions of attribution of authorship and the MIT license. 4 | 5 | MIT License 6 | Copyright (c) 2004-2017 Mark Watson, corporate, and institutional contributors who have collectively contributed elements to this software ("The Pharo and Squeak Communities"). All rights reserved. 7 | 8 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
11 | 12 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Natural Language Processing Library for Pharo Smalltalk 2 | 3 | Copyright 2005 to 2021 by Mark Watson 4 | 5 | License: MIT 6 | 7 | Note: the most frequent updates to this Pharo Smalltalk package will appear on the [github repo for this project](https://github.com/mark-watson/nlp_smalltalk). 8 | 9 | Note 2: on 4/25/2021 I converted this project to use the IceBerg github support for Pharo Smalltalk. All source code and data have been moved to the subdirectory **src**. 10 | 11 | IceBerg/github documentation: [https://books.pharo.org/booklet-ManageCode/pdf/2019-03-24-ManageCode.pdf](https://books.pharo.org/booklet-ManageCode/pdf/2019-03-24-ManageCode.pdf) 12 | 13 | Add this repository using the IcewBerg Browser. 14 | 15 | ## Setup to be done one time after loading the code via IceBerg 16 | 17 | 18 | ### Part Of Speech Tagging 19 | 20 | Open a File Browser and fileIn the KBSnlp.st source file. Open a Class Browser 21 | and and look at the code in the KBnlp class. 22 | 23 | Open a Workspace and one time only evaluate: 24 | 25 | NLPtagger initializeLexicon 26 | 27 | Try tagging a sentence to make sure the data was read from disk correctly: 28 | 29 | NLPtagger pptag: 'The dog ran down the street' 30 | 31 | If this does not work then probably the directory nlp_smalltalk is not in the default directory. 
The code containing the file path is: 32 | 33 | read := (FileStream fileNamed: './nlp_smalltalk/lexicon.txt') readOnly. 34 | 35 | ### Categorization 36 | 37 | I am using NeoJSON to parse the category word count data so make sure NeoJSON is installed. NeoJSON can be installed using: 38 | 39 | Gofer it 40 | smalltalkhubUser: 'SvenVanCaekenberghe' project: 'Neo'; 41 | configurationOf: 'NeoJSON'; 42 | loadStable. 43 | 44 | One time initialization: 45 | 46 | NLPcategories initializeCategoryHash 47 | 48 | Try it: 49 | 50 | NLPcategories classify: 'The economy is bad and taxes are too high.' 51 | 52 | ### Entity Recognition 53 | 54 | Implemented for products, companies, places, and people's names. 55 | 56 | One time initialization: 57 | 58 | NLPentities initializeEntities 59 | 60 | Example: 61 | 62 | NLPentities entities: 'The Coca Cola factory is in London' 63 | 64 | --> a Dictionary('companies'->a Set('Coca Cola') 'places'->a Set('London') 'products'->a Set('Coca Cola') ) 65 | 66 | NLPentities humanNameHelper: 'John Alex Smith and Andy Jones went to the store.' 67 | 68 | --> a Set('John Alex Smith' 'Andy Jones') 69 | 70 | ### Sentence Segmentation 71 | 72 | One time initialization: 73 | 74 | NLPsentences loadData 75 | 76 | NLPsentences sentences: 'Today Mr. Jones went to town. He bought gas.' 77 | 78 | --> an OrderedCollection(an OrderedCollection('Today' 'Mr.' 'Jones' 'went' 'to' 'town' '.') an OrderedCollection('He' 'bought' 'gas' '.')) 79 | 80 | ### Summarization 81 | 82 | No additional data needs to be loaded for summarization, but all other data should be loaded as-per the above directions. Here is a short example: 83 | 84 | NLPsummarizer summarize: 'The administration and House Republicans have asked a federal appeals court for a 90-day extension in a case that involves federal payments to reduce deductibles and copayments for people with modest incomes who buy their own policies. 
The fate of $7 billion in "cost-sharing subsidies" remains under a cloud as insurers finalize their premium requests for next year. Experts say premiums could jump about 20 percent without the funding. In requesting the extension, lawyers for the Trump administration and the House said the parties are continuing to work on measures, including potential legislative action, to resolve the issue. Requests for extensions are usually granted routinely.' 85 | 86 | --> #('The administration and House Republicans have asked a federal appeals court for a 90-day extension in a case that involves federal payments to reduce deductibles and copayments for people with modest incomes who buy their own policies .' 'The fate of $ 7 billion in "cost-sharing subsidies" remains under a cloud as insurers finalize their premium requests for next year .' 'In requesting the extension , lawyers for the Trump administration and the House said the parties are continuing to work on measures , including potential legislative action , to resolve the issue .') 87 | 88 | ## Limitations 89 | 90 | - Does not currently handle special characters like: — 91 | - Categorization and summarization should also use "bag of ngrams" in addition to "bag of words" (BOW) 92 | -------------------------------------------------------------------------------- /company_names.txt: -------------------------------------------------------------------------------- 1 | IBM 2 | Twitter 3 | Facebook 4 | Motorola 5 | Ford 6 | Panasonic 7 | General Motors 8 | Casio 9 | GM 10 | Google 11 | Microsoft 12 | 3M 13 | Adobe 14 | AES 15 | Aetna 16 | AFLAC 17 | Agilent 18 | Akamai 19 | Alcoa 20 | Allegheny 21 | Allstate 22 | Altera 23 | Amazon 24 | American Express 25 | Analog Devices 26 | Apple 27 | AT&T 28 | Autodesk 29 | Avon 30 | Bank of America 31 | Best Buy 32 | Boeing 33 | Boston Scientific 34 | Bristol-Myers Squibb 35 | Broadcom 36 | Campbell Soup 37 | Chevron 38 | CIGNA 39 | Cisco 40 | Citigroup 41 | Citrix 42 | Clorox 43 | 
Coca Cola 44 | Colgate-Palmolive 45 | Comcast 46 | ConocoPhillips 47 | Corning 48 | Costco 49 | Dell 50 | DeVry 51 | DIRECTV 52 | Dow Chemical 53 | Du Pont 54 | eBay 55 | Exxon Mobil 56 | FedEx 57 | Ford Motor 58 | GameStop 59 | Gannett 60 | General Electric 61 | General Mills 62 | Goldman Sachs 63 | Goodyear 64 | Halliburton 65 | Hasbro 66 | Heinz 67 | Hewlett-Packard 68 | Honeywell 69 | Hormel 70 | Humana 71 | Intel 72 | Intuit 73 | Kellogg 74 | Kimco 75 | Kohl 76 | Kraft 77 | Marriott 78 | Mastercard 79 | Mattel 80 | McAfee 81 | McGraw-Hill 82 | Merck 83 | MetLife 84 | Microsoft 85 | Monsanto 86 | Morgan Stanley 87 | NIKE 88 | Nike 89 | Novell 90 | Nvidia 91 | Office Depot 92 | Oracle 93 | Philip Morris 94 | Procter & Gamble 95 | Prudential 96 | QLogic 97 | QUALCOMM 98 | Qualcomm 99 | Quest 100 | Raytheon 101 | Red Hat 102 | Rockwell 103 | Safeway 104 | Salesforce 105 | SanDisk 106 | Schlumberger 107 | Sears 108 | Sempra Energy 109 | Southwest Airlines 110 | Starbucks 111 | Sun Microsystems 112 | Time Warner 113 | Verisign 114 | Volkswagen 115 | Wal-Mart 116 | Walgreen 117 | WellPoint 118 | Wells Fargo 119 | Winnebago 120 | Xerox 121 | Yahoo 122 | New York Times 123 | Oxford University Press 124 | Cambridge University Press 125 | Washington Post 126 | Harvard University Press 127 | BBC Radio 128 | American Broadcasting Company 129 | Walt Disney 130 | Princeton University Press 131 | Columbia University Press 132 | Associated Press 133 | Yale University Press 134 | Los Angeles Times 135 | MIT Press 136 | Warner Bros. 137 | General Motors 138 | Daily Telegraph 139 | Clarendon Press 140 | General Electric 141 | Walt Disney Company 142 | Paramount Pictures 143 | Ford Motor Company 144 | Wall Street Journal 145 | USA Today 146 | British East India Company 147 | Apple Inc. 
148 | Greenwood Press 149 | Sun Microsystems 150 | Entertainment Weekly 151 | Apple Computer 152 | Cornell University Press 153 | Johns Hopkins University Press 154 | Chicago Tribune 155 | Dutch East India Company 156 | Indiana University Press 157 | Stanford University Press 158 | Blackwell Publishing 159 | Boston Globe 160 | San Francisco Chronicle 161 | Fox Broadcasting Company 162 | Development Bank 163 | St. Martin's Press 164 | Time Warner 165 | Texas Instruments 166 | International Herald Tribune 167 | John Wiley & Sons 168 | National Public Radio 169 | Chicago Sun-Times 170 | Hudson's Bay Company 171 | Norfolk Southern 172 | Southern California 173 | CRC Press 174 | Best Music 175 | Houghton Mifflin Company 176 | Tuttle Publishing 177 | Digital Equipment Corporation 178 | American Airlines 179 | International Business Machines 180 | East India Company 181 | New York Stock Exchange 182 | United Airlines 183 | Sunday Times 184 | Los Alamos National Laboratory 185 | Sydney Morning Herald 186 | Da Capo Press 187 | University Press 188 | McDonnell Douglas 189 | Le Monde 190 | Warner Bros 191 | New York Review 192 | Union Pacific 193 | Bell Labs 194 | American general 195 | International Bank 196 | Bank of England 197 | Westview Press 198 | Lockheed Martin 199 | British Airways 200 | Greenwood Publishing Group 201 | Febrero co 202 | Electronic Arts 203 | London Stock Exchange 204 | Digital Equipment 205 | ISBN Princeton University Press 206 | Duke University Press 207 | Orion Publishing Group 208 | Canadian Broadcasting Corporation 209 | Australian Broadcasting Corporation 210 | General Dynamics 211 | Manchester University Press 212 | SUNY Press 213 | John Wiley and Sons 214 | Southern Africa 215 | Rutgers University Press 216 | Nike Inc. 217 | Jerusalem Post 218 | Delta Air Lines 219 | Washington Times 220 | Academic Press 221 | CBS News 222 | Scarecrow Press 223 | Random House Inc. 
224 | Viking Press 225 | Edinburgh University Press 226 | Siemens AG 227 | Barnes & Noble 228 | W. Norton & Company 229 | LA Times 230 | Daily News 231 | New York Post 232 | Southern United States 233 | Silicon Graphics 234 | New York Daily News 235 | County Commissioners 236 | Popular Music 237 | New York University Press 238 | Canadian Pacific Railway 239 | Super Mario Bros. 240 | Japan Times 241 | Stars and Stripes 242 | Free Press 243 | Seattle Times 244 | Omnibus Press 245 | North Fork 246 | Procter & Gamble 247 | Pan American World Airways 248 | BBC Television 249 | Toronto Star 250 | Christian Science Monitor 251 | Houston Chronicle 252 | Bank of America 253 | Adobe Systems 254 | Oracle Corporation 255 | Osprey Publishing 256 | BAE Systems 257 | NBC Universal 258 | Dutch West India Company 259 | Lawrence Livermore National Laboratory 260 | John Wiley & Sons Inc. 261 | New England Journal 262 | Kessinger Publishing 263 | Royal Shakespeare Company 264 | Evening Standard 265 | Globe and Mail 266 | Microsoft Corporation 267 | CBC Television 268 | Red Crescent International Labour International Monetary Fund 269 | Burger King 270 | Bell Laboratories 271 | Northwestern University Press 272 | Data General 273 | Cisco Systems 274 | New York Times Book Review 275 | General Electric Company 276 | Naval Institute Press 277 | Heavy Metal 278 | TSR Inc. 279 | Temple University Press 280 | Robert Appleton Company 281 | Eastman Kodak 282 | Dover Publications Inc. 283 | Jet Propulsion Laboratory 284 | Guardian Unlimited 285 | Super Smash Bros 286 | International Finance Corporation 287 | Comic Book Resources 288 | Irish Times 289 | Pergamon Press 290 | W.W. Norton & Company 291 | News Corporation 292 | Natural Resources 293 | World Scientific 294 | National Trust 295 | Judaica Press 296 | Intellectual Property World Meteorological World Tourism World Trade 297 | Paulist Press 298 | Square Co. 
299 | Continental Airlines 300 | Air Canada 301 | CBC Radio 302 | Northrop Grumman 303 | Warner Music Group 304 | America Online 305 | State Street 306 | Belknap Press 307 | Hewlett Packard 308 | Harper's Weekly 309 | Oak Ridge National Laboratory 310 | Continuum International Publishing Group 311 | Northwest Airlines 312 | Coca-Cola Company 313 | Southwest Airlines 314 | Boydell Press 315 | New York Herald 316 | Control Data Corporation 317 | Palm Beach 318 | Garland Publishing 319 | Wells Fargo 320 | Second Bank of the United States 321 | Prentice-Hall Inc. 322 | Advanced Micro Devices 323 | Western Union 324 | Wesleyan University Press 325 | Miami Herald 326 | Nuclear Suppliers Group 327 | RAND Corporation 328 | Daimler AG 329 | London Gazette 330 | American Express 331 | LA Weekly 332 | Trans World Airlines 333 | Country Music 334 | Philadelphia Inquirer 335 | British Broadcasting Corporation 336 | Monsanto Company 337 | US Airways 338 | World Bank Group 339 | Chosen Freeholders 340 | Bell Telephone Laboratories 341 | American Standard 342 | InterVarsity Press 343 | Pluto Press 344 | Macmillan Company 345 | North West Company 346 | New York Tribune 347 | Granada Television 348 | British Telecom 349 | National Semiconductor 350 | Baltimore Sun 351 | Time Inc. 
352 | National Physical Laboratory 353 | Publishers Weekly 354 | Worshipful Company 355 | BBC Wales 356 | Thames Television 357 | Fairchild Semiconductor 358 | British Museum Press 359 | General Mills 360 | Black Hills 361 | PC World 362 | Sony Music 363 | United Parcel Service 364 | Abingdon Press 365 | Electronic Music 366 | Robert Bosch GmbH 367 | Coca Cola 368 | Paris Review 369 | CBS Radio 370 | Burroughs Corporation 371 | Nintendo of America 372 | London Review 373 | Roxy Music 374 | New Zealand Herald 375 | Human Rights 376 | Chrysler Corporation 377 | Johnson & Johnson 378 | Argonne National Laboratory 379 | Polity Press 380 | Syracuse University Press 381 | Arcadia Publishing 382 | Bank of Sweden Prize 383 | Wayne State University Press 384 | South End Press 385 | NCR Corporation 386 | World Wide Fund 387 | Penguin Group 388 | Smithsonian Institution Press 389 | Gale Group 390 | IARC Group 391 | Review Award 392 | Goldman Sachs 393 | Singapore Airlines 394 | Times of India 395 | Remington Rand 396 | Dish Network 397 | Atlantic Slave Trade 398 | Intel Corporation 399 | Inter-American Development Bank 400 | Hitachi Ltd. 401 | Deere & Company 402 | Angiosperm Phylogeny Group 403 | Kraft Foods 404 | Westminster John Knox Press 405 | Deutsche Bank 406 | Warner Brothers 407 | St Martin's Press 408 | St. Petersburg Times 409 | Lehman Brothers 410 | Shell Oil Company 411 | NYU Press 412 | Hackett Publishing 413 | Caves Books Ltd. 
414 | Universum Film AG 415 | Electric Company 416 | Eastern Air Lines 417 | Penguin Press 418 | Morgan Stanley 419 | American Music 420 | Canadian National Railway 421 | American General 422 | Corriere della Sera 423 | DC Comics 424 | IBM Personal Computer 425 | El Paso 426 | XM Satellite Radio 427 | Aurum Press 428 | Douglas Aircraft 429 | London Corporation 430 | Southern Europe 431 | News of the World 432 | Volkswagen Group 433 | El Mundo 434 | Readers Digest 435 | Pennsylvania State University Press 436 | Louisiana State University Press 437 | Lawrence Berkeley National Laboratory 438 | Vega Science Trust 439 | Sirius Satellite Radio 440 | New York Journal 441 | Atari Inc. 442 | Merrill Lynch 443 | Universal Time 444 | U.S. Steel 445 | Apple Computer Inc. 446 | Japan Airlines 447 | Detroit Free Press 448 | Nazi Germany 449 | New York Herald Tribune 450 | Netscape Communications Corporation 451 | Turner Network Television 452 | Harry N. Abrams Inc. 453 | Eli Lilly and Company 454 | Financial Times 455 | Eli Lilly 456 | Safeway Inc. 457 | BBC America 458 | Boston Herald 459 | Home Depot 460 | MCI Inc. 461 | Ernst & Young 462 | San Jose Mercury News 463 | Russell Group 464 | Bank of the United States 465 | Rio Group 466 | Caribbean Development Bank 467 | Frankfurter Allgemeine 468 | Melbourne University Press 469 | Liturgical Press 470 | NBC Radio 471 | Sinauer Associates 472 | Brown and Co. 473 | BT Group 474 | Honourable East India Company 475 | Citadel Press 476 | Modern Music 477 | Rockwell International 478 | Daily Herald 479 | Cable & Wireless 480 | Plenum Press 481 | Electronic Data Systems 482 | Justice League Unlimited 483 | Westinghouse Electric Corporation 484 | Denver Post 485 | Brookhaven National Laboratory 486 | Rio Tinto 487 | J.P. 
Morgan 488 | SAP AG 489 | Beacon Press 490 | Bank of Sweden 491 | Daily Star 492 | Folk Music 493 | Best Buy 494 | Turner Broadcasting 495 | World Music 496 | CBS Corporation 497 | New Music 498 | Humanities Press 499 | Liverpool University Press 500 | Crescent International Hydrographic International Labour International Monetary Fund 501 | Tr�bner & Co. 502 | Continental Europe 503 | London Weekend Television 504 | North Point Press 505 | National Post 506 | British Aircraft Corporation 507 | Pearson Education Inc. 508 | Early Music 509 | Boeing Integrated Defense Systems 510 | Curzon Press 511 | Metro Goldwyn Mayer 512 | Union Carbide 513 | Kyle Cathie Limited 514 | Southern Italy 515 | Broadview Press 516 | NATO's Partnership 517 | World Bank 518 | Hal Leonard Corporation 519 | Virgin Media 520 | NSU Motorenwerke AG 521 | Sutton Publishing 522 | Standard Oil Company 523 | Belfast Telegraph 524 | Weinstein Company 525 | Bilderberg Group 526 | French East India Company 527 | General Motors Corporation 528 | JHU Press 529 | Houghton Mifflin Co. 530 | Ten Speed Press 531 | Air France 532 | Alaska Airlines 533 | Virgin Atlantic Airways 534 | Warner Communications 535 | Knights of Columbus 536 | International Trade 537 | Martin Marietta 538 | Super Mario Bros 539 | Thales Group 540 | Merck & Co. 541 | Sandia National Laboratories 542 | Dexter's Laboratory 543 | Yorkshire Television 544 | Taipei Times 545 | Victor Talking Machine Company 546 | University Press of America 547 | Palm Inc. 548 | Southern Ontario 549 | H. J. 
Heinz Company 550 | Analog Devices 551 | Fox Network 552 | ABC Radio 553 | Bank of Scotland 554 | British South Africa Company 555 | National Academy Press 556 | Rio Tinto Group 557 | Pan American Airways 558 | Marconi Company 559 | National Bank 560 | Southern Rhodesia 561 | English East India Company 562 | Bristol Aeroplane Company 563 | Sperry Corporation 564 | Barclays Bank 565 | Raven Software 566 | AMS Press 567 | Harcourt Brace Jovanovich 568 | Mercer University Press 569 | Grove Press 570 | Carl Zeiss AG 571 | Ralph Lauren 572 | Encyclop�dia Britannica Inc. 573 | Abbeville Press 574 | Lucent Technologies 575 | AOL Time Warner 576 | Die Welt 577 | Father and Son 578 | Chicago University Press 579 | Monsters Inc. 580 | American Broadcasting 581 | Texas A&M University Press 582 | Exxon Mobil 583 | IEEE Press 584 | Mario Bros 585 | UBS AG 586 | Lawrence Erlbaum Associates 587 | National Broadcasting Company 588 | Slave Trade 589 | Rolls-Royce plc 590 | David Steel 591 | D. Van Nostrand Company 592 | 20th Century Fox 593 | South Vietnam 594 | Earth Metrics Inc. 595 | Edwin Mellen Press 596 | Atlanta Journal 597 | McGraw-Hill Book Company 598 | Catalan Company 599 | Clear Channel 600 | MIT Lincoln Laboratory 601 | Walt Disney Parks and Resorts 602 | Rotten Tomatoes 603 | Human Resources 604 | Carolina Academic Press 605 | News Corp. 606 | Kent State University Press 607 | Open University Press 608 | Hearst Corporation 609 | NZ Herald 610 | Computer Sciences 611 | Hogarth Press 612 | Cathay Pacific 613 | B. Eerdmans Publishing 614 | Naval Research Laboratory 615 | Canadian Press 616 | Coachella Valley Music 617 | W.W. Norton & Co. 
618 | Standard & Poor's 619 | Boeing Company 620 | McFarland & Company 621 | Bank of China 622 | Iberia Airlines 623 | Sky Television plc 624 | ACM Press 625 | France Telecom 626 | La Repubblica 627 | Thievery Corporation 628 | Central Bank 629 | Royal Asiatic Translation Fund 630 | Cray Research 631 | Johns Hopkins Press 632 | Gulf Oil 633 | Miramax Films 634 | Sveriges Riksbank 635 | United States Steel Corporation 636 | John Hopkins University Press 637 | Macquarie Library Pty Ltd 638 | Eckert-Mauchly Computer Corporation 639 | Irish Independent 640 | Banque de France 641 | Nissan Motors 642 | Southern France 643 | Coors Brewing Company 644 | J.C. Penney 645 | Gladstone Publishing 646 | Super Mario USA 647 | Altria Group 648 | Norfolk Wildlife Trust 649 | Freedom Press 650 | J. P. Morgan 651 | du Pont 652 | Haworth Press 653 | General Dynamics Corporation 654 | Kia Motors 655 | Princeton Architectural Press 656 | Iron Mountain 657 | ING Group 658 | Guilford Press 659 | Asiana Airlines 660 | Sky News 661 | Business Wire 662 | United Press International 663 | Northeastern University Press 664 | Sheffield Academic Press 665 | Canon Inc. 666 | McGill-Queen's University Press 667 | Pacific Northwest National Laboratory 668 | Arno Press 669 | Minute Maid 670 | Film Music 671 | China Airlines 672 | Total S.A. 673 | Commonwealth Bank 674 | Dow Jones 675 | Western Digital 676 | MTV Europe 677 | Portia Group 678 | Three Rivers Press 679 | Bibliographic Resources 680 | Milton Bradley Company 681 | Washington Mutual 682 | Avon Products 683 | Norsk Hydro 684 | Dow Jones & Company 685 | Ty Inc. 686 | American Fur Company 687 | Arcade Publishing 688 | Levi Strauss 689 | Continental AG 690 | South African Airways 691 | ADV Films 692 | Bloomsbury Group 693 | Tribune Company 694 | Hilton Hotels 695 | Imperial Airways 696 | Gramophone Company 697 | Hudson Ltd. 698 | Honourable Artillery Company 699 | Berkshire Hathaway 700 | Imperial Oil 701 | Deutsche Telekom 702 | Atari Corp. 
703 | Apogee Software 704 | Phaidon Press 705 | Historical Capital 706 | Addison-Wesley Publishing 707 | Steinway & Sons 708 | All Nippon Airways 709 | Free Software 710 | W. Norton & Co. 711 | Fox Film Corporation 712 | Milwaukee Journal Sentinel 713 | Generations Network Inc. 714 | Thames Bank 715 | Arizona Republic 716 | Philip Morris USA 717 | World Trade 718 | Eastern Daily Press 719 | Left Bank 720 | Open Source Software 721 | Sea Venture 722 | Development Corporation 723 | Green and Co. 724 | Liberty Media 725 | Fox network 726 | Ashgate Publishing Ltd. 727 | Cavendish Laboratory 728 | Fannie Mae 729 | William B. Eerdmans Publishing Company 730 | South Sea Company 731 | MCI Communications 732 | British Sky Broadcasting 733 | Red Bank 734 | Hartford Courant 735 | Vivendi Universal 736 | Macmillan Publishing 737 | Sony Pictures Television 738 | Bausch & Lomb 739 | Thomson SA 740 | BHP Billiton 741 | Abercrombie & Fitch 742 | Glenn L. Martin Company 743 | Super Smash Bros. 744 | Canberra Times 745 | Philip Morris 746 | Sveriges Television 747 | MOS Technology Inc. 748 | Notre Dame Press 749 | Orange SA 750 | St. Martin�s Press 751 | United U.S. 752 | Xinhua News Agency 753 | W.W. Norton & Company Inc. 754 | Three's Company 755 | Staples Inc. 756 | Wal-Mart stores 757 | Bank of Montreal 758 | Cairns Group 759 | Freddie Mac 760 | Dallas Morning News 761 | American Motors Corporation 762 | Dell Inc. 763 | St. James Press 764 | Puma AG 765 | Yahoo! 
Music 766 | SIAM Journal 767 | Information Systems 768 | National High Magnetic Field Laboratory 769 | Los Alamos Scientific Laboratory 770 | Hershey Company 771 | National Academies Press 772 | Fortress Press 773 | Working Group 774 | Second City Television 775 | McGill-Queen's Press 776 | Wang Laboratories 777 | Arab Monetary Fund 778 | Southern Asia 779 | Thorndike Press 780 | Christian Dior 781 | Review Awards 782 | Penn State Press 783 | Artificial Intelligence Laboratory 784 | First Bank of the United States 785 | Water Resources 786 | Boston Review 787 | Scientific Data Systems 788 | British European Airways 789 | Object Management Group 790 | Chicago Daily Tribune 791 | Hot Dance Music 792 | UPS Airlines 793 | Pearson PLC 794 | American Motors 795 | Southern India 796 | Hudson Bay Company 797 | Subterranean Press 798 | British Heavy Metal 799 | Who Do You Trust 800 | New Super Mario Bros 801 | Oxford Clarendon Press 802 | Phoenix Press 803 | ION Television 804 | Montreal Gazette 805 | Bank of Canada 806 | Birmingham Small Arms Company 807 | Investment Bank 808 | Canterbury University Press 809 | Abbott Laboratories 810 | Leamington Spa 811 | BT Group plc 812 | Parker Brothers 813 | Trafford Publishing 814 | Bank of France 815 | Control Data 816 | Leicester University Press 817 | Du Pont 818 | Traditional Music 819 | Bell Atlantic 820 | MAN AG 821 | Tandy Corporation 822 | Southern Illinois University Press 823 | Penguin Books Ltd. 824 | Moody Press 825 | Mainstream Publishing 826 | SCO Group 827 | Mercer Human Resource Consulting 828 | Le monde 829 | Flerov Laboratory 830 | Scania AB 831 | Sears Roebuck 832 | Classical Music 833 | Duke Energy 834 | Mediacorp Canada Inc. 835 | John Wiley & Songs Inc. 836 | St Vladimir's Seminary Press 837 | National Cash Register Company 838 | RIA Novosti 839 | Broadcast Music 840 | Rockwell Collins 841 | Hilton Hotels Corporation 842 | Birlinn Ltd. 843 | Silicon Graphics Inc. 
844 | Indian Head 845 | Danmarks Radio 846 | Doubleday & Company 847 | Somers Isles Company 848 | Foreign Trade 849 | International Fund 850 | World Wildlife Fund 851 | AT&T Bell Labs 852 | United Fruit Company 853 | Ohio University Press 854 | Volkswagen AG 855 | Knight Ridder 856 | Publishers Inc. 857 | Wiley & Sons 858 | United Technologies 859 | Steppenwolf Theatre Company 860 | Chicago Sun Times 861 | Ashgate Publishing 862 | Southern Maryland 863 | Westminster Press 864 | D'Oyly Carte Opera Company 865 | Adamant Media Corporation 866 | Wildside Press 867 | NASA's Jet Propulsion Laboratory 868 | New Testament Introduction 869 | Southern Sudan 870 | Dow Chemical Company 871 | CNN International 872 | Getty Oil 873 | Cycorp Inc. 874 | Seven Stories Press 875 | New Line Cinema 876 | Lulu Press 877 | MacMillan Company 878 | Gemstone Publishing 879 | Popular Press 880 | Encyclopaedia Britannica Inc. 881 | Delta Airlines 882 | Thunder's Mouth Press 883 | WarioWare Inc. 884 | Private Eye 885 | Sprint Nextel 886 | Two Chief World Systems 887 | Macintosh LC 888 | British Overseas Airways Corporation 889 | Mitsubishi Heavy Industries 890 | Bad Company 891 | Moog Music 892 | Particle Data Group 893 | Old Testament Introduction 894 | Fairleigh Dickinson University Press 895 | South Bank 896 | Parthenon Press 897 | William Kaufmann Inc. 898 | Joseph Henry Press 899 | Hudson Motor Car Company 900 | Hackett Publishing Company 901 | LG Electronics 902 | Tokyo Stock Exchange 903 | ITT Corporation 904 | International Crisis Group 905 | Inter Press Service 906 | Amadeus Press 907 | Management Information Systems 908 | Edward Jones 909 | PepsiCo Inc. 910 | Chicago Times 911 | Columbia Records 912 | Foster's Group 913 | St. 
Martins Press 914 | Paris Match 915 | Indian Music 916 | Gnome Press 917 | Southern Netherlands 918 | Natural Trust 919 | Helsinki Stock Exchange 920 | Contemporary Music 921 | BBC 6 Music 922 | Central European University Press 923 | Apollo Computer 924 | Atari Corporation 925 | Merck KGaA 926 | Bombardment Group 927 | Victor Gollancz Ltd 928 | Carlsberg Laboratory 929 | ATA Airlines 930 | Simon and Schuster Inc. 931 | Imperial College Press 932 | Holland Land Company 933 | Souvenir Press 934 | Mario Bros. 935 | Marks & Spencer 936 | Lucasfilm Ltd. 937 | ABC Television 938 | Bloomsbury Publishing 939 | Canada Limited 940 | Broadcasting Corporation 941 | Hyundai Motor Company 942 | Dornier GmbH 943 | Henry Holt & Company 944 | London Journal 945 | Lincoln Laboratory 946 | Shell Oil 947 | Rocky Mountain News 948 | United Technologies Corporation 949 | Evolution Publishing 950 | Space Systems 951 | Mail & Guardian 952 | Imperial Chemical Industries 953 | Southern England 954 | SVS Press 955 | Bank of Nova Scotia 956 | Borders Group 957 | Victoria's Secret 958 | SCM Press 959 | United Artists 960 | Park Place 961 | Levi Strauss & Co. 962 | Wellcome Trust 963 | Bank of Greece 964 | Greenwood Publishing 965 | Embedded Systems 966 | Chase Manhattan Bank 967 | National Express East Anglia 968 | National Educational Television 969 | National Petroleum 970 | English Electric Company 971 | Bomb Group 972 | Weekly Standard 973 | Ursa Major Moving Group 974 | Cygnus Solutions 975 | Cambridge Computer Laboratory 976 | Lockheed Corporation 977 | Hawker Siddeley Group 978 | Bantam Press 979 | Yum! Brands Inc. 
980 | Universal Pictures 981 | Georgetown University Press 982 | Overlook Press 983 | Southern Russia 984 | AT&T Bell Laboratories 985 | IDW Publishing 986 | Fuji Television 987 | Qatar Airways 988 | Calgary Herald 989 | Disney-ABC Television Group 990 | Dow Chemical 991 | Harvill Press 992 | Mongoose Publishing 993 | Dan River 994 | Xerox Corporation 995 | Review Award for Best 996 | BAA Limited 997 | Yamaha Motor 998 | Aon Corporation 999 | Buick Motor Company 1000 | Applied Materials 1001 | St. Paul Pioneer Press 1002 | FM Radio 1003 | Paragon Book Reprint Corp. 1004 | Brown & Co. 1005 | D�il �ireann 1006 | Macmillan Publishers Ltd. 1007 | Sinclair Research Ltd 1008 | Indian Airlines 1009 | Lyons Press 1010 | International Computers Limited 1011 | Eastman Kodak Company 1012 | Saudi Arabian Airlines 1013 | Sinauer Associates Inc. 1014 | Anglia Television 1015 | LAN Airlines 1016 | White Wolf Inc. 1017 | Intervarsity Press 1018 | Virtual Laboratory 1019 | John Wiley and Sons Inc. 1020 | Freescale Semiconductor 1021 | Praeger Press 1022 | BBC Worldwide 1023 | Samsung Electronics 1024 | Samsung 1025 | Schiffer Publishing 1026 | American Telephone & Telegraph 1027 | Heineken International 1028 | LOT Polish Airlines 1029 | General Instrument 1030 | Public Radio 1031 | Tiffany & Co. 1032 | Gran Turismo 1033 | Mitsubishi Motors 1034 | Whispering Eagle Press 1035 | Harvard Business School Press 1036 | American International Group 1037 | Lutterworth Press 1038 | Cooper Car Company 1039 | Dance Music 1040 | BBC Northern Ireland 1041 | Arab Bank 1042 | DK Publishing Inc. 1043 | Pop Music 1044 | St. Regis 1045 | Tate Publishing 1046 | JPMorgan Chase 1047 | Sveriges Radio 1048 | Africa World Press 1049 | Berkley Publishing Group 1050 | PalmSource Inc. 1051 | Investor AB 1052 | Biograph Company 1053 | Research In Motion 1054 | Kawasaki Heavy Industries 1055 | Raven Press 1056 | Universal Studios 1057 | Caterpillar Inc. 1058 | J. 
Wiley & Sons 1059 | Ahmad Sa 1060 | Addison-Wesley Publishing Company 1061 | Golden West 1062 | H&R Block 1063 | RTL Group 1064 | Straits Times 1065 | Handmade Films 1066 | Newport News Shipbuilding 1067 | National Westminster Bank 1068 | Delco Electronics 1069 | British Broadcasting Company 1070 | Computer Research Corporation 1071 | Ford Motor Co. 1072 | Barclays Banks 1073 | Ars Technica 1074 | NPR Music 1075 | Lotus Software 1076 | General Electric Company plc 1077 | Chicago Review Press 1078 | Ohio State University Press 1079 | Dogger Bank 1080 | Occidental Petroleum 1081 | Frontier Airlines 1082 | DuMont Television 1083 | DuMont 1084 | Star Tribune 1085 | Peter Lang Publishing 1086 | Lotus Development Corporation 1087 | Verizon Wireless 1088 | Verizon 1089 | Carnegie Corporation 1090 | UCL Press 1091 | Volvo AB 1092 | Macmillan and Co. 1093 | Northwestern United States 1094 | Cavendish Astrophysics Radio Astronomy Group 1095 | Protein Data Bank 1096 | Lincoln National 1097 | Routledge Press 1098 | Equipment Corporation 1099 | Max Factor 1100 | Jim Henson Company 1101 | Electric Boat Company 1102 | Swatch Group Ltd. 1103 | Bell Telephone Company 1104 | Tyson Foods 1105 | Virgin Group 1106 | Academic Press Inc. 1107 | Infinity Inc. 1108 | NRC Handelsblad 1109 | Tata Motors 1110 | Landor Associates 1111 | Paramount Television 1112 | Royal Dutch Shell 1113 | Frankfurt Stock Exchange 1114 | Night Music 1115 | Rupert Murdoch's News Corporation 1116 | Ferrero SpA 1117 | Fiji Times 1118 | Crown Publishers Inc. 1119 | Columbus Dispatch 1120 | Right Bank 1121 | McFarland & Co. 1122 | Atlantic Monthly Press 1123 | Parker Pen Company 1124 | MIT Laboratory 1125 | American Telephone & Telegraph Company 1126 | London Docklands Development Corporation 1127 | Tata Group 1128 | Friden Inc. 1129 | Cartier SA 1130 | Bechtel Corporation 1131 | South African Broadcasting Corporation 1132 | South Bank Show 1133 | West Publishing Co. 
1134 | Turkish Airlines 1135 | Humana Press 1136 | Paternoster Press 1137 | His Music 1138 | Magna International 1139 | Games Workshop 1140 | Henry Holt & Co. 1141 | Kitchen Sink Press 1142 | 3rd Ed. Worth Publishing 1143 | Winnipeg Free Press 1144 | Golden Gryphon Press 1145 | Collier's Weekly 1146 | die Welt 1147 | Long Range Desert Group 1148 | ATI Technologies 1149 | Capitol Records 1150 | Archer Daniels Midland 1151 | Hughes Aircraft Company 1152 | Sussex Academic Press 1153 | Sun Microsystems Inc. 1154 | Minneapolis Tribune 1155 | MCI WorldCom 1156 | Michigan State University Press 1157 | Trusted Computing Group 1158 | Chicago Tunnel Company 1159 | Kellogg Company 1160 | Acme Corporation 1161 | Western Music 1162 | Walker & Company 1163 | Greater Arab Free Trade 1164 | Hanson plc 1165 | Lindisfarne Press 1166 | Morgan Reynolds Publishing 1167 | Otis Elevator Company 1168 | Engineering Ltd 1169 | Associated TeleVision 1170 | National Capital 1171 | Bucknell University Press 1172 | Ebury Press 1173 | Garland Publishing Inc. 1174 | Marconi Electronic Systems 1175 | BBC Scotland 1176 | News Limited 1177 | Minnesota Public Radio 1178 | Emirates Airline 1179 | Lotus Cars 1180 | Oriental Steam Navigation Company 1181 | Atlantic Records 1182 | Western Electric Company 1183 | DK Publishing 1184 | Rock Music 1185 | Dun & Bradstreet 1186 | Mars Incorporated 1187 | Cadbury plc 1188 | SBC Communications 1189 | Jet Airways 1190 | United States Playing Card Company 1191 | St. 
Vladimir's Seminary Press 1192 | Sanford and Son 1193 | Metallurgical Laboratory 1194 | Reconstruction Finance Corporation 1195 | Focal Press 1196 | Ayer Publishing 1197 | Today's Best Music 1198 | A�rospatiale SA 1199 | Danielle Steel 1200 | Blackstone Group 1201 | Internet Corporation for Assigned Names 1202 | Olympia Press 1203 | Royal Bank of Scotland 1204 | Mozilla Corporation 1205 | Contemporary Christian Music 1206 | Bain Capital 1207 | Caledonian Company 1208 | NASDAQ stock market 1209 | Teaching Company 1210 | Wayne Corporation 1211 | Cold Spring Harbor Laboratory 1212 | Muscovy Company 1213 | Public Works 1214 | Vlaamse Radio 1215 | SKY Network Television 1216 | Netscape Communications 1217 | Weird Tales 1218 | American Stock Exchange 1219 | Father and son 1220 | A&E Television Networks 1221 | Juniper Networks 1222 | Elsevier Academic Press 1223 | Edison Manufacturing Company 1224 | JP Morgan 1225 | Lattice Semiconductor 1226 | King Features Syndicate 1227 | James Clarke & Co Ltd 1228 | Air New Zealand 1229 | Cerberus Capital Management 1230 | Groupe Bull 1231 | NFL Films 1232 | SourceForge Inc. 1233 | Air Force Research Laboratory 1234 | Large Cities Climate Leadership Group 1235 | Commercial Appeal 1236 | Trans-Pacific Strategic Economic Partnership 1237 | LL.D. F.S.A. 1238 | McKinsey & Company 1239 | Phoenix Technologies 1240 | Le Journal 1241 | Aer Lingus 1242 | Maxim Integrated Products 1243 | Energy National Laboratories 1244 | El Universal 1245 | Blue Steel 1246 | Air Liquide 1247 | NLM Hazardous Substances Databank 1248 | Oxford U. Press 1249 | Timber Press 1250 | Domino's Pizza 1251 | Southern Louisiana 1252 | Blue Man Group 1253 | Origin Systems 1254 | Shanghai Stock Exchange 1255 | Wildlife Service 1256 | Pan Am 1257 | Killeen Television 1258 | Hound Group 1259 | Alfred A. Knopf Inc. 1260 | Google Inc. 
1261 | Voyageur Press 1262 | Old-Time Radio 1263 | Digital Press 1264 | New Zealand Journal 1265 | Compaq Computer Corporation 1266 | Eastern Caribbean Central Bank 1267 | Nutting Associates 1268 | All India Radio 1269 | Shockley Semiconductor Laboratory 1270 | Greatest Films 1271 | Raytheon Missile Systems 1272 | In God We Trust 1273 | MOS Technologies 1274 | Pacifica Radio 1275 | Country Music Television 1276 | H.W. Wilson Co. 1277 | Asahi Shimbun 1278 | Capital Airlines 1279 | Us Weekly 1280 | Loki Software 1281 | E.B. Eddy Company 1282 | Kenya Airways 1283 | Eastern Airlines 1284 | America West Airlines 1285 | Popular Electronics 1286 | Ralston Purina 1287 | In Music 1288 | JetBlue Airways 1289 | Engineering Laboratory 1290 | TwoMorrows Publishing 1291 | Open Court Publishing 1292 | Boots Group 1293 | IUPAP Transfermium Working Group 1294 | Moving Picture Experts Group 1295 | WB Television Network 1296 | Spirit Airlines 1297 | Princeton Plasma Physics Laboratory 1298 | Dimension Films 1299 | Fair Trading 1300 | Mohawk Industries 1301 | Dublin Corporation 1302 | Iron Crown Enterprises 1303 | RKO Radio 1304 | IBM Corporation 1305 | AirTran Airways 1306 | HD Radio 1307 | Pacific Fur Company 1308 | Ethiopian Airlines 1309 | Brookings Institution Press 1310 | Motor Car Company 1311 | ABC Radio Australia 1312 | Ace Hardware 1313 | Electronic Systems 1314 | Deutsche Werke 1315 | Baroque Music 1316 | Mead & Company 1317 | Microsoft Press 1318 | Alenia Marconi Systems 1319 | American Popular Music 1320 | East Japan Railway Company 1321 | Minnesota Law Review 1322 | Dolby Laboratories 1323 | Potash Corporation of Saskatchewan 1324 | Marconi plc 1325 | McFarland & Company Inc. 1326 | Chicago Herald 1327 | Movie Gallery 1328 | Stainless Steel 1329 | XM Radio 1330 | Canonical Ltd. 1331 | Bombardier Inc. 1332 | Lloyds TSB 1333 | Suncor Energy 1334 | Nash Motors 1335 | IEEE Software 1336 | Chamber Music 1337 | G. & C. Merriam Company 1338 | Bloomberg L.P. 
1339 | Credit Suisse 1340 | Iowa State University Press 1341 | Valero Energy Corporation 1342 | SRI International 1343 | H.W. Wilson Company 1344 | National Express East Coast 1345 | Boston History Company 1346 | Doubleday & Company Inc. 1347 | Wal-Mart Stores Inc. 1348 | Marine Biological Laboratory 1349 | Aquarian Press 1350 | Eaton Corporation 1351 | AltaMira Press 1352 | Universal Music Group 1353 | Deutsche Bundesbank 1354 | Marvell Technology Group 1355 | LSI Logic 1356 | UNESCO Publishing 1357 | Weeb Ewbank 1358 | HMV Group 1359 | Red Sea Press 1360 | Be Inc. 1361 | Florida Times 1362 | Ford Australia 1363 | Universal Music 1364 | Island Press 1365 | Marvel Comics 1366 | Mars Inc. 1367 | Toronto Stock Exchange 1368 | Southern Lebanon 1369 | World Almanac Education Group 1370 | William Heinemann Ltd. 1371 | Alliant Techsystems 1372 | Canadian University Press 1373 | Chrysler LLC 1374 | John Knox Press 1375 | Paladin Press 1376 | Axel Springer AG 1377 | New Press 1378 | Lend Lease 1379 | Columbia Pictures 1380 | USA TODAY 1381 | Water Music 1382 | Rolls-Royce Limited 1383 | Canada Dry 1384 | Crowood Press 1385 | E.P. Dutton & Co. Inc. 1386 | Wayne Enterprises 1387 | Austrian Airlines 1388 | ConAgra Foods 1389 | Vauxhall Motors 1390 | Delacorte Press 1391 | Public Image Ltd. 
1392 | Complex Systems 1393 | Construcciones Aeron�uticas SA 1394 | Fidelity Investments 1395 | Toyota Motor Corporation 1396 | Tyrell Corporation 1397 | Nation's Capital 1398 | Home Box Office 1399 | Thomas Crowell Press 1400 | Asahi Breweries 1401 | Massachusetts Review 1402 | CBS Paramount Television 1403 | Neural Networks 1404 | Hudson Ltd 1405 | Asia Television Limited 1406 | Science Applications International Corporation 1407 | Virginia Quarterly Review 1408 | Thinking Machines Corporation 1409 | Vanguard Group 1410 | Harvester Press 1411 | Braniff International Airways 1412 | Hoechst AG 1413 | Ignatius Press 1414 | National Express Group 1415 | Brussels Airlines 1416 | BNP Paribas 1417 | Quaker Oats Company 1418 | Aerospace Corporation 1419 | National Australia Bank 1420 | Broadcast Music Incorporated 1421 | EMC Corporation 1422 | RTL Television 1423 | Valve Corporation 1424 | J.B. Lippincott Company 1425 | Detroit Diesel 1426 | Presidio Press 1427 | MIT Artificial Intelligence Laboratory 1428 | Telcordia Technologies 1429 | Regnery Publishing 1430 | Museum Tusculanum Press 1431 | PS Publishing 1432 | REO Motor Car Company 1433 | International Journal 1434 | Encyclopedia Britannica Inc. 1435 | NPD Group 1436 | United States Steel 1437 | Macmillan Co. 1438 | Gorgias Press LLC 1439 | New York Times Co. 1440 | Philippine Stock Exchange 1441 | Aircraft Corporation 1442 | American Reprographics Company 1443 | Power Corporation 1444 | Macmillan Press 1445 | Bitstream Inc. 1446 | Reserve Bank of Australia 1447 | Zenith Press 1448 | Dodd Mead & Co. 1449 | Swiss International Air Lines 1450 | Brown & Company 1451 | Husky Energy 1452 | Open Systems 1453 | Phillips Petroleum 1454 | Old Town Canoe Company 1455 | Carnegie Steel Company 1456 | Midland Publishing 1457 | Associated Electrical Industries 1458 | Bay Area Laboratory 1459 | Erie Railroad Co. 
1460 | Norfolk Naturalists' Trust 1461 | Harrah's Entertainment 1462 | Marathon Petroleum Company 1463 | CABI Publishing 1464 | Amateur Radio 1465 | Al-Ahram Weekly 1466 | National Capital Region Capital 1467 | NTT DoCoMo 1468 | Tesla Motors 1469 | Green Bank 1470 | Folk Metal 1471 | United Productions of America 1472 | Fantasy Games Unlimited 1473 | BBC Canada 1474 | American Tobacco Company 1475 | Joint Photographic Experts Group 1476 | Buell Motorcycle Company 1477 | Covered Bridge Capital 1478 | Toys R Us 1479 | Canadian Broadcasting Company 1480 | Geoscience Press 1481 | Grolier Incorporated 1482 | African Slave Trade 1483 | Banca d'Italia 1484 | Austin Motor Company 1485 | British Racing Motors 1486 | TNT N.V. 1487 | People Weekly 1488 | Malaysia Airlines 1489 | San Diego Union Tribune 1490 | Philtrum Press 1491 | Formula One Management 1492 | Nippon Telegraph and Telephone 1493 | Telos Press 1494 | Open Group 1495 | Stalky & Co 1496 | Bethlehem Steel 1497 | Farmers' Loan & Trust Co. 1498 | MAN SE 1499 | Mercury News 1500 | ARM Limited 1501 | Computing Tabulating Recording Corporation 1502 | Lisp Machines Inc. 1503 | Roundabout Theatre Company 1504 | Vancouver Sun 1505 | Devon Wildlife Trust 1506 | FOX network 1507 | B. Eerdmans Publishing Co. 1508 | New York Times Company 1509 | Air University Press 1510 | MIPS Technologies 1511 | New Worlds 1512 | Continuum International Publishing 1513 | Disinformation Company 1514 | Gauntlet Press 1515 | Ashgate Publishing Ltd 1516 | World Scientific Publishing Co. 1517 | Aral AG 1518 | US Weekly 1519 | A&M Records 1520 | TV4 AB 1521 | Vanguard Press 1522 | R.H. Donnelley 1523 | Columbia Journalism Review 1524 | Aerol�neas Argentinas 1525 | Digital Research Inc. 1526 | E.W. 
Scripps 1527 | Science Fiction 1528 | Associated British Foods 1529 | Aeroflot - Russian Airlines 1530 | Database Systems 1531 | Super Junior 1532 | State Street Corporation 1533 | Kessinger Publishing LLC 1534 | Island Records 1535 | Burlington Northern Santa Fe 1536 | Medical Examination Publishing Co. 1537 | Eternal Music 1538 | International Paper 1539 | White Wolf Publishing 1540 | Public Television 1541 | Fairleigh Dickinson Univ Press 1542 | Phillips Petroleum Company 1543 | Phillips 1544 | Southern Oregon 1545 | 3Com Corporation 1546 | Pakistan International Airlines 1547 | Falling Rain Genomics Inc 1548 | Simon & Schuster Inc. 1549 | Network Working Group 1550 | E. P. Dutton & Co. 1551 | Great Lakes Airlines 1552 | Wildlife Forensics Laboratory 1553 | Literary Dictionary Company 1554 | W. Norton & Company Inc. 1555 | Thomas Nelson Inc. 1556 | C. F. Martin & Company 1557 | Kyodo News 1558 | Stagecoach Group 1559 | St. Louis Business Journal 1560 | Chevron Corporation 1561 | Mattel Inc. 1562 | News International 1563 | Jane's Defence Weekly 1564 | American Bridge Company 1565 | Dell Publishing 1566 | Soviet Russia 1567 | Kadena AB 1568 | MRC Laboratory 1569 | Network Associates 1570 | Red Hat Inc. 1571 | United Artists Records 1572 | Wallflower Press 1573 | RKO Radio Pictures Inc. 1574 | China Central Television 1575 | Confederate States of America 1576 | Blue Bell 1577 | Japan Tobacco 1578 | Monotype Corporation 1579 | Black Rose Books Ltd. 1580 | China Southern Airlines 1581 | Bobbs-Merrill Company 1582 | Chestnut Canoe Company 1583 | Intuit Inc. 1584 | Insurance Australia Group 1585 | Royal Niger Company 1586 | Power Electronics 1587 | Virgin Records 1588 | System Development Corporation 1589 | Cosgrove Hall Films 1590 | Mega Inc. 1591 | ASM Press 1592 | B. Herder Book Co. 1593 | Sinclair Oil 1594 | Long-Term Capital Management 1595 | Henry Holt and Co. 
1596 | Space Corporation 1597 | New Jersey 1598 | Bain & Company 1599 | Dana International 1600 | Banco Popular de Puerto Rico 1601 | Longmans Green & Co. 1602 | Wadsworth Publishing 1603 | General Motors Company 1604 | Thomas Y. Crowell Company 1605 | RCA Records 1606 | Mars Science Laboratory 1607 | Gulf Research Laboratories 1608 | MITRE Corporation 1609 | Pennsylvania Railroad 1610 | J.D. Power and Associates 1611 | Liz Claiborne 1612 | Top 50 Cult Films 1613 | Scarecrow Press Inc. 1614 | Cox Communications 1615 | Enron Corporation 1616 | Epoch Times 1617 | Oxford University Press Inc. 1618 | Hutchinson & Co. 1619 | Barings Bank 1620 | Applied Physics Laboratory 1621 | MTV Networks Europe 1622 | New Directions Publishing 1623 | Charles Jenkins Laboratories 1624 | Total Film 1625 | Waltham Watch Company 1626 | Xlibris Corporation 1627 | US Forest Service 1628 | Mega Party Inc. 1629 | Barnard Island Group 1630 | Korean Air 1631 | ABN AMRO 1632 | Penn Central 1633 | DuPont Company 1634 | State Capital 1635 | Postal Telegraph Company 1636 | Asian Development Bank 1637 | Triad Publishing Company 1638 | Norton Simon 1639 | Reader's Digest Association 1640 | Herald Press 1641 | Dassault Systemes 1642 | SkyWest Airlines 1643 | Abacus Software 1644 | Ace Radio 1645 | Denham Group 1646 | W. H. Freeman and Co. 
1647 | East India East India Company 1648 | Oak Knoll Press 1649 | Flinders Group 1650 | Fordham University Press 1651 | Stone Bridge Press 1652 | Sacred Music 1653 | Columbia Pictures Television 1654 | Frankland Group 1655 | Park Street Press 1656 | Philadelphia Bulletin 1657 | Acorn Computers Ltd 1658 | Rover Company 1659 | MTV Networks 1660 | Radiation Laboratory 1661 | Jack in the Box 1662 | Industrial Development Corporation 1663 | Athlone Press 1664 | Consultative Group 1665 | Cardinal Health 1666 | City Bridge Trust 1667 | Cork University Press 1668 | Manx Wildlife Trust 1669 | Eiffel Software 1670 | Southern Illinois 1671 | Hudson�s Bay Company 1672 | Sirius XM Radio 1673 | BBC Two 1674 | British Music 1675 | Norton & Company 1676 | Marlowe & Company 1677 | Living Resources 1678 | Seattle Weekly 1679 | Copa Airlines 1680 | Colt's Manufacturing Company 1681 | Agency for the Prohibition 1682 | DIANE Publishing 1683 | Booz Allen 1684 | Broadman Press 1685 | Specialty Press 1686 | Creative Labs 1687 | Mitsubishi Electric 1688 | KPMG Europe LLP 1689 | Avalon Hill 1690 | Mitsubishi UFJ Financial Group 1691 | NATO Partnership 1692 | MCA Inc. 1693 | Mountain Music 1694 | Inner City Press 1695 | Waveland Press 1696 | BiblioBazaar LLC 1697 | Howick Group 1698 | William Morrow & Co. 1699 | Dick's Sporting Goods 1700 | Bath Spa 1701 | Harcourt Brace & Company 1702 | Plumas Transit Systems 1703 | Ambrosia Software 1704 | Ruth's Chris 1705 | Arkham House 1706 | Shanghai Automotive Industry Corporation 1707 | ARM Holdings 1708 | Queen's Music 1709 | Best Films 1710 | New Holland Ag 1711 | Turtle Beach Systems 1712 | GNU Radio 1713 | Ringling Bros. 1714 | Pullman Company 1715 | Oliphants Ltd. 1716 | Visa Inc. 1717 | American can 1718 | Johnson Controls 1719 | Blackwell Publishing Ltd. 
1720 | Computer Laboratory 1721 | Short Brothers 1722 | Campbell Soup 1723 | Tandem Computers 1724 | Shugart Associates 1725 | National Bus Company 1726 | United Nations Industrial Development Universal Postal Union 1727 | Waste management 1728 | Great Beacon Press 1729 | Bell Canada 1730 | MIPS Computer Systems Inc. 1731 | Metal Machine Music 1732 | Open Court Publishing Company 1733 | Eyre Methuen Ltd 1734 | Standard Chartered Bank 1735 | John Benjamins Publishing Company 1736 | Fe Co 1737 | United States Naval Institute Press 1738 | Anglo American plc 1739 | Cisco Systems Inc. 1740 | Resorts Company 1741 | Barclays plc 1742 | Metropolitan Life Insurance Company 1743 | Pacific Crest 1744 | Thomas Y. Crowell Co. 1745 | Schlumberger Limited 1746 | Nimbus Publishing 1747 | Milton Hershey School Trust 1748 | Pharaonic Egypt 1749 | Tyndale Press 1750 | Ford of Europe 1751 | Playboy Enterprises 1752 | Vueling Airlines 1753 | Fuji Heavy Industries 1754 | Audi AG 1755 | Perseus Publishing 1756 | Rand Corporation 1757 | Plant Genetic Resources 1758 | Soncino Press 1759 | De Nederlandsche Bank 1760 | Briggs & Stratton 1761 | KM Group 1762 | El Pais 1763 | NEC Corporation 1764 | Ford Crown Victoria 1765 | Orlando Sentinel 1766 | General Foods 1767 | Crown Publishers Inc 1768 | Morris Motor Company 1769 | Simulations Publications Inc. 1770 | Henry Holt & Co 1771 | Warner Music 1772 | Bayer AG 1773 | Australian Securities Exchange 1774 | Bank of Japan 1775 | Ansett Australia 1776 | Four Courts Press 1777 | E! Entertainment Television 1778 | Black Entertainment Television 1779 | Computer Associates 1780 | Clear Channel Communications 1781 | Bangemall Group 1782 | MICRA Inc. 1783 | Bear Stearns 1784 | E. Remington and Sons 1785 | Lucas Industries plc 1786 | Irish Music 1787 | Trans-Canada Airlines 1788 | Royal Bank of Canada 1789 | West Publishing 1790 | Northern Rock 1791 | Oxford English Dictionary 1792 | Sri Lanka Telecom 1793 | Wonderwall Music 1794 | America Online Inc. 
1795 | Bristol-Myers Squibb 1796 | New York Inc. 1797 | Turner Publishing Company 1798 | Mayfield Publishing 1799 | Valero Energy 1800 | Northrop Grumman Corporation 1801 | Visible Ink Press 1802 | Woody Press 1803 | Amsterdam Stock Exchange 1804 | Quarry Bank 1805 | NXP Semiconductors 1806 | Rough Guides Ltd. 1807 | Sudan Airways 1808 | Sony Corporation 1809 | Sony 1810 | Quintet Publishing 1811 | Aldine Press 1812 | National Railway Company 1813 | Essex Wildlife Trust 1814 | Dairy Queen 1815 | Publisher Inc. 1816 | Pacific Southwest Airlines 1817 | South China Morning Post 1818 | Ames Laboratory 1819 | Gem Trade 1820 | Kinney National Company 1821 | Polar Music 1822 | American World Airways 1823 | Indian Classical Music 1824 | Whole Foods Market 1825 | Local Group 1826 | Edison Trust 1827 | Engineering Research Associates 1828 | Pernod Ricard 1829 | Sterling Publishing Company Inc 1830 | Hawaiian Airlines 1831 | 1832 | -------------------------------------------------------------------------------- /honorifics.txt: -------------------------------------------------------------------------------- 1 | A. 2 | Adj. 3 | Adm. 4 | Adv. 5 | Asst. 6 | B. 7 | Bart. 8 | Brig. 9 | Bros. 10 | C. 11 | Capt. 12 | Cmdr. 13 | Col. 14 | Comdr. 15 | Con. 16 | Cpl. 17 | D. 18 | DR. 19 | Dr. 20 | E. 21 | Ens. 22 | F. 23 | G. 24 | Gen. 25 | Gov. 26 | H. 27 | Hon. 28 | Hosp. 29 | I. 30 | Insp. 31 | J. 32 | K. 33 | L. 34 | Lt. 35 | M. 36 | M. 37 | MM. 38 | MR. 39 | MRS. 40 | MS. 41 | Maj. 42 | Messrs. 43 | Mlle. 44 | Mme. 45 | Mr. 46 | Mrs. 47 | Ms. 48 | Msgr. 49 | N. 50 | O. 51 | Op. 52 | Ord. 53 | P. 54 | Pfc. 55 | Ph. 56 | Prof. 57 | Pvt. 58 | Q. 59 | R. 60 | Rep. 61 | Reps. 62 | Rev. 63 | S. 64 | Sen. 65 | Sens. 66 | Sfc. 67 | Sgt. 68 | Sr. 69 | St. 70 | Supt. 71 | T. 72 | U. 73 | V. 74 | W. 75 | X. 76 | Y. 77 | Z. 78 | v. 79 | vs. 
80 | -------------------------------------------------------------------------------- /placenames.txt: -------------------------------------------------------------------------------- 1 | Barbados:country 2 | Hudson:us_city 3 | Japan:country 4 | Vientiane:country_capital 5 | Pago Pago:country_capital 6 | Michigan:us_state 7 | Malaysia:country 8 | Sedona:us_city 9 | Texas:us_state 10 | Larami:us_city 11 | P'yongyang:country_capital 12 | Samoa:country 13 | Praia:country_capital 14 | Buenos Aires:country_capital 15 | Anguilla:country 16 | Bethel:us_city 17 | Botswana:country 18 | Schenectady:us_city 19 | Northern Mariana Islands:country 20 | Kiribati:country 21 | Bridgetown:country_capital 22 | Basseterre:country_capital 23 | Evanston:us_city 24 | North Little Rock:us_city 25 | Somalia:country 26 | Gaza Strip:country 27 | Tempe:us_city 28 | Rio Rancho:us_city 29 | Brooklyn Park:us_city 30 | Layton:us_city 31 | Qatar:country 32 | Bishkek:country_capital 33 | Baker Island:country 34 | Stamford:us_city 35 | Majuro:country_capital 36 | Kuala Lumpur:country_capital 37 | Warner Robins:us_city 38 | Midwest City:us_city 39 | Florissant:us_city 40 | Kansas City:us_city 41 | Fairmont:us_city 42 | Biloxi:us_city 43 | Marietta:us_city 44 | Franklin:us_city 45 | Flagstaff:us_city 46 | North Dakota:us_state 47 | Westminster:us_city 48 | Delaware:us_state 49 | Haiti:country 50 | Providence:us_city 51 | Bend:us_city 52 | United States:country 53 | Burkina Faso:country 54 | Roanoke:us_city 55 | West Valley City:us_city 56 | Havre:us_city 57 | Rogers:us_city 58 | Rockford:us_city 59 | Guam:country 60 | Kenner:us_city 61 | Butte:us_city 62 | Conakry:country_capital 63 | Hanoi:country_capital 64 | Wahpeton:us_city 65 | Quito:country_capital 66 | Hollywood:us_city 67 | Bosnia:country 68 | Herzegovina:country 69 | Liberia:country 70 | Washington:us_state 71 | Saint Joseph:us_city 72 | Nicaragua:country 73 | Dover:us_city 74 | Fort Wayne:us_city 75 | New Caledonia:country 76 | Moore:us_city 
77 | Warwick:us_city 78 | British Indian Ocean Territory:country 79 | Glorioso Islands:country 80 | Fort Collins:us_city 81 | Egypt:country 82 | New Haven:us_city 83 | Norfolk:us_city 84 | Jamaica:country 85 | Paramaribo:country_capital 86 | Douglas:country_capital 87 | Kingston:country_capital 88 | Valletta:country_capital 89 | Wichita:us_city 90 | New Castle:us_city 91 | Iraq:country 92 | Gibraltar:country_capital 93 | Zambia:country 94 | Iran:country 95 | Sioux City:us_city 96 | Ohio:us_state 97 | Portland:us_city 98 | Rochester:us_city 99 | Canberra:country_capital 100 | Peru:country 101 | Enid:us_city 102 | Montgomery:us_city 103 | West Island:country_capital 104 | Nairobi:country_capital 105 | Florida:us_state 106 | New Zealand:country 107 | Kodiak:us_city 108 | Clarksville:us_city 109 | Gadsden:us_city 110 | Mesa:us_city 111 | Charlotte:us_city 112 | Gary:us_city 113 | Corvallis:us_city 114 | British Virgin Islands:country 115 | Micronesia:country 116 | Bamako:country_capital 117 | Washington:us_state 118 | Portugal:country 119 | Turkmenistan:country 120 | Broken Arrow:us_city 121 | Cambodia:country 122 | Norman:us_city 123 | Congo:country 124 | Albuquerque:us_city 125 | Philadelphia:us_city 126 | Spokane:us_city 127 | Middletown:us_city 128 | Hilo:us_city 129 | Senegal:country 130 | Brunswick:us_city 131 | Angola:country 132 | Aurora:us_city 133 | Damascus:country_capital 134 | Baton Rouge:us_city 135 | Rapid City:us_city 136 | Minneapolis:us_city 137 | Greenville:us_city 138 | Adamstown:country_capital 139 | Suriname:country 140 | Ecuador:country 141 | Sofia:country_capital 142 | Rockville:us_city 143 | Lawrence:us_city 144 | Columbia:us_city 145 | Saint Louis:us_city 146 | Federal Way:us_city 147 | French Polynesia:country 148 | North Charleston:us_city 149 | Lebanon:country 150 | Belarus:country 151 | Lorain:us_city 152 | Portsmouth:us_city 153 | Auburn:us_city 154 | Ukraine:country 155 | Aruba:country 156 | Kiev:city 157 | Erie:us_city 158 | 
Suva:country_capital 159 | Eau Claire:us_city 160 | Livonia:us_city 161 | Quincy:us_city 162 | Vaduz:country_capital 163 | Frederick:us_city 164 | Ljubljana:country_capital 165 | Cyprus:country 166 | Wisconsin:us_state 167 | Biddeford:us_city 168 | Mogadishu:country_capital 169 | Ann Arbor:us_city 170 | Provo:us_city 171 | Abu Dhabi:country_capital 172 | Fremont:us_city 173 | Bahrain:country 174 | Sweden:country 175 | Dominican Republic:country 176 | Maldives:country 177 | Oklahoma City:us_city 178 | Madrid:country_capital 179 | Guinea-Bissau:country 180 | Conway:us_city 181 | Saint Helier:country_capital 182 | Copenhagen:country_capital 183 | Niamey:country_capital 184 | Indianapolis:us_city 185 | Port Moresby:country_capital 186 | Appleton:us_city 187 | Vatican City:country_capital 188 | Kingman Reef:country 189 | Youngstown:us_city 190 | Grenada:country 191 | Tuvalu:country 192 | Albany:us_city 193 | Iowa City:us_city 194 | Bangor:us_city 195 | Sandy:us_city 196 | Durham:us_city 197 | Djibouti:country_capital 198 | United Arab Emirates:country 199 | Milford:us_city 200 | Moldova:country 201 | Italy:country 202 | Syracuse:us_city 203 | Gambia:country 204 | Hattiesburg:us_city 205 | Buffalo:us_city 206 | West Fargo:us_city 207 | New York:us_city 208 | High Point:us_city 209 | Kingstown:country_capital 210 | Saint-Denis:country_capital 211 | Ulaanbaatar:country_capital 212 | Jerusalem:city 213 | Salisbury:us_city 214 | College Park:us_city 215 | Scranton:us_city 216 | Overland Park:us_city 217 | Bozeman:us_city 218 | Barrow:us_city 219 | Vienna:country_capital 220 | Road Town:country_capital 221 | Brookings:us_city 222 | Sierra Leone:country 223 | Anaheim:us_city 224 | Los Angeles:us_city 225 | Hot Springs:us_city 226 | Nevada:us_state 227 | Phoenix:us_city 228 | Martinique:country 229 | Dhaka:country_capital 230 | Beijing:country_capital 231 | Sri Lanka:country 232 | Memphis:us_city 233 | Atlanta:us_city 234 | Grand Forks:us_city 235 | Montana:us_state 236 | 
Wasilla:us_city 237 | Maryland:us_state 238 | Pembroke Pines:us_city 239 | Oman:country 240 | Pawtucket:us_city 241 | Armenia:country 242 | Denver:us_city 243 | Iceland:country 244 | Gillette:us_city 245 | Sanford:us_city 246 | Belgium:country 247 | Lancaster:us_city 248 | Monrovia:country_capital 249 | Edison Township:us_city 250 | New Hampshire:us_state 251 | Banjul:country_capital 252 | Nashville:us_city 253 | Doha:country_capital 254 | Algeria:country 255 | Marshall Islands:country 256 | Kalispell:us_city 257 | Switzerland:country 258 | Mauritania:country 259 | Manchester:us_city 260 | Oranjestad:country_capital 261 | Charleston:us_city 262 | Ouagadougou:country_capital 263 | Bossier City:us_city 264 | Beaverton:us_city 265 | Zimbabwe:country 266 | Carlsbad:us_city 267 | New Delhi:country_capital 268 | West Hartford:us_city 269 | Jordan:country 270 | Jersey:country 271 | Louisville:us_city 272 | Rome:country_capital 273 | Bucharest:country_capital 274 | Saint Kitts:country 275 | Bhutan:country 276 | San Diego:us_city 277 | Elizabeth:us_city 278 | Georgia:country 279 | Greensboro:us_city 280 | Antarctica:country 281 | Jonesboro:us_city 282 | Honiara:country_capital 283 | Netherlands:country 284 | Israel:country 285 | Dushanbe:country_capital 286 | Chandler:us_city 287 | Decatur:us_city 288 | Taylorsville:us_city 289 | Cincinnati:us_city 290 | Colorado:us_state 291 | Tucson:us_city 292 | Tupelo:us_city 293 | Huron:us_city 294 | Libreville:country_capital 295 | Manhattan:us_city 296 | Pakistan:country 297 | Winston-Salem:us_city 298 | Thailand:country 299 | Bandar Seri Begawan:country_capital 300 | Macedonia:country 301 | Honduras:country 302 | Bloomington:us_city 303 | Newark:us_city 304 | Europa Island:country 305 | Reading:us_city 306 | Lusaka:country_capital 307 | Hillsboro:us_city 308 | Cedar Rapids:us_city 309 | Tuscaloosa:us_city 310 | San Jose:country_capital 311 | Phnom Penh:country_capital 312 | Taiwan:country 313 | Nicosia:country_capital 314 | 
Canada:country 315 | Grand Rapids:us_city 316 | Minsk:country_capital 317 | Lome:country_capital 318 | Bangladesh:country 319 | Hong Kong:country 320 | New Britain:us_city 321 | Saint John's:country_capital 322 | Kinshasa:country_capital 323 | South Dakota:us_state 324 | Prague:country_capital 325 | Mali:country 326 | Bujumbura:country_capital 327 | Cambridge:us_city 328 | Little Rock:us_city 329 | Moscow:country_capital 330 | Augusta:us_city 331 | Male:country_capital 332 | La Paz:city 333 | South Bend:us_city 334 | Aberdeen:us_city 335 | Ashgabat:country_capital 336 | Kansas:us_state 337 | Omaha:us_city 338 | Pittsburgh:us_city 339 | Comoros:country 340 | Annapolis:us_city 341 | Sanaa:country_capital 342 | Solomon Islands:country 343 | Kampala:country_capital 344 | Lexington:us_city 345 | Westland:us_city 346 | Nepal:country 347 | Rock Hill:us_city 348 | Chicago:us_city 349 | Orlando:us_city 350 | Thornton:us_city 351 | New Jersey:us_state 352 | Kathmandu:country_capital 353 | Huntington:us_city 354 | Macau:country 355 | Cedar Falls:us_city 356 | Arvada:us_city 357 | Luxembourg:country_capital 358 | Guernsey:country 359 | Canton:us_city 360 | Shawnee:us_city 361 | Springfield:us_city 362 | Davenport:us_city 363 | Nassau:country_capital 364 | Chisinau:country_capital 365 | Bowling Green:us_city 366 | Tirana:country_capital 367 | Louisiana:us_state 368 | Hoover:us_city 369 | Panama:country_capital 370 | Oslo:country_capital 371 | Mozambique:country 372 | Sterling Heights:us_city 373 | Benin:country 374 | Saint Petersburg:us_city 375 | Argentina:country 376 | Lowell:us_city 377 | N'Djamena:country_capital 378 | Dublin:country_capital 379 | Saint Lucia:country 380 | South Portland:us_city 381 | Ghana:country 382 | Woonsocket:us_city 383 | Sao Tome:country_capital 384 | El Paso:us_city 385 | Lansing:us_city 386 | Mandan:us_city 387 | Kazakhstan:country 388 | Las Cruces:us_city 389 | Eugene:us_city 390 | West Jordan:us_city 391 | Tunisia:country 392 | Hastings:us_city 
393 | Boston:us_city 394 | Reunion:country 395 | Anchorage:us_city 396 | Richmond:us_city 397 | Ketchikan:us_city 398 | Johnson City:us_city 399 | Parkersburg:us_city 400 | Tampa:us_city 401 | Las Vegas:us_city 402 | Mount Pleasant:us_city 403 | Hungary:country 404 | Duluth:us_city 405 | Dickinson:us_city 406 | Cuba:country 407 | Santiago:country_capital 408 | Toledo:us_city 409 | West Warwick:us_city 410 | Victoria:country_capital 411 | Alaska:us_state 412 | Detroit:us_city 413 | Thimphu:country_capital 414 | Smyrna:us_city 415 | Mexico:country_capital 416 | Vanuatu:country 417 | Jakarta:country_capital 418 | Des Moines:us_city 419 | Albania:country 420 | Kyrgyzstan:country 421 | Mitchell:us_city 422 | Arkansas:us_state 423 | Kentucky:us_state 424 | Castries:country_capital 425 | Guinea:country 426 | Korea:country 427 | Bolivia:country 428 | Gaborone:country_capital 429 | Meridian:us_city 430 | South Africa:country 431 | Maine:us_state 432 | Cook Islands:country 433 | Tennessee:us_state 434 | Cayenne:country_capital 435 | Cayman Islands:country 436 | Paraguay:country 437 | Independence:us_city 438 | Mayotte:country 439 | Alofi:country_capital 440 | Dearborn:us_city 441 | Muscat:country_capital 442 | Saint Peter Port:country_capital 443 | Henderson:us_city 444 | Turkey:country 445 | Germany:country 446 | Hamilton:country_capital 447 | Lincoln:us_city 448 | Islamabad:country_capital 449 | Burundi:country 450 | Minot:us_city 451 | Houston:us_city 452 | Christmas Island:country 453 | Yuma:us_city 454 | Concord:us_city 455 | Pennsylvania:us_state 456 | Oakland:us_city 457 | Connecticut:us_state 458 | Georgetown:country_capital 459 | Midway Islands:country 460 | Austria:country 461 | Mount Vernon:us_city 462 | Salt Lake City:us_city 463 | Cary:us_city 464 | Port-au-Prince:country_capital 465 | Finland:country 466 | Gabon:country 467 | Clovis:us_city 468 | Elko:us_city 469 | Spratly Islands:country 470 | Farmington:us_city 471 | Cumberland:us_city 472 | Hawaii:us_state 
473 | Bogota:country_capital 474 | Jersey City:us_city 475 | Basse-Terre:country_capital 476 | Malta:country 477 | French Guiana:country 478 | Falkland Islands:country 479 | Czech Republic:country 480 | Croatia:country 481 | Lesotho:country 482 | Scottsdale:us_city 483 | Afghanistan:country 484 | Western Sahara:country 485 | Atlantic Ocean:country 486 | Libya:country 487 | New Mexico:us_state 488 | Australia:country 489 | Palau:country 490 | Monroe:us_city 491 | El Salvador:country 492 | Zagreb:country_capital 493 | Hartford:us_city 494 | West Virginia:us_state 495 | San Francisco:us_city 496 | Cameroon:country 497 | Green Bay:us_city 498 | Vancouver:us_city 499 | Chad:country 500 | Pretoria:us_city 501 | Denmark:country 502 | Gilbert:us_city 503 | Avarua:country_capital 504 | Bassas da India:country 505 | Lewiston:us_city 506 | Latvia:country 507 | Hagerstown:us_city 508 | Great Falls:us_city 509 | Upper Darby Twp:us_city 510 | Mililani:us_city 511 | Scarborough:us_city 512 | Laos:country 513 | Hilton Head Island:us_city 514 | Juan de Nova Island:country 515 | Jackson:us_city 516 | Utah:us_state 517 | Singapore:country_capital 518 | Alamogordo:us_city 519 | Danbury:us_city 520 | Virginia:us_state 521 | Indiana:us_state 522 | Wilmington:us_city 523 | New Rochelle:us_city 524 | Eritrea:country 525 | Frankfort:us_city 526 | Milwaukee:us_city 527 | Tehran:country_capital 528 | Kenai:us_city 529 | Greenwich:us_city 530 | Lee's Summit:us_city 531 | Mesquite:us_city 532 | Venezuela:country 533 | St. 
George:us_city 534 | London:country_capital 535 | Lilongwe:country_capital 536 | Managua:country_capital 537 | Tallahassee:us_city 538 | Clipperton Island:country 539 | Vietnam:country 540 | Dover Township:us_city 541 | Knoxville:us_city 542 | Palikir:country_capital 543 | Huntsville:us_city 544 | Dominica:country 545 | Brockton:us_city 546 | Khartoum:country_capital 547 | Alabama:us_state 548 | Colombia:country 549 | Equatorial Guinea:country 550 | Coral Sea Islands:country 551 | Fargo:us_city 552 | West Bank:country 553 | Algiers:country_capital 554 | Anderson:us_city 555 | Bridgeport:us_city 556 | Reno:us_city 557 | Navassa Island:country 558 | Morocco:country 559 | Jacksonville:us_city 560 | Sacramento:us_city 561 | Tarawa:country_capital 562 | Johnston Atoll:country 563 | Berlin:country_capital 564 | San Juan:country_capital 565 | Spartanburg:us_city 566 | Plymouth:us_city 567 | Pine Bluff:us_city 568 | Yemen:country 569 | Kenya:country 570 | Norwalk:us_city 571 | Russia:country 572 | Miles City:us_city 573 | Windhoek:country_capital 574 | Oklahoma:us_state 575 | Tunis:country_capital 576 | Skopje:country_capital 577 | Colorado Springs:us_city 578 | Papua New Guinea:country 579 | Sarajevo:country_capital 580 | Missouri:us_state 581 | Trenton:us_city 582 | Bermuda:country 583 | Congo:country 584 | Glendale:us_city 585 | San Marino:country_capital 586 | Bissau:country_capital 587 | Antigua:country 588 | Barbuda:country 589 | Chesapeake:us_city 590 | Guadeloupe:country 591 | Dakar:country_capital 592 | Athens:country_capital 593 | Hampton:us_city 594 | Netherlands Antilles:country 595 | Tromelin Island:country 596 | Asuncion:country_capital 597 | Parma:us_city 598 | San Salvador:country_capital 599 | Manama:country_capital 600 | Brazil:country 601 | The Valley:country_capital 602 | Havana:country_capital 603 | Tanzania:country 604 | Miami:us_city 605 | Owensboro:us_city 606 | Salem:us_city 607 | Columbus:us_city 608 | Nuku'alofa:country_capital 609 | 
Billings:us_city 610 | Waterbury:us_city 611 | Minnesota:us_state 612 | Hobbs:us_city 613 | Florence:us_city 614 | Bouvet Island:country 615 | Papeete:country_capital 616 | Roseau:country_capital 617 | Virginia Beach:us_city 618 | Saint Charles:us_city 619 | Luanda:country_capital 620 | Amsterdam:city 621 | West Allis:us_city 622 | Slovenia:country 623 | Bellevue:us_city 624 | Oshkosh:us_city 625 | Anaconda:us_city 626 | Guatemala:country_capital 627 | Springdale:us_city 628 | Boulder:us_city 629 | Alexandria:us_city 630 | Carson City:us_city 631 | Yerevan:country_capital 632 | Fort Lauderdale:us_city 633 | Waukesha:us_city 634 | Philippines:country 635 | Sioux Falls:us_city 636 | Clarksburg:us_city 637 | Seaford:us_city 638 | New Orleans:us_city 639 | Nigeria:country 640 | Arlington:us_city 641 | Flint:us_city 642 | Grand Island:us_city 643 | Fallon:us_city 644 | Racine:us_city 645 | Wyoming:us_state 646 | Vermillion:us_city 647 | Wheeling:us_city 648 | none:country_capital 649 | Montserrat:country 650 | India:country 651 | Belize:country 652 | Kearney:us_city 653 | Howland Island:country 654 | Andorra:country 655 | East Providence:us_city 656 | Namibia:country 657 | Saint Pierre:country 658 | Miquelon:country 659 | Guyana:country 660 | Nebraska:us_state 661 | Wellington:country_capital 662 | Santa Ana:us_city 663 | Swaziland:country 664 | Evansville:us_city 665 | Tulsa:us_city 666 | Dubuque:us_city 667 | Yonkers:us_city 668 | Gulfport:us_city 669 | Fresno:us_city 670 | France:country 671 | Southern Ocean:country 672 | Worcester:us_city 673 | Yakima:us_city 674 | Edmond:us_city 675 | Arctic Ocean:country 676 | Jamestown:country_capital 677 | Akron:us_city 678 | Madagascar:country 679 | Cody:us_city 680 | Greece:country 681 | Fort Worth:us_city 682 | Cape Verde:country 683 | Central African Republic:country 684 | Illinois:us_state 685 | Ethiopia:country 686 | Lawton:us_city 687 | Saipan:country_capital 688 | Yaounde:country_capital 689 | Bellingham:us_city 690 | 
Faroe Islands:country 691 | Seychelles:country 692 | Terre Haute:us_city 693 | Tokyo:country_capital 694 | Slovakia:country 695 | Rabat:country_capital 696 | Trinidad:country 697 | Tobago:country 698 | Belgrade:country_capital 699 | Dallas:us_city 700 | Montevideo:country_capital 701 | Norway:country 702 | Niue:country 703 | Malabo:country_capital 704 | Watertown:us_city 705 | Bismarck:us_city 706 | San Antonio:us_city 707 | North Las Vegas:us_city 708 | Lima:country_capital 709 | Plymouth:us_city 710 | Tegucigalpa:country_capital 711 | Fall River:us_city 712 | Tallinn:country_capital 713 | Port-of-Spain:country_capital 714 | Longyearbyen:country_capital 715 | Covington:us_city 716 | Tonga:country 717 | Gaithersburg:us_city 718 | Kuwait:country_capital 719 | Svalbard:country 720 | Beirut:country_capital 721 | Maseru:country_capital 722 | Uruguay:country 723 | Naperville:us_city 724 | Kingsport:us_city 725 | Green River:us_city 726 | Greenland:country 727 | Liechtenstein:country 728 | Vermont:us_state 729 | Norfolk Island:country 730 | Saint-Pierre:country_capital 731 | Lisbon:country_capital 732 | Bulgaria:country 733 | Waukegan:us_city 734 | Paris:country_capital 735 | California:us_state 736 | Brazzaville:country_capital 737 | Coventry:us_city 738 | Pueblo:us_city 739 | Fayetteville:us_city 740 | Antananarivo:country_capital 741 | Baghdad:country_capital 742 | Ogden:us_city 743 | Amman:country_capital 744 | Budapest:country_capital 745 | Manila:country_capital 746 | Lake Charles:us_city 747 | Waterloo:us_city 748 | China:country 749 | Torshavn:country_capital 750 | Lakewood:us_city 751 | Chattanooga:us_city 752 | Murfreesboro:us_city 753 | Addis Ababa:country_capital 754 | Topeka:us_city 755 | Stockholm:country_capital 756 | Saint Helena:country 757 | Tacoma:us_city 758 | Sitka:us_city 759 | Puerto Rico:country 760 | Medford:us_city 761 | Reykjavik:country_capital 762 | Sumter:us_city 763 | Jarvis Island:country 764 | Helsinki:country_capital 765 | Costa 
Rica:country 766 | Birmingham:us_city 767 | Ottawa:country_capital 768 | Bethlehem:us_city 769 | Brasilia:country_capital 770 | Macon:us_city 771 | Helena:us_city 772 | The Settlement:country_capital 773 | Cheyenne:us_city 774 | Togo:country 775 | Taipei:country_capital 776 | Brunei:country 777 | Utica:us_city 778 | Paterson:us_city 779 | Fort Smith:us_city 780 | Idaho:us_state 781 | Brussels:country_capital 782 | South Carolina:us_state 783 | Saint George's:country_capital 784 | T'bilisi:country_capital 785 | Asheville:us_city 786 | Burma:country 787 | Woodbridge Township:us_city 788 | Stanley:country_capital 789 | Pitcairn Islands:country 790 | Paracel Islands:country 791 | Pacific Ocean:country 792 | Harare:country_capital 793 | Weirton:us_city 794 | Yankton:us_city 795 | Saudi Arabia:country 796 | Garland:us_city 797 | Nauru:country 798 | Ames:us_city 799 | Riverton:us_city 800 | Saint Peters:us_city 801 | Ankara:country_capital 802 | Bowie:us_city 803 | Uzbekistan:country 804 | Freetown:country_capital 805 | Plano:us_city 806 | Mobile:us_city 807 | Williston:us_city 808 | Indonesia:country 809 | Charlotte Amalie:country_capital 810 | Port Louis:country_capital 811 | Stillwater:us_city 812 | Chile:country 813 | Dayton:us_city 814 | Missoula:us_city 815 | United Kingdom:country 816 | Salina:us_city 817 | Fort-de-France:country_capital 818 | Korea:country 819 | Muncie:us_city 820 | Santa Fe:us_city 821 | Cleveland:us_city 822 | Saint Vincent:country 823 | Grenadines:country 824 | Vicksburg:us_city 825 | Sudan:country 826 | Sheridan:us_city 827 | Jeffersontown:us_city 828 | Peoria:us_city 829 | Martinsburg:us_city 830 | Saint Paul:us_city 831 | Wake Island:country 832 | Belmopan:country_capital 833 | Baltimore:us_city 834 | Uganda:country 835 | Newport News:us_city 836 | Riyadh:country_capital 837 | Bangkok:country_capital 838 | Tajikistan:country 839 | Sao Tome:country 840 | Bern:country_capital 841 | New Bedford:us_city 842 | LaFayette:us_city 843 | 
Moroni:country_capital 844 | Syria:country 845 | Hammond:us_city 846 | Camden:us_city 847 | Orem:us_city 848 | Allentown:us_city 849 | Accra:country_capital 850 | Council Bluffs:us_city 851 | Palmyra Atoll:country 852 | Lynn:us_city 853 | Coon Rapids:us_city 854 | Fiji:country 855 | Santo Domingo:country_capital 856 | Bratislava:country_capital 857 | Nouakchott:country_capital 858 | Malawi:country 859 | Londonderry:us_city 860 | Derry:us_city 861 | Burnsville:us_city 862 | Fairbanks:us_city 863 | Mongolia:country 864 | Funafuti:country_capital 865 | Warsaw:country_capital 866 | North Carolina:us_state 867 | Azerbaijan:country 868 | Southaven:us_city 869 | Caracas:country_capital 870 | Maputo:country_capital 871 | North Platte:us_city 872 | Cote d'Ivoire:country 873 | Honolulu:us_city 874 | Kailua:us_city 875 | Rhode Island:us_state 876 | Boulder City:us_city 877 | Ireland:country 878 | Seoul:country_capital 879 | Elgin:us_city 880 | Gresham:us_city 881 | Arizona:us_state 882 | George Town:country_capital 883 | Bahamas:country 884 | Riga:country_capital 885 | Massachusetts:us_state 886 | Corpus Christi:us_city 887 | Romania:country 888 | Savannah:us_city 889 | Oregon:us_state 890 | Rock Springs:us_city 891 | Dothan:us_city 892 | Beckley:us_city 893 | American Samoa:country 894 | Tripoli:country_capital 895 | Madison:us_city 896 | Iowa:us_state 897 | North Providence:us_city 898 | Tokelau:country 899 | Juneau:us_city 900 | Austin:us_city 901 | Seattle:us_city 902 | Hialeah:us_city 903 | Kabul:country_capital 904 | Niger:country 905 | Noumea:country_capital 906 | Shreveport:us_city 907 | Mamoutzou:country_capital 908 | Everett:us_city 909 | Olathe:us_city 910 | Andorra la Vella:country_capital 911 | Indian Ocean:country 912 | Bangui:country_capital 913 | Joliet:us_city 914 | Cranston:us_city 915 | Raleigh:us_city 916 | Morgantown:us_city 917 | Kent:us_city 918 | Jan Mayen:country 919 | Spain:country 920 | Mauritius:country 921 | Port-Vila:country_capital 922 | 
Hutchinson:us_city 923 | Mississippi:us_state 924 | Lithuania:country 925 | Rwanda:country 926 | Kigali:country_capital 927 | Estonia:country 928 | Merrimack:us_city 929 | Sparks:us_city 930 | Virgin Islands:country 931 | Vilnius:country_capital 932 | Long Beach:us_city 933 | Willemstad:country_capital 934 | Cairo:country_capital 935 | Poland:country 936 | Kenosha:us_city 937 | Apia:country_capital 938 | Monaco:country_capital 939 | Casper:us_city 940 | Roswell:us_city 941 | Eagan:us_city 942 | Hopkinsville:us_city 943 | Yugoslavia:country 944 | 945 | -------------------------------------------------------------------------------- /prefixnames.txt: -------------------------------------------------------------------------------- 1 | Dr 2 | Premier 3 | Major 4 | Corporal 5 | King 6 | General 7 | Ms 8 | Gen 9 | Mrs 10 | Sen 11 | Mr 12 | Doctor 13 | St 14 | Prince 15 | Representative 16 | Maj 17 | President 18 | Congressman 19 | Vice 20 | Lt 21 | Senator 22 | -------------------------------------------------------------------------------- /product_names.txt: -------------------------------------------------------------------------------- 1 | Java 2 | IntelliJ 3 | Coke 4 | Coca Cola 5 | Pepsi 6 | Diet Pepsi 7 | Fanta Orange 8 | Dr Pepper 9 | Alka Seltzer 10 | Avanta 11 | Levitra 12 | Mac 13 | Macintosh 14 | iPhone 15 | Jeep 16 | Jeep Cherokee 17 | Ford Thunderbird 18 | Pontiac 19 | Winnebago 20 | Volkswagen 21 | Hula Hoops 22 | Pringles 23 | Ruffles 24 | Kit Kat 25 | Oreo 26 | Eskimo Pie 27 | VAIO 28 | Handycam 29 | Cyber Shot 30 | DigiMax 31 | Ford 32 | Chevie 33 | -------------------------------------------------------------------------------- /src/.properties: -------------------------------------------------------------------------------- 1 | { 2 | #format : #tonel 3 | } -------------------------------------------------------------------------------- /src/KBSnlp.st: -------------------------------------------------------------------------------- 1 | PackageManifest 
subclass: #ManifestKBSnlp instanceVariableNames: '' classVariableNames: '' poolDictionaries: '' category: 'KBSnlp'! !ManifestKBSnlp commentStamp: 'MarkWatson 5/19/2017 06:24' prior: 0! Copyright 2005-2017 Mark Watson. All rights reserved. Licensed for use under the MIT license with attribution required. See: https://github.com/mark-watson/nlp_smalltalk ! "-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- "! ManifestKBSnlp class instanceVariableNames: ''! !ManifestKBSnlp class methodsFor: 'code-critics' stamp: 'MarkWatson 5/14/2017 21:23'! ruleRBOnlyReadOrWrittenTemporaryRuleV1FalsePositive ^ #(#(#(#RGMethodDefinition #(#'NLPsummarizer class' #summarize: #true)) #'2017-05-14T21:23:23.063039-07:00') )! ! !ManifestKBSnlp class methodsFor: 'code-critics' stamp: 'MarkWatson 5/14/2017 21:25'! ruleRBToDoCollectRuleV1FalsePositive ^ #(#(#(#RGMethodDefinition #(#'NLPsummarizer class' #summarize: #true)) #'2017-05-14T21:25:54.536453-07:00') )! ! Object subclass: #NLPcategories instanceVariableNames: '' classVariableNames: '' poolDictionaries: '' category: 'KBSnlp'! !NLPcategories commentStamp: 'MarkWatson 5/19/2017 06:25' prior: 0! A NLPcategories is class to categorize text. Copyright 2005-2017 Mark Watson. All rights reserved. Licensed for use under the MIT license with attribution required. See: https://github.com/mark-watson/nlp_smalltalk ! "-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- "! NLPcategories class instanceVariableNames: ''! !NLPcategories class methodsFor: 'classify' stamp: 'MarkWatson 1/13/2015 14:09'! classify: text "classify text in a string" | tokens categories scores num hash numTokens results cutoff | tokens := NLPtagger tokenize: (text , 'XXXXXX'). categories := (Smalltalk at: #NlpCategoryHash) keys. num := categories size. numTokens := tokens size - 1. scores := Array new: num. 1 to: num do: [ :i | scores at: i put: 0. hash := (Smalltalk at: #NlpCategoryHash) at: (categories at: i). 
1 to: numTokens do: [ :j | (hash includesKey: (tokens at: j)) ifTrue: [scores at: i put: ((scores at: i) + (hash at: (tokens at: j)))] ]. hash := (Smalltalk at: #NlpCategory2gramHash) at: (categories at: i). 1 to: numTokens do: [ :j | (hash includesKey: ((tokens at: j) , ' ' , (tokens at: j + 1))) ifTrue: [scores at: i put: ((scores at: i)+ ((hash at: (tokens at: j) , ' ' , (tokens at: j + 1)) * 8))]]]. results := SortedCollection sortBlock: [:c1 :c2 | (c1 at:1) > (c2 at:1)]. 1 to: num do: [ :i | |a| a := (Array new: 2). a at: 1 put: (scores at:i); at: 2 put: (categories at: i). results add: a ]. cutoff := ((results at: 1) at: 1) / 2. results := results select: [ :x | (x at: 1) > cutoff ]. ^results. ! ! !NLPcategories class methodsFor: 'classify' stamp: 'MarkWatson 1/13/2015 13:59'! initializeCategoryHash "requires NeoJSON" Smalltalk at: #NlpCategoryHash put: (NeoJSONReader fromString: (FileStream fileNamed: './nlp_smalltalk/tags.json') contentsOfEntireFile). Smalltalk at: #NlpCategory2gramHash put: (NeoJSONReader fromString: (FileStream fileNamed: './nlp_smalltalk/tags_2gram.json') contentsOfEntireFile)! ! Object subclass: #NLPentities instanceVariableNames: '' classVariableNames: '' poolDictionaries: '' category: 'KBSnlp'! !NLPentities commentStamp: 'MarkWatson 5/19/2017 06:25' prior: 0! A NLPentities is a class to find people's names, company names, place names, etc. in text. Copyright 2005-2017 Mark Watson. All rights reserved. Licensed for use under the MIT license with attribution required. See: https://github.com/mark-watson/nlp_smalltalk ! "-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- "! NLPentities class instanceVariableNames: ''! !NLPentities class methodsFor: 'entityDetection' stamp: 'MarkWatson 1/13/2015 10:32'! entities: aString "return a Dictionary of entities (keys type, values Sets" | temp result | result := Dictionary new. temp := NLPentities entityHelper: (Smalltalk at: #NLPcompanyNames) text: aString. 
temp size > 0 ifTrue: [ result at: 'companies' put: temp ]. temp := NLPentities entityHelper: (Smalltalk at: #NLPproductNames) text: aString. temp size > 0 ifTrue: [ result at: 'products' put: temp ]. temp := NLPentities entityHelper: (Smalltalk at: #NLPplaceNames) text: aString. temp size > 0 ifTrue: [ result at: 'places' put: temp ]. temp := NLPentities humanNameHelper: aString. " fix: store person names under 'people'; the original reused the 'places' key and overwrote the place-name results " temp size > 0 ifTrue: [ result at: 'people' put: temp ]. ^ result! ! !NLPentities class methodsFor: 'entityDetection' stamp: 'MarkWatson 1/13/2015 10:32'! humanNameHelper: aString "this is a helper method for detecting person names only (entityHelper:text: handles everything **but** person names)" | tokens num results | results := Set new. tokens := NLPtagger tokenize: aString , ' xxxxx yyyyy zzzzz'. num := tokens size - 3. " account for the 3 fake tokens at the end " " NOTE(review): reassigning i inside the to:do: block does not skip loop iterations in Smalltalk (modern Pharo rejects stores into block arguments entirely), so shorter overlapping names may also be collected - confirm whether a whileTrue: loop was intended " 1 to: num do: [ :i | ((Smalltalk at: #NLPfirstNames) includes: (tokens at: i)) ifTrue: [ (((Smalltalk at: #NLPfirstNames) includes: (tokens at: i + 1)) and: ((Smalltalk at: #NLPlastNames) includes: (tokens at: i + 2))) ifTrue: [ results add: (tokens at: i) , ' ' , (tokens at: i + 1) , ' ' , (tokens at: i + 2). i := i + 2 ] ifFalse: [ ((Smalltalk at: #NLPlastNames) includes: (tokens at: i + 1)) ifTrue: [ results add: (tokens at: i) , ' ' , (tokens at: i + 1). i := i + 1 ] ] ] ]. ^ results! ! !NLPentities class methodsFor: 'entityDetection' stamp: 'MarkWatson 1/13/2015 10:33'! entityHelper: entitySet text: aString "this is a helper method for everything **but** person names" | tokens num ngram2 ngram3 results | results := Set new. tokens := NLPtagger tokenize: aString , ' xxxxx yyyyy zzzzz'. num := tokens size - 3. " account for the 3 fake tokens at the end " 1 to: num do: [ :i | ngram2 := (tokens at: i) , ' ' , (tokens at: i + 1). ngram3 := ngram2 , ' ' , (tokens at: i + 2). "Transcript show: ngram2; cr."
(entitySet includes: ngram3) ifTrue: [ results add: ngram3 ] ifFalse: [ (entitySet includes: ngram2) ifTrue: [ results add: ngram2 ] ifFalse: [ (entitySet includes: (tokens at: i)) ifTrue: [ results add: (tokens at: i) ] ] ] ]. ^ results! ! !NLPentities class methodsFor: 'entityDetection' stamp: 'MarkWatson 1/12/2015 15:43'! fileToDictionary: filePath "Read the named data file (one entry per line; any text after a ':' such as Cairo:country_capital is stripped) and answer a Set of the entries" | read count aLine strm set | Transcript show: 'Processing file ' , filePath; cr. set := Set new. read := (MultiByteFileStream fileNamed: filePath) readOnly. count := 0. [read atEnd] whileFalse: [count := count + 1. aLine := read upTo: Character lf. "Mac: use lf, Windows: use cr ???" "look for a space character: " ((aLine indexOf: $:) > 0) ifTrue: [ strm := ReadStream on: aLine. aLine := strm upTo: $:]. set add: aLine]. read close. ^set ! ! !NLPentities class methodsFor: 'entityDetection' stamp: 'MarkWatson 5/16/2017 20:36'! initializeEntities "load entity name data" " Note: place name lines of the form: Cairo:country_capital Fixed in fileToDictionary " Smalltalk at: #NLPcompanyNames put: (NLPentities fileToDictionary: './nlp_smalltalk/company_names.txt'). Smalltalk at: #NLPfirstNames put: (NLPentities fileToDictionary: './nlp_smalltalk/firstnames.txt'). Smalltalk at: #NLPlastNames put: (NLPentities fileToDictionary: './nlp_smalltalk/lastnames.txt'). Smalltalk at: #NLPhonorifics put: (NLPentities fileToDictionary: './nlp_smalltalk/honorifics.txt'). Smalltalk at: #NLPprefixNames put: (NLPentities fileToDictionary: './nlp_smalltalk/prefixnames.txt'). Smalltalk at: #NLPplaceNames put: (NLPentities fileToDictionary: './nlp_smalltalk/placenames.txt'). Smalltalk at: #NLPproductNames put: (NLPentities fileToDictionary: './nlp_smalltalk/product_names.txt'). " also read in data we will need for sentence segmentation: " Smalltalk at: #NLPtokensWithPeriods put: (NLPentities fileToDictionary: './nlp_smalltalk/tokensWithPeriods.txt'). " fix: the repository ships tokensWithPeriods.txt; 'tokens_with_periods.txt' does not exist "! !
Object subclass: #NLPsentences instanceVariableNames: '' classVariableNames: '' poolDictionaries: '' category: 'KBSnlp'! !NLPsentences commentStamp: 'MarkWatson 5/19/2017 06:26' prior: 0! A class to segment text into sentences. Copyright 2005-2017 Mark Watson. All rights reserved. Licensed for use under the MIT license with attribution required. See: https://github.com/mark-watson/nlp_smalltalk ! "-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- "! NLPsentences class instanceVariableNames: ''! !NLPsentences class methodsFor: 'initialize' stamp: 'MarkWatson 5/18/2017 16:42'! loadData "Load tokens that normally contain periods" | aSet count reverseDictionary forwardDictionary | count := 0. reverseDictionary := Dictionary new. forwardDictionary := Dictionary new. aSet := NLPsentences fileToSet: './nlp_smalltalk/tokensWithPeriods.txt'. Smalltalk at: #NLPtokensWithPeriods put: aSet. ^ 'tokens with periods data loaded'! ! !NLPsentences class methodsFor: 'segment' stamp: 'MarkWatson 5/18/2017 16:33'! sentences: someText "tokenize a string into individual sentences" | tokens aSet lastToken currentSentence allSentences | aSet := Smalltalk at: #NLPtokensWithPeriods. tokens := OrderedCollection new. (NLPsentences tokenizeLeavePeriods: someText) do: [ :token | (token includesSubstring: '.') not ifTrue: [ tokens add: token ] ifFalse: [ (aSet includes: token) ifFalse: [ tokens add: (token copyWithRegex: '\.' matchesReplacedWith: ''). tokens add: '.' ] ifTrue: [ tokens add: token ] ] ]. currentSentence := OrderedCollection new. allSentences := OrderedCollection new. lastToken := ''. Transcript show: tokens; cr. tokens do: [ :token | Transcript show: token; cr. currentSentence add: token. ((token = '.' and: lastToken isAllDigits not) or: token = '?') ifTrue: [ allSentences addLast: currentSentence. currentSentence := OrderedCollection new ]. lastToken := token ]. currentSentence isNotEmpty ifTrue: [ allSentences addLast: currentSentence ]. ^ allSentences! ! 
!NLPsentences class methodsFor: 'utiities' stamp: 'MarkWatson 5/18/2017 13:42'! fileToSet: filePath "Read file, create Set with elements being each line in file" | read aLine set | Transcript show: 'Processing file ' , filePath; cr. set := Set new. read := (MultiByteFileStream fileNamed: filePath) readOnly. [ read atEnd ] whileFalse: [ aLine := read upTo: Character lf. "Mac: use lf, Windows: use cr ???" set add: aLine ]. read close. ^ set! ! !NLPsentences class methodsFor: 'utiities' stamp: 'MarkWatson 5/18/2017 15:31'! tokenizeLeavePeriods: wordsInAString "tokenizes a string" ^ wordsInAString findTokens: ' ;:,<>[]{}!! @#$%^&*()?' keep: ';:.,<>[]{}!!$?' " keep CR in this string!!!! "! ! Object subclass: #NLPsummarizer instanceVariableNames: '' classVariableNames: '' poolDictionaries: '' category: 'KBSnlp'! !NLPsummarizer commentStamp: 'MarkWatson 5/19/2017 06:26' prior: 0! A class to classify English text into categories. Copyright 2005-2017 Mark Watson. All rights reserved. Licensed for use under the MIT license with attribution required. See: https://github.com/mark-watson/nlp_smalltalk ! "-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- "! NLPsummarizer class instanceVariableNames: ''! !NLPsummarizer class methodsFor: 'summary' stamp: 'MarkWatson 5/15/2017 09:34'! summarize: text "extractive summarizer" | sentences sentenceScores tokens scoredCategories hash x bestIndices | scoredCategories := NLPcategories classify: text. sentences := NLPtagger sentences: text. sentenceScores := Array new: sentences size. 1 to: sentences size do: [ :i | sentenceScores at: i put: 0. tokens := sentences at: i. Transcript show: (sentences at: i); cr. scoredCategories do: [ :sc | hash := (Smalltalk at: #NlpCategoryHash) at: (sc at: 2). tokens do: [ :token | (hash includesKey: token) ifTrue: [ x := hash at: token. sentenceScores at: i put: (sentenceScores at: i) + (sc at: 1) ] ] ] ]. bestIndices := sentenceScores collectWithIndex: [ :score :i | {score. i} ]. 
Transcript show: 'sentence scoring: '; show: bestIndices; cr. bestIndices := bestIndices select: [ :p | (p at: 1) > 2 ]. ^ bestIndices collect: [ :p | Character space join: (sentences at: (p at: 2)) ]! ! Object subclass: #NLPtagger instanceVariableNames: '' classVariableNames: 'NLPlexicon' poolDictionaries: '' category: 'KBSnlp'! !NLPtagger commentStamp: 'MarkWatson 5/19/2017 06:27' prior: 0! NLP tagger converted to Squeak. A class that implements an NLP tagger. Copyright 2005-2017 Mark Watson. All rights reserved. Licensed for use under the MIT license with attribution required. See: https://github.com/mark-watson/nlp_smalltalk ! "-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- "! NLPtagger class instanceVariableNames: ''! !NLPtagger class methodsFor: 'segmentation' stamp: 'MarkWatson 5/16/2017 20:43'! sentences: data "Handle either a string or array of tokens. Limitations: 1. This code does not currently handle special characters like — 2. Periods in numbers: only check previous character, not the next so a sentence ending with e.g., 2. will not be handled correctly. " | tokens lastToken currentSentence allSentences token | tokens := (data isMemberOf: ByteString) ifTrue: [ NLPtagger tokenize: data ] ifFalse: [ data ]. " fix: branches are now blocks; with plain parenthesized arguments both branches were evaluated eagerly, so tokenize: was also sent to already-tokenized input " currentSentence := OrderedCollection new. allSentences := OrderedCollection new. lastToken := ''. Transcript show: tokens; cr. tokens do: [ :token1 | " fix: the original sent #get: to the #NLPtokensWithPeriods Set, which is not a Set message (runtime doesNotUnderstand); a Set member is equal to the probe anyway, so the token is kept as-is - NOTE(review): if a Dictionary mapping (e.g. 'Dr.' -> 'Dr') was intended, confirm against loadData " token := token1. Transcript show: token; cr. currentSentence add: token. ((token = '.' and: lastToken isAllDigits not) or: token = '?') ifTrue: [ allSentences addLast: currentSentence. currentSentence := OrderedCollection new ]. lastToken := token ]. currentSentence isNotEmpty ifTrue: [ allSentences addLast: currentSentence ]. ^ allSentences! ! !NLPtagger class methodsFor: 'tagging' stamp: 'MW 1/27/2008 12:53'!
tag: words "tag an ordered collection of words, returning an ordered collection of corresponding tags" | lex tags tag count i word lastWord lastTag | tags := OrderedCollection new. lex := Smalltalk at: #NLPlexicon. words do: [:aWord | tag := lex at: aWord ifAbsent: [nil]. tag isNil ifFalse: [tag := tag at: 1] ifTrue: [tag := 'NN']. " the default tag " tags add: tag]. " Apply transformation rules: " lastWord := ''. lastTag := ''. i := 0. count := words size. [i < count] whileTrue: [i := i + 1. word := words at: i. tag := tags at: i. " reuse tag variable " " First, handle all rules for i > 1 " i > 1 ifTrue: [" rule 1: DT, {VBD | VBP} --> DT, NN " lastTag = 'DT' & (tag = 'VBD' | (tag = 'VBP') | (tag = 'VB')) ifTrue: [tags at: i put: 'NN']. tag size > 1 ifTrue: [" rule 6: convert a noun to a verb if the preceeding work is 'would' " (tag at: 1) = $N & ((tag at: 2) = $N) & (lastWord asLowercase = 'would') ifTrue: [tags at: i put: 'VB']]]. " Now, handle the remaining rules that are valid for i = 1: " " rule 2: convert a noun to a number (CD) if '.' appears in the word" (word findString: '.') > 0 ifTrue: [(tag at: 1) = $N ifTrue: [tags at: i put: 'CD']]. " not working - tokenizer tosses '.' characters " " rule 3: convert a noun to a past participle if words[i] ends with 'ed' " (tag at: 1) = $N & (word endsWith: 'ed') ifTrue: [tags at: i put: 'VBN']. " rule 4: convert any type to adverb if it ends in 'ly' " (word endsWith: 'ly') ifTrue: [tags at: i put: 'RB']. " rule 5: convert a common noun (NN or NNS) to a adjective if it ends with 'al' " (tag at: 1) = $N & (word endsWith: 'al') ifTrue: [tags at: i put: 'JJ']. " rule 7: if a word has been categorized as a common noun and it ends with 's;, " " then set its type to plural common noun (NNS) " tag = 'NN' & (word endsWith: 's') ifTrue: [tags at: i put: 'NNS']. " rule 8: convert a common noun to a present prticiple verb (i.e., a gerand) " (tag at: 1) = $N & (word endsWith: 'ing') ifTrue: [tags at: i put: 'VBG']. 
lastWord := word. lastTag := tag]. ^tags! ! !NLPtagger class methodsFor: 'tagging' stamp: 'MW 1/27/2008 13:21'! pptag: wordString "returns a string of word/tag ..." | words tags write size count | words := NLPtagger tokenize: wordString. tags := NLPtagger tag: words. write := TextStream on: String new. size := words size. count := 1. [count <= size] whileTrue: [ write nextPutAll: (words at: count). write nextPutAll: '/'. write nextPutAll: (tags at: count). write nextPutAll: ' '. count := count + 1]. ^write contents string! ! !NLPtagger class methodsFor: 'tagging' stamp: 'MarkWatson 5/18/2017 16:37'! initializeLexicon "Read data/lexicon.txt and build in memory lexicon" | read count strm aLine word taglist token lex | lex := Dictionary new. read := (FileStream fileNamed: './nlp_smalltalk/lexicon.txt') readOnly. count := 0. [ read atEnd ] whileFalse: [ count := count + 1. aLine := read upTo: Character lf. "Mac: use lf, Windows: use cr ???" strm := ReadStream on: aLine. word := strm upTo: Character space. taglist := OrderedCollection new. [ strm atEnd ] whileFalse: [ token := strm upTo: Character space. taglist add: token ]. "Transcript show: word; cr." "Transcript show: taglist printString; cr." lex at: word put: taglist ]. read close. Smalltalk at: #NLPlexicon put: lex! ! !NLPtagger class methodsFor: 'tokenization' stamp: 'MarkWatson 5/15/2017 10:11'! tokenize: wordsInAString "tokenizes a string" ^ wordsInAString findTokens: ' ;:.,<>[]{}!! @#$%^&*()?' keep: ';:.,<>[]{}!!$?' " keep CR in this string!!!! "! ! -------------------------------------------------------------------------------- /src/KBSnlp/ManifestKBSnlp.class.st: -------------------------------------------------------------------------------- 1 | " 2 | Copyright 2005-2017 Mark Watson. All rights reserved. Licensed for use under the MIT license with attribution required. 
3 | 4 | See: https://github.com/mark-watson/nlp_smalltalk 5 | 6 | " 7 | Class { 8 | #name : #ManifestKBSnlp, 9 | #superclass : #PackageManifest, 10 | #category : #KBSnlp 11 | } 12 | 13 | { #category : #'code-critics' } 14 | ManifestKBSnlp class >> ruleRBOnlyReadOrWrittenTemporaryRuleV1FalsePositive [ 15 | ^ #(#(#(#RGMethodDefinition #(#'NLPsummarizer class' #summarize: #true)) #'2017-05-14T21:23:23.063039-07:00') ) 16 | ] 17 | 18 | { #category : #'code-critics' } 19 | ManifestKBSnlp class >> ruleRBToDoCollectRuleV1FalsePositive [ 20 | ^ #(#(#(#RGMethodDefinition #(#'NLPsummarizer class' #summarize: #true)) #'2017-05-14T21:25:54.536453-07:00') ) 21 | ] 22 | -------------------------------------------------------------------------------- /src/KBSnlp/NLPcategories.class.st: -------------------------------------------------------------------------------- 1 | " 2 | A NLPcategories is class to categorize text. 3 | 4 | Copyright 2005-2017 Mark Watson. All rights reserved. Licensed for use under the MIT license with attribution required. 5 | 6 | See: https://github.com/mark-watson/nlp_smalltalk 7 | 8 | " 9 | Class { 10 | #name : #NLPcategories, 11 | #superclass : #Object, 12 | #category : #KBSnlp 13 | } 14 | 15 | { #category : #classify } 16 | NLPcategories class >> classify: text [ 17 | "classify text in a string" 18 | 19 | | tokens categories scores num hash numTokens results cutoff | 20 | tokens := NLPtagger tokenize: (text , 'XXXXXX'). 21 | categories := (Smalltalk at: #NlpCategoryHash) keys. 22 | num := categories size. 23 | numTokens := tokens size - 1. 24 | scores := Array new: num. 25 | 1 to: num do: [ :i | 26 | scores at: i put: 0. 27 | hash := (Smalltalk at: #NlpCategoryHash) at: (categories at: i). 28 | 1 to: numTokens do: [ :j | 29 | (hash includesKey: (tokens at: j)) 30 | ifTrue: [scores at: i put: ((scores at: i) + (hash at: (tokens at: j)))] ]. 31 | hash := (Smalltalk at: #NlpCategory2gramHash) at: (categories at: i). 
32 | 1 to: numTokens do: [ :j | 33 | (hash includesKey: ((tokens at: j) , ' ' , (tokens at: j + 1))) 34 | ifTrue: [scores at: i put: ((scores at: i)+ ((hash at: (tokens at: j) , ' ' , (tokens at: j + 1)) * 8))]]]. 35 | results := SortedCollection sortBlock: [:c1 :c2 | (c1 at:1) > (c2 at:1)]. 36 | 1 to: num do: [ :i | |a| a := (Array new: 2). a at: 1 put: (scores at:i); at: 2 put: (categories at: i). results add: a ]. 37 | cutoff := ((results at: 1) at: 1) / 2. 38 | results := results select: [ :x | (x at: 1) > cutoff ]. 39 | ^results. 40 | 41 | ] 42 | 43 | { #category : #classify } 44 | NLPcategories class >> initializeCategoryHash [ 45 | "requires NeoJSON" 46 | | aDir | 47 | aDir := FileSystem disk workingDirectory. 48 | Smalltalk at: #NlpCategoryHash 49 | put: (NeoJSONReader fromString: ((aDir / './pharo-local/iceberg/mark-watson/nlp_smalltalk/tags.json') readStream) contents). 50 | Smalltalk at: #NlpCategory2gramHash 51 | put: (NeoJSONReader fromString: ((aDir / './pharo-local/iceberg/mark-watson/nlp_smalltalk/tags_2gram.json') readStream) contents) 52 | ] 53 | -------------------------------------------------------------------------------- /src/KBSnlp/NLPentities.class.st: -------------------------------------------------------------------------------- 1 | " 2 | A NLPentities is a class to find people's names, company names, place names, etc. in text. 3 | 4 | Copyright 2005-2017 Mark Watson. All rights reserved. Licensed for use under the MIT license with attribution required. 5 | 6 | See: https://github.com/mark-watson/nlp_smalltalk 7 | 8 | " 9 | Class { 10 | #name : #NLPentities, 11 | #superclass : #Object, 12 | #category : #KBSnlp 13 | } 14 | 15 | { #category : #entityDetection } 16 | NLPentities class >> entities: aString [ 17 | "return a Dictionary of entities (keys type, values Sets" 18 | 19 | | temp result | 20 | result := Dictionary new. 21 | temp := NLPentities entityHelper: (Smalltalk at: #NLPcompanyNames) text: aString. 
22 | temp size > 0 23 | ifTrue: [ result at: 'companies' put: temp ]. 24 | temp := NLPentities entityHelper: (Smalltalk at: #NLPproductNames) text: aString. 25 | temp size > 0 26 | ifTrue: [ result at: 'products' put: temp ]. 27 | temp := NLPentities entityHelper: (Smalltalk at: #NLPplaceNames) text: aString. 28 | temp size > 0 29 | ifTrue: [ result at: 'places' put: temp ]. 30 | temp := NLPentities humanNameHelper: aString. 31 | temp size > 0 32 | ifTrue: [ result at: 'people' put: temp ]. " fix: was 'places', which overwrote the place-name results " 33 | ^ result 34 | ] 35 | 36 | { #category : #entityDetection } 37 | NLPentities class >> entityHelper: entitySet text: aString [ 38 | "this is a helper method for everything **but** person names" 39 | 40 | | tokens num ngram2 ngram3 results | 41 | results := Set new. 42 | tokens := NLPtagger tokenize: aString , ' xxxxx yyyyy zzzzz'. 43 | num := tokens size - 3. " account for the 3 fake tokens at the end " 44 | 1 to: num do: [ :i | 45 | ngram2 := (tokens at: i) , ' ' , (tokens at: i + 1). 46 | ngram3 := ngram2 , ' ' , (tokens at: i + 2). "Transcript show: ngram2; cr." 47 | (entitySet includes: ngram3) 48 | ifTrue: [ results add: ngram3 ] 49 | ifFalse: [ 50 | (entitySet includes: ngram2) 51 | ifTrue: [ results add: ngram2 ] 52 | ifFalse: [ 53 | (entitySet includes: (tokens at: i)) 54 | ifTrue: [ results add: (tokens at: i) ] ] ] ]. 55 | ^ results 56 | ] 57 | 58 | { #category : #entityDetection } 59 | NLPentities class >> fileToDictionary: filePath [ 60 | 61 | "Read the named data file (one entry per line; text after a ':' is stripped) and answer a Set of the entries" 62 | 63 | | aDir read count aLine strm set | 64 | 65 | Transcript show: 'Processing file ' , filePath; cr. 66 | 67 | set := Set new. 68 | aDir := FileSystem disk workingDirectory. 69 | "read := (MultiByteFileStream fileNamed: filePath) readOnly." 70 | read := (aDir / filePath) readStream. 71 | "read := (ZnCharacterReadStream on: read2 encoding: #utf8)." 72 | 73 | count := 0. 74 | [read atEnd] 75 | whileFalse: [count := count + 1. 76 | aLine := read upTo: Character lf.
"Mac: use lf, Windows: use cr ???" 77 | "look for a space character: " 78 | ((aLine indexOf: $:) > 0) 79 | ifTrue: [ 80 | strm := ReadStream on: aLine. 81 | aLine := strm upTo: $:]. 82 | set add: aLine]. 83 | read close. 84 | ^set 85 | 86 | ] 87 | 88 | { #category : #entityDetection } 89 | NLPentities class >> humanNameHelper: aString [ 90 | "this is a helper method for detecting person names only (entityHelper:text: handles everything **but** person names)" 91 | 92 | | tokens num results | 93 | results := Set new. 94 | tokens := NLPtagger tokenize: aString , ' xxxxx yyyyy zzzzz'. 95 | num := tokens size - 3. " account for the 3 fake tokens at the end " 96 | 1 to: num do: [ :i | 97 | ((Smalltalk at: #NLPfirstNames) includes: (tokens at: i)) 98 | ifTrue: [ 99 | (((Smalltalk at: #NLPfirstNames) includes: (tokens at: i + 1)) 100 | and: ((Smalltalk at: #NLPlastNames) includes: (tokens at: i + 2))) 101 | ifTrue: [ 102 | results add: (tokens at: i) , ' ' , (tokens at: i + 1) , ' ' , (tokens at: i + 2). 103 | i := i + 2 ] 104 | ifFalse: [ 105 | ((Smalltalk at: #NLPlastNames) includes: (tokens at: i + 1)) 106 | ifTrue: [ 107 | results add: (tokens at: i) , ' ' , (tokens at: i + 1). 108 | i := i + 1 ] ] ] ]. 109 | ^ results 110 | ] 111 | 112 | { #category : #entityDetection } 113 | NLPentities class >> initializeEntities [ 114 | "load entity name data" 115 | 116 | " Note: place name lines of the form: Cairo:country_capital Fixed in fileToDictionary " 117 | | repo path | 118 | repo := IceRepository registeredRepositoryIncludingPackage: (self class) package. "NOTE(review): self is already a class here, so (self class) is the metaclass - confirm whether 'self package' was intended" 119 | path := (repo location) asString . 120 | 121 | Smalltalk 122 | at: #NLPcompanyNames 123 | put: (NLPentities fileToDictionary: path , '/company_names.txt'). 124 | Smalltalk 125 | at: #NLPfirstNames 126 | put: (NLPentities fileToDictionary: path , '/firstnames.txt'). " fix: was './firstnames.txt', which built the malformed path '...nlp_smalltalk./firstnames.txt' " 127 | Smalltalk 128 | at: #NLPlastNames 129 | put: (NLPentities fileToDictionary: path , '/lastnames.txt'). " fix: was './lastnames.txt', same malformed-path problem "
130 | Smalltalk 131 | at: #NLPhonorifics 132 | put: (NLPentities fileToDictionary: path , '/honorifics.txt'). 133 | Smalltalk 134 | at: #NLPprefixNames 135 | put: (NLPentities fileToDictionary: path , '/prefixnames.txt'). " fix: the 'path ,' prefix was missing, unlike every sibling line " 136 | Smalltalk 137 | at: #NLPplaceNames 138 | put: (NLPentities fileToDictionary: path , '/placenames.txt'). " fix: the 'path ,' prefix was missing " 139 | Smalltalk 140 | at: #NLPproductNames 141 | put: (NLPentities fileToDictionary: path , '/product_names.txt'). 142 | 143 | " also read in data we will need for sentence segmentation: " 144 | Smalltalk 145 | at: #NLPtokensWithPeriods 146 | put: (NLPentities fileToDictionary: path , '/tokensWithPeriods.txt'). 147 | ] 148 | -------------------------------------------------------------------------------- /src/KBSnlp/NLPsentences.class.st: -------------------------------------------------------------------------------- 1 | " 2 | A class to segment text into sentences. 3 | 4 | Copyright 2005-2017 Mark Watson. All rights reserved. Licensed for use under the MIT license with attribution required. 5 | 6 | See: https://github.com/mark-watson/nlp_smalltalk 7 | 8 | " 9 | Class { 10 | #name : #NLPsentences, 11 | #superclass : #Object, 12 | #category : #KBSnlp 13 | } 14 | 15 | { #category : #utiities } 16 | NLPsentences class >> fileToSet: filePath [ 17 | "Read file, create Set with elements being each line in file" 18 | 19 | | read aLine set aDir | 20 | Transcript 21 | show: 'Processing file ' , filePath; 22 | cr. 23 | set := Set new. 24 | "read := (MultiByteFileStream fileNamed: filePath) readOnly." 25 | aDir := FileSystem disk workingDirectory. 26 | read := (aDir / filePath) readStream. " fix: was hard-coded to .../nlp_smalltalk/lexicon.txt, ignoring the filePath argument entirely " 27 | 28 | [ read atEnd ] 29 | whileFalse: [ aLine := read upTo: Character lf. "Mac: use lf, Windows: use cr ???" 30 | set add: aLine ]. 31 | read close.
32 | ^ set 33 | ] 34 | 35 | { #category : #initialize } 36 | NLPsentences class >> loadData [ 37 | "Load tokens that normally contain periods into the global #NLPtokensWithPeriods; answers a status string" 38 | 39 | | aSet count reverseDictionary forwardDictionary | 40 | count := 0. "NOTE(review): count, reverseDictionary and forwardDictionary are assigned but never used below — apparently dead code" 41 | reverseDictionary := Dictionary new. 42 | forwardDictionary := Dictionary new. 43 | aSet := NLPsentences fileToSet: './pharo-local/iceberg/mark-watson/nlp_smalltalk/tokensWithPeriods.txt'. 44 | Smalltalk at: #NLPtokensWithPeriods put: aSet. 45 | ^ 'tokens with periods data loaded' 46 | ] 47 | 48 | { #category : #segment } 49 | NLPsentences class >> sentences: someText [ 50 | "tokenize a string into individual sentences; answers an OrderedCollection of sentences, each itself an OrderedCollection of token strings. Requires #NLPtokensWithPeriods to be loaded first (see loadData)" 51 | 52 | | tokens aSet lastToken currentSentence allSentences | 53 | aSet := Smalltalk at: #NLPtokensWithPeriods. 54 | tokens := OrderedCollection new. 55 | (NLPsentences tokenizeLeavePeriods: someText) 56 | do: [ :token | 57 | (token includesSubstring: '.') not 58 | ifTrue: [ tokens add: token ] 59 | ifFalse: [ (aSet includes: token) 60 | ifFalse: [ tokens add: (token copyWithRegex: '\.' matchesReplacedWith: ''). "split the period out into its own '.' token" 61 | tokens add: '.' ] 62 | ifTrue: [ tokens add: token ] ] ]. "known abbreviation — keep the period inside the token" 63 | currentSentence := OrderedCollection new. 64 | allSentences := OrderedCollection new. 65 | lastToken := ''. 66 | Transcript 67 | show: tokens; 68 | cr. "NOTE(review): debug output left in — consider removing" 69 | tokens 70 | do: [ :token | 71 | Transcript 72 | show: token; 73 | cr. 74 | currentSentence add: token. 75 | ((token = '.' and: lastToken isAllDigits not) or: token = '?') "sentence ends at '.' (unless preceded by a number, e.g. '3.5') or at '?'. NOTE(review): and:/or: receive pre-evaluated expressions, not blocks — relies on Boolean>>value; confirm on the target dialect" 76 | ifTrue: [ allSentences addLast: currentSentence. 77 | currentSentence := OrderedCollection new ]. 78 | lastToken := token ]. 79 | currentSentence isNotEmpty "flush any trailing partial sentence" 80 | ifTrue: [ allSentences addLast: currentSentence ]. 81 | ^ allSentences 82 | ] 83 | 84 | { #category : #utiities } 85 | NLPsentences class >> tokenizeLeavePeriods: wordsInAString [ 86 | "tokenizes a string, keeping periods attached to tokens (note: '.' is deliberately absent from the separator list, unlike NLPtagger class >> tokenize:)" 87 | 88 | ^ wordsInAString 89 | findTokens: 90 | ' ;:,<>[]{}! 91 | @#$%^&*()?' 92 | keep: ';:.,<>[]{}!$?' " keep CR in this string!!
" 93 | ] 94 | -------------------------------------------------------------------------------- /src/KBSnlp/NLPsummarizer.class.st: -------------------------------------------------------------------------------- 1 | " 2 | A class that produces extractive summaries of English text by selecting the highest-scoring sentences. 3 | 4 | Copyright 2005-2017 Mark Watson. All rights reserved. Licensed for use under the MIT license with attribution required. 5 | 6 | See: https://github.com/mark-watson/nlp_smalltalk 7 | 8 | " 9 | Class { 10 | #name : #NLPsummarizer, 11 | #superclass : #Object, 12 | #category : #KBSnlp 13 | } 14 | 15 | { #category : #summary } 16 | NLPsummarizer class >> summarize: text [ 17 | "extractive summarizer: scores each sentence by the category scores of its tokens and answers the sentences scoring above a fixed threshold, re-joined into strings" 18 | 19 | | sentences sentenceScores tokens scoredCategories hash x bestIndices | 20 | scoredCategories := NLPcategories classify: text. 21 | sentences := NLPtagger sentences: text. 22 | sentenceScores := Array new: sentences size. 23 | 1 to: sentences size do: [ :i | 24 | sentenceScores at: i put: 0. 25 | tokens := sentences at: i. 26 | Transcript 27 | show: (sentences at: i); 28 | cr. "NOTE(review): debug output left in" 29 | scoredCategories 30 | do: [ :sc | 31 | hash := (Smalltalk at: #NlpCategoryHash) at: (sc at: 2). "NOTE(review): global is spelled #NlpCategoryHash here while the other globals use the #NLP... prefix — confirm the key matches what NLPcategories installs" 32 | tokens 33 | do: [ :token | 34 | (hash includesKey: token) 35 | ifTrue: [ x := hash at: token. "NOTE(review): x is assigned but never used — the increment below adds the category score (sc at: 1), not the per-token weight" 36 | sentenceScores at: i put: (sentenceScores at: i) + (sc at: 1) ] ] ] ]. 37 | bestIndices := sentenceScores 38 | collectWithIndex: [ :score :i | 39 | {score. 40 | i} ]. "pairs of {score. sentence index}" 41 | Transcript 42 | show: 'sentence scoring: '; 43 | show: bestIndices; 44 | cr. 45 | bestIndices := bestIndices select: [ :p | (p at: 1) > 2 ]. "keep only sentences whose score exceeds the fixed threshold 2" 46 | ^ bestIndices collect: [ :p | Character space join: (sentences at: (p at: 2)) ] 47 | ] 48 | -------------------------------------------------------------------------------- /src/KBSnlp/NLPtagger.class.st: -------------------------------------------------------------------------------- 1 | " 2 | NLP tagger converted to Squeak. 3 | A class that implements an NLP tagger.
4 | 5 | Copyright 2005-2017 Mark Watson. All rights reserved. Licensed for use under the MIT license with attribution required. 6 | 7 | See: https://github.com/mark-watson/nlp_smalltalk 8 | 9 | " 10 | Class { 11 | #name : #NLPtagger, 12 | #superclass : #Object, 13 | #classVars : [ 14 | 'NLPlexicon' 15 | ], 16 | #category : #KBSnlp 17 | } 18 | 19 | { #category : #tagging } 20 | NLPtagger class >> initializeLexicon [ 21 | "Read data/lexicon.txt and build in memory lexicon" 22 | 23 | | read count strm aLine word taglist token lex repo | 24 | lex := Dictionary new. 25 | repo := IceRepository registeredRepositoryIncludingPackage: (self class) package. 26 | read := ((repo location) asString , '/lexicon.txt') asFileReference readStream . "bug fix: sending #readStream straight to the String streamed over the path characters themselves, never opening the file" 27 | count := 0. 28 | [ read atEnd ] 29 | whileFalse: [ count := count + 1. 30 | aLine := read upTo: Character lf. "Mac: use lf, Windows: use cr ???" 31 | strm := ReadStream on: aLine. 32 | word := strm upTo: Character space. "lexicon line format: word tag1 tag2 ..." 33 | taglist := OrderedCollection new. 34 | [ strm atEnd ] 35 | whileFalse: [ token := strm upTo: Character space. 36 | taglist add: token ]. 37 | "Transcript show: word; cr." 38 | "Transcript show: taglist printString; cr." 39 | lex at: word put: taglist ]. 40 | read close. 41 | Smalltalk at: #NLPlexicon put: lex 42 | ] 43 | 44 | { #category : #tagging } 45 | NLPtagger class >> pptag: wordString [ 46 | "returns a string of word/tag ..." 47 | 48 | | words tags write size count | 49 | words := NLPtagger tokenize: wordString. 50 | tags := NLPtagger tag: words. 51 | write := TextStream on: String new. 52 | size := words size. 53 | count := 1. 54 | [count <= size] 55 | whileTrue: [ 56 | write nextPutAll: (words at: count). 57 | write nextPutAll: '/'. 58 | write nextPutAll: (tags at: count). 59 | write nextPutAll: ' '. 60 | count := count + 1]. 61 | ^write contents string 62 | ] 63 | 64 | { #category : #segmentation } 65 | NLPtagger class >> sentences: data [ 66 | "Handle either a string or array of tokens. 67 | Limitations: 68 | 1.
This code does not currently handle special characters like — 69 | 2. Periods in numbers: only check previous character, not the 70 | next so a sentence ending with e.g., 2. will not be handled correctly. 71 | " 72 | 73 | | tokens lastToken currentSentence allSentences token | 74 | tokens := (data isMemberOf: ByteString) 75 | ifTrue: [ NLPtagger tokenize: data ] "bug fix: ifTrue:ifFalse: arguments must be blocks; the old parenthesized form was evaluated eagerly and relied on non-block receivers answering #value" 76 | ifFalse: [ data ]. 77 | currentSentence := OrderedCollection new. 78 | allSentences := OrderedCollection new. 79 | lastToken := ''. 80 | Transcript 81 | show: tokens; 82 | cr. 83 | tokens 84 | do: [ :token1 | 85 | ((Smalltalk at: #NLPtokensWithPeriods) includes: token1) 86 | ifTrue: [ token := token1 "bug fix: the collection does not understand #get: (was a doesNotUnderstand); membership is already established, so keep the abbreviation token unchanged" ] 87 | ifFalse: [ token := token1 ]. 88 | 89 | Transcript 90 | show: token; 91 | cr. 92 | currentSentence add: token. 93 | ((token = '.' and: lastToken isAllDigits not) or: token = '?') "sentence boundary: '.' not preceded by a number, or '?'" 94 | ifTrue: [ allSentences addLast: currentSentence. 95 | currentSentence := OrderedCollection new ]. 96 | lastToken := token ]. 97 | currentSentence isNotEmpty "flush any trailing partial sentence" 98 | ifTrue: [ allSentences addLast: currentSentence ]. 99 | ^ allSentences 100 | ] 101 | 102 | { #category : #tagging } 103 | NLPtagger class >> tag: words [ 104 | "tag an ordered collection of words, returning an ordered collection of corresponding tags" 105 | 106 | | lex tags tag count i word lastWord lastTag | 107 | tags := OrderedCollection new. 108 | lex := Smalltalk at: #NLPlexicon. 109 | words do: 110 | [:aWord | 111 | tag := lex at: aWord ifAbsent: [nil]. 112 | tag isNil ifFalse: [tag := tag at: 1] ifTrue: [tag := 'NN']. " the default tag " 113 | tags add: tag]. 114 | " Apply transformation rules: " 115 | lastWord := ''. 116 | lastTag := ''. 117 | i := 0. 118 | count := words size. 119 | [i < count] whileTrue: 120 | [i := i + 1. 121 | word := words at: i. 122 | tag := tags at: i.
" reuse tag variable " 123 | " First, handle all rules for i > 1. Note: & and | are eager (non-short-circuit) binary boolean messages; both operands are always evaluated " 124 | i > 1 125 | ifTrue: 126 | [" rule 1: DT, {VBD | VBP} --> DT, NN " 127 | 128 | lastTag = 'DT' & (tag = 'VBD' | (tag = 'VBP') | (tag = 'VB')) 129 | ifTrue: [tags at: i put: 'NN']. 130 | tag size > 1 131 | ifTrue: 132 | [" rule 6: convert a noun to a verb if the preceding word is 'would' " 133 | (tag at: 1) = $N & ((tag at: 2) = $N) & (lastWord asLowercase = 'would') 134 | ifTrue: [tags at: i put: 'VB']]]. 135 | " Now, handle the remaining rules that are valid for i = 1: " 136 | " rule 2: convert a noun to a number (CD) if '.' appears in the word" 137 | (word findString: '.') > 0 138 | ifTrue: [(tag at: 1) = $N ifTrue: [tags at: i put: 'CD']]. " not working - tokenizer tosses '.' characters " 139 | " rule 3: convert a noun to a past participle if words[i] ends with 'ed' " 140 | (tag at: 1) = $N & (word endsWith: 'ed') ifTrue: [tags at: i put: 'VBN']. 141 | " rule 4: convert any type to adverb if it ends in 'ly' " 142 | (word endsWith: 'ly') ifTrue: [tags at: i put: 'RB']. 143 | " rule 5: convert a common noun (NN or NNS) to an adjective if it ends with 'al' " 144 | (tag at: 1) = $N & (word endsWith: 'al') ifTrue: [tags at: i put: 'JJ']. 145 | " rule 7: if a word has been categorized as a common noun and it ends with 's', " 146 | " then set its type to plural common noun (NNS) " 147 | tag = 'NN' & (word endsWith: 's') ifTrue: [tags at: i put: 'NNS']. 148 | " rule 8: convert a common noun to a present participle verb (i.e., a gerund) " 149 | (tag at: 1) = $N & (word endsWith: 'ing') ifTrue: [tags at: i put: 'VBG']. 150 | lastWord := word. 151 | lastTag := tag]. 152 | ^tags 153 | ] 154 | 155 | { #category : #tokenization } 156 | NLPtagger class >> tokenize: wordsInAString [ 157 | "tokenizes a string; note '.' IS a separator here, unlike NLPsentences class >> tokenizeLeavePeriods:" 158 | 159 | ^ wordsInAString 160 | findTokens: 161 | ' ;:.,<>[]{}! 162 | @#$%^&*()?' 163 | keep: ';:.,<>[]{}!$?' " keep CR in this string!!
" 164 | ] 165 | -------------------------------------------------------------------------------- /src/KBSnlp/package.st: -------------------------------------------------------------------------------- 1 | Package { #name : #KBSnlp } 2 | -------------------------------------------------------------------------------- /tags_2gram.json: -------------------------------------------------------------------------------- 1 | { 2 | "chemistry": { 3 | "chemical reaction": 1.55, 4 | "atoms molecules": 0.6, 5 | "organic matter": 0.55, 6 | "electrons are": 0.55, 7 | "carbon carbon": 0.5, 8 | "periodic table": 0.5, 9 | "chemical reactions": 0.5, 10 | "carbon atom": 0.5 11 | }, 12 | "computers": { 13 | "computer system": 0.9, 14 | "operating system": 0.75, 15 | "random memory": 0.65, 16 | "computer science": 0.65, 17 | "computer program": 0.6, 18 | "osi reference": 0.5 19 | }, 20 | "computers_ai": { 21 | "artificial intelligence": 1.45, 22 | "ai research": 1.6, 23 | "john mccarthy": 0.95, 24 | "strong ai": 0.8, 25 | "computer science": 0.8, 26 | "symbolic ai": 0.7, 27 | "language processing": 0.55, 28 | "alan turing": 0.55 29 | }, 30 | "computers_ai_datamining": { 31 | "data mining": 1.0, 32 | "machine learning": 0.65, 33 | "ai artificial": 0.85, 34 | "knowledgebooks ai": 0.85, 35 | "mining knowledgebooks": 0.85, 36 | "datamining data": 0.85, 37 | "intelligence datamining": 0.85, 38 | "bayesian networks": 0.55, 39 | "mining algorithms": 0.55, 40 | "mining knowledge": 0.5, 41 | "knowledge databases": 0.5, 42 | "terabytes data": 0.5, 43 | "databases world": 0.5, 44 | "largest databases": 0.5, 45 | "this tutorial": 0.5, 46 | "reinforcement learning": 0.5 47 | }, 48 | "computers_ai_learning": { 49 | "machine learning": 1.8, 50 | "neural networks": 0.55, 51 | "artificial intelligence": 2.2, 52 | "neural network": 0.55, 53 | "human brain": 1.25, 54 | "learning algorithms": 1.1, 55 | "fuzzy logic": 0.7, 56 | "artificial neural": 0.7, 57 | "learning theory": 0.7, 58 | "ai magazine": 
0.55, 59 | "speech recognition": 0.55, 60 | "computational learning": 0.55, 61 | "learning algorithm": 0.55, 62 | "supervised learning": 0.55, 63 | "computer vision": 0.55, 64 | "pattern recognition": 0.55, 65 | "hacking knowledge": 0.55 66 | }, 67 | "computers_ai_nlp": { 68 | "natural language": 0.8, 69 | "machine learning": 0.75, 70 | "language processing": 0.5, 71 | "million words": 0.5 72 | }, 73 | "computers_ai_search": { 74 | "node goal": 3.0, 75 | "data mining": 3.0, 76 | "mining data": 3.0, 77 | "text mining": 3.0, 78 | "search lucene": 3.0, 79 | "ai search": 3.0, 80 | "intelligence ai": 3.0, 81 | "artificial intelligence": 3.0, 82 | "goal node": 2.5, 83 | "search results": 2.0, 84 | "heuristic value": 1.5, 85 | "worst complexity": 1.0, 86 | "search strategies": 1.0, 87 | "guaranteed halt": 1.0, 88 | "depth search": 1.0, 89 | "derive heuristic": 1.0, 90 | "heuristic information": 1.0, 91 | "goal heuristic": 1.0, 92 | "heuristic search": 1.0, 93 | "intelligent systems": 1.0, 94 | "relevant query": 1.0, 95 | "results snippet": 1.0, 96 | "googlebot crawls": 1.0, 97 | "branching factors": 0.5, 98 | "graph infinite": 0.5, 99 | "exponential space": 0.5, 100 | "lowest exponential": 0.5, 101 | "path lengths": 0.5, 102 | "linear space": 0.5, 103 | "summary search": 0.5, 104 | "space complexity": 0.5, 105 | "complexity space": 0.5, 106 | "path algorithms": 0.5, 107 | "increases exponentially": 0.5, 108 | "bound search": 0.5, 109 | "globally minimal": 0.5, 110 | "search heuristic": 0.5, 111 | "priority queue": 0.5, 112 | "goal path": 0.5, 113 | "heuristic path": 0.5, 114 | "path heuristic": 0.5, 115 | "goal paths": 0.5, 116 | "networks ai": 0.5, 117 | "neural networks": 0.5, 118 | "google servers": 0.5, 119 | "map googlebot": 0.5, 120 | "crawls web": 0.5 121 | }, 122 | "computers_ai_textmining": { 123 | "parameters call": 5.15, 124 | "text mining": 0.9, 125 | "marti hearst": 3.25, 126 | "words format": 2.15, 127 | "natural language": 1.75, 128 | "format parameters": 
1.45, 129 | "documents words": 1.45, 130 | "text analytics": 0.8, 131 | "preslav nakov": 1.35, 132 | "curt monash": 1.35, 133 | "data mining": 1.2, 134 | "semantic space": 1.2, 135 | "information retrieval": 1.2, 136 | "information extraction": 0.9, 137 | "search engine": 0.9, 138 | "nakov schwartz": 0.8, 139 | "language processing": 0.8, 140 | "representation documents": 0.8, 141 | "training algorithm": 0.8, 142 | "vector machine": 0.8, 143 | "support vector": 0.8, 144 | "file parameters": 0.8, 145 | "hearst proceedings": 0.65, 146 | "nakov marti": 0.65, 147 | "emilia stoica": 0.65, 148 | "barbara rosario": 0.65, 149 | "search engines": 0.65, 150 | "text storage": 0.65, 151 | "search text": 0.65, 152 | "space documents": 0.65, 153 | "via training": 0.65, 154 | "learns via": 0.65, 155 | "compact documents": 0.65, 156 | "information text": 0.5, 157 | "hlt naacl": 0.5, 158 | "rosario marti": 0.5, 159 | "genomics track": 0.5, 160 | "trec genomics": 0.5, 161 | "stoica marti": 0.5, 162 | "categories search": 0.5, 163 | "odp dmoz": 0.5, 164 | "social networking": 0.5, 165 | "directories filtering": 0.5, 166 | "call search": 0.5, 167 | "tiling representation": 0.5, 168 | "space representation": 0.5, 169 | "documents bow": 0.5, 170 | "text format": 0.5, 171 | "format file": 0.5, 172 | "bioscience researchers": 0.5, 173 | "word document": 0.5 174 | }, 175 | "computers_microsoft": { 176 | "microsoft corporation": 4.25, 177 | "windows vista": 3.8, 178 | "corporation microsoft": 2.85, 179 | "microsoft office": 2.0, 180 | "operating system": 1.6, 181 | "playtech estonia": 1.55, 182 | "sourcenext corporation": 0.95, 183 | "casino playtech": 0.85, 184 | "mamut asa": 0.85, 185 | "corporation sourcenext": 0.8, 186 | "software professor": 0.8, 187 | "bill gates": 0.8, 188 | "professor teaches": 0.75, 189 | "enginia research": 0.75, 190 | "windows xp": 0.75, 191 | "avanquest usa": 0.65, 192 | "encyclopaedia britannica": 0.65, 193 | "windows windows": 0.65, 194 | "asa mamut": 0.6, 195 
| "software llc": 0.55, 196 | "punch software": 0.55, 197 | "microsoft visual": 0.55, 198 | "internet explorer": 0.55, 199 | "memory manager": 0.55, 200 | "microsoft windows": 0.5 201 | }, 202 | "computers_programming": { 203 | "program programming": 3.95, 204 | "scheme program": 3.95, 205 | "lisp scheme": 3.95, 206 | "scala lisp": 3.95, 207 | "java scala": 3.95, 208 | "lisp java": 3.95, 209 | "java lisp": 3.95, 210 | "debugging java": 3.95, 211 | "debug debugging": 3.95, 212 | "logic debug": 3.95, 213 | "netbeans logic": 3.95, 214 | "intellij netbeans": 3.95, 215 | "eclipse intellij": 3.95, 216 | "ide eclipse": 3.95, 217 | "compilers ide": 3.95, 218 | "compiler compilers": 3.95, 219 | "programming language": 2.45, 220 | "programming compiler": 3.45, 221 | "pl pl": 2.95, 222 | "computer programs": 2.45, 223 | "iso iec": 2.45, 224 | "software development": 1.95, 225 | "something until": 0.95, 226 | "person something": 0.95, 227 | "file output": 0.95, 228 | "arrays file": 0.95, 229 | "actors arrays": 0.95, 230 | "turtles actors": 0.95, 231 | "project turtles": 0.95, 232 | "mastermind project": 0.95, 233 | "iteration mastermind": 0.95, 234 | "sentence iteration": 0.95, 235 | "word sentence": 0.95, 236 | "variables word": 0.95, 237 | "global variables": 0.95, 238 | "variables global": 0.95, 239 | "local variables": 0.95, 240 | "recursion local": 0.95, 241 | "predicates recursion": 0.95, 242 | "events predicates": 0.95, 243 | "interface events": 0.95, 244 | "sentences interface": 0.95, 245 | "words sentences": 0.95, 246 | "operators words": 0.95, 247 | "defining operators": 0.95, 248 | "operators defining": 0.95, 249 | "primitive operators": 0.95, 250 | "inputs primitive": 0.95, 251 | "procedure inputs": 0.95, 252 | "structure procedure": 0.95, 253 | "hierarchical structure": 0.95, 254 | "animation hierarchical": 0.95, 255 | "iteration animation": 0.95, 256 | "commands iteration": 0.95, 257 | "adding commands": 0.95, 258 | "pseudocode adding": 0.95, 259 | "requirements 
analysis": 0.95, 260 | "efficient evolvable": 0.95, 261 | "debate extent": 0.95, 262 | "going debate": 0.95, 263 | "development process": 0.95, 264 | "source code": 0.95, 265 | "visual visual": 0.95, 266 | "visual basic": 0.95, 267 | "objective objective": 0.95, 268 | "modula modula": 0.95, 269 | "intermediate language": 0.95, 270 | "common intermediate": 0.95, 271 | "prototype ada": 0.95 272 | }, 273 | "computers_programming_c++": { 274 | "bjarne stroustrup": 1.15, 275 | "source code": 0.95, 276 | "std cout": 0.75, 277 | "template library": 0.75, 278 | "world program": 0.75, 279 | "operator overloading": 0.75, 280 | "programming languages": 0.75, 281 | "type this": 0.55, 282 | "representation type": 0.55, 283 | "members class": 0.55, 284 | "templates are": 0.55, 285 | "conditional compilation": 0.55, 286 | "namespace std": 0.55, 287 | "library template": 0.55, 288 | "type checking": 0.55, 289 | "derived class": 0.55, 290 | "hello world": 0.55, 291 | "iso iec": 0.55, 292 | "exception handling": 0.55, 293 | "multiple inheritance": 0.55, 294 | "bell labs": 0.55, 295 | "oriented programming": 0.55, 296 | "multi paradigm": 0.55 297 | }, 298 | "computers_programming_java": { 299 | "web toolkit": 2.2, 300 | "google web": 2.2, 301 | "web services": 1.25, 302 | "eberhard wolff": 1.9, 303 | "java classes": 1.55, 304 | "java platform": 1.55, 305 | "wolff interface": 1.25, 306 | "ouml ller": 1.25, 307 | "rgen ouml": 1.25, 308 | "uuml rgen": 1.25, 309 | "mdash java": 1.25, 310 | "mdash introduction": 1.25, 311 | "toolkit gwt": 0.95, 312 | "register read": 0.95, 313 | "login register": 0.95, 314 | "ller interface": 0.95, 315 | "jax conference": 0.95, 316 | "spring jax": 0.95, 317 | "interface spring": 0.95, 318 | "source code": 0.95, 319 | "web service": 0.95, 320 | "axis spring": 0.95, 321 | "business logic": 0.95, 322 | "mdash lessons": 0.95, 323 | "java language": 0.6, 324 | "java web": 0.95, 325 | "java ee": 0.95, 326 | "java tutorials": 0.95, 327 | "unit test": 0.6, 328 | 
"browser button": 0.6, 329 | "widgets history": 0.6, 330 | "ui widgets": 0.6, 331 | "features ui": 0.6, 332 | "gwt features": 0.6, 333 | "gwt lets": 0.6, 334 | "ajax applications": 0.6, 335 | "spring evening": 0.6, 336 | "are presentations": 0.6, 337 | "timezone location": 0.6, 338 | "tue timezone": 0.6, 339 | "alex tue": 0.6, 340 | "ben alex": 0.6, 341 | "submitted ben": 0.6, 342 | "melbourne spring": 0.6, 343 | "christian dupuis": 0.6, 344 | "description spring": 0.6, 345 | "web login": 0.6, 346 | "mike wiesner": 0.6, 347 | "johnson interface": 0.6, 348 | "rod johnson": 0.6, 349 | "spring uuml": 0.6, 350 | "timezone description": 0.6, 351 | "wed timezone": 0.6, 352 | "wolff wed": 0.6, 353 | "submitted eberhard": 0.6, 354 | "existing classes": 0.6, 355 | "approach create": 0.6, 356 | "bottom approach": 0.6, 357 | "down approach": 0.6, 358 | "producer side": 0.6, 359 | "string accountid": 0.6, 360 | "accountmanager methods": 0.6, 361 | "web tier": 0.6, 362 | "application servers": 0.6, 363 | "certain application": 0.6, 364 | "applications today": 0.6, 365 | "services axis": 0.6, 366 | "ee applications": 0.6, 367 | "services this": 0.6, 368 | "soap web": 0.6, 369 | "ranges are": 0.6, 370 | "vista os": 0.6, 371 | "windows vista": 0.6, 372 | "microsoft windows": 0.6, 373 | "slider microsoft": 0.6, 374 | "custom component": 0.6, 375 | "ui delegate": 0.6, 376 | "core swing": 0.6, 377 | "easier maintain": 0.6, 378 | "process creating": 0.6, 379 | "swing component": 0.6, 380 | "creating custom": 0.6, 381 | "mdash api": 0.6, 382 | "virtual machine": 0.6, 383 | "java virtual": 0.6, 384 | "naming directory": 0.6, 385 | "note this": 0.6, 386 | "overview features": 0.6, 387 | "swing mdash": 0.6, 388 | "introduction java": 0.6, 389 | "getting started": 0.6, 390 | "refer box": 0.6, 391 | "this refer": 0.6, 392 | "buy this": 0.6, 393 | "tutorial buy": 0.6, 394 | "ee tutorial": 0.6, 395 | "tutorial java": 0.6, 396 | "java tutorial": 0.6, 397 | "tutorials java": 0.6, 398 | "java 
se": 0.6 399 | }, 400 | "computers_programming_lisp": { 401 | "common lisp": 1.4, 402 | "code data": 1.4, 403 | "data syntax": 1.4, 404 | "haskell common": 1.05, 405 | "scheme haskell": 1.05, 406 | "racket scheme": 1.05, 407 | "cons cons": 1.05, 408 | "elements empty": 1.05, 409 | "lisp code": 1.05, 410 | "higher order": 1.05, 411 | "languages lisp": 1.05, 412 | "encoding code": 1.05, 413 | "lisp language": 1.05, 414 | "programming language": 1.05, 415 | "language implementation": 1.05, 416 | "recursive call": 0.7, 417 | "combinator recursive": 0.7, 418 | "lambda calculus": 0.7, 419 | "listing shows": 0.7, 420 | "nil cons": 0.7, 421 | "cons nil": 0.7, 422 | "element elements": 0.7, 423 | "empty nil": 0.7, 424 | "lisp tradition": 0.7, 425 | "language extension": 0.7, 426 | "mutually incompatible": 0.7, 427 | "meta programs": 0.7, 428 | "structure fixed": 0.7, 429 | "mainstream languages": 0.7, 430 | "generating code": 0.7, 431 | "non terminals": 0.7, 432 | "data structures": 0.7, 433 | "metaprograms lisp": 0.7, 434 | "data structure": 0.7, 435 | "intellectual property": 0.7, 436 | "language itself": 0.7, 437 | "nested lists": 0.7, 438 | "code nested": 0.7, 439 | "data programs": 0.7, 440 | "language design": 0.7, 441 | "manipulate programs": 0.7, 442 | "write metaprograms": 0.7, 443 | "easily write": 0.7, 444 | "lisp programs": 0.7, 445 | "machine code": 0.7 446 | }, 447 | "computers_programming_ruby": { 448 | "ruby rails": 1.65, 449 | "gem install": 5.0, 450 | "ruby ruby": 5.0, 451 | "rails rubyine": 3.35, 452 | "ruby basics": 3.35, 453 | "mswin mongrel": 3.35, 454 | "mongrel mswin": 3.35, 455 | "mongrel ruby": 3.35, 456 | "bin gem": 3.35, 457 | "local bin": 3.35, 458 | "usr local": 3.35, 459 | "sudo gem": 3.35, 460 | "railsconf ruby": 3.35, 461 | "gemfile sinatra": 3.35, 462 | "gem gemfile": 3.35, 463 | "rails gem": 3.35, 464 | "rspec ruby": 3.35, 465 | "recursive processes": 3.35, 466 | "rubyine railsconf": 1.65, 467 | "rubymine ruby": 1.65, 468 | "ide rubymine": 
1.65, 469 | "rails ide": 1.65, 470 | "source ruby": 1.65, 471 | "open source": 1.65, 472 | "radrails open": 1.65, 473 | "maps radrails": 1.65, 474 | "hash maps": 1.65, 475 | "ing hash": 1.65, 476 | "syntax ing": 1.65, 477 | "value syntax": 1.65, 478 | "key value": 1.65, 479 | "entry key": 1.65, 480 | "specify entry": 1.65, 481 | "hash specify": 1.65, 482 | "define hash": 1.65, 483 | "below define": 1.65, 484 | "shown below": 1.65, 485 | "shortcut shown": 1.65, 486 | "brace shortcut": 1.65, 487 | "curly brace": 1.65, 488 | "construct curly": 1.65, 489 | "hash construct": 1.65, 490 | "hash hash": 1.65, 491 | "class hash": 1.65, 492 | "instantiating class": 1.65, 493 | "either instantiating": 1.65, 494 | "created either": 1.65, 495 | "es created": 1.65, 496 | "hash es": 1.65, 497 | "relationship hash": 1.65, 498 | "definition relationship": 1.65, 499 | "word definition": 1.65, 500 | "value word": 1.65, 501 | "another value": 1.65, 502 | "word another": 1.65, 503 | "key word": 1.65, 504 | "map key": 1.65, 505 | "dictionaries map": 1.65, 506 | "conceptually dictionaries": 1.65, 507 | "similar conceptually": 1.65, 508 | "are similar": 1.65, 509 | "hashes are": 1.65, 510 | "container hashes": 1.65, 511 | "storage container": 1.65, 512 | "data storage": 1.65, 513 | "kind data": 1.65, 514 | "another kind": 1.65, 515 | "hash another": 1.65, 516 | "hashes hash": 1.65, 517 | "irb hashes": 1.65, 518 | "confuse irb": 1.65, 519 | "wondering confuse": 1.65, 520 | "interactive wondering": 1.65, 521 | "ruby interactive": 1.65, 522 | "stands ruby": 1.65, 523 | "ri stands": 1.65, 524 | "oh ri": 1.65, 525 | "array oh": 1.65, 526 | "ri array": 1.65, 527 | "class ri": 1.65, 528 | "ed class": 1.65, 529 | "shell ed": 1.65, 530 | "ruby shell": 1.65, 531 | "system ruby": 1.65, 532 | "operating system": 1.65, 533 | "window operating": 1.65, 534 | "terminal window": 1.65, 535 | "command terminal": 1.65, 536 | "ri command": 1.65, 537 | "entering ri": 1.65, 538 | "documentation entering": 1.65, 
539 | "reference documentation": 1.65, 540 | "ruby reference": 1.65, 541 | "via ruby": 1.65, 542 | "class via": 1.65, 543 | "array class": 1.65, 544 | "methods array": 1.65, 545 | "instance methods": 1.65, 546 | "methods instance": 1.65, 547 | "class methods": 1.65, 548 | "brackets class": 1.65, 549 | "square brackets": 1.65, 550 | "array square": 1.65, 551 | "inside array": 1.65, 552 | "place inside": 1.65, 553 | "enclose place": 1.65, 554 | "approach enclose": 1.65, 555 | "shortcut approach": 1.65, 556 | "construct shortcut": 1.65, 557 | "array construct": 1.65, 558 | "array array": 1.65, 559 | "class array": 1.65, 560 | "basics class": 1.65, 561 | "learn ruby": 1.65, 562 | "rails learn": 1.65, 563 | "guts rails": 1.65, 564 | "diving guts": 1.65, 565 | "before diving": 1.65, 566 | "ruby before": 1.65, 567 | "beneficial ruby": 1.65, 568 | "extremely beneficial": 1.65, 569 | "concerned extremely": 1.65, 570 | "far concerned": 1.65, 571 | "basics far": 1.65, 572 | "learning ruby": 1.65, 573 | "without learning": 1.65, 574 | "rails without": 1.65, 575 | "learn rails": 1.65, 576 | "possible learn": 1.65, 577 | "suggest possible": 1.65, 578 | "developers suggest": 1.65, 579 | "rails developers": 1.65, 580 | "ruby mongrel": 1.65, 581 | "platform mongrel": 1.65, 582 | "install platform": 1.65, 583 | "this gem": 1.65, 584 | "something this": 1.65, 585 | "shown something": 1.65, 586 | "platform shown": 1.65, 587 | "gem platform": 1.65, 588 | "appropriate gem": 1.65, 589 | "prompted appropriate": 1.65, 590 | "dependencies prompted": 1.65, 591 | "mongrel dependencies": 1.65, 592 | "install mongrel": 1.65, 593 | "command sudo": 1.65, 594 | "this command": 1.65, 595 | "type this": 1.65, 596 | "gem type": 1.65, 597 | "mongrel gem": 1.65, 598 | "better mongrel": 1.65, 599 | "apache better": 1.65, 600 | "lighttpd apache": 1.65, 601 | "scgi lighttpd": 1.65, 602 | "fastcgi scgi": 1.65, 603 | "applications fastcgi": 1.65, 604 | "compile applications": 1.65, 605 | "having compile": 
1.65, 606 | "without having": 1.65, 607 | "applications without": 1.65, 608 | "rails applications": 1.65, 609 | "server ruby": 1.65, 610 | "library server": 1.65, 611 | "alone library": 1.65, 612 | "stand alone": 1.65, 613 | "fast stand": 1.65, 614 | "mongrel fast": 1.65, 615 | "mongrel mongrel": 1.65, 616 | "step mongrel": 1.65, 617 | "this step": 1.65, 618 | "retry this": 1.65, 619 | "again retry": 1.65, 620 | "step again": 1.65, 621 | "beginning step": 1.65, 622 | "step beginning": 1.65, 623 | "instructed step": 1.65, 624 | "path instructed": 1.65, 625 | "didn path": 1.65, 626 | "gem didn": 1.65, 627 | "loaderror usr": 1.65, 628 | "rubygems loaderror": 1.65, 629 | "load rubygems": 1.65, 630 | "file load": 1.65, 631 | "require file": 1.65, 632 | "gem require": 1.65, 633 | "this usr": 1.65, 634 | "message this": 1.65, 635 | "dependencies message": 1.65, 636 | "rails dependencies": 1.65, 637 | "install rails": 1.65, 638 | "install sudo": 1.65, 639 | "line install": 1.65, 640 | "simple line": 1.65, 641 | "rails simple": 1.65, 642 | "installed rails": 1.65, 643 | "rubygems installed": 1.65, 644 | "rails rubygems": 1.65, 645 | "rails mongrel": 1.65, 646 | "programmers mongrel": 1.65, 647 | "ruby programmers": 1.65, 648 | "sinatra railsconf": 1.65, 649 | "imagemagick rspec": 1.65, 650 | "runs imagemagick": 1.65, 651 | "rmagick runs": 1.65, 652 | "memory rmagick": 1.65, 653 | "easier memory": 1.65, 654 | "minimagick easier": 1.65, 655 | "library minimagick": 1.65, 656 | "imagemagick library": 1.65, 657 | "wraps imagemagick": 1.65, 658 | "usage wraps": 1.65, 659 | "memory usage": 1.65, 660 | "features memory": 1.65, 661 | "processing features": 1.65, 662 | "advanced processing": 1.65, 663 | "terms advanced": 1.65, 664 | "daddy terms": 1.65, 665 | "grand daddy": 1.65, 666 | "rmagick grand": 1.65, 667 | "methods rmagick": 1.65, 668 | "builtin methods": 1.65, 669 | "ruby builtin": 1.65, 670 | "addition ruby": 1.65, 671 | "above addition": 1.65, 672 | "prompt above": 1.65, 
673 | "code prompt": 1.65, 674 | "ruby code": 1.65, 675 | "simplicity ruby": 1.65, 676 | "balance simplicity": 1.65, 677 | "found balance": 1.65, 678 | "ruby found": 1.65, 679 | "beauty ruby": 1.65, 680 | "sinatra beauty": 1.65, 681 | "ruby rspec": 1.65, 682 | "javascript ruby": 1.65, 683 | "functional javascript": 1.65, 684 | "responses functional": 1.65, 685 | "processes responses": 1.65, 686 | "linear recursive": 1.65, 687 | "differ linear": 1.65, 688 | "processes differ": 1.65, 689 | "better recursive": 1.65, 690 | "instance better": 1.65, 691 | "learning instance": 1.65, 692 | "still learning": 1.65, 693 | "let still": 1.65, 694 | "please let": 1.65, 695 | "wrong please": 1.65, 696 | "something wrong": 1.65, 697 | "programmer something": 1.65, 698 | "functional programmer": 1.65, 699 | "read functional": 1.65, 700 | "helpful read": 1.65, 701 | "this helpful": 1.65, 702 | "before this": 1.65, 703 | "functional before": 1.65, 704 | "curious functional": 1.65, 705 | "hope curious": 1.65, 706 | "understanding hope": 1.65, 707 | "balances understanding": 1.65, 708 | "exposure balances": 1.65, 709 | "benefits exposure": 1.65, 710 | "certainly benefits": 1.65, 711 | "ruby certainly": 1.65, 712 | "candidates ruby": 1.65, 713 | "are candidates": 1.65, 714 | "ruby are": 1.65, 715 | "rails ruby": 1.65 716 | }, 717 | "economics": { 718 | "goods services": 2.25, 719 | "cost average": 1.05, 720 | "terms starting": 1.05, 721 | "definitions terms": 1.05, 722 | "starting letter": 1.0, 723 | "nobel prize": 0.85, 724 | "price elasticity": 0.8, 725 | "total cost": 0.8, 726 | "elasticity demand": 0.7, 727 | "variable cost": 0.7, 728 | "fixed cost": 0.7, 729 | "cost total": 0.7, 730 | "income taxes": 0.65, 731 | "quantity demanded": 0.65, 732 | "baseball players": 0.6, 733 | "prize economics": 0.6, 734 | "quantity supplied": 0.6, 735 | "exchange rates": 0.55, 736 | "easy explanation": 0.55, 737 | "sense easy": 0.55, 738 | "common sense": 0.55, 739 | "gives common": 0.55, 740 | 
"price quantity": 0.55, 741 | "austrian school": 0.55, 742 | "econometrics project": 0.5, 743 | "returns scale": 0.5, 744 | "monetary policy": 0.5, 745 | "marginal cost": 0.5 746 | }, 747 | "health": { 748 | "blood pressure": 1.2, 749 | "blood vessels": 1.1, 750 | "immune system": 0.65, 751 | "blood flow": 0.55, 752 | "blood vessel": 0.5 753 | }, 754 | "health_exercise": { 755 | "warmup walk": 1.95, 756 | "minute warmup": 1.95, 757 | "brisk minute": 1.9, 758 | "pull squats": 1.7, 759 | "jog mile": 1.55, 760 | "walk mile": 1.05, 761 | "mile walk": 1.05, 762 | "heart rate": 0.95, 763 | "sets repetitions": 0.95, 764 | "perform sets": 0.9, 765 | "ball chair": 0.85, 766 | "seconds rest": 0.85, 767 | "smith machine": 0.85, 768 | "walk jog": 0.85, 769 | "jog miles": 0.75, 770 | "exercise ball": 0.7, 771 | "leg press": 0.6, 772 | "mile jog": 0.6, 773 | "seconds jog": 0.6, 774 | "squats pull": 0.55, 775 | "rounds pull": 0.55, 776 | "squats rounds": 0.55, 777 | "weight loss": 0.55, 778 | "rest sets": 0.55, 779 | "cable crossover": 0.55, 780 | "universal machine": 0.55, 781 | "starting position": 0.55 782 | }, 783 | "health_nutrition": { 784 | "healthiest foods": 1.15, 785 | "world healthiest": 1.15, 786 | "nutrition data": 1.05, 787 | "amino acids": 0.55, 788 | "heart disease": 0.8, 789 | "essential amino": 0.75, 790 | "blood sugar": 0.75, 791 | "beta carotene": 0.65, 792 | "saturated fats": 0.65, 793 | "daily values": 0.6, 794 | "blood pressure": 0.6, 795 | "fats are": 0.55, 796 | "foods are": 0.55, 797 | "soluble vitamin": 0.5, 798 | "water soluble": 0.5, 799 | "fatty acids": 0.5 800 | }, 801 | "mathematics": { 802 | "coordinate graph": 2.45, 803 | "property multiplication": 1.65, 804 | "square root": 1.65, 805 | "expression algebra": 1.65, 806 | "completeness theorem": 1.65, 807 | "ordered pair": 1.2, 808 | "common multiple": 1.2, 809 | "common factor": 1.2, 810 | "numerator denominator": 1.2, 811 | "axis intercept": 0.8, 812 | "identity property": 0.8, 813 | "equivalent 
fractions": 0.8, 814 | "commutative property": 0.8, 815 | "associative property": 0.8, 816 | "graph coordinate": 0.8, 817 | "axis coordinate": 0.8, 818 | "sum equals": 0.8, 819 | "combined sum": 0.8, 820 | "angles combined": 0.8, 821 | "angles opposite": 0.8, 822 | "multiplicative inverse": 0.8, 823 | "obtuse angle": 0.8, 824 | "common denominator": 0.8, 825 | "plotted coordinate": 0.8, 826 | "line plotted": 0.8, 827 | "natural integer": 0.8, 828 | "fraction natural": 0.8, 829 | "cartesian coordinates": 0.8, 830 | "central angle": 0.8, 831 | "additive inverse": 0.8, 832 | "angle measures": 0.8, 833 | "mathematical expression": 0.8, 834 | "logically valid": 0.8, 835 | "mathematical logic": 0.8 836 | }, 837 | "music": { 838 | "piece music": 1.1, 839 | "sheet music": 1.0, 840 | "art music": 0.8, 841 | "bruno mars": 0.7, 842 | "music theory": 0.7, 843 | "music cognition": 0.6, 844 | "perform music": 0.6, 845 | "music notation": 0.6, 846 | "popular music": 0.6, 847 | "toby keith": 0.5, 848 | "jason aldean": 0.5, 849 | "taylor swift": 0.5, 850 | "tamar braxton": 0.5, 851 | "nicki minaj": 0.5, 852 | "lil wayne": 0.5, 853 | "cognitive musicology": 0.5, 854 | "study music": 0.5, 855 | "classical music": 0.5 856 | }, 857 | "news": { 858 | "united states": 2.1, 859 | "points hours": 1.1, 860 | "north korea": 0.6, 861 | "middle east": 0.5 862 | }, 863 | "news_economy": { 864 | "national debt": 1.3, 865 | "interest rates": 1.1, 866 | "interest rate": 0.75, 867 | "consumer debt": 0.7, 868 | "deep poverty": 0.6, 869 | "economic growth": 0.6, 870 | "consumer spending": 0.6, 871 | "tea party": 0.5, 872 | "poor people": 0.5 873 | }, 874 | "news_politics": { 875 | "united states": 1.45, 876 | "middle east": 0.55, 877 | "white house": 0.5 878 | }, 879 | "news_war": { 880 | "united states": 2.6, 881 | "war weapons": 0.9, 882 | "white house": 0.7, 883 | "middle east": 0.7, 884 | "nuclear weapons": 0.7 885 | }, 886 | "news_weather": { 887 | "water vapor": 0.85, 888 | "low pressure": 
0.85, 889 | "weather forecast": 0.65, 890 | "air mass": 0.6, 891 | "atmospheric pressure": 0.55, 892 | "ice crystals": 0.55 893 | }, 894 | "physics": { 895 | "potential energy": 0.7, 896 | "quantum mechanics": 0.7, 897 | "motion body": 0.6, 898 | "electric current": 0.6, 899 | "water vapor": 0.55, 900 | "law motion": 0.5 901 | }, 902 | "religion": { 903 | "jesus christ": 1.1, 904 | "united states": 1.05, 905 | "roman catholic": 1.05, 906 | "stark finke": 0.75, 907 | "catholic church": 0.7, 908 | "old testament": 0.7, 909 | "judaism christianity": 0.7, 910 | "holy spirit": 0.65, 911 | "prophet muhammad": 0.6, 912 | "christianity islam": 0.6, 913 | "hebrew bible": 0.55, 914 | "eastern orthodox": 0.5, 915 | "century ce": 0.5 916 | }, 917 | "religion_buddhism": { 918 | "pure land": 0.7, 919 | "dalai lama": 0.7, 920 | "eightfold path": 0.65, 921 | "therav da": 0.6, 922 | "tibetan buddhism": 0.5 923 | }, 924 | "religion_christianity": { 925 | "jesus christ": 4.05, 926 | "roman catholic": 1.7, 927 | "holy spirit": 1.7, 928 | "old testament": 1.25, 929 | "christian church": 1.05, 930 | "catholic church": 0.85, 931 | "eastern orthodox": 0.85, 932 | "birth jesus": 0.85, 933 | "love bible": 0.85, 934 | "vatican city": 0.6, 935 | "orthodox churches": 0.6, 936 | "hebrew scripture": 0.6, 937 | "life jesus": 0.6, 938 | "judgment seat": 0.6, 939 | "christ learn": 0.6 940 | }, 941 | "religion_hinduism": { 942 | "upanishad stories": 1.0, 943 | "stories episodes": 0.9, 944 | "hindu gods": 0.65, 945 | "rig veda": 0.65, 946 | "bhagavad gita": 0.65, 947 | "sri ramakrishna": 0.65, 948 | "stage life": 0.6, 949 | "hatha yoga": 0.6, 950 | "gods goddesses": 0.5, 951 | "raja yoga": 0.5 952 | }, 953 | "religion_islam": { 954 | "shi ite": 1.2, 955 | "prophet muhammad": 0.75, 956 | "muslims believe": 0.75, 957 | "ka ba": 0.65, 958 | "mentioned qur": 0.65, 959 | "allah mentioned": 0.65, 960 | "abi talib": 0.65, 961 | "ali abi": 0.65, 962 | "pillars islam": 0.65, 963 | "muslim community": 0.55, 
964 | "old testament": 0.55, 965 | "qur old": 0.55, 966 | "muhammad ali": 0.55, 967 | "holy prophet": 0.55, 968 | "prophet allah": 0.55, 969 | "imam ali": 0.55, 970 | "qur anic": 0.55 971 | }, 972 | "religion_judaism": { 973 | "jewish law": 0.5, 974 | "jewish liturgy": 1.1, 975 | "yom kippur": 1.1, 976 | "rosh hashanah": 0.8, 977 | "prayers blessings": 0.8, 978 | "pesach passover": 0.75, 979 | "sephardic jews": 0.75, 980 | "jewish occurring": 0.65, 981 | "jacob israel": 0.65, 982 | "type sacrifice": 0.65, 983 | "hebrew alphabet": 0.6, 984 | "lashon ra": 0.5, 985 | "toh ruh": 0.5, 986 | "movements judaism": 0.5, 987 | "occurring jewish": 0.5, 988 | "orthodox jews": 0.5 989 | }, 990 | "sports": { 991 | "american football": 1.15, 992 | "olympic games": 0.95, 993 | "olympic committee": 0.95, 994 | "rose bowl": 0.95, 995 | "horse racing": 0.7, 996 | "final score": 0.5, 997 | "ball games": 0.5, 998 | "contest game": 0.5, 999 | "sports team": 0.5 1000 | } 1001 | } 1002 | -------------------------------------------------------------------------------- /tokensWithPeriods.txt: -------------------------------------------------------------------------------- 1 | A. 2 | Adj. 3 | Adm. 4 | Adv. 5 | Asst. 6 | B. 7 | Bart. 8 | Bldg. 9 | Brig. 10 | Bros. 11 | C. 12 | Capt. 13 | Cmdr. 14 | Col. 15 | Comdr. 16 | Con. 17 | Cpl. 18 | D. 19 | DR. 20 | Dr. 21 | E. 22 | Ens. 23 | F. 24 | G. 25 | Gen. 26 | Gov. 27 | H. 28 | Hon. 29 | Hosp. 30 | I. 31 | Insp. 32 | J. 33 | K. 34 | L. 35 | Lt. 36 | M. 37 | M. 38 | MM. 39 | MR. 40 | MRS. 41 | MS. 42 | Maj. 43 | Messrs. 44 | Mlle. 45 | Mme. 46 | Mr. 47 | Mrs. 48 | Ms. 49 | Msgr. 50 | N. 51 | O. 52 | Op. 53 | Ord. 54 | P. 55 | Pfc. 56 | Ph. 57 | Prof. 58 | Pvt. 59 | Q. 60 | R. 61 | Rep. 62 | Reps. 63 | Rev. 64 | S. 65 | Sen. 66 | Sens. 67 | Sfc. 68 | Sgt. 69 | Sr. 70 | St. 71 | Supt. 72 | T. 73 | U. 74 | V. 75 | W. 76 | X. 77 | Y. 78 | Z. 79 | v. 80 | vs. 81 | Inc. 82 | U.S. 83 | U.S.A. 
84 | --------------------------------------------------------------------------------