├── .filetree ├── .github └── FUNDING.yml ├── .gitignore ├── .project ├── KBSnlp.package ├── .filetree ├── ManifestKBSnlp.class │ ├── README.md │ ├── class │ │ ├── ruleRBOnlyReadOrWrittenTemporaryRuleV1FalsePositive.st │ │ └── ruleRBToDoCollectRuleV1FalsePositive.st │ └── properties.json ├── NLPcategories.class │ ├── README.md │ ├── class │ │ ├── classify..st │ │ └── initializeCategoryHash.st │ └── properties.json ├── NLPentities.class │ ├── README.md │ ├── class │ │ ├── entities..st │ │ ├── entityHelper.text..st │ │ ├── fileToDictionary..st │ │ ├── humanNameHelper..st │ │ └── initializeEntities.st │ └── properties.json ├── NLPsentences.class │ ├── README.md │ ├── class │ │ ├── fileToSet..st │ │ ├── loadData.st │ │ ├── sentences..st │ │ └── tokenizeLeavePeriods..st │ └── properties.json ├── NLPsummarizer.class │ ├── README.md │ ├── class │ │ └── summarize..st │ └── properties.json ├── NLPtagger.class │ ├── README.md │ ├── class │ │ ├── initializeLexicon.st │ │ ├── pptag..st │ │ ├── sentences..st │ │ ├── tag..st │ │ └── tokenize..st │ └── properties.json ├── monticello.meta │ ├── categories.st │ ├── initializers.st │ └── package └── properties.json ├── LICENSE.txt ├── README.md ├── company_names.txt ├── firstnames.txt ├── honorifics.txt ├── lastnames.txt ├── lexicon.txt ├── placenames.txt ├── prefixnames.txt ├── product_names.txt ├── src ├── .properties ├── KBSnlp.st └── KBSnlp │ ├── ManifestKBSnlp.class.st │ ├── NLPcategories.class.st │ ├── NLPentities.class.st │ ├── NLPsentences.class.st │ ├── NLPsummarizer.class.st │ ├── NLPtagger.class.st │ └── package.st ├── tags.json ├── tags_2gram.json └── tokensWithPeriods.txt /.filetree: -------------------------------------------------------------------------------- 1 | {"packageExtension" : ".package", 2 | "propertyFileExtension" : ".json", 3 | "Metadata" : "false" } -------------------------------------------------------------------------------- /.github/FUNDING.yml: 
-------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: mark-watson # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2] 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | -------------------------------------------------------------------------------- /.project: -------------------------------------------------------------------------------- 1 | { 2 | 'srcDirectory' : 'src' 3 | } -------------------------------------------------------------------------------- /KBSnlp.package/.filetree: -------------------------------------------------------------------------------- 1 | { 2 | "noMethodMetaData" : true, 3 | "separateMethodMetaAndSource" : false, 4 | "useCypressPropertiesFile" : true } 5 | -------------------------------------------------------------------------------- /KBSnlp.package/ManifestKBSnlp.class/README.md: -------------------------------------------------------------------------------- 1 | Copyright 2005-2017 Mark Watson. All rights reserved. Licensed for use under the MIT license with attribution required. 
2 | 3 | See: https://github.com/mark-watson/nlp_smalltalk 4 | -------------------------------------------------------------------------------- /KBSnlp.package/ManifestKBSnlp.class/class/ruleRBOnlyReadOrWrittenTemporaryRuleV1FalsePositive.st: -------------------------------------------------------------------------------- 1 | code-critics 2 | ruleRBOnlyReadOrWrittenTemporaryRuleV1FalsePositive 3 | ^ #(#(#(#RGMethodDefinition #(#'NLPsummarizer class' #summarize: #true)) #'2017-05-14T21:23:23.063039-07:00') ) -------------------------------------------------------------------------------- /KBSnlp.package/ManifestKBSnlp.class/class/ruleRBToDoCollectRuleV1FalsePositive.st: -------------------------------------------------------------------------------- 1 | code-critics 2 | ruleRBToDoCollectRuleV1FalsePositive 3 | ^ #(#(#(#RGMethodDefinition #(#'NLPsummarizer class' #summarize: #true)) #'2017-05-14T21:25:54.536453-07:00') ) -------------------------------------------------------------------------------- /KBSnlp.package/ManifestKBSnlp.class/properties.json: -------------------------------------------------------------------------------- 1 | { 2 | "category" : "KBSnlp", 3 | "classinstvars" : [ 4 | ], 5 | "classvars" : [ 6 | ], 7 | "commentStamp" : "MarkWatson 5/19/2017 06:24", 8 | "instvars" : [ 9 | ], 10 | "name" : "ManifestKBSnlp", 11 | "pools" : [ 12 | ], 13 | "super" : "PackageManifest", 14 | "type" : "normal" } 15 | -------------------------------------------------------------------------------- /KBSnlp.package/NLPcategories.class/README.md: -------------------------------------------------------------------------------- 1 | A NLPcategories is class to categorize text. 2 | 3 | Copyright 2005-2017 Mark Watson. All rights reserved. Licensed for use under the MIT license with attribution required. 
4 | 5 | See: https://github.com/mark-watson/nlp_smalltalk 6 | -------------------------------------------------------------------------------- /KBSnlp.package/NLPcategories.class/class/classify..st: -------------------------------------------------------------------------------- 1 | classify 2 | classify: text 3 | "classify text in a string" 4 | 5 | | tokens categories scores num hash numTokens results cutoff | 6 | tokens := NLPtagger tokenize: (text , 'XXXXXX'). 7 | categories := (Smalltalk at: #NlpCategoryHash) keys. 8 | num := categories size. 9 | numTokens := tokens size - 1. 10 | scores := Array new: num. 11 | 1 to: num do: [ :i | 12 | scores at: i put: 0. 13 | hash := (Smalltalk at: #NlpCategoryHash) at: (categories at: i). 14 | 1 to: numTokens do: [ :j | 15 | (hash includesKey: (tokens at: j)) 16 | ifTrue: [scores at: i put: ((scores at: i) + (hash at: (tokens at: j)))] ]. 17 | hash := (Smalltalk at: #NlpCategory2gramHash) at: (categories at: i). 18 | 1 to: numTokens do: [ :j | 19 | (hash includesKey: ((tokens at: j) , ' ' , (tokens at: j + 1))) 20 | ifTrue: [scores at: i put: ((scores at: i)+ ((hash at: (tokens at: j) , ' ' , (tokens at: j + 1)) * 8))]]]. 21 | results := SortedCollection sortBlock: [:c1 :c2 | (c1 at:1) > (c2 at:1)]. 22 | 1 to: num do: [ :i | |a| a := (Array new: 2). a at: 1 put: (scores at:i); at: 2 put: (categories at: i). results add: a ]. 23 | cutoff := ((results at: 1) at: 1) / 2. 24 | results := results select: [ :x | (x at: 1) > cutoff ]. 25 | ^results. 26 | -------------------------------------------------------------------------------- /KBSnlp.package/NLPcategories.class/class/initializeCategoryHash.st: -------------------------------------------------------------------------------- 1 | classify 2 | initializeCategoryHash 3 | "requires NeoJSON" 4 | 5 | Smalltalk at: #NlpCategoryHash 6 | put: (NeoJSONReader fromString: (FileStream fileNamed: './nlp_smalltalk/tags.json') contentsOfEntireFile). 
7 | Smalltalk at: #NlpCategory2gramHash 8 | put: (NeoJSONReader fromString: (FileStream fileNamed: './nlp_smalltalk/tags_2gram.json') contentsOfEntireFile) -------------------------------------------------------------------------------- /KBSnlp.package/NLPcategories.class/properties.json: -------------------------------------------------------------------------------- 1 | { 2 | "category" : "KBSnlp", 3 | "classinstvars" : [ 4 | ], 5 | "classvars" : [ 6 | ], 7 | "commentStamp" : "MarkWatson 5/19/2017 06:25", 8 | "instvars" : [ 9 | ], 10 | "name" : "NLPcategories", 11 | "pools" : [ 12 | ], 13 | "super" : "Object", 14 | "type" : "normal" } 15 | -------------------------------------------------------------------------------- /KBSnlp.package/NLPentities.class/README.md: -------------------------------------------------------------------------------- 1 | A NLPentities is a class to find people's names, company names, place names, etc. in text. 2 | 3 | Copyright 2005-2017 Mark Watson. All rights reserved. Licensed for use under the MIT license with attribution required. 4 | 5 | See: https://github.com/mark-watson/nlp_smalltalk 6 | -------------------------------------------------------------------------------- /KBSnlp.package/NLPentities.class/class/entities..st: -------------------------------------------------------------------------------- 1 | entityDetection 2 | entities: aString 3 | "return a Dictionary of entities (keys type, values Sets" 4 | 5 | | temp result | 6 | result := Dictionary new. 7 | temp := NLPentities entityHelper: (Smalltalk at: #NLPcompanyNames) text: aString. 8 | temp size > 0 9 | ifTrue: [ result at: 'companies' put: temp ]. 10 | temp := NLPentities entityHelper: (Smalltalk at: #NLPproductNames) text: aString. 11 | temp size > 0 12 | ifTrue: [ result at: 'products' put: temp ]. 13 | temp := NLPentities entityHelper: (Smalltalk at: #NLPplaceNames) text: aString. 14 | temp size > 0 15 | ifTrue: [ result at: 'places' put: temp ]. 
16 | temp := NLPentities humanNameHelper: aString. 17 | temp size > 0 18 | ifTrue: [ result at: 'places' put: temp ]. 19 | ^ result -------------------------------------------------------------------------------- /KBSnlp.package/NLPentities.class/class/entityHelper.text..st: -------------------------------------------------------------------------------- 1 | entityDetection 2 | entityHelper: entitySet text: aString 3 | "this is a helper method for everything **but** person names" 4 | 5 | | tokens num ngram2 ngram3 results | 6 | results := Set new. 7 | tokens := NLPtagger tokenize: aString , ' xxxxx yyyyy zzzzz'. 8 | num := tokens size - 3. " account for the 3 fake tokens at the end " 9 | 1 to: num do: [ :i | 10 | ngram2 := (tokens at: i) , ' ' , (tokens at: i + 1). 11 | ngram3 := ngram2 , ' ' , (tokens at: i + 2). "Transcript show: ngram2; cr." 12 | (entitySet includes: ngram3) 13 | ifTrue: [ results add: ngram3 ] 14 | ifFalse: [ 15 | (entitySet includes: ngram2) 16 | ifTrue: [ results add: ngram2 ] 17 | ifFalse: [ 18 | (entitySet includes: (tokens at: i)) 19 | ifTrue: [ results add: (tokens at: i) ] ] ] ]. 20 | ^ results -------------------------------------------------------------------------------- /KBSnlp.package/NLPentities.class/class/fileToDictionary..st: -------------------------------------------------------------------------------- 1 | entityDetection 2 | fileToDictionary: filePath 3 | 4 | "Read data/lexicon.txt and build in memory lexicon" 5 | 6 | | read count aLine strm set | 7 | 8 | Transcript show: 'Processing file ' , filePath; cr. 9 | 10 | set := Set new. 11 | read := (MultiByteFileStream fileNamed: filePath) readOnly. 12 | count := 0. 13 | [read atEnd] 14 | whileFalse: [count := count + 1. 15 | aLine := read upTo: Character lf. "Mac: use lf, Windows: use cr ???" 16 | "look for a space character: " 17 | ((aLine indexOf: $:) > 0) 18 | ifTrue: [ 19 | strm := ReadStream on: aLine. 20 | aLine := strm upTo: $:]. 21 | set add: aLine]. 22 | read close. 
23 | ^set 24 | -------------------------------------------------------------------------------- /KBSnlp.package/NLPentities.class/class/humanNameHelper..st: -------------------------------------------------------------------------------- 1 | entityDetection 2 | humanNameHelper: aString 3 | "this is a helper method for everything **but** person names" 4 | 5 | | tokens num results | 6 | results := Set new. 7 | tokens := NLPtagger tokenize: aString , ' xxxxx yyyyy zzzzz'. 8 | num := tokens size - 3. " account for the 3 fake tokens at the end " 9 | 1 to: num do: [ :i | 10 | ((Smalltalk at: #NLPfirstNames) includes: (tokens at: i)) 11 | ifTrue: [ 12 | (((Smalltalk at: #NLPfirstNames) includes: (tokens at: i + 1)) 13 | and: ((Smalltalk at: #NLPlastNames) includes: (tokens at: i + 2))) 14 | ifTrue: [ 15 | results add: (tokens at: i) , ' ' , (tokens at: i + 1) , ' ' , (tokens at: i + 2). 16 | i := i + 2 ] 17 | ifFalse: [ 18 | ((Smalltalk at: #NLPlastNames) includes: (tokens at: i + 1)) 19 | ifTrue: [ 20 | results add: (tokens at: i) , ' ' , (tokens at: i + 1). 21 | i := i + 1 ] ] ] ]. 22 | ^ results -------------------------------------------------------------------------------- /KBSnlp.package/NLPentities.class/class/initializeEntities.st: -------------------------------------------------------------------------------- 1 | entityDetection 2 | initializeEntities 3 | "load entity name data" 4 | 5 | " Note: place name lines of the form: Cairo:country_capital Fixed in fileToDictionary " 6 | 7 | Smalltalk 8 | at: #NLPcompanyNames 9 | put: (NLPentities fileToDictionary: './nlp_smalltalk/company_names.txt'). 10 | Smalltalk 11 | at: #NLPfirstNames 12 | put: (NLPentities fileToDictionary: './nlp_smalltalk/firstnames.txt'). 13 | Smalltalk 14 | at: #NLPlastNames 15 | put: (NLPentities fileToDictionary: './nlp_smalltalk/lastnames.txt'). 16 | Smalltalk 17 | at: #NLPhonorifics 18 | put: (NLPentities fileToDictionary: './nlp_smalltalk/honorifics.txt'). 
19 | Smalltalk 20 | at: #NLPprefixNames 21 | put: (NLPentities fileToDictionary: './nlp_smalltalk/prefixnames.txt'). 22 | Smalltalk 23 | at: #NLPplaceNames 24 | put: (NLPentities fileToDictionary: './nlp_smalltalk/placenames.txt'). 25 | Smalltalk 26 | at: #NLPproductNames 27 | put: (NLPentities fileToDictionary: './nlp_smalltalk/product_names.txt'). 28 | 29 | " also read in data we will need for sentence segmentation: " 30 | Smalltalk 31 | at: #NLPtokensWithPeriods 32 | put: (NLPentities fileToDictionary: './nlp_smalltalk/tokens_with_periods.txt'). -------------------------------------------------------------------------------- /KBSnlp.package/NLPentities.class/properties.json: -------------------------------------------------------------------------------- 1 | { 2 | "category" : "KBSnlp", 3 | "classinstvars" : [ 4 | ], 5 | "classvars" : [ 6 | ], 7 | "commentStamp" : "MarkWatson 5/19/2017 06:25", 8 | "instvars" : [ 9 | ], 10 | "name" : "NLPentities", 11 | "pools" : [ 12 | ], 13 | "super" : "Object", 14 | "type" : "normal" } 15 | -------------------------------------------------------------------------------- /KBSnlp.package/NLPsentences.class/README.md: -------------------------------------------------------------------------------- 1 | A class to segment text into sentences. 2 | 3 | Copyright 2005-2017 Mark Watson. All rights reserved. Licensed for use under the MIT license with attribution required. 4 | 5 | See: https://github.com/mark-watson/nlp_smalltalk 6 | -------------------------------------------------------------------------------- /KBSnlp.package/NLPsentences.class/class/fileToSet..st: -------------------------------------------------------------------------------- 1 | utiities 2 | fileToSet: filePath 3 | "Read file, create Set with elements being each line in file" 4 | 5 | | read aLine set | 6 | Transcript 7 | show: 'Processing file ' , filePath; 8 | cr. 9 | set := Set new. 10 | read := (MultiByteFileStream fileNamed: filePath) readOnly. 
11 | [ read atEnd ] 12 | whileFalse: [ aLine := read upTo: Character lf. "Mac: use lf, Windows: use cr ???" 13 | set add: aLine ]. 14 | read close. 15 | ^ set -------------------------------------------------------------------------------- /KBSnlp.package/NLPsentences.class/class/loadData.st: -------------------------------------------------------------------------------- 1 | initialize 2 | loadData 3 | "Load tokens that normally contain periods" 4 | 5 | | aSet count reverseDictionary forwardDictionary | 6 | count := 0. 7 | reverseDictionary := Dictionary new. 8 | forwardDictionary := Dictionary new. 9 | aSet := NLPsentences fileToSet: './nlp_smalltalk/tokensWithPeriods.txt'. 10 | Smalltalk at: #NLPtokensWithPeriods put: aSet. 11 | ^ 'tokens with periods data loaded' -------------------------------------------------------------------------------- /KBSnlp.package/NLPsentences.class/class/sentences..st: -------------------------------------------------------------------------------- 1 | segment 2 | sentences: someText 3 | "tokenize a string into individual sentences" 4 | 5 | | tokens aSet lastToken currentSentence allSentences | 6 | aSet := Smalltalk at: #NLPtokensWithPeriods. 7 | tokens := OrderedCollection new. 8 | (NLPsentences tokenizeLeavePeriods: someText) 9 | do: [ :token | 10 | (token includesSubstring: '.') not 11 | ifTrue: [ tokens add: token ] 12 | ifFalse: [ (aSet includes: token) 13 | ifFalse: [ tokens add: (token copyWithRegex: '\.' matchesReplacedWith: ''). 14 | tokens add: '.' ] 15 | ifTrue: [ tokens add: token ] ] ]. 16 | currentSentence := OrderedCollection new. 17 | allSentences := OrderedCollection new. 18 | lastToken := ''. 19 | Transcript 20 | show: tokens; 21 | cr. 22 | tokens 23 | do: [ :token | 24 | Transcript 25 | show: token; 26 | cr. 27 | currentSentence add: token. 28 | ((token = '.' and: lastToken isAllDigits not) or: token = '?') 29 | ifTrue: [ allSentences addLast: currentSentence. 30 | currentSentence := OrderedCollection new ]. 
31 | lastToken := token ]. 32 | currentSentence isNotEmpty 33 | ifTrue: [ allSentences addLast: currentSentence ]. 34 | ^ allSentences -------------------------------------------------------------------------------- /KBSnlp.package/NLPsentences.class/class/tokenizeLeavePeriods..st: -------------------------------------------------------------------------------- 1 | utiities 2 | tokenizeLeavePeriods: wordsInAString 3 | "tokenizes a string" 4 | 5 | ^ wordsInAString 6 | findTokens: 7 | ' ;:,<>[]{}! 8 | @#$%^&*()?' 9 | keep: ';:.,<>[]{}!$?' " keep CR in this string!! " -------------------------------------------------------------------------------- /KBSnlp.package/NLPsentences.class/properties.json: -------------------------------------------------------------------------------- 1 | { 2 | "category" : "KBSnlp", 3 | "classinstvars" : [ 4 | ], 5 | "classvars" : [ 6 | ], 7 | "commentStamp" : "MarkWatson 5/19/2017 06:26", 8 | "instvars" : [ 9 | ], 10 | "name" : "NLPsentences", 11 | "pools" : [ 12 | ], 13 | "super" : "Object", 14 | "type" : "normal" } 15 | -------------------------------------------------------------------------------- /KBSnlp.package/NLPsummarizer.class/README.md: -------------------------------------------------------------------------------- 1 | A class to classify English text into categories. 2 | 3 | Copyright 2005-2017 Mark Watson. All rights reserved. Licensed for use under the MIT license with attribution required. 4 | 5 | See: https://github.com/mark-watson/nlp_smalltalk 6 | -------------------------------------------------------------------------------- /KBSnlp.package/NLPsummarizer.class/class/summarize..st: -------------------------------------------------------------------------------- 1 | summary 2 | summarize: text 3 | "extractive summarizer" 4 | 5 | | sentences sentenceScores tokens scoredCategories hash x bestIndices | 6 | scoredCategories := NLPcategories classify: text. 7 | sentences := NLPtagger sentences: text. 
8 | sentenceScores := Array new: sentences size. 9 | 1 to: sentences size do: [ :i | 10 | sentenceScores at: i put: 0. 11 | tokens := sentences at: i. 12 | Transcript 13 | show: (sentences at: i); 14 | cr. 15 | scoredCategories 16 | do: [ :sc | 17 | hash := (Smalltalk at: #NlpCategoryHash) at: (sc at: 2). 18 | tokens 19 | do: [ :token | 20 | (hash includesKey: token) 21 | ifTrue: [ x := hash at: token. 22 | sentenceScores at: i put: (sentenceScores at: i) + (sc at: 1) ] ] ] ]. 23 | bestIndices := sentenceScores 24 | collectWithIndex: [ :score :i | 25 | {score. 26 | i} ]. 27 | Transcript 28 | show: 'sentence scoring: '; 29 | show: bestIndices; 30 | cr. 31 | bestIndices := bestIndices select: [ :p | (p at: 1) > 2 ]. 32 | ^ bestIndices collect: [ :p | Character space join: (sentences at: (p at: 2)) ] -------------------------------------------------------------------------------- /KBSnlp.package/NLPsummarizer.class/properties.json: -------------------------------------------------------------------------------- 1 | { 2 | "category" : "KBSnlp", 3 | "classinstvars" : [ 4 | ], 5 | "classvars" : [ 6 | ], 7 | "commentStamp" : "MarkWatson 5/19/2017 06:26", 8 | "instvars" : [ 9 | ], 10 | "name" : "NLPsummarizer", 11 | "pools" : [ 12 | ], 13 | "super" : "Object", 14 | "type" : "normal" } 15 | -------------------------------------------------------------------------------- /KBSnlp.package/NLPtagger.class/README.md: -------------------------------------------------------------------------------- 1 | NLP tagger converted to Squeak. 2 | A class that implements an NLP tagger. 3 | 4 | Copyright 2005-2017 Mark Watson. All rights reserved. Licensed for use under the MIT license with attribution required. 
5 | 6 | See: https://github.com/mark-watson/nlp_smalltalk 7 | -------------------------------------------------------------------------------- /KBSnlp.package/NLPtagger.class/class/initializeLexicon.st: -------------------------------------------------------------------------------- 1 | tagging 2 | initializeLexicon 3 | "Read data/lexicon.txt and build in memory lexicon" 4 | 5 | | read count strm aLine word taglist token lex | 6 | lex := Dictionary new. 7 | read := (FileStream fileNamed: './nlp_smalltalk/lexicon.txt') readOnly. 8 | count := 0. 9 | [ read atEnd ] 10 | whileFalse: [ count := count + 1. 11 | aLine := read upTo: Character lf. "Mac: use lf, Windows: use cr ???" 12 | strm := ReadStream on: aLine. 13 | word := strm upTo: Character space. 14 | taglist := OrderedCollection new. 15 | [ strm atEnd ] 16 | whileFalse: [ token := strm upTo: Character space. 17 | taglist add: token ]. 18 | "Transcript show: word; cr." 19 | "Transcript show: taglist printString; cr." 20 | lex at: word put: taglist ]. 21 | read close. 22 | Smalltalk at: #NLPlexicon put: lex -------------------------------------------------------------------------------- /KBSnlp.package/NLPtagger.class/class/pptag..st: -------------------------------------------------------------------------------- 1 | tagging 2 | pptag: wordString 3 | "returns a string of word/tag ..." 4 | 5 | | words tags write size count | 6 | words := NLPtagger tokenize: wordString. 7 | tags := NLPtagger tag: words. 8 | write := TextStream on: String new. 9 | size := words size. 10 | count := 1. 11 | [count <= size] 12 | whileTrue: [ 13 | write nextPutAll: (words at: count). 14 | write nextPutAll: '/'. 15 | write nextPutAll: (tags at: count). 16 | write nextPutAll: ' '. 17 | count := count + 1]. 
18 | ^write contents string -------------------------------------------------------------------------------- /KBSnlp.package/NLPtagger.class/class/sentences..st: -------------------------------------------------------------------------------- 1 | segmentation 2 | sentences: data 3 | "Handle either a string or array of tokens. 4 | Limitations: 5 | 1. This code does not currently handle special characters like — 6 | 2. Periods in numbers: only check previous character, not the 7 | next so a sentence ending with e.g., 2. will not be handled correctly. 8 | " 9 | 10 | | tokens lastToken currentSentence allSentences token | 11 | tokens := (data isMemberOf: ByteString) 12 | ifTrue: (NLPtagger tokenize: data) 13 | ifFalse: data. 14 | currentSentence := OrderedCollection new. 15 | allSentences := OrderedCollection new. 16 | lastToken := ''. 17 | tokens 18 | do: [ :token1 | 19 | ((Smalltalk at: #NLPtokensWithPeriods) 20 | includes: token1) 21 | ifTrue: [ token := (Smalltalk 22 | at: #NLPtokensWithPeriods) 23 | get: token1 ] 24 | ifFalse: [ token := token1 ]. 25 | Transcript 26 | show: token; 27 | cr. 28 | currentSentence add: token. 29 | ((token = '.' 30 | and: lastToken isAllDigits not) 31 | or: token = '?') 32 | ifTrue: [ allSentences addLast: currentSentence. 33 | currentSentence := OrderedCollection 34 | new ]. 35 | lastToken := token ]. 36 | currentSentence isNotEmpty 37 | ifTrue: [ allSentences addLast: currentSentence ]. 38 | ^ allSentences -------------------------------------------------------------------------------- /KBSnlp.package/NLPtagger.class/class/tag..st: -------------------------------------------------------------------------------- 1 | tagging 2 | tag: words 3 | "tag an ordered collection of words, returning an ordered collection of corresponding tags" 4 | 5 | | lex tags tag count i word lastWord lastTag | 6 | tags := OrderedCollection new. 7 | lex := Smalltalk at: #NLPlexicon. 8 | words do: 9 | [:aWord | 10 | tag := lex at: aWord ifAbsent: [nil]. 
11 | tag isNil ifFalse: [tag := tag at: 1] ifTrue: [tag := 'NN']. " the default tag " 12 | tags add: tag]. 13 | " Apply transformation rules: " 14 | lastWord := ''. 15 | lastTag := ''. 16 | i := 0. 17 | count := words size. 18 | [i < count] whileTrue: 19 | [i := i + 1. 20 | word := words at: i. 21 | tag := tags at: i. " reuse tag variable " 22 | " First, handle all rules for i > 1 " 23 | i > 1 24 | ifTrue: 25 | [" rule 1: DT, {VBD | VBP} --> DT, NN " 26 | 27 | lastTag = 'DT' & (tag = 'VBD' | (tag = 'VBP') | (tag = 'VB')) 28 | ifTrue: [tags at: i put: 'NN']. 29 | tag size > 1 30 | ifTrue: 31 | [" rule 6: convert a noun to a verb if the preceeding work is 'would' " 32 | (tag at: 1) = $N & ((tag at: 2) = $N) & (lastWord asLowercase = 'would') 33 | ifTrue: [tags at: i put: 'VB']]]. 34 | " Now, handle the remaining rules that are valid for i = 1: " 35 | " rule 2: convert a noun to a number (CD) if '.' appears in the word" 36 | (word findString: '.') > 0 37 | ifTrue: [(tag at: 1) = $N ifTrue: [tags at: i put: 'CD']]. " not working - tokenizer tosses '.' characters " 38 | " rule 3: convert a noun to a past participle if words[i] ends with 'ed' " 39 | (tag at: 1) = $N & (word endsWith: 'ed') ifTrue: [tags at: i put: 'VBN']. 40 | " rule 4: convert any type to adverb if it ends in 'ly' " 41 | (word endsWith: 'ly') ifTrue: [tags at: i put: 'RB']. 42 | " rule 5: convert a common noun (NN or NNS) to a adjective if it ends with 'al' " 43 | (tag at: 1) = $N & (word endsWith: 'al') ifTrue: [tags at: i put: 'JJ']. 44 | " rule 7: if a word has been categorized as a common noun and it ends with 's;, " 45 | " then set its type to plural common noun (NNS) " 46 | tag = 'NN' & (word endsWith: 's') ifTrue: [tags at: i put: 'NNS']. 47 | " rule 8: convert a common noun to a present prticiple verb (i.e., a gerand) " 48 | (tag at: 1) = $N & (word endsWith: 'ing') ifTrue: [tags at: i put: 'VBG']. 49 | lastWord := word. 50 | lastTag := tag]. 
51 | ^tags -------------------------------------------------------------------------------- /KBSnlp.package/NLPtagger.class/class/tokenize..st: -------------------------------------------------------------------------------- 1 | tokenization 2 | tokenize: wordsInAString 3 | "tokenizes a string" 4 | 5 | ^ wordsInAString 6 | findTokens: 7 | ' ;:.,<>[]{}! 8 | @#$%^&*()?' 9 | keep: ';:.,<>[]{}!$?' " keep CR in this string!! " -------------------------------------------------------------------------------- /KBSnlp.package/NLPtagger.class/properties.json: -------------------------------------------------------------------------------- 1 | { 2 | "category" : "KBSnlp", 3 | "classinstvars" : [ 4 | ], 5 | "classvars" : [ 6 | "NLPlexicon" ], 7 | "commentStamp" : "MarkWatson 5/19/2017 06:27", 8 | "instvars" : [ 9 | ], 10 | "name" : "NLPtagger", 11 | "pools" : [ 12 | ], 13 | "super" : "Object", 14 | "type" : "normal" } 15 | -------------------------------------------------------------------------------- /KBSnlp.package/monticello.meta/categories.st: -------------------------------------------------------------------------------- 1 | SystemOrganization addCategory: #KBSnlp! 
2 | -------------------------------------------------------------------------------- /KBSnlp.package/monticello.meta/initializers.st: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mark-watson/nlp_smalltalk/3a6c09aed17bed08f0ee3074ac0f1e578881ab87/KBSnlp.package/monticello.meta/initializers.st -------------------------------------------------------------------------------- /KBSnlp.package/monticello.meta/package: -------------------------------------------------------------------------------- 1 | (name 'KBSnlp') -------------------------------------------------------------------------------- /KBSnlp.package/properties.json: -------------------------------------------------------------------------------- 1 | { 2 | } 3 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright 2004-2017 Mark Watson. All Rights Reserved. 2 | 3 | This software may be used under the conditions of attribution of authorship and the MIT license. 4 | 5 | MIT License 6 | Copyright (c) 2004-2017 Mark Watson, corporate, and institutional contributors who have collectively contributed elements to this software ("The Pharo and Squeak Communities"). All rights reserved. 7 | 8 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
11 | 12 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Natural Language Processing Library for Pharo Smalltalk 2 | 3 | Copyright 2005 to 2021 by Mark Watson 4 | 5 | License: MIT 6 | 7 | Note: the most frequent updates to this Pharo Smalltalk package will appear on the [github repo for this project](https://github.com/mark-watson/nlp_smalltalk). 8 | 9 | Note 2: on 4/25/2021 I converted this project to use the IceBerg github support for Pharo Smalltalk. All source code and data have been moved to the subdirectory **src**. 10 | 11 | IceBerg/github documentation: [https://books.pharo.org/booklet-ManageCode/pdf/2019-03-24-ManageCode.pdf](https://books.pharo.org/booklet-ManageCode/pdf/2019-03-24-ManageCode.pdf) 12 | 13 | Add this repository using the IcewBerg Browser. 14 | 15 | ## Setup to be done one time after loading the code via IceBerg 16 | 17 | 18 | ### Part Of Speech Tagging 19 | 20 | Open a File Browser and fileIn the KBSnlp.st source file. Open a Class Browser 21 | and and look at the code in the KBnlp class. 22 | 23 | Open a Workspace and one time only evaluate: 24 | 25 | NLPtagger initializeLexicon 26 | 27 | Try tagging a sentence to make sure the data was read from disk correctly: 28 | 29 | NLPtagger pptag: 'The dog ran down the street' 30 | 31 | If this does not work then probably the directory nlp_smalltalk is not in the default directory. 
The code containing the file path is: 32 | 33 | read := (FileStream fileNamed: './nlp_smalltalk/lexicon.txt') readOnly. 34 | 35 | ### Categorization 36 | 37 | I am using NeoJSON to parse the category word count data so make sure NeoJSON is installed. NeoJSON can be installed using: 38 | 39 | Gofer it 40 | smalltalkhubUser: 'SvenVanCaekenberghe' project: 'Neo'; 41 | configurationOf: 'NeoJSON'; 42 | loadStable. 43 | 44 | One time initialization: 45 | 46 | NLPcategories initializeCategoryHash 47 | 48 | Try it: 49 | 50 | NLPcategories classify: 'The economy is bad and taxes are too high.' 51 | 52 | ### Entity Recognition 53 | 54 | Implemented for products, companies, places, and people's names. 55 | 56 | One time initialization: 57 | 58 | NLPentities initializeEntities 59 | 60 | Example: 61 | 62 | NLPentities entities: 'The Coca Cola factory is in London' 63 | 64 | --> a Dictionary('companies'->a Set('Coca Cola') 'places'->a Set('London') 'products'->a Set('Coca Cola') ) 65 | 66 | NLPentities humanNameHelper: 'John Alex Smith and Andy Jones went to the store.' 67 | 68 | --> a Set('John Alex Smith' 'Andy Jones') 69 | 70 | ### Sentence Segmentation 71 | 72 | One time initialization: 73 | 74 | NLPsentences loadData 75 | 76 | NLPsentences sentences: 'Today Mr. Jones went to town. He bought gas.' 77 | 78 | --> an OrderedCollection(an OrderedCollection('Today' 'Mr.' 'Jones' 'went' 'to' 'town' '.') an OrderedCollection('He' 'bought' 'gas' '.')) 79 | 80 | ### Summarization 81 | 82 | No additional data needs to be loaded for summarization, but all other data should be loaded as-per the above directions. Here is a short example: 83 | 84 | NLPsummarizer summarize: 'The administration and House Republicans have asked a federal appeals court for a 90-day extension in a case that involves federal payments to reduce deductibles and copayments for people with modest incomes who buy their own policies. 
The fate of $7 billion in "cost-sharing subsidies" remains under a cloud as insurers finalize their premium requests for next year. Experts say premiums could jump about 20 percent without the funding. In requesting the extension, lawyers for the Trump administration and the House said the parties are continuing to work on measures, including potential legislative action, to resolve the issue. Requests for extensions are usually granted routinely.' 85 | 86 | --> #('The administration and House Republicans have asked a federal appeals court for a 90-day extension in a case that involves federal payments to reduce deductibles and copayments for people with modest incomes who buy their own policies .' 'The fate of $ 7 billion in "cost-sharing subsidies" remains under a cloud as insurers finalize their premium requests for next year .' 'In requesting the extension , lawyers for the Trump administration and the House said the parties are continuing to work on measures , including potential legislative action , to resolve the issue .') 87 | 88 | ## Limitations 89 | 90 | - Does not currently handle special characters like: — 91 | - Categorization and summarization should also use "bag of ngrams" in addition to "bag of words" (BOW) 92 | -------------------------------------------------------------------------------- /company_names.txt: -------------------------------------------------------------------------------- 1 | IBM 2 | Twitter 3 | Facebook 4 | Motorola 5 | Ford 6 | Panasonic 7 | General Motors 8 | Casio 9 | GM 10 | Google 11 | Microsoft 12 | 3M 13 | Adobe 14 | AES 15 | Aetna 16 | AFLAC 17 | Agilent 18 | Akamai 19 | Alcoa 20 | Allegheny 21 | Allstate 22 | Altera 23 | Amazon 24 | American Express 25 | Analog Devices 26 | Apple 27 | AT&T 28 | Autodesk 29 | Avon 30 | Bank of America 31 | Best Buy 32 | Boeing 33 | Boston Scientific 34 | Bristol-Myers Squibb 35 | Broadcom 36 | Campbell Soup 37 | Chevron 38 | CIGNA 39 | Cisco 40 | Citigroup 41 | Citrix 42 | Clorox 43 | 
Coca Cola 44 | Colgate-Palmolive 45 | Comcast 46 | ConocoPhillips 47 | Corning 48 | Costco 49 | Dell 50 | DeVry 51 | DIRECTV 52 | Dow Chemical 53 | Du Pont 54 | eBay 55 | Exxon Mobil 56 | FedEx 57 | Ford Motor 58 | GameStop 59 | Gannett 60 | General Electric 61 | General Mills 62 | Goldman Sachs 63 | Goodyear 64 | Halliburton 65 | Hasbro 66 | Heinz 67 | Hewlett-Packard 68 | Honeywell 69 | Hormel 70 | Humana 71 | Intel 72 | Intuit 73 | Kellogg 74 | Kimco 75 | Kohl 76 | Kraft 77 | Marriott 78 | Mastercard 79 | Mattel 80 | McAfee 81 | McGraw-Hill 82 | Merck 83 | MetLife 84 | Microsoft 85 | Monsanto 86 | Morgan Stanley 87 | NIKE 88 | Nike 89 | Novell 90 | Nvidia 91 | Office Depot 92 | Oracle 93 | Philip Morris 94 | Procter & Gamble 95 | Prudential 96 | QLogic 97 | QUALCOMM 98 | Qualcomm 99 | Quest 100 | Raytheon 101 | Red Hat 102 | Rockwell 103 | Safeway 104 | Salesforce 105 | SanDisk 106 | Schlumberger 107 | Sears 108 | Sempra Energy 109 | Southwest Airlines 110 | Starbucks 111 | Sun Microsystems 112 | Time Warner 113 | Verisign 114 | Volkswagen 115 | Wal-Mart 116 | Walgreen 117 | WellPoint 118 | Wells Fargo 119 | Winnebago 120 | Xerox 121 | Yahoo 122 | New York Times 123 | Oxford University Press 124 | Cambridge University Press 125 | Washington Post 126 | Harvard University Press 127 | BBC Radio 128 | American Broadcasting Company 129 | Walt Disney 130 | Princeton University Press 131 | Columbia University Press 132 | Associated Press 133 | Yale University Press 134 | Los Angeles Times 135 | MIT Press 136 | Warner Bros. 137 | General Motors 138 | Daily Telegraph 139 | Clarendon Press 140 | General Electric 141 | Walt Disney Company 142 | Paramount Pictures 143 | Ford Motor Company 144 | Wall Street Journal 145 | USA Today 146 | British East India Company 147 | Apple Inc. 
148 | Greenwood Press 149 | Sun Microsystems 150 | Entertainment Weekly 151 | Apple Computer 152 | Cornell University Press 153 | Johns Hopkins University Press 154 | Chicago Tribune 155 | Dutch East India Company 156 | Indiana University Press 157 | Stanford University Press 158 | Blackwell Publishing 159 | Boston Globe 160 | San Francisco Chronicle 161 | Fox Broadcasting Company 162 | Development Bank 163 | St. Martin's Press 164 | Time Warner 165 | Texas Instruments 166 | International Herald Tribune 167 | John Wiley & Sons 168 | National Public Radio 169 | Chicago Sun-Times 170 | Hudson's Bay Company 171 | Norfolk Southern 172 | Southern California 173 | CRC Press 174 | Best Music 175 | Houghton Mifflin Company 176 | Tuttle Publishing 177 | Digital Equipment Corporation 178 | American Airlines 179 | International Business Machines 180 | East India Company 181 | New York Stock Exchange 182 | United Airlines 183 | Sunday Times 184 | Los Alamos National Laboratory 185 | Sydney Morning Herald 186 | Da Capo Press 187 | University Press 188 | McDonnell Douglas 189 | Le Monde 190 | Warner Bros 191 | New York Review 192 | Union Pacific 193 | Bell Labs 194 | American general 195 | International Bank 196 | Bank of England 197 | Westview Press 198 | Lockheed Martin 199 | British Airways 200 | Greenwood Publishing Group 201 | Febrero co 202 | Electronic Arts 203 | London Stock Exchange 204 | Digital Equipment 205 | ISBN Princeton University Press 206 | Duke University Press 207 | Orion Publishing Group 208 | Canadian Broadcasting Corporation 209 | Australian Broadcasting Corporation 210 | General Dynamics 211 | Manchester University Press 212 | SUNY Press 213 | John Wiley and Sons 214 | Southern Africa 215 | Rutgers University Press 216 | Nike Inc. 217 | Jerusalem Post 218 | Delta Air Lines 219 | Washington Times 220 | Academic Press 221 | CBS News 222 | Scarecrow Press 223 | Random House Inc. 
224 | Viking Press 225 | Edinburgh University Press 226 | Siemens AG 227 | Barnes & Noble 228 | W. Norton & Company 229 | LA Times 230 | Daily News 231 | New York Post 232 | Southern United States 233 | Silicon Graphics 234 | New York Daily News 235 | County Commissioners 236 | Popular Music 237 | New York University Press 238 | Canadian Pacific Railway 239 | Super Mario Bros. 240 | Japan Times 241 | Stars and Stripes 242 | Free Press 243 | Seattle Times 244 | Omnibus Press 245 | North Fork 246 | Procter & Gamble 247 | Pan American World Airways 248 | BBC Television 249 | Toronto Star 250 | Christian Science Monitor 251 | Houston Chronicle 252 | Bank of America 253 | Adobe Systems 254 | Oracle Corporation 255 | Osprey Publishing 256 | BAE Systems 257 | NBC Universal 258 | Dutch West India Company 259 | Lawrence Livermore National Laboratory 260 | John Wiley & Sons Inc. 261 | New England Journal 262 | Kessinger Publishing 263 | Royal Shakespeare Company 264 | Evening Standard 265 | Globe and Mail 266 | Microsoft Corporation 267 | CBC Television 268 | Red Crescent International Labour International Monetary Fund 269 | Burger King 270 | Bell Laboratories 271 | Northwestern University Press 272 | Data General 273 | Cisco Systems 274 | New York Times Book Review 275 | General Electric Company 276 | Naval Institute Press 277 | Heavy Metal 278 | TSR Inc. 279 | Temple University Press 280 | Robert Appleton Company 281 | Eastman Kodak 282 | Dover Publications Inc. 283 | Jet Propulsion Laboratory 284 | Guardian Unlimited 285 | Super Smash Bros 286 | International Finance Corporation 287 | Comic Book Resources 288 | Irish Times 289 | Pergamon Press 290 | W.W. Norton & Company 291 | News Corporation 292 | Natural Resources 293 | World Scientific 294 | National Trust 295 | Judaica Press 296 | Intellectual Property World Meteorological World Tourism World Trade 297 | Paulist Press 298 | Square Co. 
299 | Continental Airlines 300 | Air Canada 301 | CBC Radio 302 | Northrop Grumman 303 | Warner Music Group 304 | America Online 305 | State Street 306 | Belknap Press 307 | Hewlett Packard 308 | Harper's Weekly 309 | Oak Ridge National Laboratory 310 | Continuum International Publishing Group 311 | Northwest Airlines 312 | Coca-Cola Company 313 | Southwest Airlines 314 | Boydell Press 315 | New York Herald 316 | Control Data Corporation 317 | Palm Beach 318 | Garland Publishing 319 | Wells Fargo 320 | Second Bank of the United States 321 | Prentice-Hall Inc. 322 | Advanced Micro Devices 323 | Western Union 324 | Wesleyan University Press 325 | Miami Herald 326 | Nuclear Suppliers Group 327 | RAND Corporation 328 | Daimler AG 329 | London Gazette 330 | American Express 331 | LA Weekly 332 | Trans World Airlines 333 | Country Music 334 | Philadelphia Inquirer 335 | British Broadcasting Corporation 336 | Monsanto Company 337 | US Airways 338 | World Bank Group 339 | Chosen Freeholders 340 | Bell Telephone Laboratories 341 | American Standard 342 | InterVarsity Press 343 | Pluto Press 344 | Macmillan Company 345 | North West Company 346 | New York Tribune 347 | Granada Television 348 | British Telecom 349 | National Semiconductor 350 | Baltimore Sun 351 | Time Inc. 
352 | National Physical Laboratory 353 | Publishers Weekly 354 | Worshipful Company 355 | BBC Wales 356 | Thames Television 357 | Fairchild Semiconductor 358 | British Museum Press 359 | General Mills 360 | Black Hills 361 | PC World 362 | Sony Music 363 | United Parcel Service 364 | Abingdon Press 365 | Electronic Music 366 | Robert Bosch GmbH 367 | Coca Cola 368 | Paris Review 369 | CBS Radio 370 | Burroughs Corporation 371 | Nintendo of America 372 | London Review 373 | Roxy Music 374 | New Zealand Herald 375 | Human Rights 376 | Chrysler Corporation 377 | Johnson & Johnson 378 | Argonne National Laboratory 379 | Polity Press 380 | Syracuse University Press 381 | Arcadia Publishing 382 | Bank of Sweden Prize 383 | Wayne State University Press 384 | South End Press 385 | NCR Corporation 386 | World Wide Fund 387 | Penguin Group 388 | Smithsonian Institution Press 389 | Gale Group 390 | IARC Group 391 | Review Award 392 | Goldman Sachs 393 | Singapore Airlines 394 | Times of India 395 | Remington Rand 396 | Dish Network 397 | Atlantic Slave Trade 398 | Intel Corporation 399 | Inter-American Development Bank 400 | Hitachi Ltd. 401 | Deere & Company 402 | Angiosperm Phylogeny Group 403 | Kraft Foods 404 | Westminster John Knox Press 405 | Deutsche Bank 406 | Warner Brothers 407 | St Martin's Press 408 | St. Petersburg Times 409 | Lehman Brothers 410 | Shell Oil Company 411 | NYU Press 412 | Hackett Publishing 413 | Caves Books Ltd. 
414 | Universum Film AG 415 | Electric Company 416 | Eastern Air Lines 417 | Penguin Press 418 | Morgan Stanley 419 | American Music 420 | Canadian National Railway 421 | American General 422 | Corriere della Sera 423 | DC Comics 424 | IBM Personal Computer 425 | El Paso 426 | XM Satellite Radio 427 | Aurum Press 428 | Douglas Aircraft 429 | London Corporation 430 | Southern Europe 431 | News of the World 432 | Volkswagen Group 433 | El Mundo 434 | Readers Digest 435 | Pennsylvania State University Press 436 | Louisiana State University Press 437 | Lawrence Berkeley National Laboratory 438 | Vega Science Trust 439 | Sirius Satellite Radio 440 | New York Journal 441 | Atari Inc. 442 | Merrill Lynch 443 | Universal Time 444 | U.S. Steel 445 | Apple Computer Inc. 446 | Japan Airlines 447 | Detroit Free Press 448 | Nazi Germany 449 | New York Herald Tribune 450 | Netscape Communications Corporation 451 | Turner Network Television 452 | Harry N. Abrams Inc. 453 | Eli Lilly and Company 454 | Financial Times 455 | Eli Lilly 456 | Safeway Inc. 457 | BBC America 458 | Boston Herald 459 | Home Depot 460 | MCI Inc. 461 | Ernst & Young 462 | San Jose Mercury News 463 | Russell Group 464 | Bank of the United States 465 | Rio Group 466 | Caribbean Development Bank 467 | Frankfurter Allgemeine 468 | Melbourne University Press 469 | Liturgical Press 470 | NBC Radio 471 | Sinauer Associates 472 | Brown and Co. 473 | BT Group 474 | Honourable East India Company 475 | Citadel Press 476 | Modern Music 477 | Rockwell International 478 | Daily Herald 479 | Cable & Wireless 480 | Plenum Press 481 | Electronic Data Systems 482 | Justice League Unlimited 483 | Westinghouse Electric Corporation 484 | Denver Post 485 | Brookhaven National Laboratory 486 | Rio Tinto 487 | J.P. 
Morgan 488 | SAP AG 489 | Beacon Press 490 | Bank of Sweden 491 | Daily Star 492 | Folk Music 493 | Best Buy 494 | Turner Broadcasting 495 | World Music 496 | CBS Corporation 497 | New Music 498 | Humanities Press 499 | Liverpool University Press 500 | Crescent International Hydrographic International Labour International Monetary Fund 501 | Tr�bner & Co. 502 | Continental Europe 503 | London Weekend Television 504 | North Point Press 505 | National Post 506 | British Aircraft Corporation 507 | Pearson Education Inc. 508 | Early Music 509 | Boeing Integrated Defense Systems 510 | Curzon Press 511 | Metro Goldwyn Mayer 512 | Union Carbide 513 | Kyle Cathie Limited 514 | Southern Italy 515 | Broadview Press 516 | NATO's Partnership 517 | World Bank 518 | Hal Leonard Corporation 519 | Virgin Media 520 | NSU Motorenwerke AG 521 | Sutton Publishing 522 | Standard Oil Company 523 | Belfast Telegraph 524 | Weinstein Company 525 | Bilderberg Group 526 | French East India Company 527 | General Motors Corporation 528 | JHU Press 529 | Houghton Mifflin Co. 530 | Ten Speed Press 531 | Air France 532 | Alaska Airlines 533 | Virgin Atlantic Airways 534 | Warner Communications 535 | Knights of Columbus 536 | International Trade 537 | Martin Marietta 538 | Super Mario Bros 539 | Thales Group 540 | Merck & Co. 541 | Sandia National Laboratories 542 | Dexter's Laboratory 543 | Yorkshire Television 544 | Taipei Times 545 | Victor Talking Machine Company 546 | University Press of America 547 | Palm Inc. 548 | Southern Ontario 549 | H. J. 
Heinz Company 550 | Analog Devices 551 | Fox Network 552 | ABC Radio 553 | Bank of Scotland 554 | British South Africa Company 555 | National Academy Press 556 | Rio Tinto Group 557 | Pan American Airways 558 | Marconi Company 559 | National Bank 560 | Southern Rhodesia 561 | English East India Company 562 | Bristol Aeroplane Company 563 | Sperry Corporation 564 | Barclays Bank 565 | Raven Software 566 | AMS Press 567 | Harcourt Brace Jovanovich 568 | Mercer University Press 569 | Grove Press 570 | Carl Zeiss AG 571 | Ralph Lauren 572 | Encyclop�dia Britannica Inc. 573 | Abbeville Press 574 | Lucent Technologies 575 | AOL Time Warner 576 | Die Welt 577 | Father and Son 578 | Chicago University Press 579 | Monsters Inc. 580 | American Broadcasting 581 | Texas A&M University Press 582 | Exxon Mobil 583 | IEEE Press 584 | Mario Bros 585 | UBS AG 586 | Lawrence Erlbaum Associates 587 | National Broadcasting Company 588 | Slave Trade 589 | Rolls-Royce plc 590 | David Steel 591 | D. Van Nostrand Company 592 | 20th Century Fox 593 | South Vietnam 594 | Earth Metrics Inc. 595 | Edwin Mellen Press 596 | Atlanta Journal 597 | McGraw-Hill Book Company 598 | Catalan Company 599 | Clear Channel 600 | MIT Lincoln Laboratory 601 | Walt Disney Parks and Resorts 602 | Rotten Tomatoes 603 | Human Resources 604 | Carolina Academic Press 605 | News Corp. 606 | Kent State University Press 607 | Open University Press 608 | Hearst Corporation 609 | NZ Herald 610 | Computer Sciences 611 | Hogarth Press 612 | Cathay Pacific 613 | B. Eerdmans Publishing 614 | Naval Research Laboratory 615 | Canadian Press 616 | Coachella Valley Music 617 | W.W. Norton & Co. 
618 | Standard & Poor's 619 | Boeing Company 620 | McFarland & Company 621 | Bank of China 622 | Iberia Airlines 623 | Sky Television plc 624 | ACM Press 625 | France Telecom 626 | La Repubblica 627 | Thievery Corporation 628 | Central Bank 629 | Royal Asiatic Translation Fund 630 | Cray Research 631 | Johns Hopkins Press 632 | Gulf Oil 633 | Miramax Films 634 | Sveriges Riksbank 635 | United States Steel Corporation 636 | John Hopkins University Press 637 | Macquarie Library Pty Ltd 638 | Eckert-Mauchly Computer Corporation 639 | Irish Independent 640 | Banque de France 641 | Nissan Motors 642 | Southern France 643 | Coors Brewing Company 644 | J.C. Penney 645 | Gladstone Publishing 646 | Super Mario USA 647 | Altria Group 648 | Norfolk Wildlife Trust 649 | Freedom Press 650 | J. P. Morgan 651 | du Pont 652 | Haworth Press 653 | General Dynamics Corporation 654 | Kia Motors 655 | Princeton Architectural Press 656 | Iron Mountain 657 | ING Group 658 | Guilford Press 659 | Asiana Airlines 660 | Sky News 661 | Business Wire 662 | United Press International 663 | Northeastern University Press 664 | Sheffield Academic Press 665 | Canon Inc. 666 | McGill-Queen's University Press 667 | Pacific Northwest National Laboratory 668 | Arno Press 669 | Minute Maid 670 | Film Music 671 | China Airlines 672 | Total S.A. 673 | Commonwealth Bank 674 | Dow Jones 675 | Western Digital 676 | MTV Europe 677 | Portia Group 678 | Three Rivers Press 679 | Bibliographic Resources 680 | Milton Bradley Company 681 | Washington Mutual 682 | Avon Products 683 | Norsk Hydro 684 | Dow Jones & Company 685 | Ty Inc. 686 | American Fur Company 687 | Arcade Publishing 688 | Levi Strauss 689 | Continental AG 690 | South African Airways 691 | ADV Films 692 | Bloomsbury Group 693 | Tribune Company 694 | Hilton Hotels 695 | Imperial Airways 696 | Gramophone Company 697 | Hudson Ltd. 698 | Honourable Artillery Company 699 | Berkshire Hathaway 700 | Imperial Oil 701 | Deutsche Telekom 702 | Atari Corp. 
703 | Apogee Software 704 | Phaidon Press 705 | Historical Capital 706 | Addison-Wesley Publishing 707 | Steinway & Sons 708 | All Nippon Airways 709 | Free Software 710 | W. Norton & Co. 711 | Fox Film Corporation 712 | Milwaukee Journal Sentinel 713 | Generations Network Inc. 714 | Thames Bank 715 | Arizona Republic 716 | Philip Morris USA 717 | World Trade 718 | Eastern Daily Press 719 | Left Bank 720 | Open Source Software 721 | Sea Venture 722 | Development Corporation 723 | Green and Co. 724 | Liberty Media 725 | Fox network 726 | Ashgate Publishing Ltd. 727 | Cavendish Laboratory 728 | Fannie Mae 729 | William B. Eerdmans Publishing Company 730 | South Sea Company 731 | MCI Communications 732 | British Sky Broadcasting 733 | Red Bank 734 | Hartford Courant 735 | Vivendi Universal 736 | Macmillan Publishing 737 | Sony Pictures Television 738 | Bausch & Lomb 739 | Thomson SA 740 | BHP Billiton 741 | Abercrombie & Fitch 742 | Glenn L. Martin Company 743 | Super Smash Bros. 744 | Canberra Times 745 | Philip Morris 746 | Sveriges Television 747 | MOS Technology Inc. 748 | Notre Dame Press 749 | Orange SA 750 | St. Martin�s Press 751 | United U.S. 752 | Xinhua News Agency 753 | W.W. Norton & Company Inc. 754 | Three's Company 755 | Staples Inc. 756 | Wal-Mart stores 757 | Bank of Montreal 758 | Cairns Group 759 | Freddie Mac 760 | Dallas Morning News 761 | American Motors Corporation 762 | Dell Inc. 763 | St. James Press 764 | Puma AG 765 | Yahoo! 
Music 766 | SIAM Journal 767 | Information Systems 768 | National High Magnetic Field Laboratory 769 | Los Alamos Scientific Laboratory 770 | Hershey Company 771 | National Academies Press 772 | Fortress Press 773 | Working Group 774 | Second City Television 775 | McGill-Queen's Press 776 | Wang Laboratories 777 | Arab Monetary Fund 778 | Southern Asia 779 | Thorndike Press 780 | Christian Dior 781 | Review Awards 782 | Penn State Press 783 | Artificial Intelligence Laboratory 784 | First Bank of the United States 785 | Water Resources 786 | Boston Review 787 | Scientific Data Systems 788 | British European Airways 789 | Object Management Group 790 | Chicago Daily Tribune 791 | Hot Dance Music 792 | UPS Airlines 793 | Pearson PLC 794 | American Motors 795 | Southern India 796 | Hudson Bay Company 797 | Subterranean Press 798 | British Heavy Metal 799 | Who Do You Trust 800 | New Super Mario Bros 801 | Oxford Clarendon Press 802 | Phoenix Press 803 | ION Television 804 | Montreal Gazette 805 | Bank of Canada 806 | Birmingham Small Arms Company 807 | Investment Bank 808 | Canterbury University Press 809 | Abbott Laboratories 810 | Leamington Spa 811 | BT Group plc 812 | Parker Brothers 813 | Trafford Publishing 814 | Bank of France 815 | Control Data 816 | Leicester University Press 817 | Du Pont 818 | Traditional Music 819 | Bell Atlantic 820 | MAN AG 821 | Tandy Corporation 822 | Southern Illinois University Press 823 | Penguin Books Ltd. 824 | Moody Press 825 | Mainstream Publishing 826 | SCO Group 827 | Mercer Human Resource Consulting 828 | Le monde 829 | Flerov Laboratory 830 | Scania AB 831 | Sears Roebuck 832 | Classical Music 833 | Duke Energy 834 | Mediacorp Canada Inc. 835 | John Wiley & Songs Inc. 836 | St Vladimir's Seminary Press 837 | National Cash Register Company 838 | RIA Novosti 839 | Broadcast Music 840 | Rockwell Collins 841 | Hilton Hotels Corporation 842 | Birlinn Ltd. 843 | Silicon Graphics Inc. 
844 | Indian Head 845 | Danmarks Radio 846 | Doubleday & Company 847 | Somers Isles Company 848 | Foreign Trade 849 | International Fund 850 | World Wildlife Fund 851 | AT&T Bell Labs 852 | United Fruit Company 853 | Ohio University Press 854 | Volkswagen AG 855 | Knight Ridder 856 | Publishers Inc. 857 | Wiley & Sons 858 | United Technologies 859 | Steppenwolf Theatre Company 860 | Chicago Sun Times 861 | Ashgate Publishing 862 | Southern Maryland 863 | Westminster Press 864 | D'Oyly Carte Opera Company 865 | Adamant Media Corporation 866 | Wildside Press 867 | NASA's Jet Propulsion Laboratory 868 | New Testament Introduction 869 | Southern Sudan 870 | Dow Chemical Company 871 | CNN International 872 | Getty Oil 873 | Cycorp Inc. 874 | Seven Stories Press 875 | New Line Cinema 876 | Lulu Press 877 | MacMillan Company 878 | Gemstone Publishing 879 | Popular Press 880 | Encyclopaedia Britannica Inc. 881 | Delta Airlines 882 | Thunder's Mouth Press 883 | WarioWare Inc. 884 | Private Eye 885 | Sprint Nextel 886 | Two Chief World Systems 887 | Macintosh LC 888 | British Overseas Airways Corporation 889 | Mitsubishi Heavy Industries 890 | Bad Company 891 | Moog Music 892 | Particle Data Group 893 | Old Testament Introduction 894 | Fairleigh Dickinson University Press 895 | South Bank 896 | Parthenon Press 897 | William Kaufmann Inc. 898 | Joseph Henry Press 899 | Hudson Motor Car Company 900 | Hackett Publishing Company 901 | LG Electronics 902 | Tokyo Stock Exchange 903 | ITT Corporation 904 | International Crisis Group 905 | Inter Press Service 906 | Amadeus Press 907 | Management Information Systems 908 | Edward Jones 909 | PepsiCo Inc. 910 | Chicago Times 911 | Columbia Records 912 | Foster's Group 913 | St. 
Martins Press 914 | Paris Match 915 | Indian Music 916 | Gnome Press 917 | Southern Netherlands 918 | Natural Trust 919 | Helsinki Stock Exchange 920 | Contemporary Music 921 | BBC 6 Music 922 | Central European University Press 923 | Apollo Computer 924 | Atari Corporation 925 | Merck KGaA 926 | Bombardment Group 927 | Victor Gollancz Ltd 928 | Carlsberg Laboratory 929 | ATA Airlines 930 | Simon and Schuster Inc. 931 | Imperial College Press 932 | Holland Land Company 933 | Souvenir Press 934 | Mario Bros. 935 | Marks & Spencer 936 | Lucasfilm Ltd. 937 | ABC Television 938 | Bloomsbury Publishing 939 | Canada Limited 940 | Broadcasting Corporation 941 | Hyundai Motor Company 942 | Dornier GmbH 943 | Henry Holt & Company 944 | London Journal 945 | Lincoln Laboratory 946 | Shell Oil 947 | Rocky Mountain News 948 | United Technologies Corporation 949 | Evolution Publishing 950 | Space Systems 951 | Mail & Guardian 952 | Imperial Chemical Industries 953 | Southern England 954 | SVS Press 955 | Bank of Nova Scotia 956 | Borders Group 957 | Victoria's Secret 958 | SCM Press 959 | United Artists 960 | Park Place 961 | Levi Strauss & Co. 962 | Wellcome Trust 963 | Bank of Greece 964 | Greenwood Publishing 965 | Embedded Systems 966 | Chase Manhattan Bank 967 | National Express East Anglia 968 | National Educational Television 969 | National Petroleum 970 | English Electric Company 971 | Bomb Group 972 | Weekly Standard 973 | Ursa Major Moving Group 974 | Cygnus Solutions 975 | Cambridge Computer Laboratory 976 | Lockheed Corporation 977 | Hawker Siddeley Group 978 | Bantam Press 979 | Yum! Brands Inc. 
980 | Universal Pictures 981 | Georgetown University Press 982 | Overlook Press 983 | Southern Russia 984 | AT&T Bell Laboratories 985 | IDW Publishing 986 | Fuji Television 987 | Qatar Airways 988 | Calgary Herald 989 | Disney-ABC Television Group 990 | Dow Chemical 991 | Harvill Press 992 | Mongoose Publishing 993 | Dan River 994 | Xerox Corporation 995 | Review Award for Best 996 | BAA Limited 997 | Yamaha Motor 998 | Aon Corporation 999 | Buick Motor Company 1000 | Applied Materials 1001 | St. Paul Pioneer Press 1002 | FM Radio 1003 | Paragon Book Reprint Corp. 1004 | Brown & Co. 1005 | D�il �ireann 1006 | Macmillan Publishers Ltd. 1007 | Sinclair Research Ltd 1008 | Indian Airlines 1009 | Lyons Press 1010 | International Computers Limited 1011 | Eastman Kodak Company 1012 | Saudi Arabian Airlines 1013 | Sinauer Associates Inc. 1014 | Anglia Television 1015 | LAN Airlines 1016 | White Wolf Inc. 1017 | Intervarsity Press 1018 | Virtual Laboratory 1019 | John Wiley and Sons Inc. 1020 | Freescale Semiconductor 1021 | Praeger Press 1022 | BBC Worldwide 1023 | Samsung Electronics 1024 | Samsung 1025 | Schiffer Publishing 1026 | American Telephone & Telegraph 1027 | Heineken International 1028 | LOT Polish Airlines 1029 | General Instrument 1030 | Public Radio 1031 | Tiffany & Co. 1032 | Gran Turismo 1033 | Mitsubishi Motors 1034 | Whispering Eagle Press 1035 | Harvard Business School Press 1036 | American International Group 1037 | Lutterworth Press 1038 | Cooper Car Company 1039 | Dance Music 1040 | BBC Northern Ireland 1041 | Arab Bank 1042 | DK Publishing Inc. 1043 | Pop Music 1044 | St. Regis 1045 | Tate Publishing 1046 | JPMorgan Chase 1047 | Sveriges Radio 1048 | Africa World Press 1049 | Berkley Publishing Group 1050 | PalmSource Inc. 1051 | Investor AB 1052 | Biograph Company 1053 | Research In Motion 1054 | Kawasaki Heavy Industries 1055 | Raven Press 1056 | Universal Studios 1057 | Caterpillar Inc. 1058 | J. 
Wiley & Sons 1059 | Ahmad Sa 1060 | Addison-Wesley Publishing Company 1061 | Golden West 1062 | H&R Block 1063 | RTL Group 1064 | Straits Times 1065 | Handmade Films 1066 | Newport News Shipbuilding 1067 | National Westminster Bank 1068 | Delco Electronics 1069 | British Broadcasting Company 1070 | Computer Research Corporation 1071 | Ford Motor Co. 1072 | Barclays Banks 1073 | Ars Technica 1074 | NPR Music 1075 | Lotus Software 1076 | General Electric Company plc 1077 | Chicago Review Press 1078 | Ohio State University Press 1079 | Dogger Bank 1080 | Occidental Petroleum 1081 | Frontier Airlines 1082 | DuMont Television 1083 | DuMont 1084 | Star Tribune 1085 | Peter Lang Publishing 1086 | Lotus Development Corporation 1087 | Verizon Wireless 1088 | Verizon 1089 | Carnegie Corporation 1090 | UCL Press 1091 | Volvo AB 1092 | Macmillan and Co. 1093 | Northwestern United States 1094 | Cavendish Astrophysics Radio Astronomy Group 1095 | Protein Data Bank 1096 | Lincoln National 1097 | Routledge Press 1098 | Equipment Corporation 1099 | Max Factor 1100 | Jim Henson Company 1101 | Electric Boat Company 1102 | Swatch Group Ltd. 1103 | Bell Telephone Company 1104 | Tyson Foods 1105 | Virgin Group 1106 | Academic Press Inc. 1107 | Infinity Inc. 1108 | NRC Handelsblad 1109 | Tata Motors 1110 | Landor Associates 1111 | Paramount Television 1112 | Royal Dutch Shell 1113 | Frankfurt Stock Exchange 1114 | Night Music 1115 | Rupert Murdoch's News Corporation 1116 | Ferrero SpA 1117 | Fiji Times 1118 | Crown Publishers Inc. 1119 | Columbus Dispatch 1120 | Right Bank 1121 | McFarland & Co. 1122 | Atlantic Monthly Press 1123 | Parker Pen Company 1124 | MIT Laboratory 1125 | American Telephone & Telegraph Company 1126 | London Docklands Development Corporation 1127 | Tata Group 1128 | Friden Inc. 1129 | Cartier SA 1130 | Bechtel Corporation 1131 | South African Broadcasting Corporation 1132 | South Bank Show 1133 | West Publishing Co. 
1134 | Turkish Airlines 1135 | Humana Press 1136 | Paternoster Press 1137 | His Music 1138 | Magna International 1139 | Games Workshop 1140 | Henry Holt & Co. 1141 | Kitchen Sink Press 1142 | 3rd Ed. Worth Publishing 1143 | Winnipeg Free Press 1144 | Golden Gryphon Press 1145 | Collier's Weekly 1146 | die Welt 1147 | Long Range Desert Group 1148 | ATI Technologies 1149 | Capitol Records 1150 | Archer Daniels Midland 1151 | Hughes Aircraft Company 1152 | Sussex Academic Press 1153 | Sun Microsystems Inc. 1154 | Minneapolis Tribune 1155 | MCI WorldCom 1156 | Michigan State University Press 1157 | Trusted Computing Group 1158 | Chicago Tunnel Company 1159 | Kellogg Company 1160 | Acme Corporation 1161 | Western Music 1162 | Walker & Company 1163 | Greater Arab Free Trade 1164 | Hanson plc 1165 | Lindisfarne Press 1166 | Morgan Reynolds Publishing 1167 | Otis Elevator Company 1168 | Engineering Ltd 1169 | Associated TeleVision 1170 | National Capital 1171 | Bucknell University Press 1172 | Ebury Press 1173 | Garland Publishing Inc. 1174 | Marconi Electronic Systems 1175 | BBC Scotland 1176 | News Limited 1177 | Minnesota Public Radio 1178 | Emirates Airline 1179 | Lotus Cars 1180 | Oriental Steam Navigation Company 1181 | Atlantic Records 1182 | Western Electric Company 1183 | DK Publishing 1184 | Rock Music 1185 | Dun & Bradstreet 1186 | Mars Incorporated 1187 | Cadbury plc 1188 | SBC Communications 1189 | Jet Airways 1190 | United States Playing Card Company 1191 | St. 
Vladimir's Seminary Press 1192 | Sanford and Son 1193 | Metallurgical Laboratory 1194 | Reconstruction Finance Corporation 1195 | Focal Press 1196 | Ayer Publishing 1197 | Today's Best Music 1198 | A�rospatiale SA 1199 | Danielle Steel 1200 | Blackstone Group 1201 | Internet Corporation for Assigned Names 1202 | Olympia Press 1203 | Royal Bank of Scotland 1204 | Mozilla Corporation 1205 | Contemporary Christian Music 1206 | Bain Capital 1207 | Caledonian Company 1208 | NASDAQ stock market 1209 | Teaching Company 1210 | Wayne Corporation 1211 | Cold Spring Harbor Laboratory 1212 | Muscovy Company 1213 | Public Works 1214 | Vlaamse Radio 1215 | SKY Network Television 1216 | Netscape Communications 1217 | Weird Tales 1218 | American Stock Exchange 1219 | Father and son 1220 | A&E Television Networks 1221 | Juniper Networks 1222 | Elsevier Academic Press 1223 | Edison Manufacturing Company 1224 | JP Morgan 1225 | Lattice Semiconductor 1226 | King Features Syndicate 1227 | James Clarke & Co Ltd 1228 | Air New Zealand 1229 | Cerberus Capital Management 1230 | Groupe Bull 1231 | NFL Films 1232 | SourceForge Inc. 1233 | Air Force Research Laboratory 1234 | Large Cities Climate Leadership Group 1235 | Commercial Appeal 1236 | Trans-Pacific Strategic Economic Partnership 1237 | LL.D. F.S.A. 1238 | McKinsey & Company 1239 | Phoenix Technologies 1240 | Le Journal 1241 | Aer Lingus 1242 | Maxim Integrated Products 1243 | Energy National Laboratories 1244 | El Universal 1245 | Blue Steel 1246 | Air Liquide 1247 | NLM Hazardous Substances Databank 1248 | Oxford U. Press 1249 | Timber Press 1250 | Domino's Pizza 1251 | Southern Louisiana 1252 | Blue Man Group 1253 | Origin Systems 1254 | Shanghai Stock Exchange 1255 | Wildlife Service 1256 | Pan Am 1257 | Killeen Television 1258 | Hound Group 1259 | Alfred A. Knopf Inc. 1260 | Google Inc. 
1261 | Voyageur Press 1262 | Old-Time Radio 1263 | Digital Press 1264 | New Zealand Journal 1265 | Compaq Computer Corporation 1266 | Eastern Caribbean Central Bank 1267 | Nutting Associates 1268 | All India Radio 1269 | Shockley Semiconductor Laboratory 1270 | Greatest Films 1271 | Raytheon Missile Systems 1272 | In God We Trust 1273 | MOS Technologies 1274 | Pacifica Radio 1275 | Country Music Television 1276 | H.W. Wilson Co. 1277 | Asahi Shimbun 1278 | Capital Airlines 1279 | Us Weekly 1280 | Loki Software 1281 | E.B. Eddy Company 1282 | Kenya Airways 1283 | Eastern Airlines 1284 | America West Airlines 1285 | Popular Electronics 1286 | Ralston Purina 1287 | In Music 1288 | JetBlue Airways 1289 | Engineering Laboratory 1290 | TwoMorrows Publishing 1291 | Open Court Publishing 1292 | Boots Group 1293 | IUPAP Transfermium Working Group 1294 | Moving Picture Experts Group 1295 | WB Television Network 1296 | Spirit Airlines 1297 | Princeton Plasma Physics Laboratory 1298 | Dimension Films 1299 | Fair Trading 1300 | Mohawk Industries 1301 | Dublin Corporation 1302 | Iron Crown Enterprises 1303 | RKO Radio 1304 | IBM Corporation 1305 | AirTran Airways 1306 | HD Radio 1307 | Pacific Fur Company 1308 | Ethiopian Airlines 1309 | Brookings Institution Press 1310 | Motor Car Company 1311 | ABC Radio Australia 1312 | Ace Hardware 1313 | Electronic Systems 1314 | Deutsche Werke 1315 | Baroque Music 1316 | Mead & Company 1317 | Microsoft Press 1318 | Alenia Marconi Systems 1319 | American Popular Music 1320 | East Japan Railway Company 1321 | Minnesota Law Review 1322 | Dolby Laboratories 1323 | Potash Corporation of Saskatchewan 1324 | Marconi plc 1325 | McFarland & Company Inc. 1326 | Chicago Herald 1327 | Movie Gallery 1328 | Stainless Steel 1329 | XM Radio 1330 | Canonical Ltd. 1331 | Bombardier Inc. 1332 | Lloyds TSB 1333 | Suncor Energy 1334 | Nash Motors 1335 | IEEE Software 1336 | Chamber Music 1337 | G. & C. Merriam Company 1338 | Bloomberg L.P. 
1339 | Credit Suisse 1340 | Iowa State University Press 1341 | Valero Energy Corporation 1342 | SRI International 1343 | H.W. Wilson Company 1344 | National Express East Coast 1345 | Boston History Company 1346 | Doubleday & Company Inc. 1347 | Wal-Mart Stores Inc. 1348 | Marine Biological Laboratory 1349 | Aquarian Press 1350 | Eaton Corporation 1351 | AltaMira Press 1352 | Universal Music Group 1353 | Deutsche Bundesbank 1354 | Marvell Technology Group 1355 | LSI Logic 1356 | UNESCO Publishing 1357 | Weeb Ewbank 1358 | HMV Group 1359 | Red Sea Press 1360 | Be Inc. 1361 | Florida Times 1362 | Ford Australia 1363 | Universal Music 1364 | Island Press 1365 | Marvel Comics 1366 | Mars Inc. 1367 | Toronto Stock Exchange 1368 | Southern Lebanon 1369 | World Almanac Education Group 1370 | William Heinemann Ltd. 1371 | Alliant Techsystems 1372 | Canadian University Press 1373 | Chrysler LLC 1374 | John Knox Press 1375 | Paladin Press 1376 | Axel Springer AG 1377 | New Press 1378 | Lend Lease 1379 | Columbia Pictures 1380 | USA TODAY 1381 | Water Music 1382 | Rolls-Royce Limited 1383 | Canada Dry 1384 | Crowood Press 1385 | E.P. Dutton & Co. Inc. 1386 | Wayne Enterprises 1387 | Austrian Airlines 1388 | ConAgra Foods 1389 | Vauxhall Motors 1390 | Delacorte Press 1391 | Public Image Ltd. 
1392 | Complex Systems 1393 | Construcciones Aeron�uticas SA 1394 | Fidelity Investments 1395 | Toyota Motor Corporation 1396 | Tyrell Corporation 1397 | Nation's Capital 1398 | Home Box Office 1399 | Thomas Crowell Press 1400 | Asahi Breweries 1401 | Massachusetts Review 1402 | CBS Paramount Television 1403 | Neural Networks 1404 | Hudson Ltd 1405 | Asia Television Limited 1406 | Science Applications International Corporation 1407 | Virginia Quarterly Review 1408 | Thinking Machines Corporation 1409 | Vanguard Group 1410 | Harvester Press 1411 | Braniff International Airways 1412 | Hoechst AG 1413 | Ignatius Press 1414 | National Express Group 1415 | Brussels Airlines 1416 | BNP Paribas 1417 | Quaker Oats Company 1418 | Aerospace Corporation 1419 | National Australia Bank 1420 | Broadcast Music Incorporated 1421 | EMC Corporation 1422 | RTL Television 1423 | Valve Corporation 1424 | J.B. Lippincott Company 1425 | Detroit Diesel 1426 | Presidio Press 1427 | MIT Artificial Intelligence Laboratory 1428 | Telcordia Technologies 1429 | Regnery Publishing 1430 | Museum Tusculanum Press 1431 | PS Publishing 1432 | REO Motor Car Company 1433 | International Journal 1434 | Encyclopedia Britannica Inc. 1435 | NPD Group 1436 | United States Steel 1437 | Macmillan Co. 1438 | Gorgias Press LLC 1439 | New York Times Co. 1440 | Philippine Stock Exchange 1441 | Aircraft Corporation 1442 | American Reprographics Company 1443 | Power Corporation 1444 | Macmillan Press 1445 | Bitstream Inc. 1446 | Reserve Bank of Australia 1447 | Zenith Press 1448 | Dodd Mead & Co. 1449 | Swiss International Air Lines 1450 | Brown & Company 1451 | Husky Energy 1452 | Open Systems 1453 | Phillips Petroleum 1454 | Old Town Canoe Company 1455 | Carnegie Steel Company 1456 | Midland Publishing 1457 | Associated Electrical Industries 1458 | Bay Area Laboratory 1459 | Erie Railroad Co. 
1460 | Norfolk Naturalists' Trust 1461 | Harrah's Entertainment 1462 | Marathon Petroleum Company 1463 | CABI Publishing 1464 | Amateur Radio 1465 | Al-Ahram Weekly 1466 | National Capital Region Capital 1467 | NTT DoCoMo 1468 | Tesla Motors 1469 | Green Bank 1470 | Folk Metal 1471 | United Productions of America 1472 | Fantasy Games Unlimited 1473 | BBC Canada 1474 | American Tobacco Company 1475 | Joint Photographic Experts Group 1476 | Buell Motorcycle Company 1477 | Covered Bridge Capital 1478 | Toys R Us 1479 | Canadian Broadcasting Company 1480 | Geoscience Press 1481 | Grolier Incorporated 1482 | African Slave Trade 1483 | Banca d'Italia 1484 | Austin Motor Company 1485 | British Racing Motors 1486 | TNT N.V. 1487 | People Weekly 1488 | Malaysia Airlines 1489 | San Diego Union Tribune 1490 | Philtrum Press 1491 | Formula One Management 1492 | Nippon Telegraph and Telephone 1493 | Telos Press 1494 | Open Group 1495 | Stalky & Co 1496 | Bethlehem Steel 1497 | Farmers' Loan & Trust Co. 1498 | MAN SE 1499 | Mercury News 1500 | ARM Limited 1501 | Computing Tabulating Recording Corporation 1502 | Lisp Machines Inc. 1503 | Roundabout Theatre Company 1504 | Vancouver Sun 1505 | Devon Wildlife Trust 1506 | FOX network 1507 | B. Eerdmans Publishing Co. 1508 | New York Times Company 1509 | Air University Press 1510 | MIPS Technologies 1511 | New Worlds 1512 | Continuum International Publishing 1513 | Disinformation Company 1514 | Gauntlet Press 1515 | Ashgate Publishing Ltd 1516 | World Scientific Publishing Co. 1517 | Aral AG 1518 | US Weekly 1519 | A&M Records 1520 | TV4 AB 1521 | Vanguard Press 1522 | R.H. Donnelley 1523 | Columbia Journalism Review 1524 | Aerol�neas Argentinas 1525 | Digital Research Inc. 1526 | E.W. 
Scripps 1527 | Science Fiction 1528 | Associated British Foods 1529 | Aeroflot - Russian Airlines 1530 | Database Systems 1531 | Super Junior 1532 | State Street Corporation 1533 | Kessinger Publishing LLC 1534 | Island Records 1535 | Burlington Northern Santa Fe 1536 | Medical Examination Publishing Co. 1537 | Eternal Music 1538 | International Paper 1539 | White Wolf Publishing 1540 | Public Television 1541 | Fairleigh Dickinson Univ Press 1542 | Phillips Petroleum Company 1543 | Phillips 1544 | Southern Oregon 1545 | 3Com Corporation 1546 | Pakistan International Airlines 1547 | Falling Rain Genomics Inc 1548 | Simon & Schuster Inc. 1549 | Network Working Group 1550 | E. P. Dutton & Co. 1551 | Great Lakes Airlines 1552 | Wildlife Forensics Laboratory 1553 | Literary Dictionary Company 1554 | W. Norton & Company Inc. 1555 | Thomas Nelson Inc. 1556 | C. F. Martin & Company 1557 | Kyodo News 1558 | Stagecoach Group 1559 | St. Louis Business Journal 1560 | Chevron Corporation 1561 | Mattel Inc. 1562 | News International 1563 | Jane's Defence Weekly 1564 | American Bridge Company 1565 | Dell Publishing 1566 | Soviet Russia 1567 | Kadena AB 1568 | MRC Laboratory 1569 | Network Associates 1570 | Red Hat Inc. 1571 | United Artists Records 1572 | Wallflower Press 1573 | RKO Radio Pictures Inc. 1574 | China Central Television 1575 | Confederate States of America 1576 | Blue Bell 1577 | Japan Tobacco 1578 | Monotype Corporation 1579 | Black Rose Books Ltd. 1580 | China Southern Airlines 1581 | Bobbs-Merrill Company 1582 | Chestnut Canoe Company 1583 | Intuit Inc. 1584 | Insurance Australia Group 1585 | Royal Niger Company 1586 | Power Electronics 1587 | Virgin Records 1588 | System Development Corporation 1589 | Cosgrove Hall Films 1590 | Mega Inc. 1591 | ASM Press 1592 | B. Herder Book Co. 1593 | Sinclair Oil 1594 | Long-Term Capital Management 1595 | Henry Holt and Co. 
1596 | Space Corporation 1597 | New Jersey 1598 | Bain & Company 1599 | Dana International 1600 | Banco Popular de Puerto Rico 1601 | Longmans Green & Co. 1602 | Wadsworth Publishing 1603 | General Motors Company 1604 | Thomas Y. Crowell Company 1605 | RCA Records 1606 | Mars Science Laboratory 1607 | Gulf Research Laboratories 1608 | MITRE Corporation 1609 | Pennsylvania Railroad 1610 | J.D. Power and Associates 1611 | Liz Claiborne 1612 | Top 50 Cult Films 1613 | Scarecrow Press Inc. 1614 | Cox Communications 1615 | Enron Corporation 1616 | Epoch Times 1617 | Oxford University Press Inc. 1618 | Hutchinson & Co. 1619 | Barings Bank 1620 | Applied Physics Laboratory 1621 | MTV Networks Europe 1622 | New Directions Publishing 1623 | Charles Jenkins Laboratories 1624 | Total Film 1625 | Waltham Watch Company 1626 | Xlibris Corporation 1627 | US Forest Service 1628 | Mega Party Inc. 1629 | Barnard Island Group 1630 | Korean Air 1631 | ABN AMRO 1632 | Penn Central 1633 | DuPont Company 1634 | State Capital 1635 | Postal Telegraph Company 1636 | Asian Development Bank 1637 | Triad Publishing Company 1638 | Norton Simon 1639 | Reader's Digest Association 1640 | Herald Press 1641 | Dassault Systemes 1642 | SkyWest Airlines 1643 | Abacus Software 1644 | Ace Radio 1645 | Denham Group 1646 | W. H. Freeman and Co. 
1647 | East India East India Company 1648 | Oak Knoll Press 1649 | Flinders Group 1650 | Fordham University Press 1651 | Stone Bridge Press 1652 | Sacred Music 1653 | Columbia Pictures Television 1654 | Frankland Group 1655 | Park Street Press 1656 | Philadelphia Bulletin 1657 | Acorn Computers Ltd 1658 | Rover Company 1659 | MTV Networks 1660 | Radiation Laboratory 1661 | Jack in the Box 1662 | Industrial Development Corporation 1663 | Athlone Press 1664 | Consultative Group 1665 | Cardinal Health 1666 | City Bridge Trust 1667 | Cork University Press 1668 | Manx Wildlife Trust 1669 | Eiffel Software 1670 | Southern Illinois 1671 | Hudson�s Bay Company 1672 | Sirius XM Radio 1673 | BBC Two 1674 | British Music 1675 | Norton & Company 1676 | Marlowe & Company 1677 | Living Resources 1678 | Seattle Weekly 1679 | Copa Airlines 1680 | Colt's Manufacturing Company 1681 | Agency for the Prohibition 1682 | DIANE Publishing 1683 | Booz Allen 1684 | Broadman Press 1685 | Specialty Press 1686 | Creative Labs 1687 | Mitsubishi Electric 1688 | KPMG Europe LLP 1689 | Avalon Hill 1690 | Mitsubishi UFJ Financial Group 1691 | NATO Partnership 1692 | MCA Inc. 1693 | Mountain Music 1694 | Inner City Press 1695 | Waveland Press 1696 | BiblioBazaar LLC 1697 | Howick Group 1698 | William Morrow & Co. 1699 | Dick's Sporting Goods 1700 | Bath Spa 1701 | Harcourt Brace & Company 1702 | Plumas Transit Systems 1703 | Ambrosia Software 1704 | Ruth's Chris 1705 | Arkham House 1706 | Shanghai Automotive Industry Corporation 1707 | ARM Holdings 1708 | Queen's Music 1709 | Best Films 1710 | New Holland Ag 1711 | Turtle Beach Systems 1712 | GNU Radio 1713 | Ringling Bros. 1714 | Pullman Company 1715 | Oliphants Ltd. 1716 | Visa Inc. 1717 | American can 1718 | Johnson Controls 1719 | Blackwell Publishing Ltd. 
1720 | Computer Laboratory 1721 | Short Brothers 1722 | Campbell Soup 1723 | Tandem Computers 1724 | Shugart Associates 1725 | National Bus Company 1726 | United Nations Industrial Development Universal Postal Union 1727 | Waste management 1728 | Great Beacon Press 1729 | Bell Canada 1730 | MIPS Computer Systems Inc. 1731 | Metal Machine Music 1732 | Open Court Publishing Company 1733 | Eyre Methuen Ltd 1734 | Standard Chartered Bank 1735 | John Benjamins Publishing Company 1736 | Fe Co 1737 | United States Naval Institute Press 1738 | Anglo American plc 1739 | Cisco Systems Inc. 1740 | Resorts Company 1741 | Barclays plc 1742 | Metropolitan Life Insurance Company 1743 | Pacific Crest 1744 | Thomas Y. Crowell Co. 1745 | Schlumberger Limited 1746 | Nimbus Publishing 1747 | Milton Hershey School Trust 1748 | Pharaonic Egypt 1749 | Tyndale Press 1750 | Ford of Europe 1751 | Playboy Enterprises 1752 | Vueling Airlines 1753 | Fuji Heavy Industries 1754 | Audi AG 1755 | Perseus Publishing 1756 | Rand Corporation 1757 | Plant Genetic Resources 1758 | Soncino Press 1759 | De Nederlandsche Bank 1760 | Briggs & Stratton 1761 | KM Group 1762 | El Pais 1763 | NEC Corporation 1764 | Ford Crown Victoria 1765 | Orlando Sentinel 1766 | General Foods 1767 | Crown Publishers Inc 1768 | Morris Motor Company 1769 | Simulations Publications Inc. 1770 | Henry Holt & Co 1771 | Warner Music 1772 | Bayer AG 1773 | Australian Securities Exchange 1774 | Bank of Japan 1775 | Ansett Australia 1776 | Four Courts Press 1777 | E! Entertainment Television 1778 | Black Entertainment Television 1779 | Computer Associates 1780 | Clear Channel Communications 1781 | Bangemall Group 1782 | MICRA Inc. 1783 | Bear Stearns 1784 | E. Remington and Sons 1785 | Lucas Industries plc 1786 | Irish Music 1787 | Trans-Canada Airlines 1788 | Royal Bank of Canada 1789 | West Publishing 1790 | Northern Rock 1791 | Oxford English Dictionary 1792 | Sri Lanka Telecom 1793 | Wonderwall Music 1794 | America Online Inc. 
1795 | Bristol-Myers Squibb 1796 | New York Inc. 1797 | Turner Publishing Company 1798 | Mayfield Publishing 1799 | Valero Energy 1800 | Northrop Grumman Corporation 1801 | Visible Ink Press 1802 | Woody Press 1803 | Amsterdam Stock Exchange 1804 | Quarry Bank 1805 | NXP Semiconductors 1806 | Rough Guides Ltd. 1807 | Sudan Airways 1808 | Sony Corporation 1809 | Sony 1810 | Quintet Publishing 1811 | Aldine Press 1812 | National Railway Company 1813 | Essex Wildlife Trust 1814 | Dairy Queen 1815 | Publisher Inc. 1816 | Pacific Southwest Airlines 1817 | South China Morning Post 1818 | Ames Laboratory 1819 | Gem Trade 1820 | Kinney National Company 1821 | Polar Music 1822 | American World Airways 1823 | Indian Classical Music 1824 | Whole Foods Market 1825 | Local Group 1826 | Edison Trust 1827 | Engineering Research Associates 1828 | Pernod Ricard 1829 | Sterling Publishing Company Inc 1830 | Hawaiian Airlines 1831 | 1832 | -------------------------------------------------------------------------------- /honorifics.txt: -------------------------------------------------------------------------------- 1 | A. 2 | Adj. 3 | Adm. 4 | Adv. 5 | Asst. 6 | B. 7 | Bart. 8 | Brig. 9 | Bros. 10 | C. 11 | Capt. 12 | Cmdr. 13 | Col. 14 | Comdr. 15 | Con. 16 | Cpl. 17 | D. 18 | DR. 19 | Dr. 20 | E. 21 | Ens. 22 | F. 23 | G. 24 | Gen. 25 | Gov. 26 | H. 27 | Hon. 28 | Hosp. 29 | I. 30 | Insp. 31 | J. 32 | K. 33 | L. 34 | Lt. 35 | M. 36 | M. 37 | MM. 38 | MR. 39 | MRS. 40 | MS. 41 | Maj. 42 | Messrs. 43 | Mlle. 44 | Mme. 45 | Mr. 46 | Mrs. 47 | Ms. 48 | Msgr. 49 | N. 50 | O. 51 | Op. 52 | Ord. 53 | P. 54 | Pfc. 55 | Ph. 56 | Prof. 57 | Pvt. 58 | Q. 59 | R. 60 | Rep. 61 | Reps. 62 | Rev. 63 | S. 64 | Sen. 65 | Sens. 66 | Sfc. 67 | Sgt. 68 | Sr. 69 | St. 70 | Supt. 71 | T. 72 | U. 73 | V. 74 | W. 75 | X. 76 | Y. 77 | Z. 78 | v. 79 | vs. 
80 | -------------------------------------------------------------------------------- /placenames.txt: -------------------------------------------------------------------------------- 1 | Barbados:country 2 | Hudson:us_city 3 | Japan:country 4 | Vientiane:country_capital 5 | Pago Pago:country_capital 6 | Michigan:us_state 7 | Malaysia:country 8 | Sedona:us_city 9 | Texas:us_state 10 | Larami:us_city 11 | P'yongyang:country_capital 12 | Samoa:country 13 | Praia:country_capital 14 | Buenos Aires:country_capital 15 | Anguilla:country 16 | Bethel:us_city 17 | Botswana:country 18 | Schenectady:us_city 19 | Northern Mariana Islands:country 20 | Kiribati:country 21 | Bridgetown:country_capital 22 | Basseterre:country_capital 23 | Evanston:us_city 24 | North Little Rock:us_city 25 | Somalia:country 26 | Gaza Strip:country 27 | Tempe:us_city 28 | Rio Rancho:us_city 29 | Brooklyn Park:us_city 30 | Layton:us_city 31 | Qatar:country 32 | Bishkek:country_capital 33 | Baker Island:country 34 | Stamford:us_city 35 | Majuro:country_capital 36 | Kuala Lumpur:country_capital 37 | Warner Robins:us_city 38 | Midwest City:us_city 39 | Florissant:us_city 40 | Kansas City:us_city 41 | Fairmont:us_city 42 | Biloxi:us_city 43 | Marietta:us_city 44 | Franklin:us_city 45 | Flagstaff:us_city 46 | North Dakota:us_state 47 | Westminster:us_city 48 | Delaware:us_state 49 | Haiti:country 50 | Providence:us_city 51 | Bend:us_city 52 | United States:country 53 | Burkina Faso:country 54 | Roanoke:us_city 55 | West Valley City:us_city 56 | Havre:us_city 57 | Rogers:us_city 58 | Rockford:us_city 59 | Guam:country 60 | Kenner:us_city 61 | Butte:us_city 62 | Conakry:country_capital 63 | Hanoi:country_capital 64 | Wahpeton:us_city 65 | Quito:country_capital 66 | Hollywood:us_city 67 | Bosnia:country 68 | Herzegovina:country 69 | Liberia:country 70 | Washington:us_state 71 | Saint Joseph:us_city 72 | Nicaragua:country 73 | Dover:us_city 74 | Fort Wayne:us_city 75 | New Caledonia:country 76 | Moore:us_city 
77 | Warwick:us_city 78 | British Indian Ocean Territory:country 79 | Glorioso Islands:country 80 | Fort Collins:us_city 81 | Egypt:country 82 | New Haven:us_city 83 | Norfolk:us_city 84 | Jamaica:country 85 | Paramaribo:country_capital 86 | Douglas:country_capital 87 | Kingston:country_capital 88 | Valletta:country_capital 89 | Wichita:us_city 90 | New Castle:us_city 91 | Iraq:country 92 | Gibraltar:country_capital 93 | Zambia:country 94 | Iran:country 95 | Sioux City:us_city 96 | Ohio:us_state 97 | Portland:us_city 98 | Rochester:us_city 99 | Canberra:country_capital 100 | Peru:country 101 | Enid:us_city 102 | Montgomery:us_city 103 | West Island:country_capital 104 | Nairobi:country_capital 105 | Florida:us_state 106 | New Zealand:country 107 | Kodiak:us_city 108 | Clarksville:us_city 109 | Gadsden:us_city 110 | Mesa:us_city 111 | Charlotte:us_city 112 | Gary:us_city 113 | Corvallis:us_city 114 | British Virgin Islands:country 115 | Micronesia:country 116 | Bamako:country_capital 117 | Washington:us_state 118 | Portugal:country 119 | Turkmenistan:country 120 | Broken Arrow:us_city 121 | Cambodia:country 122 | Norman:us_city 123 | Congo:country 124 | Albuquerque:us_city 125 | Philadelphia:us_city 126 | Spokane:us_city 127 | Middletown:us_city 128 | Hilo:us_city 129 | Senegal:country 130 | Brunswick:us_city 131 | Angola:country 132 | Aurora:us_city 133 | Damascus:country_capital 134 | Baton Rouge:us_city 135 | Rapid City:us_city 136 | Minneapolis:us_city 137 | Greenville:us_city 138 | Adamstown:country_capital 139 | Suriname:country 140 | Ecuador:country 141 | Sofia:country_capital 142 | Rockville:us_city 143 | Lawrence:us_city 144 | Columbia:us_city 145 | Saint Louis:us_city 146 | Federal Way:us_city 147 | French Polynesia:country 148 | North Charleston:us_city 149 | Lebanon:country 150 | Belarus:country 151 | Lorain:us_city 152 | Portsmouth:us_city 153 | Auburn:us_city 154 | Ukraine:country 155 | Aruba:country 156 | Kiev:city 157 | Erie:us_city 158 | 
Suva:country_capital 159 | Eau Claire:us_city 160 | Livonia:us_city 161 | Quincy:us_city 162 | Vaduz:country_capital 163 | Frederick:us_city 164 | Ljubljana:country_capital 165 | Cyprus:country 166 | Wisconsin:us_state 167 | Biddeford:us_city 168 | Mogadishu:country_capital 169 | Ann Arbor:us_city 170 | Provo:us_city 171 | Abu Dhabi:country_capital 172 | Fremont:us_city 173 | Bahrain:country 174 | Sweden:country 175 | Dominican Republic:country 176 | Maldives:country 177 | Oklahoma City:us_city 178 | Madrid:country_capital 179 | Guinea-Bissau:country 180 | Conway:us_city 181 | Saint Helier:country_capital 182 | Copenhagen:country_capital 183 | Niamey:country_capital 184 | Indianapolis:us_city 185 | Port Moresby:country_capital 186 | Appleton:us_city 187 | Vatican City:country_capital 188 | Kingman Reef:country 189 | Youngstown:us_city 190 | Grenada:country 191 | Tuvalu:country 192 | Albany:us_city 193 | Iowa City:us_city 194 | Bangor:us_city 195 | Sandy:us_city 196 | Durham:us_city 197 | Djibouti:country_capital 198 | United Arab Emirates:country 199 | Milford:us_city 200 | Moldova:country 201 | Italy:country 202 | Syracuse:us_city 203 | Gambia:country 204 | Hattiesburg:us_city 205 | Buffalo:us_city 206 | West Fargo:us_city 207 | New York:us_city 208 | High Point:us_city 209 | Kingstown:country_capital 210 | Saint-Denis:country_capital 211 | Ulaanbaatar:country_capital 212 | Jerusalem:city 213 | Salisbury:us_city 214 | College Park:us_city 215 | Scranton:us_city 216 | Overland Park:us_city 217 | Bozeman:us_city 218 | Barrow:us_city 219 | Vienna:country_capital 220 | Road Town:country_capital 221 | Brookings:us_city 222 | Sierra Leone:country 223 | Anaheim:us_city 224 | Los Angeles:us_city 225 | Hot Springs:us_city 226 | Nevada:us_state 227 | Phoenix:us_city 228 | Martinique:country 229 | Dhaka:country_capital 230 | Beijing:country_capital 231 | Sri Lanka:country 232 | Memphis:us_city 233 | Atlanta:us_city 234 | Grand Forks:us_city 235 | Montana:us_state 236 | 
Wasilla:us_city 237 | Maryland:us_state 238 | Pembroke Pines:us_city 239 | Oman:country 240 | Pawtucket:us_city 241 | Armenia:country 242 | Denver:us_city 243 | Iceland:country 244 | Gillette:us_city 245 | Sanford:us_city 246 | Belgium:country 247 | Lancaster:us_city 248 | Monrovia:country_capital 249 | Edison Township:us_city 250 | New Hampshire:us_state 251 | Banjul:country_capital 252 | Nashville:us_city 253 | Doha:country_capital 254 | Algeria:country 255 | Marshall Islands:country 256 | Kalispell:us_city 257 | Switzerland:country 258 | Mauritania:country 259 | Manchester:us_city 260 | Oranjestad:country_capital 261 | Charleston:us_city 262 | Ouagadougou:country_capital 263 | Bossier City:us_city 264 | Beaverton:us_city 265 | Zimbabwe:country 266 | Carlsbad:us_city 267 | New Delhi:country_capital 268 | West Hartford:us_city 269 | Jordan:country 270 | Jersey:country 271 | Louisville:us_city 272 | Rome:country_capital 273 | Bucharest:country_capital 274 | Saint Kitts:country 275 | Bhutan:country 276 | San Diego:us_city 277 | Elizabeth:us_city 278 | Georgia:country 279 | Greensboro:us_city 280 | Antarctica:country 281 | Jonesboro:us_city 282 | Honiara:country_capital 283 | Netherlands:country 284 | Israel:country 285 | Dushanbe:country_capital 286 | Chandler:us_city 287 | Decatur:us_city 288 | Taylorsville:us_city 289 | Cincinnati:us_city 290 | Colorado:us_state 291 | Tucson:us_city 292 | Tupelo:us_city 293 | Huron:us_city 294 | Libreville:country_capital 295 | Manhattan:us_city 296 | Pakistan:country 297 | Winston-Salem:us_city 298 | Thailand:country 299 | Bandar Seri Begawan:country_capital 300 | Macedonia:country 301 | Honduras:country 302 | Bloomington:us_city 303 | Newark:us_city 304 | Europa Island:country 305 | Reading:us_city 306 | Lusaka:country_capital 307 | Hillsboro:us_city 308 | Cedar Rapids:us_city 309 | Tuscaloosa:us_city 310 | San Jose:country_capital 311 | Phnom Penh:country_capital 312 | Taiwan:country 313 | Nicosia:country_capital 314 | 
Canada:country 315 | Grand Rapids:us_city 316 | Minsk:country_capital 317 | Lome:country_capital 318 | Bangladesh:country 319 | Hong Kong:country 320 | New Britain:us_city 321 | Saint John's:country_capital 322 | Kinshasa:country_capital 323 | South Dakota:us_state 324 | Prague:country_capital 325 | Mali:country 326 | Bujumbura:country_capital 327 | Cambridge:us_city 328 | Little Rock:us_city 329 | Moscow:country_capital 330 | Augusta:us_city 331 | Male:country_capital 332 | La Paz:city 333 | South Bend:us_city 334 | Aberdeen:us_city 335 | Ashgabat:country_capital 336 | Kansas:us_state 337 | Omaha:us_city 338 | Pittsburgh:us_city 339 | Comoros:country 340 | Annapolis:us_city 341 | Sanaa:country_capital 342 | Solomon Islands:country 343 | Kampala:country_capital 344 | Lexington:us_city 345 | Westland:us_city 346 | Nepal:country 347 | Rock Hill:us_city 348 | Chicago:us_city 349 | Orlando:us_city 350 | Thornton:us_city 351 | New Jersey:us_state 352 | Kathmandu:country_capital 353 | Huntington:us_city 354 | Macau:country 355 | Cedar Falls:us_city 356 | Arvada:us_city 357 | Luxembourg:country_capital 358 | Guernsey:country 359 | Canton:us_city 360 | Shawnee:us_city 361 | Springfield:us_city 362 | Davenport:us_city 363 | Nassau:country_capital 364 | Chisinau:country_capital 365 | Bowling Green:us_city 366 | Tirana:country_capital 367 | Louisiana:us_state 368 | Hoover:us_city 369 | Panama:country_capital 370 | Oslo:country_capital 371 | Mozambique:country 372 | Sterling Heights:us_city 373 | Benin:country 374 | Saint Petersburg:us_city 375 | Argentina:country 376 | Lowell:us_city 377 | N'Djamena:country_capital 378 | Dublin:country_capital 379 | Saint Lucia:country 380 | South Portland:us_city 381 | Ghana:country 382 | Woonsocket:us_city 383 | Sao Tome:country_capital 384 | El Paso:us_city 385 | Lansing:us_city 386 | Mandan:us_city 387 | Kazakhstan:country 388 | Las Cruces:us_city 389 | Eugene:us_city 390 | West Jordan:us_city 391 | Tunisia:country 392 | Hastings:us_city 
393 | Boston:us_city 394 | Reunion:country 395 | Anchorage:us_city 396 | Richmond:us_city 397 | Ketchikan:us_city 398 | Johnson City:us_city 399 | Parkersburg:us_city 400 | Tampa:us_city 401 | Las Vegas:us_city 402 | Mount Pleasant:us_city 403 | Hungary:country 404 | Duluth:us_city 405 | Dickinson:us_city 406 | Cuba:country 407 | Santiago:country_capital 408 | Toledo:us_city 409 | West Warwick:us_city 410 | Victoria:country_capital 411 | Alaska:us_state 412 | Detroit:us_city 413 | Thimphu:country_capital 414 | Smyrna:us_city 415 | Mexico:country_capital 416 | Vanuatu:country 417 | Jakarta:country_capital 418 | Des Moines:us_city 419 | Albania:country 420 | Kyrgyzstan:country 421 | Mitchell:us_city 422 | Arkansas:us_state 423 | Kentucky:us_state 424 | Castries:country_capital 425 | Guinea:country 426 | Korea:country 427 | Bolivia:country 428 | Gaborone:country_capital 429 | Meridian:us_city 430 | South Africa:country 431 | Maine:us_state 432 | Cook Islands:country 433 | Tennessee:us_state 434 | Cayenne:country_capital 435 | Cayman Islands:country 436 | Paraguay:country 437 | Independence:us_city 438 | Mayotte:country 439 | Alofi:country_capital 440 | Dearborn:us_city 441 | Muscat:country_capital 442 | Saint Peter Port:country_capital 443 | Henderson:us_city 444 | Turkey:country 445 | Germany:country 446 | Hamilton:country_capital 447 | Lincoln:us_city 448 | Islamabad:country_capital 449 | Burundi:country 450 | Minot:us_city 451 | Houston:us_city 452 | Christmas Island:country 453 | Yuma:us_city 454 | Concord:us_city 455 | Pennsylvania:us_state 456 | Oakland:us_city 457 | Connecticut:us_state 458 | Georgetown:country_capital 459 | Midway Islands:country 460 | Austria:country 461 | Mount Vernon:us_city 462 | Salt Lake City:us_city 463 | Cary:us_city 464 | Port-au-Prince:country_capital 465 | Finland:country 466 | Gabon:country 467 | Clovis:us_city 468 | Elko:us_city 469 | Spratly Islands:country 470 | Farmington:us_city 471 | Cumberland:us_city 472 | Hawaii:us_state 
473 | Bogota:country_capital 474 | Jersey City:us_city 475 | Basse-Terre:country_capital 476 | Malta:country 477 | French Guiana:country 478 | Falkland Islands:country 479 | Czech Republic:country 480 | Croatia:country 481 | Lesotho:country 482 | Scottsdale:us_city 483 | Afghanistan:country 484 | Western Sahara:country 485 | Atlantic Ocean:country 486 | Libya:country 487 | New Mexico:us_state 488 | Australia:country 489 | Palau:country 490 | Monroe:us_city 491 | El Salvador:country 492 | Zagreb:country_capital 493 | Hartford:us_city 494 | West Virginia:us_state 495 | San Francisco:us_city 496 | Cameroon:country 497 | Green Bay:us_city 498 | Vancouver:us_city 499 | Chad:country 500 | Pretoria:us_city 501 | Denmark:country 502 | Gilbert:us_city 503 | Avarua:country_capital 504 | Bassas da India:country 505 | Lewiston:us_city 506 | Latvia:country 507 | Hagerstown:us_city 508 | Great Falls:us_city 509 | Upper Darby Twp:us_city 510 | Mililani:us_city 511 | Scarborough:us_city 512 | Laos:country 513 | Hilton Head Island:us_city 514 | Juan de Nova Island:country 515 | Jackson:us_city 516 | Utah:us_state 517 | Singapore:country_capital 518 | Alamogordo:us_city 519 | Danbury:us_city 520 | Virginia:us_state 521 | Indiana:us_state 522 | Wilmington:us_city 523 | New Rochelle:us_city 524 | Eritrea:country 525 | Frankfort:us_city 526 | Milwaukee:us_city 527 | Tehran:country_capital 528 | Kenai:us_city 529 | Greenwich:us_city 530 | Lee's Summit:us_city 531 | Mesquite:us_city 532 | Venezuela:country 533 | St. 
George:us_city 534 | London:country_capital 535 | Lilongwe:country_capital 536 | Managua:country_capital 537 | Tallahassee:us_city 538 | Clipperton Island:country 539 | Vietnam:country 540 | Dover Township:us_city 541 | Knoxville:us_city 542 | Palikir:country_capital 543 | Huntsville:us_city 544 | Dominica:country 545 | Brockton:us_city 546 | Khartoum:country_capital 547 | Alabama:us_state 548 | Colombia:country 549 | Equatorial Guinea:country 550 | Coral Sea Islands:country 551 | Fargo:us_city 552 | West Bank:country 553 | Algiers:country_capital 554 | Anderson:us_city 555 | Bridgeport:us_city 556 | Reno:us_city 557 | Navassa Island:country 558 | Morocco:country 559 | Jacksonville:us_city 560 | Sacramento:us_city 561 | Tarawa:country_capital 562 | Johnston Atoll:country 563 | Berlin:country_capital 564 | San Juan:country_capital 565 | Spartanburg:us_city 566 | Plymouth:us_city 567 | Pine Bluff:us_city 568 | Yemen:country 569 | Kenya:country 570 | Norwalk:us_city 571 | Russia:country 572 | Miles City:us_city 573 | Windhoek:country_capital 574 | Oklahoma:us_state 575 | Tunis:country_capital 576 | Skopje:country_capital 577 | Colorado Springs:us_city 578 | Papua New Guinea:country 579 | Sarajevo:country_capital 580 | Missouri:us_state 581 | Trenton:us_city 582 | Bermuda:country 583 | Congo:country 584 | Glendale:us_city 585 | San Marino:country_capital 586 | Bissau:country_capital 587 | Antigua:country 588 | Barbuda:country 589 | Chesapeake:us_city 590 | Guadeloupe:country 591 | Dakar:country_capital 592 | Athens:country_capital 593 | Hampton:us_city 594 | Netherlands Antilles:country 595 | Tromelin Island:country 596 | Asuncion:country_capital 597 | Parma:us_city 598 | San Salvador:country_capital 599 | Manama:country_capital 600 | Brazil:country 601 | The Valley:country_capital 602 | Havana:country_capital 603 | Tanzania:country 604 | Miami:us_city 605 | Owensboro:us_city 606 | Salem:us_city 607 | Columbus:us_city 608 | Nuku'alofa:country_capital 609 | 
Billings:us_city 610 | Waterbury:us_city 611 | Minnesota:us_state 612 | Hobbs:us_city 613 | Florence:us_city 614 | Bouvet Island:country 615 | Papeete:country_capital 616 | Roseau:country_capital 617 | Virginia Beach:us_city 618 | Saint Charles:us_city 619 | Luanda:country_capital 620 | Amsterdam:city 621 | West Allis:us_city 622 | Slovenia:country 623 | Bellevue:us_city 624 | Oshkosh:us_city 625 | Anaconda:us_city 626 | Guatemala:country_capital 627 | Springdale:us_city 628 | Boulder:us_city 629 | Alexandria:us_city 630 | Carson City:us_city 631 | Yerevan:country_capital 632 | Fort Lauderdale:us_city 633 | Waukesha:us_city 634 | Philippines:country 635 | Sioux Falls:us_city 636 | Clarksburg:us_city 637 | Seaford:us_city 638 | New Orleans:us_city 639 | Nigeria:country 640 | Arlington:us_city 641 | Flint:us_city 642 | Grand Island:us_city 643 | Fallon:us_city 644 | Racine:us_city 645 | Wyoming:us_state 646 | Vermillion:us_city 647 | Wheeling:us_city 648 | none:country_capital 649 | Montserrat:country 650 | India:country 651 | Belize:country 652 | Kearney:us_city 653 | Howland Island:country 654 | Andorra:country 655 | East Providence:us_city 656 | Namibia:country 657 | Saint Pierre:country 658 | Miquelon:country 659 | Guyana:country 660 | Nebraska:us_state 661 | Wellington:country_capital 662 | Santa Ana:us_city 663 | Swaziland:country 664 | Evansville:us_city 665 | Tulsa:us_city 666 | Dubuque:us_city 667 | Yonkers:us_city 668 | Gulfport:us_city 669 | Fresno:us_city 670 | France:country 671 | Southern Ocean:country 672 | Worcester:us_city 673 | Yakima:us_city 674 | Edmond:us_city 675 | Arctic Ocean:country 676 | Jamestown:country_capital 677 | Akron:us_city 678 | Madagascar:country 679 | Cody:us_city 680 | Greece:country 681 | Fort Worth:us_city 682 | Cape Verde:country 683 | Central African Republic:country 684 | Illinois:us_state 685 | Ethiopia:country 686 | Lawton:us_city 687 | Saipan:country_capital 688 | Yaounde:country_capital 689 | Bellingham:us_city 690 | 
Faroe Islands:country 691 | Seychelles:country 692 | Terre Haute:us_city 693 | Tokyo:country_capital 694 | Slovakia:country 695 | Rabat:country_capital 696 | Trinidad:country 697 | Tobago:country 698 | Belgrade:country_capital 699 | Dallas:us_city 700 | Montevideo:country_capital 701 | Norway:country 702 | Niue:country 703 | Malabo:country_capital 704 | Watertown:us_city 705 | Bismarck:us_city 706 | San Antonio:us_city 707 | North Las Vegas:us_city 708 | Lima:country_capital 709 | Plymouth:us_city 710 | Tegucigalpa:country_capital 711 | Fall River:us_city 712 | Tallinn:country_capital 713 | Port-of-Spain:country_capital 714 | Longyearbyen:country_capital 715 | Covington:us_city 716 | Tonga:country 717 | Gaithersburg:us_city 718 | Kuwait:country_capital 719 | Svalbard:country 720 | Beirut:country_capital 721 | Maseru:country_capital 722 | Uruguay:country 723 | Naperville:us_city 724 | Kingsport:us_city 725 | Green River:us_city 726 | Greenland:country 727 | Liechtenstein:country 728 | Vermont:us_state 729 | Norfolk Island:country 730 | Saint-Pierre:country_capital 731 | Lisbon:country_capital 732 | Bulgaria:country 733 | Waukegan:us_city 734 | Paris:country_capital 735 | California:us_state 736 | Brazzaville:country_capital 737 | Coventry:us_city 738 | Pueblo:us_city 739 | Fayetteville:us_city 740 | Antananarivo:country_capital 741 | Baghdad:country_capital 742 | Ogden:us_city 743 | Amman:country_capital 744 | Budapest:country_capital 745 | Manila:country_capital 746 | Lake Charles:us_city 747 | Waterloo:us_city 748 | China:country 749 | Torshavn:country_capital 750 | Lakewood:us_city 751 | Chattanooga:us_city 752 | Murfreesboro:us_city 753 | Addis Ababa:country_capital 754 | Topeka:us_city 755 | Stockholm:country_capital 756 | Saint Helena:country 757 | Tacoma:us_city 758 | Sitka:us_city 759 | Puerto Rico:country 760 | Medford:us_city 761 | Reykjavik:country_capital 762 | Sumter:us_city 763 | Jarvis Island:country 764 | Helsinki:country_capital 765 | Costa 
Rica:country 766 | Birmingham:us_city 767 | Ottawa:country_capital 768 | Bethlehem:us_city 769 | Brasilia:country_capital 770 | Macon:us_city 771 | Helena:us_city 772 | The Settlement:country_capital 773 | Cheyenne:us_city 774 | Togo:country 775 | Taipei:country_capital 776 | Brunei:country 777 | Utica:us_city 778 | Paterson:us_city 779 | Fort Smith:us_city 780 | Idaho:us_state 781 | Brussels:country_capital 782 | South Carolina:us_state 783 | Saint George's:country_capital 784 | T'bilisi:country_capital 785 | Asheville:us_city 786 | Burma:country 787 | Woodbridge Township:us_city 788 | Stanley:country_capital 789 | Pitcairn Islands:country 790 | Paracel Islands:country 791 | Pacific Ocean:country 792 | Harare:country_capital 793 | Weirton:us_city 794 | Yankton:us_city 795 | Saudi Arabia:country 796 | Garland:us_city 797 | Nauru:country 798 | Ames:us_city 799 | Riverton:us_city 800 | Saint Peters:us_city 801 | Ankara:country_capital 802 | Bowie:us_city 803 | Uzbekistan:country 804 | Freetown:country_capital 805 | Plano:us_city 806 | Mobile:us_city 807 | Williston:us_city 808 | Indonesia:country 809 | Charlotte Amalie:country_capital 810 | Port Louis:country_capital 811 | Stillwater:us_city 812 | Chile:country 813 | Dayton:us_city 814 | Missoula:us_city 815 | United Kingdom:country 816 | Salina:us_city 817 | Fort-de-France:country_capital 818 | Korea:country 819 | Muncie:us_city 820 | Santa Fe:us_city 821 | Cleveland:us_city 822 | Saint Vincent:country 823 | Grenadines:country 824 | Vicksburg:us_city 825 | Sudan:country 826 | Sheridan:us_city 827 | Jeffersontown:us_city 828 | Peoria:us_city 829 | Martinsburg:us_city 830 | Saint Paul:us_city 831 | Wake Island:country 832 | Belmopan:country_capital 833 | Baltimore:us_city 834 | Uganda:country 835 | Newport News:us_city 836 | Riyadh:country_capital 837 | Bangkok:country_capital 838 | Tajikistan:country 839 | Sao Tome:country 840 | Bern:country_capital 841 | New Bedford:us_city 842 | LaFayette:us_city 843 | 
Moroni:country_capital 844 | Syria:country 845 | Hammond:us_city 846 | Camden:us_city 847 | Orem:us_city 848 | Allentown:us_city 849 | Accra:country_capital 850 | Council Bluffs:us_city 851 | Palmyra Atoll:country 852 | Lynn:us_city 853 | Coon Rapids:us_city 854 | Fiji:country 855 | Santo Domingo:country_capital 856 | Bratislava:country_capital 857 | Nouakchott:country_capital 858 | Malawi:country 859 | Londonderry:us_city 860 | Derry:us_city 861 | Burnsville:us_city 862 | Fairbanks:us_city 863 | Mongolia:country 864 | Funafuti:country_capital 865 | Warsaw:country_capital 866 | North Carolina:us_state 867 | Azerbaijan:country 868 | Southaven:us_city 869 | Caracas:country_capital 870 | Maputo:country_capital 871 | North Platte:us_city 872 | Cote d'Ivoire:country 873 | Honolulu:us_city 874 | Kailua:us_city 875 | Rhode Island:us_state 876 | Boulder City:us_city 877 | Ireland:country 878 | Seoul:country_capital 879 | Elgin:us_city 880 | Gresham:us_city 881 | Arizona:us_state 882 | George Town:country_capital 883 | Bahamas:country 884 | Riga:country_capital 885 | Massachusetts:us_state 886 | Corpus Christi:us_city 887 | Romania:country 888 | Savannah:us_city 889 | Oregon:us_state 890 | Rock Springs:us_city 891 | Dothan:us_city 892 | Beckley:us_city 893 | American Samoa:country 894 | Tripoli:country_capital 895 | Madison:us_city 896 | Iowa:us_state 897 | North Providence:us_city 898 | Tokelau:country 899 | Juneau:us_city 900 | Austin:us_city 901 | Seattle:us_city 902 | Hialeah:us_city 903 | Kabul:country_capital 904 | Niger:country 905 | Noumea:country_capital 906 | Shreveport:us_city 907 | Mamoutzou:country_capital 908 | Everett:us_city 909 | Olathe:us_city 910 | Andorra la Vella:country_capital 911 | Indian Ocean:country 912 | Bangui:country_capital 913 | Joliet:us_city 914 | Cranston:us_city 915 | Raleigh:us_city 916 | Morgantown:us_city 917 | Kent:us_city 918 | Jan Mayen:country 919 | Spain:country 920 | Mauritius:country 921 | Port-Vila:country_capital 922 | 
Hutchinson:us_city 923 | Mississippi:us_state 924 | Lithuania:country 925 | Rwanda:country 926 | Kigali:country_capital 927 | Estonia:country 928 | Merrimack:us_city 929 | Sparks:us_city 930 | Virgin Islands:country 931 | Vilnius:country_capital 932 | Long Beach:us_city 933 | Willemstad:country_capital 934 | Cairo:country_capital 935 | Poland:country 936 | Kenosha:us_city 937 | Apia:country_capital 938 | Monaco:country_capital 939 | Casper:us_city 940 | Roswell:us_city 941 | Eagan:us_city 942 | Hopkinsville:us_city 943 | Yugoslavia:country 944 | 945 | -------------------------------------------------------------------------------- /prefixnames.txt: -------------------------------------------------------------------------------- 1 | Dr 2 | Premier 3 | Major 4 | Corporal 5 | King 6 | General 7 | Ms 8 | Gen 9 | Mrs 10 | Sen 11 | Mr 12 | Doctor 13 | St 14 | Prince 15 | Representative 16 | Maj 17 | President 18 | Congressman 19 | Vice 20 | Lt 21 | Senator 22 | -------------------------------------------------------------------------------- /product_names.txt: -------------------------------------------------------------------------------- 1 | Java 2 | IntelliJ 3 | Coke 4 | Coca Cola 5 | Pepsi 6 | Diet Pepsi 7 | Fanta Orange 8 | Dr Pepper 9 | Alka Seltzer 10 | Avanta 11 | Levitra 12 | Mac 13 | Macintosh 14 | iPhone 15 | Jeep 16 | Jeep Cherokee 17 | Ford Thunderbird 18 | Pontiac 19 | Winnebago 20 | Volkswagen 21 | Hula Hoops 22 | Pringles 23 | Ruffles 24 | Kit Kat 25 | Oreo 26 | Eskimo Pie 27 | VAIO 28 | Handycam 29 | Cyber Shot 30 | DigiMax 31 | Ford 32 | Chevie 33 | -------------------------------------------------------------------------------- /src/.properties: -------------------------------------------------------------------------------- 1 | { 2 | #format : #tonel 3 | } -------------------------------------------------------------------------------- /src/KBSnlp.st: -------------------------------------------------------------------------------- 1 | PackageManifest 
subclass: #ManifestKBSnlp instanceVariableNames: '' classVariableNames: '' poolDictionaries: '' category: 'KBSnlp'! !ManifestKBSnlp commentStamp: 'MarkWatson 5/19/2017 06:24' prior: 0! Copyright 2005-2017 Mark Watson. All rights reserved. Licensed for use under the MIT license with attribution required. See: https://github.com/mark-watson/nlp_smalltalk ! "-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- "! ManifestKBSnlp class instanceVariableNames: ''! !ManifestKBSnlp class methodsFor: 'code-critics' stamp: 'MarkWatson 5/14/2017 21:23'! ruleRBOnlyReadOrWrittenTemporaryRuleV1FalsePositive ^ #(#(#(#RGMethodDefinition #(#'NLPsummarizer class' #summarize: #true)) #'2017-05-14T21:23:23.063039-07:00') )! ! !ManifestKBSnlp class methodsFor: 'code-critics' stamp: 'MarkWatson 5/14/2017 21:25'! ruleRBToDoCollectRuleV1FalsePositive ^ #(#(#(#RGMethodDefinition #(#'NLPsummarizer class' #summarize: #true)) #'2017-05-14T21:25:54.536453-07:00') )! ! Object subclass: #NLPcategories instanceVariableNames: '' classVariableNames: '' poolDictionaries: '' category: 'KBSnlp'! !NLPcategories commentStamp: 'MarkWatson 5/19/2017 06:25' prior: 0! A NLPcategories is class to categorize text. Copyright 2005-2017 Mark Watson. All rights reserved. Licensed for use under the MIT license with attribution required. See: https://github.com/mark-watson/nlp_smalltalk ! "-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- "! NLPcategories class instanceVariableNames: ''! !NLPcategories class methodsFor: 'classify' stamp: 'MarkWatson 1/13/2015 14:09'! classify: text "classify text in a string" | tokens categories scores num hash numTokens results cutoff | tokens := NLPtagger tokenize: (text , 'XXXXXX'). categories := (Smalltalk at: #NlpCategoryHash) keys. num := categories size. numTokens := tokens size - 1. scores := Array new: num. 1 to: num do: [ :i | scores at: i put: 0. hash := (Smalltalk at: #NlpCategoryHash) at: (categories at: i). 
1 to: numTokens do: [ :j | (hash includesKey: (tokens at: j)) ifTrue: [scores at: i put: ((scores at: i) + (hash at: (tokens at: j)))] ]. hash := (Smalltalk at: #NlpCategory2gramHash) at: (categories at: i). 1 to: numTokens do: [ :j | (hash includesKey: ((tokens at: j) , ' ' , (tokens at: j + 1))) ifTrue: [scores at: i put: ((scores at: i)+ ((hash at: (tokens at: j) , ' ' , (tokens at: j + 1)) * 8))]]]. results := SortedCollection sortBlock: [:c1 :c2 | (c1 at:1) > (c2 at:1)]. 1 to: num do: [ :i | |a| a := (Array new: 2). a at: 1 put: (scores at:i); at: 2 put: (categories at: i). results add: a ]. cutoff := ((results at: 1) at: 1) / 2. results := results select: [ :x | (x at: 1) > cutoff ]. ^results. ! ! !NLPcategories class methodsFor: 'classify' stamp: 'MarkWatson 1/13/2015 13:59'! initializeCategoryHash "requires NeoJSON" Smalltalk at: #NlpCategoryHash put: (NeoJSONReader fromString: (FileStream fileNamed: './nlp_smalltalk/tags.json') contentsOfEntireFile). Smalltalk at: #NlpCategory2gramHash put: (NeoJSONReader fromString: (FileStream fileNamed: './nlp_smalltalk/tags_2gram.json') contentsOfEntireFile)! ! Object subclass: #NLPentities instanceVariableNames: '' classVariableNames: '' poolDictionaries: '' category: 'KBSnlp'! !NLPentities commentStamp: 'MarkWatson 5/19/2017 06:25' prior: 0! A NLPentities is a class to find people's names, company names, place names, etc. in text. Copyright 2005-2017 Mark Watson. All rights reserved. Licensed for use under the MIT license with attribution required. See: https://github.com/mark-watson/nlp_smalltalk ! "-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- "! NLPentities class instanceVariableNames: ''! !NLPentities class methodsFor: 'entityDetection' stamp: 'MarkWatson 1/13/2015 10:32'! entities: aString "return a Dictionary of entities (keys type, values Sets" | temp result | result := Dictionary new. temp := NLPentities entityHelper: (Smalltalk at: #NLPcompanyNames) text: aString. 
temp size > 0 ifTrue: [ result at: 'companies' put: temp ]. temp := NLPentities entityHelper: (Smalltalk at: #NLPproductNames) text: aString. temp size > 0 ifTrue: [ result at: 'products' put: temp ]. temp := NLPentities entityHelper: (Smalltalk at: #NLPplaceNames) text: aString. temp size > 0 ifTrue: [ result at: 'places' put: temp ]. temp := NLPentities humanNameHelper: aString. " fix: store person names under 'people'; the original reused the 'places' key and overwrote the place-name results " temp size > 0 ifTrue: [ result at: 'people' put: temp ]. ^ result! ! !NLPentities class methodsFor: 'entityDetection' stamp: 'MarkWatson 1/13/2015 10:32'! humanNameHelper: aString "this is a helper method for detecting person names only (entityHelper:text: handles everything **but** person names)" | tokens num results | results := Set new. tokens := NLPtagger tokenize: aString , ' xxxxx yyyyy zzzzz'. num := tokens size - 3. " account for the 3 fake tokens at the end " " NOTE(review): reassigning i inside the to:do: block does not skip loop iterations in Smalltalk (modern Pharo rejects stores into block arguments entirely), so shorter overlapping names may also be collected - confirm whether a whileTrue: loop was intended " 1 to: num do: [ :i | ((Smalltalk at: #NLPfirstNames) includes: (tokens at: i)) ifTrue: [ (((Smalltalk at: #NLPfirstNames) includes: (tokens at: i + 1)) and: ((Smalltalk at: #NLPlastNames) includes: (tokens at: i + 2))) ifTrue: [ results add: (tokens at: i) , ' ' , (tokens at: i + 1) , ' ' , (tokens at: i + 2). i := i + 2 ] ifFalse: [ ((Smalltalk at: #NLPlastNames) includes: (tokens at: i + 1)) ifTrue: [ results add: (tokens at: i) , ' ' , (tokens at: i + 1). i := i + 1 ] ] ] ]. ^ results! ! !NLPentities class methodsFor: 'entityDetection' stamp: 'MarkWatson 1/13/2015 10:33'! entityHelper: entitySet text: aString "this is a helper method for everything **but** person names" | tokens num ngram2 ngram3 results | results := Set new. tokens := NLPtagger tokenize: aString , ' xxxxx yyyyy zzzzz'. num := tokens size - 3. " account for the 3 fake tokens at the end " 1 to: num do: [ :i | ngram2 := (tokens at: i) , ' ' , (tokens at: i + 1). ngram3 := ngram2 , ' ' , (tokens at: i + 2). "Transcript show: ngram2; cr."
(entitySet includes: ngram3) ifTrue: [ results add: ngram3 ] ifFalse: [ (entitySet includes: ngram2) ifTrue: [ results add: ngram2 ] ifFalse: [ (entitySet includes: (tokens at: i)) ifTrue: [ results add: (tokens at: i) ] ] ] ]. ^ results! ! !NLPentities class methodsFor: 'entityDetection' stamp: 'MarkWatson 1/12/2015 15:43'! fileToDictionary: filePath "Read the named data file (one entry per line; any text after a ':' such as Cairo:country_capital is stripped) and answer a Set of the entries" | read count aLine strm set | Transcript show: 'Processing file ' , filePath; cr. set := Set new. read := (MultiByteFileStream fileNamed: filePath) readOnly. count := 0. [read atEnd] whileFalse: [count := count + 1. aLine := read upTo: Character lf. "Mac: use lf, Windows: use cr ???" "look for a space character: " ((aLine indexOf: $:) > 0) ifTrue: [ strm := ReadStream on: aLine. aLine := strm upTo: $:]. set add: aLine]. read close. ^set ! ! !NLPentities class methodsFor: 'entityDetection' stamp: 'MarkWatson 5/16/2017 20:36'! initializeEntities "load entity name data" " Note: place name lines of the form: Cairo:country_capital Fixed in fileToDictionary " Smalltalk at: #NLPcompanyNames put: (NLPentities fileToDictionary: './nlp_smalltalk/company_names.txt'). Smalltalk at: #NLPfirstNames put: (NLPentities fileToDictionary: './nlp_smalltalk/firstnames.txt'). Smalltalk at: #NLPlastNames put: (NLPentities fileToDictionary: './nlp_smalltalk/lastnames.txt'). Smalltalk at: #NLPhonorifics put: (NLPentities fileToDictionary: './nlp_smalltalk/honorifics.txt'). Smalltalk at: #NLPprefixNames put: (NLPentities fileToDictionary: './nlp_smalltalk/prefixnames.txt'). Smalltalk at: #NLPplaceNames put: (NLPentities fileToDictionary: './nlp_smalltalk/placenames.txt'). Smalltalk at: #NLPproductNames put: (NLPentities fileToDictionary: './nlp_smalltalk/product_names.txt'). " also read in data we will need for sentence segmentation: " Smalltalk at: #NLPtokensWithPeriods put: (NLPentities fileToDictionary: './nlp_smalltalk/tokensWithPeriods.txt'). " fix: the repository ships tokensWithPeriods.txt; 'tokens_with_periods.txt' does not exist "! !
Object subclass: #NLPsentences instanceVariableNames: '' classVariableNames: '' poolDictionaries: '' category: 'KBSnlp'! !NLPsentences commentStamp: 'MarkWatson 5/19/2017 06:26' prior: 0! A class to segment text into sentences. Copyright 2005-2017 Mark Watson. All rights reserved. Licensed for use under the MIT license with attribution required. See: https://github.com/mark-watson/nlp_smalltalk ! "-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- "! NLPsentences class instanceVariableNames: ''! !NLPsentences class methodsFor: 'initialize' stamp: 'MarkWatson 5/18/2017 16:42'! loadData "Load tokens that normally contain periods" | aSet count reverseDictionary forwardDictionary | count := 0. reverseDictionary := Dictionary new. forwardDictionary := Dictionary new. aSet := NLPsentences fileToSet: './nlp_smalltalk/tokensWithPeriods.txt'. Smalltalk at: #NLPtokensWithPeriods put: aSet. ^ 'tokens with periods data loaded'! ! !NLPsentences class methodsFor: 'segment' stamp: 'MarkWatson 5/18/2017 16:33'! sentences: someText "tokenize a string into individual sentences" | tokens aSet lastToken currentSentence allSentences | aSet := Smalltalk at: #NLPtokensWithPeriods. tokens := OrderedCollection new. (NLPsentences tokenizeLeavePeriods: someText) do: [ :token | (token includesSubstring: '.') not ifTrue: [ tokens add: token ] ifFalse: [ (aSet includes: token) ifFalse: [ tokens add: (token copyWithRegex: '\.' matchesReplacedWith: ''). tokens add: '.' ] ifTrue: [ tokens add: token ] ] ]. currentSentence := OrderedCollection new. allSentences := OrderedCollection new. lastToken := ''. Transcript show: tokens; cr. tokens do: [ :token | Transcript show: token; cr. currentSentence add: token. ((token = '.' and: lastToken isAllDigits not) or: token = '?') ifTrue: [ allSentences addLast: currentSentence. currentSentence := OrderedCollection new ]. lastToken := token ]. currentSentence isNotEmpty ifTrue: [ allSentences addLast: currentSentence ]. ^ allSentences! ! 
!NLPsentences class methodsFor: 'utiities' stamp: 'MarkWatson 5/18/2017 13:42'! fileToSet: filePath "Read file, create Set with elements being each line in file" | read aLine set | Transcript show: 'Processing file ' , filePath; cr. set := Set new. read := (MultiByteFileStream fileNamed: filePath) readOnly. [ read atEnd ] whileFalse: [ aLine := read upTo: Character lf. "Mac: use lf, Windows: use cr ???" set add: aLine ]. read close. ^ set! ! !NLPsentences class methodsFor: 'utiities' stamp: 'MarkWatson 5/18/2017 15:31'! tokenizeLeavePeriods: wordsInAString "tokenizes a string" ^ wordsInAString findTokens: ' ;:,<>[]{}!! @#$%^&*()?' keep: ';:.,<>[]{}!!$?' " keep CR in this string!!!! "! ! Object subclass: #NLPsummarizer instanceVariableNames: '' classVariableNames: '' poolDictionaries: '' category: 'KBSnlp'! !NLPsummarizer commentStamp: 'MarkWatson 5/19/2017 06:26' prior: 0! A class to classify English text into categories. Copyright 2005-2017 Mark Watson. All rights reserved. Licensed for use under the MIT license with attribution required. See: https://github.com/mark-watson/nlp_smalltalk ! "-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- "! NLPsummarizer class instanceVariableNames: ''! !NLPsummarizer class methodsFor: 'summary' stamp: 'MarkWatson 5/15/2017 09:34'! summarize: text "extractive summarizer" | sentences sentenceScores tokens scoredCategories hash x bestIndices | scoredCategories := NLPcategories classify: text. sentences := NLPtagger sentences: text. sentenceScores := Array new: sentences size. 1 to: sentences size do: [ :i | sentenceScores at: i put: 0. tokens := sentences at: i. Transcript show: (sentences at: i); cr. scoredCategories do: [ :sc | hash := (Smalltalk at: #NlpCategoryHash) at: (sc at: 2). tokens do: [ :token | (hash includesKey: token) ifTrue: [ x := hash at: token. sentenceScores at: i put: (sentenceScores at: i) + (sc at: 1) ] ] ] ]. bestIndices := sentenceScores collectWithIndex: [ :score :i | {score. i} ]. 
Transcript show: 'sentence scoring: '; show: bestIndices; cr. bestIndices := bestIndices select: [ :p | (p at: 1) > 2 ]. ^ bestIndices collect: [ :p | Character space join: (sentences at: (p at: 2)) ]! ! Object subclass: #NLPtagger instanceVariableNames: '' classVariableNames: 'NLPlexicon' poolDictionaries: '' category: 'KBSnlp'! !NLPtagger commentStamp: 'MarkWatson 5/19/2017 06:27' prior: 0! NLP tagger converted to Squeak. A class that implements an NLP tagger. Copyright 2005-2017 Mark Watson. All rights reserved. Licensed for use under the MIT license with attribution required. See: https://github.com/mark-watson/nlp_smalltalk ! "-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- "! NLPtagger class instanceVariableNames: ''! !NLPtagger class methodsFor: 'segmentation' stamp: 'MarkWatson 5/16/2017 20:43'! sentences: data "Handle either a string or array of tokens. Limitations: 1. This code does not currently handle special characters like — 2. Periods in numbers: only check previous character, not the next so a sentence ending with e.g., 2. will not be handled correctly. " | tokens lastToken currentSentence allSentences token | tokens := (data isMemberOf: ByteString) ifTrue: [ NLPtagger tokenize: data ] ifFalse: [ data ]. " fix: branches are now blocks; with plain parenthesized arguments both branches were evaluated eagerly, so tokenize: was also sent to already-tokenized input " currentSentence := OrderedCollection new. allSentences := OrderedCollection new. lastToken := ''. Transcript show: tokens; cr. tokens do: [ :token1 | " fix: the original sent #get: to the #NLPtokensWithPeriods Set, which is not a Set message (runtime doesNotUnderstand); a Set member is equal to the probe anyway, so the token is kept as-is - NOTE(review): if a Dictionary mapping (e.g. 'Dr.' -> 'Dr') was intended, confirm against loadData " token := token1. Transcript show: token; cr. currentSentence add: token. ((token = '.' and: lastToken isAllDigits not) or: token = '?') ifTrue: [ allSentences addLast: currentSentence. currentSentence := OrderedCollection new ]. lastToken := token ]. currentSentence isNotEmpty ifTrue: [ allSentences addLast: currentSentence ]. ^ allSentences! ! !NLPtagger class methodsFor: 'tagging' stamp: 'MW 1/27/2008 12:53'!
tag: words "tag an ordered collection of words, returning an ordered collection of corresponding tags" | lex tags tag count i word lastWord lastTag | tags := OrderedCollection new. lex := Smalltalk at: #NLPlexicon. words do: [:aWord | tag := lex at: aWord ifAbsent: [nil]. tag isNil ifFalse: [tag := tag at: 1] ifTrue: [tag := 'NN']. " the default tag " tags add: tag]. " Apply transformation rules: " lastWord := ''. lastTag := ''. i := 0. count := words size. [i < count] whileTrue: [i := i + 1. word := words at: i. tag := tags at: i. " reuse tag variable " " First, handle all rules for i > 1 " i > 1 ifTrue: [" rule 1: DT, {VBD | VBP} --> DT, NN " lastTag = 'DT' & (tag = 'VBD' | (tag = 'VBP') | (tag = 'VB')) ifTrue: [tags at: i put: 'NN']. tag size > 1 ifTrue: [" rule 6: convert a noun to a verb if the preceeding work is 'would' " (tag at: 1) = $N & ((tag at: 2) = $N) & (lastWord asLowercase = 'would') ifTrue: [tags at: i put: 'VB']]]. " Now, handle the remaining rules that are valid for i = 1: " " rule 2: convert a noun to a number (CD) if '.' appears in the word" (word findString: '.') > 0 ifTrue: [(tag at: 1) = $N ifTrue: [tags at: i put: 'CD']]. " not working - tokenizer tosses '.' characters " " rule 3: convert a noun to a past participle if words[i] ends with 'ed' " (tag at: 1) = $N & (word endsWith: 'ed') ifTrue: [tags at: i put: 'VBN']. " rule 4: convert any type to adverb if it ends in 'ly' " (word endsWith: 'ly') ifTrue: [tags at: i put: 'RB']. " rule 5: convert a common noun (NN or NNS) to a adjective if it ends with 'al' " (tag at: 1) = $N & (word endsWith: 'al') ifTrue: [tags at: i put: 'JJ']. " rule 7: if a word has been categorized as a common noun and it ends with 's;, " " then set its type to plural common noun (NNS) " tag = 'NN' & (word endsWith: 's') ifTrue: [tags at: i put: 'NNS']. " rule 8: convert a common noun to a present prticiple verb (i.e., a gerand) " (tag at: 1) = $N & (word endsWith: 'ing') ifTrue: [tags at: i put: 'VBG']. 
lastWord := word. lastTag := tag]. ^tags! ! !NLPtagger class methodsFor: 'tagging' stamp: 'MW 1/27/2008 13:21'! pptag: wordString "returns a string of word/tag ..." | words tags write size count | words := NLPtagger tokenize: wordString. tags := NLPtagger tag: words. write := TextStream on: String new. size := words size. count := 1. [count <= size] whileTrue: [ write nextPutAll: (words at: count). write nextPutAll: '/'. write nextPutAll: (tags at: count). write nextPutAll: ' '. count := count + 1]. ^write contents string! ! !NLPtagger class methodsFor: 'tagging' stamp: 'MarkWatson 5/18/2017 16:37'! initializeLexicon "Read data/lexicon.txt and build in memory lexicon" | read count strm aLine word taglist token lex | lex := Dictionary new. read := (FileStream fileNamed: './nlp_smalltalk/lexicon.txt') readOnly. count := 0. [ read atEnd ] whileFalse: [ count := count + 1. aLine := read upTo: Character lf. "Mac: use lf, Windows: use cr ???" strm := ReadStream on: aLine. word := strm upTo: Character space. taglist := OrderedCollection new. [ strm atEnd ] whileFalse: [ token := strm upTo: Character space. taglist add: token ]. "Transcript show: word; cr." "Transcript show: taglist printString; cr." lex at: word put: taglist ]. read close. Smalltalk at: #NLPlexicon put: lex! ! !NLPtagger class methodsFor: 'tokenization' stamp: 'MarkWatson 5/15/2017 10:11'! tokenize: wordsInAString "tokenizes a string" ^ wordsInAString findTokens: ' ;:.,<>[]{}!! @#$%^&*()?' keep: ';:.,<>[]{}!!$?' " keep CR in this string!!!! "! ! -------------------------------------------------------------------------------- /src/KBSnlp/ManifestKBSnlp.class.st: -------------------------------------------------------------------------------- 1 | " 2 | Copyright 2005-2017 Mark Watson. All rights reserved. Licensed for use under the MIT license with attribution required. 
3 | 4 | See: https://github.com/mark-watson/nlp_smalltalk 5 | 6 | " 7 | Class { 8 | #name : #ManifestKBSnlp, 9 | #superclass : #PackageManifest, 10 | #category : #KBSnlp 11 | } 12 | 13 | { #category : #'code-critics' } 14 | ManifestKBSnlp class >> ruleRBOnlyReadOrWrittenTemporaryRuleV1FalsePositive [ 15 | ^ #(#(#(#RGMethodDefinition #(#'NLPsummarizer class' #summarize: #true)) #'2017-05-14T21:23:23.063039-07:00') ) 16 | ] 17 | 18 | { #category : #'code-critics' } 19 | ManifestKBSnlp class >> ruleRBToDoCollectRuleV1FalsePositive [ 20 | ^ #(#(#(#RGMethodDefinition #(#'NLPsummarizer class' #summarize: #true)) #'2017-05-14T21:25:54.536453-07:00') ) 21 | ] 22 | -------------------------------------------------------------------------------- /src/KBSnlp/NLPcategories.class.st: -------------------------------------------------------------------------------- 1 | " 2 | A NLPcategories is class to categorize text. 3 | 4 | Copyright 2005-2017 Mark Watson. All rights reserved. Licensed for use under the MIT license with attribution required. 5 | 6 | See: https://github.com/mark-watson/nlp_smalltalk 7 | 8 | " 9 | Class { 10 | #name : #NLPcategories, 11 | #superclass : #Object, 12 | #category : #KBSnlp 13 | } 14 | 15 | { #category : #classify } 16 | NLPcategories class >> classify: text [ 17 | "classify text in a string" 18 | 19 | | tokens categories scores num hash numTokens results cutoff | 20 | tokens := NLPtagger tokenize: (text , 'XXXXXX'). 21 | categories := (Smalltalk at: #NlpCategoryHash) keys. 22 | num := categories size. 23 | numTokens := tokens size - 1. 24 | scores := Array new: num. 25 | 1 to: num do: [ :i | 26 | scores at: i put: 0. 27 | hash := (Smalltalk at: #NlpCategoryHash) at: (categories at: i). 28 | 1 to: numTokens do: [ :j | 29 | (hash includesKey: (tokens at: j)) 30 | ifTrue: [scores at: i put: ((scores at: i) + (hash at: (tokens at: j)))] ]. 31 | hash := (Smalltalk at: #NlpCategory2gramHash) at: (categories at: i). 
32 | 1 to: numTokens do: [ :j | 33 | (hash includesKey: ((tokens at: j) , ' ' , (tokens at: j + 1))) 34 | ifTrue: [scores at: i put: ((scores at: i)+ ((hash at: (tokens at: j) , ' ' , (tokens at: j + 1)) * 8))]]]. 35 | results := SortedCollection sortBlock: [:c1 :c2 | (c1 at:1) > (c2 at:1)]. 36 | 1 to: num do: [ :i | |a| a := (Array new: 2). a at: 1 put: (scores at:i); at: 2 put: (categories at: i). results add: a ]. 37 | cutoff := ((results at: 1) at: 1) / 2. 38 | results := results select: [ :x | (x at: 1) > cutoff ]. 39 | ^results. 40 | 41 | ] 42 | 43 | { #category : #classify } 44 | NLPcategories class >> initializeCategoryHash [ 45 | "requires NeoJSON" 46 | | aDir | 47 | aDir := FileSystem disk workingDirectory. 48 | Smalltalk at: #NlpCategoryHash 49 | put: (NeoJSONReader fromString: ((aDir / './pharo-local/iceberg/mark-watson/nlp_smalltalk/tags.json') readStream) contents). 50 | Smalltalk at: #NlpCategory2gramHash 51 | put: (NeoJSONReader fromString: ((aDir / './pharo-local/iceberg/mark-watson/nlp_smalltalk/tags_2gram.json') readStream) contents) 52 | ] 53 | -------------------------------------------------------------------------------- /src/KBSnlp/NLPentities.class.st: -------------------------------------------------------------------------------- 1 | " 2 | A NLPentities is a class to find people's names, company names, place names, etc. in text. 3 | 4 | Copyright 2005-2017 Mark Watson. All rights reserved. Licensed for use under the MIT license with attribution required. 5 | 6 | See: https://github.com/mark-watson/nlp_smalltalk 7 | 8 | " 9 | Class { 10 | #name : #NLPentities, 11 | #superclass : #Object, 12 | #category : #KBSnlp 13 | } 14 | 15 | { #category : #entityDetection } 16 | NLPentities class >> entities: aString [ 17 | "return a Dictionary of entities (keys type, values Sets" 18 | 19 | | temp result | 20 | result := Dictionary new. 21 | temp := NLPentities entityHelper: (Smalltalk at: #NLPcompanyNames) text: aString. 
22 | temp size > 0 23 | ifTrue: [ result at: 'companies' put: temp ]. 24 | temp := NLPentities entityHelper: (Smalltalk at: #NLPproductNames) text: aString. 25 | temp size > 0 26 | ifTrue: [ result at: 'products' put: temp ]. 27 | temp := NLPentities entityHelper: (Smalltalk at: #NLPplaceNames) text: aString. 28 | temp size > 0 29 | ifTrue: [ result at: 'places' put: temp ]. 30 | temp := NLPentities humanNameHelper: aString. 31 | temp size > 0 32 | ifTrue: [ result at: 'people' put: temp ]. " fix: was 'places', which overwrote the place-name results " 33 | ^ result 34 | ] 35 | 36 | { #category : #entityDetection } 37 | NLPentities class >> entityHelper: entitySet text: aString [ 38 | "this is a helper method for everything **but** person names" 39 | 40 | | tokens num ngram2 ngram3 results | 41 | results := Set new. 42 | tokens := NLPtagger tokenize: aString , ' xxxxx yyyyy zzzzz'. 43 | num := tokens size - 3. " account for the 3 fake tokens at the end " 44 | 1 to: num do: [ :i | 45 | ngram2 := (tokens at: i) , ' ' , (tokens at: i + 1). 46 | ngram3 := ngram2 , ' ' , (tokens at: i + 2). "Transcript show: ngram2; cr." 47 | (entitySet includes: ngram3) 48 | ifTrue: [ results add: ngram3 ] 49 | ifFalse: [ 50 | (entitySet includes: ngram2) 51 | ifTrue: [ results add: ngram2 ] 52 | ifFalse: [ 53 | (entitySet includes: (tokens at: i)) 54 | ifTrue: [ results add: (tokens at: i) ] ] ] ]. 55 | ^ results 56 | ] 57 | 58 | { #category : #entityDetection } 59 | NLPentities class >> fileToDictionary: filePath [ 60 | 61 | "Read the named data file (one entry per line; text after a ':' is stripped) and answer a Set of the entries" 62 | 63 | | aDir read count aLine strm set | 64 | 65 | Transcript show: 'Processing file ' , filePath; cr. 66 | 67 | set := Set new. 68 | aDir := FileSystem disk workingDirectory. 69 | "read := (MultiByteFileStream fileNamed: filePath) readOnly." 70 | read := (aDir / filePath) readStream. 71 | "read := (ZnCharacterReadStream on: read2 encoding: #utf8)." 72 | 73 | count := 0. 74 | [read atEnd] 75 | whileFalse: [count := count + 1. 76 | aLine := read upTo: Character lf.
"Mac: use lf, Windows: use cr ???" 77 | "look for a space character: " 78 | ((aLine indexOf: $:) > 0) 79 | ifTrue: [ 80 | strm := ReadStream on: aLine. 81 | aLine := strm upTo: $:]. 82 | set add: aLine]. 83 | read close. 84 | ^set 85 | 86 | ] 87 | 88 | { #category : #entityDetection } 89 | NLPentities class >> humanNameHelper: aString [ 90 | "this is a helper method for detecting person names only (entityHelper:text: handles everything **but** person names)" 91 | 92 | | tokens num results | 93 | results := Set new. 94 | tokens := NLPtagger tokenize: aString , ' xxxxx yyyyy zzzzz'. 95 | num := tokens size - 3. " account for the 3 fake tokens at the end " 96 | 1 to: num do: [ :i | 97 | ((Smalltalk at: #NLPfirstNames) includes: (tokens at: i)) 98 | ifTrue: [ 99 | (((Smalltalk at: #NLPfirstNames) includes: (tokens at: i + 1)) 100 | and: ((Smalltalk at: #NLPlastNames) includes: (tokens at: i + 2))) 101 | ifTrue: [ 102 | results add: (tokens at: i) , ' ' , (tokens at: i + 1) , ' ' , (tokens at: i + 2). 103 | i := i + 2 ] 104 | ifFalse: [ 105 | ((Smalltalk at: #NLPlastNames) includes: (tokens at: i + 1)) 106 | ifTrue: [ 107 | results add: (tokens at: i) , ' ' , (tokens at: i + 1). 108 | i := i + 1 ] ] ] ]. 109 | ^ results 110 | ] 111 | 112 | { #category : #entityDetection } 113 | NLPentities class >> initializeEntities [ 114 | "load entity name data" 115 | 116 | " Note: place name lines of the form: Cairo:country_capital Fixed in fileToDictionary " 117 | | repo path | 118 | repo := IceRepository registeredRepositoryIncludingPackage: (self class) package. "NOTE(review): self is already a class here, so (self class) is the metaclass - confirm whether 'self package' was intended" 119 | path := (repo location) asString . 120 | 121 | Smalltalk 122 | at: #NLPcompanyNames 123 | put: (NLPentities fileToDictionary: path , '/company_names.txt'). 124 | Smalltalk 125 | at: #NLPfirstNames 126 | put: (NLPentities fileToDictionary: path , '/firstnames.txt'). " fix: was './firstnames.txt', which built the malformed path '...nlp_smalltalk./firstnames.txt' " 127 | Smalltalk 128 | at: #NLPlastNames 129 | put: (NLPentities fileToDictionary: path , '/lastnames.txt'). " fix: was './lastnames.txt', same malformed-path problem "
130 | Smalltalk 131 | at: #NLPhonorifics 132 | put: (NLPentities fileToDictionary: path , '/honorifics.txt'). 133 | Smalltalk 134 | at: #NLPprefixNames 135 | put: (NLPentities fileToDictionary: path , '/prefixnames.txt'). " fix: the 'path ,' prefix was missing, unlike every sibling line " 136 | Smalltalk 137 | at: #NLPplaceNames 138 | put: (NLPentities fileToDictionary: path , '/placenames.txt'). " fix: the 'path ,' prefix was missing " 139 | Smalltalk 140 | at: #NLPproductNames 141 | put: (NLPentities fileToDictionary: path , '/product_names.txt'). 142 | 143 | " also read in data we will need for sentence segmentation: " 144 | Smalltalk 145 | at: #NLPtokensWithPeriods 146 | put: (NLPentities fileToDictionary: path , '/tokensWithPeriods.txt'). 147 | ] 148 | -------------------------------------------------------------------------------- /src/KBSnlp/NLPsentences.class.st: -------------------------------------------------------------------------------- 1 | " 2 | A class to segment text into sentences. 3 | 4 | Copyright 2005-2017 Mark Watson. All rights reserved. Licensed for use under the MIT license with attribution required. 5 | 6 | See: https://github.com/mark-watson/nlp_smalltalk 7 | 8 | " 9 | Class { 10 | #name : #NLPsentences, 11 | #superclass : #Object, 12 | #category : #KBSnlp 13 | } 14 | 15 | { #category : #utiities } 16 | NLPsentences class >> fileToSet: filePath [ 17 | "Read file, create Set with elements being each line in file" 18 | 19 | | read aLine set aDir | 20 | Transcript 21 | show: 'Processing file ' , filePath; 22 | cr. 23 | set := Set new. 24 | "read := (MultiByteFileStream fileNamed: filePath) readOnly." 25 | aDir := FileSystem disk workingDirectory. 26 | read := (aDir / filePath) readStream. " fix: was hard-coded to .../nlp_smalltalk/lexicon.txt, ignoring the filePath argument entirely " 27 | 28 | [ read atEnd ] 29 | whileFalse: [ aLine := read upTo: Character lf. "Mac: use lf, Windows: use cr ???" 30 | set add: aLine ]. 31 | read close.
32 | ^ set 33 | ] 34 | 35 | { #category : #initialize } 36 | NLPsentences class >> loadData [ 37 | "Load tokens that normally contain periods into the global #NLPtokensWithPeriods; answers a status string" 38 | 39 | | aSet count reverseDictionary forwardDictionary | 40 | count := 0. "NOTE(review): count, reverseDictionary and forwardDictionary are assigned but never used below — apparently dead code" 41 | reverseDictionary := Dictionary new. 42 | forwardDictionary := Dictionary new. 43 | aSet := NLPsentences fileToSet: './pharo-local/iceberg/mark-watson/nlp_smalltalk/tokensWithPeriods.txt'. 44 | Smalltalk at: #NLPtokensWithPeriods put: aSet. 45 | ^ 'tokens with periods data loaded' 46 | ] 47 | 48 | { #category : #segment } 49 | NLPsentences class >> sentences: someText [ 50 | "tokenize a string into individual sentences; answers an OrderedCollection of sentences, each itself an OrderedCollection of token strings. Requires #NLPtokensWithPeriods to be loaded first (see loadData)" 51 | 52 | | tokens aSet lastToken currentSentence allSentences | 53 | aSet := Smalltalk at: #NLPtokensWithPeriods. 54 | tokens := OrderedCollection new. 55 | (NLPsentences tokenizeLeavePeriods: someText) 56 | do: [ :token | 57 | (token includesSubstring: '.') not 58 | ifTrue: [ tokens add: token ] 59 | ifFalse: [ (aSet includes: token) 60 | ifFalse: [ tokens add: (token copyWithRegex: '\.' matchesReplacedWith: ''). "split the period out into its own '.' token" 61 | tokens add: '.' ] 62 | ifTrue: [ tokens add: token ] ] ]. "known abbreviation — keep the period inside the token" 63 | currentSentence := OrderedCollection new. 64 | allSentences := OrderedCollection new. 65 | lastToken := ''. 66 | Transcript 67 | show: tokens; 68 | cr. "NOTE(review): debug output left in — consider removing" 69 | tokens 70 | do: [ :token | 71 | Transcript 72 | show: token; 73 | cr. 74 | currentSentence add: token. 75 | ((token = '.' and: lastToken isAllDigits not) or: token = '?') "sentence ends at '.' (unless preceded by a number, e.g. '3.5') or at '?'. NOTE(review): and:/or: receive pre-evaluated expressions, not blocks — relies on Boolean>>value; confirm on the target dialect" 76 | ifTrue: [ allSentences addLast: currentSentence. 77 | currentSentence := OrderedCollection new ]. 78 | lastToken := token ]. 79 | currentSentence isNotEmpty "flush any trailing partial sentence" 80 | ifTrue: [ allSentences addLast: currentSentence ]. 81 | ^ allSentences 82 | ] 83 | 84 | { #category : #utiities } 85 | NLPsentences class >> tokenizeLeavePeriods: wordsInAString [ 86 | "tokenizes a string, keeping periods attached to tokens (note: '.' is deliberately absent from the separator list, unlike NLPtagger class >> tokenize:)" 87 | 88 | ^ wordsInAString 89 | findTokens: 90 | ' ;:,<>[]{}! 91 | @#$%^&*()?' 92 | keep: ';:.,<>[]{}!$?' " keep CR in this string!!
" 93 | ] 94 | -------------------------------------------------------------------------------- /src/KBSnlp/NLPsummarizer.class.st: -------------------------------------------------------------------------------- 1 | " 2 | A class that produces extractive summaries of English text by selecting the highest-scoring sentences. 3 | 4 | Copyright 2005-2017 Mark Watson. All rights reserved. Licensed for use under the MIT license with attribution required. 5 | 6 | See: https://github.com/mark-watson/nlp_smalltalk 7 | 8 | " 9 | Class { 10 | #name : #NLPsummarizer, 11 | #superclass : #Object, 12 | #category : #KBSnlp 13 | } 14 | 15 | { #category : #summary } 16 | NLPsummarizer class >> summarize: text [ 17 | "extractive summarizer: scores each sentence by the category scores of its tokens and answers the sentences scoring above a fixed threshold, re-joined into strings" 18 | 19 | | sentences sentenceScores tokens scoredCategories hash x bestIndices | 20 | scoredCategories := NLPcategories classify: text. 21 | sentences := NLPtagger sentences: text. 22 | sentenceScores := Array new: sentences size. 23 | 1 to: sentences size do: [ :i | 24 | sentenceScores at: i put: 0. 25 | tokens := sentences at: i. 26 | Transcript 27 | show: (sentences at: i); 28 | cr. "NOTE(review): debug output left in" 29 | scoredCategories 30 | do: [ :sc | 31 | hash := (Smalltalk at: #NlpCategoryHash) at: (sc at: 2). "NOTE(review): global is spelled #NlpCategoryHash here while the other globals use the #NLP... prefix — confirm the key matches what NLPcategories installs" 32 | tokens 33 | do: [ :token | 34 | (hash includesKey: token) 35 | ifTrue: [ x := hash at: token. "NOTE(review): x is assigned but never used — the increment below adds the category score (sc at: 1), not the per-token weight" 36 | sentenceScores at: i put: (sentenceScores at: i) + (sc at: 1) ] ] ] ]. 37 | bestIndices := sentenceScores 38 | collectWithIndex: [ :score :i | 39 | {score. 40 | i} ]. "pairs of {score. sentence index}" 41 | Transcript 42 | show: 'sentence scoring: '; 43 | show: bestIndices; 44 | cr. 45 | bestIndices := bestIndices select: [ :p | (p at: 1) > 2 ]. "keep only sentences whose score exceeds the fixed threshold 2" 46 | ^ bestIndices collect: [ :p | Character space join: (sentences at: (p at: 2)) ] 47 | ] 48 | -------------------------------------------------------------------------------- /src/KBSnlp/NLPtagger.class.st: -------------------------------------------------------------------------------- 1 | " 2 | NLP tagger converted to Squeak. 3 | A class that implements an NLP tagger.
4 | 5 | Copyright 2005-2017 Mark Watson. All rights reserved. Licensed for use under the MIT license with attribution required. 6 | 7 | See: https://github.com/mark-watson/nlp_smalltalk 8 | 9 | " 10 | Class { 11 | #name : #NLPtagger, 12 | #superclass : #Object, 13 | #classVars : [ 14 | 'NLPlexicon' 15 | ], 16 | #category : #KBSnlp 17 | } 18 | 19 | { #category : #tagging } 20 | NLPtagger class >> initializeLexicon [ 21 | "Read data/lexicon.txt and build in memory lexicon" 22 | 23 | | read count strm aLine word taglist token lex repo | 24 | lex := Dictionary new. 25 | repo := IceRepository registeredRepositoryIncludingPackage: (self class) package. 26 | read := ((repo location) asString , '/lexicon.txt') asFileReference readStream . "bug fix: sending #readStream straight to the String streamed over the path characters themselves, never opening the file" 27 | count := 0. 28 | [ read atEnd ] 29 | whileFalse: [ count := count + 1. 30 | aLine := read upTo: Character lf. "Mac: use lf, Windows: use cr ???" 31 | strm := ReadStream on: aLine. 32 | word := strm upTo: Character space. "lexicon line format: word tag1 tag2 ..." 33 | taglist := OrderedCollection new. 34 | [ strm atEnd ] 35 | whileFalse: [ token := strm upTo: Character space. 36 | taglist add: token ]. 37 | "Transcript show: word; cr." 38 | "Transcript show: taglist printString; cr." 39 | lex at: word put: taglist ]. 40 | read close. 41 | Smalltalk at: #NLPlexicon put: lex 42 | ] 43 | 44 | { #category : #tagging } 45 | NLPtagger class >> pptag: wordString [ 46 | "returns a string of word/tag ..." 47 | 48 | | words tags write size count | 49 | words := NLPtagger tokenize: wordString. 50 | tags := NLPtagger tag: words. 51 | write := TextStream on: String new. 52 | size := words size. 53 | count := 1. 54 | [count <= size] 55 | whileTrue: [ 56 | write nextPutAll: (words at: count). 57 | write nextPutAll: '/'. 58 | write nextPutAll: (tags at: count). 59 | write nextPutAll: ' '. 60 | count := count + 1]. 61 | ^write contents string 62 | ] 63 | 64 | { #category : #segmentation } 65 | NLPtagger class >> sentences: data [ 66 | "Handle either a string or array of tokens. 67 | Limitations: 68 | 1.
This code does not currently handle special characters like — 69 | 2. Periods in numbers: only check previous character, not the 70 | next so a sentence ending with e.g., 2. will not be handled correctly. 71 | " 72 | 73 | | tokens lastToken currentSentence allSentences token | 74 | tokens := (data isMemberOf: ByteString) 75 | ifTrue: [ NLPtagger tokenize: data ] "bug fix: ifTrue:ifFalse: arguments must be blocks; the old parenthesized form was evaluated eagerly and relied on non-block receivers answering #value" 76 | ifFalse: [ data ]. 77 | currentSentence := OrderedCollection new. 78 | allSentences := OrderedCollection new. 79 | lastToken := ''. 80 | Transcript 81 | show: tokens; 82 | cr. 83 | tokens 84 | do: [ :token1 | 85 | ((Smalltalk at: #NLPtokensWithPeriods) includes: token1) 86 | ifTrue: [ token := token1 "bug fix: the collection does not understand #get: (was a doesNotUnderstand); membership is already established, so keep the abbreviation token unchanged" ] 87 | ifFalse: [ token := token1 ]. 88 | 89 | Transcript 90 | show: token; 91 | cr. 92 | currentSentence add: token. 93 | ((token = '.' and: lastToken isAllDigits not) or: token = '?') "sentence boundary: '.' not preceded by a number, or '?'" 94 | ifTrue: [ allSentences addLast: currentSentence. 95 | currentSentence := OrderedCollection new ]. 96 | lastToken := token ]. 97 | currentSentence isNotEmpty "flush any trailing partial sentence" 98 | ifTrue: [ allSentences addLast: currentSentence ]. 99 | ^ allSentences 100 | ] 101 | 102 | { #category : #tagging } 103 | NLPtagger class >> tag: words [ 104 | "tag an ordered collection of words, returning an ordered collection of corresponding tags" 105 | 106 | | lex tags tag count i word lastWord lastTag | 107 | tags := OrderedCollection new. 108 | lex := Smalltalk at: #NLPlexicon. 109 | words do: 110 | [:aWord | 111 | tag := lex at: aWord ifAbsent: [nil]. 112 | tag isNil ifFalse: [tag := tag at: 1] ifTrue: [tag := 'NN']. " the default tag " 113 | tags add: tag]. 114 | " Apply transformation rules: " 115 | lastWord := ''. 116 | lastTag := ''. 117 | i := 0. 118 | count := words size. 119 | [i < count] whileTrue: 120 | [i := i + 1. 121 | word := words at: i. 122 | tag := tags at: i.
" reuse tag variable " 123 | " First, handle all rules for i > 1. Note: & and | are eager (non-short-circuit) binary boolean messages; both operands are always evaluated " 124 | i > 1 125 | ifTrue: 126 | [" rule 1: DT, {VBD | VBP} --> DT, NN " 127 | 128 | lastTag = 'DT' & (tag = 'VBD' | (tag = 'VBP') | (tag = 'VB')) 129 | ifTrue: [tags at: i put: 'NN']. 130 | tag size > 1 131 | ifTrue: 132 | [" rule 6: convert a noun to a verb if the preceding word is 'would' " 133 | (tag at: 1) = $N & ((tag at: 2) = $N) & (lastWord asLowercase = 'would') 134 | ifTrue: [tags at: i put: 'VB']]]. 135 | " Now, handle the remaining rules that are valid for i = 1: " 136 | " rule 2: convert a noun to a number (CD) if '.' appears in the word" 137 | (word findString: '.') > 0 138 | ifTrue: [(tag at: 1) = $N ifTrue: [tags at: i put: 'CD']]. " not working - tokenizer tosses '.' characters " 139 | " rule 3: convert a noun to a past participle if words[i] ends with 'ed' " 140 | (tag at: 1) = $N & (word endsWith: 'ed') ifTrue: [tags at: i put: 'VBN']. 141 | " rule 4: convert any type to adverb if it ends in 'ly' " 142 | (word endsWith: 'ly') ifTrue: [tags at: i put: 'RB']. 143 | " rule 5: convert a common noun (NN or NNS) to an adjective if it ends with 'al' " 144 | (tag at: 1) = $N & (word endsWith: 'al') ifTrue: [tags at: i put: 'JJ']. 145 | " rule 7: if a word has been categorized as a common noun and it ends with 's', " 146 | " then set its type to plural common noun (NNS) " 147 | tag = 'NN' & (word endsWith: 's') ifTrue: [tags at: i put: 'NNS']. 148 | " rule 8: convert a common noun to a present participle verb (i.e., a gerund) " 149 | (tag at: 1) = $N & (word endsWith: 'ing') ifTrue: [tags at: i put: 'VBG']. 150 | lastWord := word. 151 | lastTag := tag]. 152 | ^tags 153 | ] 154 | 155 | { #category : #tokenization } 156 | NLPtagger class >> tokenize: wordsInAString [ 157 | "tokenizes a string; note '.' IS a separator here, unlike NLPsentences class >> tokenizeLeavePeriods:" 158 | 159 | ^ wordsInAString 160 | findTokens: 161 | ' ;:.,<>[]{}! 162 | @#$%^&*()?' 163 | keep: ';:.,<>[]{}!$?' " keep CR in this string!!
" 164 | ] 165 | -------------------------------------------------------------------------------- /src/KBSnlp/package.st: -------------------------------------------------------------------------------- 1 | Package { #name : #KBSnlp } 2 | -------------------------------------------------------------------------------- /tags_2gram.json: -------------------------------------------------------------------------------- 1 | { 2 | "chemistry": { 3 | "chemical reaction": 1.55, 4 | "atoms molecules": 0.6, 5 | "organic matter": 0.55, 6 | "electrons are": 0.55, 7 | "carbon carbon": 0.5, 8 | "periodic table": 0.5, 9 | "chemical reactions": 0.5, 10 | "carbon atom": 0.5 11 | }, 12 | "computers": { 13 | "computer system": 0.9, 14 | "operating system": 0.75, 15 | "random memory": 0.65, 16 | "computer science": 0.65, 17 | "computer program": 0.6, 18 | "osi reference": 0.5 19 | }, 20 | "computers_ai": { 21 | "artificial intelligence": 1.45, 22 | "ai research": 1.6, 23 | "john mccarthy": 0.95, 24 | "strong ai": 0.8, 25 | "computer science": 0.8, 26 | "symbolic ai": 0.7, 27 | "language processing": 0.55, 28 | "alan turing": 0.55 29 | }, 30 | "computers_ai_datamining": { 31 | "data mining": 1.0, 32 | "machine learning": 0.65, 33 | "ai artificial": 0.85, 34 | "knowledgebooks ai": 0.85, 35 | "mining knowledgebooks": 0.85, 36 | "datamining data": 0.85, 37 | "intelligence datamining": 0.85, 38 | "bayesian networks": 0.55, 39 | "mining algorithms": 0.55, 40 | "mining knowledge": 0.5, 41 | "knowledge databases": 0.5, 42 | "terabytes data": 0.5, 43 | "databases world": 0.5, 44 | "largest databases": 0.5, 45 | "this tutorial": 0.5, 46 | "reinforcement learning": 0.5 47 | }, 48 | "computers_ai_learning": { 49 | "machine learning": 1.8, 50 | "neural networks": 0.55, 51 | "artificial intelligence": 2.2, 52 | "neural network": 0.55, 53 | "human brain": 1.25, 54 | "learning algorithms": 1.1, 55 | "fuzzy logic": 0.7, 56 | "artificial neural": 0.7, 57 | "learning theory": 0.7, 58 | "ai magazine": 
0.55, 59 | "speech recognition": 0.55, 60 | "computational learning": 0.55, 61 | "learning algorithm": 0.55, 62 | "supervised learning": 0.55, 63 | "computer vision": 0.55, 64 | "pattern recognition": 0.55, 65 | "hacking knowledge": 0.55 66 | }, 67 | "computers_ai_nlp": { 68 | "natural language": 0.8, 69 | "machine learning": 0.75, 70 | "language processing": 0.5, 71 | "million words": 0.5 72 | }, 73 | "computers_ai_search": { 74 | "node goal": 3.0, 75 | "data mining": 3.0, 76 | "mining data": 3.0, 77 | "text mining": 3.0, 78 | "search lucene": 3.0, 79 | "ai search": 3.0, 80 | "intelligence ai": 3.0, 81 | "artificial intelligence": 3.0, 82 | "goal node": 2.5, 83 | "search results": 2.0, 84 | "heuristic value": 1.5, 85 | "worst complexity": 1.0, 86 | "search strategies": 1.0, 87 | "guaranteed halt": 1.0, 88 | "depth search": 1.0, 89 | "derive heuristic": 1.0, 90 | "heuristic information": 1.0, 91 | "goal heuristic": 1.0, 92 | "heuristic search": 1.0, 93 | "intelligent systems": 1.0, 94 | "relevant query": 1.0, 95 | "results snippet": 1.0, 96 | "googlebot crawls": 1.0, 97 | "branching factors": 0.5, 98 | "graph infinite": 0.5, 99 | "exponential space": 0.5, 100 | "lowest exponential": 0.5, 101 | "path lengths": 0.5, 102 | "linear space": 0.5, 103 | "summary search": 0.5, 104 | "space complexity": 0.5, 105 | "complexity space": 0.5, 106 | "path algorithms": 0.5, 107 | "increases exponentially": 0.5, 108 | "bound search": 0.5, 109 | "globally minimal": 0.5, 110 | "search heuristic": 0.5, 111 | "priority queue": 0.5, 112 | "goal path": 0.5, 113 | "heuristic path": 0.5, 114 | "path heuristic": 0.5, 115 | "goal paths": 0.5, 116 | "networks ai": 0.5, 117 | "neural networks": 0.5, 118 | "google servers": 0.5, 119 | "map googlebot": 0.5, 120 | "crawls web": 0.5 121 | }, 122 | "computers_ai_textmining": { 123 | "parameters call": 5.15, 124 | "text mining": 0.9, 125 | "marti hearst": 3.25, 126 | "words format": 2.15, 127 | "natural language": 1.75, 128 | "format parameters": 
1.45, 129 | "documents words": 1.45, 130 | "text analytics": 0.8, 131 | "preslav nakov": 1.35, 132 | "curt monash": 1.35, 133 | "data mining": 1.2, 134 | "semantic space": 1.2, 135 | "information retrieval": 1.2, 136 | "information extraction": 0.9, 137 | "search engine": 0.9, 138 | "nakov schwartz": 0.8, 139 | "language processing": 0.8, 140 | "representation documents": 0.8, 141 | "training algorithm": 0.8, 142 | "vector machine": 0.8, 143 | "support vector": 0.8, 144 | "file parameters": 0.8, 145 | "hearst proceedings": 0.65, 146 | "nakov marti": 0.65, 147 | "emilia stoica": 0.65, 148 | "barbara rosario": 0.65, 149 | "search engines": 0.65, 150 | "text storage": 0.65, 151 | "search text": 0.65, 152 | "space documents": 0.65, 153 | "via training": 0.65, 154 | "learns via": 0.65, 155 | "compact documents": 0.65, 156 | "information text": 0.5, 157 | "hlt naacl": 0.5, 158 | "rosario marti": 0.5, 159 | "genomics track": 0.5, 160 | "trec genomics": 0.5, 161 | "stoica marti": 0.5, 162 | "categories search": 0.5, 163 | "odp dmoz": 0.5, 164 | "social networking": 0.5, 165 | "directories filtering": 0.5, 166 | "call search": 0.5, 167 | "tiling representation": 0.5, 168 | "space representation": 0.5, 169 | "documents bow": 0.5, 170 | "text format": 0.5, 171 | "format file": 0.5, 172 | "bioscience researchers": 0.5, 173 | "word document": 0.5 174 | }, 175 | "computers_microsoft": { 176 | "microsoft corporation": 4.25, 177 | "windows vista": 3.8, 178 | "corporation microsoft": 2.85, 179 | "microsoft office": 2.0, 180 | "operating system": 1.6, 181 | "playtech estonia": 1.55, 182 | "sourcenext corporation": 0.95, 183 | "casino playtech": 0.85, 184 | "mamut asa": 0.85, 185 | "corporation sourcenext": 0.8, 186 | "software professor": 0.8, 187 | "bill gates": 0.8, 188 | "professor teaches": 0.75, 189 | "enginia research": 0.75, 190 | "windows xp": 0.75, 191 | "avanquest usa": 0.65, 192 | "encyclopaedia britannica": 0.65, 193 | "windows windows": 0.65, 194 | "asa mamut": 0.6, 195 
| "software llc": 0.55, 196 | "punch software": 0.55, 197 | "microsoft visual": 0.55, 198 | "internet explorer": 0.55, 199 | "memory manager": 0.55, 200 | "microsoft windows": 0.5 201 | }, 202 | "computers_programming": { 203 | "program programming": 3.95, 204 | "scheme program": 3.95, 205 | "lisp scheme": 3.95, 206 | "scala lisp": 3.95, 207 | "java scala": 3.95, 208 | "lisp java": 3.95, 209 | "java lisp": 3.95, 210 | "debugging java": 3.95, 211 | "debug debugging": 3.95, 212 | "logic debug": 3.95, 213 | "netbeans logic": 3.95, 214 | "intellij netbeans": 3.95, 215 | "eclipse intellij": 3.95, 216 | "ide eclipse": 3.95, 217 | "compilers ide": 3.95, 218 | "compiler compilers": 3.95, 219 | "programming language": 2.45, 220 | "programming compiler": 3.45, 221 | "pl pl": 2.95, 222 | "computer programs": 2.45, 223 | "iso iec": 2.45, 224 | "software development": 1.95, 225 | "something until": 0.95, 226 | "person something": 0.95, 227 | "file output": 0.95, 228 | "arrays file": 0.95, 229 | "actors arrays": 0.95, 230 | "turtles actors": 0.95, 231 | "project turtles": 0.95, 232 | "mastermind project": 0.95, 233 | "iteration mastermind": 0.95, 234 | "sentence iteration": 0.95, 235 | "word sentence": 0.95, 236 | "variables word": 0.95, 237 | "global variables": 0.95, 238 | "variables global": 0.95, 239 | "local variables": 0.95, 240 | "recursion local": 0.95, 241 | "predicates recursion": 0.95, 242 | "events predicates": 0.95, 243 | "interface events": 0.95, 244 | "sentences interface": 0.95, 245 | "words sentences": 0.95, 246 | "operators words": 0.95, 247 | "defining operators": 0.95, 248 | "operators defining": 0.95, 249 | "primitive operators": 0.95, 250 | "inputs primitive": 0.95, 251 | "procedure inputs": 0.95, 252 | "structure procedure": 0.95, 253 | "hierarchical structure": 0.95, 254 | "animation hierarchical": 0.95, 255 | "iteration animation": 0.95, 256 | "commands iteration": 0.95, 257 | "adding commands": 0.95, 258 | "pseudocode adding": 0.95, 259 | "requirements 
analysis": 0.95, 260 | "efficient evolvable": 0.95, 261 | "debate extent": 0.95, 262 | "going debate": 0.95, 263 | "development process": 0.95, 264 | "source code": 0.95, 265 | "visual visual": 0.95, 266 | "visual basic": 0.95, 267 | "objective objective": 0.95, 268 | "modula modula": 0.95, 269 | "intermediate language": 0.95, 270 | "common intermediate": 0.95, 271 | "prototype ada": 0.95 272 | }, 273 | "computers_programming_c++": { 274 | "bjarne stroustrup": 1.15, 275 | "source code": 0.95, 276 | "std cout": 0.75, 277 | "template library": 0.75, 278 | "world program": 0.75, 279 | "operator overloading": 0.75, 280 | "programming languages": 0.75, 281 | "type this": 0.55, 282 | "representation type": 0.55, 283 | "members class": 0.55, 284 | "templates are": 0.55, 285 | "conditional compilation": 0.55, 286 | "namespace std": 0.55, 287 | "library template": 0.55, 288 | "type checking": 0.55, 289 | "derived class": 0.55, 290 | "hello world": 0.55, 291 | "iso iec": 0.55, 292 | "exception handling": 0.55, 293 | "multiple inheritance": 0.55, 294 | "bell labs": 0.55, 295 | "oriented programming": 0.55, 296 | "multi paradigm": 0.55 297 | }, 298 | "computers_programming_java": { 299 | "web toolkit": 2.2, 300 | "google web": 2.2, 301 | "web services": 1.25, 302 | "eberhard wolff": 1.9, 303 | "java classes": 1.55, 304 | "java platform": 1.55, 305 | "wolff interface": 1.25, 306 | "ouml ller": 1.25, 307 | "rgen ouml": 1.25, 308 | "uuml rgen": 1.25, 309 | "mdash java": 1.25, 310 | "mdash introduction": 1.25, 311 | "toolkit gwt": 0.95, 312 | "register read": 0.95, 313 | "login register": 0.95, 314 | "ller interface": 0.95, 315 | "jax conference": 0.95, 316 | "spring jax": 0.95, 317 | "interface spring": 0.95, 318 | "source code": 0.95, 319 | "web service": 0.95, 320 | "axis spring": 0.95, 321 | "business logic": 0.95, 322 | "mdash lessons": 0.95, 323 | "java language": 0.6, 324 | "java web": 0.95, 325 | "java ee": 0.95, 326 | "java tutorials": 0.95, 327 | "unit test": 0.6, 328 | 
"browser button": 0.6, 329 | "widgets history": 0.6, 330 | "ui widgets": 0.6, 331 | "features ui": 0.6, 332 | "gwt features": 0.6, 333 | "gwt lets": 0.6, 334 | "ajax applications": 0.6, 335 | "spring evening": 0.6, 336 | "are presentations": 0.6, 337 | "timezone location": 0.6, 338 | "tue timezone": 0.6, 339 | "alex tue": 0.6, 340 | "ben alex": 0.6, 341 | "submitted ben": 0.6, 342 | "melbourne spring": 0.6, 343 | "christian dupuis": 0.6, 344 | "description spring": 0.6, 345 | "web login": 0.6, 346 | "mike wiesner": 0.6, 347 | "johnson interface": 0.6, 348 | "rod johnson": 0.6, 349 | "spring uuml": 0.6, 350 | "timezone description": 0.6, 351 | "wed timezone": 0.6, 352 | "wolff wed": 0.6, 353 | "submitted eberhard": 0.6, 354 | "existing classes": 0.6, 355 | "approach create": 0.6, 356 | "bottom approach": 0.6, 357 | "down approach": 0.6, 358 | "producer side": 0.6, 359 | "string accountid": 0.6, 360 | "accountmanager methods": 0.6, 361 | "web tier": 0.6, 362 | "application servers": 0.6, 363 | "certain application": 0.6, 364 | "applications today": 0.6, 365 | "services axis": 0.6, 366 | "ee applications": 0.6, 367 | "services this": 0.6, 368 | "soap web": 0.6, 369 | "ranges are": 0.6, 370 | "vista os": 0.6, 371 | "windows vista": 0.6, 372 | "microsoft windows": 0.6, 373 | "slider microsoft": 0.6, 374 | "custom component": 0.6, 375 | "ui delegate": 0.6, 376 | "core swing": 0.6, 377 | "easier maintain": 0.6, 378 | "process creating": 0.6, 379 | "swing component": 0.6, 380 | "creating custom": 0.6, 381 | "mdash api": 0.6, 382 | "virtual machine": 0.6, 383 | "java virtual": 0.6, 384 | "naming directory": 0.6, 385 | "note this": 0.6, 386 | "overview features": 0.6, 387 | "swing mdash": 0.6, 388 | "introduction java": 0.6, 389 | "getting started": 0.6, 390 | "refer box": 0.6, 391 | "this refer": 0.6, 392 | "buy this": 0.6, 393 | "tutorial buy": 0.6, 394 | "ee tutorial": 0.6, 395 | "tutorial java": 0.6, 396 | "java tutorial": 0.6, 397 | "tutorials java": 0.6, 398 | "java 
se": 0.6 399 | }, 400 | "computers_programming_lisp": { 401 | "common lisp": 1.4, 402 | "code data": 1.4, 403 | "data syntax": 1.4, 404 | "haskell common": 1.05, 405 | "scheme haskell": 1.05, 406 | "racket scheme": 1.05, 407 | "cons cons": 1.05, 408 | "elements empty": 1.05, 409 | "lisp code": 1.05, 410 | "higher order": 1.05, 411 | "languages lisp": 1.05, 412 | "encoding code": 1.05, 413 | "lisp language": 1.05, 414 | "programming language": 1.05, 415 | "language implementation": 1.05, 416 | "recursive call": 0.7, 417 | "combinator recursive": 0.7, 418 | "lambda calculus": 0.7, 419 | "listing shows": 0.7, 420 | "nil cons": 0.7, 421 | "cons nil": 0.7, 422 | "element elements": 0.7, 423 | "empty nil": 0.7, 424 | "lisp tradition": 0.7, 425 | "language extension": 0.7, 426 | "mutually incompatible": 0.7, 427 | "meta programs": 0.7, 428 | "structure fixed": 0.7, 429 | "mainstream languages": 0.7, 430 | "generating code": 0.7, 431 | "non terminals": 0.7, 432 | "data structures": 0.7, 433 | "metaprograms lisp": 0.7, 434 | "data structure": 0.7, 435 | "intellectual property": 0.7, 436 | "language itself": 0.7, 437 | "nested lists": 0.7, 438 | "code nested": 0.7, 439 | "data programs": 0.7, 440 | "language design": 0.7, 441 | "manipulate programs": 0.7, 442 | "write metaprograms": 0.7, 443 | "easily write": 0.7, 444 | "lisp programs": 0.7, 445 | "machine code": 0.7 446 | }, 447 | "computers_programming_ruby": { 448 | "ruby rails": 1.65, 449 | "gem install": 5.0, 450 | "ruby ruby": 5.0, 451 | "rails rubyine": 3.35, 452 | "ruby basics": 3.35, 453 | "mswin mongrel": 3.35, 454 | "mongrel mswin": 3.35, 455 | "mongrel ruby": 3.35, 456 | "bin gem": 3.35, 457 | "local bin": 3.35, 458 | "usr local": 3.35, 459 | "sudo gem": 3.35, 460 | "railsconf ruby": 3.35, 461 | "gemfile sinatra": 3.35, 462 | "gem gemfile": 3.35, 463 | "rails gem": 3.35, 464 | "rspec ruby": 3.35, 465 | "recursive processes": 3.35, 466 | "rubyine railsconf": 1.65, 467 | "rubymine ruby": 1.65, 468 | "ide rubymine": 
1.65, 469 | "rails ide": 1.65, 470 | "source ruby": 1.65, 471 | "open source": 1.65, 472 | "radrails open": 1.65, 473 | "maps radrails": 1.65, 474 | "hash maps": 1.65, 475 | "ing hash": 1.65, 476 | "syntax ing": 1.65, 477 | "value syntax": 1.65, 478 | "key value": 1.65, 479 | "entry key": 1.65, 480 | "specify entry": 1.65, 481 | "hash specify": 1.65, 482 | "define hash": 1.65, 483 | "below define": 1.65, 484 | "shown below": 1.65, 485 | "shortcut shown": 1.65, 486 | "brace shortcut": 1.65, 487 | "curly brace": 1.65, 488 | "construct curly": 1.65, 489 | "hash construct": 1.65, 490 | "hash hash": 1.65, 491 | "class hash": 1.65, 492 | "instantiating class": 1.65, 493 | "either instantiating": 1.65, 494 | "created either": 1.65, 495 | "es created": 1.65, 496 | "hash es": 1.65, 497 | "relationship hash": 1.65, 498 | "definition relationship": 1.65, 499 | "word definition": 1.65, 500 | "value word": 1.65, 501 | "another value": 1.65, 502 | "word another": 1.65, 503 | "key word": 1.65, 504 | "map key": 1.65, 505 | "dictionaries map": 1.65, 506 | "conceptually dictionaries": 1.65, 507 | "similar conceptually": 1.65, 508 | "are similar": 1.65, 509 | "hashes are": 1.65, 510 | "container hashes": 1.65, 511 | "storage container": 1.65, 512 | "data storage": 1.65, 513 | "kind data": 1.65, 514 | "another kind": 1.65, 515 | "hash another": 1.65, 516 | "hashes hash": 1.65, 517 | "irb hashes": 1.65, 518 | "confuse irb": 1.65, 519 | "wondering confuse": 1.65, 520 | "interactive wondering": 1.65, 521 | "ruby interactive": 1.65, 522 | "stands ruby": 1.65, 523 | "ri stands": 1.65, 524 | "oh ri": 1.65, 525 | "array oh": 1.65, 526 | "ri array": 1.65, 527 | "class ri": 1.65, 528 | "ed class": 1.65, 529 | "shell ed": 1.65, 530 | "ruby shell": 1.65, 531 | "system ruby": 1.65, 532 | "operating system": 1.65, 533 | "window operating": 1.65, 534 | "terminal window": 1.65, 535 | "command terminal": 1.65, 536 | "ri command": 1.65, 537 | "entering ri": 1.65, 538 | "documentation entering": 1.65, 
539 | "reference documentation": 1.65, 540 | "ruby reference": 1.65, 541 | "via ruby": 1.65, 542 | "class via": 1.65, 543 | "array class": 1.65, 544 | "methods array": 1.65, 545 | "instance methods": 1.65, 546 | "methods instance": 1.65, 547 | "class methods": 1.65, 548 | "brackets class": 1.65, 549 | "square brackets": 1.65, 550 | "array square": 1.65, 551 | "inside array": 1.65, 552 | "place inside": 1.65, 553 | "enclose place": 1.65, 554 | "approach enclose": 1.65, 555 | "shortcut approach": 1.65, 556 | "construct shortcut": 1.65, 557 | "array construct": 1.65, 558 | "array array": 1.65, 559 | "class array": 1.65, 560 | "basics class": 1.65, 561 | "learn ruby": 1.65, 562 | "rails learn": 1.65, 563 | "guts rails": 1.65, 564 | "diving guts": 1.65, 565 | "before diving": 1.65, 566 | "ruby before": 1.65, 567 | "beneficial ruby": 1.65, 568 | "extremely beneficial": 1.65, 569 | "concerned extremely": 1.65, 570 | "far concerned": 1.65, 571 | "basics far": 1.65, 572 | "learning ruby": 1.65, 573 | "without learning": 1.65, 574 | "rails without": 1.65, 575 | "learn rails": 1.65, 576 | "possible learn": 1.65, 577 | "suggest possible": 1.65, 578 | "developers suggest": 1.65, 579 | "rails developers": 1.65, 580 | "ruby mongrel": 1.65, 581 | "platform mongrel": 1.65, 582 | "install platform": 1.65, 583 | "this gem": 1.65, 584 | "something this": 1.65, 585 | "shown something": 1.65, 586 | "platform shown": 1.65, 587 | "gem platform": 1.65, 588 | "appropriate gem": 1.65, 589 | "prompted appropriate": 1.65, 590 | "dependencies prompted": 1.65, 591 | "mongrel dependencies": 1.65, 592 | "install mongrel": 1.65, 593 | "command sudo": 1.65, 594 | "this command": 1.65, 595 | "type this": 1.65, 596 | "gem type": 1.65, 597 | "mongrel gem": 1.65, 598 | "better mongrel": 1.65, 599 | "apache better": 1.65, 600 | "lighttpd apache": 1.65, 601 | "scgi lighttpd": 1.65, 602 | "fastcgi scgi": 1.65, 603 | "applications fastcgi": 1.65, 604 | "compile applications": 1.65, 605 | "having compile": 
1.65, 606 | "without having": 1.65, 607 | "applications without": 1.65, 608 | "rails applications": 1.65, 609 | "server ruby": 1.65, 610 | "library server": 1.65, 611 | "alone library": 1.65, 612 | "stand alone": 1.65, 613 | "fast stand": 1.65, 614 | "mongrel fast": 1.65, 615 | "mongrel mongrel": 1.65, 616 | "step mongrel": 1.65, 617 | "this step": 1.65, 618 | "retry this": 1.65, 619 | "again retry": 1.65, 620 | "step again": 1.65, 621 | "beginning step": 1.65, 622 | "step beginning": 1.65, 623 | "instructed step": 1.65, 624 | "path instructed": 1.65, 625 | "didn path": 1.65, 626 | "gem didn": 1.65, 627 | "loaderror usr": 1.65, 628 | "rubygems loaderror": 1.65, 629 | "load rubygems": 1.65, 630 | "file load": 1.65, 631 | "require file": 1.65, 632 | "gem require": 1.65, 633 | "this usr": 1.65, 634 | "message this": 1.65, 635 | "dependencies message": 1.65, 636 | "rails dependencies": 1.65, 637 | "install rails": 1.65, 638 | "install sudo": 1.65, 639 | "line install": 1.65, 640 | "simple line": 1.65, 641 | "rails simple": 1.65, 642 | "installed rails": 1.65, 643 | "rubygems installed": 1.65, 644 | "rails rubygems": 1.65, 645 | "rails mongrel": 1.65, 646 | "programmers mongrel": 1.65, 647 | "ruby programmers": 1.65, 648 | "sinatra railsconf": 1.65, 649 | "imagemagick rspec": 1.65, 650 | "runs imagemagick": 1.65, 651 | "rmagick runs": 1.65, 652 | "memory rmagick": 1.65, 653 | "easier memory": 1.65, 654 | "minimagick easier": 1.65, 655 | "library minimagick": 1.65, 656 | "imagemagick library": 1.65, 657 | "wraps imagemagick": 1.65, 658 | "usage wraps": 1.65, 659 | "memory usage": 1.65, 660 | "features memory": 1.65, 661 | "processing features": 1.65, 662 | "advanced processing": 1.65, 663 | "terms advanced": 1.65, 664 | "daddy terms": 1.65, 665 | "grand daddy": 1.65, 666 | "rmagick grand": 1.65, 667 | "methods rmagick": 1.65, 668 | "builtin methods": 1.65, 669 | "ruby builtin": 1.65, 670 | "addition ruby": 1.65, 671 | "above addition": 1.65, 672 | "prompt above": 1.65, 
673 | "code prompt": 1.65, 674 | "ruby code": 1.65, 675 | "simplicity ruby": 1.65, 676 | "balance simplicity": 1.65, 677 | "found balance": 1.65, 678 | "ruby found": 1.65, 679 | "beauty ruby": 1.65, 680 | "sinatra beauty": 1.65, 681 | "ruby rspec": 1.65, 682 | "javascript ruby": 1.65, 683 | "functional javascript": 1.65, 684 | "responses functional": 1.65, 685 | "processes responses": 1.65, 686 | "linear recursive": 1.65, 687 | "differ linear": 1.65, 688 | "processes differ": 1.65, 689 | "better recursive": 1.65, 690 | "instance better": 1.65, 691 | "learning instance": 1.65, 692 | "still learning": 1.65, 693 | "let still": 1.65, 694 | "please let": 1.65, 695 | "wrong please": 1.65, 696 | "something wrong": 1.65, 697 | "programmer something": 1.65, 698 | "functional programmer": 1.65, 699 | "read functional": 1.65, 700 | "helpful read": 1.65, 701 | "this helpful": 1.65, 702 | "before this": 1.65, 703 | "functional before": 1.65, 704 | "curious functional": 1.65, 705 | "hope curious": 1.65, 706 | "understanding hope": 1.65, 707 | "balances understanding": 1.65, 708 | "exposure balances": 1.65, 709 | "benefits exposure": 1.65, 710 | "certainly benefits": 1.65, 711 | "ruby certainly": 1.65, 712 | "candidates ruby": 1.65, 713 | "are candidates": 1.65, 714 | "ruby are": 1.65, 715 | "rails ruby": 1.65 716 | }, 717 | "economics": { 718 | "goods services": 2.25, 719 | "cost average": 1.05, 720 | "terms starting": 1.05, 721 | "definitions terms": 1.05, 722 | "starting letter": 1.0, 723 | "nobel prize": 0.85, 724 | "price elasticity": 0.8, 725 | "total cost": 0.8, 726 | "elasticity demand": 0.7, 727 | "variable cost": 0.7, 728 | "fixed cost": 0.7, 729 | "cost total": 0.7, 730 | "income taxes": 0.65, 731 | "quantity demanded": 0.65, 732 | "baseball players": 0.6, 733 | "prize economics": 0.6, 734 | "quantity supplied": 0.6, 735 | "exchange rates": 0.55, 736 | "easy explanation": 0.55, 737 | "sense easy": 0.55, 738 | "common sense": 0.55, 739 | "gives common": 0.55, 740 | 
"price quantity": 0.55, 741 | "austrian school": 0.55, 742 | "econometrics project": 0.5, 743 | "returns scale": 0.5, 744 | "monetary policy": 0.5, 745 | "marginal cost": 0.5 746 | }, 747 | "health": { 748 | "blood pressure": 1.2, 749 | "blood vessels": 1.1, 750 | "immune system": 0.65, 751 | "blood flow": 0.55, 752 | "blood vessel": 0.5 753 | }, 754 | "health_exercise": { 755 | "warmup walk": 1.95, 756 | "minute warmup": 1.95, 757 | "brisk minute": 1.9, 758 | "pull squats": 1.7, 759 | "jog mile": 1.55, 760 | "walk mile": 1.05, 761 | "mile walk": 1.05, 762 | "heart rate": 0.95, 763 | "sets repetitions": 0.95, 764 | "perform sets": 0.9, 765 | "ball chair": 0.85, 766 | "seconds rest": 0.85, 767 | "smith machine": 0.85, 768 | "walk jog": 0.85, 769 | "jog miles": 0.75, 770 | "exercise ball": 0.7, 771 | "leg press": 0.6, 772 | "mile jog": 0.6, 773 | "seconds jog": 0.6, 774 | "squats pull": 0.55, 775 | "rounds pull": 0.55, 776 | "squats rounds": 0.55, 777 | "weight loss": 0.55, 778 | "rest sets": 0.55, 779 | "cable crossover": 0.55, 780 | "universal machine": 0.55, 781 | "starting position": 0.55 782 | }, 783 | "health_nutrition": { 784 | "healthiest foods": 1.15, 785 | "world healthiest": 1.15, 786 | "nutrition data": 1.05, 787 | "amino acids": 0.55, 788 | "heart disease": 0.8, 789 | "essential amino": 0.75, 790 | "blood sugar": 0.75, 791 | "beta carotene": 0.65, 792 | "saturated fats": 0.65, 793 | "daily values": 0.6, 794 | "blood pressure": 0.6, 795 | "fats are": 0.55, 796 | "foods are": 0.55, 797 | "soluble vitamin": 0.5, 798 | "water soluble": 0.5, 799 | "fatty acids": 0.5 800 | }, 801 | "mathematics": { 802 | "coordinate graph": 2.45, 803 | "property multiplication": 1.65, 804 | "square root": 1.65, 805 | "expression algebra": 1.65, 806 | "completeness theorem": 1.65, 807 | "ordered pair": 1.2, 808 | "common multiple": 1.2, 809 | "common factor": 1.2, 810 | "numerator denominator": 1.2, 811 | "axis intercept": 0.8, 812 | "identity property": 0.8, 813 | "equivalent 
fractions": 0.8, 814 | "commutative property": 0.8, 815 | "associative property": 0.8, 816 | "graph coordinate": 0.8, 817 | "axis coordinate": 0.8, 818 | "sum equals": 0.8, 819 | "combined sum": 0.8, 820 | "angles combined": 0.8, 821 | "angles opposite": 0.8, 822 | "multiplicative inverse": 0.8, 823 | "obtuse angle": 0.8, 824 | "common denominator": 0.8, 825 | "plotted coordinate": 0.8, 826 | "line plotted": 0.8, 827 | "natural integer": 0.8, 828 | "fraction natural": 0.8, 829 | "cartesian coordinates": 0.8, 830 | "central angle": 0.8, 831 | "additive inverse": 0.8, 832 | "angle measures": 0.8, 833 | "mathematical expression": 0.8, 834 | "logically valid": 0.8, 835 | "mathematical logic": 0.8 836 | }, 837 | "music": { 838 | "piece music": 1.1, 839 | "sheet music": 1.0, 840 | "art music": 0.8, 841 | "bruno mars": 0.7, 842 | "music theory": 0.7, 843 | "music cognition": 0.6, 844 | "perform music": 0.6, 845 | "music notation": 0.6, 846 | "popular music": 0.6, 847 | "toby keith": 0.5, 848 | "jason aldean": 0.5, 849 | "taylor swift": 0.5, 850 | "tamar braxton": 0.5, 851 | "nicki minaj": 0.5, 852 | "lil wayne": 0.5, 853 | "cognitive musicology": 0.5, 854 | "study music": 0.5, 855 | "classical music": 0.5 856 | }, 857 | "news": { 858 | "united states": 2.1, 859 | "points hours": 1.1, 860 | "north korea": 0.6, 861 | "middle east": 0.5 862 | }, 863 | "news_economy": { 864 | "national debt": 1.3, 865 | "interest rates": 1.1, 866 | "interest rate": 0.75, 867 | "consumer debt": 0.7, 868 | "deep poverty": 0.6, 869 | "economic growth": 0.6, 870 | "consumer spending": 0.6, 871 | "tea party": 0.5, 872 | "poor people": 0.5 873 | }, 874 | "news_politics": { 875 | "united states": 1.45, 876 | "middle east": 0.55, 877 | "white house": 0.5 878 | }, 879 | "news_war": { 880 | "united states": 2.6, 881 | "war weapons": 0.9, 882 | "white house": 0.7, 883 | "middle east": 0.7, 884 | "nuclear weapons": 0.7 885 | }, 886 | "news_weather": { 887 | "water vapor": 0.85, 888 | "low pressure": 
0.85, 889 | "weather forecast": 0.65, 890 | "air mass": 0.6, 891 | "atmospheric pressure": 0.55, 892 | "ice crystals": 0.55 893 | }, 894 | "physics": { 895 | "potential energy": 0.7, 896 | "quantum mechanics": 0.7, 897 | "motion body": 0.6, 898 | "electric current": 0.6, 899 | "water vapor": 0.55, 900 | "law motion": 0.5 901 | }, 902 | "religion": { 903 | "jesus christ": 1.1, 904 | "united states": 1.05, 905 | "roman catholic": 1.05, 906 | "stark finke": 0.75, 907 | "catholic church": 0.7, 908 | "old testament": 0.7, 909 | "judaism christianity": 0.7, 910 | "holy spirit": 0.65, 911 | "prophet muhammad": 0.6, 912 | "christianity islam": 0.6, 913 | "hebrew bible": 0.55, 914 | "eastern orthodox": 0.5, 915 | "century ce": 0.5 916 | }, 917 | "religion_buddhism": { 918 | "pure land": 0.7, 919 | "dalai lama": 0.7, 920 | "eightfold path": 0.65, 921 | "therav da": 0.6, 922 | "tibetan buddhism": 0.5 923 | }, 924 | "religion_christianity": { 925 | "jesus christ": 4.05, 926 | "roman catholic": 1.7, 927 | "holy spirit": 1.7, 928 | "old testament": 1.25, 929 | "christian church": 1.05, 930 | "catholic church": 0.85, 931 | "eastern orthodox": 0.85, 932 | "birth jesus": 0.85, 933 | "love bible": 0.85, 934 | "vatican city": 0.6, 935 | "orthodox churches": 0.6, 936 | "hebrew scripture": 0.6, 937 | "life jesus": 0.6, 938 | "judgment seat": 0.6, 939 | "christ learn": 0.6 940 | }, 941 | "religion_hinduism": { 942 | "upanishad stories": 1.0, 943 | "stories episodes": 0.9, 944 | "hindu gods": 0.65, 945 | "rig veda": 0.65, 946 | "bhagavad gita": 0.65, 947 | "sri ramakrishna": 0.65, 948 | "stage life": 0.6, 949 | "hatha yoga": 0.6, 950 | "gods goddesses": 0.5, 951 | "raja yoga": 0.5 952 | }, 953 | "religion_islam": { 954 | "shi ite": 1.2, 955 | "prophet muhammad": 0.75, 956 | "muslims believe": 0.75, 957 | "ka ba": 0.65, 958 | "mentioned qur": 0.65, 959 | "allah mentioned": 0.65, 960 | "abi talib": 0.65, 961 | "ali abi": 0.65, 962 | "pillars islam": 0.65, 963 | "muslim community": 0.55, 
964 | "old testament": 0.55, 965 | "qur old": 0.55, 966 | "muhammad ali": 0.55, 967 | "holy prophet": 0.55, 968 | "prophet allah": 0.55, 969 | "imam ali": 0.55, 970 | "qur anic": 0.55 971 | }, 972 | "religion_judaism": { 973 | "jewish law": 0.5, 974 | "jewish liturgy": 1.1, 975 | "yom kippur": 1.1, 976 | "rosh hashanah": 0.8, 977 | "prayers blessings": 0.8, 978 | "pesach passover": 0.75, 979 | "sephardic jews": 0.75, 980 | "jewish occurring": 0.65, 981 | "jacob israel": 0.65, 982 | "type sacrifice": 0.65, 983 | "hebrew alphabet": 0.6, 984 | "lashon ra": 0.5, 985 | "toh ruh": 0.5, 986 | "movements judaism": 0.5, 987 | "occurring jewish": 0.5, 988 | "orthodox jews": 0.5 989 | }, 990 | "sports": { 991 | "american football": 1.15, 992 | "olympic games": 0.95, 993 | "olympic committee": 0.95, 994 | "rose bowl": 0.95, 995 | "horse racing": 0.7, 996 | "final score": 0.5, 997 | "ball games": 0.5, 998 | "contest game": 0.5, 999 | "sports team": 0.5 1000 | } 1001 | } 1002 | -------------------------------------------------------------------------------- /tokensWithPeriods.txt: -------------------------------------------------------------------------------- 1 | A. 2 | Adj. 3 | Adm. 4 | Adv. 5 | Asst. 6 | B. 7 | Bart. 8 | Bldg. 9 | Brig. 10 | Bros. 11 | C. 12 | Capt. 13 | Cmdr. 14 | Col. 15 | Comdr. 16 | Con. 17 | Cpl. 18 | D. 19 | DR. 20 | Dr. 21 | E. 22 | Ens. 23 | F. 24 | G. 25 | Gen. 26 | Gov. 27 | H. 28 | Hon. 29 | Hosp. 30 | I. 31 | Insp. 32 | J. 33 | K. 34 | L. 35 | Lt. 36 | M. 37 | M. 38 | MM. 39 | MR. 40 | MRS. 41 | MS. 42 | Maj. 43 | Messrs. 44 | Mlle. 45 | Mme. 46 | Mr. 47 | Mrs. 48 | Ms. 49 | Msgr. 50 | N. 51 | O. 52 | Op. 53 | Ord. 54 | P. 55 | Pfc. 56 | Ph. 57 | Prof. 58 | Pvt. 59 | Q. 60 | R. 61 | Rep. 62 | Reps. 63 | Rev. 64 | S. 65 | Sen. 66 | Sens. 67 | Sfc. 68 | Sgt. 69 | Sr. 70 | St. 71 | Supt. 72 | T. 73 | U. 74 | V. 75 | W. 76 | X. 77 | Y. 78 | Z. 79 | v. 80 | vs. 81 | Inc. 82 | U.S. 83 | U.S.A. 
84 | --------------------------------------------------------------------------------