├── gutenrye
│   ├── gut.sh
│   ├── scala-proj
│   │   ├── rye.sbt
│   │   └── src
│   │       └── main
│   │           └── scala
│   │               ├── Rye.scala
│   │               ├── DFRye.scala
│   │               └── Stemmer.scala
│   ├── launch-gut-scala.sh
│   ├── gutdf.py
│   ├── output-sample.rtf
│   ├── gut2.py
│   └── porter2.py
├── README.md
└── LICENSE

--------------------------------------------------------------------------------
/gutenrye/gut.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | # you'll need to swap in your own paths as appropriate:
4 | 
5 | /Users/adam/Applications/spark-1.4.0-bin-hadoop2.6/bin/spark-submit /Users/adam/Documents/rye/gutenrye/gut2.py
6 | 
7 | 
8 | 

--------------------------------------------------------------------------------
/gutenrye/scala-proj/rye.sbt:
--------------------------------------------------------------------------------
1 | name := "Rye"
2 | 
3 | version := "1.0"
4 | 
5 | scalaVersion := "2.10.4"
6 | 
7 | libraryDependencies += "org.apache.spark" %% "spark-core" % "1.4.0"
8 | 
9 | libraryDependencies += "org.apache.spark" %% "spark-sql" % "1.4.0"

--------------------------------------------------------------------------------
/gutenrye/launch-gut-scala.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | # you'll need to swap in your own paths as appropriate:
4 | 
5 | /Users/adam/Applications/spark-1.4.0-bin-hadoop2.6/bin/spark-submit --master local[*] --class "DFRye" /Users/adam/Documents/rye/gutenrye/scala-proj/target/scala-2.10/rye_2.10-1.0.jar

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # rye
2 | 
3 | Experimental Apache Spark implementation of a subset of the functionality from http://tesserae.caset.buffalo.edu/
4 | 
5 | ### Some notes
6 | 
7 | - The stemmer is public domain, grabbed from a common Python lib
8 | - Stop words are an arbitrary list from the Internet
9 | 
10 | ### Logical ToDos
11 | 
12 | - Get a better, multilingual stemmer
13 | - Better stop word list
14 | - Add filtering of "stop bigrams"
15 |     + Real Tesserae has a scoring engine
16 |     + Bigrams common to a genre/language should probably be scored 0 and filtered out for performance
17 | 
18 | ### Plans
19 | 
20 | - Reimplement parts of the code using the Spark DataFrames/SQL API
21 |     + Hoping to squeeze a little free performance out of the Catalyst optimizer
22 |     + Not using a Python lambda in the "big filter" (i.e., the distance filter after the Cartesian join) might help performance
23 | 

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 | 
3 | Copyright (c) 2015
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /gutenrye/gutdf.py: -------------------------------------------------------------------------------- 1 | from pyspark import SparkContext 2 | from pyspark.sql import HiveContext 3 | from porter2 import stem 4 | import urllib2 5 | import re 6 | 7 | sc = SparkContext() 8 | sqlContext = HiveContext(sc) 9 | 10 | stop_words = ['a', 'i', 'an', 'as', 'able', 'about', 'above', 'according', 'accordingly', 'across', 'actually', 'after', 'afterwards', 'again', 'against', 'aint', 'all', 'allow', 'allows', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'am', 'among', 'amongst', 'an', 'and', 'another', 'any', 'anybody', 'anyhow', 'anyone', 'anything', 'anyway', 'anyways', 'anywhere', 'apart', 'appear', 'appreciate', 'appropriate', 'are', 'arent', 'around', 'as', 'aside', 'ask', 'asking', 'associated', 'at', 'available', 'away', 'awfully', 'be', 'became', 'because', 'become', 'becomes', 'becoming', 'been', 'before', 'beforehand', 'behind', 'being', 'believe', 'below', 'beside', 'besides', 'best', 'better', 'between', 'beyond', 'both', 'brief', 'but', 'by', 'cmon', 'cs', 'came', 'can', 'cant', 'cannot', 'cant', 'cause', 'causes', 'certain', 'certainly', 'changes', 'clearly', 'co', 'com', 'come', 'comes', 'concerning', 'consequently', 'consider', 'considering', 'contain', 'containing', 'contains', 'corresponding', 'could', 'couldnt', 'course', 'currently', 'definitely', 'described', 'despite', 'did', 'didnt', 'different', 'do', 'does', 'doesnt', 'doing', 'dont', 'done', 'down', 'downwards', 'during', 'each', 'edu', 'eg', 'eight', 'either', 'else', 'elsewhere', 'enough', 'entirely', 'especially', 'et', 'etc', 'even', 'ever', 'every', 'everybody', 'everyone', 'everything', 'everywhere', 'ex', 'exactly', 'example', 'except', 'far', 'few', 'fifth', 'first', 'five', 'followed', 'following', 'follows', 'for', 'former', 'formerly', 'forth', 'four', 'from', 'further', 'furthermore', 'get', 'gets', 'getting', 'given', 'gives', 'go', 'goes', 'going', 'gone', 'got', 'gotten', 'greetings', 'had', 'hadnt', 'happens', 'hardly', 'has', 'hasnt', 'have', 'havent', 'having', 'he', 'hes', 'hello', 'help', 'hence', 'her', 'here', 'heres', 'hereafter', 'hereby', 'herein', 'hereupon', 'hers', 'herself', 'hi', 'him', 'himself', 'his', 'hither', 'hopefully', 'how', 'howbeit', 'however', 'id', 'ill', 'im', 'ive', 'ie', 'if', 'ignored', 'immediate', 'in', 'inasmuch', 'inc', 'indeed', 'indicate', 'indicated', 'indicates', 'inner', 'insofar', 'instead', 'into', 'inward', 'is', 'isnt', 'it', 'itd', 'itll', 'its', 'its', 'itself', 'just', 'keep', 'keeps', 'kept', 'know', 'known', 'knows', 'last', 'lately', 'later', 'latter', 'latterly', 'least', 'less', 'lest', 'let', 'lets', 'like', 'liked', 'likely', 'little', 'look', 'looking', 'looks', 'ltd', 'mainly', 'many', 'may', 'maybe', 'me', 'mean', 'meanwhile', 'merely', 'might', 'more', 'moreover', 'most', 'mostly', 'much', 'must', 'my', 'myself', 'name', 'namely', 'nd', 'near', 
'nearly', 'necessary', 'need', 'needs', 'neither', 'never', 'nevertheless', 'new', 'next', 'nine', 'no', 'nobody', 'non', 'none', 'noone', 'nor', 'normally', 'not', 'nothing', 'novel', 'now', 'nowhere', 'obviously', 'of', 'off', 'often', 'oh', 'ok', 'okay', 'old', 'on', 'once', 'one', 'ones', 'only', 'onto', 'or', 'other', 'others', 'otherwise', 'ought', 'our', 'ours', 'ourselves', 'out', 'outside', 'over', 'overall', 'own', 'particular', 'particularly', 'per', 'perhaps', 'placed', 'please', 'plus', 'possible', 'presumably', 'probably', 'provides', 'que', 'quite', 'qv', 'rather', 'rd', 're', 'really', 'reasonably', 'regarding', 'regardless', 'regards', 'relatively', 'respectively', 'right', 'said', 'same', 'saw', 'say', 'saying', 'says', 'second', 'secondly', 'see', 'seeing', 'seem', 'seemed', 'seeming', 'seems', 'seen', 'self', 'selves', 'sensible', 'sent', 'serious', 'seriously', 'seven', 'several', 'shall', 'she', 'should', 'shouldnt', 'since', 'six', 'so', 'some', 'somebody', 'somehow', 'someone', 'something', 'sometime', 'sometimes', 'somewhat', 'somewhere', 'soon', 'sorry', 'specified', 'specify', 'specifying', 'still', 'sub', 'such', 'sup', 'sure', 'ts', 'take', 'taken', 'tell', 'tends', 'th', 'than', 'thank', 'thanks', 'thanx', 'that', 'thats', 'thats', 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'thence', 'there', 'theres', 'thereafter', 'thereby', 'therefore', 'therein', 'theres', 'thereupon', 'these', 'they', 'theyd', 'theyll', 'theyre', 'theyve', 'think', 'third', 'this', 'thorough', 'thoroughly', 'those', 'though', 'three', 'through', 'throughout', 'thru', 'thus', 'to', 'together', 'too', 'took', 'toward', 'towards', 'tried', 'tries', 'truly', 'try', 'trying', 'twice', 'two', 'un', 'under', 'unfortunately', 'unless', 'unlikely', 'until', 'unto', 'up', 'upon', 'us', 'use', 'used', 'useful', 'uses', 'using', 'usually', 'value', 'various', 'very', 'via', 'viz', 'vs', 'want', 'wants', 'was', 'wasnt', 'way', 'we', 'wed', 'well', 'were', 'weve', 'welcome', 'well', 'went', 'were', 'werent', 'what', 'whats', 'whatever', 'when', 'whence', 'whenever', 'where', 'wheres', 'whereafter', 'whereas', 'whereby', 'wherein', 'whereupon', 'wherever', 'whether', 'which', 'while', 'whither', 'who', 'whos', 'whoever', 'whole', 'whom', 'whose', 'why', 'will', 'willing', 'wish', 'with', 'within', 'without', 'wont', 'wonder', 'would', 'wouldnt', 'yes', 'yet', 'you', 'youd', 'youll', 'youre', 'youve', 'your', 'yours', 'yourself', 'yourselves', 'zero'] 11 | 12 | text_urls = ['https://dl.dropboxusercontent.com/u/105876471/tth.txt', 'https://dl.dropboxusercontent.com/u/105876471/coc.txt'] 13 | text1_url = text_urls[0] 14 | text2_url = text_urls[1] 15 | 16 | # Load from web: 17 | def wgetAndTokenize(url): 18 | response = urllib2.urlopen(url) 19 | data = response.read() 20 | return re.split('\s+', data); 21 | 22 | text1_tokens = wgetAndTokenize(text1_url) 23 | text2_tokens = wgetAndTokenize(text2_url) 24 | 25 | # make RDD with list of words along with their position in the original text (so we can find context later) 26 | text1_tokensRDD = sc.parallelize(text1_tokens).zipWithIndex() 27 | text2_tokensRDD = sc.parallelize(text2_tokens).zipWithIndex() 28 | #print text1_tokensRDD.take(5) 29 | 30 | # get rid of sequences of non-word chars, keep remaining strings with something in them, and not in stop list: 31 | text1_tokensRDD = text1_tokensRDD.map(lambda p:(re.sub('\W+', '', p[0]).lower(), p[1])).filter(lambda p:len(p[0])>0 and not p[0] in stop_words) 32 | print text1_tokensRDD.take(5) 33 | 
text2_tokensRDD = text2_tokensRDD.map(lambda p:(re.sub('\W+', '', p[0]).lower(), p[1])).filter(lambda p:len(p[0])>0 and not p[0] in stop_words) 34 | 35 | # stem the words using imported stem function (chosen arbitrarily) 36 | text1_stemmedRDD = text1_tokensRDD.map(lambda p:(stem(p[0]), p[1])) 37 | print text1_stemmedRDD.take(5) 38 | text2_stemmedRDD = text2_tokensRDD.map(lambda p:(stem(p[0]), p[1])) 39 | 40 | t1raw = text1_stemmedRDD.toDF(['entry', 'locus']) 41 | t1raw.show() 42 | 43 | t2raw = text2_stemmedRDD.toDF(['entry', 'locus']) 44 | 45 | t1raw.registerTempTable("t1raw") 46 | t2raw.registerTempTable("t2raw") 47 | 48 | bg1 = sqlContext.sql("select a.entry a1, b.entry b1, a.locus, b.locus from t1raw a cross join t1raw b where a.entry < b.entry and a.locus - b.locus < 7 and b.locus - a.locus < 7") 49 | bg1.show(4) 50 | 51 | bg2 = sqlContext.sql("select a.entry a2, b.entry b2, a.locus, b.locus from t2raw a cross join t2raw b where a.entry < b.entry and a.locus - b.locus < 7 and b.locus - a.locus < 7") 52 | 53 | bg2.show(4) 54 | 55 | bg = bg1.join(bg2, ((bg1.a1 == bg2.a2) & (bg1.b1 == bg2.b2))) 56 | bg.show(100) 57 | 58 | -------------------------------------------------------------------------------- /gutenrye/output-sample.rtf: -------------------------------------------------------------------------------- 1 | {\rtf1\ansi\ansicpg1252\cocoartf1348\cocoasubrtf170 2 | {\fonttbl\f0\fmodern\fcharset0 CourierNewPSMT;} 3 | {\colortbl;\red255\green255\blue255;} 4 | \margl1440\margr1440\vieww17260\viewh9800\viewkind0 5 | \pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural 6 | 7 | \f0\fs20 \cf0 \CocoaLigature0 ('earth', 'thing')\ 8 | text 1 loci\ 9 | (57, 61): heaven and in the earth. I heard many things in hell. How,\ 10 | text 2 loci\ 11 | (5572, 5570): alone among the conscious things of earth, for shapes came\ 12 | (5799, 5795): been aeons when other Things ruled on the earth, and They had\ 13 | (11120, 11123): that was not of earth the titan Thing from the stars\ 14 | \ 15 | ('acut', 'thing')\ 16 | text 1 loci\ 17 | (46, 50): the sense of hearing acute. I heard all things in the heaven\ 18 | text 2 loci\ 19 | (2528, 2534): of the dreamers confessed acute fear of the gigantic nameless thing visible toward the\ 20 | \ 21 | ('dead', 'stone')\ 22 | text 1 loci\ 23 | (1435, 1433): corpse. Yes, he was stone, stone dead. I placed my\ 24 | (1435, 1434): Yes, he was stone, stone dead. I placed my\ 25 | (1456, 1455): no pulsation. He was stone dead. His eve would\ 26 | text 2 loci\ 27 | (7253, 7258): ritual which told of dead Cthulhu's dream-vigil in his stone vault at R'lyeh,\ 28 | \ 29 | ('examin', 'remov')\ 30 | text 1 loci\ 31 | (1427, 1423): man was dead. I removed the bed and examined the corpse. Yes,\ 32 | text 2 loci\ 33 | (5307, 5301): of course, was carefully removed and carried back by Legrasse. Examined at headquarters after\ 34 | \ 35 | ('felt', 'night')\ 36 | text 1 loci\ 37 | (544, 541): mine. Never before that night had I felt the extent of\ 38 | text 2 loci\ 39 | (1501, 1496): slight earthquake tremor the night before, the most considerable felt in New England\ 40 | \ 41 | ('men', 'nois')\ 42 | text 1 loci\ 43 | (2004, 2007): the observations of the men --but the noise steadily increased. Oh\ 44 | text 2 loci\ 45 | (4733, 4729): do justice to the noises heard by Legrasse's men as they ploughed\ 46 | \ 47 | ('felt', 'thing')\ 48 | text 1 loci\ 49 | (1826, 1821): they chatted of familiar things. 
But, ere long, I felt myself getting pale\ 50 | text 2 loci\ 51 | (10284, 10287): Now an unlettered seaman felt the same thing whilst gazing at\ 52 | \ 53 | ('dream', 'thought')\ 54 | text 1 loci\ 55 | (579, 585): he not even to dream of my secret deeds or thoughts. I fairly chuckled\ 56 | text 2 loci\ 57 | (10018, 10012): after cycles incalculable, the thoughts that spread fear to the dreams of the sensitive\ 58 | \ 59 | ('black', 'dark')\ 60 | text 1 loci\ 61 | (621, 627): His room was as black as pitch with the thick darkness, (for the shutters\ 62 | text 2 loci\ 63 | (10682, 10685): upset. The aperture was black with a darkness almost material. That\ 64 | \ 65 | ('dream', 'secret')\ 66 | text 1 loci\ 67 | (579, 582): he not even to dream of my secret deeds or thoughts.\ 68 | text 2 loci\ 69 | (5454, 5452): bodies had told their secrets in dreams to the first\ 70 | \ 71 | ('face', 'man')\ 72 | text 1 loci\ 73 | (1103, 1102): else of the old man's face or person: for\ 74 | text 2 loci\ 75 | (4893, 4897): swamp water on the face of the fainting man, and all stood\ 76 | \ 77 | ('death', 'night')\ 78 | text 1 loci\ 79 | (740, 734): as I have done, night after night, hearkening to the death watches in the\ 80 | (740, 736): have done, night after night, hearkening to the death watches in the\ 81 | text 2 loci\ 82 | (11178, 11182): laughing at intervals till death found him one night in the cabin\ 83 | \ 84 | ('found', 'night')\ 85 | text 1 loci\ 86 | (423, 417): seven long nights --every night just at midnight --but I found the eye always\ 87 | text 2 loci\ 88 | (11179, 11182): at intervals till death found him one night in the cabin\ 89 | \ 90 | ('heard', 'suspect')\ 91 | text 1 loci\ 92 | (2071, 2073): God! --no, no! They heard! --they suspected! --they knew! --they\ 93 | text 2 loci\ 94 | (6757, 6752): natural; though privately I suspected young Wilcox of having heard of the cult\ 95 | \ 96 | ('disturb', 'man')\ 97 | text 1 loci\ 98 | (329, 332): that I might not disturb the old man's sleep. It took\ 99 | text 2 loci\ 100 | (721, 725): responsible for this apparent disturbance of an old man's peace of mind.\ 101 | \ 102 | ('heard', 'length')\ 103 | text 1 loci\ 104 | (1409, 1414): it would not be heard through the wall. At length it ceased. The\ 105 | text 2 loci\ 106 | (10756, 10750): was intolerable, and at length the quick-eared Hawkins thought he heard a nasty, slopping\ 107 | \ 108 | ('end', 'made')\ 109 | text 1 loci\ 110 | (1591, 1589): ha! When I had made an end of these labors,\ 111 | text 2 loci\ 112 | (5246, 5242): fired, and escapes were made; but in the end Legrasse was able\ 113 | \ 114 | ('continu', 'feel')\ 115 | text 1 loci\ 116 | (1875, 1872): get rid of the feeling: but it continued and gained definiteness\ 117 | text 2 loci\ 118 | (2441, 2443): That is why I continued to feel that Wilcox, somehow\ 119 | \ 120 | ('beat', 'muffl')\ 121 | text 1 loci\ 122 | (1393, 1397): many minutes, the heart beat on with a muffled sound. This, however,\ 123 | text 2 loci\ 124 | (4483, 4482): of bobbing lanterns. The muffled beat of tom-toms was\ 125 | \ 126 | ('god', 'suspect')\ 127 | text 1 loci\ 128 | (2067, 2073): they heard not? Almighty God! --no, no! They heard! --they suspected! --they knew! 
--they\ 129 | text 2 loci\ 130 | (10044, 10042): this Johansen did not suspect, but God knows he soon\ 131 | \ 132 | ('bed', 'sudden')\ 133 | text 1 loci\ 134 | (602, 603): he moved on the bed suddenly, as if startled.\ 135 | text 2 loci\ 136 | (2058, 2052): trace of Wilcox's malady suddenly ceased. He sat upright in bed, astonished to find\ 137 | \ 138 | ('door', 'knock')\ 139 | text 1 loci\ 140 | (1616, 1612): hour, there came a knocking at the street door. I went down\ 141 | text 2 loci\ 142 | (9410, 9404): trip by taxicab, and knocked with palpitant heart at the door of a neat\ 143 | \ 144 | ('heard', 'sound')\ 145 | text 1 loci\ 146 | (1331, 1328): anxiety seized me --the sound would be heard by a neighbour!\ 147 | text 2 loci\ 148 | (10756, 10760): quick-eared Hawkins thought he heard a nasty, slopping sound down there. Everyone\ 149 | \ 150 | ('dream', 'man')\ 151 | text 1 loci\ 152 | (1708, 1711): my own in a dream. The old man, I mentioned, was\ 153 | text 2 loci\ 154 | (6693, 6690): of a sensitive young man who had dreamed not only the\ 155 | \ 156 | ('awak', 'lie')\ 157 | text 1 loci\ 158 | (853, 852): that he had been lying awake ever since the\ 159 | text 2 loci\ 160 | (6017, 6016): and They could only lie awake in the dark\ 161 | \ 162 | ('dead', 'man')\ 163 | text 1 loci\ 164 | (1421, 1419): it ceased. The old man was dead. I removed the\ 165 | text 2 loci\ 166 | (7967, 7968): Tow. One Survivor and Dead Man Found Aboard. Tale\ 167 | (8045, 8046): one living and one dead man aboard. The Vigilant\ 168 | (8102, 8097): half-delirious condition and one man who had evidently been dead for more than\ 169 | \ 170 | ('man', 'week')\ 171 | text 1 loci\ 172 | (243, 248): kinder to the old man than during the whole week before I killed\ 173 | text 2 loci\ 174 | (8110, 8107): for more than a week. The living man was clutching a\ 175 | \ 176 | ('excit', 'strang')\ 177 | text 1 loci\ 178 | (1293, 1288): that old house, so strange a noise as this excited me to uncontrollable\ 179 | text 2 loci\ 180 | (1253, 1257): and had from chidhood excited attention through the strange stories and odd\ 181 | (7023, 7029): for my uncle had excited his curiosity in probing his strange dreams, yet had\ 182 | \ 183 | ('long', 'thing')\ 184 | text 1 loci\ 185 | (1824, 1821): they chatted of familiar things. But, ere long, I felt myself\ 186 | text 2 loci\ 187 | (3436, 3441): and fore feet, and long, narrow wings behind. 
This thing, which seemed instinct\ 188 | (9234, 9240): and I studied it long and well, finding it a thing of balefully exquisite\ 189 | \ 190 | ('man', 'suspect')\ 191 | text 1 loci\ 192 | (496, 499): a very profound old man, indeed, to suspect that every night,\ 193 | text 2 loci\ 194 | (3718, 3723): problem, there was one man in that gathering who suspected a touch of} -------------------------------------------------------------------------------- /gutenrye/scala-proj/src/main/scala/Rye.scala: -------------------------------------------------------------------------------- 1 | import org.apache.spark.SparkContext 2 | import org.apache.spark.SparkContext._ 3 | import org.apache.spark.SparkConf 4 | 5 | object Rye { 6 | def main(args: Array[String]) { 7 | val conf = new SparkConf().setAppName("Rye") 8 | val sc = new SparkContext(conf) 9 | // Databricks notebook source exported at Thu, 6 Aug 2015 20:48:55 UTC 10 | val text_urls = Array("https://dl.dropboxusercontent.com/u/105876471/tth.txt", "https://dl.dropboxusercontent.com/u/105876471/coc.txt") 11 | //val text_urls = Array("https://www.gutenberg.org/cache/epub/77/pg77.txt", "https://www.gutenberg.org/ebooks/2701.txt.utf-8") 12 | val t1_url = text_urls(0) 13 | val t2_url = text_urls(1) 14 | 15 | import scala.io.Source 16 | 17 | val t1_tokens = Source.fromURL(t1_url).mkString.split("\\s+") 18 | val t2_tokens = Source.fromURL(t2_url).mkString.split("\\s+") 19 | 20 | // COMMAND ---------- 21 | 22 | val t1_tokensRDD = sc.parallelize(t1_tokens).zipWithIndex() 23 | val t2_tokensRDD = sc.parallelize(t2_tokens).zipWithIndex() 24 | println(t1_tokensRDD.take(5)) 25 | println(t2_tokensRDD.take(5).mkString) 26 | 27 | 28 | // COMMAND ---------- 29 | 30 | val stop_words = Array("a", "i", "an", "as", "able", "about", "above", "according", "accordingly", "across", "actually", "after", "afterwards", "again", "against", "aint", "all", "allow", "allows", "almost", "alone", "along", "already", "also", "although", "always", "am", "among", "amongst", "an", "and", "another", "any", "anybody", "anyhow", "anyone", "anything", "anyway", "anyways", "anywhere", "apart", "appear", "appreciate", "appropriate", "are", "arent", "around", "as", "aside", "ask", "asking", "associated", "at", "available", "away", "awfully", "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", "behind", "being", "believe", "below", "beside", "besides", "best", "better", "between", "beyond", "both", "brief", "but", "by", "cmon", "cs", "came", "can", "cant", "cannot", "cant", "cause", "causes", "certain", "certainly", "changes", "clearly", "co", "com", "come", "comes", "concerning", "consequently", "consider", "considering", "contain", "containing", "contains", "corresponding", "could", "couldnt", "course", "currently", "definitely", "described", "despite", "did", "didnt", "different", "do", "does", "doesnt", "doing", "dont", "done", "down", "downwards", "during", "each", "edu", "eg", "eight", "either", "else", "elsewhere", "enough", "entirely", "especially", "et", "etc", "even", "ever", "every", "everybody", "everyone", "everything", "everywhere", "ex", "exactly", "example", "except", "far", "few", "fifth", "first", "five", "followed", "following", "follows", "for", "former", "formerly", "forth", "four", "from", "further", "furthermore", "get", "gets", "getting", "given", "gives", "go", "goes", "going", "gone", "got", "gotten", "greetings", "had", "hadnt", "happens", "hardly", "has", "hasnt", "have", "havent", "having", "he", "hes", "hello", "help", 
"hence", "her", "here", "heres", "hereafter", "hereby", "herein", "hereupon", "hers", "herself", "hi", "him", "himself", "his", "hither", "hopefully", "how", "howbeit", "however", "id", "ill", "im", "ive", "ie", "if", "ignored", "immediate", "in", "inasmuch", "inc", "indeed", "indicate", "indicated", "indicates", "inner", "insofar", "instead", "into", "inward", "is", "isnt", "it", "itd", "itll", "its", "its", "itself", "just", "keep", "keeps", "kept", "know", "known", "knows", "last", "lately", "later", "latter", "latterly", "least", "less", "lest", "let", "lets", "like", "liked", "likely", "little", "look", "looking", "looks", "ltd", "mainly", "many", "may", "maybe", "me", "mean", "meanwhile", "merely", "might", "more", "moreover", "most", "mostly", "much", "must", "my", "myself", "name", "namely", "nd", "near", "nearly", "necessary", "need", "needs", "neither", "never", "nevertheless", "new", "next", "nine", "no", "nobody", "non", "none", "noone", "nor", "normally", "not", "nothing", "novel", "now", "nowhere", "obviously", "of", "off", "often", "oh", "ok", "okay", "old", "on", "once", "one", "ones", "only", "onto", "or", "other", "others", "otherwise", "ought", "our", "ours", "ourselves", "out", "outside", "over", "overall", "own", "particular", "particularly", "per", "perhaps", "placed", "please", "plus", "possible", "presumably", "probably", "provides", "que", "quite", "qv", "rather", "rd", "re", "really", "reasonably", "regarding", "regardless", "regards", "relatively", "respectively", "right", "said", "same", "saw", "say", "saying", "says", "second", "secondly", "see", "seeing", "seem", "seemed", "seeming", "seems", "seen", "self", "selves", "sensible", "sent", "serious", "seriously", "seven", "several", "shall", "she", "should", "shouldnt", "since", "six", "so", "some", "somebody", "somehow", "someone", "something", "sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry", "specified", "specify", "specifying", "still", "sub", "such", "sup", "sure", "ts", "take", "taken", "tell", "tends", "th", "than", "thank", "thanks", "thanx", "that", "thats", "thats", "the", "their", "theirs", "them", "themselves", "then", "thence", "there", "theres", "thereafter", "thereby", "therefore", "therein", "theres", "thereupon", "these", "they", "theyd", "theyll", "theyre", "theyve", "think", "third", "this", "thorough", "thoroughly", "those", "though", "three", "through", "throughout", "thru", "thus", "to", "together", "too", "took", "toward", "towards", "tried", "tries", "truly", "try", "trying", "twice", "two", "un", "under", "unfortunately", "unless", "unlikely", "until", "unto", "up", "upon", "us", "use", "used", "useful", "uses", "using", "usually", "value", "various", "very", "via", "viz", "vs", "want", "wants", "was", "wasnt", "way", "we", "wed", "well", "were", "weve", "welcome", "well", "went", "were", "werent", "what", "whats", "whatever", "when", "whence", "whenever", "where", "wheres", "whereafter", "whereas", "whereby", "wherein", "whereupon", "wherever", "whether", "which", "while", "whither", "who", "whos", "whoever", "whole", "whom", "whose", "why", "will", "willing", "wish", "with", "within", "without", "wont", "wonder", "would", "wouldnt", "yes", "yet", "you", "youd", "youll", "youre", "youve", "your", "yours", "yourself", "yourselves", "zero") 31 | 32 | // COMMAND ---------- 33 | 34 | // remove normalize to lowercase, remove stopwords 35 | val t1_cleaned_tokensRDD = t1_tokensRDD 36 | .map(p=>(p._1.replaceAll("\\W+", "").toLowerCase, p._2)) 37 | .filter(p => { p._1.length>1 
&& !(stop_words contains p._1) })
38 | 
39 |     println(t1_cleaned_tokensRDD.take(15).mkString)
40 |     val t2_cleaned_tokensRDD = t2_tokensRDD
41 |       .map(p=>(p._1.replaceAll("\\W+", "").toLowerCase, p._2))
42 |       .filter(p => { p._1.length>1 && !(stop_words contains p._1) })
43 | 
44 |     println(t2_cleaned_tokensRDD.take(15).mkString)
45 |     //t2_cleaned_tokensRDD.collect()
46 | 
47 |     val t1_stemmedRDD = t1_cleaned_tokensRDD.map(p=>(Stemmer.stem(p._1), p._2))
48 |     println(t1_stemmedRDD.take(5).mkString)
49 |     val t2_stemmedRDD = t2_cleaned_tokensRDD.map(p=>(Stemmer.stem(p._1), p._2))
50 | 
51 |     // COMMAND ----------
52 | 
53 |     val t1_concRDD = t1_stemmedRDD.groupByKey()
54 |     t1_concRDD.take(5).foreach(ex=> {
55 |       println("key " + ex._1 + " -- loci " + ex._2.mkString(","))
56 |     })
57 |     val t2_concRDD = t2_stemmedRDD.groupByKey()
58 | 
59 | 
60 |     // COMMAND ----------
61 | 
62 |     val t1_bigram = t1_concRDD.cartesian(t1_concRDD).filter(p=>(p._1._1 < p._2._1))
63 |     val t2_bigram = t2_concRDD.cartesian(t2_concRDD).filter(p=>(p._1._1 < p._2._1))
64 | 
65 |     // COMMAND ----------
66 | 
67 |     // toss all pairs which never occur within "distance" of each other
68 |     // (mirrors findBigramsWithin in gut2.py):
69 |     val distance = 7
70 |     def findBigramsWithin(pair: ((String, Iterable[Long]), (String, Iterable[Long]))) = {
71 |       val (p, q) = pair
72 |       ((p._1, q._1),
73 |         for {
74 |           loc1 <- p._2
75 |           loc2 <- q._2
76 |           if (Math.abs(loc1 - loc2) < distance)
77 |         } yield (loc1, loc2))
78 |     }
79 | 
80 |     // COMMAND ----------
81 |     val t1_bigram_loci = t1_bigram.map(findBigramsWithin).filter(p=>p._2.size>0)
82 |     println(t1_bigram_loci.take(10).mkString(","))
83 | 
84 |     val t2_bigram_loci = t2_bigram.map(findBigramsWithin).filter(p=>p._2.size>0)
85 | 
86 |     // COMMAND ----------
87 | 
88 |     val joinedRDD = t1_bigram_loci.join(t2_bigram_loci)
89 |     for (entry <- joinedRDD.collect) {
90 |       println(entry._1 + "\n\t" + entry._2._1 + "\n\t" + entry._2._2 + "\n\n")
91 |     }
92 | 
93 | 
94 |   }
95 | }

--------------------------------------------------------------------------------
/gutenrye/gut2.py:
--------------------------------------------------------------------------------
1 | from pyspark import SparkContext
2 | from porter2 import stem
3 | import urllib2
4 | import re
5 | 
6 | sc = SparkContext()
7 | 
8 | #text_urls = ['https://www.gutenberg.org/cache/epub/77/pg77.txt', 'http://www.gutenberg.org/cache/epub/2701/pg2701.txt']
9 | text_urls = ['https://dl.dropboxusercontent.com/u/105876471/tth.txt', 'https://dl.dropboxusercontent.com/u/105876471/coc.txt']
10 | text1_url = text_urls[0]
11 | text2_url = text_urls[1]
12 | 
13 | # Load from web:
14 | def wgetAndTokenize(url):
15 |     response = urllib2.urlopen(url)
16 |     data = response.read()
17 |     return re.split('\s+', data)
18 | 
19 | text1_tokens = wgetAndTokenize(text1_url)
20 | text2_tokens = wgetAndTokenize(text2_url)
21 | 
22 | # make RDD with list of words along with their position in the original text (so we can find context later)
23 | text1_tokensRDD = sc.parallelize(text1_tokens).zipWithIndex()
24 | text2_tokensRDD = sc.parallelize(text2_tokens).zipWithIndex()
25 | #print text1_tokensRDD.take(5)
26 | 
27 | # define a list of stop words (chosen fairly arbitrarily)
28 | stop_words = ['a', 'i', 'an', 'as', 'able', 'about', 'above', 'according', 'accordingly', 'across', 'actually', 'after', 'afterwards', 'again', 'against', 'aint', 'all', 'allow', 'allows', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'am', 'among', 'amongst', 'an', 'and', 'another', 'any', 'anybody', 'anyhow', 'anyone', 'anything', 'anyway', 'anyways', 'anywhere', 'apart', 'appear', 'appreciate', 'appropriate', 'are', 'arent', 'around', 'as', 'aside', 'ask', 'asking', 'associated', 'at', 'available', 'away', 'awfully', 'be', 'became', 'because', 'become', 'becomes', 'becoming', 'been', 'before', 'beforehand', 'behind', 'being', 'believe', 'below', 'beside', 'besides', 'best', 'better', 'between', 'beyond', 'both', 'brief', 'but', 'by', 'cmon', 'cs', 'came', 'can', 'cant', 'cannot', 'cant', 'cause', 'causes', 'certain', 'certainly', 'changes', 'clearly', 'co', 'com', 'come', 'comes', 'concerning', 'consequently', 'consider',
'considering', 'contain', 'containing', 'contains', 'corresponding', 'could', 'couldnt', 'course', 'currently', 'definitely', 'described', 'despite', 'did', 'didnt', 'different', 'do', 'does', 'doesnt', 'doing', 'dont', 'done', 'down', 'downwards', 'during', 'each', 'edu', 'eg', 'eight', 'either', 'else', 'elsewhere', 'enough', 'entirely', 'especially', 'et', 'etc', 'even', 'ever', 'every', 'everybody', 'everyone', 'everything', 'everywhere', 'ex', 'exactly', 'example', 'except', 'far', 'few', 'fifth', 'first', 'five', 'followed', 'following', 'follows', 'for', 'former', 'formerly', 'forth', 'four', 'from', 'further', 'furthermore', 'get', 'gets', 'getting', 'given', 'gives', 'go', 'goes', 'going', 'gone', 'got', 'gotten', 'greetings', 'had', 'hadnt', 'happens', 'hardly', 'has', 'hasnt', 'have', 'havent', 'having', 'he', 'hes', 'hello', 'help', 'hence', 'her', 'here', 'heres', 'hereafter', 'hereby', 'herein', 'hereupon', 'hers', 'herself', 'hi', 'him', 'himself', 'his', 'hither', 'hopefully', 'how', 'howbeit', 'however', 'id', 'ill', 'im', 'ive', 'ie', 'if', 'ignored', 'immediate', 'in', 'inasmuch', 'inc', 'indeed', 'indicate', 'indicated', 'indicates', 'inner', 'insofar', 'instead', 'into', 'inward', 'is', 'isnt', 'it', 'itd', 'itll', 'its', 'its', 'itself', 'just', 'keep', 'keeps', 'kept', 'know', 'known', 'knows', 'last', 'lately', 'later', 'latter', 'latterly', 'least', 'less', 'lest', 'let', 'lets', 'like', 'liked', 'likely', 'little', 'look', 'looking', 'looks', 'ltd', 'mainly', 'many', 'may', 'maybe', 'me', 'mean', 'meanwhile', 'merely', 'might', 'more', 'moreover', 'most', 'mostly', 'much', 'must', 'my', 'myself', 'name', 'namely', 'nd', 'near', 'nearly', 'necessary', 'need', 'needs', 'neither', 'never', 'nevertheless', 'new', 'next', 'nine', 'no', 'nobody', 'non', 'none', 'noone', 'nor', 'normally', 'not', 'nothing', 'novel', 'now', 'nowhere', 'obviously', 'of', 'off', 'often', 'oh', 'ok', 'okay', 'old', 'on', 'once', 'one', 'ones', 'only', 'onto', 'or', 'other', 'others', 'otherwise', 'ought', 'our', 'ours', 'ourselves', 'out', 'outside', 'over', 'overall', 'own', 'particular', 'particularly', 'per', 'perhaps', 'placed', 'please', 'plus', 'possible', 'presumably', 'probably', 'provides', 'que', 'quite', 'qv', 'rather', 'rd', 're', 'really', 'reasonably', 'regarding', 'regardless', 'regards', 'relatively', 'respectively', 'right', 'said', 'same', 'saw', 'say', 'saying', 'says', 'second', 'secondly', 'see', 'seeing', 'seem', 'seemed', 'seeming', 'seems', 'seen', 'self', 'selves', 'sensible', 'sent', 'serious', 'seriously', 'seven', 'several', 'shall', 'she', 'should', 'shouldnt', 'since', 'six', 'so', 'some', 'somebody', 'somehow', 'someone', 'something', 'sometime', 'sometimes', 'somewhat', 'somewhere', 'soon', 'sorry', 'specified', 'specify', 'specifying', 'still', 'sub', 'such', 'sup', 'sure', 'ts', 'take', 'taken', 'tell', 'tends', 'th', 'than', 'thank', 'thanks', 'thanx', 'that', 'thats', 'thats', 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'thence', 'there', 'theres', 'thereafter', 'thereby', 'therefore', 'therein', 'theres', 'thereupon', 'these', 'they', 'theyd', 'theyll', 'theyre', 'theyve', 'think', 'third', 'this', 'thorough', 'thoroughly', 'those', 'though', 'three', 'through', 'throughout', 'thru', 'thus', 'to', 'together', 'too', 'took', 'toward', 'towards', 'tried', 'tries', 'truly', 'try', 'trying', 'twice', 'two', 'un', 'under', 'unfortunately', 'unless', 'unlikely', 'until', 'unto', 'up', 'upon', 'us', 'use', 'used', 'useful', 'uses', 'using', 
'usually', 'value', 'various', 'very', 'via', 'viz', 'vs', 'want', 'wants', 'was', 'wasnt', 'way', 'we', 'wed', 'well', 'were', 'weve', 'welcome', 'well', 'went', 'were', 'werent', 'what', 'whats', 'whatever', 'when', 'whence', 'whenever', 'where', 'wheres', 'whereafter', 'whereas', 'whereby', 'wherein', 'whereupon', 'wherever', 'whether', 'which', 'while', 'whither', 'who', 'whos', 'whoever', 'whole', 'whom', 'whose', 'why', 'will', 'willing', 'wish', 'with', 'within', 'without', 'wont', 'wonder', 'would', 'wouldnt', 'yes', 'yet', 'you', 'youd', 'youll', 'youre', 'youve', 'your', 'yours', 'yourself', 'yourselves', 'zero']
29 | 
30 | # get rid of sequences of non-word chars, keep remaining strings with something in them, and not in stop list:
31 | text1_tokensRDD = text1_tokensRDD.map(lambda p:(re.sub('\W+', '', p[0]).lower(), p[1])).filter(lambda p:len(p[0])>0 and not p[0] in stop_words)
32 | #print text1_tokensRDD.take(5)
33 | text2_tokensRDD = text2_tokensRDD.map(lambda p:(re.sub('\W+', '', p[0]).lower(), p[1])).filter(lambda p:len(p[0])>0 and not p[0] in stop_words)
34 | 
35 | # stem the words using imported stem function (chosen arbitrarily)
36 | text1_stemmedRDD = text1_tokensRDD.map(lambda p:(stem(p[0]), p[1]))
37 | #print text1_stemmedRDD.take(5)
38 | text2_stemmedRDD = text2_tokensRDD.map(lambda p:(stem(p[0]), p[1]))
39 | 
40 | # for each word, get the list of loci:
41 | text1_concRDD = text1_stemmedRDD.groupByKey()
42 | #print text1_concRDD.take(5)
43 | text2_concRDD = text2_stemmedRDD.groupByKey()
44 | 
45 | # find every pair of words (brute force)
46 | text1_bigrams = text1_concRDD.cartesian(text1_concRDD)
47 | #print text1_bigrams.first()
48 | text2_bigrams = text2_concRDD.cartesian(text2_concRDD)
49 | 
50 | # eliminate transposed pairs, and dupes -- keep ("a","b"); not ("b", "a") or ("a", "a") etc
51 | text1_bigrams = text1_bigrams.filter(lambda p:p[0][0] < p[1][0])
52 | #print text1_bigrams.first()
53 | text2_bigrams = text2_bigrams.filter(lambda p:p[0][0] < p[1][0])
54 | 
55 | # toss all pairs which never occur within "distance" of each other:
56 | distance = 7
57 | def findBigramsWithin(pair):
58 |     p,q = pair
59 |     return ((p[0],q[0]), [(loc1, loc2) for loc1 in p[1] for loc2 in q[1] if abs(loc1-loc2) < distance])
60 | 
61 | text1_bigram_loci = text1_bigrams.map(findBigramsWithin).filter(lambda p:len(p[1])>0)
62 | #print text1_bigram_loci.take(10)
63 | text2_bigram_loci = text2_bigrams.map(findBigramsWithin).filter(lambda p:len(p[1])>0)
64 | 
65 | # "match" bigram+loci from text1 with same bigram (and other loci) from text2 (keeping only those that occur in both)
66 | joined = text1_bigram_loci.join(text2_bigram_loci)
67 | 
68 | # make it run and print a report
69 | for bigram in joined.collect():
70 |     print "\n"+str(bigram[0])
71 |     print "\ttext 1 loci"
72 |     for locus in bigram[1][0]:
73 |         lo,hi = min(locus[0],locus[1]),max(locus[0],locus[1])
74 |         print "\t\t" + str(locus) + ": " + " ".join(text1_tokens[lo-4:hi+4])
75 |     print "\ttext 2 loci"
76 |     for locus in bigram[1][1]:
77 |         lo,hi = min(locus[0],locus[1]),max(locus[0],locus[1])
78 |         print "\t\t" + str(locus) + ": " + " ".join(text2_tokens[lo-4:hi+4])
79 | 

--------------------------------------------------------------------------------
/gutenrye/scala-proj/src/main/scala/DFRye.scala:
--------------------------------------------------------------------------------
1 | import org.apache.spark.SparkContext
2 | import org.apache.spark.SparkContext._
3 | import org.apache.spark.sql._
4 | import org.apache.spark.SparkConf
5 | 
6 | object DFRye {
7 |   def main(args: Array[String]) {
8 |     val conf = new SparkConf().setAppName("Rye")
9 |     val sc = 
new SparkContext(conf) 10 | 11 | val sqlContext = new org.apache.spark.sql.SQLContext(sc) 12 | import sqlContext.implicits._ 13 | import org.apache.spark.sql.functions._ 14 | 15 | // Databricks notebook source exported at Fri, 7 Aug 2015 05:52:33 UTC 16 | val text_urls = Array("https://dl.dropboxusercontent.com/u/105876471/tth.txt", "https://dl.dropboxusercontent.com/u/105876471/coc.txt") 17 | // val text_urls = Array("https://www.gutenberg.org/cache/epub/77/pg77.txt", "https://www.gutenberg.org/ebooks/2701.txt.utf-8") 18 | val t1_url = text_urls(0) 19 | val t2_url = text_urls(1) 20 | 21 | import scala.io.Source 22 | val t1_tokens = Source.fromURL(t1_url).mkString.split("\\s+") 23 | val t2_tokens = Source.fromURL(t2_url).mkString.split("\\s+") 24 | 25 | // COMMAND ---------- 26 | 27 | val t1_tokensRDD = sc.parallelize(t1_tokens).zipWithIndex() 28 | val t2_tokensRDD = sc.parallelize(t2_tokens).zipWithIndex() 29 | println(t1_tokensRDD.take(5)) 30 | println(t2_tokensRDD.take(5).mkString) 31 | 32 | // COMMAND ---------- 33 | 34 | val stop_words = Array("a", "i", "an", "as", "able", "about", "above", "according", "accordingly", "across", "actually", "after", "afterwards", "again", "against", "aint", "all", "allow", "allows", "almost", "alone", "along", "already", "also", "although", "always", "am", "among", "amongst", "an", "and", "another", "any", "anybody", "anyhow", "anyone", "anything", "anyway", "anyways", "anywhere", "apart", "appear", "appreciate", "appropriate", "are", "arent", "around", "as", "aside", "ask", "asking", "associated", "at", "available", "away", "awfully", "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", "behind", "being", "believe", "below", "beside", "besides", "best", "better", "between", "beyond", "both", "brief", "but", "by", "cmon", "cs", "came", "can", "cant", "cannot", "cant", "cause", "causes", "certain", "certainly", "changes", "clearly", "co", "com", "come", "comes", "concerning", "consequently", "consider", "considering", "contain", "containing", "contains", "corresponding", "could", "couldnt", "course", "currently", "definitely", "described", "despite", "did", "didnt", "different", "do", "does", "doesnt", "doing", "dont", "done", "down", "downwards", "during", "each", "edu", "eg", "eight", "either", "else", "elsewhere", "enough", "entirely", "especially", "et", "etc", "even", "ever", "every", "everybody", "everyone", "everything", "everywhere", "ex", "exactly", "example", "except", "far", "few", "fifth", "first", "five", "followed", "following", "follows", "for", "former", "formerly", "forth", "four", "from", "further", "furthermore", "get", "gets", "getting", "given", "gives", "go", "goes", "going", "gone", "got", "gotten", "greetings", "had", "hadnt", "happens", "hardly", "has", "hasnt", "have", "havent", "having", "he", "hes", "hello", "help", "hence", "her", "here", "heres", "hereafter", "hereby", "herein", "hereupon", "hers", "herself", "hi", "him", "himself", "his", "hither", "hopefully", "how", "howbeit", "however", "id", "ill", "im", "ive", "ie", "if", "ignored", "immediate", "in", "inasmuch", "inc", "indeed", "indicate", "indicated", "indicates", "inner", "insofar", "instead", "into", "inward", "is", "isnt", "it", "itd", "itll", "its", "its", "itself", "just", "keep", "keeps", "kept", "know", "known", "knows", "last", "lately", "later", "latter", "latterly", "least", "less", "lest", "let", "lets", "like", "liked", "likely", "little", "look", "looking", "looks", "ltd", "mainly", "many", "may", "maybe", "me", 
"mean", "meanwhile", "merely", "might", "more", "moreover", "most", "mostly", "much", "must", "my", "myself", "name", "namely", "nd", "near", "nearly", "necessary", "need", "needs", "neither", "never", "nevertheless", "new", "next", "nine", "no", "nobody", "non", "none", "noone", "nor", "normally", "not", "nothing", "novel", "now", "nowhere", "obviously", "of", "off", "often", "oh", "ok", "okay", "old", "on", "once", "one", "ones", "only", "onto", "or", "other", "others", "otherwise", "ought", "our", "ours", "ourselves", "out", "outside", "over", "overall", "own", "particular", "particularly", "per", "perhaps", "placed", "please", "plus", "possible", "presumably", "probably", "provides", "que", "quite", "qv", "rather", "rd", "re", "really", "reasonably", "regarding", "regardless", "regards", "relatively", "respectively", "right", "said", "same", "saw", "say", "saying", "says", "second", "secondly", "see", "seeing", "seem", "seemed", "seeming", "seems", "seen", "self", "selves", "sensible", "sent", "serious", "seriously", "seven", "several", "shall", "she", "should", "shouldnt", "since", "six", "so", "some", "somebody", "somehow", "someone", "something", "sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry", "specified", "specify", "specifying", "still", "sub", "such", "sup", "sure", "ts", "take", "taken", "tell", "tends", "th", "than", "thank", "thanks", "thanx", "that", "thats", "thats", "the", "their", "theirs", "them", "themselves", "then", "thence", "there", "theres", "thereafter", "thereby", "therefore", "therein", "theres", "thereupon", "these", "they", "theyd", "theyll", "theyre", "theyve", "think", "third", "this", "thorough", "thoroughly", "those", "though", "three", "through", "throughout", "thru", "thus", "to", "together", "too", "took", "toward", "towards", "tried", "tries", "truly", "try", "trying", "twice", "two", "un", "under", "unfortunately", "unless", "unlikely", "until", "unto", "up", "upon", "us", "use", "used", "useful", "uses", "using", "usually", "value", "various", "very", "via", "viz", "vs", "want", "wants", "was", "wasnt", "way", "we", "wed", "well", "were", "weve", "welcome", "well", "went", "were", "werent", "what", "whats", "whatever", "when", "whence", "whenever", "where", "wheres", "whereafter", "whereas", "whereby", "wherein", "whereupon", "wherever", "whether", "which", "while", "whither", "who", "whos", "whoever", "whole", "whom", "whose", "why", "will", "willing", "wish", "with", "within", "without", "wont", "wonder", "would", "wouldnt", "yes", "yet", "you", "youd", "youll", "youre", "youve", "your", "yours", "yourself", "yourselves", "zero") 35 | 36 | // COMMAND ---------- 37 | 38 | // remove normalize to lowercase, remove stopwords 39 | val t1_cleaned_tokensRDD = t1_tokensRDD 40 | .map(p=>(p._1.replaceAll("\\W+", "").toLowerCase, p._2)) 41 | .filter(p => { p._1.length>1 && !(stop_words contains p._1) }) 42 | 43 | println(t1_cleaned_tokensRDD.take(15).mkString) 44 | val t2_cleaned_tokensRDD = t2_tokensRDD 45 | .map(p=>(p._1.replaceAll("\\W+", "").toLowerCase, p._2)) 46 | .filter(p => { p._1.length>1 && !(stop_words contains p._1) }) 47 | 48 | println(t2_cleaned_tokensRDD.take(15).mkString) 49 | 50 | // COMMAND ---------- 51 | 52 | val t1_stemmedRDD = t1_cleaned_tokensRDD.map(p=>(Stemmer.stem(p._1), p._2)) 53 | println(t1_stemmedRDD.take(5).mkString) 54 | val t2_stemmedRDD = t2_cleaned_tokensRDD.map(p=>(Stemmer.stem(p._1), p._2)) 55 | 56 | 57 | // COMMAND ---------- 58 | 59 | val t1_concRDD = t1_stemmedRDD.groupByKey() 60 | 
t1_concRDD.take(5).foreach(ex=> {
61 |       println("key " + ex._1 + " -- loci " + ex._2.mkString(","))
62 |     })
63 |     val t2_concRDD = t2_stemmedRDD.groupByKey()
64 | 
65 | 
66 |     // COMMAND ----------
67 |     val t1d1 = t1_concRDD.map(p => (p._1, p._2.toList)).toDF("entry", "loci")
68 |     val t1d2 = t1_concRDD.map(p => (p._1, p._2.toList)).toDF("entry2", "loci2")
69 | 
70 |     var t1j = t1d1.join(t1d2, t1d1("entry") < t1d2("entry2"))
71 | 
72 |     // UDF collecting the pairs of loci that fall within distance 7 of each other:
73 |     val bigrams = udf((l1: Seq[Long], l2: Seq[Long]) => {
74 |       for {
75 |         loc1 <- l1
76 |         loc2 <- l2
77 |         if (Math.abs(loc1-loc2) < 7)
78 |       } yield (loc1, loc2)
79 |     })
80 | 
81 |     // COMMAND ----------
82 | 
83 |     // UDF returning the length of an array column ("bg" holds an array of
84 |     // structs, which arrives in the UDF as a Seq of Rows):
85 |     val getLen = udf((e: Seq[Row]) => e.length)
86 | 
87 |     t1j = t1j.withColumn("bg", bigrams(t1j("loci"), t1j("loci2")))
88 | 
89 |     t1j = t1j.filter(getLen(t1j("bg"))>0)
90 |     t1j = t1j.select("entry", "entry2", "bg")
91 |     t1j.show()
92 | 
93 |     val t2d1 = t2_concRDD.map(p => (p._1, p._2.toList)).toDF("entry", "loci")
94 |     val t2d2 = t2_concRDD.map(p => (p._1, p._2.toList)).toDF("entry2", "loci2")
95 |     var t2j = t2d1.join(t2d2, t2d1("entry") < t2d2("entry2"))
96 |     t2j = t2j.withColumn("bg", bigrams(t2j("loci"), t2j("loci2")))
97 |     t2j = t2j.filter(getLen(t2j("bg"))>0)
98 |     t2j = t2j.select("entry", "entry2", "bg")
99 | 
100 |     // COMMAND ----------
101 | 
102 |     val t2rename = t2j.withColumnRenamed("entry","e").withColumnRenamed("entry2","e2").withColumnRenamed("bg", "bg2")
103 |     val joinedDF = t1j.join(t2rename, t1j("entry") === t2rename("e") && t1j("entry2") === t2rename("e2")).select("entry", "entry2", "bg", "bg2")
104 |     joinedDF.show()
105 | 
106 |     // COMMAND ----------
107 | 
108 |     joinedDF.explain
109 |   }
110 | }
111 | // COMMAND ----------

--------------------------------------------------------------------------------
/gutenrye/porter2.py:
--------------------------------------------------------------------------------
1 | # borrowed from https://pypi.python.org/pypi/stemming/1.0 (public domain)
2 | 
3 | """An implementation of the Porter2 stemming algorithm.
4 | See http://snowball.tartarus.org/algorithms/english/stemmer.html
5 | 
6 | Adapted from pyporter2 by Michael Dirolf.
7 | 
8 | This algorithm is more correct but (at least in this implementation)
9 | several times slower than the original porter algorithm as implemented
10 | in stemming.porter.
11 | """ 12 | 13 | import re 14 | 15 | r_exp = re.compile(r"[^aeiouy]*[aeiouy]+[^aeiouy](\w*)") 16 | ewss_exp1 = re.compile(r"^[aeiouy][^aeiouy]$") 17 | ewss_exp2 = re.compile(r".*[^aeiouy][aeiouy][^aeiouywxY]$") 18 | ccy_exp = re.compile(r"([aeiouy])y") 19 | s1a_exp = re.compile(r"[aeiouy].") 20 | s1b_exp = re.compile(r"[aeiouy]") 21 | 22 | def get_r1(word): 23 | # exceptional forms 24 | if word.startswith('gener') or word.startswith('arsen'): 25 | return 5 26 | if word.startswith('commun'): 27 | return 6 28 | 29 | # normal form 30 | match = r_exp.match(word) 31 | if match: 32 | return match.start(1) 33 | return len(word) 34 | 35 | def get_r2(word): 36 | match = r_exp.match(word, get_r1(word)) 37 | if match: 38 | return match.start(1) 39 | return len(word) 40 | 41 | def ends_with_short_syllable(word): 42 | if len(word) == 2: 43 | if ewss_exp1.match(word): 44 | return True 45 | if ewss_exp2.match(word): 46 | return True 47 | return False 48 | 49 | def is_short_word(word): 50 | if ends_with_short_syllable(word): 51 | if get_r1(word) == len(word): 52 | return True 53 | return False 54 | 55 | def remove_initial_apostrophe(word): 56 | if word.startswith("'"): 57 | return word[1:] 58 | return word 59 | 60 | def capitalize_consonant_ys(word): 61 | if word.startswith('y'): 62 | word = 'Y' + word[1:] 63 | return ccy_exp.sub('\g<1>Y', word) 64 | 65 | def step_0(word): 66 | if word.endswith("'s'"): 67 | return word[:-3] 68 | if word.endswith("'s"): 69 | return word[:-2] 70 | if word.endswith("'"): 71 | return word[:-1] 72 | return word 73 | 74 | def step_1a(word): 75 | if word.endswith('sses'): 76 | return word[:-4] + 'ss' 77 | if word.endswith('ied') or word.endswith('ies'): 78 | if len(word) > 4: 79 | return word[:-3] + 'i' 80 | else: 81 | return word[:-3] + 'ie' 82 | if word.endswith('us') or word.endswith('ss'): 83 | return word 84 | if word.endswith('s'): 85 | preceding = word[:-1] 86 | if s1a_exp.search(preceding): 87 | return preceding 88 | return word 89 | return word 90 | 91 | doubles = ('bb', 'dd', 'ff', 'gg', 'mm', 'nn', 'pp', 'rr', 'tt') 92 | def ends_with_double(word): 93 | for double in doubles: 94 | if word.endswith(double): 95 | return True 96 | return False 97 | def step_1b_helper(word): 98 | if word.endswith('at') or word.endswith('bl') or word.endswith('iz'): 99 | return word + 'e' 100 | if ends_with_double(word): 101 | return word[:-1] 102 | if is_short_word(word): 103 | return word + 'e' 104 | return word 105 | s1b_suffixes = ('ed', 'edly', 'ing', 'ingly') 106 | 107 | def step_1b(word, r1): 108 | if word.endswith('eedly'): 109 | if len(word) - 5 >= r1: 110 | return word[:-3] 111 | return word 112 | if word.endswith('eed'): 113 | if len(word) - 3 >= r1: 114 | return word[:-1] 115 | return word 116 | 117 | for suffix in s1b_suffixes: 118 | if word.endswith(suffix): 119 | preceding = word[:-len(suffix)] 120 | if s1b_exp.search(preceding): 121 | return step_1b_helper(preceding) 122 | return word 123 | 124 | return word 125 | 126 | def step_1c(word): 127 | if word.endswith('y') or word.endswith('Y') and len(word) > 1: 128 | if word[-2] not in 'aeiouy': 129 | if len(word) > 2: 130 | return word[:-1] + 'i' 131 | return word 132 | 133 | def step_2_helper(word, r1, end, repl, prev): 134 | if word.endswith(end): 135 | if len(word) - len(end) >= r1: 136 | if prev == []: 137 | return word[:-len(end)] + repl 138 | for p in prev: 139 | if word[:-len(end)].endswith(p): 140 | return word[:-len(end)] + repl 141 | return word 142 | return None 143 | s2_triples = (('ization', 'ize', []), 144 | 
('ational', 'ate', []), 145 | ('fulness', 'ful', []), 146 | ('ousness', 'ous', []), 147 | ('iveness', 'ive', []), 148 | ('tional', 'tion', []), 149 | ('biliti', 'ble', []), 150 | ('lessli', 'less', []), 151 | ('entli', 'ent', []), 152 | ('ation', 'ate', []), 153 | ('alism', 'al', []), 154 | ('aliti', 'al', []), 155 | ('ousli', 'ous', []), 156 | ('iviti', 'ive', []), 157 | ('fulli', 'ful', []), 158 | ('enci', 'ence', []), 159 | ('anci', 'ance', []), 160 | ('abli', 'able', []), 161 | ('izer', 'ize', []), 162 | ('ator', 'ate', []), 163 | ('alli', 'al', []), 164 | ('bli', 'ble', []), 165 | ('ogi', 'og', ['l']), 166 | ('li', '', ['c', 'd', 'e', 'g', 'h', 'k', 'm', 'n', 'r', 't'])) 167 | 168 | def step_2(word, r1): 169 | for trip in s2_triples: 170 | attempt = step_2_helper(word, r1, trip[0], trip[1], trip[2]) 171 | if attempt: 172 | return attempt 173 | return word 174 | 175 | def step_3_helper(word, r1, r2, end, repl, r2_necessary): 176 | if word.endswith(end): 177 | if len(word) - len(end) >= r1: 178 | if not r2_necessary: 179 | return word[:-len(end)] + repl 180 | else: 181 | if len(word) - len(end) >= r2: 182 | return word[:-len(end)] + repl 183 | return word 184 | return None 185 | s3_triples = (('ational', 'ate', False), 186 | ('tional', 'tion', False), 187 | ('alize', 'al', False), 188 | ('icate', 'ic', False), 189 | ('iciti', 'ic', False), 190 | ('ative', '', True), 191 | ('ical', 'ic', False), 192 | ('ness', '', False), 193 | ('ful', '', False)) 194 | def step_3(word, r1, r2): 195 | for trip in s3_triples: 196 | attempt = step_3_helper(word, r1, r2, trip[0], trip[1], trip[2]) 197 | if attempt: 198 | return attempt 199 | return word 200 | 201 | s4_delete_list = ('al', 'ance', 'ence', 'er', 'ic', 'able', 'ible', 'ant', 'ement', 202 | 'ment', 'ent', 'ism', 'ate', 'iti', 'ous', 'ive', 'ize') 203 | 204 | def step_4(word, r2): 205 | for end in s4_delete_list: 206 | if word.endswith(end): 207 | if len(word) - len(end) >= r2: 208 | return word[:-len(end)] 209 | return word 210 | 211 | if word.endswith('sion') or word.endswith('tion'): 212 | if len(word) - 3 >= r2: 213 | return word[:-3] 214 | 215 | return word 216 | 217 | def step_5(word, r1, r2): 218 | if word.endswith('l'): 219 | if len(word) - 1 >= r2 and word[-2] == 'l': 220 | return word[:-1] 221 | return word 222 | 223 | if word.endswith('e'): 224 | if len(word) - 1 >= r2: 225 | return word[:-1] 226 | if len(word) - 1 >= r1 and not ends_with_short_syllable(word[:-1]): 227 | return word[:-1] 228 | 229 | return word 230 | 231 | def normalize_ys(word): 232 | return word.replace('Y', 'y') 233 | 234 | exceptional_forms = {'skis': 'ski', 235 | 'skies': 'sky', 236 | 'dying': 'die', 237 | 'lying': 'lie', 238 | 'tying': 'tie', 239 | 'idly': 'idl', 240 | 'gently': 'gentl', 241 | 'ugly': 'ugli', 242 | 'early': 'earli', 243 | 'only': 'onli', 244 | 'singly': 'singl', 245 | 'sky': 'sky', 246 | 'news': 'news', 247 | 'howe': 'howe', 248 | 'atlas': 'atlas', 249 | 'cosmos': 'cosmos', 250 | 'bias': 'bias', 251 | 'andes': 'andes'} 252 | 253 | exceptional_early_exit_post_1a = frozenset(['inning', 'outing', 'canning', 'herring', 254 | 'earring', 'proceed', 'exceed', 'succeed']) 255 | 256 | 257 | def stem(word): 258 | if len(word) <= 2: 259 | return word 260 | word = remove_initial_apostrophe(word) 261 | 262 | # handle some exceptional forms 263 | if word in exceptional_forms: 264 | return exceptional_forms[word] 265 | 266 | word = capitalize_consonant_ys(word) 267 | r1 = get_r1(word) 268 | r2 = get_r2(word) 269 | word = step_0(word) 270 | word = step_1a(word) 
271 | 
272 |     # handle some more exceptional forms
273 |     if word in exceptional_early_exit_post_1a:
274 |         return word
275 | 
276 |     word = step_1b(word, r1)
277 |     word = step_1c(word)
278 |     word = step_2(word, r1)
279 |     word = step_3(word, r1, r2)
280 |     word = step_4(word, r2)
281 |     word = step_5(word, r1, r2)
282 |     word = normalize_ys(word)
283 | 
284 |     return word
285 | 
286 | if __name__ == "__main__":
287 |     assert stem("bill's") == "bill"
288 |     assert stem("y's") == "y"
289 | 
290 | 

--------------------------------------------------------------------------------
/gutenrye/scala-proj/src/main/scala/Stemmer.scala:
--------------------------------------------------------------------------------
1 | /**
2 |  * Scala Porter Stemmer Implementation
3 |  *
4 |  */
5 | object Stemmer extends Serializable {
6 |   def stem (str: String): String = {
7 |     // only vet strings longer than 3 characters
8 |     if (str.length() > 3) {
9 |       // all characters must be letters
10 |       for (ch <- str.toList) {
11 |         if (!Character.isLetter(ch)) {
12 |           return str.toLowerCase()
13 |         }
14 |       }
15 |     }
16 |     // apply the five Porter steps in sequence:
17 |     step_5(step_4(step_3(step_2(step_1(str))))).toLowerCase
18 |   }
19 | 
20 |   def step_1(str: String): String = step_1_c(step_1_b(step_1_a(str)))
21 | 
22 |   /*
23 |    * Step 1a
24 |    * SSES -> SS caresses -> caress
25 |    * IES -> I ponies -> poni
26 |    * ties -> ti
27 |    * SS -> SS caress -> caress
28 |    * S -> cats -> cat
29 |    */
30 |   def step_1_a(str: String): String = replacePatterns(str, List( ("sses", "ss"), ("ies", "i"), ("ss", "ss"), ("s", "")), _>=0)
31 | 
32 |   /*
33 |    * Step 1b
34 |    * (m>0) EED -> EE feed -> feed
35 |    * agreed -> agree
36 |    * (*v*) ED -> plastered -> plaster
37 |    * bled -> bled
38 |    * (*v*) ING -> motoring -> motor
39 |    * sing -> sing
40 |    */
41 |   def step_1_b (str: String): String = {
42 |     // (m > 0) EED -> EE
43 |     if (str.endsWith("eed")) {
44 |       if (stringMeasure(str.substring(0, str.length - 3)) > 0)
45 |         return str.substring(0, str.length() - 1)
46 |     // (*v*) ED ->
47 |     } else if ((str.endsWith("ed")) &&
48 |         (containsVowel(str.substring(0, str.length - 2)))) {
49 |       return step_1_b_2(str.substring(0, str.length - 2))
50 |     // (*v*) ING ->
51 |     } else if ((str.endsWith("ing")) &&
52 |         (containsVowel(str.substring(0, str.length - 3)))) {
53 |       return step_1_b_2(str.substring(0, str.length - 3))
54 |     } // end if
55 |     str
56 |   } // end step1b
57 | 
58 |   /*
59 |    * If the second or third of the rules in Step 1b is successful, the following is done:
60 |    * AT -> ATE conflat(ed) -> conflate
61 |    * BL -> BLE troubl(ed) -> trouble
62 |    * IZ -> IZE siz(ed) -> size
63 |    *
64 |    * (*d and not (*L or *S or *Z)) -> single letter
65 |    * hopp(ing) -> hop
66 |    * tann(ed) -> tan
67 |    * fall(ing) -> fall
68 |    * hiss(ing) -> hiss
69 |    * fizz(ed) -> fizz
70 |    *
71 |    * (m=1 and *o) -> E fail(ing) -> fail
72 |    * fil(ing) -> file
73 |    */
74 |   def step_1_b_2 (str: String): String = {
75 | 
76 |     if (str.endsWith("at") ||
77 |         str.endsWith("bl") ||
78 |         str.endsWith("iz")) {
79 |       return str + "e"
80 |     }
81 |     else if ((str.length() > 1) && (endsWithDoubleConsonent(str)) &&
82 |         (!(str.endsWith("l") || str.endsWith("s") || str.endsWith("z")))) {
83 |       return str.substring(0, str.length() - 1)
84 |     }
85 |     else if ((stringMeasure(str) == 1) &&
86 |         (endsWithCVC(str))) {
87 |       return str + "e"
88 |     }
89 |     str
90 |   }
91 | 
92 |   /*
93 |    * (*v*) Y -> I happy -> happi
94 |    * sky -> sky
95 |    */
96 |   def step_1_c(str: String): String = {
97 |     if (str.endsWith("y") && containsVowel(str.substring(0, str.length() - 1)))
98 |       return str.substring(0,
str.length() - 1) + "i" 99 | str 100 | } // end step1c 101 | 102 | /* 103 | * (m>0) ATIONAL -> ATE relational -> relate 104 | * (m>0) TIONAL -> TION conditional -> condition 105 | * rational -> rational 106 | * (m>0) ENCI -> ENCE valenci -> valence 107 | * (m>0) ANCI -> ANCE hesitanci -> hesitance 108 | * (m>0) IZER -> IZE digitizer -> digitize 109 | * (m>0) ABLI -> ABLE conformabli -> conformable 110 | * (m>0) ALLI -> AL radicalli -> radical 111 | * (m>0) ENTLI -> ENT differentli -> different 112 | * (m>0) ELI -> E vileli - > vile 113 | * (m>0) OUSLI -> OUS analogousli -> analogous 114 | * (m>0) IZATION -> IZE vietnamization -> vietnamize 115 | * (m>0) ATION -> ATE predication -> predicate 116 | * (m>0) ATOR -> ATE operator -> operate 117 | * (m>0) ALISM -> AL feudalism -> feudal 118 | * (m>0) IVENESS -> IVE decisiveness -> decisive 119 | * (m>0) FULNESS -> FUL hopefulness -> hopeful 120 | * (m>0) OUSNESS -> OUS callousness -> callous 121 | * (m>0) ALITI -> AL formaliti -> formal 122 | * (m>0) IVITI -> IVE sensitiviti -> sensitive 123 | * (m>0) BILITI -> BLE sensibiliti -> sensible 124 | */ 125 | def step_2 (str: String): String = replacePatterns(str, List( ("ational", "ate"), ("tional","tion"), ("enci","ence"), ("anci","ance"), 126 | ("izer","ize"), ("bli","ble"), ("alli", "al"), ("entli","ent"),("eli","e"), 127 | ("ousli","ous"), ("ization","ize"), ("ation","ate"), ("ator","ate"), ("alism","al"), 128 | ("iveness","ive"), ("fulness","ful"), ("ousness", "ous"), ("aliti", "al"), ("iviti","ive"), 129 | ("biliti", "ble"), ("logi", "log"))) 130 | 131 | /* 132 | * (m>0) ICATE -> IC triplicate -> triplic 133 | * (m>0) ATIVE -> formative -> form 134 | * (m>0) ALIZE -> AL formalize -> formal 135 | * (m>0) ICITI -> IC electriciti -> electric 136 | * (m>0) ICAL -> IC electrical -> electric 137 | * (m>0) FUL -> hopeful -> hope 138 | * (m>0) NESS -> goodness -> good 139 | */ 140 | def step_3 (str: String): String = replacePatterns(str, List( ("icate", "ic"),("ative",""),("alize","al"),("iciti","ic"),("ical","ic"),("ful",""),("ness",""))) 141 | 142 | /* 143 | * (m>1) AL -> revival -> reviv 144 | * (m>1) ANCE -> allowance -> allow 145 | * (m>1) ENCE -> inference -> infer 146 | * (m>1) ER -> airliner -> airlin 147 | * (m>1) IC -> gyroscopic -> gyroscop 148 | * (m>1) ABLE -> adjustable -> adjust 149 | * (m>1) IBLE -> defensible -> defens 150 | * (m>1) ANT -> irritant -> irrit 151 | * (m>1) EMENT -> replacement -> replac 152 | * (m>1) MENT -> adjustment -> adjust 153 | * (m>1) ENT -> dependent -> depend 154 | * (m>1 and (*S or *T)) ION -> adoption -> adopt 155 | * (m>1) OU -> homologou -> homolog 156 | * (m>1) ISM -> communism -> commun 157 | * (m>1) ATE -> activate -> activ 158 | * (m>1) ITI -> angulariti -> angular 159 | * (m>1) OUS -> homologous -> homolog 160 | * (m>1) IVE -> effective -> effect 161 | * (m>1) IZE -> bowdlerize -> bowdler 162 | */ 163 | def step_4 (str: String): String = { 164 | val res: String = replacePatterns(str, List( ("al",""),("ance",""),("ence",""),("er",""),("ic",""),("able",""),("ible",""),("ant",""),("ement",""), 165 | ("ment",""),("ent",""),("ou", ""),("ism",""),("ate",""),("iti",""),("ous",""), 166 | ("ive",""),("ize","")), _>1) 167 | if (str == res) { 168 | if ((str.endsWith("sion") || str.endsWith("tion")) && stringMeasure(str.substring(0, str.length() - 3)) > 1) 169 | return str.substring(0, str.length() - 3) 170 | else 171 | return str 172 | } 173 | else { 174 | return res 175 | } 176 | } 177 | 178 | def step_5 (str: String): String = step_5_b(step_5_a(str)) 179 | 180 
  /*
   * (m>1) E ->                         probate  ->  probat
   *                                    rate     ->  rate
   * (m=1 and not *o) E ->              cease    ->  ceas
   */
  def step_5_a(str: String): String = {
    // (m > 1) E ->
    if ((stringMeasure(str.substring(0, str.length() - 1)) > 1) &&
        str.endsWith("e"))
      return str.substring(0, str.length() - 1)
    // (m = 1 and not *o) E ->
    else if ((stringMeasure(str.substring(0, str.length() - 1)) == 1) &&
             (!endsWithCVC(str.substring(0, str.length() - 1))) &&
             (str.endsWith("e")))
      return str.substring(0, str.length() - 1)
    else
      return str
  } // end step5a

  /*
   * (m > 1 and *d and *L) -> single letter
   *                                    controll  ->  control
   *                                    roll      ->  roll
   */
  def step_5_b(str: String): String = {
    // (m > 1 and *d and *L) ->
    if (str.endsWith("l") &&
        endsWithDoubleConsonent(str) &&
        (stringMeasure(str.substring(0, str.length() - 1)) > 1)) {
      str.substring(0, str.length() - 1)
    } else {
      str
    }
  } // end step5b

  // does the string contain a vowel?
  def containsVowel(str: String): Boolean = {
    for (ch <- str.toList) {
      if (isVowel(ch))
        return true
    }
    // no a, e, i, o or u, but a 'y' also counts
    str.indexOf('y') > -1
  } // end function

  // is the char one of a, e, i, o, u?
  def isVowel(c: Char): Boolean = "aeiou".indexOf(c) > -1

  /*
   * Positional vowel check: 'y' may act as either vowel or consonant
   * depending on its neighbours; here it counts as a vowel only when
   * flanked by two non-vowels.
   */
  def isVowel(str: String, i: Int): Boolean =
    isVowel(str(i)) ||
      (str(i) == 'y' && i > 0 && i + 1 < str.length && !isVowel(str(i - 1)) && !isVowel(str(i + 1)))

  // the Porter measure m: the number of vowel-run -> consonant transitions
  def stringMeasure(str: String): Int = {
    var count = 0
    var vowelSeen: Boolean = false

    for (i <- 0 until str.length) {
      if (isVowel(str, i)) {
        vowelSeen = true
      } else if (vowelSeen) {
        count += 1
        vowelSeen = false
      }
    }
    count
  } // end function

  // does the stem end consonant-vowel-consonant, where the final consonant
  // is not w, x or y? (the "*o" condition)
  def endsWithCVC(str: String): Boolean = {
    if (str.length() >= 3) {
      // the last three characters, in reverse order
      val cvc = (str(str.length - 1), str(str.length - 2), str(str.length - 3))
      val cvc_str = cvc._1.toString + cvc._2 + cvc._3

      if ((cvc._1 == 'w') || (cvc._1 == 'x') || (cvc._1 == 'y'))
        false
      else if (!isVowel(cvc._1) && isVowel(cvc_str, 1) && !isVowel(cvc._3))
        true
      else
        false
    }
    else
      false
  } // end function
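  // Hand-checked values for the helper predicates above (illustrative):
  //   stringMeasure("trouble")        == 1     // tr-ou-ble: one vowel run followed by a consonant
  //   stringMeasure("oaten")          == 2     // Porter's own m=2 example
  //   endsWithCVC("fil")              == true  // so fil(ing) gains an "e" in step 1b
  //   endsWithDoubleConsonent("fizz") == true  // but step_1_b_2 leaves *z words alone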
  // does the string end with a double consonant?
  def endsWithDoubleConsonent(str: String): Boolean = {
    if (str.length() < 2)
      return false
    val c: Char = str.charAt(str.length() - 1)
    if (c == str.charAt(str.length() - 2))
      if (!containsVowel(str.substring(str.length() - 2))) {
        return true
      }
    false
  } // end function

  // default comparer: the remaining stem must have measure m > 0
  def replacePatterns(str: String, patterns: List[(String, String)]): String = replacePatterns(str, patterns, _ > 0)

  // replace the last occurrence of pattern in str with replacement
  def replaceLast(str: String, pattern: String, replacement: String): String =
    new StringBuilder(str).replace(str.lastIndexOf(pattern), str.lastIndexOf(pattern) + pattern.length, replacement).toString

  /*
   * Apply the first pattern in list order whose suffix matches, but only if
   * the measure of the stem left after removing the suffix satisfies the
   * comparer; a matching suffix that fails the measure test ends the search
   * and the string is returned unchanged.
   */
  def replacePatterns(str: String, patterns: List[(String, String)], comparer: Int => Boolean): String = {
    for (pattern <- patterns)
      if (str.endsWith(pattern._1)) {
        val res = replaceLast(str, pattern._1, pattern._2)
        if (comparer(stringMeasure(replaceLast(str, pattern._1, ""))))
          return res
        else
          return str
      }
    str
  }

}
--------------------------------------------------------------------------------
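A usage sketch, not part of the original repo: Stemmer.stem is a pure String => String function on a Serializable object, so it can be applied token-by-token inside a Spark transformation exactly like a plain local call. The StemmerDemo object below is hypothetical and assumes only the Stemmer object above.

    object StemmerDemo {
      def main(args: Array[String]): Unit = {
        val tokens = List("caresses", "motoring", "relational", "adoption", "controlled")
        // the same map could run inside an RDD transformation, since Stemmer serializes cleanly
        tokens.map(t => s"$t -> ${Stemmer.stem(t)}").foreach(println)
      }
    }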