├── gutenrye
│   ├── gut.sh
│   ├── scala-proj
│   │   ├── rye.sbt
│   │   └── src
│   │       └── main
│   │           └── scala
│   │               ├── Rye.scala
│   │               ├── DFRye.scala
│   │               └── Stemmer.scala
│   ├── launch-gut-scala.sh
│   ├── gutdf.py
│   ├── output-sample.rtf
│   ├── gut2.py
│   └── porter2.py
├── README.md
└── LICENSE
/gutenrye/gut.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | # you'll need to swap in your own paths as appropriate:
4 |
5 | /Users/adam/Applications/spark-1.4.0-bin-hadoop2.6/bin/spark-submit /Users/adam/Documents/rye/gutenrye/gut2.py
6 |
7 |
8 |
--------------------------------------------------------------------------------
/gutenrye/scala-proj/rye.sbt:
--------------------------------------------------------------------------------
1 | name := "Rye"
2 |
3 | version := "1.0"
4 |
5 | scalaVersion := "2.10.4"
6 |
7 | libraryDependencies += "org.apache.spark" %% "spark-core" % "1.4.0"
8 |
9 | libraryDependencies += "org.apache.spark" %% "spark-sql" % "1.4.0"
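10 | 
11 | // build with `sbt package`; this produces target/scala-2.10/rye_2.10-1.0.jar,
12 | // the jar that launch-gut-scala.sh submits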
--------------------------------------------------------------------------------
/gutenrye/launch-gut-scala.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | # you'll need to swap in your own paths as appropriate:
4 |
5 | /Users/adam/Applications/spark-1.4.0-bin-hadoop2.6/bin/spark-submit --master local[*] --class "DFRye" /Users/adam/Documents/rye/gutenrye/scala-proj/target/scala-2.10/rye_2.10-1.0.jar
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # rye
2 |
3 | An experimental Apache Spark implementation of a subset of the functionality of http://tesserae.caset.buffalo.edu/
4 |
5 | ### Some notes
6 |
7 | - The stemmer is public domain and grabbed from a common Python lib
8 | - Stop words are an arbitrary list from the Internet
9 |
10 | ### Logical ToDos
11 |
12 | - Get a better, multilingual stemmer
13 | - Better stop word list
14 | - Add filtering of "stop bigrams"
15 | + Real Tesserae has a scoring engine
16 | + Bigrams common to a genre/language should probably be scored 0 and filtered out for performance
17 |
18 | ### Plans
19 |
20 | - Reimplement parts of the code using the Spark DataFrames/SQL API
21 | + Hoping to squeeze a little free performance out of the Catalyst optimizer
22 | + Not using a Python lambda in the "big filter" (i.e., the distance filter after the Cartesian join) might help perf; a sketch follows below
23 |
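24 | As a sketch of that last idea (hypothetical names; `t1` stands in for a DataFrame of (entry, locus) rows like the ones built in `gutdf.py`):
25 | 
26 | ```python
27 | # keep the distance test in Column expressions so Catalyst sees it,
28 | # rather than shipping each row to a Python lambda:
29 | from pyspark.sql import functions as F
30 | pairs = t1.alias("a").join(t1.alias("b"), F.col("a.entry") < F.col("b.entry"))
31 | pairs = pairs.filter(F.abs(F.col("a.locus") - F.col("b.locus")) < 7)
32 | ```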
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2015
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
23 |
--------------------------------------------------------------------------------
/gutenrye/gutdf.py:
--------------------------------------------------------------------------------
1 | from pyspark import SparkContext
2 | from pyspark.sql import HiveContext
3 | from porter2 import stem
4 | import urllib2
5 | import re
6 |
7 | sc = SparkContext()
8 | sqlContext = HiveContext(sc)
9 |
10 | stop_words = ['a', 'i', 'an', 'as', 'able', 'about', 'above', 'according', 'accordingly', 'across', 'actually', 'after', 'afterwards', 'again', 'against', 'aint', 'all', 'allow', 'allows', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'am', 'among', 'amongst', 'an', 'and', 'another', 'any', 'anybody', 'anyhow', 'anyone', 'anything', 'anyway', 'anyways', 'anywhere', 'apart', 'appear', 'appreciate', 'appropriate', 'are', 'arent', 'around', 'as', 'aside', 'ask', 'asking', 'associated', 'at', 'available', 'away', 'awfully', 'be', 'became', 'because', 'become', 'becomes', 'becoming', 'been', 'before', 'beforehand', 'behind', 'being', 'believe', 'below', 'beside', 'besides', 'best', 'better', 'between', 'beyond', 'both', 'brief', 'but', 'by', 'cmon', 'cs', 'came', 'can', 'cant', 'cannot', 'cant', 'cause', 'causes', 'certain', 'certainly', 'changes', 'clearly', 'co', 'com', 'come', 'comes', 'concerning', 'consequently', 'consider', 'considering', 'contain', 'containing', 'contains', 'corresponding', 'could', 'couldnt', 'course', 'currently', 'definitely', 'described', 'despite', 'did', 'didnt', 'different', 'do', 'does', 'doesnt', 'doing', 'dont', 'done', 'down', 'downwards', 'during', 'each', 'edu', 'eg', 'eight', 'either', 'else', 'elsewhere', 'enough', 'entirely', 'especially', 'et', 'etc', 'even', 'ever', 'every', 'everybody', 'everyone', 'everything', 'everywhere', 'ex', 'exactly', 'example', 'except', 'far', 'few', 'fifth', 'first', 'five', 'followed', 'following', 'follows', 'for', 'former', 'formerly', 'forth', 'four', 'from', 'further', 'furthermore', 'get', 'gets', 'getting', 'given', 'gives', 'go', 'goes', 'going', 'gone', 'got', 'gotten', 'greetings', 'had', 'hadnt', 'happens', 'hardly', 'has', 'hasnt', 'have', 'havent', 'having', 'he', 'hes', 'hello', 'help', 'hence', 'her', 'here', 'heres', 'hereafter', 'hereby', 'herein', 'hereupon', 'hers', 'herself', 'hi', 'him', 'himself', 'his', 'hither', 'hopefully', 'how', 'howbeit', 'however', 'id', 'ill', 'im', 'ive', 'ie', 'if', 'ignored', 'immediate', 'in', 'inasmuch', 'inc', 'indeed', 'indicate', 'indicated', 'indicates', 'inner', 'insofar', 'instead', 'into', 'inward', 'is', 'isnt', 'it', 'itd', 'itll', 'its', 'its', 'itself', 'just', 'keep', 'keeps', 'kept', 'know', 'known', 'knows', 'last', 'lately', 'later', 'latter', 'latterly', 'least', 'less', 'lest', 'let', 'lets', 'like', 'liked', 'likely', 'little', 'look', 'looking', 'looks', 'ltd', 'mainly', 'many', 'may', 'maybe', 'me', 'mean', 'meanwhile', 'merely', 'might', 'more', 'moreover', 'most', 'mostly', 'much', 'must', 'my', 'myself', 'name', 'namely', 'nd', 'near', 'nearly', 'necessary', 'need', 'needs', 'neither', 'never', 'nevertheless', 'new', 'next', 'nine', 'no', 'nobody', 'non', 'none', 'noone', 'nor', 'normally', 'not', 'nothing', 'novel', 'now', 'nowhere', 'obviously', 'of', 'off', 'often', 'oh', 'ok', 'okay', 'old', 'on', 'once', 'one', 'ones', 'only', 'onto', 'or', 'other', 'others', 'otherwise', 'ought', 'our', 'ours', 'ourselves', 'out', 'outside', 'over', 'overall', 'own', 'particular', 'particularly', 'per', 'perhaps', 'placed', 'please', 'plus', 'possible', 'presumably', 'probably', 'provides', 'que', 'quite', 'qv', 'rather', 'rd', 're', 'really', 'reasonably', 'regarding', 'regardless', 'regards', 'relatively', 'respectively', 'right', 'said', 'same', 'saw', 'say', 'saying', 'says', 'second', 'secondly', 'see', 'seeing', 'seem', 'seemed', 'seeming', 'seems', 'seen', 'self', 'selves', 'sensible', 'sent', 'serious', 'seriously', 'seven', 
'several', 'shall', 'she', 'should', 'shouldnt', 'since', 'six', 'so', 'some', 'somebody', 'somehow', 'someone', 'something', 'sometime', 'sometimes', 'somewhat', 'somewhere', 'soon', 'sorry', 'specified', 'specify', 'specifying', 'still', 'sub', 'such', 'sup', 'sure', 'ts', 'take', 'taken', 'tell', 'tends', 'th', 'than', 'thank', 'thanks', 'thanx', 'that', 'thats', 'thats', 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'thence', 'there', 'theres', 'thereafter', 'thereby', 'therefore', 'therein', 'theres', 'thereupon', 'these', 'they', 'theyd', 'theyll', 'theyre', 'theyve', 'think', 'third', 'this', 'thorough', 'thoroughly', 'those', 'though', 'three', 'through', 'throughout', 'thru', 'thus', 'to', 'together', 'too', 'took', 'toward', 'towards', 'tried', 'tries', 'truly', 'try', 'trying', 'twice', 'two', 'un', 'under', 'unfortunately', 'unless', 'unlikely', 'until', 'unto', 'up', 'upon', 'us', 'use', 'used', 'useful', 'uses', 'using', 'usually', 'value', 'various', 'very', 'via', 'viz', 'vs', 'want', 'wants', 'was', 'wasnt', 'way', 'we', 'wed', 'well', 'were', 'weve', 'welcome', 'well', 'went', 'were', 'werent', 'what', 'whats', 'whatever', 'when', 'whence', 'whenever', 'where', 'wheres', 'whereafter', 'whereas', 'whereby', 'wherein', 'whereupon', 'wherever', 'whether', 'which', 'while', 'whither', 'who', 'whos', 'whoever', 'whole', 'whom', 'whose', 'why', 'will', 'willing', 'wish', 'with', 'within', 'without', 'wont', 'wonder', 'would', 'wouldnt', 'yes', 'yet', 'you', 'youd', 'youll', 'youre', 'youve', 'your', 'yours', 'yourself', 'yourselves', 'zero']
11 |
12 | text_urls = ['https://dl.dropboxusercontent.com/u/105876471/tth.txt', 'https://dl.dropboxusercontent.com/u/105876471/coc.txt']
13 | text1_url = text_urls[0]
14 | text2_url = text_urls[1]
15 |
16 | # Load from web:
17 | def wgetAndTokenize(url):
18 | response = urllib2.urlopen(url)
19 | data = response.read()
20 | return re.split(r'\s+', data)
21 |
22 | text1_tokens = wgetAndTokenize(text1_url)
23 | text2_tokens = wgetAndTokenize(text2_url)
24 |
25 | # make RDD with list of words along with their position in the original text (so we can find context later)
26 | text1_tokensRDD = sc.parallelize(text1_tokens).zipWithIndex()
27 | text2_tokensRDD = sc.parallelize(text2_tokens).zipWithIndex()
28 | #print text1_tokensRDD.take(5)
29 |
30 | # get rid of sequences of non-word chars, keep remaining strings with something in them, and not in stop list:
31 | text1_tokensRDD = text1_tokensRDD.map(lambda p:(re.sub('\W+', '', p[0]).lower(), p[1])).filter(lambda p:len(p[0])>0 and not p[0] in stop_words)
32 | print text1_tokensRDD.take(5)
33 | text2_tokensRDD = text2_tokensRDD.map(lambda p:(re.sub('\W+', '', p[0]).lower(), p[1])).filter(lambda p:len(p[0])>0 and not p[0] in stop_words)
34 |
35 | # stem the words using imported stem function (chosen arbitrarily)
36 | text1_stemmedRDD = text1_tokensRDD.map(lambda p:(stem(p[0]), p[1]))
37 | print text1_stemmedRDD.take(5)
38 | text2_stemmedRDD = text2_tokensRDD.map(lambda p:(stem(p[0]), p[1]))
39 |
40 | t1raw = text1_stemmedRDD.toDF(['entry', 'locus'])
41 | t1raw.show()
42 |
43 | t2raw = text2_stemmedRDD.toDF(['entry', 'locus'])
44 |
45 | t1raw.registerTempTable("t1raw")
46 | t2raw.registerTempTable("t2raw")
47 |
48 | bg1 = sqlContext.sql("select a.entry a1, b.entry b1, a.locus la1, b.locus lb1 from t1raw a cross join t1raw b where a.entry < b.entry and a.locus - b.locus < 7 and b.locus - a.locus < 7")
49 | bg1.show(4)
50 |
51 | bg2 = sqlContext.sql("select a.entry a2, b.entry b2, a.locus la2, b.locus lb2 from t2raw a cross join t2raw b where a.entry < b.entry and a.locus - b.locus < 7 and b.locus - a.locus < 7")
52 |
53 | bg2.show(4)
54 |
55 | bg = bg1.join(bg2, ((bg1.a1 == bg2.a2) & (bg1.b1 == bg2.b2)))
56 | bg.show(100)
57 |
58 |
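59 | # a possible sanity check (sketch, not run here): print the physical plan to
60 | # see how the Catalyst optimizer handled the self-joins:
61 | #   bg.explain()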
--------------------------------------------------------------------------------
/gutenrye/output-sample.rtf:
--------------------------------------------------------------------------------
1 | {\rtf1\ansi\ansicpg1252\cocoartf1348\cocoasubrtf170
2 | {\fonttbl\f0\fmodern\fcharset0 CourierNewPSMT;}
3 | {\colortbl;\red255\green255\blue255;}
4 | \margl1440\margr1440\vieww17260\viewh9800\viewkind0
5 | \pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural
6 |
7 | \f0\fs20 \cf0 \CocoaLigature0 ('earth', 'thing')\
8 | text 1 loci\
9 | (57, 61): heaven and in the earth. I heard many things in hell. How,\
10 | text 2 loci\
11 | (5572, 5570): alone among the conscious things of earth, for shapes came\
12 | (5799, 5795): been aeons when other Things ruled on the earth, and They had\
13 | (11120, 11123): that was not of earth the titan Thing from the stars\
14 | \
15 | ('acut', 'thing')\
16 | text 1 loci\
17 | (46, 50): the sense of hearing acute. I heard all things in the heaven\
18 | text 2 loci\
19 | (2528, 2534): of the dreamers confessed acute fear of the gigantic nameless thing visible toward the\
20 | \
21 | ('dead', 'stone')\
22 | text 1 loci\
23 | (1435, 1433): corpse. Yes, he was stone, stone dead. I placed my\
24 | (1435, 1434): Yes, he was stone, stone dead. I placed my\
25 | (1456, 1455): no pulsation. He was stone dead. His eve would\
26 | text 2 loci\
27 | (7253, 7258): ritual which told of dead Cthulhu's dream-vigil in his stone vault at R'lyeh,\
28 | \
29 | ('examin', 'remov')\
30 | text 1 loci\
31 | (1427, 1423): man was dead. I removed the bed and examined the corpse. Yes,\
32 | text 2 loci\
33 | (5307, 5301): of course, was carefully removed and carried back by Legrasse. Examined at headquarters after\
34 | \
35 | ('felt', 'night')\
36 | text 1 loci\
37 | (544, 541): mine. Never before that night had I felt the extent of\
38 | text 2 loci\
39 | (1501, 1496): slight earthquake tremor the night before, the most considerable felt in New England\
40 | \
41 | ('men', 'nois')\
42 | text 1 loci\
43 | (2004, 2007): the observations of the men --but the noise steadily increased. Oh\
44 | text 2 loci\
45 | (4733, 4729): do justice to the noises heard by Legrasse's men as they ploughed\
46 | \
47 | ('felt', 'thing')\
48 | text 1 loci\
49 | (1826, 1821): they chatted of familiar things. But, ere long, I felt myself getting pale\
50 | text 2 loci\
51 | (10284, 10287): Now an unlettered seaman felt the same thing whilst gazing at\
52 | \
53 | ('dream', 'thought')\
54 | text 1 loci\
55 | (579, 585): he not even to dream of my secret deeds or thoughts. I fairly chuckled\
56 | text 2 loci\
57 | (10018, 10012): after cycles incalculable, the thoughts that spread fear to the dreams of the sensitive\
58 | \
59 | ('black', 'dark')\
60 | text 1 loci\
61 | (621, 627): His room was as black as pitch with the thick darkness, (for the shutters\
62 | text 2 loci\
63 | (10682, 10685): upset. The aperture was black with a darkness almost material. That\
64 | \
65 | ('dream', 'secret')\
66 | text 1 loci\
67 | (579, 582): he not even to dream of my secret deeds or thoughts.\
68 | text 2 loci\
69 | (5454, 5452): bodies had told their secrets in dreams to the first\
70 | \
71 | ('face', 'man')\
72 | text 1 loci\
73 | (1103, 1102): else of the old man's face or person: for\
74 | text 2 loci\
75 | (4893, 4897): swamp water on the face of the fainting man, and all stood\
76 | \
77 | ('death', 'night')\
78 | text 1 loci\
79 | (740, 734): as I have done, night after night, hearkening to the death watches in the\
80 | (740, 736): have done, night after night, hearkening to the death watches in the\
81 | text 2 loci\
82 | (11178, 11182): laughing at intervals till death found him one night in the cabin\
83 | \
84 | ('found', 'night')\
85 | text 1 loci\
86 | (423, 417): seven long nights --every night just at midnight --but I found the eye always\
87 | text 2 loci\
88 | (11179, 11182): at intervals till death found him one night in the cabin\
89 | \
90 | ('heard', 'suspect')\
91 | text 1 loci\
92 | (2071, 2073): God! --no, no! They heard! --they suspected! --they knew! --they\
93 | text 2 loci\
94 | (6757, 6752): natural; though privately I suspected young Wilcox of having heard of the cult\
95 | \
96 | ('disturb', 'man')\
97 | text 1 loci\
98 | (329, 332): that I might not disturb the old man's sleep. It took\
99 | text 2 loci\
100 | (721, 725): responsible for this apparent disturbance of an old man's peace of mind.\
101 | \
102 | ('heard', 'length')\
103 | text 1 loci\
104 | (1409, 1414): it would not be heard through the wall. At length it ceased. The\
105 | text 2 loci\
106 | (10756, 10750): was intolerable, and at length the quick-eared Hawkins thought he heard a nasty, slopping\
107 | \
108 | ('end', 'made')\
109 | text 1 loci\
110 | (1591, 1589): ha! When I had made an end of these labors,\
111 | text 2 loci\
112 | (5246, 5242): fired, and escapes were made; but in the end Legrasse was able\
113 | \
114 | ('continu', 'feel')\
115 | text 1 loci\
116 | (1875, 1872): get rid of the feeling: but it continued and gained definiteness\
117 | text 2 loci\
118 | (2441, 2443): That is why I continued to feel that Wilcox, somehow\
119 | \
120 | ('beat', 'muffl')\
121 | text 1 loci\
122 | (1393, 1397): many minutes, the heart beat on with a muffled sound. This, however,\
123 | text 2 loci\
124 | (4483, 4482): of bobbing lanterns. The muffled beat of tom-toms was\
125 | \
126 | ('god', 'suspect')\
127 | text 1 loci\
128 | (2067, 2073): they heard not? Almighty God! --no, no! They heard! --they suspected! --they knew! --they\
129 | text 2 loci\
130 | (10044, 10042): this Johansen did not suspect, but God knows he soon\
131 | \
132 | ('bed', 'sudden')\
133 | text 1 loci\
134 | (602, 603): he moved on the bed suddenly, as if startled.\
135 | text 2 loci\
136 | (2058, 2052): trace of Wilcox's malady suddenly ceased. He sat upright in bed, astonished to find\
137 | \
138 | ('door', 'knock')\
139 | text 1 loci\
140 | (1616, 1612): hour, there came a knocking at the street door. I went down\
141 | text 2 loci\
142 | (9410, 9404): trip by taxicab, and knocked with palpitant heart at the door of a neat\
143 | \
144 | ('heard', 'sound')\
145 | text 1 loci\
146 | (1331, 1328): anxiety seized me --the sound would be heard by a neighbour!\
147 | text 2 loci\
148 | (10756, 10760): quick-eared Hawkins thought he heard a nasty, slopping sound down there. Everyone\
149 | \
150 | ('dream', 'man')\
151 | text 1 loci\
152 | (1708, 1711): my own in a dream. The old man, I mentioned, was\
153 | text 2 loci\
154 | (6693, 6690): of a sensitive young man who had dreamed not only the\
155 | \
156 | ('awak', 'lie')\
157 | text 1 loci\
158 | (853, 852): that he had been lying awake ever since the\
159 | text 2 loci\
160 | (6017, 6016): and They could only lie awake in the dark\
161 | \
162 | ('dead', 'man')\
163 | text 1 loci\
164 | (1421, 1419): it ceased. The old man was dead. I removed the\
165 | text 2 loci\
166 | (7967, 7968): Tow. One Survivor and Dead Man Found Aboard. Tale\
167 | (8045, 8046): one living and one dead man aboard. The Vigilant\
168 | (8102, 8097): half-delirious condition and one man who had evidently been dead for more than\
169 | \
170 | ('man', 'week')\
171 | text 1 loci\
172 | (243, 248): kinder to the old man than during the whole week before I killed\
173 | text 2 loci\
174 | (8110, 8107): for more than a week. The living man was clutching a\
175 | \
176 | ('excit', 'strang')\
177 | text 1 loci\
178 | (1293, 1288): that old house, so strange a noise as this excited me to uncontrollable\
179 | text 2 loci\
180 | (1253, 1257): and had from chidhood excited attention through the strange stories and odd\
181 | (7023, 7029): for my uncle had excited his curiosity in probing his strange dreams, yet had\
182 | \
183 | ('long', 'thing')\
184 | text 1 loci\
185 | (1824, 1821): they chatted of familiar things. But, ere long, I felt myself\
186 | text 2 loci\
187 | (3436, 3441): and fore feet, and long, narrow wings behind. This thing, which seemed instinct\
188 | (9234, 9240): and I studied it long and well, finding it a thing of balefully exquisite\
189 | \
190 | ('man', 'suspect')\
191 | text 1 loci\
192 | (496, 499): a very profound old man, indeed, to suspect that every night,\
193 | text 2 loci\
194 | (3718, 3723): problem, there was one man in that gathering who suspected a touch of}
--------------------------------------------------------------------------------
/gutenrye/scala-proj/src/main/scala/Rye.scala:
--------------------------------------------------------------------------------
1 | import org.apache.spark.SparkContext
2 | import org.apache.spark.SparkContext._
3 | import org.apache.spark.SparkConf
4 |
5 | object Rye {
6 | def main(args: Array[String]) {
7 | val conf = new SparkConf().setAppName("Rye")
8 | val sc = new SparkContext(conf)
9 | // Databricks notebook source exported at Thu, 6 Aug 2015 20:48:55 UTC
10 | val text_urls = Array("https://dl.dropboxusercontent.com/u/105876471/tth.txt", "https://dl.dropboxusercontent.com/u/105876471/coc.txt")
11 | //val text_urls = Array("https://www.gutenberg.org/cache/epub/77/pg77.txt", "https://www.gutenberg.org/ebooks/2701.txt.utf-8")
12 | val t1_url = text_urls(0)
13 | val t2_url = text_urls(1)
14 |
15 | import scala.io.Source
16 |
17 | val t1_tokens = Source.fromURL(t1_url).mkString.split("\\s+")
18 | val t2_tokens = Source.fromURL(t2_url).mkString.split("\\s+")
19 |
20 | // COMMAND ----------
21 |
22 | val t1_tokensRDD = sc.parallelize(t1_tokens).zipWithIndex()
23 | val t2_tokensRDD = sc.parallelize(t2_tokens).zipWithIndex()
24 | println(t1_tokensRDD.take(5).mkString)
25 | println(t2_tokensRDD.take(5).mkString)
26 |
27 |
28 | // COMMAND ----------
29 |
30 | val stop_words = Array("a", "i", "an", "as", "able", "about", "above", "according", "accordingly", "across", "actually", "after", "afterwards", "again", "against", "aint", "all", "allow", "allows", "almost", "alone", "along", "already", "also", "although", "always", "am", "among", "amongst", "an", "and", "another", "any", "anybody", "anyhow", "anyone", "anything", "anyway", "anyways", "anywhere", "apart", "appear", "appreciate", "appropriate", "are", "arent", "around", "as", "aside", "ask", "asking", "associated", "at", "available", "away", "awfully", "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", "behind", "being", "believe", "below", "beside", "besides", "best", "better", "between", "beyond", "both", "brief", "but", "by", "cmon", "cs", "came", "can", "cant", "cannot", "cant", "cause", "causes", "certain", "certainly", "changes", "clearly", "co", "com", "come", "comes", "concerning", "consequently", "consider", "considering", "contain", "containing", "contains", "corresponding", "could", "couldnt", "course", "currently", "definitely", "described", "despite", "did", "didnt", "different", "do", "does", "doesnt", "doing", "dont", "done", "down", "downwards", "during", "each", "edu", "eg", "eight", "either", "else", "elsewhere", "enough", "entirely", "especially", "et", "etc", "even", "ever", "every", "everybody", "everyone", "everything", "everywhere", "ex", "exactly", "example", "except", "far", "few", "fifth", "first", "five", "followed", "following", "follows", "for", "former", "formerly", "forth", "four", "from", "further", "furthermore", "get", "gets", "getting", "given", "gives", "go", "goes", "going", "gone", "got", "gotten", "greetings", "had", "hadnt", "happens", "hardly", "has", "hasnt", "have", "havent", "having", "he", "hes", "hello", "help", "hence", "her", "here", "heres", "hereafter", "hereby", "herein", "hereupon", "hers", "herself", "hi", "him", "himself", "his", "hither", "hopefully", "how", "howbeit", "however", "id", "ill", "im", "ive", "ie", "if", "ignored", "immediate", "in", "inasmuch", "inc", "indeed", "indicate", "indicated", "indicates", "inner", "insofar", "instead", "into", "inward", "is", "isnt", "it", "itd", "itll", "its", "its", "itself", "just", "keep", "keeps", "kept", "know", "known", "knows", "last", "lately", "later", "latter", "latterly", "least", "less", "lest", "let", "lets", "like", "liked", "likely", "little", "look", "looking", "looks", "ltd", "mainly", "many", "may", "maybe", "me", "mean", "meanwhile", "merely", "might", "more", "moreover", "most", "mostly", "much", "must", "my", "myself", "name", "namely", "nd", "near", "nearly", "necessary", "need", "needs", "neither", "never", "nevertheless", "new", "next", "nine", "no", "nobody", "non", "none", "noone", "nor", "normally", "not", "nothing", "novel", "now", "nowhere", "obviously", "of", "off", "often", "oh", "ok", "okay", "old", "on", "once", "one", "ones", "only", "onto", "or", "other", "others", "otherwise", "ought", "our", "ours", "ourselves", "out", "outside", "over", "overall", "own", "particular", "particularly", "per", "perhaps", "placed", "please", "plus", "possible", "presumably", "probably", "provides", "que", "quite", "qv", "rather", "rd", "re", "really", "reasonably", "regarding", "regardless", "regards", "relatively", "respectively", "right", "said", "same", "saw", "say", "saying", "says", "second", "secondly", "see", "seeing", "seem", "seemed", "seeming", "seems", "seen", "self", "selves", "sensible", "sent", "serious", "seriously", 
"seven", "several", "shall", "she", "should", "shouldnt", "since", "six", "so", "some", "somebody", "somehow", "someone", "something", "sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry", "specified", "specify", "specifying", "still", "sub", "such", "sup", "sure", "ts", "take", "taken", "tell", "tends", "th", "than", "thank", "thanks", "thanx", "that", "thats", "thats", "the", "their", "theirs", "them", "themselves", "then", "thence", "there", "theres", "thereafter", "thereby", "therefore", "therein", "theres", "thereupon", "these", "they", "theyd", "theyll", "theyre", "theyve", "think", "third", "this", "thorough", "thoroughly", "those", "though", "three", "through", "throughout", "thru", "thus", "to", "together", "too", "took", "toward", "towards", "tried", "tries", "truly", "try", "trying", "twice", "two", "un", "under", "unfortunately", "unless", "unlikely", "until", "unto", "up", "upon", "us", "use", "used", "useful", "uses", "using", "usually", "value", "various", "very", "via", "viz", "vs", "want", "wants", "was", "wasnt", "way", "we", "wed", "well", "were", "weve", "welcome", "well", "went", "were", "werent", "what", "whats", "whatever", "when", "whence", "whenever", "where", "wheres", "whereafter", "whereas", "whereby", "wherein", "whereupon", "wherever", "whether", "which", "while", "whither", "who", "whos", "whoever", "whole", "whom", "whose", "why", "will", "willing", "wish", "with", "within", "without", "wont", "wonder", "would", "wouldnt", "yes", "yet", "you", "youd", "youll", "youre", "youve", "your", "yours", "yourself", "yourselves", "zero")
31 |
32 | // COMMAND ----------
33 |
34 | // strip non-word characters, normalize to lowercase, remove stop words
35 | val t1_cleaned_tokensRDD = t1_tokensRDD
36 | .map(p=>(p._1.replaceAll("\\W+", "").toLowerCase, p._2))
37 | .filter(p => { p._1.length>1 && !(stop_words contains p._1) })
38 |
39 | println(t1_cleaned_tokensRDD.take(15).mkString)
40 | val t2_cleaned_tokensRDD = t2_tokensRDD
41 | .map(p=>(p._1.replaceAll("\\W+", "").toLowerCase, p._2))
42 | .filter(p => { p._1.length>1 && !(stop_words contains p._1) })
43 |
44 | println(t2_cleaned_tokensRDD.take(15).mkString)
45 | //t2_cleaned_tokensRDD.collect()
46 |
47 | val t1_stemmedRDD = t1_cleaned_tokensRDD.map(p=>(Stemmer.stem(p._1), p._2))
48 | println(t1_stemmedRDD.take(5).mkString)
49 | val t2_stemmedRDD = t2_cleaned_tokensRDD.map(p=>(Stemmer.stem(p._1), p._2))
50 |
51 | // COMMAND ----------
52 |
53 | val t1_concRDD = t1_stemmedRDD.groupByKey()
54 | t1_concRDD.take(5).foreach(ex=> {
55 | println("key " + ex._1 + " -- loci " + ex._2.mkString(","))
56 | })
57 | val t2_concRDD = t2_stemmedRDD.groupByKey()
58 |
59 |
60 | // COMMAND ----------
61 |
62 | val t1_bigram = t1_concRDD.cartesian(t1_concRDD).filter(p=>(p._1._1 < p._2._1))
63 | val t2_bigram = t2_concRDD.cartesian(t2_concRDD).filter(p=>(p._1._1 < p._2._1))
64 | 
65 | // COMMAND ----------
66 | 
67 | // toss all pairs which never occur within "distance" of each other:
68 | val distance = 7
69 | def findBigramsWithin(pair: ((String, Iterable[Long]), (String, Iterable[Long]))) = {
70 | val (p, q) = pair
71 | val loci = for {
72 | loc1 <- p._2
73 | loc2 <- q._2
74 | if math.abs(loc1 - loc2) < distance
75 | } yield (loc1, loc2)
76 | ((p._1, q._1), loci)
77 | }
78 | 
79 | // COMMAND ----------
80 | 
81 | val t1_bigram_loci = t1_bigram.map(findBigramsWithin).filter(p=>p._2.size>0)
82 | println(t1_bigram_loci.take(10).mkString(","))
83 |
84 | val t2_bigram_loci = t2_bigram.map(findBigramsWithin).filter(p=>p._2.size>0)
85 |
86 | // COMMAND ----------
87 |
88 | val joinedRDD = t1_bigram_loci.join(t2_bigram_loci)
89 | for (entry <- joinedRDD.collect) {
90 | println(entry._1 + "\n\t" + entry._2._1 + "\n\t" + entry._2._2 + "\n\n")
91 | }
92 |
93 |
94 | }
95 | }
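96 | 
97 | // this RDD version can be submitted like DFRye, e.g.:
98 | //   spark-submit --master local[*] --class "Rye" target/scala-2.10/rye_2.10-1.0.jar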
--------------------------------------------------------------------------------
/gutenrye/gut2.py:
--------------------------------------------------------------------------------
1 | from pyspark import SparkContext
2 | from porter2 import stem
3 | import urllib2
4 | import re
5 |
6 | sc = SparkContext()
7 |
8 | #text_urls = ['https://www.gutenberg.org/cache/epub/77/pg77.txt', 'http://www.gutenberg.org/cache/epub/2701/pg2701.txt']
9 | text_urls = ['https://dl.dropboxusercontent.com/u/105876471/tth.txt', 'https://dl.dropboxusercontent.com/u/105876471/coc.txt']
10 | text1_url = text_urls[0]
11 | text2_url = text_urls[1]
12 |
13 | # Load from web:
14 | def wgetAndTokenize(url):
15 | response = urllib2.urlopen(url)
16 | data = response.read()
17 | return re.split(r'\s+', data)
18 |
19 | text1_tokens = wgetAndTokenize(text1_url)
20 | text2_tokens = wgetAndTokenize(text2_url)
21 |
22 | # make RDD with list of words along with their position in the original text (so we can find context later)
23 | text1_tokensRDD = sc.parallelize(text1_tokens).zipWithIndex()
24 | text2_tokensRDD = sc.parallelize(text2_tokens).zipWithIndex()
25 | #print text1_tokensRDD.take(5)
26 |
27 | # define a list of stop words (chosen fairly arbitrarily)
28 | stop_words = ['a', 'i', 'an', 'as', 'able', 'about', 'above', 'according', 'accordingly', 'across', 'actually', 'after', 'afterwards', 'again', 'against', 'aint', 'all', 'allow', 'allows', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'am', 'among', 'amongst', 'an', 'and', 'another', 'any', 'anybody', 'anyhow', 'anyone', 'anything', 'anyway', 'anyways', 'anywhere', 'apart', 'appear', 'appreciate', 'appropriate', 'are', 'arent', 'around', 'as', 'aside', 'ask', 'asking', 'associated', 'at', 'available', 'away', 'awfully', 'be', 'became', 'because', 'become', 'becomes', 'becoming', 'been', 'before', 'beforehand', 'behind', 'being', 'believe', 'below', 'beside', 'besides', 'best', 'better', 'between', 'beyond', 'both', 'brief', 'but', 'by', 'cmon', 'cs', 'came', 'can', 'cant', 'cannot', 'cant', 'cause', 'causes', 'certain', 'certainly', 'changes', 'clearly', 'co', 'com', 'come', 'comes', 'concerning', 'consequently', 'consider', 'considering', 'contain', 'containing', 'contains', 'corresponding', 'could', 'couldnt', 'course', 'currently', 'definitely', 'described', 'despite', 'did', 'didnt', 'different', 'do', 'does', 'doesnt', 'doing', 'dont', 'done', 'down', 'downwards', 'during', 'each', 'edu', 'eg', 'eight', 'either', 'else', 'elsewhere', 'enough', 'entirely', 'especially', 'et', 'etc', 'even', 'ever', 'every', 'everybody', 'everyone', 'everything', 'everywhere', 'ex', 'exactly', 'example', 'except', 'far', 'few', 'fifth', 'first', 'five', 'followed', 'following', 'follows', 'for', 'former', 'formerly', 'forth', 'four', 'from', 'further', 'furthermore', 'get', 'gets', 'getting', 'given', 'gives', 'go', 'goes', 'going', 'gone', 'got', 'gotten', 'greetings', 'had', 'hadnt', 'happens', 'hardly', 'has', 'hasnt', 'have', 'havent', 'having', 'he', 'hes', 'hello', 'help', 'hence', 'her', 'here', 'heres', 'hereafter', 'hereby', 'herein', 'hereupon', 'hers', 'herself', 'hi', 'him', 'himself', 'his', 'hither', 'hopefully', 'how', 'howbeit', 'however', 'id', 'ill', 'im', 'ive', 'ie', 'if', 'ignored', 'immediate', 'in', 'inasmuch', 'inc', 'indeed', 'indicate', 'indicated', 'indicates', 'inner', 'insofar', 'instead', 'into', 'inward', 'is', 'isnt', 'it', 'itd', 'itll', 'its', 'its', 'itself', 'just', 'keep', 'keeps', 'kept', 'know', 'known', 'knows', 'last', 'lately', 'later', 'latter', 'latterly', 'least', 'less', 'lest', 'let', 'lets', 'like', 'liked', 'likely', 'little', 'look', 'looking', 'looks', 'ltd', 'mainly', 'many', 'may', 'maybe', 'me', 'mean', 'meanwhile', 'merely', 'might', 'more', 'moreover', 'most', 'mostly', 'much', 'must', 'my', 'myself', 'name', 'namely', 'nd', 'near', 'nearly', 'necessary', 'need', 'needs', 'neither', 'never', 'nevertheless', 'new', 'next', 'nine', 'no', 'nobody', 'non', 'none', 'noone', 'nor', 'normally', 'not', 'nothing', 'novel', 'now', 'nowhere', 'obviously', 'of', 'off', 'often', 'oh', 'ok', 'okay', 'old', 'on', 'once', 'one', 'ones', 'only', 'onto', 'or', 'other', 'others', 'otherwise', 'ought', 'our', 'ours', 'ourselves', 'out', 'outside', 'over', 'overall', 'own', 'particular', 'particularly', 'per', 'perhaps', 'placed', 'please', 'plus', 'possible', 'presumably', 'probably', 'provides', 'que', 'quite', 'qv', 'rather', 'rd', 're', 'really', 'reasonably', 'regarding', 'regardless', 'regards', 'relatively', 'respectively', 'right', 'said', 'same', 'saw', 'say', 'saying', 'says', 'second', 'secondly', 'see', 'seeing', 'seem', 'seemed', 'seeming', 'seems', 'seen', 'self', 'selves', 'sensible', 'sent', 'serious', 'seriously', 'seven', 
'several', 'shall', 'she', 'should', 'shouldnt', 'since', 'six', 'so', 'some', 'somebody', 'somehow', 'someone', 'something', 'sometime', 'sometimes', 'somewhat', 'somewhere', 'soon', 'sorry', 'specified', 'specify', 'specifying', 'still', 'sub', 'such', 'sup', 'sure', 'ts', 'take', 'taken', 'tell', 'tends', 'th', 'than', 'thank', 'thanks', 'thanx', 'that', 'thats', 'thats', 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'thence', 'there', 'theres', 'thereafter', 'thereby', 'therefore', 'therein', 'theres', 'thereupon', 'these', 'they', 'theyd', 'theyll', 'theyre', 'theyve', 'think', 'third', 'this', 'thorough', 'thoroughly', 'those', 'though', 'three', 'through', 'throughout', 'thru', 'thus', 'to', 'together', 'too', 'took', 'toward', 'towards', 'tried', 'tries', 'truly', 'try', 'trying', 'twice', 'two', 'un', 'under', 'unfortunately', 'unless', 'unlikely', 'until', 'unto', 'up', 'upon', 'us', 'use', 'used', 'useful', 'uses', 'using', 'usually', 'value', 'various', 'very', 'via', 'viz', 'vs', 'want', 'wants', 'was', 'wasnt', 'way', 'we', 'wed', 'well', 'were', 'weve', 'welcome', 'well', 'went', 'were', 'werent', 'what', 'whats', 'whatever', 'when', 'whence', 'whenever', 'where', 'wheres', 'whereafter', 'whereas', 'whereby', 'wherein', 'whereupon', 'wherever', 'whether', 'which', 'while', 'whither', 'who', 'whos', 'whoever', 'whole', 'whom', 'whose', 'why', 'will', 'willing', 'wish', 'with', 'within', 'without', 'wont', 'wonder', 'would', 'wouldnt', 'yes', 'yet', 'you', 'youd', 'youll', 'youre', 'youve', 'your', 'yours', 'yourself', 'yourselves', 'zero']
29 |
30 | # get rid of sequences of non-word chars, keep remaining strings with something in them, and not in stop list:
31 | text1_tokensRDD = text1_tokensRDD.map(lambda p:(re.sub('\W+', '', p[0]).lower(), p[1])).filter(lambda p:len(p[0])>0 and not p[0] in stop_words)
32 | #print text1_tokensRDD.take(5)
33 | text2_tokensRDD = text2_tokensRDD.map(lambda p:(re.sub('\W+', '', p[0]).lower(), p[1])).filter(lambda p:len(p[0])>0 and not p[0] in stop_words)
34 |
35 | # stem the words using imported stem function (chosen arbitrarily)
36 | text1_stemmedRDD = text1_tokensRDD.map(lambda p:(stem(p[0]), p[1]))
37 | #print text1_stemmedRDD.take(5)
38 | text2_stemmedRDD = text2_tokensRDD.map(lambda p:(stem(p[0]), p[1]))
39 |
40 | # for each word, get the list of loci:
41 | text1_concRDD = text1_stemmedRDD.groupByKey()
42 | #print text1_concRDD.take(5)
43 | text2_concRDD = text2_stemmedRDD.groupByKey()
44 |
45 | # find every pair of words (brute force)
46 | text1_bigrams = text1_concRDD.cartesian(text1_concRDD)
47 | #print text1_bigrams.first()
48 | text2_bigrams = text2_concRDD.cartesian(text2_concRDD)
49 |
50 | # eliminate transposed pairs, and dupes -- keep ("a","b"); not ("b", "a") or ("a", "a") etc
51 | text1_bigrams = text1_bigrams.filter(lambda p:p[0][0] < p[1][0])
52 | #print text1_bigrams.first()
53 | text2_bigrams = text2_bigrams.filter(lambda p:p[0][0] < p[1][0])
54 |
55 | # toss all pairs which never occur within "distance" of each other:
56 | distance = 7
57 | def findBigramsWithin(pair):
58 | p,q = pair
59 | return ((p[0],q[0]), [(loc1, loc2) for loc1 in p[1] for loc2 in q[1] if abs(loc1-loc2) < distance])
60 | 
61 | text1_bigram_loci = text1_bigrams.map(findBigramsWithin).filter(lambda p:len(p[1])>0)
62 | #print text1_bigram_loci.take(10)
63 | text2_bigram_loci = text2_bigrams.map(findBigramsWithin).filter(lambda p:len(p[1])>0)
64 |
65 | # "match" bigram+loci from text1 with same bigram (and other loci) from text2 (keeping only those that occur in both)
66 | joined = text1_bigram_loci.join(text2_bigram_loci)
67 |
68 | # make it run and print a report
69 | for bigram in joined.collect():
70 | print "\n"+str(bigram[0])
71 | print "\ttext 1 loci"
72 | for locus in bigram[1][0]:
73 | lo,hi = min(locus[0],locus[1]),max(locus[0],locus[1])
74 | print "\t\t" + str(locus) + ": " + " ".join(text1_tokens[lo-4:hi+4])
75 | print "\ttext 2 loci"
76 | for locus in bigram[1][1]:
77 | lo,hi = min(locus[0],locus[1]),max(locus[0],locus[1])
78 | print "\t\t" + str(locus) + ": " + " ".join(text2_tokens[lo-4:hi+4])
79 |
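80 | # each element of `joined` has the shape:
81 | #   ((stem1, stem2), ([text1 (loc1, loc2) pairs], [text2 (loc1, loc2) pairs]))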
--------------------------------------------------------------------------------
/gutenrye/scala-proj/src/main/scala/DFRye.scala:
--------------------------------------------------------------------------------
1 | import org.apache.spark.SparkContext
2 | import org.apache.spark.SparkContext._
3 | import org.apache.spark.sql._
4 | import org.apache.spark.SparkConf
5 |
6 | object DFRye {
7 | def main(args: Array[String]) {
8 | val conf = new SparkConf().setAppName("Rye")
9 | val sc = new SparkContext(conf)
10 |
11 | val sqlContext = new org.apache.spark.sql.SQLContext(sc)
12 | import sqlContext.implicits._
13 | import org.apache.spark.sql.functions._
14 |
15 | // Databricks notebook source exported at Fri, 7 Aug 2015 05:52:33 UTC
16 | val text_urls = Array("https://dl.dropboxusercontent.com/u/105876471/tth.txt", "https://dl.dropboxusercontent.com/u/105876471/coc.txt")
17 | // val text_urls = Array("https://www.gutenberg.org/cache/epub/77/pg77.txt", "https://www.gutenberg.org/ebooks/2701.txt.utf-8")
18 | val t1_url = text_urls(0)
19 | val t2_url = text_urls(1)
20 |
21 | import scala.io.Source
22 | val t1_tokens = Source.fromURL(t1_url).mkString.split("\\s+")
23 | val t2_tokens = Source.fromURL(t2_url).mkString.split("\\s+")
24 |
25 | // COMMAND ----------
26 |
27 | val t1_tokensRDD = sc.parallelize(t1_tokens).zipWithIndex()
28 | val t2_tokensRDD = sc.parallelize(t2_tokens).zipWithIndex()
29 | println(t1_tokensRDD.take(5).mkString)
30 | println(t2_tokensRDD.take(5).mkString)
31 |
32 | // COMMAND ----------
33 |
34 | val stop_words = Array("a", "i", "an", "as", "able", "about", "above", "according", "accordingly", "across", "actually", "after", "afterwards", "again", "against", "aint", "all", "allow", "allows", "almost", "alone", "along", "already", "also", "although", "always", "am", "among", "amongst", "an", "and", "another", "any", "anybody", "anyhow", "anyone", "anything", "anyway", "anyways", "anywhere", "apart", "appear", "appreciate", "appropriate", "are", "arent", "around", "as", "aside", "ask", "asking", "associated", "at", "available", "away", "awfully", "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", "behind", "being", "believe", "below", "beside", "besides", "best", "better", "between", "beyond", "both", "brief", "but", "by", "cmon", "cs", "came", "can", "cant", "cannot", "cant", "cause", "causes", "certain", "certainly", "changes", "clearly", "co", "com", "come", "comes", "concerning", "consequently", "consider", "considering", "contain", "containing", "contains", "corresponding", "could", "couldnt", "course", "currently", "definitely", "described", "despite", "did", "didnt", "different", "do", "does", "doesnt", "doing", "dont", "done", "down", "downwards", "during", "each", "edu", "eg", "eight", "either", "else", "elsewhere", "enough", "entirely", "especially", "et", "etc", "even", "ever", "every", "everybody", "everyone", "everything", "everywhere", "ex", "exactly", "example", "except", "far", "few", "fifth", "first", "five", "followed", "following", "follows", "for", "former", "formerly", "forth", "four", "from", "further", "furthermore", "get", "gets", "getting", "given", "gives", "go", "goes", "going", "gone", "got", "gotten", "greetings", "had", "hadnt", "happens", "hardly", "has", "hasnt", "have", "havent", "having", "he", "hes", "hello", "help", "hence", "her", "here", "heres", "hereafter", "hereby", "herein", "hereupon", "hers", "herself", "hi", "him", "himself", "his", "hither", "hopefully", "how", "howbeit", "however", "id", "ill", "im", "ive", "ie", "if", "ignored", "immediate", "in", "inasmuch", "inc", "indeed", "indicate", "indicated", "indicates", "inner", "insofar", "instead", "into", "inward", "is", "isnt", "it", "itd", "itll", "its", "its", "itself", "just", "keep", "keeps", "kept", "know", "known", "knows", "last", "lately", "later", "latter", "latterly", "least", "less", "lest", "let", "lets", "like", "liked", "likely", "little", "look", "looking", "looks", "ltd", "mainly", "many", "may", "maybe", "me", "mean", "meanwhile", "merely", "might", "more", "moreover", "most", "mostly", "much", "must", "my", "myself", "name", "namely", "nd", "near", "nearly", "necessary", "need", "needs", "neither", "never", "nevertheless", "new", "next", "nine", "no", "nobody", "non", "none", "noone", "nor", "normally", "not", "nothing", "novel", "now", "nowhere", "obviously", "of", "off", "often", "oh", "ok", "okay", "old", "on", "once", "one", "ones", "only", "onto", "or", "other", "others", "otherwise", "ought", "our", "ours", "ourselves", "out", "outside", "over", "overall", "own", "particular", "particularly", "per", "perhaps", "placed", "please", "plus", "possible", "presumably", "probably", "provides", "que", "quite", "qv", "rather", "rd", "re", "really", "reasonably", "regarding", "regardless", "regards", "relatively", "respectively", "right", "said", "same", "saw", "say", "saying", "says", "second", "secondly", "see", "seeing", "seem", "seemed", "seeming", "seems", "seen", "self", "selves", "sensible", "sent", "serious", "seriously", 
"seven", "several", "shall", "she", "should", "shouldnt", "since", "six", "so", "some", "somebody", "somehow", "someone", "something", "sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry", "specified", "specify", "specifying", "still", "sub", "such", "sup", "sure", "ts", "take", "taken", "tell", "tends", "th", "than", "thank", "thanks", "thanx", "that", "thats", "thats", "the", "their", "theirs", "them", "themselves", "then", "thence", "there", "theres", "thereafter", "thereby", "therefore", "therein", "theres", "thereupon", "these", "they", "theyd", "theyll", "theyre", "theyve", "think", "third", "this", "thorough", "thoroughly", "those", "though", "three", "through", "throughout", "thru", "thus", "to", "together", "too", "took", "toward", "towards", "tried", "tries", "truly", "try", "trying", "twice", "two", "un", "under", "unfortunately", "unless", "unlikely", "until", "unto", "up", "upon", "us", "use", "used", "useful", "uses", "using", "usually", "value", "various", "very", "via", "viz", "vs", "want", "wants", "was", "wasnt", "way", "we", "wed", "well", "were", "weve", "welcome", "well", "went", "were", "werent", "what", "whats", "whatever", "when", "whence", "whenever", "where", "wheres", "whereafter", "whereas", "whereby", "wherein", "whereupon", "wherever", "whether", "which", "while", "whither", "who", "whos", "whoever", "whole", "whom", "whose", "why", "will", "willing", "wish", "with", "within", "without", "wont", "wonder", "would", "wouldnt", "yes", "yet", "you", "youd", "youll", "youre", "youve", "your", "yours", "yourself", "yourselves", "zero")
35 |
36 | // COMMAND ----------
37 |
38 | // strip non-word characters, normalize to lowercase, remove stop words
39 | val t1_cleaned_tokensRDD = t1_tokensRDD
40 | .map(p=>(p._1.replaceAll("\\W+", "").toLowerCase, p._2))
41 | .filter(p => { p._1.length>1 && !(stop_words contains p._1) })
42 |
43 | println(t1_cleaned_tokensRDD.take(15).mkString)
44 | val t2_cleaned_tokensRDD = t2_tokensRDD
45 | .map(p=>(p._1.replaceAll("\\W+", "").toLowerCase, p._2))
46 | .filter(p => { p._1.length>1 && !(stop_words contains p._1) })
47 |
48 | println(t2_cleaned_tokensRDD.take(15).mkString)
49 |
50 | // COMMAND ----------
51 |
52 | val t1_stemmedRDD = t1_cleaned_tokensRDD.map(p=>(Stemmer.stem(p._1), p._2))
53 | println(t1_stemmedRDD.take(5).mkString)
54 | val t2_stemmedRDD = t2_cleaned_tokensRDD.map(p=>(Stemmer.stem(p._1), p._2))
55 |
56 |
57 | // COMMAND ----------
58 |
59 | val t1_concRDD = t1_stemmedRDD.groupByKey()
60 | t1_concRDD.take(5).foreach(ex=> {
61 | println("key " + ex._1 + " -- loci " + ex._2.mkString(","))
62 | })
63 | val t2_concRDD = t2_stemmedRDD.groupByKey()
64 |
65 |
66 | // COMMAND ----------
67 | val t1d1 = t1_concRDD.map(p => (p._1, p._2.toList)).toDF("entry", "loci")
68 | val t1d2 = t1_concRDD.map(p => (p._1, p._2.toList)).toDF("entry2", "loci2")
69 |
70 | var t1j = t1d1.join(t1d2, t1d1("entry") < t1d2("entry2"))
71 | 
72 | // toss all pairs which never occur within "distance" of each other:
73 | val distance = 7
74 | val bigrams = udf((l1: Seq[Long], l2: Seq[Long]) => {
75 | for {
76 | loc1 <- l1
77 | loc2 <- l2
78 | if (Math.abs(loc1 - loc2) < distance)
79 | } yield (loc1, loc2)
80 | })
81 | 
82 | // COMMAND ----------
83 | 
84 | // length of the matched-loci array, used to filter out empty matches:
85 | val getLen = udf((e: Seq[Row]) => e.length)
86 |
87 | t1j = t1j.withColumn("bg", bigrams(t1j("loci"), t1j("loci2")))
88 |
89 | t1j = t1j.filter(getLen(t1j("bg"))>0)
90 | t1j = t1j.select("entry", "entry2", "bg")
91 | t1j.show()
92 |
93 | val t2d1 = t2_concRDD.map(p => (p._1, p._2.toList)).toDF("entry", "loci")
94 | val t2d2 = t2_concRDD.map(p => (p._1, p._2.toList)).toDF("entry2", "loci2")
95 | var t2j = t2d1.join(t2d2, t2d1("entry") < t2d2("entry2"))
96 | t2j = t2j.withColumn("bg", bigrams(t2j("loci"), t2j("loci2")))
97 | t2j = t2j.filter(getLen(t2j("bg"))>0)
98 | t2j = t2j.select("entry", "entry2", "bg")
99 |
100 | // COMMAND ----------
101 |
102 | val t2rename = t2j.withColumnRenamed("entry","e").withColumnRenamed("entry2","e2").withColumnRenamed("bg", "bg2")
103 | val joinedDF = t1j.join(t2rename, t1j("entry") === t2rename("e") && t1j("entry2") === t2rename("e2")).select("entry", "entry2", "bg", "bg2")
104 | joinedDF.show()
105 |
106 | // COMMAND ----------
107 |
108 | joinedDF.explain
109 | }
110 | }
111 | // COMMAND ----------
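112 | 
113 | // note: joinedDF.explain (line 108 above) prints the physical plan, useful for
114 | // checking what the Catalyst optimizer did with the joins (cf. README "Plans")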
--------------------------------------------------------------------------------
/gutenrye/porter2.py:
--------------------------------------------------------------------------------
1 | # borrowed from https://pypi.python.org/pypi/stemming/1.0 (public domain)
2 |
3 | """An implementation of the Porter2 stemming algorithm.
4 | See http://snowball.tartarus.org/algorithms/english/stemmer.html
5 |
6 | Adapted from pyporter2 by Michael Dirolf.
7 |
8 | This algorithm is more correct but (at least in this implementation)
9 | several times slower than the original porter algorithm as implemented
10 | in stemming.porter.
11 | """
12 |
13 | import re
14 |
15 | r_exp = re.compile(r"[^aeiouy]*[aeiouy]+[^aeiouy](\w*)")
16 | ewss_exp1 = re.compile(r"^[aeiouy][^aeiouy]$")
17 | ewss_exp2 = re.compile(r".*[^aeiouy][aeiouy][^aeiouywxY]$")
18 | ccy_exp = re.compile(r"([aeiouy])y")
19 | s1a_exp = re.compile(r"[aeiouy].")
20 | s1b_exp = re.compile(r"[aeiouy]")
21 |
22 | def get_r1(word):
23 | # exceptional forms
24 | if word.startswith('gener') or word.startswith('arsen'):
25 | return 5
26 | if word.startswith('commun'):
27 | return 6
28 |
29 | # normal form
30 | match = r_exp.match(word)
31 | if match:
32 | return match.start(1)
33 | return len(word)
34 |
35 | def get_r2(word):
36 | match = r_exp.match(word, get_r1(word))
37 | if match:
38 | return match.start(1)
39 | return len(word)
40 |
41 | def ends_with_short_syllable(word):
42 | if len(word) == 2:
43 | if ewss_exp1.match(word):
44 | return True
45 | if ewss_exp2.match(word):
46 | return True
47 | return False
48 |
49 | def is_short_word(word):
50 | if ends_with_short_syllable(word):
51 | if get_r1(word) == len(word):
52 | return True
53 | return False
54 |
55 | def remove_initial_apostrophe(word):
56 | if word.startswith("'"):
57 | return word[1:]
58 | return word
59 |
60 | def capitalize_consonant_ys(word):
61 | if word.startswith('y'):
62 | word = 'Y' + word[1:]
63 | return ccy_exp.sub('\g<1>Y', word)
64 |
65 | def step_0(word):
66 | if word.endswith("'s'"):
67 | return word[:-3]
68 | if word.endswith("'s"):
69 | return word[:-2]
70 | if word.endswith("'"):
71 | return word[:-1]
72 | return word
73 |
74 | def step_1a(word):
75 | if word.endswith('sses'):
76 | return word[:-4] + 'ss'
77 | if word.endswith('ied') or word.endswith('ies'):
78 | if len(word) > 4:
79 | return word[:-3] + 'i'
80 | else:
81 | return word[:-3] + 'ie'
82 | if word.endswith('us') or word.endswith('ss'):
83 | return word
84 | if word.endswith('s'):
85 | preceding = word[:-1]
86 | if s1a_exp.search(preceding):
87 | return preceding
88 | return word
89 | return word
90 |
91 | doubles = ('bb', 'dd', 'ff', 'gg', 'mm', 'nn', 'pp', 'rr', 'tt')
92 | def ends_with_double(word):
93 | for double in doubles:
94 | if word.endswith(double):
95 | return True
96 | return False
97 | def step_1b_helper(word):
98 | if word.endswith('at') or word.endswith('bl') or word.endswith('iz'):
99 | return word + 'e'
100 | if ends_with_double(word):
101 | return word[:-1]
102 | if is_short_word(word):
103 | return word + 'e'
104 | return word
105 | s1b_suffixes = ('ed', 'edly', 'ing', 'ingly')
106 |
107 | def step_1b(word, r1):
108 | if word.endswith('eedly'):
109 | if len(word) - 5 >= r1:
110 | return word[:-3]
111 | return word
112 | if word.endswith('eed'):
113 | if len(word) - 3 >= r1:
114 | return word[:-1]
115 | return word
116 |
117 | for suffix in s1b_suffixes:
118 | if word.endswith(suffix):
119 | preceding = word[:-len(suffix)]
120 | if s1b_exp.search(preceding):
121 | return step_1b_helper(preceding)
122 | return word
123 |
124 | return word
125 |
126 | def step_1c(word):
127 | if word.endswith('y') or word.endswith('Y') and len(word) > 1:
128 | if word[-2] not in 'aeiouy':
129 | if len(word) > 2:
130 | return word[:-1] + 'i'
131 | return word
132 |
133 | def step_2_helper(word, r1, end, repl, prev):
134 | if word.endswith(end):
135 | if len(word) - len(end) >= r1:
136 | if prev == []:
137 | return word[:-len(end)] + repl
138 | for p in prev:
139 | if word[:-len(end)].endswith(p):
140 | return word[:-len(end)] + repl
141 | return word
142 | return None
143 | s2_triples = (('ization', 'ize', []),
144 | ('ational', 'ate', []),
145 | ('fulness', 'ful', []),
146 | ('ousness', 'ous', []),
147 | ('iveness', 'ive', []),
148 | ('tional', 'tion', []),
149 | ('biliti', 'ble', []),
150 | ('lessli', 'less', []),
151 | ('entli', 'ent', []),
152 | ('ation', 'ate', []),
153 | ('alism', 'al', []),
154 | ('aliti', 'al', []),
155 | ('ousli', 'ous', []),
156 | ('iviti', 'ive', []),
157 | ('fulli', 'ful', []),
158 | ('enci', 'ence', []),
159 | ('anci', 'ance', []),
160 | ('abli', 'able', []),
161 | ('izer', 'ize', []),
162 | ('ator', 'ate', []),
163 | ('alli', 'al', []),
164 | ('bli', 'ble', []),
165 | ('ogi', 'og', ['l']),
166 | ('li', '', ['c', 'd', 'e', 'g', 'h', 'k', 'm', 'n', 'r', 't']))
167 |
168 | def step_2(word, r1):
169 | for trip in s2_triples:
170 | attempt = step_2_helper(word, r1, trip[0], trip[1], trip[2])
171 | if attempt:
172 | return attempt
173 | return word
174 |
175 | def step_3_helper(word, r1, r2, end, repl, r2_necessary):
176 | if word.endswith(end):
177 | if len(word) - len(end) >= r1:
178 | if not r2_necessary:
179 | return word[:-len(end)] + repl
180 | else:
181 | if len(word) - len(end) >= r2:
182 | return word[:-len(end)] + repl
183 | return word
184 | return None
185 | s3_triples = (('ational', 'ate', False),
186 | ('tional', 'tion', False),
187 | ('alize', 'al', False),
188 | ('icate', 'ic', False),
189 | ('iciti', 'ic', False),
190 | ('ative', '', True),
191 | ('ical', 'ic', False),
192 | ('ness', '', False),
193 | ('ful', '', False))
194 | def step_3(word, r1, r2):
195 | for trip in s3_triples:
196 | attempt = step_3_helper(word, r1, r2, trip[0], trip[1], trip[2])
197 | if attempt:
198 | return attempt
199 | return word
200 |
201 | s4_delete_list = ('al', 'ance', 'ence', 'er', 'ic', 'able', 'ible', 'ant', 'ement',
202 | 'ment', 'ent', 'ism', 'ate', 'iti', 'ous', 'ive', 'ize')
203 |
204 | def step_4(word, r2):
205 | for end in s4_delete_list:
206 | if word.endswith(end):
207 | if len(word) - len(end) >= r2:
208 | return word[:-len(end)]
209 | return word
210 |
211 | if word.endswith('sion') or word.endswith('tion'):
212 | if len(word) - 3 >= r2:
213 | return word[:-3]
214 |
215 | return word
216 |
217 | def step_5(word, r1, r2):
218 | if word.endswith('l'):
219 | if len(word) - 1 >= r2 and word[-2] == 'l':
220 | return word[:-1]
221 | return word
222 |
223 | if word.endswith('e'):
224 | if len(word) - 1 >= r2:
225 | return word[:-1]
226 | if len(word) - 1 >= r1 and not ends_with_short_syllable(word[:-1]):
227 | return word[:-1]
228 |
229 | return word
230 |
231 | def normalize_ys(word):
232 | return word.replace('Y', 'y')
233 |
234 | exceptional_forms = {'skis': 'ski',
235 | 'skies': 'sky',
236 | 'dying': 'die',
237 | 'lying': 'lie',
238 | 'tying': 'tie',
239 | 'idly': 'idl',
240 | 'gently': 'gentl',
241 | 'ugly': 'ugli',
242 | 'early': 'earli',
243 | 'only': 'onli',
244 | 'singly': 'singl',
245 | 'sky': 'sky',
246 | 'news': 'news',
247 | 'howe': 'howe',
248 | 'atlas': 'atlas',
249 | 'cosmos': 'cosmos',
250 | 'bias': 'bias',
251 | 'andes': 'andes'}
252 |
253 | exceptional_early_exit_post_1a = frozenset(['inning', 'outing', 'canning', 'herring',
254 | 'earring', 'proceed', 'exceed', 'succeed'])
255 |
256 |
257 | def stem(word):
258 | if len(word) <= 2:
259 | return word
260 | word = remove_initial_apostrophe(word)
261 |
262 | # handle some exceptional forms
263 | if word in exceptional_forms:
264 | return exceptional_forms[word]
265 |
266 | word = capitalize_consonant_ys(word)
267 | r1 = get_r1(word)
268 | r2 = get_r2(word)
269 | word = step_0(word)
270 | word = step_1a(word)
271 |
272 | # handle some more exceptional forms
273 | if word in exceptional_early_exit_post_1a:
274 | return word
275 |
276 | word = step_1b(word, r1)
277 | word = step_1c(word)
278 | word = step_2(word, r1)
279 | word = step_3(word, r1, r2)
280 | word = step_4(word, r2)
281 | word = step_5(word, r1, r2)
282 | word = normalize_ys(word)
283 |
284 | return word
285 |
286 | if __name__ == "__main__":
287 | assert stem("bill's") == "bill"
288 | assert stem("y's") == "y"
289 |
290 |
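291 | # a couple of extra spot checks (illustrative; easy to verify by hand):
292 | assert stem("caresses") == "caress"
293 | assert stem("running") == "run"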
--------------------------------------------------------------------------------
/gutenrye/scala-proj/src/main/scala/Stemmer.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Scala Porter Stemmer Implementation
3 | *
4 | */
5 | object Stemmer extends Serializable {
6 | def stem (str: String): String = {
7 | // for tokens longer than 3 characters, verify they are purely alphabetic
8 | if (str.length() > 3) {
9 | // if any character is not a letter, bail out with the lowercased token
10 | for (ch <- str.toList) {
11 | if (!Character.isLetter(ch)) {
12 | return str.toLowerCase()
13 | }
14 | }
15 | }
16 | // apply the five Porter steps in order, then lowercase the result:
17 | step_5(step_4(step_3(step_2(step_1(str))))).toLowerCase
18 | }
19 |
20 | def step_1(str: String): String = step_1_c(step_1_b(step_1_a(str)))
21 |
22 | /*
23 | * Step 1a
24 | * SSES -> SS caresses -> caress
25 | * IES -> I ponies -> poni
26 | * ties -> ti
27 | * SS -> SS caress -> caress
28 | * S -> cats -> cat
29 | */
30 | def step_1_a(str: String): String = replacePatterns(str, List( ("sses", "ss"), ("ies", "i"), ("ss", "ss"), ("s", "")), _>=0)
31 |
32 | /*
33 | * Step 1b
34 | * (m>0) EED -> EE feed -> feed
35 | * agreed -> agree
36 | * (*v*) ED -> plastered -> plaster
37 | * bled -> bled
38 | * (*v*) ING -> motoring -> motor
39 | * sing -> sing
40 | */
41 | def step_1_b (str: String): String = {
42 | // (m > 0) EED -> EE
43 | if (str.endsWith("eed")) {
44 | if (stringMeasure(str.substring(0, str.length - 3)) > 0)
45 | return str.substring(0, str.length() - 1)
46 | // (*v*) ED ->
47 | } else if ((str.endsWith("ed")) &&
48 | (containsVowel(str.substring(0, str.length - 2)))) {
49 | return step_1_b_2(str.substring(0, str.length - 2))
50 | // (*v*) ING ->
51 | } else if ((str.endsWith("ing")) &&
52 | (containsVowel(str.substring(0, str.length - 3)))) {
53 | return step_1_b_2(str.substring(0, str.length - 3))
54 | } // end if
55 | str
56 | } // end step1b
57 |
58 | /*
59 | * If the second or third of the rules in Step 1b is successful, the following is done:
60 | * AT -> ATE conflat(ed) -> conflate
61 | * BL -> BLE troubl(ed) -> trouble
62 | * IZ -> IZE siz(ed) -> size
63 | *
64 | * (*d and not (*L or *S or *Z)) -> single letter
65 | * hopp(ing) -> hop
66 | * tann(ed) -> tan
67 | * fall(ing) -> fall
68 | * hiss(ing) -> hiss
69 | * fizz(ed) -> fizz
70 | *
71 | * (m=1 and *o) -> E fail(ing) -> fail
72 | * fil(ing) -> file
73 | */
74 | def step_1_b_2 (str: String): String = {
75 |
76 | if (str.endsWith("at") ||
77 | str.endsWith("bl") ||
78 | str.endsWith("iz")) {
79 | return str + "e";
80 | }
81 | else if ((str.length() > 1) && (endsWithDoubleConsonent(str)) &&
82 | (!(str.endsWith("l") || str.endsWith("s") || str.endsWith("z")))) {
83 | return str.substring(0, str.length() - 1);
84 | }
85 | else if ((stringMeasure(str) == 1) &&
86 | (endsWithCVC(str))) {
87 | return str + "e"
88 | }
89 | str
90 | }
91 |
92 | /*
93 | * (*v*) Y -> I happy -> happi
94 | * sky -> sky
95 | */
96 | def step_1_c(str: String): String = {
97 | if (str.endsWith("y") && containsVowel(str.substring(0, str.length() - 1)))
98 | return str.substring(0, str.length() - 1) + "i"
99 | str
100 | } // end step1c
101 |
102 | /*
103 | * (m>0) ATIONAL -> ATE relational -> relate
104 | * (m>0) TIONAL -> TION conditional -> condition
105 | * rational -> rational
106 | * (m>0) ENCI -> ENCE valenci -> valence
107 | * (m>0) ANCI -> ANCE hesitanci -> hesitance
108 | * (m>0) IZER -> IZE digitizer -> digitize
109 |    * (m>0) BLI     -> BLE    conformabli -> conformable
110 | * (m>0) ALLI -> AL radicalli -> radical
111 | * (m>0) ENTLI -> ENT differentli -> different
112 |    * (m>0) ELI     -> E      vileli -> vile
113 | * (m>0) OUSLI -> OUS analogousli -> analogous
114 | * (m>0) IZATION -> IZE vietnamization -> vietnamize
115 | * (m>0) ATION -> ATE predication -> predicate
116 | * (m>0) ATOR -> ATE operator -> operate
117 | * (m>0) ALISM -> AL feudalism -> feudal
118 | * (m>0) IVENESS -> IVE decisiveness -> decisive
119 | * (m>0) FULNESS -> FUL hopefulness -> hopeful
120 | * (m>0) OUSNESS -> OUS callousness -> callous
121 | * (m>0) ALITI -> AL formaliti -> formal
122 | * (m>0) IVITI -> IVE sensitiviti -> sensitive
123 |    * (m>0) BILITI  -> BLE    sensibiliti -> sensible   (the code also applies LOGI -> LOG)
124 | */
125 | def step_2 (str: String): String = replacePatterns(str, List( ("ational", "ate"), ("tional","tion"), ("enci","ence"), ("anci","ance"),
126 | ("izer","ize"), ("bli","ble"), ("alli", "al"), ("entli","ent"),("eli","e"),
127 | ("ousli","ous"), ("ization","ize"), ("ation","ate"), ("ator","ate"), ("alism","al"),
128 | ("iveness","ive"), ("fulness","ful"), ("ousness", "ous"), ("aliti", "al"), ("iviti","ive"),
129 | ("biliti", "ble"), ("logi", "log")))
130 |
131 | /*
132 | * (m>0) ICATE -> IC triplicate -> triplic
133 | * (m>0) ATIVE -> formative -> form
134 | * (m>0) ALIZE -> AL formalize -> formal
135 | * (m>0) ICITI -> IC electriciti -> electric
136 | * (m>0) ICAL -> IC electrical -> electric
137 | * (m>0) FUL -> hopeful -> hope
138 | * (m>0) NESS -> goodness -> good
139 | */
140 | def step_3 (str: String): String = replacePatterns(str, List( ("icate", "ic"),("ative",""),("alize","al"),("iciti","ic"),("ical","ic"),("ful",""),("ness","")))
141 |
142 | /*
143 | * (m>1) AL -> revival -> reviv
144 | * (m>1) ANCE -> allowance -> allow
145 | * (m>1) ENCE -> inference -> infer
146 | * (m>1) ER -> airliner -> airlin
147 | * (m>1) IC -> gyroscopic -> gyroscop
148 | * (m>1) ABLE -> adjustable -> adjust
149 | * (m>1) IBLE -> defensible -> defens
150 | * (m>1) ANT -> irritant -> irrit
151 | * (m>1) EMENT -> replacement -> replac
152 | * (m>1) MENT -> adjustment -> adjust
153 | * (m>1) ENT -> dependent -> depend
154 | * (m>1 and (*S or *T)) ION -> adoption -> adopt
155 | * (m>1) OU -> homologou -> homolog
156 | * (m>1) ISM -> communism -> commun
157 | * (m>1) ATE -> activate -> activ
158 | * (m>1) ITI -> angulariti -> angular
159 | * (m>1) OUS -> homologous -> homolog
160 | * (m>1) IVE -> effective -> effect
161 | * (m>1) IZE -> bowdlerize -> bowdler
162 | */
163 | def step_4 (str: String): String = {
164 | val res: String = replacePatterns(str, List( ("al",""),("ance",""),("ence",""),("er",""),("ic",""),("able",""),("ible",""),("ant",""),("ement",""),
165 | ("ment",""),("ent",""),("ou", ""),("ism",""),("ate",""),("iti",""),("ous",""),
166 | ("ive",""),("ize","")), _>1)
167 |     if (str == res) { // no suffix from the list applied: handle (m>1 and (*S or *T)) ION -> separately
168 | if ((str.endsWith("sion") || str.endsWith("tion")) && stringMeasure(str.substring(0, str.length() - 3)) > 1)
169 | return str.substring(0, str.length() - 3)
170 | else
171 | return str
172 | }
173 | else {
174 | return res
175 | }
176 | }
177 |
178 | def step_5 (str: String): String = step_5_b(step_5_a(str))
179 |
180 | /*
181 | * (m>1) E -> probate -> probat
182 | * rate -> rate
183 | * (m=1 and not *o) E -> cease -> ceas
184 | */
185 |   def step_5_a (str: String): String = {
186 |     // nothing to remove unless the word ends in 'e' (this also guards the empty string)
187 |     if (!str.endsWith("e"))
188 |       return str
189 |     val base = str.substring(0, str.length() - 1)
190 |     // (m > 1) E ->
191 |     if (stringMeasure(base) > 1)
192 |       return base
193 |     // (m = 1 and not *o) E ->
194 |     if ((stringMeasure(base) == 1) && !endsWithCVC(base))
195 |       return base
196 |     str
197 |   } // end step5a
198 |
199 | /*
200 | * (m > 1 and *d and *L) -> single letter
201 | * controll -> control
202 | * roll -> roll
203 | */
204 | def step_5_b (str: String): String = {
205 | // (m > 1 and *d and *L) ->
206 | if (str.endsWith("l") &&
207 | endsWithDoubleConsonent(str) &&
208 | (stringMeasure(str.substring(0, str.length() - 1)) > 1)) {
209 | str.substring(0, str.length() - 1)
210 | } else {
211 | str
212 | }
213 | } // end step5b
214 |
215 | // does string contain a vowel?
216 | def containsVowel(str: String): Boolean = {
217 |     for (ch <- str) {
218 | if (isVowel(ch))
219 | return true
220 | }
221 |     // no aeiou anywhere: treat any 'y' in the word as a vowel
222 | if (str.indexOf('y') > -1)
223 | return true
224 | else
225 | false
226 | } // end function
227 |
228 | // is char a vowel?
229 | def isVowel(c: Char): Boolean = {
230 | for (ch <- "aeiou" toList)
231 | if (c == ch)
232 | return true
233 | false
234 | } // end function
235 |
236 |   /*
237 |    * Positional vowel check: 'y' can act as a vowel or a consonant depending on its neighbours.
238 |    * An interior 'y' counts as a vowel only when neither neighbour is a vowel.
239 |    */
240 |   def isVowel(str: String, i: Int): Boolean = {
241 |     if (isVowel(str(i)))
242 |       return true
243 |     str(i) == 'y' && i > 0 && i + 1 < str.length && !isVowel(str(i - 1)) && !isVowel(str(i + 1))
244 |   } // end function
245 |
246 |   // returns the Porter measure m of the string: the number of vowel-consonant sequences (e.g. m("tree") = 0, m("trouble") = 1)
247 | def stringMeasure(str: String): Int = {
248 | var count = 0
249 | var vowelSeen: Boolean = false
250 |
251 | for (i <- 0 to str.length - 1) {
252 | if(isVowel(str, i)) {
253 | vowelSeen = true
254 | } else if (vowelSeen) {
255 | count += 1
256 | vowelSeen = false
257 | }
258 | }
259 | count
260 | } // end function
261 |
262 |   // does the stem end consonant-vowel-consonant, with the final consonant not w, x or y?
263 |   def endsWithCVC (str: String): Boolean = {
264 |     if (str.length() >= 3) {
265 |       val cvc = ( str(str.length - 1), str(str.length - 2), str(str.length - 3) )
266 |       // note: cvc_str is the last three letters reversed; isVowel(cvc_str, 1) sees the same two neighbours either way
267 |       val cvc_str = cvc._1.toString + cvc._2 + cvc._3
268 |       if ((cvc._1 == 'w') || (cvc._1 == 'x') || (cvc._1 == 'y'))
269 |         false
270 |       else if (!isVowel(cvc._1) && isVowel(cvc_str, 1) && !isVowel(cvc._3))
271 |         true
272 |       else
273 |         false
274 |     }
275 |     else
276 |       false
277 |   } // end function
278 |
279 |   // does string end with a double consonant?
280 |   def endsWithDoubleConsonant(str: String): Boolean = {
281 |     if (str.length() < 2) return false // guard: without this, stem("l") crashed here via step_5_b
282 |     val c: Char = str.charAt(str.length() - 1)
283 |     if (c == str.charAt(str.length() - 2))
284 |       !containsVowel(str.substring(str.length() - 2))
285 |     else
286 |       false
287 |   } // end function
288 |
289 | def replacePatterns(str: String, patterns: List[(String, String)]): String = replacePatterns(str, patterns, _>0)
290 |   // precondition: pattern occurs in str (callers check endsWith first), so lastIndexOf is never -1
291 |   def replaceLast(str: String, pattern: String, replacement: String) = new StringBuilder(str).replace(str.lastIndexOf(pattern), str.lastIndexOf(pattern) + pattern.length, replacement).toString
292 |   // applies the first suffix that matches, so longer suffixes must be listed before shorter ones that end them
293 | def replacePatterns(str: String, patterns: List[(String, String)], comparer: Int => Boolean): String = {
294 | for (pattern <- patterns)
295 | if (str.endsWith(pattern._1)) {
296 | val res = replaceLast(str, pattern._1, pattern._2)
297 | if (comparer(stringMeasure(replaceLast(str, pattern._1, ""))))
298 | return res
299 | else
300 | return str
301 | }
302 | str
303 | }
304 |
305 | }
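306 |
307 | /*
308 |  * A minimal usage sketch, not part of the original file: a hypothetical demo
309 |  * object that prints stems for a few arbitrary sample words (printed rather
310 |  * than asserted, since exact outputs depend on the implementation above).
311 |  */
312 | object StemmerDemo {
313 |   def main(args: Array[String]): Unit =
314 |     for (w <- Seq("caresses", "ponies", "plastered", "motoring"))
315 |       println(s"$w -> ${Stemmer.stem(w)}")
316 | }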
--------------------------------------------------------------------------------