├── gutenrye
│   ├── gut.sh
│   ├── scala-proj
│   │   ├── rye.sbt
│   │   └── src
│   │       └── main
│   │           └── scala
│   │               ├── Rye.scala
│   │               ├── DFRye.scala
│   │               └── Stemmer.scala
│   ├── launch-gut-scala.sh
│   ├── gutdf.py
│   ├── output-sample.rtf
│   ├── gut2.py
│   └── porter2.py
├── README.md
└── LICENSE

--------------------------------------------------------------------------------
/gutenrye/gut.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | # you'll need to swap in your own paths as appropriate:
4 | 
5 | /Users/adam/Applications/spark-1.4.0-bin-hadoop2.6/bin/spark-submit /Users/adam/Documents/rye/gutenrye/gut2.py
6 | 
7 | 
8 | 

--------------------------------------------------------------------------------
/gutenrye/scala-proj/rye.sbt:
--------------------------------------------------------------------------------
1 | name := "Rye"
2 | 
3 | version := "1.0"
4 | 
5 | scalaVersion := "2.10.4"
6 | 
7 | libraryDependencies += "org.apache.spark" %% "spark-core" % "1.4.0"
8 | 
9 | libraryDependencies += "org.apache.spark" %% "spark-sql" % "1.4.0"

--------------------------------------------------------------------------------
/gutenrye/launch-gut-scala.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | # you'll need to swap in your own paths as appropriate:
4 | 
5 | /Users/adam/Applications/spark-1.4.0-bin-hadoop2.6/bin/spark-submit --master local[*] --class "DFRye" /Users/adam/Documents/rye/gutenrye/scala-proj/target/scala-2.10/rye_2.10-1.0.jar

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # rye
2 | 
3 | Experimental Apache Spark implementation of a subset of the functionality from http://tesserae.caset.buffalo.edu/
4 | 
5 | ### Some notes
6 | 
7 | - The stemmer is public domain, grabbed from a common Python lib
8 | - Stop words are an arbitrary list from the Internet
9 | 
10 | ### Logical ToDos
11 | 
12 | - Get a better, multilingual stemmer
13 | - Better stop word list
14 | - Add filtering of "stop bigrams"
15 |     + Real Tesserae has a scoring engine
16 |     + Bigrams common to a genre/language should probably be scored 0 and filtered out for performance
17 | 
18 | ### Plans
19 | 
20 | - Reimplement parts of the code using the Spark DataFrames/SQL API
21 |     + Hoping to squeeze a little free performance out of the Catalyst optimizer
22 |     + Not using a Python lambda in the "big filter" (i.e., the distance filter after the Cartesian join) might help performance
23 | 

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 | 
3 | Copyright (c) 2015
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /gutenrye/gutdf.py: -------------------------------------------------------------------------------- 1 | from pyspark import SparkContext 2 | from pyspark.sql import HiveContext 3 | from porter2 import stem 4 | import urllib2 5 | import re 6 | 7 | sc = SparkContext() 8 | sqlContext = HiveContext(sc) 9 | 10 | stop_words = ['a', 'i', 'an', 'as', 'able', 'about', 'above', 'according', 'accordingly', 'across', 'actually', 'after', 'afterwards', 'again', 'against', 'aint', 'all', 'allow', 'allows', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'am', 'among', 'amongst', 'an', 'and', 'another', 'any', 'anybody', 'anyhow', 'anyone', 'anything', 'anyway', 'anyways', 'anywhere', 'apart', 'appear', 'appreciate', 'appropriate', 'are', 'arent', 'around', 'as', 'aside', 'ask', 'asking', 'associated', 'at', 'available', 'away', 'awfully', 'be', 'became', 'because', 'become', 'becomes', 'becoming', 'been', 'before', 'beforehand', 'behind', 'being', 'believe', 'below', 'beside', 'besides', 'best', 'better', 'between', 'beyond', 'both', 'brief', 'but', 'by', 'cmon', 'cs', 'came', 'can', 'cant', 'cannot', 'cant', 'cause', 'causes', 'certain', 'certainly', 'changes', 'clearly', 'co', 'com', 'come', 'comes', 'concerning', 'consequently', 'consider', 'considering', 'contain', 'containing', 'contains', 'corresponding', 'could', 'couldnt', 'course', 'currently', 'definitely', 'described', 'despite', 'did', 'didnt', 'different', 'do', 'does', 'doesnt', 'doing', 'dont', 'done', 'down', 'downwards', 'during', 'each', 'edu', 'eg', 'eight', 'either', 'else', 'elsewhere', 'enough', 'entirely', 'especially', 'et', 'etc', 'even', 'ever', 'every', 'everybody', 'everyone', 'everything', 'everywhere', 'ex', 'exactly', 'example', 'except', 'far', 'few', 'fifth', 'first', 'five', 'followed', 'following', 'follows', 'for', 'former', 'formerly', 'forth', 'four', 'from', 'further', 'furthermore', 'get', 'gets', 'getting', 'given', 'gives', 'go', 'goes', 'going', 'gone', 'got', 'gotten', 'greetings', 'had', 'hadnt', 'happens', 'hardly', 'has', 'hasnt', 'have', 'havent', 'having', 'he', 'hes', 'hello', 'help', 'hence', 'her', 'here', 'heres', 'hereafter', 'hereby', 'herein', 'hereupon', 'hers', 'herself', 'hi', 'him', 'himself', 'his', 'hither', 'hopefully', 'how', 'howbeit', 'however', 'id', 'ill', 'im', 'ive', 'ie', 'if', 'ignored', 'immediate', 'in', 'inasmuch', 'inc', 'indeed', 'indicate', 'indicated', 'indicates', 'inner', 'insofar', 'instead', 'into', 'inward', 'is', 'isnt', 'it', 'itd', 'itll', 'its', 'its', 'itself', 'just', 'keep', 'keeps', 'kept', 'know', 'known', 'knows', 'last', 'lately', 'later', 'latter', 'latterly', 'least', 'less', 'lest', 'let', 'lets', 'like', 'liked', 'likely', 'little', 'look', 'looking', 'looks', 'ltd', 'mainly', 'many', 'may', 'maybe', 'me', 'mean', 'meanwhile', 'merely', 'might', 'more', 'moreover', 'most', 'mostly', 'much', 'must', 'my', 'myself', 'name', 'namely', 'nd', 'near', 
'nearly', 'necessary', 'need', 'needs', 'neither', 'never', 'nevertheless', 'new', 'next', 'nine', 'no', 'nobody', 'non', 'none', 'noone', 'nor', 'normally', 'not', 'nothing', 'novel', 'now', 'nowhere', 'obviously', 'of', 'off', 'often', 'oh', 'ok', 'okay', 'old', 'on', 'once', 'one', 'ones', 'only', 'onto', 'or', 'other', 'others', 'otherwise', 'ought', 'our', 'ours', 'ourselves', 'out', 'outside', 'over', 'overall', 'own', 'particular', 'particularly', 'per', 'perhaps', 'placed', 'please', 'plus', 'possible', 'presumably', 'probably', 'provides', 'que', 'quite', 'qv', 'rather', 'rd', 're', 'really', 'reasonably', 'regarding', 'regardless', 'regards', 'relatively', 'respectively', 'right', 'said', 'same', 'saw', 'say', 'saying', 'says', 'second', 'secondly', 'see', 'seeing', 'seem', 'seemed', 'seeming', 'seems', 'seen', 'self', 'selves', 'sensible', 'sent', 'serious', 'seriously', 'seven', 'several', 'shall', 'she', 'should', 'shouldnt', 'since', 'six', 'so', 'some', 'somebody', 'somehow', 'someone', 'something', 'sometime', 'sometimes', 'somewhat', 'somewhere', 'soon', 'sorry', 'specified', 'specify', 'specifying', 'still', 'sub', 'such', 'sup', 'sure', 'ts', 'take', 'taken', 'tell', 'tends', 'th', 'than', 'thank', 'thanks', 'thanx', 'that', 'thats', 'thats', 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'thence', 'there', 'theres', 'thereafter', 'thereby', 'therefore', 'therein', 'theres', 'thereupon', 'these', 'they', 'theyd', 'theyll', 'theyre', 'theyve', 'think', 'third', 'this', 'thorough', 'thoroughly', 'those', 'though', 'three', 'through', 'throughout', 'thru', 'thus', 'to', 'together', 'too', 'took', 'toward', 'towards', 'tried', 'tries', 'truly', 'try', 'trying', 'twice', 'two', 'un', 'under', 'unfortunately', 'unless', 'unlikely', 'until', 'unto', 'up', 'upon', 'us', 'use', 'used', 'useful', 'uses', 'using', 'usually', 'value', 'various', 'very', 'via', 'viz', 'vs', 'want', 'wants', 'was', 'wasnt', 'way', 'we', 'wed', 'well', 'were', 'weve', 'welcome', 'well', 'went', 'were', 'werent', 'what', 'whats', 'whatever', 'when', 'whence', 'whenever', 'where', 'wheres', 'whereafter', 'whereas', 'whereby', 'wherein', 'whereupon', 'wherever', 'whether', 'which', 'while', 'whither', 'who', 'whos', 'whoever', 'whole', 'whom', 'whose', 'why', 'will', 'willing', 'wish', 'with', 'within', 'without', 'wont', 'wonder', 'would', 'wouldnt', 'yes', 'yet', 'you', 'youd', 'youll', 'youre', 'youve', 'your', 'yours', 'yourself', 'yourselves', 'zero'] 11 | 12 | text_urls = ['https://dl.dropboxusercontent.com/u/105876471/tth.txt', 'https://dl.dropboxusercontent.com/u/105876471/coc.txt'] 13 | text1_url = text_urls[0] 14 | text2_url = text_urls[1] 15 | 16 | # Load from web: 17 | def wgetAndTokenize(url): 18 | response = urllib2.urlopen(url) 19 | data = response.read() 20 | return re.split('\s+', data); 21 | 22 | text1_tokens = wgetAndTokenize(text1_url) 23 | text2_tokens = wgetAndTokenize(text2_url) 24 | 25 | # make RDD with list of words along with their position in the original text (so we can find context later) 26 | text1_tokensRDD = sc.parallelize(text1_tokens).zipWithIndex() 27 | text2_tokensRDD = sc.parallelize(text2_tokens).zipWithIndex() 28 | #print text1_tokensRDD.take(5) 29 | 30 | # get rid of sequences of non-word chars, keep remaining strings with something in them, and not in stop list: 31 | text1_tokensRDD = text1_tokensRDD.map(lambda p:(re.sub('\W+', '', p[0]).lower(), p[1])).filter(lambda p:len(p[0])>0 and not p[0] in stop_words) 32 | print text1_tokensRDD.take(5) 33 | 
text2_tokensRDD = text2_tokensRDD.map(lambda p:(re.sub('\W+', '', p[0]).lower(), p[1])).filter(lambda p:len(p[0])>0 and not p[0] in stop_words) 34 | 35 | # stem the words using imported stem function (chosen arbitrarily) 36 | text1_stemmedRDD = text1_tokensRDD.map(lambda p:(stem(p[0]), p[1])) 37 | print text1_stemmedRDD.take(5) 38 | text2_stemmedRDD = text2_tokensRDD.map(lambda p:(stem(p[0]), p[1])) 39 | 40 | t1raw = text1_stemmedRDD.toDF(['entry', 'locus']) 41 | t1raw.show() 42 | 43 | t2raw = text2_stemmedRDD.toDF(['entry', 'locus']) 44 | 45 | t1raw.registerTempTable("t1raw") 46 | t2raw.registerTempTable("t2raw") 47 | 48 | bg1 = sqlContext.sql("select a.entry a1, b.entry b1, a.locus, b.locus from t1raw a cross join t1raw b where a.entry < b.entry and a.locus - b.locus < 7 and b.locus - a.locus < 7") 49 | bg1.show(4) 50 | 51 | bg2 = sqlContext.sql("select a.entry a2, b.entry b2, a.locus, b.locus from t2raw a cross join t2raw b where a.entry < b.entry and a.locus - b.locus < 7 and b.locus - a.locus < 7") 52 | 53 | bg2.show(4) 54 | 55 | bg = bg1.join(bg2, ((bg1.a1 == bg2.a2) & (bg1.b1 == bg2.b2))) 56 | bg.show(100) 57 | 58 | -------------------------------------------------------------------------------- /gutenrye/output-sample.rtf: -------------------------------------------------------------------------------- 1 | {\rtf1\ansi\ansicpg1252\cocoartf1348\cocoasubrtf170 2 | {\fonttbl\f0\fmodern\fcharset0 CourierNewPSMT;} 3 | {\colortbl;\red255\green255\blue255;} 4 | \margl1440\margr1440\vieww17260\viewh9800\viewkind0 5 | \pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural 6 | 7 | \f0\fs20 \cf0 \CocoaLigature0 ('earth', 'thing')\ 8 | text 1 loci\ 9 | (57, 61): heaven and in the earth. I heard many things in hell. How,\ 10 | text 2 loci\ 11 | (5572, 5570): alone among the conscious things of earth, for shapes came\ 12 | (5799, 5795): been aeons when other Things ruled on the earth, and They had\ 13 | (11120, 11123): that was not of earth the titan Thing from the stars\ 14 | \ 15 | ('acut', 'thing')\ 16 | text 1 loci\ 17 | (46, 50): the sense of hearing acute. I heard all things in the heaven\ 18 | text 2 loci\ 19 | (2528, 2534): of the dreamers confessed acute fear of the gigantic nameless thing visible toward the\ 20 | \ 21 | ('dead', 'stone')\ 22 | text 1 loci\ 23 | (1435, 1433): corpse. Yes, he was stone, stone dead. I placed my\ 24 | (1435, 1434): Yes, he was stone, stone dead. I placed my\ 25 | (1456, 1455): no pulsation. He was stone dead. His eve would\ 26 | text 2 loci\ 27 | (7253, 7258): ritual which told of dead Cthulhu's dream-vigil in his stone vault at R'lyeh,\ 28 | \ 29 | ('examin', 'remov')\ 30 | text 1 loci\ 31 | (1427, 1423): man was dead. I removed the bed and examined the corpse. Yes,\ 32 | text 2 loci\ 33 | (5307, 5301): of course, was carefully removed and carried back by Legrasse. Examined at headquarters after\ 34 | \ 35 | ('felt', 'night')\ 36 | text 1 loci\ 37 | (544, 541): mine. Never before that night had I felt the extent of\ 38 | text 2 loci\ 39 | (1501, 1496): slight earthquake tremor the night before, the most considerable felt in New England\ 40 | \ 41 | ('men', 'nois')\ 42 | text 1 loci\ 43 | (2004, 2007): the observations of the men --but the noise steadily increased. Oh\ 44 | text 2 loci\ 45 | (4733, 4729): do justice to the noises heard by Legrasse's men as they ploughed\ 46 | \ 47 | ('felt', 'thing')\ 48 | text 1 loci\ 49 | (1826, 1821): they chatted of familiar things. 
But, ere long, I felt myself getting pale\ 50 | text 2 loci\ 51 | (10284, 10287): Now an unlettered seaman felt the same thing whilst gazing at\ 52 | \ 53 | ('dream', 'thought')\ 54 | text 1 loci\ 55 | (579, 585): he not even to dream of my secret deeds or thoughts. I fairly chuckled\ 56 | text 2 loci\ 57 | (10018, 10012): after cycles incalculable, the thoughts that spread fear to the dreams of the sensitive\ 58 | \ 59 | ('black', 'dark')\ 60 | text 1 loci\ 61 | (621, 627): His room was as black as pitch with the thick darkness, (for the shutters\ 62 | text 2 loci\ 63 | (10682, 10685): upset. The aperture was black with a darkness almost material. That\ 64 | \ 65 | ('dream', 'secret')\ 66 | text 1 loci\ 67 | (579, 582): he not even to dream of my secret deeds or thoughts.\ 68 | text 2 loci\ 69 | (5454, 5452): bodies had told their secrets in dreams to the first\ 70 | \ 71 | ('face', 'man')\ 72 | text 1 loci\ 73 | (1103, 1102): else of the old man's face or person: for\ 74 | text 2 loci\ 75 | (4893, 4897): swamp water on the face of the fainting man, and all stood\ 76 | \ 77 | ('death', 'night')\ 78 | text 1 loci\ 79 | (740, 734): as I have done, night after night, hearkening to the death watches in the\ 80 | (740, 736): have done, night after night, hearkening to the death watches in the\ 81 | text 2 loci\ 82 | (11178, 11182): laughing at intervals till death found him one night in the cabin\ 83 | \ 84 | ('found', 'night')\ 85 | text 1 loci\ 86 | (423, 417): seven long nights --every night just at midnight --but I found the eye always\ 87 | text 2 loci\ 88 | (11179, 11182): at intervals till death found him one night in the cabin\ 89 | \ 90 | ('heard', 'suspect')\ 91 | text 1 loci\ 92 | (2071, 2073): God! --no, no! They heard! --they suspected! --they knew! --they\ 93 | text 2 loci\ 94 | (6757, 6752): natural; though privately I suspected young Wilcox of having heard of the cult\ 95 | \ 96 | ('disturb', 'man')\ 97 | text 1 loci\ 98 | (329, 332): that I might not disturb the old man's sleep. It took\ 99 | text 2 loci\ 100 | (721, 725): responsible for this apparent disturbance of an old man's peace of mind.\ 101 | \ 102 | ('heard', 'length')\ 103 | text 1 loci\ 104 | (1409, 1414): it would not be heard through the wall. At length it ceased. The\ 105 | text 2 loci\ 106 | (10756, 10750): was intolerable, and at length the quick-eared Hawkins thought he heard a nasty, slopping\ 107 | \ 108 | ('end', 'made')\ 109 | text 1 loci\ 110 | (1591, 1589): ha! When I had made an end of these labors,\ 111 | text 2 loci\ 112 | (5246, 5242): fired, and escapes were made; but in the end Legrasse was able\ 113 | \ 114 | ('continu', 'feel')\ 115 | text 1 loci\ 116 | (1875, 1872): get rid of the feeling: but it continued and gained definiteness\ 117 | text 2 loci\ 118 | (2441, 2443): That is why I continued to feel that Wilcox, somehow\ 119 | \ 120 | ('beat', 'muffl')\ 121 | text 1 loci\ 122 | (1393, 1397): many minutes, the heart beat on with a muffled sound. This, however,\ 123 | text 2 loci\ 124 | (4483, 4482): of bobbing lanterns. The muffled beat of tom-toms was\ 125 | \ 126 | ('god', 'suspect')\ 127 | text 1 loci\ 128 | (2067, 2073): they heard not? Almighty God! --no, no! They heard! --they suspected! --they knew! 
--they\ 129 | text 2 loci\ 130 | (10044, 10042): this Johansen did not suspect, but God knows he soon\ 131 | \ 132 | ('bed', 'sudden')\ 133 | text 1 loci\ 134 | (602, 603): he moved on the bed suddenly, as if startled.\ 135 | text 2 loci\ 136 | (2058, 2052): trace of Wilcox's malady suddenly ceased. He sat upright in bed, astonished to find\ 137 | \ 138 | ('door', 'knock')\ 139 | text 1 loci\ 140 | (1616, 1612): hour, there came a knocking at the street door. I went down\ 141 | text 2 loci\ 142 | (9410, 9404): trip by taxicab, and knocked with palpitant heart at the door of a neat\ 143 | \ 144 | ('heard', 'sound')\ 145 | text 1 loci\ 146 | (1331, 1328): anxiety seized me --the sound would be heard by a neighbour!\ 147 | text 2 loci\ 148 | (10756, 10760): quick-eared Hawkins thought he heard a nasty, slopping sound down there. Everyone\ 149 | \ 150 | ('dream', 'man')\ 151 | text 1 loci\ 152 | (1708, 1711): my own in a dream. The old man, I mentioned, was\ 153 | text 2 loci\ 154 | (6693, 6690): of a sensitive young man who had dreamed not only the\ 155 | \ 156 | ('awak', 'lie')\ 157 | text 1 loci\ 158 | (853, 852): that he had been lying awake ever since the\ 159 | text 2 loci\ 160 | (6017, 6016): and They could only lie awake in the dark\ 161 | \ 162 | ('dead', 'man')\ 163 | text 1 loci\ 164 | (1421, 1419): it ceased. The old man was dead. I removed the\ 165 | text 2 loci\ 166 | (7967, 7968): Tow. One Survivor and Dead Man Found Aboard. Tale\ 167 | (8045, 8046): one living and one dead man aboard. The Vigilant\ 168 | (8102, 8097): half-delirious condition and one man who had evidently been dead for more than\ 169 | \ 170 | ('man', 'week')\ 171 | text 1 loci\ 172 | (243, 248): kinder to the old man than during the whole week before I killed\ 173 | text 2 loci\ 174 | (8110, 8107): for more than a week. The living man was clutching a\ 175 | \ 176 | ('excit', 'strang')\ 177 | text 1 loci\ 178 | (1293, 1288): that old house, so strange a noise as this excited me to uncontrollable\ 179 | text 2 loci\ 180 | (1253, 1257): and had from chidhood excited attention through the strange stories and odd\ 181 | (7023, 7029): for my uncle had excited his curiosity in probing his strange dreams, yet had\ 182 | \ 183 | ('long', 'thing')\ 184 | text 1 loci\ 185 | (1824, 1821): they chatted of familiar things. But, ere long, I felt myself\ 186 | text 2 loci\ 187 | (3436, 3441): and fore feet, and long, narrow wings behind. 
This thing, which seemed instinct\ 188 | (9234, 9240): and I studied it long and well, finding it a thing of balefully exquisite\ 189 | \ 190 | ('man', 'suspect')\ 191 | text 1 loci\ 192 | (496, 499): a very profound old man, indeed, to suspect that every night,\ 193 | text 2 loci\ 194 | (3718, 3723): problem, there was one man in that gathering who suspected a touch of} -------------------------------------------------------------------------------- /gutenrye/scala-proj/src/main/scala/Rye.scala: -------------------------------------------------------------------------------- 1 | import org.apache.spark.SparkContext 2 | import org.apache.spark.SparkContext._ 3 | import org.apache.spark.SparkConf 4 | 5 | object Rye { 6 | def main(args: Array[String]) { 7 | val conf = new SparkConf().setAppName("Rye") 8 | val sc = new SparkContext(conf) 9 | // Databricks notebook source exported at Thu, 6 Aug 2015 20:48:55 UTC 10 | val text_urls = Array("https://dl.dropboxusercontent.com/u/105876471/tth.txt", "https://dl.dropboxusercontent.com/u/105876471/coc.txt") 11 | //val text_urls = Array("https://www.gutenberg.org/cache/epub/77/pg77.txt", "https://www.gutenberg.org/ebooks/2701.txt.utf-8") 12 | val t1_url = text_urls(0) 13 | val t2_url = text_urls(1) 14 | 15 | import scala.io.Source 16 | 17 | val t1_tokens = Source.fromURL(t1_url).mkString.split("\\s+") 18 | val t2_tokens = Source.fromURL(t2_url).mkString.split("\\s+") 19 | 20 | // COMMAND ---------- 21 | 22 | val t1_tokensRDD = sc.parallelize(t1_tokens).zipWithIndex() 23 | val t2_tokensRDD = sc.parallelize(t2_tokens).zipWithIndex() 24 | println(t1_tokensRDD.take(5)) 25 | println(t2_tokensRDD.take(5).mkString) 26 | 27 | 28 | // COMMAND ---------- 29 | 30 | val stop_words = Array("a", "i", "an", "as", "able", "about", "above", "according", "accordingly", "across", "actually", "after", "afterwards", "again", "against", "aint", "all", "allow", "allows", "almost", "alone", "along", "already", "also", "although", "always", "am", "among", "amongst", "an", "and", "another", "any", "anybody", "anyhow", "anyone", "anything", "anyway", "anyways", "anywhere", "apart", "appear", "appreciate", "appropriate", "are", "arent", "around", "as", "aside", "ask", "asking", "associated", "at", "available", "away", "awfully", "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", "behind", "being", "believe", "below", "beside", "besides", "best", "better", "between", "beyond", "both", "brief", "but", "by", "cmon", "cs", "came", "can", "cant", "cannot", "cant", "cause", "causes", "certain", "certainly", "changes", "clearly", "co", "com", "come", "comes", "concerning", "consequently", "consider", "considering", "contain", "containing", "contains", "corresponding", "could", "couldnt", "course", "currently", "definitely", "described", "despite", "did", "didnt", "different", "do", "does", "doesnt", "doing", "dont", "done", "down", "downwards", "during", "each", "edu", "eg", "eight", "either", "else", "elsewhere", "enough", "entirely", "especially", "et", "etc", "even", "ever", "every", "everybody", "everyone", "everything", "everywhere", "ex", "exactly", "example", "except", "far", "few", "fifth", "first", "five", "followed", "following", "follows", "for", "former", "formerly", "forth", "four", "from", "further", "furthermore", "get", "gets", "getting", "given", "gives", "go", "goes", "going", "gone", "got", "gotten", "greetings", "had", "hadnt", "happens", "hardly", "has", "hasnt", "have", "havent", "having", "he", "hes", "hello", "help", 
"hence", "her", "here", "heres", "hereafter", "hereby", "herein", "hereupon", "hers", "herself", "hi", "him", "himself", "his", "hither", "hopefully", "how", "howbeit", "however", "id", "ill", "im", "ive", "ie", "if", "ignored", "immediate", "in", "inasmuch", "inc", "indeed", "indicate", "indicated", "indicates", "inner", "insofar", "instead", "into", "inward", "is", "isnt", "it", "itd", "itll", "its", "its", "itself", "just", "keep", "keeps", "kept", "know", "known", "knows", "last", "lately", "later", "latter", "latterly", "least", "less", "lest", "let", "lets", "like", "liked", "likely", "little", "look", "looking", "looks", "ltd", "mainly", "many", "may", "maybe", "me", "mean", "meanwhile", "merely", "might", "more", "moreover", "most", "mostly", "much", "must", "my", "myself", "name", "namely", "nd", "near", "nearly", "necessary", "need", "needs", "neither", "never", "nevertheless", "new", "next", "nine", "no", "nobody", "non", "none", "noone", "nor", "normally", "not", "nothing", "novel", "now", "nowhere", "obviously", "of", "off", "often", "oh", "ok", "okay", "old", "on", "once", "one", "ones", "only", "onto", "or", "other", "others", "otherwise", "ought", "our", "ours", "ourselves", "out", "outside", "over", "overall", "own", "particular", "particularly", "per", "perhaps", "placed", "please", "plus", "possible", "presumably", "probably", "provides", "que", "quite", "qv", "rather", "rd", "re", "really", "reasonably", "regarding", "regardless", "regards", "relatively", "respectively", "right", "said", "same", "saw", "say", "saying", "says", "second", "secondly", "see", "seeing", "seem", "seemed", "seeming", "seems", "seen", "self", "selves", "sensible", "sent", "serious", "seriously", "seven", "several", "shall", "she", "should", "shouldnt", "since", "six", "so", "some", "somebody", "somehow", "someone", "something", "sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry", "specified", "specify", "specifying", "still", "sub", "such", "sup", "sure", "ts", "take", "taken", "tell", "tends", "th", "than", "thank", "thanks", "thanx", "that", "thats", "thats", "the", "their", "theirs", "them", "themselves", "then", "thence", "there", "theres", "thereafter", "thereby", "therefore", "therein", "theres", "thereupon", "these", "they", "theyd", "theyll", "theyre", "theyve", "think", "third", "this", "thorough", "thoroughly", "those", "though", "three", "through", "throughout", "thru", "thus", "to", "together", "too", "took", "toward", "towards", "tried", "tries", "truly", "try", "trying", "twice", "two", "un", "under", "unfortunately", "unless", "unlikely", "until", "unto", "up", "upon", "us", "use", "used", "useful", "uses", "using", "usually", "value", "various", "very", "via", "viz", "vs", "want", "wants", "was", "wasnt", "way", "we", "wed", "well", "were", "weve", "welcome", "well", "went", "were", "werent", "what", "whats", "whatever", "when", "whence", "whenever", "where", "wheres", "whereafter", "whereas", "whereby", "wherein", "whereupon", "wherever", "whether", "which", "while", "whither", "who", "whos", "whoever", "whole", "whom", "whose", "why", "will", "willing", "wish", "with", "within", "without", "wont", "wonder", "would", "wouldnt", "yes", "yet", "you", "youd", "youll", "youre", "youve", "your", "yours", "yourself", "yourselves", "zero") 31 | 32 | // COMMAND ---------- 33 | 34 | // remove normalize to lowercase, remove stopwords 35 | val t1_cleaned_tokensRDD = t1_tokensRDD 36 | .map(p=>(p._1.replaceAll("\\W+", "").toLowerCase, p._2)) 37 | .filter(p => { p._1.length>1 
&& !(stop_words contains p._1) })
38 | 
39 |     println(t1_cleaned_tokensRDD.take(15).mkString)
40 |     val t2_cleaned_tokensRDD = t2_tokensRDD
41 |       .map(p=>(p._1.replaceAll("\\W+", "").toLowerCase, p._2))
42 |       .filter(p => { p._1.length>1 && !(stop_words contains p._1) })
43 | 
44 |     println(t2_cleaned_tokensRDD.take(15).mkString)
45 |     //t2_cleaned_tokensRDD.collect()
46 | 
47 |     val t1_stemmedRDD = t1_cleaned_tokensRDD.map(p=>(Stemmer.stem(p._1), p._2))
48 |     println(t1_stemmedRDD.take(5).mkString)
49 |     val t2_stemmedRDD = t2_cleaned_tokensRDD.map(p=>(Stemmer.stem(p._1), p._2))
50 | 
51 |     // COMMAND ----------
52 | 
53 |     val t1_concRDD = t1_stemmedRDD.groupByKey()
54 |     t1_concRDD.take(5).foreach(ex=> {
55 |       println("key " + ex._1 + " -- loci " + ex._2.mkString(","))
56 |     })
57 |     val t2_concRDD = t2_stemmedRDD.groupByKey()
58 | 
59 | 
60 |     // COMMAND ----------
61 | 
62 |     val t1_bigram = t1_concRDD.cartesian(t1_concRDD).filter(p=>(p._1._1 < p._2._1))
63 |     val t2_bigram = t2_concRDD.cartesian(t2_concRDD).filter(p=>(p._1._1 < p._2._1))
64 | 
65 |     // COMMAND ----------
66 | 
67 |     // toss all pairs which never occur within "distance" of each other
68 |     // (mirrors findBigramsWithin in gut2.py):
69 |     val distance = 7
70 |     def findBigramsWithin(pair: ((String, Iterable[Long]), (String, Iterable[Long]))) = {
71 |       val (p, q) = pair
72 |       ((p._1, q._1),
73 |         for {
74 |           loc1 <- p._2
75 |           loc2 <- q._2
76 |           if (Math.abs(loc1 - loc2) < distance)
77 |         } yield (loc1, loc2))
78 |     }
79 | 
80 |     // COMMAND ----------
81 |     val t1_bigram_loci = t1_bigram.map(findBigramsWithin).filter(p=>p._2.size>0)
82 |     println(t1_bigram_loci.take(10).mkString(","))
83 | 
84 |     val t2_bigram_loci = t2_bigram.map(findBigramsWithin).filter(p=>p._2.size>0)
85 | 
86 |     // COMMAND ----------
87 | 
88 |     val joinedRDD = t1_bigram_loci.join(t2_bigram_loci)
89 |     for (entry <- joinedRDD.collect) {
90 |       println(entry._1 + "\n\t" + entry._2._1 + "\n\t" + entry._2._2 + "\n\n")
91 |     }
92 | 
93 | 
94 |   }
95 | }

--------------------------------------------------------------------------------
/gutenrye/gut2.py:
--------------------------------------------------------------------------------
1 | from pyspark import SparkContext
2 | from porter2 import stem
3 | import urllib2
4 | import re
5 | 
6 | sc = SparkContext()
7 | 
8 | #text_urls = ['https://www.gutenberg.org/cache/epub/77/pg77.txt', 'http://www.gutenberg.org/cache/epub/2701/pg2701.txt']
9 | text_urls = ['https://dl.dropboxusercontent.com/u/105876471/tth.txt', 'https://dl.dropboxusercontent.com/u/105876471/coc.txt']
10 | text1_url = text_urls[0]
11 | text2_url = text_urls[1]
12 | 
13 | # Load from web:
14 | def wgetAndTokenize(url):
15 |     response = urllib2.urlopen(url)
16 |     data = response.read()
17 |     return re.split('\s+', data)
18 | 
19 | text1_tokens = wgetAndTokenize(text1_url)
20 | text2_tokens = wgetAndTokenize(text2_url)
21 | 
22 | # make RDD with list of words along with their position in the original text (so we can find context later)
23 | text1_tokensRDD = sc.parallelize(text1_tokens).zipWithIndex()
24 | text2_tokensRDD = sc.parallelize(text2_tokens).zipWithIndex()
25 | #print text1_tokensRDD.take(5)
26 | 
27 | # define a list of stop words (chosen fairly arbitrarily)
28 | stop_words = ['a', 'i', 'an', 'as', 'able', 'about', 'above', 'according', 'accordingly', 'across', 'actually', 'after', 'afterwards', 'again', 'against', 'aint', 'all', 'allow', 'allows', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'am', 'among', 'amongst', 'an', 'and', 'another', 'any', 'anybody', 'anyhow', 'anyone', 'anything', 'anyway', 'anyways', 'anywhere', 'apart', 'appear', 'appreciate', 'appropriate', 'are', 'arent', 'around', 'as', 'aside', 'ask', 'asking', 'associated', 'at', 'available', 'away', 'awfully', 'be', 'became', 'because', 'become', 'becomes', 'becoming', 'been', 'before', 'beforehand', 'behind', 'being', 'believe', 'below', 'beside', 'besides', 'best', 'better', 'between', 'beyond', 'both', 'brief', 'but', 'by', 'cmon', 'cs', 'came', 'can', 'cant', 'cannot', 'cant', 'cause', 'causes', 'certain', 'certainly', 'changes', 'clearly', 'co', 'com', 'come', 'comes', 'concerning', 'consequently', 'consider',
'considering', 'contain', 'containing', 'contains', 'corresponding', 'could', 'couldnt', 'course', 'currently', 'definitely', 'described', 'despite', 'did', 'didnt', 'different', 'do', 'does', 'doesnt', 'doing', 'dont', 'done', 'down', 'downwards', 'during', 'each', 'edu', 'eg', 'eight', 'either', 'else', 'elsewhere', 'enough', 'entirely', 'especially', 'et', 'etc', 'even', 'ever', 'every', 'everybody', 'everyone', 'everything', 'everywhere', 'ex', 'exactly', 'example', 'except', 'far', 'few', 'fifth', 'first', 'five', 'followed', 'following', 'follows', 'for', 'former', 'formerly', 'forth', 'four', 'from', 'further', 'furthermore', 'get', 'gets', 'getting', 'given', 'gives', 'go', 'goes', 'going', 'gone', 'got', 'gotten', 'greetings', 'had', 'hadnt', 'happens', 'hardly', 'has', 'hasnt', 'have', 'havent', 'having', 'he', 'hes', 'hello', 'help', 'hence', 'her', 'here', 'heres', 'hereafter', 'hereby', 'herein', 'hereupon', 'hers', 'herself', 'hi', 'him', 'himself', 'his', 'hither', 'hopefully', 'how', 'howbeit', 'however', 'id', 'ill', 'im', 'ive', 'ie', 'if', 'ignored', 'immediate', 'in', 'inasmuch', 'inc', 'indeed', 'indicate', 'indicated', 'indicates', 'inner', 'insofar', 'instead', 'into', 'inward', 'is', 'isnt', 'it', 'itd', 'itll', 'its', 'its', 'itself', 'just', 'keep', 'keeps', 'kept', 'know', 'known', 'knows', 'last', 'lately', 'later', 'latter', 'latterly', 'least', 'less', 'lest', 'let', 'lets', 'like', 'liked', 'likely', 'little', 'look', 'looking', 'looks', 'ltd', 'mainly', 'many', 'may', 'maybe', 'me', 'mean', 'meanwhile', 'merely', 'might', 'more', 'moreover', 'most', 'mostly', 'much', 'must', 'my', 'myself', 'name', 'namely', 'nd', 'near', 'nearly', 'necessary', 'need', 'needs', 'neither', 'never', 'nevertheless', 'new', 'next', 'nine', 'no', 'nobody', 'non', 'none', 'noone', 'nor', 'normally', 'not', 'nothing', 'novel', 'now', 'nowhere', 'obviously', 'of', 'off', 'often', 'oh', 'ok', 'okay', 'old', 'on', 'once', 'one', 'ones', 'only', 'onto', 'or', 'other', 'others', 'otherwise', 'ought', 'our', 'ours', 'ourselves', 'out', 'outside', 'over', 'overall', 'own', 'particular', 'particularly', 'per', 'perhaps', 'placed', 'please', 'plus', 'possible', 'presumably', 'probably', 'provides', 'que', 'quite', 'qv', 'rather', 'rd', 're', 'really', 'reasonably', 'regarding', 'regardless', 'regards', 'relatively', 'respectively', 'right', 'said', 'same', 'saw', 'say', 'saying', 'says', 'second', 'secondly', 'see', 'seeing', 'seem', 'seemed', 'seeming', 'seems', 'seen', 'self', 'selves', 'sensible', 'sent', 'serious', 'seriously', 'seven', 'several', 'shall', 'she', 'should', 'shouldnt', 'since', 'six', 'so', 'some', 'somebody', 'somehow', 'someone', 'something', 'sometime', 'sometimes', 'somewhat', 'somewhere', 'soon', 'sorry', 'specified', 'specify', 'specifying', 'still', 'sub', 'such', 'sup', 'sure', 'ts', 'take', 'taken', 'tell', 'tends', 'th', 'than', 'thank', 'thanks', 'thanx', 'that', 'thats', 'thats', 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'thence', 'there', 'theres', 'thereafter', 'thereby', 'therefore', 'therein', 'theres', 'thereupon', 'these', 'they', 'theyd', 'theyll', 'theyre', 'theyve', 'think', 'third', 'this', 'thorough', 'thoroughly', 'those', 'though', 'three', 'through', 'throughout', 'thru', 'thus', 'to', 'together', 'too', 'took', 'toward', 'towards', 'tried', 'tries', 'truly', 'try', 'trying', 'twice', 'two', 'un', 'under', 'unfortunately', 'unless', 'unlikely', 'until', 'unto', 'up', 'upon', 'us', 'use', 'used', 'useful', 'uses', 'using', 
'usually', 'value', 'various', 'very', 'via', 'viz', 'vs', 'want', 'wants', 'was', 'wasnt', 'way', 'we', 'wed', 'well', 'were', 'weve', 'welcome', 'well', 'went', 'were', 'werent', 'what', 'whats', 'whatever', 'when', 'whence', 'whenever', 'where', 'wheres', 'whereafter', 'whereas', 'whereby', 'wherein', 'whereupon', 'wherever', 'whether', 'which', 'while', 'whither', 'who', 'whos', 'whoever', 'whole', 'whom', 'whose', 'why', 'will', 'willing', 'wish', 'with', 'within', 'without', 'wont', 'wonder', 'would', 'wouldnt', 'yes', 'yet', 'you', 'youd', 'youll', 'youre', 'youve', 'your', 'yours', 'yourself', 'yourselves', 'zero']
29 | 
30 | # get rid of sequences of non-word chars, keep remaining strings with something in them, and not in stop list:
31 | text1_tokensRDD = text1_tokensRDD.map(lambda p:(re.sub('\W+', '', p[0]).lower(), p[1])).filter(lambda p:len(p[0])>0 and not p[0] in stop_words)
32 | #print text1_tokensRDD.take(5)
33 | text2_tokensRDD = text2_tokensRDD.map(lambda p:(re.sub('\W+', '', p[0]).lower(), p[1])).filter(lambda p:len(p[0])>0 and not p[0] in stop_words)
34 | 
35 | # stem the words using imported stem function (chosen arbitrarily)
36 | text1_stemmedRDD = text1_tokensRDD.map(lambda p:(stem(p[0]), p[1]))
37 | #print text1_stemmedRDD.take(5)
38 | text2_stemmedRDD = text2_tokensRDD.map(lambda p:(stem(p[0]), p[1]))
39 | 
40 | # for each word, get the list of loci:
41 | text1_concRDD = text1_stemmedRDD.groupByKey()
42 | #print text1_concRDD.take(5)
43 | text2_concRDD = text2_stemmedRDD.groupByKey()
44 | 
45 | # find every pair of words (brute force)
46 | text1_bigrams = text1_concRDD.cartesian(text1_concRDD)
47 | #print text1_bigrams.first()
48 | text2_bigrams = text2_concRDD.cartesian(text2_concRDD)
49 | 
50 | # eliminate transposed pairs, and dupes -- keep ("a","b"); not ("b", "a") or ("a", "a") etc
51 | text1_bigrams = text1_bigrams.filter(lambda p:p[0][0] < p[1][0])
52 | #print text1_bigrams.first()
53 | text2_bigrams = text2_bigrams.filter(lambda p:p[0][0] < p[1][0])
54 | 
55 | # toss all pairs which never occur within "distance" of each other:
56 | distance = 7
57 | def findBigramsWithin(pair):
58 |     p,q = pair
59 |     return ((p[0],q[0]), [(loc1, loc2) for loc1 in p[1] for loc2 in q[1] if abs(loc1-loc2) < distance])
60 | 
61 | text1_bigram_loci = text1_bigrams.map(findBigramsWithin).filter(lambda p:len(p[1])>0)
62 | #print text1_bigram_loci.take(10)
63 | text2_bigram_loci = text2_bigrams.map(findBigramsWithin).filter(lambda p:len(p[1])>0)
64 | 
65 | # "match" bigram+loci from text1 with same bigram (and other loci) from text2 (keeping only those that occur in both)
66 | joined = text1_bigram_loci.join(text2_bigram_loci)
67 | 
68 | # make it run and print a report
69 | for bigram in joined.collect():
70 |     print "\n"+str(bigram[0])
71 |     print "\ttext 1 loci"
72 |     for locus in bigram[1][0]:
73 |         lo,hi = min(locus[0],locus[1]),max(locus[0],locus[1])
74 |         print "\t\t" + str(locus) + ": " + " ".join(text1_tokens[lo-4:hi+4])
75 |     print "\ttext 2 loci"
76 |     for locus in bigram[1][1]:
77 |         lo,hi = min(locus[0],locus[1]),max(locus[0],locus[1])
78 |         print "\t\t" + str(locus) + ": " + " ".join(text2_tokens[lo-4:hi+4])
79 | 

--------------------------------------------------------------------------------
/gutenrye/scala-proj/src/main/scala/DFRye.scala:
--------------------------------------------------------------------------------
1 | import org.apache.spark.SparkContext
2 | import org.apache.spark.SparkContext._
3 | import org.apache.spark.sql._
4 | import org.apache.spark.SparkConf
5 | 
6 | object DFRye {
7 |   def main(args: Array[String]) {
8 |     val conf = new SparkConf().setAppName("Rye")
9 |     val sc = 
new SparkContext(conf) 10 | 11 | val sqlContext = new org.apache.spark.sql.SQLContext(sc) 12 | import sqlContext.implicits._ 13 | import org.apache.spark.sql.functions._ 14 | 15 | // Databricks notebook source exported at Fri, 7 Aug 2015 05:52:33 UTC 16 | val text_urls = Array("https://dl.dropboxusercontent.com/u/105876471/tth.txt", "https://dl.dropboxusercontent.com/u/105876471/coc.txt") 17 | // val text_urls = Array("https://www.gutenberg.org/cache/epub/77/pg77.txt", "https://www.gutenberg.org/ebooks/2701.txt.utf-8") 18 | val t1_url = text_urls(0) 19 | val t2_url = text_urls(1) 20 | 21 | import scala.io.Source 22 | val t1_tokens = Source.fromURL(t1_url).mkString.split("\\s+") 23 | val t2_tokens = Source.fromURL(t2_url).mkString.split("\\s+") 24 | 25 | // COMMAND ---------- 26 | 27 | val t1_tokensRDD = sc.parallelize(t1_tokens).zipWithIndex() 28 | val t2_tokensRDD = sc.parallelize(t2_tokens).zipWithIndex() 29 | println(t1_tokensRDD.take(5)) 30 | println(t2_tokensRDD.take(5).mkString) 31 | 32 | // COMMAND ---------- 33 | 34 | val stop_words = Array("a", "i", "an", "as", "able", "about", "above", "according", "accordingly", "across", "actually", "after", "afterwards", "again", "against", "aint", "all", "allow", "allows", "almost", "alone", "along", "already", "also", "although", "always", "am", "among", "amongst", "an", "and", "another", "any", "anybody", "anyhow", "anyone", "anything", "anyway", "anyways", "anywhere", "apart", "appear", "appreciate", "appropriate", "are", "arent", "around", "as", "aside", "ask", "asking", "associated", "at", "available", "away", "awfully", "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", "behind", "being", "believe", "below", "beside", "besides", "best", "better", "between", "beyond", "both", "brief", "but", "by", "cmon", "cs", "came", "can", "cant", "cannot", "cant", "cause", "causes", "certain", "certainly", "changes", "clearly", "co", "com", "come", "comes", "concerning", "consequently", "consider", "considering", "contain", "containing", "contains", "corresponding", "could", "couldnt", "course", "currently", "definitely", "described", "despite", "did", "didnt", "different", "do", "does", "doesnt", "doing", "dont", "done", "down", "downwards", "during", "each", "edu", "eg", "eight", "either", "else", "elsewhere", "enough", "entirely", "especially", "et", "etc", "even", "ever", "every", "everybody", "everyone", "everything", "everywhere", "ex", "exactly", "example", "except", "far", "few", "fifth", "first", "five", "followed", "following", "follows", "for", "former", "formerly", "forth", "four", "from", "further", "furthermore", "get", "gets", "getting", "given", "gives", "go", "goes", "going", "gone", "got", "gotten", "greetings", "had", "hadnt", "happens", "hardly", "has", "hasnt", "have", "havent", "having", "he", "hes", "hello", "help", "hence", "her", "here", "heres", "hereafter", "hereby", "herein", "hereupon", "hers", "herself", "hi", "him", "himself", "his", "hither", "hopefully", "how", "howbeit", "however", "id", "ill", "im", "ive", "ie", "if", "ignored", "immediate", "in", "inasmuch", "inc", "indeed", "indicate", "indicated", "indicates", "inner", "insofar", "instead", "into", "inward", "is", "isnt", "it", "itd", "itll", "its", "its", "itself", "just", "keep", "keeps", "kept", "know", "known", "knows", "last", "lately", "later", "latter", "latterly", "least", "less", "lest", "let", "lets", "like", "liked", "likely", "little", "look", "looking", "looks", "ltd", "mainly", "many", "may", "maybe", "me", 
"mean", "meanwhile", "merely", "might", "more", "moreover", "most", "mostly", "much", "must", "my", "myself", "name", "namely", "nd", "near", "nearly", "necessary", "need", "needs", "neither", "never", "nevertheless", "new", "next", "nine", "no", "nobody", "non", "none", "noone", "nor", "normally", "not", "nothing", "novel", "now", "nowhere", "obviously", "of", "off", "often", "oh", "ok", "okay", "old", "on", "once", "one", "ones", "only", "onto", "or", "other", "others", "otherwise", "ought", "our", "ours", "ourselves", "out", "outside", "over", "overall", "own", "particular", "particularly", "per", "perhaps", "placed", "please", "plus", "possible", "presumably", "probably", "provides", "que", "quite", "qv", "rather", "rd", "re", "really", "reasonably", "regarding", "regardless", "regards", "relatively", "respectively", "right", "said", "same", "saw", "say", "saying", "says", "second", "secondly", "see", "seeing", "seem", "seemed", "seeming", "seems", "seen", "self", "selves", "sensible", "sent", "serious", "seriously", "seven", "several", "shall", "she", "should", "shouldnt", "since", "six", "so", "some", "somebody", "somehow", "someone", "something", "sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry", "specified", "specify", "specifying", "still", "sub", "such", "sup", "sure", "ts", "take", "taken", "tell", "tends", "th", "than", "thank", "thanks", "thanx", "that", "thats", "thats", "the", "their", "theirs", "them", "themselves", "then", "thence", "there", "theres", "thereafter", "thereby", "therefore", "therein", "theres", "thereupon", "these", "they", "theyd", "theyll", "theyre", "theyve", "think", "third", "this", "thorough", "thoroughly", "those", "though", "three", "through", "throughout", "thru", "thus", "to", "together", "too", "took", "toward", "towards", "tried", "tries", "truly", "try", "trying", "twice", "two", "un", "under", "unfortunately", "unless", "unlikely", "until", "unto", "up", "upon", "us", "use", "used", "useful", "uses", "using", "usually", "value", "various", "very", "via", "viz", "vs", "want", "wants", "was", "wasnt", "way", "we", "wed", "well", "were", "weve", "welcome", "well", "went", "were", "werent", "what", "whats", "whatever", "when", "whence", "whenever", "where", "wheres", "whereafter", "whereas", "whereby", "wherein", "whereupon", "wherever", "whether", "which", "while", "whither", "who", "whos", "whoever", "whole", "whom", "whose", "why", "will", "willing", "wish", "with", "within", "without", "wont", "wonder", "would", "wouldnt", "yes", "yet", "you", "youd", "youll", "youre", "youve", "your", "yours", "yourself", "yourselves", "zero") 35 | 36 | // COMMAND ---------- 37 | 38 | // remove normalize to lowercase, remove stopwords 39 | val t1_cleaned_tokensRDD = t1_tokensRDD 40 | .map(p=>(p._1.replaceAll("\\W+", "").toLowerCase, p._2)) 41 | .filter(p => { p._1.length>1 && !(stop_words contains p._1) }) 42 | 43 | println(t1_cleaned_tokensRDD.take(15).mkString) 44 | val t2_cleaned_tokensRDD = t2_tokensRDD 45 | .map(p=>(p._1.replaceAll("\\W+", "").toLowerCase, p._2)) 46 | .filter(p => { p._1.length>1 && !(stop_words contains p._1) }) 47 | 48 | println(t2_cleaned_tokensRDD.take(15).mkString) 49 | 50 | // COMMAND ---------- 51 | 52 | val t1_stemmedRDD = t1_cleaned_tokensRDD.map(p=>(Stemmer.stem(p._1), p._2)) 53 | println(t1_stemmedRDD.take(5).mkString) 54 | val t2_stemmedRDD = t2_cleaned_tokensRDD.map(p=>(Stemmer.stem(p._1), p._2)) 55 | 56 | 57 | // COMMAND ---------- 58 | 59 | val t1_concRDD = t1_stemmedRDD.groupByKey() 60 | 
t1_concRDD.take(5).foreach(ex=> {
61 |       println("key " + ex._1 + " -- loci " + ex._2.mkString(","))
62 |     })
63 |     val t2_concRDD = t2_stemmedRDD.groupByKey()
64 | 
65 | 
66 |     // COMMAND ----------
67 |     val t1d1 = t1_concRDD.map(p => (p._1, p._2.toList)).toDF("entry", "loci")
68 |     val t1d2 = t1_concRDD.map(p => (p._1, p._2.toList)).toDF("entry2", "loci2")
69 | 
70 |     var t1j = t1d1.join(t1d2, t1d1("entry") < t1d2("entry2"))
71 | 
72 |     // UDF collecting the pairs of loci that fall within distance 7 of each other:
73 |     val bigrams = udf((l1: Seq[Long], l2: Seq[Long]) => {
74 |       for {
75 |         loc1 <- l1
76 |         loc2 <- l2
77 |         if (Math.abs(loc1-loc2) < 7)
78 |       } yield (loc1, loc2)
79 |     })
80 | 
81 |     // COMMAND ----------
82 | 
83 |     // UDF returning the length of an array column ("bg" holds an array of
84 |     // structs, which arrives in the UDF as a Seq of Rows):
85 |     val getLen = udf((e: Seq[Row]) => e.length)
86 | 
87 |     t1j = t1j.withColumn("bg", bigrams(t1j("loci"), t1j("loci2")))
88 | 
89 |     t1j = t1j.filter(getLen(t1j("bg"))>0)
90 |     t1j = t1j.select("entry", "entry2", "bg")
91 |     t1j.show()
92 | 
93 |     val t2d1 = t2_concRDD.map(p => (p._1, p._2.toList)).toDF("entry", "loci")
94 |     val t2d2 = t2_concRDD.map(p => (p._1, p._2.toList)).toDF("entry2", "loci2")
95 |     var t2j = t2d1.join(t2d2, t2d1("entry") < t2d2("entry2"))
96 |     t2j = t2j.withColumn("bg", bigrams(t2j("loci"), t2j("loci2")))
97 |     t2j = t2j.filter(getLen(t2j("bg"))>0)
98 |     t2j = t2j.select("entry", "entry2", "bg")
99 | 
100 |     // COMMAND ----------
101 | 
102 |     val t2rename = t2j.withColumnRenamed("entry","e").withColumnRenamed("entry2","e2").withColumnRenamed("bg", "bg2")
103 |     val joinedDF = t1j.join(t2rename, t1j("entry") === t2rename("e") && t1j("entry2") === t2rename("e2")).select("entry", "entry2", "bg", "bg2")
104 |     joinedDF.show()
105 | 
106 |     // COMMAND ----------
107 | 
108 |     joinedDF.explain
109 |   }
110 | }
111 | // COMMAND ----------

--------------------------------------------------------------------------------
/gutenrye/porter2.py:
--------------------------------------------------------------------------------
1 | # borrowed from https://pypi.python.org/pypi/stemming/1.0 (public domain)
2 | 
3 | """An implementation of the Porter2 stemming algorithm.
4 | See http://snowball.tartarus.org/algorithms/english/stemmer.html
5 | 
6 | Adapted from pyporter2 by Michael Dirolf.
7 | 
8 | This algorithm is more correct but (at least in this implementation)
9 | several times slower than the original porter algorithm as implemented
10 | in stemming.porter.
11 | """ 12 | 13 | import re 14 | 15 | r_exp = re.compile(r"[^aeiouy]*[aeiouy]+[^aeiouy](\w*)") 16 | ewss_exp1 = re.compile(r"^[aeiouy][^aeiouy]$") 17 | ewss_exp2 = re.compile(r".*[^aeiouy][aeiouy][^aeiouywxY]$") 18 | ccy_exp = re.compile(r"([aeiouy])y") 19 | s1a_exp = re.compile(r"[aeiouy].") 20 | s1b_exp = re.compile(r"[aeiouy]") 21 | 22 | def get_r1(word): 23 | # exceptional forms 24 | if word.startswith('gener') or word.startswith('arsen'): 25 | return 5 26 | if word.startswith('commun'): 27 | return 6 28 | 29 | # normal form 30 | match = r_exp.match(word) 31 | if match: 32 | return match.start(1) 33 | return len(word) 34 | 35 | def get_r2(word): 36 | match = r_exp.match(word, get_r1(word)) 37 | if match: 38 | return match.start(1) 39 | return len(word) 40 | 41 | def ends_with_short_syllable(word): 42 | if len(word) == 2: 43 | if ewss_exp1.match(word): 44 | return True 45 | if ewss_exp2.match(word): 46 | return True 47 | return False 48 | 49 | def is_short_word(word): 50 | if ends_with_short_syllable(word): 51 | if get_r1(word) == len(word): 52 | return True 53 | return False 54 | 55 | def remove_initial_apostrophe(word): 56 | if word.startswith("'"): 57 | return word[1:] 58 | return word 59 | 60 | def capitalize_consonant_ys(word): 61 | if word.startswith('y'): 62 | word = 'Y' + word[1:] 63 | return ccy_exp.sub('\g<1>Y', word) 64 | 65 | def step_0(word): 66 | if word.endswith("'s'"): 67 | return word[:-3] 68 | if word.endswith("'s"): 69 | return word[:-2] 70 | if word.endswith("'"): 71 | return word[:-1] 72 | return word 73 | 74 | def step_1a(word): 75 | if word.endswith('sses'): 76 | return word[:-4] + 'ss' 77 | if word.endswith('ied') or word.endswith('ies'): 78 | if len(word) > 4: 79 | return word[:-3] + 'i' 80 | else: 81 | return word[:-3] + 'ie' 82 | if word.endswith('us') or word.endswith('ss'): 83 | return word 84 | if word.endswith('s'): 85 | preceding = word[:-1] 86 | if s1a_exp.search(preceding): 87 | return preceding 88 | return word 89 | return word 90 | 91 | doubles = ('bb', 'dd', 'ff', 'gg', 'mm', 'nn', 'pp', 'rr', 'tt') 92 | def ends_with_double(word): 93 | for double in doubles: 94 | if word.endswith(double): 95 | return True 96 | return False 97 | def step_1b_helper(word): 98 | if word.endswith('at') or word.endswith('bl') or word.endswith('iz'): 99 | return word + 'e' 100 | if ends_with_double(word): 101 | return word[:-1] 102 | if is_short_word(word): 103 | return word + 'e' 104 | return word 105 | s1b_suffixes = ('ed', 'edly', 'ing', 'ingly') 106 | 107 | def step_1b(word, r1): 108 | if word.endswith('eedly'): 109 | if len(word) - 5 >= r1: 110 | return word[:-3] 111 | return word 112 | if word.endswith('eed'): 113 | if len(word) - 3 >= r1: 114 | return word[:-1] 115 | return word 116 | 117 | for suffix in s1b_suffixes: 118 | if word.endswith(suffix): 119 | preceding = word[:-len(suffix)] 120 | if s1b_exp.search(preceding): 121 | return step_1b_helper(preceding) 122 | return word 123 | 124 | return word 125 | 126 | def step_1c(word): 127 | if word.endswith('y') or word.endswith('Y') and len(word) > 1: 128 | if word[-2] not in 'aeiouy': 129 | if len(word) > 2: 130 | return word[:-1] + 'i' 131 | return word 132 | 133 | def step_2_helper(word, r1, end, repl, prev): 134 | if word.endswith(end): 135 | if len(word) - len(end) >= r1: 136 | if prev == []: 137 | return word[:-len(end)] + repl 138 | for p in prev: 139 | if word[:-len(end)].endswith(p): 140 | return word[:-len(end)] + repl 141 | return word 142 | return None 143 | s2_triples = (('ization', 'ize', []), 144 | 
('ational', 'ate', []), 145 | ('fulness', 'ful', []), 146 | ('ousness', 'ous', []), 147 | ('iveness', 'ive', []), 148 | ('tional', 'tion', []), 149 | ('biliti', 'ble', []), 150 | ('lessli', 'less', []), 151 | ('entli', 'ent', []), 152 | ('ation', 'ate', []), 153 | ('alism', 'al', []), 154 | ('aliti', 'al', []), 155 | ('ousli', 'ous', []), 156 | ('iviti', 'ive', []), 157 | ('fulli', 'ful', []), 158 | ('enci', 'ence', []), 159 | ('anci', 'ance', []), 160 | ('abli', 'able', []), 161 | ('izer', 'ize', []), 162 | ('ator', 'ate', []), 163 | ('alli', 'al', []), 164 | ('bli', 'ble', []), 165 | ('ogi', 'og', ['l']), 166 | ('li', '', ['c', 'd', 'e', 'g', 'h', 'k', 'm', 'n', 'r', 't'])) 167 | 168 | def step_2(word, r1): 169 | for trip in s2_triples: 170 | attempt = step_2_helper(word, r1, trip[0], trip[1], trip[2]) 171 | if attempt: 172 | return attempt 173 | return word 174 | 175 | def step_3_helper(word, r1, r2, end, repl, r2_necessary): 176 | if word.endswith(end): 177 | if len(word) - len(end) >= r1: 178 | if not r2_necessary: 179 | return word[:-len(end)] + repl 180 | else: 181 | if len(word) - len(end) >= r2: 182 | return word[:-len(end)] + repl 183 | return word 184 | return None 185 | s3_triples = (('ational', 'ate', False), 186 | ('tional', 'tion', False), 187 | ('alize', 'al', False), 188 | ('icate', 'ic', False), 189 | ('iciti', 'ic', False), 190 | ('ative', '', True), 191 | ('ical', 'ic', False), 192 | ('ness', '', False), 193 | ('ful', '', False)) 194 | def step_3(word, r1, r2): 195 | for trip in s3_triples: 196 | attempt = step_3_helper(word, r1, r2, trip[0], trip[1], trip[2]) 197 | if attempt: 198 | return attempt 199 | return word 200 | 201 | s4_delete_list = ('al', 'ance', 'ence', 'er', 'ic', 'able', 'ible', 'ant', 'ement', 202 | 'ment', 'ent', 'ism', 'ate', 'iti', 'ous', 'ive', 'ize') 203 | 204 | def step_4(word, r2): 205 | for end in s4_delete_list: 206 | if word.endswith(end): 207 | if len(word) - len(end) >= r2: 208 | return word[:-len(end)] 209 | return word 210 | 211 | if word.endswith('sion') or word.endswith('tion'): 212 | if len(word) - 3 >= r2: 213 | return word[:-3] 214 | 215 | return word 216 | 217 | def step_5(word, r1, r2): 218 | if word.endswith('l'): 219 | if len(word) - 1 >= r2 and word[-2] == 'l': 220 | return word[:-1] 221 | return word 222 | 223 | if word.endswith('e'): 224 | if len(word) - 1 >= r2: 225 | return word[:-1] 226 | if len(word) - 1 >= r1 and not ends_with_short_syllable(word[:-1]): 227 | return word[:-1] 228 | 229 | return word 230 | 231 | def normalize_ys(word): 232 | return word.replace('Y', 'y') 233 | 234 | exceptional_forms = {'skis': 'ski', 235 | 'skies': 'sky', 236 | 'dying': 'die', 237 | 'lying': 'lie', 238 | 'tying': 'tie', 239 | 'idly': 'idl', 240 | 'gently': 'gentl', 241 | 'ugly': 'ugli', 242 | 'early': 'earli', 243 | 'only': 'onli', 244 | 'singly': 'singl', 245 | 'sky': 'sky', 246 | 'news': 'news', 247 | 'howe': 'howe', 248 | 'atlas': 'atlas', 249 | 'cosmos': 'cosmos', 250 | 'bias': 'bias', 251 | 'andes': 'andes'} 252 | 253 | exceptional_early_exit_post_1a = frozenset(['inning', 'outing', 'canning', 'herring', 254 | 'earring', 'proceed', 'exceed', 'succeed']) 255 | 256 | 257 | def stem(word): 258 | if len(word) <= 2: 259 | return word 260 | word = remove_initial_apostrophe(word) 261 | 262 | # handle some exceptional forms 263 | if word in exceptional_forms: 264 | return exceptional_forms[word] 265 | 266 | word = capitalize_consonant_ys(word) 267 | r1 = get_r1(word) 268 | r2 = get_r2(word) 269 | word = step_0(word) 270 | word = step_1a(word) 
271 | 
272 |     # handle some more exceptional forms
273 |     if word in exceptional_early_exit_post_1a:
274 |         return word
275 | 
276 |     word = step_1b(word, r1)
277 |     word = step_1c(word)
278 |     word = step_2(word, r1)
279 |     word = step_3(word, r1, r2)
280 |     word = step_4(word, r2)
281 |     word = step_5(word, r1, r2)
282 |     word = normalize_ys(word)
283 | 
284 |     return word
285 | 
286 | if __name__ == "__main__":
287 |     assert stem("bill's") == "bill"
288 |     assert stem("y's") == "y"
289 | 
290 | 

--------------------------------------------------------------------------------
/gutenrye/scala-proj/src/main/scala/Stemmer.scala:
--------------------------------------------------------------------------------
1 | /**
2 |  * Scala Porter Stemmer Implementation
3 |  *
4 |  */
5 | object Stemmer extends Serializable {
6 |   def stem (str: String): String = {
7 |     // only vet strings longer than 3 characters
8 |     if (str.length() > 3) {
9 |       // all characters must be letters
10 |       for (ch <- str.toList) {
11 |         if (!Character.isLetter(ch)) {
12 |           return str.toLowerCase()
13 |         }
14 |       }
15 |     }
16 |     // apply the five Porter steps in sequence:
17 |     step_5(step_4(step_3(step_2(step_1(str))))).toLowerCase
18 |   }
19 | 
20 |   def step_1(str: String): String = step_1_c(step_1_b(step_1_a(str)))
21 | 
22 |   /*
23 |    * Step 1a
24 |    * SSES -> SS caresses -> caress
25 |    * IES -> I ponies -> poni
26 |    * ties -> ti
27 |    * SS -> SS caress -> caress
28 |    * S -> cats -> cat
29 |    */
30 |   def step_1_a(str: String): String = replacePatterns(str, List( ("sses", "ss"), ("ies", "i"), ("ss", "ss"), ("s", "")), _>=0)
31 | 
32 |   /*
33 |    * Step 1b
34 |    * (m>0) EED -> EE feed -> feed
35 |    * agreed -> agree
36 |    * (*v*) ED -> plastered -> plaster
37 |    * bled -> bled
38 |    * (*v*) ING -> motoring -> motor
39 |    * sing -> sing
40 |    */
41 |   def step_1_b (str: String): String = {
42 |     // (m > 0) EED -> EE
43 |     if (str.endsWith("eed")) {
44 |       if (stringMeasure(str.substring(0, str.length - 3)) > 0)
45 |         return str.substring(0, str.length() - 1)
46 |     // (*v*) ED ->
47 |     } else if ((str.endsWith("ed")) &&
48 |         (containsVowel(str.substring(0, str.length - 2)))) {
49 |       return step_1_b_2(str.substring(0, str.length - 2))
50 |     // (*v*) ING ->
51 |     } else if ((str.endsWith("ing")) &&
52 |         (containsVowel(str.substring(0, str.length - 3)))) {
53 |       return step_1_b_2(str.substring(0, str.length - 3))
54 |     } // end if
55 |     str
56 |   } // end step1b
57 | 
58 |   /*
59 |    * If the second or third of the rules in Step 1b is successful, the following is done:
60 |    * AT -> ATE conflat(ed) -> conflate
61 |    * BL -> BLE troubl(ed) -> trouble
62 |    * IZ -> IZE siz(ed) -> size
63 |    *
64 |    * (*d and not (*L or *S or *Z)) -> single letter
65 |    * hopp(ing) -> hop
66 |    * tann(ed) -> tan
67 |    * fall(ing) -> fall
68 |    * hiss(ing) -> hiss
69 |    * fizz(ed) -> fizz
70 |    *
71 |    * (m=1 and *o) -> E fail(ing) -> fail
72 |    * fil(ing) -> file
73 |    */
74 |   def step_1_b_2 (str: String): String = {
75 | 
76 |     if (str.endsWith("at") ||
77 |         str.endsWith("bl") ||
78 |         str.endsWith("iz")) {
79 |       return str + "e"
80 |     }
81 |     else if ((str.length() > 1) && (endsWithDoubleConsonent(str)) &&
82 |         (!(str.endsWith("l") || str.endsWith("s") || str.endsWith("z")))) {
83 |       return str.substring(0, str.length() - 1)
84 |     }
85 |     else if ((stringMeasure(str) == 1) &&
86 |         (endsWithCVC(str))) {
87 |       return str + "e"
88 |     }
89 |     str
90 |   }
91 | 
92 |   /*
93 |    * (*v*) Y -> I happy -> happi
94 |    * sky -> sky
95 |    */
96 |   def step_1_c(str: String): String = {
97 |     if (str.endsWith("y") && containsVowel(str.substring(0, str.length() - 1)))
98 |       return str.substring(0,
str.length() - 1) + "i" 99 | str 100 | } // end step1c 101 | 102 | /* 103 | * (m>0) ATIONAL -> ATE relational -> relate 104 | * (m>0) TIONAL -> TION conditional -> condition 105 | * rational -> rational 106 | * (m>0) ENCI -> ENCE valenci -> valence 107 | * (m>0) ANCI -> ANCE hesitanci -> hesitance 108 | * (m>0) IZER -> IZE digitizer -> digitize 109 | * (m>0) ABLI -> ABLE conformabli -> conformable 110 | * (m>0) ALLI -> AL radicalli -> radical 111 | * (m>0) ENTLI -> ENT differentli -> different 112 | * (m>0) ELI -> E vileli - > vile 113 | * (m>0) OUSLI -> OUS analogousli -> analogous 114 | * (m>0) IZATION -> IZE vietnamization -> vietnamize 115 | * (m>0) ATION -> ATE predication -> predicate 116 | * (m>0) ATOR -> ATE operator -> operate 117 | * (m>0) ALISM -> AL feudalism -> feudal 118 | * (m>0) IVENESS -> IVE decisiveness -> decisive 119 | * (m>0) FULNESS -> FUL hopefulness -> hopeful 120 | * (m>0) OUSNESS -> OUS callousness -> callous 121 | * (m>0) ALITI -> AL formaliti -> formal 122 | * (m>0) IVITI -> IVE sensitiviti -> sensitive 123 | * (m>0) BILITI -> BLE sensibiliti -> sensible 124 | */ 125 | def step_2 (str: String): String = replacePatterns(str, List( ("ational", "ate"), ("tional","tion"), ("enci","ence"), ("anci","ance"), 126 | ("izer","ize"), ("bli","ble"), ("alli", "al"), ("entli","ent"),("eli","e"), 127 | ("ousli","ous"), ("ization","ize"), ("ation","ate"), ("ator","ate"), ("alism","al"), 128 | ("iveness","ive"), ("fulness","ful"), ("ousness", "ous"), ("aliti", "al"), ("iviti","ive"), 129 | ("biliti", "ble"), ("logi", "log"))) 130 | 131 | /* 132 | * (m>0) ICATE -> IC triplicate -> triplic 133 | * (m>0) ATIVE -> formative -> form 134 | * (m>0) ALIZE -> AL formalize -> formal 135 | * (m>0) ICITI -> IC electriciti -> electric 136 | * (m>0) ICAL -> IC electrical -> electric 137 | * (m>0) FUL -> hopeful -> hope 138 | * (m>0) NESS -> goodness -> good 139 | */ 140 | def step_3 (str: String): String = replacePatterns(str, List( ("icate", "ic"),("ative",""),("alize","al"),("iciti","ic"),("ical","ic"),("ful",""),("ness",""))) 141 | 142 | /* 143 | * (m>1) AL -> revival -> reviv 144 | * (m>1) ANCE -> allowance -> allow 145 | * (m>1) ENCE -> inference -> infer 146 | * (m>1) ER -> airliner -> airlin 147 | * (m>1) IC -> gyroscopic -> gyroscop 148 | * (m>1) ABLE -> adjustable -> adjust 149 | * (m>1) IBLE -> defensible -> defens 150 | * (m>1) ANT -> irritant -> irrit 151 | * (m>1) EMENT -> replacement -> replac 152 | * (m>1) MENT -> adjustment -> adjust 153 | * (m>1) ENT -> dependent -> depend 154 | * (m>1 and (*S or *T)) ION -> adoption -> adopt 155 | * (m>1) OU -> homologou -> homolog 156 | * (m>1) ISM -> communism -> commun 157 | * (m>1) ATE -> activate -> activ 158 | * (m>1) ITI -> angulariti -> angular 159 | * (m>1) OUS -> homologous -> homolog 160 | * (m>1) IVE -> effective -> effect 161 | * (m>1) IZE -> bowdlerize -> bowdler 162 | */ 163 | def step_4 (str: String): String = { 164 | val res: String = replacePatterns(str, List( ("al",""),("ance",""),("ence",""),("er",""),("ic",""),("able",""),("ible",""),("ant",""),("ement",""), 165 | ("ment",""),("ent",""),("ou", ""),("ism",""),("ate",""),("iti",""),("ous",""), 166 | ("ive",""),("ize","")), _>1) 167 | if (str == res) { 168 | if ((str.endsWith("sion") || str.endsWith("tion")) && stringMeasure(str.substring(0, str.length() - 3)) > 1) 169 | return str.substring(0, str.length() - 3) 170 | else 171 | return str 172 | } 173 | else { 174 | return res 175 | } 176 | } 177 | 178 | def step_5 (str: String): String = step_5_b(step_5_a(str)) 179 | 180 
  /*
   * (m>1) E ->                         probate  ->  probat
   *                                    rate     ->  rate
   * (m=1 and not *o) E ->              cease    ->  ceas
   */
  def step_5_a(str: String): String = {
    // (m > 1) E ->
    if ((stringMeasure(str.substring(0, str.length() - 1)) > 1) &&
        str.endsWith("e"))
      return str.substring(0, str.length() - 1)
    // (m = 1 and not *o) E ->
    else if ((stringMeasure(str.substring(0, str.length() - 1)) == 1) &&
             (!endsWithCVC(str.substring(0, str.length() - 1))) &&
             (str.endsWith("e")))
      return str.substring(0, str.length() - 1)
    else
      return str
  } // end step5a

  /*
   * (m > 1 and *d and *L) -> single letter
   *                                    controll  ->  control
   *                                    roll      ->  roll
   */
  def step_5_b(str: String): String = {
    // (m > 1 and *d and *L) ->
    if (str.endsWith("l") &&
        endsWithDoubleConsonent(str) &&
        (stringMeasure(str.substring(0, str.length() - 1)) > 1)) {
      str.substring(0, str.length() - 1)
    } else {
      str
    }
  } // end step5b

  // does the string contain a vowel?
  def containsVowel(str: String): Boolean = {
    for (ch <- str.toList) {
      if (isVowel(ch))
        return true
    }
    // no a, e, i, o or u, but a 'y' also counts
    str.indexOf('y') > -1
  } // end function

  // is the char one of a, e, i, o, u?
  def isVowel(c: Char): Boolean = "aeiou".indexOf(c) > -1

  /*
   * Positional vowel check: 'y' may act as either vowel or consonant
   * depending on its neighbours; here it counts as a vowel only when
   * flanked by two non-vowels.
   */
  def isVowel(str: String, i: Int): Boolean =
    isVowel(str(i)) ||
      (str(i) == 'y' && i > 0 && i + 1 < str.length && !isVowel(str(i - 1)) && !isVowel(str(i + 1)))

  // the Porter measure m: the number of vowel-run -> consonant transitions
  def stringMeasure(str: String): Int = {
    var count = 0
    var vowelSeen: Boolean = false

    for (i <- 0 until str.length) {
      if (isVowel(str, i)) {
        vowelSeen = true
      } else if (vowelSeen) {
        count += 1
        vowelSeen = false
      }
    }
    count
  } // end function

  // does the stem end consonant-vowel-consonant, where the final consonant
  // is not w, x or y? (the "*o" condition)
  def endsWithCVC(str: String): Boolean = {
    if (str.length() >= 3) {
      // the last three characters, in reverse order
      val cvc = (str(str.length - 1), str(str.length - 2), str(str.length - 3))
      val cvc_str = cvc._1.toString + cvc._2 + cvc._3

      if ((cvc._1 == 'w') || (cvc._1 == 'x') || (cvc._1 == 'y'))
        false
      else if (!isVowel(cvc._1) && isVowel(cvc_str, 1) && !isVowel(cvc._3))
        true
      else
        false
    }
    else
      false
  } // end function
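  // Hand-checked values for the helper predicates above (illustrative):
  //   stringMeasure("trouble")        == 1     // tr-ou-ble: one vowel run followed by a consonant
  //   stringMeasure("oaten")          == 2     // Porter's own m=2 example
  //   endsWithCVC("fil")              == true  // so fil(ing) gains an "e" in step 1b
  //   endsWithDoubleConsonent("fizz") == true  // but step_1_b_2 leaves *z words alone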
  // does the string end with a double consonant?
  def endsWithDoubleConsonent(str: String): Boolean = {
    if (str.length() < 2)
      return false
    val c: Char = str.charAt(str.length() - 1)
    if (c == str.charAt(str.length() - 2))
      if (!containsVowel(str.substring(str.length() - 2))) {
        return true
      }
    false
  } // end function

  // default comparer: the remaining stem must have measure m > 0
  def replacePatterns(str: String, patterns: List[(String, String)]): String = replacePatterns(str, patterns, _ > 0)

  // replace the last occurrence of pattern in str with replacement
  def replaceLast(str: String, pattern: String, replacement: String): String =
    new StringBuilder(str).replace(str.lastIndexOf(pattern), str.lastIndexOf(pattern) + pattern.length, replacement).toString

  /*
   * Apply the first pattern in list order whose suffix matches, but only if
   * the measure of the stem left after removing the suffix satisfies the
   * comparer; a matching suffix that fails the measure test ends the search
   * and the string is returned unchanged.
   */
  def replacePatterns(str: String, patterns: List[(String, String)], comparer: Int => Boolean): String = {
    for (pattern <- patterns)
      if (str.endsWith(pattern._1)) {
        val res = replaceLast(str, pattern._1, pattern._2)
        if (comparer(stringMeasure(replaceLast(str, pattern._1, ""))))
          return res
        else
          return str
      }
    str
  }

}
--------------------------------------------------------------------------------
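A usage sketch, not part of the original repo: Stemmer.stem is a pure String => String function on a Serializable object, so it can be applied token-by-token inside a Spark transformation exactly like a plain local call. The StemmerDemo object below is hypothetical and assumes only the Stemmer object above.

    object StemmerDemo {
      def main(args: Array[String]): Unit = {
        val tokens = List("caresses", "motoring", "relational", "adoption", "controlled")
        // the same map could run inside an RDD transformation, since Stemmer serializes cleanly
        tokens.map(t => s"$t -> ${Stemmer.stem(t)}").foreach(println)
      }
    }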