├── .gitignore ├── CourseContent ├── data │ └── README.md └── src │ ├── README.md │ └── ratings-counter.py ├── Dockerfile ├── Friends-By-Age.py ├── Max-Temperatures.py ├── Min-Temperatures.py ├── Most-Often-Watched-Movie.py ├── Most-Popular-Superhero.py ├── Popular-Movies-Nicer.py ├── README.md ├── Total-Amount-By-Customer.py ├── Vagrantfile ├── Word-Count-Better-Sorted.py ├── Word-Count-Better.py ├── Word-Count.py ├── degrees-of-separation.py ├── download_movielens_datasets.sh ├── install_spark_executables.sh ├── log4j.properties ├── ratings-counter.py ├── scratch.py └── vagrant ├── setup.sh └── setup2.sh /.gitignore: -------------------------------------------------------------------------------- 1 | build 2 | 3 | .vagrant 4 | 5 | *.out 6 | *.err 7 | *.gz 8 | *.tar 9 | .DS_Store 10 | ml-100k/ 11 | *.*~ 12 | .idea/ 13 | *.csv 14 | *.txt 15 | *.data 16 | *.zip 17 | -------------------------------------------------------------------------------- /CourseContent/data/README.md: -------------------------------------------------------------------------------- 1 | data directory 2 | -------------------------------------------------------------------------------- /CourseContent/src/README.md: -------------------------------------------------------------------------------- 1 | src directory 2 | -------------------------------------------------------------------------------- /CourseContent/src/ratings-counter.py: -------------------------------------------------------------------------------- 1 | from pyspark import SparkConf, SparkContext 2 | import collections 3 | 4 | conf = SparkConf().setMaster("local").setAppName("RatingsHistogram") 5 | sc = SparkContext(conf = conf) 6 | 7 | lines = sc.textFile("file:///SparkCourse/ml-100k/u.data") 8 | ratings = lines.map(lambda x: x.split()[2]) 9 | result = ratings.countByValue() 10 | 11 | sortedResults = collections.OrderedDict(sorted(result.items())) 12 | for key, value in sortedResults.iteritems(): 13 | print "%s %i" % (key, value) 14 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM centos:latest 2 | 3 | RUN yum -y update && yum clean all \ 4 | && yum -y install epel-release \ 5 | && yum -y install vim bash-completion tree git curl wget telnet 6 | 7 | RUN yum install -y python34 \ 8 | && yum -y install python-pip 9 | 10 | 11 | ENV JAVA_HOME=/usr/java/default 12 | RUN mkdir -p /usr/java \ 13 | && curl -O -L --header "Cookie: gpw_e24=http%3A%2F%2Fwww.oracle.com%2F; oraclelicense=accept-securebackup-cookie" \ 14 | "http://download.oracle.com/otn-pub/java/jdk/8u60-b27/jdk-8u60-linux-x64.tar.gz" \ 15 | && tar -xvf jdk-8u60-linux-x64.tar.gz -C /usr/java \ 16 | && ln -s /usr/java/jdk1.8.0_60/ /usr/java/default \ 17 | && rm -f jdk-8u60-linux-x64.tar.gz 18 | 19 | 20 | ENV SPARK_HOME=/usr/spark/default 21 | RUN mkdir -p /usr/spark \ 22 | && curl -O -L http://www-eu.apache.org/dist/spark/spark-1.6.1/spark-1.6.1-bin-hadoop2.6.tgz \ 23 | && tar -xvf spark-1.6.1-bin-hadoop2.6.tgz -C /usr/spark \ 24 | && ln -s /usr/spark/spark-1.6.1-bin-hadoop2.6/ /usr/spark/default \ 25 | && rm -f spark-1.6.1-bin-hadoop2.6.tgz 26 | 27 | COPY log4j.properties /usr/spark/default/conf/log4j.properties 28 | COPY install_spark_executables.sh /install_spark_executables.sh 29 | RUN /install_spark_executables.sh 30 | -------------------------------------------------------------------------------- /Friends-By-Age.py: 
-------------------------------------------------------------------------------- 1 | from pyspark import SparkConf, SparkContext 2 | 3 | conf = SparkConf().setMaster("local").setAppName("FriendsByAge") 4 | sc = SparkContext(conf = conf) 5 | 6 | def parseLine(line): 7 | fields = line.split(',') 8 | age = int(fields[2]) 9 | numFriends = int(fields[3]) 10 | return (age, numFriends) 11 | 12 | lines = sc.textFile("/vagrant/fakefriends.csv") 13 | rdd = lines.map(parseLine) 14 | #results = rdd.collect() 15 | 16 | groupByAge = rdd.mapValues(lambda x: (x,1)) 17 | #results = groupByAge.collect() 18 | 19 | totalsByAge = groupByAge.reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1])) 20 | #results = totalsByAge.collect() 21 | 22 | averagesByAge = totalsByAge.mapValues(lambda x: x[0] / x[1]) 23 | results = averagesByAge.collect() 24 | 25 | for result in results: 26 | print result 27 | -------------------------------------------------------------------------------- /Max-Temperatures.py: -------------------------------------------------------------------------------- 1 | from pyspark import SparkConf, SparkContext 2 | 3 | conf = SparkConf().setMaster("local").setAppName("MinTemperatures") 4 | sc = SparkContext(conf = conf) 5 | 6 | def parseLine(line): 7 | fields = line.split(',') 8 | stationID = fields[0] 9 | entryType = fields[2] 10 | temperature = float(fields[3]) * 0.1 * (9.0 / 5.0) + 32.0 11 | return (stationID, entryType, temperature) 12 | 13 | lines = sc.textFile("/vagrant/1800.csv") 14 | parsedLines = lines.map(parseLine) 15 | #results = parsedLines.collect() 16 | 17 | minTemps = parsedLines.filter(lambda x: "TMAX" in x[1]) 18 | #results = minTemps.collect() 19 | 20 | stationTemps = minTemps.map(lambda x: (x[0], x[2])) 21 | #results = stationTemps.collect() 22 | 23 | minTemps = stationTemps.reduceByKey(lambda x, y: max(x,y)) 24 | results = minTemps.collect() 25 | 26 | for result in results: 27 | #print result 28 | print result[0] + "\t{:.2f}F".format(result[1]) 29 | -------------------------------------------------------------------------------- /Min-Temperatures.py: -------------------------------------------------------------------------------- 1 | from pyspark import SparkConf, SparkContext 2 | 3 | conf = SparkConf().setMaster("local").setAppName("MinTemperatures") 4 | sc = SparkContext(conf = conf) 5 | 6 | def parseLine(line): 7 | fields = line.split(',') 8 | stationID = fields[0] 9 | entryType = fields[2] 10 | temperature = float(fields[3]) * 0.1 * (9.0 / 5.0) + 32.0 11 | return (stationID, entryType, temperature) 12 | 13 | lines = sc.textFile("/vagrant/1800.csv") 14 | parsedLines = lines.map(parseLine) 15 | #results = parsedLines.collect() 16 | 17 | minTemps = parsedLines.filter(lambda x: "TMIN" in x[1]) 18 | #results = minTemps.collect() 19 | 20 | stationTemps = minTemps.map(lambda x: (x[0], x[2])) 21 | #results = stationTemps.collect() 22 | 23 | minTemps = stationTemps.reduceByKey(lambda x, y: min(x,y)) 24 | results = minTemps.collect() 25 | 26 | for result in results: 27 | #print result 28 | print result[0] + "\t{:.2f}F".format(result[1]) 29 | -------------------------------------------------------------------------------- /Most-Often-Watched-Movie.py: -------------------------------------------------------------------------------- 1 | 2 | from pyspark import SparkConf, SparkContext 3 | 4 | conf = SparkConf().setMaster("local").setAppName("MostOftenWatchedMovies") 5 | sc = SparkContext(conf = conf) 6 | 7 | def parseLineAll(line): 8 | fields = line.split() 9 | userId = int(fields[0]) 10 | 
movieId = int(fields[1]) 11 | rating = int(fields[2]) 12 | timestamp = long(fields[3]) 13 | return (userId,(movieId,rating,timestamp)) 14 | 15 | def parseLine(line): 16 | fields = line.split() 17 | movieId = int(fields[1]) 18 | return movieId 19 | 20 | input = sc.textFile("/vagrant/u.data") 21 | rdd1 = input.map(parseLine) #parse each line of the input db 22 | rdd2 = rdd1.map(lambda x: (x,1)) 23 | rdd3 = rdd2.reduceByKey(lambda x,y: x+y) 24 | 25 | rdd4 = rdd3.map(lambda (x,y): (y,x)).sortByKey() 26 | 27 | results = rdd4.collect() 28 | for result in results: 29 | print result 30 | -------------------------------------------------------------------------------- /Most-Popular-Superhero.py: -------------------------------------------------------------------------------- 1 | from pyspark import SparkConf, SparkContext 2 | 3 | conf = SparkConf().setMaster("local").setAppName("PopularHero") 4 | sc = SparkContext(conf = conf) 5 | 6 | def countCoOccurences(line): 7 | elements = line.split() 8 | return (int(elements[0]), len(elements) - 1) 9 | 10 | def parseNames(line): 11 | fields = line.split('\"') 12 | return (int(fields[0]), fields[1].encode("utf8")) 13 | 14 | names = sc.textFile("/vagrant/marvel-names.txt") 15 | namesRdd = names.map(parseNames) 16 | 17 | lines = sc.textFile("/vagrant/marvel-graph.txt") 18 | 19 | pairings = lines.map(countCoOccurences) 20 | 21 | results = pairings.collect() 22 | 23 | for result in results: 24 | print result 25 | 26 | 27 | # totalFriendsByCharacter = pairings.reduceByKey(lambda x, y : x + y) 28 | # 29 | # flipped = totalFriendsByCharacter.map(lambda (x,y) : (y,x)) 30 | # mostPopular = flipped.max() 31 | # mostPopularName = namesRdd.lookup(mostPopular[1])[0] 32 | # 33 | # print mostPopularName + " is the most popular superhero, with " + \ 34 | # str(mostPopular[0]) + " co-appearances." 
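# Note: uncommenting the block above completes the exercise: reduceByKey() totals the co-appearances
# per hero id, flipped.max() picks the hero with the largest total, and namesRdd.lookup(id) returns
# the list of names stored under that id (hence the [0]).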
35 | 
--------------------------------------------------------------------------------
/Popular-Movies-Nicer.py:
--------------------------------------------------------------------------------
 1 | from pyspark import SparkConf, SparkContext
 2 | 
 3 | def loadMovieNames():
 4 |     movieNames = {}
 5 |     with open("ml-100k/u.ITEM") as f:
 6 |         for line in f:
 7 |             fields = line.split('|')
 8 |             movieNames[int(fields[0])] = fields[1]
 9 |     return movieNames
10 | 
11 | conf = SparkConf().setMaster("local").setAppName("PopularMovies")
12 | sc = SparkContext(conf = conf)
13 | 
14 | nameDict = sc.broadcast(loadMovieNames())
15 | 
16 | lines = sc.textFile("/vagrant/ml-100k/u.data")
17 | movies = lines.map(lambda x: (int(x.split()[1]), 1))
18 | movieCounts = movies.reduceByKey(lambda x, y: x + y)
19 | 
20 | flipped = movieCounts.map( lambda (x, y) : (y, x))
21 | sortedMovies = flipped.sortByKey()
22 | 
23 | sortedMoviesWithNames = sortedMovies.map(lambda (count, movie) : (nameDict.value[movie], count))
24 | 
25 | results = sortedMoviesWithNames.collect()
26 | 
27 | for result in results:
28 |     print result
29 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # SparkCourse
 2 | ## Taming Big Data with Apache Spark and Python - Hands On
 3 | 
 4 | ####Resources and downloads
 5 | https://www.udemy.com/taming-big-data-with-apache-spark-hands-on/learn/v4/content
 6 | 
 7 | 
 8 | ####12.[Activity] Running the Average Friends by Age Example
 9 | #####Social Network Dataset
10 | the original dataset is in the form (id,name,age,number_of_friends): fakefriends.csv
11 | ```
12 | 0,Will,33,385
13 | 1,Jean-Luc,26,2
14 | 2,Hugh,55,221
15 | 3,Deanna,40,465
16 | 4,Quark,68,21
17 | 5,Weyoun,59,318
18 | 6,Gowron,37,220
19 | 7,Will,54,307
20 | 8,Jadzia,38,380
21 | 9,Hugh,27,181
22 | 10,Odo,53,191
23 | ...
24 | ```
25 | 
26 | #####Friends-By-Age.py
27 | the pyspark program that works on the dataset is given here
28 | ```python
29 |  1 from pyspark import SparkConf, SparkContext
30 |  2
31 |  3 conf = SparkConf().setMaster("local").setAppName("FriendsByAge")
32 |  4 sc = SparkContext(conf = conf)
33 |  5
34 |  6 def parseLine(line):
35 |  7     fields = line.split(',')
36 |  8     age = int(fields[2])
37 |  9     numFriends = int(fields[3])
38 | 10     return (age, numFriends)
39 | 11
40 | 12 lines = sc.textFile("/vagrant/fakefriends.csv")
41 | 13 rdd = lines.map(parseLine)
42 | 14 #results = rdd.collect()
43 | 15
44 | 16 groupByAge = rdd.mapValues(lambda x: (x,1))
45 | 17 #results = groupByAge.collect()
46 | 18
47 | 19 totalsByAge = groupByAge.reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1]))
48 | 20 #results = totalsByAge.collect()
49 | 21
50 | 22 averagesByAge = totalsByAge.mapValues(lambda x: x[0] / x[1])
51 | 23 results = averagesByAge.collect()
52 | 24
53 | 25 for result in results:
54 | 26     print result
55 | ```
56 | 
57 | 
58 | 
59 | 
60 | 
61 | 
62 | 
63 | #####step-by-step
64 | 
65 | define a function that can be mapped onto the dataset. 'parseLine' will accept a line of input and split the comma-separated line into fields. we are only interested in the 3rd and 4th fields, and they need to be cast as integers.
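as a quick sanity check, the parsing logic can be tried on a single record before handing the job to spark-submit (a minimal standalone sketch, assuming a sample row in the fakefriends.csv format shown above):

```python
def parseLine(line):
    fields = line.split(',')
    age = int(fields[2])
    numFriends = int(fields[3])
    return (age, numFriends)

print parseLine("0,Will,33,385")   # -> (33, 385)
```

the same function as it appears in Friends-By-Age.py: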
66 | 
67 | ```python
68 |  6 def parseLine(line):
69 |  7     fields = line.split(',')
70 |  8     age = int(fields[2])
71 |  9     numFriends = int(fields[3])
72 | 10     return (age, numFriends)
73 | 11
74 | ```
75 | 
76 | build the first rdd by mapping the parseLine function onto each item (line) in the dataset. parseLine will emit the 3rd and 4th fields of each line into the new rdd.
77 | 
78 | ```python
79 | 12 lines = sc.textFile("/vagrant/fakefriends.csv")
80 | 13 rdd = lines.map(parseLine)
81 | 14 results = rdd.collect()
82 | ...
83 | 25 for result in results:
84 | 26     print result
85 | ```
86 | if we output the contents of this rdd and filter for entries where the age (3rd field) is 43, we get the following:
87 | ```
88 | [vagrant@sparkcourse vagrant]$ spark-submit Friends-By-Age.py |grep '(43,'
89 | (43, 49)
90 | (43, 249)
91 | (43, 404)
92 | (43, 101)
93 | (43, 48)
94 | (43, 335)
95 | (43, 428)
96 | ```
97 | okay, now let's build the second rdd by grouping the new dataset by age. ultimately we are determining the average number of friends per age, so we need to total the friends for a particular age and then divide by the number of entries for that age. the new dataset will look like this: (K,V), or (age, (friends,1)). the mapValues spark function takes the old (age,friends) dataset and emits the new one with a 1 attached to each value so that a count can be performed later.
98 | 
99 | ```python
100 | 16 groupByAge = rdd.mapValues(lambda x: (x,1))
101 | 17 results = groupByAge.collect()
102 | ...
103 | 25 for result in results:
104 | 26     print result
105 | ```
106 | and so the output (for age 43) looks like this:
107 | ```
108 | [vagrant@sparkcourse vagrant]$ spark-submit Friends-By-Age.py |grep '(43,'
109 | (43, (49, 1))
110 | (43, (249, 1))
111 | (43, (404, 1))
112 | (43, (101, 1))
113 | (43, (48, 1))
114 | (43, (335, 1))
115 | (43, (428, 1))
116 | ```
117 | okay, so now we need to tally the friends for each age along with the number of entries for that age. that can be done with a reduceByKey function (which collapses rows that share the same key): it takes the set of values for each age (the key) and applies a function that adds the friend totals (x[0] + y[0]) and the counts (x[1] + y[1]), emitting a (total, count) pair for each age.
118 | 
119 | ```python
120 | 19 totalsByAge = groupByAge.reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1]))
121 | 20 results = totalsByAge.collect()
122 | ...
123 | 25 for result in results:
124 | 26     print result
125 | ```
126 | and the output (for age 43) looks like this:
127 | ```
128 | [vagrant@sparkcourse vagrant]$ spark-submit Friends-By-Age.py |grep '(43,'
129 | (43, (1614, 7))
130 | ```
131 | and finally we need to divide the total by the count in order to get the average for each age. to do this we map a function onto each item in the dataset that does the division and emits a new key/value pair containing the age and the average number of friends. note that ```x[0] / x[1]``` is integer division under Python 2, so the average is truncated to a whole number.
132 | 
133 | ```python
134 | 22 averagesByAge = totalsByAge.mapValues(lambda x: x[0] / x[1])
135 | 23 results = averagesByAge.collect()
136 | 24
137 | 25 for result in results:
138 | 26     print result
139 | ```
140 | and the output looks like this (for age 43). so for age 43 in this dataset the average is 230 friends.
141 | ```
142 | [vagrant@sparkcourse vagrant]$ spark-submit Friends-By-Age.py |grep '(43,'
143 | (43, 230)
144 | ```
145 | 
146 | ####16.[Activity] Counting Word Occurrences using flatmap()
147 | The data for this exercise is a book in plain text form: Book.txt. The objective is to get a word count for every word in the book.
148 | 
149 | ```python
150 |  1 import re
151 |  2 from pyspark import SparkConf, SparkContext
152 |  3
153 |  4 def normalizeWords(text):
154 |  5     return re.compile(r'\W+', re.UNICODE).split(text.lower())
155 |  6
156 |  7 conf = SparkConf().setMaster("local").setAppName("WordCount")
157 |  8 sc = SparkContext(conf = conf)
158 |  9
159 | 10 input = sc.textFile("/vagrant/Book.txt")
160 | 11 #a simple split works
161 | 12 #words = input.flatMap(lambda x: x.split())
162 | 13 #but let's clean the text up a bit and filter out special characters and consider upper and lowercase to be the same thing
163 | 14 words = input.flatMap(normalizeWords)
164 | 15 # results = words.collect()
165 | 16 # for result in results:
166 | 17 #     print result
167 | 18
168 | 19 # #python function
169 | 20 # #wordCounts = words.countByValue()
170 | 21
171 | 22 #take a spark approach...
172 | 23 #convert each word to a key/value pair with a value of 1
173 | 24 wordCountsMap = words.map(lambda x: (x,1))
174 | 25 # results = wordCountsMap.collect()
175 | 26 # for result in results:
176 | 27 #     print result
177 | 28
178 | 29
179 | 30 #count words with reduceByKey so reduceByKey will build a set of each word and count the 1s!
180 | 31 wordCountsReduced = wordCountsMap.reduceByKey(lambda x, y: x + y)
181 | 32 results = wordCountsReduced.collect()
182 | 33 # for result in results:
183 | 34 #     print result
184 | 35
185 | 36 wordCountsSorted = wordCountsReduced.map(lambda (x,y): (y,x)).sortByKey()
186 | 37 results = wordCountsSorted.collect()
187 | 38
188 | 39 for result in results:
189 | 40 #    print result
190 | 41     count = str(result[0])
191 | 42     word = result[1].encode('ascii', 'ignore')
192 | 43     if (word):
193 | 44         print word + ":\t\t" + count
194 | ```
195 | so to start off we need to build a dataset consisting of each word in the book. we can do this by building the first rdd from the book text and splitting each line into words (split() splits on whitespace by default). the flatMap function is the right tool here: it takes a single input (a line) and can produce multiple outputs (the words), applying the given function to each input:
196 | ```python
197 | 10 input = sc.textFile("/vagrant/Book.txt")
198 | 11 #a simple split works
199 | 12 #words = input.flatMap(lambda x: x.split())
200 | 13 #but let's clean the text up a bit and filter out special characters and consider upper and lowercase to be the same thing
201 | 14 words = input.flatMap(normalizeWords)
202 | ```
203 | taking a look at the ```words``` rdd after the content of the book is split into words:
204 | ```
205 | Self-Employment:
206 | Building
207 | an
208 | Internet
209 | Business
210 | of
211 | One
212 | Achieving
213 | Financial
214 | and
215 | Personal
216 | Freedom
217 | through
218 | a
219 | Lifestyle
220 | Technology
221 | Business
222 | By
223 | Frank
224 | Kane
225 | ...
226 | ```
227 | okay now we need to be able to provide a count (of 1) for each of the words so that eventually we can tally up the count of each word. we can do this by applying a map function to each of the words in the rdd dataset:
228 | ```python
229 | 23 #convert each word to a key/value pair with a value of 1
230 | 24 wordCountsMap = words.map(lambda x: (x,1))
231 | ```
232 | and that will yield a second rdd with the following key/value dataset:
233 | ```
234 | (u'Self-Employment:', 1)
235 | (u'Building', 1)
236 | (u'an', 1)
237 | (u'Internet', 1)
238 | (u'Business', 1)
239 | (u'of', 1)
240 | (u'One', 1)
241 | (u'Achieving', 1)
242 | (u'Financial', 1)
243 | (u'and', 1)
244 | (u'Personal', 1)
245 | (u'Freedom', 1)
246 | (u'through', 1)
247 | (u'a', 1)
248 | (u'Lifestyle', 1)
249 | (u'Technology', 1)
250 | (u'Business', 1)
251 | (u'By', 1)
252 | (u'Frank', 1)
253 | (u'Kane', 1)
254 | ...
255 | ```
256 | and now we can use a reduceByKey function that will group by each word and tally up the counts:
257 | ```python
258 | 30 #count words with reduceByKey so reduceByKey will build a set of each word and count the 1s!
259 | 31 wordCountsReduced = wordCountsMap.reduceByKey(lambda x, y: x + y)
260 | ```
261 | now we can see the counts for each word from the wordCountsReduced rdd:
262 | ```
263 | ...
264 | (u'daughters.', 2)
265 | (u'ability', 14)
266 | (u'opening', 1)
267 | (u'self-fund,', 1)
268 | (u'merit.', 1)
269 | (u'merit,', 2)
270 | (u'moz.com', 1)
271 | ...
272 | ```
273 | but things are still unordered, so let's sort them in ascending order. since the totals are stored in the value part of the key/value pair, we need to flip the key and the value with another map function and then apply sortByKey to the dataset
274 | ```python
275 | 36 wordCountsSorted = wordCountsReduced.map(lambda (x,y): (y,x)).sortByKey()
276 | ```
277 | and now the last part of the wordCountsSorted output should end with the highest word count entries:
278 | ```
279 | ...
280 | (747, u'that')
281 | (772, u'')
282 | (934, u'and')
283 | (970, u'of')
284 | (1191, u'a')
285 | (1292, u'the')
286 | (1420, u'your')
287 | (1828, u'to')
288 | (1878, u'you')
289 | ```
290 | and the formatted output looks like this:
291 | ```
292 | ...
293 | that: 747
294 | and: 934
295 | of: 970
296 | a: 1191
297 | the: 1292
298 | your: 1420
299 | to: 1828
300 | you: 1878
301 | ```
302 | 
303 | 
304 | ####24. [Activity] Find the Most Popular Superhero in a Social Graph
305 | #####Superheroes Datasets
306 | The first dataset contains the id of each superhero followed by the ids of the superheroes that have appeared together with it. The superhero with the greatest number of associations (appearances with other superheroes) is the "most popular" superhero. A superhero may appear in multiple lines (lines starting with the same superhero id), so we must count the associations both within and across the graph lines (i.e. total the counts for every line starting with the same heroId).
307 | The dataset is contained in the Marvel-graph.txt file.
308 | ```
309 | 5988 748 1722 3752 4655 5743 1872 3413 5527 6368 6085 4319 4728 1636 2397 3364 4001 1614 1819 1585 732 2660 3952 2507 3891 2070 2239 2602 612 1352 5447 4548 1596 5488 1605 5517 11 479 2554 2043 17 865 4292 6312 473 534 1479 6375 4456
310 | 5989 4080 4264 4446 3779 2430 2297 6169 3530 3272 4282 6432 2548 4140 185 105 3878 2429 1334 4595 2767 3956 3877 4776 4946 3407 128 269 5775 5121 481 5516 4758 4053 1044 1602 3889 1535 6038 533 3986
311 | ...
312 | ```
313 | the second dataset contains the superhero names keyed on the superhero id: Marvel-Names.txt
314 | ```
315 | 1 "24-HOUR MAN/EMMANUEL"
316 | 2 "3-D MAN/CHARLES CHAN"
317 | 3 "4-D MAN/MERCURIO"
318 | 4 "8-BALL/"
319 | 5 "A"
320 | 6 "A'YIN"
321 | 7 "ABBOTT, JACK"
322 | 8 "ABCISSA"
323 | ```
324 | the program: Most-Popular-Superhero.py
325 | ```python
326 |  1 from pyspark import SparkConf, SparkContext
327 |  2
328 |  3 conf = SparkConf().setMaster("local").setAppName("PopularHero")
329 |  4 sc = SparkContext(conf = conf)
330 |  5
331 |  6 def countCoOccurences(line):
332 |  7     elements = line.split()
333 |  8     return (int(elements[0]), len(elements) - 1)
334 |  9
335 | 10 def parseNames(line):
336 | 11     fields = line.split('\"')
337 | 12     return (int(fields[0]), fields[1].encode("utf8"))
338 | 13
339 | 14 names = sc.textFile("/vagrant/marvel-names.txt")
340 | 15 namesRdd = names.map(parseNames)
341 | 16
342 | 17 lines = sc.textFile("/vagrant/marvel-graph-sm.txt")
343 | 18
344 | 19 pairings = lines.map(countCoOccurences)
345 | 20
346 | 21 totalFriendsByCharacter = pairings.reduceByKey(lambda x, y : x + y)
347 | 22
348 | 23 flipped = totalFriendsByCharacter.map(lambda (x,y) : (y,x))
349 | 24 mostPopular = flipped.max()
350 | 25 mostPopularName = namesRdd.lookup(mostPopular[1])[0]
351 | 26
352 | 27 print mostPopularName + " is the most popular superhero, with " + \
353 | 28     str(mostPopular[0]) + " co-appearances."
354 | 
355 | ```
356 | #####step-by-step
357 | - Map the input data to (heroId, numberOfOccurrences) per line. read in the lines and, for each one, count the co-occurrences by splitting the line into words with split() and then subtracting one for the superhero id itself
358 | ```python
359 |  6 def countCoOccurences(line):
360 |  7     elements = line.split()
361 |  8     return (int(elements[0]), len(elements) - 1)
362 |  9 ...
363 | 13
364 | 17 lines = sc.textFile("/vagrant/marvel-graph-sm.txt")
365 | 18
366 | 19 pairings = lines.map(countCoOccurences)
367 | ```
368 | this will yield the superhero id and the count of associations for each line of the dataset
369 | ```
370 | (1742, 14)
371 | (1743, 41)
372 | (3308, 47)
373 | (3309, 7)
374 | (5494, 6)
375 | ```
376 | 
377 | - Add up the co-occurrences by heroId using reduceByKey(). we know this function will group by the key and total up the counts (occurrences)
378 | ```python
379 | 21 totalFriendsByCharacter = pairings.reduceByKey(lambda x, y : x + y)
380 | ```
381 | - Flip (map) the RDD to (number, heroId). We need to flip the K and V so that max() compares on the count.
382 | ```python 383 | 23 flipped = totalFriendsByCharacter.map(lambda (x,y) : (y,x)) 384 | ``` 385 | - Use max() on the RDD to find the hero with the most co-occurences 386 | ```python 387 | 24 mostPopular = flipped.max() 388 | ``` 389 | - Look up the name of the most popular 390 | ```python 391 | 25 mostPopularName = namesRdd.lookup(mostPopular[1])[0] 392 | ``` 393 | 394 | -------------------------------------------------------------------------------- /Total-Amount-By-Customer.py: -------------------------------------------------------------------------------- 1 | 2 | from pyspark import SparkConf, SparkContext 3 | 4 | conf = SparkConf().setMaster("local").setAppName("TotalAmountByCustomer") 5 | sc = SparkContext(conf = conf) 6 | 7 | def parseLine(line): 8 | fields = line.split(',') 9 | customerId = int(fields[0]) 10 | itemId = fields[1] 11 | itemAmt = float(fields[2]) 12 | return (customerId, itemAmt) 13 | 14 | input = sc.textFile("/vagrant/customer-orders.csv") 15 | orders = input.map(parseLine) 16 | totals = orders.reduceByKey(lambda x, y: x + y) 17 | sortedTotals = totals.map(lambda (x,y): (y,x)).sortByKey(False) 18 | 19 | results = sortedTotals .collect() 20 | for result in results: 21 | # print result 22 | print result[1],"\t${:.2f}".format(result[0]) 23 | -------------------------------------------------------------------------------- /Vagrantfile: -------------------------------------------------------------------------------- 1 | 2 | Vagrant.configure(2) do |config| 3 | 4 | config.vm.box = "petergdoyle/CentOS-7-x86_64-Minimal-1503-01" 5 | 6 | config.vm.provider "virtualbox" do |vb| 7 | vb.customize ["modifyvm", :id, "--cpuexecutioncap", "80"] 8 | vb.cpus=2 9 | vb.memory = "2048" 10 | end 11 | 12 | config.vm.provision "shell", inline: <<-SHELL 13 | 14 | #best to update the os 15 | yum -y update && yum -y clean 16 | yum -y install vim htop curl wget tree unzip bash-completion 17 | 18 | 19 | eval "yum repolist |grep 'epel/x86_64'" > /dev/null 2>&1 20 | if [ $? -eq 1 ]; then 21 | yum -y install epel-release 22 | else 23 | echo -e "\e[7;44;96m*epel-release already appears to be installed. skipping." 24 | fi 25 | 26 | eval 'python' > /dev/null 2>&1 27 | if [ $? -eq 127 ]; then 28 | yum install -y python34 29 | else 30 | echo -e "\e[7;44;96m*python34 already appears to be installed. skipping." 31 | fi 32 | 33 | eval 'pip -help' > /dev/null 2>&1 34 | if [ $? -eq 127 ]; then 35 | yum -y install python-pip 36 | else 37 | echo -e "\e[7;44;96m*python-pip already appears to be installed. skipping." 38 | fi 39 | 40 | eval 'java -version' > /dev/null 2>&1 41 | if [ $? 
-eq 127 ]; then 42 | mkdir -p /usr/java 43 | #install java jdk 8 from oracle 44 | curl -O -L --header "Cookie: gpw_e24=http%3A%2F%2Fwww.oracle.com%2F; oraclelicense=accept-securebackup-cookie" \ 45 | "http://download.oracle.com/otn-pub/java/jdk/8u60-b27/jdk-8u60-linux-x64.tar.gz" \ 46 | && tar -xvf jdk-8u60-linux-x64.tar.gz -C /usr/java \ 47 | && ln -s /usr/java/jdk1.8.0_60/ /usr/java/default \ 48 | && rm -f jdk-8u60-linux-x64.tar.gz 49 | 50 | alternatives --install "/usr/bin/java" "java" "/usr/java/default/bin/java" 99999; \ 51 | alternatives --install "/usr/bin/javac" "javac" "/usr/java/default/bin/javac" 99999; \ 52 | alternatives --install "/usr/bin/javaws" "javaws" "/usr/java/default/bin/javaws" 99999; \ 53 | alternatives --install "/usr/bin/jvisualvm" "jvisualvm" "/usr/java/default/bin/jvisualvm" 99999 54 | 55 | export JAVA_HOME=/usr/java/default 56 | cat >/etc/profile.d/java.sh <<-EOF 57 | export JAVA_HOME=$JAVA_HOME 58 | EOF 59 | 60 | else 61 | echo -e "\e[7;44;96m*jdk-8u60-linux-x64 already appears to be installed. skipping." 62 | fi 63 | 64 | 65 | if [ ! -d /usr/spark/spark-1.6.1-bin-hadoop2.6/ ]; then 66 | mkdir -p /usr/spark 67 | curl -O -L http://www-eu.apache.org/dist/spark/spark-1.6.1/spark-1.6.1-bin-hadoop2.6.tgz \ 68 | && tar -xvf spark-1.6.1-bin-hadoop2.6.tgz -C /usr/spark \ 69 | && ln -s /usr/spark/spark-1.6.1-bin-hadoop2.6/ /usr/spark/default \ 70 | && rm -f spark-1.6.1-bin-hadoop2.6.tgz 71 | 72 | export SPARK_HOME=/usr/spark/default 73 | cat >/etc/profile.d/spark.sh <<-EOF 74 | export SPARK_HOME=$SPARK_HOME 75 | EOF 76 | 77 | #set log levels 78 | cp /usr/spark/default/conf/log4j.properties.template /usr/spark/default/conf/log4j.properties 79 | sed -i 's/INFO/ERROR/g' /usr/spark/default/conf/log4j.properties 80 | 81 | #install executeable files 82 | for each in $(find /usr/spark/default/bin/ -executable -type f) ; do 83 | name=$(basename $each) 84 | alternatives --install "/usr/bin/$name" "$name" "$each" 99999 85 | done 86 | 87 | 88 | else 89 | echo -e "\e[7;44;96m*spark-1.6.1 already appears to be installed. skipping." 90 | fi 91 | 92 | #course material 93 | if [ ! -d ml-100k ]; then 94 | curl -O http://files.grouplens.org/datasets/movielens/ml-100k.zip \ 95 | && unzip ml-100k.zip \ 96 | && rm -f ml-100k.zip 97 | fi 98 | 99 | if [ ! 
-f ratings-counter.py ]; then 100 | curl -O https://raw.githubusercontent.com/minimav/udemy_spark/master/ratings-counter.py 101 | fi 102 | 103 | 104 | 105 | #set hostname 106 | hostnamectl set-hostname SparkCourse.vbx 107 | 108 | SHELL 109 | end 110 | -------------------------------------------------------------------------------- /Word-Count-Better-Sorted.py: -------------------------------------------------------------------------------- 1 | import re 2 | from pyspark import SparkConf, SparkContext 3 | 4 | def normalizeWords(text): 5 | return re.compile(r'\W+', re.UNICODE).split(text.lower()) 6 | 7 | conf = SparkConf().setMaster("local").setAppName("WordCount") 8 | sc = SparkContext(conf = conf) 9 | 10 | input = sc.textFile("file:///sparkcourse/book.txt") 11 | words = input.flatMap(normalizeWords) 12 | 13 | wordCounts = words.map(lambda x: (x, 1)).reduceByKey(lambda x, y: x + y) 14 | wordCountsSorted = wordCounts.map(lambda (x,y): (y,x)).sortByKey() 15 | 16 | results = wordCountsSorted.collect() 17 | 18 | for result in results: 19 | count = str(result[0]) 20 | word = result[1].encode('ascii', 'ignore') 21 | if (word): 22 | print word + ":\t\t" + count 23 | -------------------------------------------------------------------------------- /Word-Count-Better.py: -------------------------------------------------------------------------------- 1 | import re 2 | from pyspark import SparkConf, SparkContext 3 | 4 | def normalizeWords(text): 5 | return re.compile(r'\W+', re.UNICODE).split(text.lower()) 6 | 7 | conf = SparkConf().setMaster("local").setAppName("WordCount") 8 | sc = SparkContext(conf = conf) 9 | 10 | input = sc.textFile("file:///sparkcourse/book.txt") 11 | words = input.flatMap(normalizeWords) 12 | wordCounts = words.countByValue() 13 | 14 | for word, count in wordCounts.items(): 15 | cleanWord = word.encode('ascii', 'ignore') 16 | if (cleanWord): 17 | print cleanWord, count 18 | -------------------------------------------------------------------------------- /Word-Count.py: -------------------------------------------------------------------------------- 1 | import re 2 | from pyspark import SparkConf, SparkContext 3 | 4 | def normalizeWords(text): 5 | return re.compile(r'\W+', re.UNICODE).split(text.lower()) 6 | 7 | bad_word_list = ['', 's', 't'] 8 | 9 | conf = SparkConf().setMaster("local").setAppName("WordCount") 10 | sc = SparkContext(conf = conf) 11 | 12 | input = sc.textFile("/vagrant/Book.txt") 13 | #a simple split works 14 | #words = input.flatMap(lambda x: x.split()) 15 | #but let's clean the text up a bit and filter out special characters and consider upper and lowercase to be the same thing 16 | words = input.flatMap(normalizeWords) 17 | # results = words.collect() 18 | 19 | # #python function 20 | # #wordCounts = words.countByValue() 21 | 22 | #take a spark approach... 23 | #convert each word to a key/value pair with a value of 1 24 | wordCountsMap = words.map(lambda x: (x,1)).filter(lambda (x, y): x not in bad_word_list) 25 | # results = wordCountsMap.collect() 26 | 27 | 28 | #count words with reduceByKey so reduceByKey will build a Set of each work and count the 1s! 
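#   e.g. reduceByKey groups by word and sums the 1s: ('the', 1), ('the', 1), ('the', 1) -> ('the', 3)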
29 | wordCountsReduced = wordCountsMap.reduceByKey(lambda x, y: x + y) 30 | # results = wordCountsReduced.collect() 31 | 32 | wordCountsSorted = wordCountsReduced.map(lambda (x,y): (y,x)).sortByKey() 33 | results = wordCountsSorted.collect() 34 | 35 | for result in results: 36 | print result 37 | # count = str(result[0]) 38 | # word = result[1].encode('ascii', 'ignore') 39 | # if (word): 40 | # print word + ":\t\t" + count 41 | -------------------------------------------------------------------------------- /degrees-of-separation.py: -------------------------------------------------------------------------------- 1 | #Boilerplate stuff: 2 | from pyspark import SparkConf, SparkContext 3 | 4 | conf = SparkConf().setMaster("local").setAppName("DegreesOfSeparation") 5 | sc = SparkContext(conf = conf) 6 | 7 | # The characters we wish to find the degree of separation between: 8 | startCharacterID = 1562 #SpiderMan 9 | targetCharacterID = 12 #ADAM 3,031 (who?) 10 | 11 | # Our accumulator, used to signal when we find the target character during 12 | # our BFS traversal. 13 | hitCounter = sc.accumulator(0) 14 | 15 | def convertToBFS(line): 16 | fields = line.split() 17 | heroID = int(fields[0]) 18 | connections = [] 19 | for connection in fields[1:]: 20 | connections.append(int(connection)) 21 | 22 | color = 'WHITE' 23 | distance = 9999 24 | 25 | if (heroID == startCharacterID): 26 | color = 'GRAY' 27 | distance = 0 28 | 29 | return (heroID, (connections, distance, color)) 30 | 31 | 32 | def createStartingRdd(): 33 | inputFile = sc.textFile("/vagrant/Marvel-Graph-1562.txt") 34 | return inputFile.map(convertToBFS) 35 | 36 | def bfsMap(node): 37 | characterID = node[0] 38 | data = node[1] 39 | connections = data[0] 40 | distance = data[1] 41 | color = data[2] 42 | 43 | results = [] 44 | 45 | #If this node needs to be expanded... 46 | if (color == 'GRAY'): 47 | for connection in connections: 48 | newCharacterID = connection 49 | newDistance = distance + 1 50 | newColor = 'GRAY' 51 | if (targetCharacterID == connection): 52 | hitCounter.add(1) 53 | 54 | newEntry = (newCharacterID, ([], newDistance, newColor)) 55 | results.append(newEntry) 56 | 57 | #We've processed this node, so color it black 58 | color = 'BLACK' 59 | 60 | #Emit the input node so we don't lose it. 61 | results.append( (characterID, (connections, distance, color)) ) 62 | return results 63 | 64 | def bfsReduce(data1, data2): 65 | edges1 = data1[0] 66 | edges2 = data2[0] 67 | distance1 = data1[1] 68 | distance2 = data2[1] 69 | color1 = data1[2] 70 | color2 = data2[2] 71 | 72 | distance = 9999 73 | color = 'WHITE' 74 | edges = [] 75 | 76 | # See if one is the original node with its connections. 77 | # If so preserve them. 
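# (bfsMap emits bare frontier entries like (id, ([], newDistance, 'GRAY')) alongside the original
#  (id, (connections, distance, color)) records; merging keeps the non-empty edge lists, the
#  minimum distance, and the darkest color.)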
78 | if (len(edges1) > 0): 79 | edges = edges1 80 | if (len(edges2) > 0): 81 | for connection in edges2: 82 | edges.append(connection) 83 | 84 | # Preserve minimum distance 85 | if (distance1 < distance): 86 | distance = distance1 87 | 88 | if (distance2 < distance): 89 | distance = distance2 90 | 91 | # Preserve darkest color 92 | if (color1 == 'WHITE' and (color2 == 'GRAY' or color2 == 'BLACK')): 93 | color = color2 94 | 95 | if (color1 == 'GRAY' and color2 == 'BLACK'): 96 | color = color2 97 | 98 | if (color2 == 'WHITE' and (color1 == 'GRAY' or color1 == 'BLACK')): 99 | color = color1 100 | 101 | if (color2 == 'GRAY' and color1 == 'BLACK'): 102 | color = color1 103 | 104 | return (edges, distance, color) 105 | 106 | 107 | #Main program here: 108 | iterationRdd = createStartingRdd() 109 | results = iterationRdd.collect() 110 | for result in results: 111 | print result 112 | 113 | arbitraryUpperBound=1 114 | for iteration in range(0, arbitraryUpperBound): 115 | print "Running BFS iteration# " + str(iteration+1) 116 | 117 | # Create new vertices as needed to darken or reduce distances in the 118 | # reduce stage. If we encounter the node we're looking for as a GRAY 119 | # node, increment our accumulator to signal that we're done. 120 | mapped = iterationRdd.flatMap(bfsMap) 121 | 122 | # Note that mapped.count() action here forces the RDD to be evaluated, and 123 | # that's the only reason our accumulator is actually updated. 124 | print "Processing " + str(mapped.count()) + " values." 125 | 126 | if (hitCounter.value > 0): 127 | print "Hit the target character! From " + str(hitCounter.value) \ 128 | + " different direction(s)." 129 | break 130 | 131 | # Reducer combines data for each character ID, preserving the darkest 132 | # color and shortest path. 133 | iterationRdd = mapped.reduceByKey(bfsReduce) 134 | 135 | results = iterationRdd.collect() 136 | for result in results: 137 | print result 138 | -------------------------------------------------------------------------------- /download_movielens_datasets.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | curl -O http://files.grouplens.org/datasets/movielens/ml-100k.zip \ 4 | && unzip ml-100k.zip 5 | -------------------------------------------------------------------------------- /install_spark_executables.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | for each in $(find /usr/spark/default/bin/ -executable -type f) ; do 4 | name=$(basename $each) 5 | alternatives --install "/usr/bin/$name" "$name" "$each" 99999 6 | done 7 | -------------------------------------------------------------------------------- /log4j.properties: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 | 
18 | # Set everything to be logged to the console
19 | log4j.rootCategory=ERROR, console
20 | log4j.appender.console=org.apache.log4j.ConsoleAppender
21 | log4j.appender.console.target=System.err
22 | log4j.appender.console.layout=org.apache.log4j.PatternLayout
23 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n
24 | 
25 | # Settings to quiet third party logs that are too verbose
26 | log4j.logger.org.spark-project.jetty=ERROR
27 | log4j.logger.org.spark-project.jetty.util.component.AbstractLifeCycle=ERROR
28 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=ERROR
29 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=ERROR
30 | log4j.logger.org.apache.parquet=ERROR
31 | log4j.logger.parquet=ERROR
32 | 
33 | # SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support
34 | log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL
35 | log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR
36 | 
--------------------------------------------------------------------------------
/ratings-counter.py:
--------------------------------------------------------------------------------
 1 | from pyspark import SparkConf, SparkContext
 2 | import collections
 3 | 
 4 | conf = SparkConf().setMaster("local").setAppName("RatingsHistogram")
 5 | sc = SparkContext(conf = conf)
 6 | 
 7 | lines = sc.textFile("/vagrant/ml-100k/u.data")
 8 | ratings = lines.map(lambda x: x.split()[2])
 9 | result = ratings.countByValue()
10 | 
11 | sortedResults = collections.OrderedDict(sorted(result.items()))
12 | for key, value in sortedResults.iteritems():
13 |     print "%s %i" % (key, value)
14 | 
--------------------------------------------------------------------------------
/scratch.py:
--------------------------------------------------------------------------------
 1 | from pyspark import SparkConf, SparkContext
 2 | import collections
 3 | 
 4 | conf = SparkConf().setMaster("local").setAppName("RatingsHistogram")
 5 | sc = SparkContext(conf = conf)
 6 | 
 7 | rdd = sc.parallelize([1,2,3,4,5])
 8 | # map() is lazy and a Python 2 print statement cannot appear inside a lambda,
 9 | # so transform first and print the collected results
10 | squares = rdd.map(lambda x: x*x)
11 | for value in squares.collect():
12 |     print " %i" % value
13 | 
--------------------------------------------------------------------------------
/vagrant/setup.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | 
3 | echo "hello from setup.sh"
4 | 
--------------------------------------------------------------------------------
/vagrant/setup2.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | 
3 | echo "hello from setup2.sh"
4 | 
--------------------------------------------------------------------------------