├── .gitignore ├── CourseContent ├── data │ └── README.md └── src │ ├── README.md │ └── ratings-counter.py ├── Dockerfile ├── Friends-By-Age.py ├── Max-Temperatures.py ├── Min-Temperatures.py ├── Most-Often-Watched-Movie.py ├── Most-Popular-Superhero.py ├── Popular-Movies-Nicer.py ├── README.md ├── Total-Amount-By-Customer.py ├── Vagrantfile ├── Word-Count-Better-Sorted.py ├── Word-Count-Better.py ├── Word-Count.py ├── degrees-of-separation.py ├── download_movielens_datasets.sh ├── install_spark_executables.sh ├── log4j.properties ├── ratings-counter.py ├── scratch.py └── vagrant ├── setup.sh └── setup2.sh /.gitignore: -------------------------------------------------------------------------------- 1 | build 2 | 3 | .vagrant 4 | 5 | *.out 6 | *.err 7 | *.gz 8 | *.tar 9 | .DS_Store 10 | ml-100k/ 11 | *.*~ 12 | .idea/ 13 | *.csv 14 | *.txt 15 | *.data 16 | *.zip 17 | -------------------------------------------------------------------------------- /CourseContent/data/README.md: -------------------------------------------------------------------------------- 1 | data directory 2 | -------------------------------------------------------------------------------- /CourseContent/src/README.md: -------------------------------------------------------------------------------- 1 | src directory 2 | -------------------------------------------------------------------------------- /CourseContent/src/ratings-counter.py: -------------------------------------------------------------------------------- 1 | from pyspark import SparkConf, SparkContext 2 | import collections 3 | 4 | conf = SparkConf().setMaster("local").setAppName("RatingsHistogram") 5 | sc = SparkContext(conf = conf) 6 | 7 | lines = sc.textFile("file:///SparkCourse/ml-100k/u.data") 8 | ratings = lines.map(lambda x: x.split()[2]) 9 | result = ratings.countByValue() 10 | 11 | sortedResults = collections.OrderedDict(sorted(result.items())) 12 | for key, value in sortedResults.iteritems(): 13 | print "%s %i" % (key, value) 14 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM centos:latest 2 | 3 | RUN yum -y update && yum clean all \ 4 | && yum -y install epel-release \ 5 | && yum -y install vim bash-completion tree git curl wget telnet 6 | 7 | RUN yum install -y python34 \ 8 | && yum -y install python-pip 9 | 10 | 11 | ENV JAVA_HOME=/usr/java/default 12 | RUN mkdir -p /usr/java \ 13 | && curl -O -L --header "Cookie: gpw_e24=http%3A%2F%2Fwww.oracle.com%2F; oraclelicense=accept-securebackup-cookie" \ 14 | "http://download.oracle.com/otn-pub/java/jdk/8u60-b27/jdk-8u60-linux-x64.tar.gz" \ 15 | && tar -xvf jdk-8u60-linux-x64.tar.gz -C /usr/java \ 16 | && ln -s /usr/java/jdk1.8.0_60/ /usr/java/default \ 17 | && rm -f jdk-8u60-linux-x64.tar.gz 18 | 19 | 20 | ENV SPARK_HOME=/usr/spark/default 21 | RUN mkdir -p /usr/spark \ 22 | && curl -O -L http://www-eu.apache.org/dist/spark/spark-1.6.1/spark-1.6.1-bin-hadoop2.6.tgz \ 23 | && tar -xvf spark-1.6.1-bin-hadoop2.6.tgz -C /usr/spark \ 24 | && ln -s /usr/spark/spark-1.6.1-bin-hadoop2.6/ /usr/spark/default \ 25 | && rm -f spark-1.6.1-bin-hadoop2.6.tgz 26 | 27 | COPY log4j.properties /usr/spark/default/conf/log4j.properties 28 | COPY install_spark_executables.sh /install_spark_executables.sh 29 | RUN /install_spark_executables.sh 30 | -------------------------------------------------------------------------------- /Friends-By-Age.py: 
-------------------------------------------------------------------------------- 1 | from pyspark import SparkConf, SparkContext 2 | 3 | conf = SparkConf().setMaster("local").setAppName("FriendsByAge") 4 | sc = SparkContext(conf = conf) 5 | 6 | def parseLine(line): 7 | fields = line.split(',') 8 | age = int(fields[2]) 9 | numFriends = int(fields[3]) 10 | return (age, numFriends) 11 | 12 | lines = sc.textFile("/vagrant/fakefriends.csv") 13 | rdd = lines.map(parseLine) 14 | #results = rdd.collect() 15 | 16 | groupByAge = rdd.mapValues(lambda x: (x,1)) 17 | #results = groupByAge.collect() 18 | 19 | totalsByAge = groupByAge.reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1])) 20 | #results = totalsByAge.collect() 21 | 22 | averagesByAge = totalsByAge.mapValues(lambda x: x[0] / x[1]) 23 | results = averagesByAge.collect() 24 | 25 | for result in results: 26 | print result 27 | -------------------------------------------------------------------------------- /Max-Temperatures.py: -------------------------------------------------------------------------------- 1 | from pyspark import SparkConf, SparkContext 2 | 3 | conf = SparkConf().setMaster("local").setAppName("MinTemperatures") 4 | sc = SparkContext(conf = conf) 5 | 6 | def parseLine(line): 7 | fields = line.split(',') 8 | stationID = fields[0] 9 | entryType = fields[2] 10 | temperature = float(fields[3]) * 0.1 * (9.0 / 5.0) + 32.0 11 | return (stationID, entryType, temperature) 12 | 13 | lines = sc.textFile("/vagrant/1800.csv") 14 | parsedLines = lines.map(parseLine) 15 | #results = parsedLines.collect() 16 | 17 | minTemps = parsedLines.filter(lambda x: "TMAX" in x[1]) 18 | #results = minTemps.collect() 19 | 20 | stationTemps = minTemps.map(lambda x: (x[0], x[2])) 21 | #results = stationTemps.collect() 22 | 23 | minTemps = stationTemps.reduceByKey(lambda x, y: max(x,y)) 24 | results = minTemps.collect() 25 | 26 | for result in results: 27 | #print result 28 | print result[0] + "\t{:.2f}F".format(result[1]) 29 | -------------------------------------------------------------------------------- /Min-Temperatures.py: -------------------------------------------------------------------------------- 1 | from pyspark import SparkConf, SparkContext 2 | 3 | conf = SparkConf().setMaster("local").setAppName("MinTemperatures") 4 | sc = SparkContext(conf = conf) 5 | 6 | def parseLine(line): 7 | fields = line.split(',') 8 | stationID = fields[0] 9 | entryType = fields[2] 10 | temperature = float(fields[3]) * 0.1 * (9.0 / 5.0) + 32.0 11 | return (stationID, entryType, temperature) 12 | 13 | lines = sc.textFile("/vagrant/1800.csv") 14 | parsedLines = lines.map(parseLine) 15 | #results = parsedLines.collect() 16 | 17 | minTemps = parsedLines.filter(lambda x: "TMIN" in x[1]) 18 | #results = minTemps.collect() 19 | 20 | stationTemps = minTemps.map(lambda x: (x[0], x[2])) 21 | #results = stationTemps.collect() 22 | 23 | minTemps = stationTemps.reduceByKey(lambda x, y: min(x,y)) 24 | results = minTemps.collect() 25 | 26 | for result in results: 27 | #print result 28 | print result[0] + "\t{:.2f}F".format(result[1]) 29 | -------------------------------------------------------------------------------- /Most-Often-Watched-Movie.py: -------------------------------------------------------------------------------- 1 | 2 | from pyspark import SparkConf, SparkContext 3 | 4 | conf = SparkConf().setMaster("local").setAppName("MostOftenWatchedMovies") 5 | sc = SparkContext(conf = conf) 6 | 7 | def parseLineAll(line): 8 | fields = line.split() 9 | userId = int(fields[0]) 10 | 
movieId = int(fields[1]) 11 | rating = int(fields[2]) 12 | timestamp = long(fields[3]) 13 | return (userId,(movieId,rating,timestamp)) 14 | 15 | def parseLine(line): 16 | fields = line.split() 17 | movieId = int(fields[1]) 18 | return movieId 19 | 20 | input = sc.textFile("/vagrant/u.data") 21 | rdd1 = input.map(parseLine) #parse each line of the input db 22 | rdd2 = rdd1.map(lambda x: (x,1)) 23 | rdd3 = rdd2.reduceByKey(lambda x,y: x+y) 24 | 25 | rdd4 = rdd3.map(lambda (x,y): (y,x)).sortByKey() 26 | 27 | results = rdd4.collect() 28 | for result in results: 29 | print result 30 | -------------------------------------------------------------------------------- /Most-Popular-Superhero.py: -------------------------------------------------------------------------------- 1 | from pyspark import SparkConf, SparkContext 2 | 3 | conf = SparkConf().setMaster("local").setAppName("PopularHero") 4 | sc = SparkContext(conf = conf) 5 | 6 | def countCoOccurences(line): 7 | elements = line.split() 8 | return (int(elements[0]), len(elements) - 1) 9 | 10 | def parseNames(line): 11 | fields = line.split('\"') 12 | return (int(fields[0]), fields[1].encode("utf8")) 13 | 14 | names = sc.textFile("/vagrant/marvel-names.txt") 15 | namesRdd = names.map(parseNames) 16 | 17 | lines = sc.textFile("/vagrant/marvel-graph.txt") 18 | 19 | pairings = lines.map(countCoOccurences) 20 | 21 | results = pairings.collect() 22 | 23 | for result in results: 24 | print result 25 | 26 | 27 | # totalFriendsByCharacter = pairings.reduceByKey(lambda x, y : x + y) 28 | # 29 | # flipped = totalFriendsByCharacter.map(lambda (x,y) : (y,x)) 30 | # mostPopular = flipped.max() 31 | # mostPopularName = namesRdd.lookup(mostPopular[1])[0] 32 | # 33 | # print mostPopularName + " is the most popular superhero, with " + \ 34 | # str(mostPopular[0]) + " co-appearances." 
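# Note: uncommenting the block above completes the exercise: reduceByKey() totals the co-appearances
# per hero id, flipped.max() picks the hero with the largest total, and namesRdd.lookup(id) returns
# the list of names stored under that id (hence the [0]).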
35 | 
--------------------------------------------------------------------------------
/Popular-Movies-Nicer.py:
--------------------------------------------------------------------------------
 1 | from pyspark import SparkConf, SparkContext
 2 | 
 3 | def loadMovieNames():
 4 |     movieNames = {}
 5 |     with open("ml-100k/u.ITEM") as f:
 6 |         for line in f:
 7 |             fields = line.split('|')
 8 |             movieNames[int(fields[0])] = fields[1]
 9 |     return movieNames
10 | 
11 | conf = SparkConf().setMaster("local").setAppName("PopularMovies")
12 | sc = SparkContext(conf = conf)
13 | 
14 | nameDict = sc.broadcast(loadMovieNames())
15 | 
16 | lines = sc.textFile("/vagrant/ml-100k/u.data")
17 | movies = lines.map(lambda x: (int(x.split()[1]), 1))
18 | movieCounts = movies.reduceByKey(lambda x, y: x + y)
19 | 
20 | flipped = movieCounts.map( lambda (x, y) : (y, x))
21 | sortedMovies = flipped.sortByKey()
22 | 
23 | sortedMoviesWithNames = sortedMovies.map(lambda (count, movie) : (nameDict.value[movie], count))
24 | 
25 | results = sortedMoviesWithNames.collect()
26 | 
27 | for result in results:
28 |     print result
29 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # SparkCourse
 2 | ## Taming Big Data with Apache Spark and Python - Hands On
 3 | 
 4 | ####Resources and downloads
 5 | https://www.udemy.com/taming-big-data-with-apache-spark-hands-on/learn/v4/content
 6 | 
 7 | 
 8 | ####12.[Activity] Running the Average Friends by Age Example
 9 | #####Social Network Dataset
10 | the original dataset is in the form (id,name,age,number_of_friends): fakefriends.csv
11 | ```
12 | 0,Will,33,385
13 | 1,Jean-Luc,26,2
14 | 2,Hugh,55,221
15 | 3,Deanna,40,465
16 | 4,Quark,68,21
17 | 5,Weyoun,59,318
18 | 6,Gowron,37,220
19 | 7,Will,54,307
20 | 8,Jadzia,38,380
21 | 9,Hugh,27,181
22 | 10,Odo,53,191
23 | ...
24 | ```
25 | 
26 | #####Friends-By-Age.py
27 | the pyspark program that works on the dataset is given here
28 | ```python
29 |  1 from pyspark import SparkConf, SparkContext
30 |  2
31 |  3 conf = SparkConf().setMaster("local").setAppName("FriendsByAge")
32 |  4 sc = SparkContext(conf = conf)
33 |  5
34 |  6 def parseLine(line):
35 |  7     fields = line.split(',')
36 |  8     age = int(fields[2])
37 |  9     numFriends = int(fields[3])
38 | 10     return (age, numFriends)
39 | 11
40 | 12 lines = sc.textFile("/vagrant/fakefriends.csv")
41 | 13 rdd = lines.map(parseLine)
42 | 14 #results = rdd.collect()
43 | 15
44 | 16 groupByAge = rdd.mapValues(lambda x: (x,1))
45 | 17 #results = groupByAge.collect()
46 | 18
47 | 19 totalsByAge = groupByAge.reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1]))
48 | 20 #results = totalsByAge.collect()
49 | 21
50 | 22 averagesByAge = totalsByAge.mapValues(lambda x: x[0] / x[1])
51 | 23 results = averagesByAge.collect()
52 | 24
53 | 25 for result in results:
54 | 26     print result
55 | ```
56 | 
57 | 
58 | 
59 | 
60 | 
61 | 
62 | 
63 | #####step-by-step
64 | 
65 | define a function that can be mapped onto the dataset. 'parseLine' will accept a line of input and split the comma-separated line into fields. we are only interested in the 3rd and 4th fields, and they need to be cast as integers.
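as a quick sanity check, the parsing logic can be tried on a single record before handing the job to spark-submit (a minimal standalone sketch, assuming a sample row in the fakefriends.csv format shown above):

```python
def parseLine(line):
    fields = line.split(',')
    age = int(fields[2])
    numFriends = int(fields[3])
    return (age, numFriends)

print parseLine("0,Will,33,385")   # -> (33, 385)
```

the same function as it appears in Friends-By-Age.py: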
66 | 
67 | ```python
68 |  6 def parseLine(line):
69 |  7     fields = line.split(',')
70 |  8     age = int(fields[2])
71 |  9     numFriends = int(fields[3])
72 | 10     return (age, numFriends)
73 | 11
74 | ```
75 | 
76 | build the first rdd by mapping the parseLine function onto each item (line) in the dataset. parseLine will emit the 3rd and 4th fields of each line into the new rdd.
77 | 
78 | ```python
79 | 12 lines = sc.textFile("/vagrant/fakefriends.csv")
80 | 13 rdd = lines.map(parseLine)
81 | 14 results = rdd.collect()
82 | ...
83 | 25 for result in results:
84 | 26     print result
85 | ```
86 | if we output the contents of this rdd and filter for entries where the age (3rd field) is 43, we get the following:
87 | ```
88 | [vagrant@sparkcourse vagrant]$ spark-submit Friends-By-Age.py |grep '(43,'
89 | (43, 49)
90 | (43, 249)
91 | (43, 404)
92 | (43, 101)
93 | (43, 48)
94 | (43, 335)
95 | (43, 428)
96 | ```
97 | okay, now let's build the second rdd by grouping the new dataset by age. ultimately we are determining the average number of friends per age, so we need to total the friends for a particular age and then divide by the number of entries for that age. the new dataset will look like this: (K,V), or (age, (friends,1)). the mapValues spark function takes the old (age,friends) dataset and emits the new one with a 1 attached to each value so that a count can be performed later.
98 | 
99 | ```python
100 | 16 groupByAge = rdd.mapValues(lambda x: (x,1))
101 | 17 results = groupByAge.collect()
102 | ...
103 | 25 for result in results:
104 | 26     print result
105 | ```
106 | and so the output (for age 43) looks like this:
107 | ```
108 | [vagrant@sparkcourse vagrant]$ spark-submit Friends-By-Age.py |grep '(43,'
109 | (43, (49, 1))
110 | (43, (249, 1))
111 | (43, (404, 1))
112 | (43, (101, 1))
113 | (43, (48, 1))
114 | (43, (335, 1))
115 | (43, (428, 1))
116 | ```
117 | okay, so now we need to tally the friends for each age along with the number of entries for that age. that can be done with a reduceByKey function (which collapses rows that share the same key): it takes the set of values for each age (the key) and applies a function that adds the friend totals (x[0] + y[0]) and the counts (x[1] + y[1]), emitting a (total, count) pair for each age.
118 | 
119 | ```python
120 | 19 totalsByAge = groupByAge.reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1]))
121 | 20 results = totalsByAge.collect()
122 | ...
123 | 25 for result in results:
124 | 26     print result
125 | ```
126 | and the output (for age 43) looks like this:
127 | ```
128 | [vagrant@sparkcourse vagrant]$ spark-submit Friends-By-Age.py |grep '(43,'
129 | (43, (1614, 7))
130 | ```
131 | and finally we need to divide the total by the count in order to get the average for each age. to do this we map a function onto each item in the dataset that does the division and emits a new key/value pair containing the age and the average number of friends. note that ```x[0] / x[1]``` is integer division under Python 2, so the average is truncated to a whole number.
132 | 
133 | ```python
134 | 22 averagesByAge = totalsByAge.mapValues(lambda x: x[0] / x[1])
135 | 23 results = averagesByAge.collect()
136 | 24
137 | 25 for result in results:
138 | 26     print result
139 | ```
140 | and the output looks like this (for age 43). so for age 43 in this dataset the average is 230 friends.
141 | ```
142 | [vagrant@sparkcourse vagrant]$ spark-submit Friends-By-Age.py |grep '(43,'
143 | (43, 230)
144 | ```
145 | 
146 | ####16.[Activity] Counting Word Occurrences using flatmap()
147 | The data for this exercise is a book in plain text form: Book.txt. The objective is to get a word count for every word in the book.
148 | 
149 | ```python
150 |  1 import re
151 |  2 from pyspark import SparkConf, SparkContext
152 |  3
153 |  4 def normalizeWords(text):
154 |  5     return re.compile(r'\W+', re.UNICODE).split(text.lower())
155 |  6
156 |  7 conf = SparkConf().setMaster("local").setAppName("WordCount")
157 |  8 sc = SparkContext(conf = conf)
158 |  9
159 | 10 input = sc.textFile("/vagrant/Book.txt")
160 | 11 #a simple split works
161 | 12 #words = input.flatMap(lambda x: x.split())
162 | 13 #but let's clean the text up a bit and filter out special characters and consider upper and lowercase to be the same thing
163 | 14 words = input.flatMap(normalizeWords)
164 | 15 # results = words.collect()
165 | 16 # for result in results:
166 | 17 #     print result
167 | 18
168 | 19 # #python function
169 | 20 # #wordCounts = words.countByValue()
170 | 21
171 | 22 #take a spark approach...
172 | 23 #convert each word to a key/value pair with a value of 1
173 | 24 wordCountsMap = words.map(lambda x: (x,1))
174 | 25 # results = wordCountsMap.collect()
175 | 26 # for result in results:
176 | 27 #     print result
177 | 28
178 | 29
179 | 30 #count words with reduceByKey so reduceByKey will build a set of each word and count the 1s!
180 | 31 wordCountsReduced = wordCountsMap.reduceByKey(lambda x, y: x + y)
181 | 32 results = wordCountsReduced.collect()
182 | 33 # for result in results:
183 | 34 #     print result
184 | 35
185 | 36 wordCountsSorted = wordCountsReduced.map(lambda (x,y): (y,x)).sortByKey()
186 | 37 results = wordCountsSorted.collect()
187 | 38
188 | 39 for result in results:
189 | 40 #    print result
190 | 41     count = str(result[0])
191 | 42     word = result[1].encode('ascii', 'ignore')
192 | 43     if (word):
193 | 44         print word + ":\t\t" + count
194 | ```
195 | so to start off we need to build a dataset consisting of each word in the book. we can do this by building the first rdd from the book text and splitting each line into words (split() splits on whitespace by default). the flatMap function is the right tool here: it takes a single input (a line) and can produce multiple outputs (the words), applying the given function to each input:
196 | ```python
197 | 10 input = sc.textFile("/vagrant/Book.txt")
198 | 11 #a simple split works
199 | 12 #words = input.flatMap(lambda x: x.split())
200 | 13 #but let's clean the text up a bit and filter out special characters and consider upper and lowercase to be the same thing
201 | 14 words = input.flatMap(normalizeWords)
202 | ```
203 | taking a look at the ```words``` rdd after the content of the book is split into words:
204 | ```
205 | Self-Employment:
206 | Building
207 | an
208 | Internet
209 | Business
210 | of
211 | One
212 | Achieving
213 | Financial
214 | and
215 | Personal
216 | Freedom
217 | through
218 | a
219 | Lifestyle
220 | Technology
221 | Business
222 | By
223 | Frank
224 | Kane
225 | ...
226 | ```
227 | okay now we need to be able to provide a count (of 1) for each of the words so that eventually we can tally up the count of each word. we can do this by applying a map function to each of the words in the rdd dataset:
228 | ```python
229 | 23 #convert each word to a key/value pair with a value of 1
230 | 24 wordCountsMap = words.map(lambda x: (x,1))
231 | ```
232 | and that will yield a second rdd with the following key/value dataset:
233 | ```
234 | (u'Self-Employment:', 1)
235 | (u'Building', 1)
236 | (u'an', 1)
237 | (u'Internet', 1)
238 | (u'Business', 1)
239 | (u'of', 1)
240 | (u'One', 1)
241 | (u'Achieving', 1)
242 | (u'Financial', 1)
243 | (u'and', 1)
244 | (u'Personal', 1)
245 | (u'Freedom', 1)
246 | (u'through', 1)
247 | (u'a', 1)
248 | (u'Lifestyle', 1)
249 | (u'Technology', 1)
250 | (u'Business', 1)
251 | (u'By', 1)
252 | (u'Frank', 1)
253 | (u'Kane', 1)
254 | ...
255 | ```
256 | and now we can use a reduceByKey function that will group by each word and tally up the counts:
257 | ```python
258 | 30 #count words with reduceByKey so reduceByKey will build a set of each word and count the 1s!
259 | 31 wordCountsReduced = wordCountsMap.reduceByKey(lambda x, y: x + y)
260 | ```
261 | now we can see the counts for each word from the wordCountsReduced rdd:
262 | ```
263 | ...
264 | (u'daughters.', 2)
265 | (u'ability', 14)
266 | (u'opening', 1)
267 | (u'self-fund,', 1)
268 | (u'merit.', 1)
269 | (u'merit,', 2)
270 | (u'moz.com', 1)
271 | ...
272 | ```
273 | but things are still unordered, so let's sort them in ascending order. since the totals are stored in the value part of the key/value pair, we need to flip the key and the value with another map function and then apply sortByKey to the dataset
274 | ```python
275 | 36 wordCountsSorted = wordCountsReduced.map(lambda (x,y): (y,x)).sortByKey()
276 | ```
277 | and now the last part of the wordCountsSorted output should end with the highest word count entries:
278 | ```
279 | ...
280 | (747, u'that')
281 | (772, u'')
282 | (934, u'and')
283 | (970, u'of')
284 | (1191, u'a')
285 | (1292, u'the')
286 | (1420, u'your')
287 | (1828, u'to')
288 | (1878, u'you')
289 | ```
290 | and the formatted output looks like this:
291 | ```
292 | ...
293 | that: 747
294 | and: 934
295 | of: 970
296 | a: 1191
297 | the: 1292
298 | your: 1420
299 | to: 1828
300 | you: 1878
301 | ```
302 | 
303 | 
304 | ####24. [Activity] Find the Most Popular Superhero in a Social Graph
305 | #####Superheroes Datasets
306 | The first dataset contains the id of each superhero followed by the ids of the superheroes that have appeared together with it. The superhero with the greatest number of associations (appearances with other superheroes) is the "most popular" superhero. A superhero may appear in multiple lines (lines starting with the same superhero id), so we must count the associations both within and across the graph lines (i.e. total the counts for every line starting with the same heroId).
307 | The dataset is contained in the Marvel-graph.txt file.
308 | ```
309 | 5988 748 1722 3752 4655 5743 1872 3413 5527 6368 6085 4319 4728 1636 2397 3364 4001 1614 1819 1585 732 2660 3952 2507 3891 2070 2239 2602 612 1352 5447 4548 1596 5488 1605 5517 11 479 2554 2043 17 865 4292 6312 473 534 1479 6375 4456
310 | 5989 4080 4264 4446 3779 2430 2297 6169 3530 3272 4282 6432 2548 4140 185 105 3878 2429 1334 4595 2767 3956 3877 4776 4946 3407 128 269 5775 5121 481 5516 4758 4053 1044 1602 3889 1535 6038 533 3986
311 | ...
312 | ```
313 | the second dataset contains the superhero names keyed on the superhero id: Marvel-Names.txt
314 | ```
315 | 1 "24-HOUR MAN/EMMANUEL"
316 | 2 "3-D MAN/CHARLES CHAN"
317 | 3 "4-D MAN/MERCURIO"
318 | 4 "8-BALL/"
319 | 5 "A"
320 | 6 "A'YIN"
321 | 7 "ABBOTT, JACK"
322 | 8 "ABCISSA"
323 | ```
324 | the program: Most-Popular-Superhero.py
325 | ```python
326 |  1 from pyspark import SparkConf, SparkContext
327 |  2
328 |  3 conf = SparkConf().setMaster("local").setAppName("PopularHero")
329 |  4 sc = SparkContext(conf = conf)
330 |  5
331 |  6 def countCoOccurences(line):
332 |  7     elements = line.split()
333 |  8     return (int(elements[0]), len(elements) - 1)
334 |  9
335 | 10 def parseNames(line):
336 | 11     fields = line.split('\"')
337 | 12     return (int(fields[0]), fields[1].encode("utf8"))
338 | 13
339 | 14 names = sc.textFile("/vagrant/marvel-names.txt")
340 | 15 namesRdd = names.map(parseNames)
341 | 16
342 | 17 lines = sc.textFile("/vagrant/marvel-graph-sm.txt")
343 | 18
344 | 19 pairings = lines.map(countCoOccurences)
345 | 20
346 | 21 totalFriendsByCharacter = pairings.reduceByKey(lambda x, y : x + y)
347 | 22
348 | 23 flipped = totalFriendsByCharacter.map(lambda (x,y) : (y,x))
349 | 24 mostPopular = flipped.max()
350 | 25 mostPopularName = namesRdd.lookup(mostPopular[1])[0]
351 | 26
352 | 27 print mostPopularName + " is the most popular superhero, with " + \
353 | 28     str(mostPopular[0]) + " co-appearances."
354 | 
355 | ```
356 | #####step-by-step
357 | - Map the input data to (heroId, numberOfOccurrences) per line. read in the lines and, for each one, count the co-occurrences by splitting the line into words with split() and then subtracting one for the superhero id itself
358 | ```python
359 |  6 def countCoOccurences(line):
360 |  7     elements = line.split()
361 |  8     return (int(elements[0]), len(elements) - 1)
362 |  9 ...
363 | 13
364 | 17 lines = sc.textFile("/vagrant/marvel-graph-sm.txt")
365 | 18
366 | 19 pairings = lines.map(countCoOccurences)
367 | ```
368 | this will yield the superhero id and the count of associations for each line of the dataset
369 | ```
370 | (1742, 14)
371 | (1743, 41)
372 | (3308, 47)
373 | (3309, 7)
374 | (5494, 6)
375 | ```
376 | 
377 | - Add up the co-occurrences by heroId using reduceByKey(). we know this function will group by the key and total up the counts (occurrences)
378 | ```python
379 | 21 totalFriendsByCharacter = pairings.reduceByKey(lambda x, y : x + y)
380 | ```
381 | - Flip (map) the RDD to (number, heroId). We need to flip the K and V so that max() compares on the count.
382 | ```python 383 | 23 flipped = totalFriendsByCharacter.map(lambda (x,y) : (y,x)) 384 | ``` 385 | - Use max() on the RDD to find the hero with the most co-occurences 386 | ```python 387 | 24 mostPopular = flipped.max() 388 | ``` 389 | - Look up the name of the most popular 390 | ```python 391 | 25 mostPopularName = namesRdd.lookup(mostPopular[1])[0] 392 | ``` 393 | 394 | -------------------------------------------------------------------------------- /Total-Amount-By-Customer.py: -------------------------------------------------------------------------------- 1 | 2 | from pyspark import SparkConf, SparkContext 3 | 4 | conf = SparkConf().setMaster("local").setAppName("TotalAmountByCustomer") 5 | sc = SparkContext(conf = conf) 6 | 7 | def parseLine(line): 8 | fields = line.split(',') 9 | customerId = int(fields[0]) 10 | itemId = fields[1] 11 | itemAmt = float(fields[2]) 12 | return (customerId, itemAmt) 13 | 14 | input = sc.textFile("/vagrant/customer-orders.csv") 15 | orders = input.map(parseLine) 16 | totals = orders.reduceByKey(lambda x, y: x + y) 17 | sortedTotals = totals.map(lambda (x,y): (y,x)).sortByKey(False) 18 | 19 | results = sortedTotals .collect() 20 | for result in results: 21 | # print result 22 | print result[1],"\t${:.2f}".format(result[0]) 23 | -------------------------------------------------------------------------------- /Vagrantfile: -------------------------------------------------------------------------------- 1 | 2 | Vagrant.configure(2) do |config| 3 | 4 | config.vm.box = "petergdoyle/CentOS-7-x86_64-Minimal-1503-01" 5 | 6 | config.vm.provider "virtualbox" do |vb| 7 | vb.customize ["modifyvm", :id, "--cpuexecutioncap", "80"] 8 | vb.cpus=2 9 | vb.memory = "2048" 10 | end 11 | 12 | config.vm.provision "shell", inline: <<-SHELL 13 | 14 | #best to update the os 15 | yum -y update && yum -y clean 16 | yum -y install vim htop curl wget tree unzip bash-completion 17 | 18 | 19 | eval "yum repolist |grep 'epel/x86_64'" > /dev/null 2>&1 20 | if [ $? -eq 1 ]; then 21 | yum -y install epel-release 22 | else 23 | echo -e "\e[7;44;96m*epel-release already appears to be installed. skipping." 24 | fi 25 | 26 | eval 'python' > /dev/null 2>&1 27 | if [ $? -eq 127 ]; then 28 | yum install -y python34 29 | else 30 | echo -e "\e[7;44;96m*python34 already appears to be installed. skipping." 31 | fi 32 | 33 | eval 'pip -help' > /dev/null 2>&1 34 | if [ $? -eq 127 ]; then 35 | yum -y install python-pip 36 | else 37 | echo -e "\e[7;44;96m*python-pip already appears to be installed. skipping." 38 | fi 39 | 40 | eval 'java -version' > /dev/null 2>&1 41 | if [ $? 
-eq 127 ]; then 42 | mkdir -p /usr/java 43 | #install java jdk 8 from oracle 44 | curl -O -L --header "Cookie: gpw_e24=http%3A%2F%2Fwww.oracle.com%2F; oraclelicense=accept-securebackup-cookie" \ 45 | "http://download.oracle.com/otn-pub/java/jdk/8u60-b27/jdk-8u60-linux-x64.tar.gz" \ 46 | && tar -xvf jdk-8u60-linux-x64.tar.gz -C /usr/java \ 47 | && ln -s /usr/java/jdk1.8.0_60/ /usr/java/default \ 48 | && rm -f jdk-8u60-linux-x64.tar.gz 49 | 50 | alternatives --install "/usr/bin/java" "java" "/usr/java/default/bin/java" 99999; \ 51 | alternatives --install "/usr/bin/javac" "javac" "/usr/java/default/bin/javac" 99999; \ 52 | alternatives --install "/usr/bin/javaws" "javaws" "/usr/java/default/bin/javaws" 99999; \ 53 | alternatives --install "/usr/bin/jvisualvm" "jvisualvm" "/usr/java/default/bin/jvisualvm" 99999 54 | 55 | export JAVA_HOME=/usr/java/default 56 | cat >/etc/profile.d/java.sh <<-EOF 57 | export JAVA_HOME=$JAVA_HOME 58 | EOF 59 | 60 | else 61 | echo -e "\e[7;44;96m*jdk-8u60-linux-x64 already appears to be installed. skipping." 62 | fi 63 | 64 | 65 | if [ ! -d /usr/spark/spark-1.6.1-bin-hadoop2.6/ ]; then 66 | mkdir -p /usr/spark 67 | curl -O -L http://www-eu.apache.org/dist/spark/spark-1.6.1/spark-1.6.1-bin-hadoop2.6.tgz \ 68 | && tar -xvf spark-1.6.1-bin-hadoop2.6.tgz -C /usr/spark \ 69 | && ln -s /usr/spark/spark-1.6.1-bin-hadoop2.6/ /usr/spark/default \ 70 | && rm -f spark-1.6.1-bin-hadoop2.6.tgz 71 | 72 | export SPARK_HOME=/usr/spark/default 73 | cat >/etc/profile.d/spark.sh <<-EOF 74 | export SPARK_HOME=$SPARK_HOME 75 | EOF 76 | 77 | #set log levels 78 | cp /usr/spark/default/conf/log4j.properties.template /usr/spark/default/conf/log4j.properties 79 | sed -i 's/INFO/ERROR/g' /usr/spark/default/conf/log4j.properties 80 | 81 | #install executeable files 82 | for each in $(find /usr/spark/default/bin/ -executable -type f) ; do 83 | name=$(basename $each) 84 | alternatives --install "/usr/bin/$name" "$name" "$each" 99999 85 | done 86 | 87 | 88 | else 89 | echo -e "\e[7;44;96m*spark-1.6.1 already appears to be installed. skipping." 90 | fi 91 | 92 | #course material 93 | if [ ! -d ml-100k ]; then 94 | curl -O http://files.grouplens.org/datasets/movielens/ml-100k.zip \ 95 | && unzip ml-100k.zip \ 96 | && rm -f ml-100k.zip 97 | fi 98 | 99 | if [ ! 
-f ratings-counter.py ]; then 100 | curl -O https://raw.githubusercontent.com/minimav/udemy_spark/master/ratings-counter.py 101 | fi 102 | 103 | 104 | 105 | #set hostname 106 | hostnamectl set-hostname SparkCourse.vbx 107 | 108 | SHELL 109 | end 110 | -------------------------------------------------------------------------------- /Word-Count-Better-Sorted.py: -------------------------------------------------------------------------------- 1 | import re 2 | from pyspark import SparkConf, SparkContext 3 | 4 | def normalizeWords(text): 5 | return re.compile(r'\W+', re.UNICODE).split(text.lower()) 6 | 7 | conf = SparkConf().setMaster("local").setAppName("WordCount") 8 | sc = SparkContext(conf = conf) 9 | 10 | input = sc.textFile("file:///sparkcourse/book.txt") 11 | words = input.flatMap(normalizeWords) 12 | 13 | wordCounts = words.map(lambda x: (x, 1)).reduceByKey(lambda x, y: x + y) 14 | wordCountsSorted = wordCounts.map(lambda (x,y): (y,x)).sortByKey() 15 | 16 | results = wordCountsSorted.collect() 17 | 18 | for result in results: 19 | count = str(result[0]) 20 | word = result[1].encode('ascii', 'ignore') 21 | if (word): 22 | print word + ":\t\t" + count 23 | -------------------------------------------------------------------------------- /Word-Count-Better.py: -------------------------------------------------------------------------------- 1 | import re 2 | from pyspark import SparkConf, SparkContext 3 | 4 | def normalizeWords(text): 5 | return re.compile(r'\W+', re.UNICODE).split(text.lower()) 6 | 7 | conf = SparkConf().setMaster("local").setAppName("WordCount") 8 | sc = SparkContext(conf = conf) 9 | 10 | input = sc.textFile("file:///sparkcourse/book.txt") 11 | words = input.flatMap(normalizeWords) 12 | wordCounts = words.countByValue() 13 | 14 | for word, count in wordCounts.items(): 15 | cleanWord = word.encode('ascii', 'ignore') 16 | if (cleanWord): 17 | print cleanWord, count 18 | -------------------------------------------------------------------------------- /Word-Count.py: -------------------------------------------------------------------------------- 1 | import re 2 | from pyspark import SparkConf, SparkContext 3 | 4 | def normalizeWords(text): 5 | return re.compile(r'\W+', re.UNICODE).split(text.lower()) 6 | 7 | bad_word_list = ['', 's', 't'] 8 | 9 | conf = SparkConf().setMaster("local").setAppName("WordCount") 10 | sc = SparkContext(conf = conf) 11 | 12 | input = sc.textFile("/vagrant/Book.txt") 13 | #a simple split works 14 | #words = input.flatMap(lambda x: x.split()) 15 | #but let's clean the text up a bit and filter out special characters and consider upper and lowercase to be the same thing 16 | words = input.flatMap(normalizeWords) 17 | # results = words.collect() 18 | 19 | # #python function 20 | # #wordCounts = words.countByValue() 21 | 22 | #take a spark approach... 23 | #convert each word to a key/value pair with a value of 1 24 | wordCountsMap = words.map(lambda x: (x,1)).filter(lambda (x, y): x not in bad_word_list) 25 | # results = wordCountsMap.collect() 26 | 27 | 28 | #count words with reduceByKey so reduceByKey will build a Set of each work and count the 1s! 
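#   e.g. reduceByKey groups by word and sums the 1s: ('the', 1), ('the', 1), ('the', 1) -> ('the', 3)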
29 | wordCountsReduced = wordCountsMap.reduceByKey(lambda x, y: x + y) 30 | # results = wordCountsReduced.collect() 31 | 32 | wordCountsSorted = wordCountsReduced.map(lambda (x,y): (y,x)).sortByKey() 33 | results = wordCountsSorted.collect() 34 | 35 | for result in results: 36 | print result 37 | # count = str(result[0]) 38 | # word = result[1].encode('ascii', 'ignore') 39 | # if (word): 40 | # print word + ":\t\t" + count 41 | -------------------------------------------------------------------------------- /degrees-of-separation.py: -------------------------------------------------------------------------------- 1 | #Boilerplate stuff: 2 | from pyspark import SparkConf, SparkContext 3 | 4 | conf = SparkConf().setMaster("local").setAppName("DegreesOfSeparation") 5 | sc = SparkContext(conf = conf) 6 | 7 | # The characters we wish to find the degree of separation between: 8 | startCharacterID = 1562 #SpiderMan 9 | targetCharacterID = 12 #ADAM 3,031 (who?) 10 | 11 | # Our accumulator, used to signal when we find the target character during 12 | # our BFS traversal. 13 | hitCounter = sc.accumulator(0) 14 | 15 | def convertToBFS(line): 16 | fields = line.split() 17 | heroID = int(fields[0]) 18 | connections = [] 19 | for connection in fields[1:]: 20 | connections.append(int(connection)) 21 | 22 | color = 'WHITE' 23 | distance = 9999 24 | 25 | if (heroID == startCharacterID): 26 | color = 'GRAY' 27 | distance = 0 28 | 29 | return (heroID, (connections, distance, color)) 30 | 31 | 32 | def createStartingRdd(): 33 | inputFile = sc.textFile("/vagrant/Marvel-Graph-1562.txt") 34 | return inputFile.map(convertToBFS) 35 | 36 | def bfsMap(node): 37 | characterID = node[0] 38 | data = node[1] 39 | connections = data[0] 40 | distance = data[1] 41 | color = data[2] 42 | 43 | results = [] 44 | 45 | #If this node needs to be expanded... 46 | if (color == 'GRAY'): 47 | for connection in connections: 48 | newCharacterID = connection 49 | newDistance = distance + 1 50 | newColor = 'GRAY' 51 | if (targetCharacterID == connection): 52 | hitCounter.add(1) 53 | 54 | newEntry = (newCharacterID, ([], newDistance, newColor)) 55 | results.append(newEntry) 56 | 57 | #We've processed this node, so color it black 58 | color = 'BLACK' 59 | 60 | #Emit the input node so we don't lose it. 61 | results.append( (characterID, (connections, distance, color)) ) 62 | return results 63 | 64 | def bfsReduce(data1, data2): 65 | edges1 = data1[0] 66 | edges2 = data2[0] 67 | distance1 = data1[1] 68 | distance2 = data2[1] 69 | color1 = data1[2] 70 | color2 = data2[2] 71 | 72 | distance = 9999 73 | color = 'WHITE' 74 | edges = [] 75 | 76 | # See if one is the original node with its connections. 77 | # If so preserve them. 
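# (bfsMap emits bare frontier entries like (id, ([], newDistance, 'GRAY')) alongside the original
#  (id, (connections, distance, color)) records; merging keeps the non-empty edge lists, the
#  minimum distance, and the darkest color.)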
78 | if (len(edges1) > 0): 79 | edges = edges1 80 | if (len(edges2) > 0): 81 | for connection in edges2: 82 | edges.append(connection) 83 | 84 | # Preserve minimum distance 85 | if (distance1 < distance): 86 | distance = distance1 87 | 88 | if (distance2 < distance): 89 | distance = distance2 90 | 91 | # Preserve darkest color 92 | if (color1 == 'WHITE' and (color2 == 'GRAY' or color2 == 'BLACK')): 93 | color = color2 94 | 95 | if (color1 == 'GRAY' and color2 == 'BLACK'): 96 | color = color2 97 | 98 | if (color2 == 'WHITE' and (color1 == 'GRAY' or color1 == 'BLACK')): 99 | color = color1 100 | 101 | if (color2 == 'GRAY' and color1 == 'BLACK'): 102 | color = color1 103 | 104 | return (edges, distance, color) 105 | 106 | 107 | #Main program here: 108 | iterationRdd = createStartingRdd() 109 | results = iterationRdd.collect() 110 | for result in results: 111 | print result 112 | 113 | arbitraryUpperBound=1 114 | for iteration in range(0, arbitraryUpperBound): 115 | print "Running BFS iteration# " + str(iteration+1) 116 | 117 | # Create new vertices as needed to darken or reduce distances in the 118 | # reduce stage. If we encounter the node we're looking for as a GRAY 119 | # node, increment our accumulator to signal that we're done. 120 | mapped = iterationRdd.flatMap(bfsMap) 121 | 122 | # Note that mapped.count() action here forces the RDD to be evaluated, and 123 | # that's the only reason our accumulator is actually updated. 124 | print "Processing " + str(mapped.count()) + " values." 125 | 126 | if (hitCounter.value > 0): 127 | print "Hit the target character! From " + str(hitCounter.value) \ 128 | + " different direction(s)." 129 | break 130 | 131 | # Reducer combines data for each character ID, preserving the darkest 132 | # color and shortest path. 133 | iterationRdd = mapped.reduceByKey(bfsReduce) 134 | 135 | results = iterationRdd.collect() 136 | for result in results: 137 | print result 138 | -------------------------------------------------------------------------------- /download_movielens_datasets.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | curl -O http://files.grouplens.org/datasets/movielens/ml-100k.zip \ 4 | && unzip ml-100k.zip 5 | -------------------------------------------------------------------------------- /install_spark_executables.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | for each in $(find /usr/spark/default/bin/ -executable -type f) ; do 4 | name=$(basename $each) 5 | alternatives --install "/usr/bin/$name" "$name" "$each" 99999 6 | done 7 | -------------------------------------------------------------------------------- /log4j.properties: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 | 
18 | # Set everything to be logged to the console
19 | log4j.rootCategory=ERROR, console
20 | log4j.appender.console=org.apache.log4j.ConsoleAppender
21 | log4j.appender.console.target=System.err
22 | log4j.appender.console.layout=org.apache.log4j.PatternLayout
23 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n
24 | 
25 | # Settings to quiet third party logs that are too verbose
26 | log4j.logger.org.spark-project.jetty=ERROR
27 | log4j.logger.org.spark-project.jetty.util.component.AbstractLifeCycle=ERROR
28 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=ERROR
29 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=ERROR
30 | log4j.logger.org.apache.parquet=ERROR
31 | log4j.logger.parquet=ERROR
32 | 
33 | # SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support
34 | log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL
35 | log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR
36 | 
--------------------------------------------------------------------------------
/ratings-counter.py:
--------------------------------------------------------------------------------
 1 | from pyspark import SparkConf, SparkContext
 2 | import collections
 3 | 
 4 | conf = SparkConf().setMaster("local").setAppName("RatingsHistogram")
 5 | sc = SparkContext(conf = conf)
 6 | 
 7 | lines = sc.textFile("/vagrant/ml-100k/u.data")
 8 | ratings = lines.map(lambda x: x.split()[2])
 9 | result = ratings.countByValue()
10 | 
11 | sortedResults = collections.OrderedDict(sorted(result.items()))
12 | for key, value in sortedResults.iteritems():
13 |     print "%s %i" % (key, value)
14 | 
--------------------------------------------------------------------------------
/scratch.py:
--------------------------------------------------------------------------------
 1 | from pyspark import SparkConf, SparkContext
 2 | import collections
 3 | 
 4 | conf = SparkConf().setMaster("local").setAppName("RatingsHistogram")
 5 | sc = SparkContext(conf = conf)
 6 | 
 7 | rdd = sc.parallelize([1,2,3,4,5])
 8 | # map() is lazy and a Python 2 print statement cannot appear inside a lambda,
 9 | # so transform first and print the collected results
10 | squares = rdd.map(lambda x: x*x)
11 | for value in squares.collect():
12 |     print " %i" % value
13 | 
--------------------------------------------------------------------------------
/vagrant/setup.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | 
3 | echo "hello from setup.sh"
4 | 
--------------------------------------------------------------------------------
/vagrant/setup2.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | 
3 | echo "hello from setup2.sh"
4 | 
--------------------------------------------------------------------------------