├── activities
│   ├── nosql-databases
│   │   ├── mongo
│   │   │   ├── setup.sh
│   │   │   ├── forkdb.sh
│   │   │   ├── mongo.conf
│   │   │   └── test-insert.js
│   │   └── rest-api.json
│   ├── common-crawl
│   │   ├── mrcc.py.tar.gz
│   │   ├── tag-count-mr.idraw
│   │   ├── test-1.warc
│   │   ├── test-5.warc
│   │   ├── mrjob.conf
│   │   ├── test-10.warc
│   │   ├── mrcc.py
│   │   ├── test-15.warc
│   │   ├── test-20.warc
│   │   ├── README.md
│   │   ├── tag-count-mr.svg
│   │   └── test-100.warc
│   ├── crawling-the-crawl
│   │   ├── mrcc.tar.gz
│   │   ├── cc-bootstrap.sh
│   │   ├── extract-CC-MAIN-20150124161055-00000-ip-10-180-212-252.ec2.internal.warc.wat.gz
│   │   ├── cluster.json
│   │   ├── s3.py
│   │   ├── run-step.sh
│   │   ├── ccex.py
│   │   ├── start.sh
│   │   ├── mrcc.py
│   │   └── README.md
│   ├── decision-trees
│   │   ├── Bias_Variance.jpg
│   │   ├── decision-tree.idraw
│   │   ├── tree-example.idraw
│   │   ├── decision_tree.py
│   │   ├── adaboost_classifier.py
│   │   ├── random_forest.py
│   │   ├── gradient_classifier.py
│   │   ├── iris_tree.py
│   │   ├── regions.py
│   │   ├── tree-example.svg
│   │   └── decision-tree.svg
│   ├── emr-opennex-climate-model
│   │   ├── input-sequences.txt
│   │   ├── input-example.py
│   │   ├── seqs.py
│   │   ├── average.py
│   │   ├── date_partitions.py
│   │   ├── by-sequences.py
│   │   ├── acquire.py
│   │   └── README.md
│   ├── intro-to-spark
│   │   ├── one-line-json.py
│   │   ├── random-text.py
│   │   ├── rdd-map.py
│   │   ├── rdd-flatmap.py
│   │   ├── rdd-reduce.py
│   │   ├── wordcount.py
│   │   ├── tweet-wordcount.py
│   │   └── README.md
│   ├── web-scraping
│   │   ├── urllib2-get.py
│   │   ├── soup.py
│   │   ├── urllib2-headers.py
│   │   └── README.md
│   ├── emr-map-only
│   │   ├── line-count.py
│   │   ├── generate-input.py
│   │   └── README.md
│   ├── sentiment-analysis
│   │   ├── annotate.py
│   │   ├── rt-polaritydata
│   │   │   ├── rt-polarity.neg
│   │   │   ├── rt-polarity.pos
│   │   │   └── README.1.0.txt
│   │   ├── wordcounts.py
│   │   ├── train.py
│   │   ├── test.py
│   │   ├── featureset.py
│   │   ├── n-way.py
│   │   ├── candy-corn.py
│   │   └── README.md
│   ├── data-munging
│   │   ├── pipeline-input.py
│   │   ├── csv-dump.py
│   │   ├── s3list.py
│   │   ├── s3copy.py
│   │   ├── s3cat.py
│   │   ├── xml-parse.py
│   │   └── README.md
│   ├── emr-cluster
│   │   ├── cluster.json
│   │   └── README.md
│   ├── emr-tweet-wordcount
│   │   ├── format-tweets.py
│   │   ├── tweetSplitter.py
│   │   └── README.md
│   ├── emr-prime-multiplier
│   │   ├── step.json
│   │   ├── generate-input.py
│   │   ├── prime-factors.py
│   │   └── README.md
│   ├── twitter-acquisition
│   │   ├── hello-twitter.py
│   │   ├── partitions.py
│   │   ├── search.py
│   │   └── README.md
│   ├── README.md
│   ├── relational-databases
│   │   └── README.md
│   └── text-processing-with-nltk
│       └── README.md
├── data-science.png
├── sessions
│   ├── session-8.md
│   ├── session-7.md
│   ├── session-9.md
│   ├── session-12.md
│   ├── session-10.md
│   ├── session-11.md
│   └── session-6.md
├── data-science.xpr
├── assignments
│   ├── tweet-acquisition
│   │   └── README.md
│   ├── getting-started
│   │   └── README.md
│   └── organizing-tweets
│       └── README.md
├── README.md
└── data-science.svg
/activities/nosql-databases/mongo/setup.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | mkdir data
3 | mkdir log
4 |
--------------------------------------------------------------------------------
/data-science.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alexmilowski/data-science/HEAD/data-science.png
--------------------------------------------------------------------------------
/activities/common-crawl/mrcc.py.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alexmilowski/data-science/HEAD/activities/common-crawl/mrcc.py.tar.gz
--------------------------------------------------------------------------------
/activities/crawling-the-crawl/mrcc.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alexmilowski/data-science/HEAD/activities/crawling-the-crawl/mrcc.tar.gz
--------------------------------------------------------------------------------
/activities/common-crawl/tag-count-mr.idraw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alexmilowski/data-science/HEAD/activities/common-crawl/tag-count-mr.idraw
--------------------------------------------------------------------------------
/activities/decision-trees/Bias_Variance.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alexmilowski/data-science/HEAD/activities/decision-trees/Bias_Variance.jpg
--------------------------------------------------------------------------------
/activities/decision-trees/decision-tree.idraw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alexmilowski/data-science/HEAD/activities/decision-trees/decision-tree.idraw
--------------------------------------------------------------------------------
/activities/decision-trees/tree-example.idraw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alexmilowski/data-science/HEAD/activities/decision-trees/tree-example.idraw
--------------------------------------------------------------------------------
/activities/nosql-databases/rest-api.json:
--------------------------------------------------------------------------------
1 | { "rest-api": {
2 | "name": "RESTstop",
3 | "database": "tweets",
4 | "port": "8888"
5 | } }
6 |
--------------------------------------------------------------------------------
/activities/emr-opennex-climate-model/input-sequences.txt:
--------------------------------------------------------------------------------
1 | #lat1,lon1,lat2,lon2,size,startYear,startMonth,endYear,endMonth
2 | 40,-125,35,-120,60,2015,01,2015,03
3 |
--------------------------------------------------------------------------------
/activities/intro-to-spark/one-line-json.py:
--------------------------------------------------------------------------------
1 | import json
2 | import sys
3 |
4 | data = json.load(sys.stdin)
5 |
6 | for tweet in data:
7 |     print json.dumps(tweet)
--------------------------------------------------------------------------------
/activities/web-scraping/urllib2-get.py:
--------------------------------------------------------------------------------
1 | import urllib2
2 |
3 | response = urllib2.urlopen("http://www.ischool.berkeley.edu/")
4 | html = response.read()
5 | print html
--------------------------------------------------------------------------------
/activities/emr-map-only/line-count.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | import sys
3 |
4 | count = 0
5 | for line in sys.stdin:
6 |     count += 1
7 |
8 | print "lines: ", count
9 |
--------------------------------------------------------------------------------
/activities/common-crawl/test-1.warc:
--------------------------------------------------------------------------------
1 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00000-ip-10-180-136-8.ec2.internal.warc.gz
2 |
--------------------------------------------------------------------------------
/activities/sentiment-analysis/annotate.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | for line in sys.stdin:
4 |     sys.stdout.write(sys.argv[1])
5 |     sys.stdout.write('\t')
6 |     sys.stdout.write(line)
--------------------------------------------------------------------------------
/activities/sentiment-analysis/rt-polaritydata/rt-polarity.neg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alexmilowski/data-science/HEAD/activities/sentiment-analysis/rt-polaritydata/rt-polarity.neg
--------------------------------------------------------------------------------
/activities/sentiment-analysis/rt-polaritydata/rt-polarity.pos:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alexmilowski/data-science/HEAD/activities/sentiment-analysis/rt-polaritydata/rt-polarity.pos
--------------------------------------------------------------------------------
/activities/nosql-databases/mongo/forkdb.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | MONGO=$HOME/workspace/mongodb-osx-x86_64-2.6.5/
3 |
4 | $MONGO/bin/mongod --config mongo.conf --pidfilepath `pwd`/mongo.pid
5 |
--------------------------------------------------------------------------------
/activities/data-munging/pipeline-input.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import json
3 |
4 | for line in sys.stdin:
5 |     print line
6 |     f = open(line.strip(),"r")
7 |     # process data
8 |     f.close()
9 |
--------------------------------------------------------------------------------
/activities/web-scraping/soup.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import bs4
3 | import urllib2
4 |
5 | resource = urllib2.urlopen(sys.argv[1])
6 | html = bs4.BeautifulSoup(resource.read().decode('utf-8'))
7 | print "".join(html.title.strings)
--------------------------------------------------------------------------------
/activities/web-scraping/urllib2-headers.py:
--------------------------------------------------------------------------------
1 | import urllib2
2 |
3 | response = urllib2.urlopen("http://www.ischool.berkeley.edu/")
4 | headers = dict(response.info())
5 |
6 | for name in headers:
7 |     print name,": ",headers[name]
8 |
--------------------------------------------------------------------------------
/activities/decision-trees/decision_tree.py:
--------------------------------------------------------------------------------
1 | import iris_tree as iris
2 | from sklearn.tree import DecisionTreeClassifier
3 |
4 |
5 | dtree = DecisionTreeClassifier(criterion='gini',max_depth=3,random_state=0)
6 |
7 | iris.tree(dtree)
8 |
9 |
--------------------------------------------------------------------------------
/activities/decision-trees/adaboost_classifier.py:
--------------------------------------------------------------------------------
1 | import iris_tree as iris
2 | from sklearn.ensemble import AdaBoostClassifier
3 |
4 | boosting = AdaBoostClassifier(n_estimators=10, learning_rate=1.0,random_state=1)
5 |
6 | iris.tree(boosting)
7 |
8 |
--------------------------------------------------------------------------------
/activities/crawling-the-crawl/cc-bootstrap.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | sudo yum install -y python27 python27-devel python27-pip gcc-c++
4 | sudo pip-2.7 install boto mrjob warc
5 | sudo pip-2.7 install https://github.com/commoncrawl/gzipstream/archive/master.zip
6 |
--------------------------------------------------------------------------------
/activities/decision-trees/random_forest.py:
--------------------------------------------------------------------------------
1 | import iris_tree as iris
2 | from sklearn.ensemble import RandomForestClassifier
3 |
4 | forest = RandomForestClassifier(criterion='gini',n_estimators=10,max_depth=3,random_state=1,n_jobs=2)
5 |
6 | iris.tree(forest)
7 |
8 |
--------------------------------------------------------------------------------
/activities/crawling-the-crawl/extract-CC-MAIN-20150124161055-00000-ip-10-180-212-252.ec2.internal.warc.wat.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alexmilowski/data-science/HEAD/activities/crawling-the-crawl/extract-CC-MAIN-20150124161055-00000-ip-10-180-212-252.ec2.internal.warc.wat.gz
--------------------------------------------------------------------------------
/activities/decision-trees/gradient_classifier.py:
--------------------------------------------------------------------------------
1 | import iris_tree as iris
2 | from sklearn.ensemble import GradientBoostingClassifier
3 |
4 | boosting = GradientBoostingClassifier(n_estimators=10, learning_rate=1.0,max_depth=3,random_state=1)
5 |
6 | iris.tree(boosting)
7 |
8 |
--------------------------------------------------------------------------------
/activities/emr-cluster/cluster.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "InstanceGroupType": "MASTER",
4 | "InstanceCount": 1,
5 | "InstanceType": "m1.medium"
6 | },
7 | {
8 | "InstanceGroupType": "CORE",
9 | "InstanceCount": 2,
10 | "InstanceType": "m1.medium"
11 | }
12 | ]
13 |
--------------------------------------------------------------------------------
/activities/data-munging/csv-dump.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import csv
3 |
4 | for line in sys.stdin:
5 |     f = open(line.strip(),"r")
6 |     # process data
7 |     reader = csv.reader(f,delimiter=',',quotechar='"')
8 |     for row in reader:
9 |         print ','.join(row)
10 |
11 |     f.close()
12 |
--------------------------------------------------------------------------------
/activities/crawling-the-crawl/cluster.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "InstanceGroupType": "MASTER",
4 | "InstanceCount": 1,
5 | "InstanceType": "m1.medium"
6 | },
7 | {
8 | "InstanceGroupType": "CORE",
9 | "InstanceCount": 2,
10 | "InstanceType": "m1.medium"
11 | }
12 | ]
--------------------------------------------------------------------------------
/activities/intro-to-spark/random-text.py:
--------------------------------------------------------------------------------
1 | import random
2 | import sys
3 |
4 | words = sys.stdin.read().splitlines()
5 |
6 | for i in range(int(sys.argv[1])):
7 |     for j in range(int(sys.argv[2])):
8 |         sys.stdout.write(random.choice(words))
9 |         sys.stdout.write(" ")
10 |     sys.stdout.write("\n")
11 |
12 |
13 |
--------------------------------------------------------------------------------
/activities/data-munging/s3list.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from boto.s3.connection import S3Connection
3 | from boto.s3.key import Key
4 |
5 | conn = S3Connection()
6 | bucket = conn.get_bucket(sys.argv[1])
7 |
8 | subset = sys.argv[2] if len(sys.argv)>2 else ""
9 |
10 | for key in bucket.list(prefix=subset):
11 |     print key.key
12 |
13 |
--------------------------------------------------------------------------------
/activities/intro-to-spark/rdd-map.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import json
3 | from pyspark import SparkContext
4 |
5 | sc = SparkContext(appName="TweetLoader")
6 | tweetData = sc.textFile("2015-02*.txt")
7 | tweets = tweetData.map(lambda line: json.loads(line))
8 |
9 | output = tweets.collect()
10 | for (tweet) in output:
11 |     print tweet
12 |
--------------------------------------------------------------------------------
/sessions/session-8.md:
--------------------------------------------------------------------------------
1 | # Session Schedule - Week 8 #
2 |
3 | * [5] Intro to models over data
4 | * [10] [Bag-of-words model](http://en.wikipedia.org/wiki/Bag-of-words_model)
5 | * [10] Feature extraction from movie reviews
6 | * [10] Candy Corn Example
7 | * [25] Project Feedback / Activity
8 | * [20] Training Classifiers
9 | * [10] Discussion
--------------------------------------------------------------------------------
/activities/crawling-the-crawl/s3.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 | import sys
3 | from signal import signal, SIGPIPE, SIG_DFL
4 |
5 | #Ignore SIG_PIPE and don't throw exceptions on it... (http://docs.python.org/library/signal.html)
6 | signal(SIGPIPE,SIG_DFL)
7 |
8 | for line in sys.stdin:
9 |     print("s3://aws-publicdatasets/"+line[0:-1])
--------------------------------------------------------------------------------
/activities/nosql-databases/mongo/mongo.conf:
--------------------------------------------------------------------------------
1 | systemLog:
2 |   destination: file
3 |   path: "log/mongodb.log"
4 |   logAppend: true
5 | processManagement:
6 |   fork: true
7 | storage:
8 |   dbPath: "data"
9 |   directoryPerDB: true
10 |   journal:
11 |     enabled: true
12 | net:
13 |   bindIp: 127.0.0.1
14 |   port: 27017
15 |
16 |
--------------------------------------------------------------------------------
/activities/emr-map-only/generate-input.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import random
3 | import os
4 |
5 | prefix = sys.argv[1]
6 | files = int(sys.argv[2])
7 | max = int(sys.argv[3])
8 |
9 | for n in range(files):
10 |     f = open(prefix+"-"+str(n+1)+".txt","w")
11 |     for i in range(max):
12 |         f.write(str(n+1)+" "+str(i+1)+"\n")
13 |     f.close()
14 |
15 |
--------------------------------------------------------------------------------
/data-science.xpr:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
--------------------------------------------------------------------------------
/activities/emr-tweet-wordcount/format-tweets.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import json
3 |
4 | f = open(sys.argv[1],"r")
5 | data = json.load(f)
6 | f.close()
7 |
8 | for tweet in data["tweets"]:
9 |     language = tweet["metadata"]["iso_language_code"].encode('utf-8')
10 |     text = tweet["text"].replace("\n"," ")
11 |     print "#iso-"+language+" "+text.encode('utf-8')
12 |
--------------------------------------------------------------------------------
/activities/data-munging/s3copy.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from boto.s3.connection import S3Connection
3 | from boto.s3.key import Key
4 |
5 | conn = S3Connection()
6 | bucket = conn.get_bucket(sys.argv[1])
7 | prefix = sys.argv[2]
8 | for i in range(3,len(sys.argv)):
9 |     print sys.argv[i]
10 |     k = Key(bucket)
11 |     k.key = prefix+"/"+sys.argv[i]
12 |     k.set_contents_from_filename(sys.argv[i])
13 |
--------------------------------------------------------------------------------
/activities/emr-prime-multiplier/step.json:
--------------------------------------------------------------------------------
1 | {
2 | "Type" : "STREAMING",
3 | "Name" : "Multiply",
4 | "ActionOnFailure" : "CONTINUE",
5 | "Args" : [
6 | "-files","s3://mybucket/prime-factors.py",
7 | "-mapper","prime-factors.py",
8 | "-reducer","aggregate",
9 | "-input","s3://mybucket/multiply/input",
10 | "-output","s3://mybucket/multiply/output"
11 | ]
12 | }
--------------------------------------------------------------------------------
/activities/data-munging/s3cat.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import re
3 | from boto.s3.connection import S3Connection
4 | from boto.s3.key import Key
5 |
6 | conn = S3Connection()
7 | for uri in sys.argv[1:]:
8 |     m = re.match(r"s3://([\w\-]+)/(.*)",uri)
9 |     if m:
10 |         bucket = conn.get_bucket(m.group(1))
11 |         k = Key(bucket)
12 |         k.key = m.group(2)
13 |         print k.get_contents_as_string()
14 |
--------------------------------------------------------------------------------
/sessions/session-7.md:
--------------------------------------------------------------------------------
1 | # Session Schedule - Week 7 #
2 |
3 | * [5] Setup (if you haven't done so)
4 | * [20] Introduction to NoSQL
5 | * what really is a NoSQL database?
6 | * market players / gartner report
7 | * two significant players: Mongo and MarkLogic
8 | * [10] Mongo introduction
9 | * [15] Data storage in Mongo activity
10 | * [10] MarkLogic introduction
11 | * [15] Data storage in MarkLogic activity
12 | * [15] Q&A and wrap-up
13 |
--------------------------------------------------------------------------------
/activities/intro-to-spark/rdd-flatmap.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import json
3 | from pyspark import SparkContext
4 |
5 | sc = SparkContext(appName="TweetLoader")
6 | tweetData = sc.textFile("2015-02*.txt")
7 | users = tweetData.map(lambda line: json.loads(line)) \
8 |             .flatMap(lambda tweet: [tweet["user"]["screen_name"]] + map(lambda u : u["screen_name"],tweet["entities"]["user_mentions"])).distinct()
9 |
10 | output = users.collect()
11 | for user in output:
12 |     print user
13 |
--------------------------------------------------------------------------------
/activities/twitter-acquisition/hello-twitter.py:
--------------------------------------------------------------------------------
1 | import tweepy
2 | import json;
3 |
4 | # Don't forget to install tweepy
5 | # pip install tweepy
6 |
7 | consumer_key = "...";
8 | consumer_secret = "...";
9 |
10 | access_token = "...";
11 | access_token_secret = "...";
12 |
13 | auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
14 | auth.set_access_token(access_token, access_token_secret)
15 |
16 | api = tweepy.API(auth)
17 |
18 | for tweet in api.search(q="minecraft"):
19 |     print tweet.text
--------------------------------------------------------------------------------
/activities/sentiment-analysis/wordcounts.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import nltk
3 | import sets
4 | import operator
5 |
6 | import featureset
7 |
8 | words = {}
9 |
10 | for line in sys.stdin:
11 |     for word in featureset.wordlist(line.decode('utf-8')):
12 |         words[word] = words[word] + 1 if word in words else 1
13 |
14 | wordsSorted = sorted(words.items(), key=operator.itemgetter(1),reverse=True)
15 |
16 | for w in wordsSorted:
17 |     sys.stdout.write("{0}\t{1}\n".format(w[0].encode('utf-8'),w[1]))
18 |
19 |
--------------------------------------------------------------------------------
/activities/emr-prime-multiplier/generate-input.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import random
3 |
4 | # The maximum magnitude of the numbers
5 | max = int(sys.argv[1])
6 | # The number to generate
7 | count = int(sys.argv[2])
8 |
9 | def positiveRandom(max):
10 |     n = random.random()
11 |     while n==0:
12 |         n = random.random()
13 |     r = int(n*max)
14 |     return 1 if r==0 else r
15 |
16 | # Generate a positive random number for the count up to the given maximum
17 | for i in range(count):
18 |     print positiveRandom(max)
--------------------------------------------------------------------------------
/activities/intro-to-spark/rdd-reduce.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import json
3 | from pyspark import SparkContext
4 |
5 | sc = SparkContext(appName="TweetLoader")
6 | tweetData = sc.textFile("2015-02*.txt")
7 | counts = tweetData.map(lambda line: json.loads(line)) \
8 |              .map(lambda tweet: (tweet["user"]["screen_name"],1)) \
9 |              .reduceByKey(lambda a,b: a + b)
10 |
11 | output = sorted(counts.collect(),lambda a,b: b[1] - a[1])
12 | for (user,count) in output:
13 |     print "{}: {}".format(user,count)
14 |
--------------------------------------------------------------------------------
/activities/crawling-the-crawl/run-step.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # This script expects the path list as input for the MRJob
4 |
5 | SCRIPT=$1
6 | BUCKET=$2
7 | FLOWID=$3
8 |
9 | if [ -z $SCRIPT ] || [ -z $BUCKET ] || [ -z $FLOWID ] ; then
10 | echo "Usage: $(basename $0) script.py bucket-name job-flow-id"
11 | exit 1
12 | fi
13 |
14 | shift 3
15 | OUTDIR=s3://$BUCKET/common-crawl/wat/domains/$$
16 | echo "Output: $OUTDIR"
17 | python $SCRIPT -r emr --python-bin python2.7 --python-archive mrcc.tar.gz --no-output --output-dir $OUTDIR --emr-job-flow-id $FLOWID $*
18 |
--------------------------------------------------------------------------------
/activities/crawling-the-crawl/ccex.py:
--------------------------------------------------------------------------------
1 | import re
2 | import json
3 | from urlparse import urlparse
4 |
5 | #
6 | from mrcc import CCJob
7 |
8 | class Example(CCJob):
9 |     def process_record(self, record):
10 |         # Some header readers aren't for Web resources
11 |         if "warc-target-uri" in record.header:
12 |
13 |             uri = record.header["warc-target-uri"]
14 |             print uri
15 |
16 |             # load the payload into a string
17 |             payload = record.payload.read()
18 |
19 |             yield uri,1
20 |             yield "zzzz-count",1
21 |
22 | if __name__ == '__main__':
23 |     Example.run()
--------------------------------------------------------------------------------
/activities/nosql-databases/mongo/test-insert.js:
--------------------------------------------------------------------------------
1 | var MongoClient = require('mongodb').MongoClient,
2 | ObjectID = require('mongodb').ObjectID;
3 |
4 | // The database connection URI
5 | var url = 'mongodb://localhost:27017/test';
6 |
7 | // Connect to the database and provide a callback function
8 | MongoClient.connect(url, function(err, db) {
9 |   console.log("Connected!");
10 |   var collection = db.collection("conference");
11 |   collection.insert(
12 |     [{ test: "A" },{ test: "B" },{ test: "C" },{ test: "A" }],
13 |     function() {
14 |       console.log("done!")
15 |       db.close();
16 |     }
17 |   );
18 | });
19 |
--------------------------------------------------------------------------------
/activities/data-munging/xml-parse.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from xml.etree import ElementTree
3 |
4 | # See: https://docs.python.org/2/library/xml.etree.elementtree.html
5 |
6 | # This will parse the document from a file. If the handle was elsewhere, you can give it an open stream too.
7 | doc = ElementTree.parse(sys.argv[1])
8 |
9 | # iteration is a lot like //report in XPath
10 | for report in doc.getroot().iter('{http://weather.milowski.com/V/APRS/}report'):
11 |     # If the attribute isn't available, we'll get a dictionary key exception
12 |     # so we check for its existence
13 |     if "temperature" in report.attrib:
14 |         print report.attrib["temperature"]
--------------------------------------------------------------------------------
/sessions/session-9.md:
--------------------------------------------------------------------------------
1 | # Session Schedule - Week 9 #
2 |
3 | * [5] Admin / Project Update Next Week - 3 slides needed - architecture, update on info organization, top issues
4 | * [10] Assignment Q&A
5 | * [15] Introduction to Map/Reduce
6 | * [10] Hadoop / YARN
7 | * [10] [Starting a cluster on EMR](https://github.com/alexmilowski/data-science/tree/master/activities/emr-cluster)
8 | * [15] Input Splitting - [Map Only Example](https://github.com/alexmilowski/data-science/tree/master/activities/emr-map-only)
9 | * [15] Reducing - [Tweet Word Count Example](https://github.com/alexmilowski/data-science/tree/master/activities/emr-tweet-wordcount)
10 | * [10] Wrap-up
--------------------------------------------------------------------------------
/activities/intro-to-spark/wordcount.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from pyspark import SparkContext
3 |
4 | sc = SparkContext(appName="PythonWordCount")
5 |
6 | # Load the data from the file (or wildcard) on the command line
7 | lines = sc.textFile(sys.argv[1], 1)
8 |
9 | # count the words: split each line, output a key/value pair (count of 1), reduce by summation
10 | counts = lines.flatMap(lambda x: x.split()) \
11 |              .map(lambda word: (word, 1)) \
12 |              .reduceByKey(lambda a,b : a + b)
13 |
14 | # output the results (unsorted)
15 | output = counts.collect()
16 | for (word, count) in output:
17 |     print "{}: {}".format(word.encode('utf-8'), count)
18 |
--------------------------------------------------------------------------------
/activities/emr-opennex-climate-model/input-example.py:
--------------------------------------------------------------------------------
1 | from mrjob.job import MRJob
2 | import sys
3 | import os
4 | import json
5 | import math
6 |
7 | # A simple example of processing JSON input to compute an average (w/o counts)
8 | class InputExample(MRJob):
9 |
10 |     # Yields an average for an input line (same key)
11 |     def mapper(self, _, line):
12 |         obj = json.loads(line)
13 |         yield "average",sum(obj["data"])/len(obj["data"])
14 |
15 |     # Computes the average over all the values
16 |     def reducer(self, key, values):
17 |         data = list(values)
18 |         yield key, sum(data) / len(data)
19 |
20 |
21 | if __name__ == '__main__':
22 |     InputExample.run()
23 |
--------------------------------------------------------------------------------
/sessions/session-12.md:
--------------------------------------------------------------------------------
1 | # Session Schedule - Week 12 #
2 |
3 | ## Spark ##
4 |
5 | * [5-10] Update / Q & A
6 | * [20] [Introduction to Spark](https://docs.google.com/presentation/d/1vgDuqCsbugrsw2W99ak70HVu9TFsiybDP8vAo3E6758/edit?usp=sharing)
7 | * [15] Activity - [Run some examples](https://github.com/alexmilowski/data-science/tree/master/activities/intro-to-spark#activity---run-some-example)
8 | * [40] Activity - [Problem Solving with Spark](https://github.com/alexmilowski/data-science/tree/master/activities/intro-to-spark#activity---problem-solving)
9 | * Extra - [Spark on EC2 / EMR](https://github.com/alexmilowski/data-science/tree/master/activities/intro-to-spark#activity---deploying-to-clusters)
10 |
--------------------------------------------------------------------------------
/activities/common-crawl/test-5.warc:
--------------------------------------------------------------------------------
1 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00000-ip-10-180-136-8.ec2.internal.warc.gz
2 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00001-ip-10-180-136-8.ec2.internal.warc.gz
3 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00002-ip-10-180-136-8.ec2.internal.warc.gz
4 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00003-ip-10-180-136-8.ec2.internal.warc.gz
5 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00004-ip-10-180-136-8.ec2.internal.warc.gz
6 |
--------------------------------------------------------------------------------
/activities/twitter-acquisition/partitions.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import datetime
3 |
4 | xsdDatetimeFormat = "%Y-%m-%dT%H:%M:%S"
5 | xsdDateFormat = "%Y-%m-%d"
6 |
7 | def datetime_partition(start,end,duration):
8 |     current = start
9 |     while start==current or (end-current).days > 0 or ((end-current).days==0 and (end-current).seconds>0):
10 |         yield current
11 |         current = current + duration
12 |
13 | def date_partition(start,end):
14 |     return datetime_partition(start,end,datetime.timedelta(days=1))
15 |
16 | if __name__ == "__main__":
17 |     start = datetime.datetime.strptime(sys.argv[1],xsdDateFormat) # start date
18 |     end = datetime.datetime.strptime(sys.argv[2],xsdDateFormat) # end date
19 |
20 |     for d in date_partition(start,end):
21 |         print d
--------------------------------------------------------------------------------
/activities/intro-to-spark/tweet-wordcount.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import json
3 | from pyspark import SparkContext
4 |
5 | sc = SparkContext(appName="TweetWordCount")
6 |
7 | # Load the JSON data from the file (or wildcard) on the command line
8 | lines = sc.textFile(sys.argv[1], 1)
9 |
10 | # count the words: load each line into a JSON object, split each text property, output a key/value pair (count of 1), reduce by summation
11 | counts = lines.map(lambda line: json.loads(line)) \
12 |              .flatMap(lambda tweet: tweet["text"].split()) \
13 |              .map(lambda word: (word, 1)) \
14 |              .reduceByKey(lambda a,b : a + b)
15 |
16 | # output the results (unsorted)
17 | output = counts.collect()
18 | for (word, count) in output:
19 |     print "{0}: {1}".format(word.encode("utf-8"), count)
20 |
--------------------------------------------------------------------------------
/sessions/session-10.md:
--------------------------------------------------------------------------------
1 | # Session Schedule - Week 10 #
2 |
3 | ## More about AWS, EMR, and working with Hadoop ##
4 |
5 | * [10] Admin / Q & A
6 | * [10] [Configuring a cluster with JSON + tiny bit on bootstrapping scripts](https://github.com/alexmilowski/data-science/tree/master/activities/emr-cluster)
7 | * [20] Project Status Sharing
8 | * [10] Making an AMI
9 | * [15] [Prime Multiplier Example](https://github.com/alexmilowski/data-science/tree/master/activities/emr-prime-multiplier) - scaling simple computations
10 | * [10] [Introduction to mrjob](https://docs.google.com/a/milowski.com/presentation/d/1ZUCg4oPnHYbRXNOMLE6NmUl1af8X0Q1DUVMT-iFEjMw/edit?usp=sharing)
11 | * [20] [OpenNEX climate data](https://github.com/alexmilowski/data-science/tree/master/activities/emr-opennex-climate-model) - mrjob from scratch
12 | * [5] Wrap-up
13 |
--------------------------------------------------------------------------------
/activities/crawling-the-crawl/start.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | KEYNAME=$1
3 | BUCKET=$2
4 | if [ -z $KEYNAME ] || [ -z $BUCKET ]; then
5 | echo "Usage: $(basename $0) key-name bucket-name"
6 | exit 1
7 | fi
8 | tmpname="/tmp/$(basename $0).bootstrap.$$.json"
9 | echo "[{\"Path\" : \"s3://$BUCKET/cc-bootstrap.sh\", \"Name\" : \"Common Crawl Bootstrap\", \"Args\" : [] }, { \"Path\":\"s3://elasticmapreduce/bootstrap-actions/configure-hadoop\",\"Args\":[\"-m\",\"mapred.map.max.attempts=1\"]} ]" > $tmpname
10 |
11 | AMI_VERSION=3.6.0
12 | CLUSTER=file://./cluster.json
13 | LOG_PATH=logs/
14 | TAG=emr
15 |
16 | aws emr create-cluster --ami-version $AMI_VERSION --ec2-attributes KeyName=$KEYNAME --instance-groups $CLUSTER --name "Crawl The Crawl Cluster" --log-uri s3://$BUCKET/$LOG_PATH --enable-debugging --tags Name=$TAG --bootstrap-actions file://$tmpname --applications "[]"
17 | rm -f $tmpname
--------------------------------------------------------------------------------
/activities/twitter-acquisition/search.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import tweepy
3 | import datetime
4 | import urllib
5 | import signal
6 | import json
7 |
8 | # Don't forget to install tweepy
9 | # pip install tweepy
10 |
11 | consumer_key = ""
12 | consumer_secret = ""
13 |
14 | access_token = ""
15 | access_token_secret = ""
16 |
17 | auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
18 | auth.set_access_token(access_token, access_token_secret)
19 |
20 | api = tweepy.API(auth_handler=auth,wait_on_rate_limit=True,wait_on_rate_limit_notify=True)
21 |
22 | q = urllib.quote_plus(sys.argv[1]) # URL encoded query
23 |
24 | # Additional query parameters:
25 | # since: {date}
26 | # until: {date}
27 | # Just add them to the 'q' variable: q+" since: 2014-01-01 until: 2014-01-02"
28 | for tweet in tweepy.Cursor(api.search,q=q).items(200):
29 |     # FYI: JSON is in tweet._json
30 |     print tweet._json
31 |
--------------------------------------------------------------------------------
/activities/emr-tweet-wordcount/tweetSplitter.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | import sys
3 | import re
4 |
5 | def main(argv):
6 |     pattern = re.compile("[a-zA-Z][a-zA-Z0-9]*")
7 |     for line in sys.stdin:
8 |         line = line.replace("..."," ")
9 |         line = line.replace("("," ")
10 |         line = line.replace(")"," ")
11 |         for word in line.split():
12 |             if len(word)<3 or word[0:5] == "http:" or word[0:6] == "https:" or word == "-":
13 |                 continue
14 |             if word[0] == "." or word[0] == "\"" or word[0] == "(":
15 |                 word = word[1:]
16 |             if word[-1] == "." or word[-1] == "," or word[-1] == "!" or word[-1] == ":" or word[-1] == "\"" or word[-1] == ")":
17 |                 word = word[0:-1]
18 |             if len(word)<3:
19 |                 continue
20 |             print "LongValueSum:" + word.lower() + "\t" + "1"
21 |
22 |
23 | if __name__ == "__main__":
24 |     main(sys.argv)
25 |
--------------------------------------------------------------------------------
/activities/sentiment-analysis/train.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import nltk
3 | import sets
4 | import pickle
5 |
6 | # Local
7 | import featureset
8 |
9 | wordlistFilename = sys.argv[1]
10 | rangeSpec = sys.argv[2].split(",")
11 | wordStart = int(rangeSpec[0])
12 | wordEnd = int(rangeSpec[1])
13 | outputFilename = sys.argv[3]
14 |
15 | featureWords = featureset.load(wordlistFilename,wordStart,wordEnd)
16 | print featureWords
17 |
18 | sys.stderr.write("Loading training data...");
19 |
20 | texts = []
21 |
22 | for line in sys.stdin:
23 |     parts = line.decode('utf-8').split("\n")[0].split("\t")
24 |     wordlist = list(featureset.wordlist(parts[1]))
25 |     texts.append((wordlist,parts[0]))
26 |
27 | extractFeatures = featureset.makeExtractor(featureWords)
28 |
29 | sys.stderr.write(" applying features ...");
30 | trainingSet = nltk.classify.apply_features(extractFeatures, texts)
31 |
32 | sys.stderr.write(" training classifier ...");
33 | classifier = nltk.NaiveBayesClassifier.train(trainingSet)
34 | sys.stderr.write(" done\n");
35 |
36 | f = open(outputFilename, 'wb')
37 | pickle.dump(classifier, f)
38 | f.close()
39 |
--------------------------------------------------------------------------------
/activities/sentiment-analysis/test.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import pickle
3 | import sets
4 | import nltk
5 |
6 | # Local
7 | import featureset
8 |
9 | classifierFilename = sys.argv[1]
10 | wordlistFilename = sys.argv[2]
11 | rangeSpec = sys.argv[3].split(",")
12 | wordStart = int(rangeSpec[0])
13 | wordEnd = int(rangeSpec[1])
14 |
15 | f = open(classifierFilename,"rb")
16 | classifier = pickle.load(f)
17 | f.close()
18 |
19 | featureWords = featureset.load(wordlistFilename,wordStart,wordEnd)
20 |
21 | reviews = []
22 |
23 | extractFeatures = featureset.makeExtractor(featureWords)
24 |
25 | count = 0
26 | missed = 0
27 | variance = 0;
28 | for line in sys.stdin:
29 |     parts = line.decode('utf-8').split("\n")[0].split("\t")
30 |     wordlist = list(featureset.wordlist(parts[1]))
31 |     c = classifier.classify(extractFeatures(wordlist))
32 |     a = parts[0]
33 |     count += 1
34 |     if c != a:
35 |         missed += 1
36 |         print str(count)+"\t"+a+"\t"+c+"\t"+(",".join(reduce(lambda l,w: l+[w] if w in featureWords else l,wordlist,[])))
37 |
38 | if count>0:
39 |     print "{0} % correct, {1}/{2} ".format(100* ((count-missed)*1.0 / count), (count-missed),count)
--------------------------------------------------------------------------------
/activities/emr-opennex-climate-model/seqs.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import math
3 |
4 | # This library calculates sequence numbers for quadrangles of a given size in degrees.
5 |
6 | # Example: 5° quadrangle sequence number
7 | # s = sequenceNumber(5,49.5,-126.0)
8 | #
9 | def sequenceNumber(size,lat,lon):
10 |     latMax = int(180 / size)
11 |     lonMax = int(360 / size)
12 |     nlat = 90 - lat
13 |     nlon = 360 + lon if lon < 0 else lon
14 |     s = int(math.floor(nlat/size)) * int(lonMax) + int(nlon / size) + 1
15 |     return s
16 |
17 | # Example: 5° quandrangle sequence numbers for a large rectangular region
18 | # quad = sequencesFromQuadrangle(5,[40,-125,35,-120])
19 | #
20 | def sequencesFromQuadrangle(size,quad):
21 |     quadBounds = [ sequenceNumber(size,quad[0],quad[1]), sequenceNumber(size,quad[0],quad[3]),
22 |                    sequenceNumber(size,quad[2],quad[3])]
23 |     width = int(quadBounds[1] - quadBounds[0] + 1)
24 |     lonMax = int(360 / size)
25 |
26 |     s = quadBounds[0]
27 |     # walk the region row by row, yielding the sequence numbers in each row
28 |     while s<quadBounds[2]+1:
29 |         for i in range(width):
30 |             yield s+i
31 |         s += lonMax
--------------------------------------------------------------------------------
/activities/sentiment-analysis/featureset.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import nltk
3 | import sets
4 |
5 | stopWords = set(nltk.corpus.stopwords.words('english'))
6 | lemmatizer = nltk.stem.WordNetLemmatizer()
7 |
8 | def wordlist(text):
9 |     for word in [e.lower() for e in nltk.word_tokenize(text) if len(e) >= 3 and not e.lower() in stopWords]:
10 |         if word == "n't":
11 |             word = "not"
12 |         if word == "'re":
13 |             word = "are"
14 |         if word == "'ve":
15 |             word = "have"
16 |         if word == "'ll":
17 |             word = "will"
18 |         word = lemmatizer.lemmatize(word)
19 |         yield word
20 |
21 |
22 | def load(filename,start,end):
23 |     featureWords = sets.Set()
24 |     input = open(filename,"r")
25 |     count = 0
26 |     for line in input:
27 |         count += 1
28 |         if count < start:
29 |             continue
30 |         if end>start and count > end:
31 |             break
32 |         parts = line.decode('utf-8').split("\n")[0].split("\t")
33 |         featureWords.add(parts[0])
34 |     input.close()
35 |     return featureWords
36 |
37 | def makeExtractor(featureWords):
38 |     def extractFeatures(document):
39 |         words = set(document)
40 |         features = {}
41 |         for word in featureWords:
42 |             features['contains(%s)' % word] = (word in words)
43 |         return features
44 |     return extractFeatures
45 |
--------------------------------------------------------------------------------
/sessions/session-6.md:
--------------------------------------------------------------------------------
1 | # Session Schedule - Week 6 #
2 |
3 | * [5] intro
4 | * [10] project wrangling (joint)
5 | * [20] project group discussion (breakout)
6 |
7 | 1. define the goals of the project in more detail
8 | 2. details of data acquisition
9 | 3. proposed analytics and tools
10 | 4. prepare a short slide deck of issues/concerns to share when you return
11 |
12 | * [10 - 20] project sharing / planning (joint)
13 |
14 | * proposal due the following week (week 7)
15 | * include details from the breakout
16 | * 2 pages: problem, data sources, proposed analytics, may include technical architecture
17 |
18 | * [5] ER diagrams in Gliffy
19 | * [10] db ER activity (breakout)
20 |
21 | * How would you store tweet data in a relational database?
22 | * Consider storing hash tags, tweet text, language, handle, and user information
23 | * Draw an ER diagram of your proposed model
24 |
25 | * [10] ER share / discussion (joint)
26 | * [10] sqlite activity (breakout)
27 |
28 | * Create a set of table definitions from your ER model for the tweet data.
29 | * Create a database and load your sample data.
30 | * Execute some sample queries:
31 | 1. all tweets for a particular user
32 | 2. all users who used a particular hashtag
33 | * Create a histogram of hashtags used by querying the database
34 |
35 | * [10] class planning / wrap-up (joint)
36 |
37 |
--------------------------------------------------------------------------------
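The sqlite activity above asks for table definitions, sample data, and a few queries over tweet data. A minimal sketch of one way to do that with Python's built-in sqlite3 module follows; the two-table layout and the column names are illustrative assumptions, not something prescribed by the session notes.

```python
import sqlite3

conn = sqlite3.connect("tweets.db")
c = conn.cursor()

# table definitions: one row per tweet, one row per (tweet, hashtag) pair
c.execute("""CREATE TABLE IF NOT EXISTS tweet (
                id INTEGER PRIMARY KEY,
                handle TEXT,
                language TEXT,
                text TEXT)""")
c.execute("""CREATE TABLE IF NOT EXISTS hashtag (
                tweet_id INTEGER REFERENCES tweet(id),
                tag TEXT)""")

# load some sample data
c.execute("INSERT INTO tweet VALUES (1,'alex','en','Counting words with #mrjob on #AWS')")
c.executemany("INSERT INTO hashtag VALUES (?,?)", [(1,'mrjob'),(1,'AWS')])
conn.commit()

# all tweets for a particular user
for (text,) in c.execute("SELECT text FROM tweet WHERE handle=?", ('alex',)):
    print text

# all users who used a particular hashtag
for (handle,) in c.execute("""SELECT DISTINCT t.handle FROM tweet t
                              JOIN hashtag h ON h.tweet_id=t.id WHERE h.tag=?""", ('AWS',)):
    print handle

# histogram of hashtags
for (tag, count) in c.execute("SELECT tag, COUNT(*) FROM hashtag GROUP BY tag ORDER BY COUNT(*) DESC"):
    print tag, count

conn.close()
```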
/activities/common-crawl/test-10.warc:
--------------------------------------------------------------------------------
1 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00000-ip-10-180-136-8.ec2.internal.warc.gz
2 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00001-ip-10-180-136-8.ec2.internal.warc.gz
3 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00002-ip-10-180-136-8.ec2.internal.warc.gz
4 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00003-ip-10-180-136-8.ec2.internal.warc.gz
5 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00004-ip-10-180-136-8.ec2.internal.warc.gz
6 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00005-ip-10-180-136-8.ec2.internal.warc.gz
7 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00006-ip-10-180-136-8.ec2.internal.warc.gz
8 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00007-ip-10-180-136-8.ec2.internal.warc.gz
9 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00008-ip-10-180-136-8.ec2.internal.warc.gz
10 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00009-ip-10-180-136-8.ec2.internal.warc.gz
11 |
--------------------------------------------------------------------------------
/assignments/tweet-acquisition/README.md:
--------------------------------------------------------------------------------
1 | # Acquiring and Storing Social Media Data #
2 |
3 | ## A Hypothetical Scenario ##
4 |
5 | Minecraft is a popular game throughout the world that was
6 | [acquired last year (2014) by Microsoft](https://mojang.com/2014/09/yes-were-being-bought-by-microsoft/). We'd like to
7 | assess the current sentiment of the acquisition by examining social media data. Twitter is an obvious and easy choice
8 | as a place to start.
9 |
10 |
11 | ## Acquisition Task ##
12 |
13 | Acquire relevant data around the Microsoft / Mojang acquisition for a recent week. To accomplish this, do the following:
14 |
15 | 1. Write an acquisition program that can acquire tweets for a specific date using the Tweepy python package. The program should pull tweets
16 | for the #microsoft and #mojang hash tags simultaneously.
17 |
18 | 2. Run your data acquisition over a week-long period. You should chunk your data as appropriate and give yourself the ability to re-run the process reliably in case of failures.
19 |
20 | 3. Organize the resulting raw data into a set of tweets and store these tweets into S3.
21 |
22 | 4. Analyze the tweets by producing a histogram (a graph) of the words.
23 |
24 |
25 | ## What to Turn In ##
26 |
27 | 1. A link to your S3 bucket documented in your README.md file. Make sure to make it publicly accessible.
28 |
29 | 2. Your twitter acquisition code.
30 |
31 | 3. The histogram.
32 |
33 |
34 |
--------------------------------------------------------------------------------
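For the acquisition step, the course's own twitter-acquisition/search.py and partitions.py can be combined into a day-by-day loop so that a failed day can be re-run without repeating the others. The sketch below follows that pattern; the credentials are placeholders, and the query string, per-day tweet limit, and output file naming are illustrative assumptions.

```python
import sys
import json
import datetime
import tweepy

consumer_key = "..."
consumer_secret = "..."
access_token = "..."
access_token_secret = "..."

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth_handler=auth,wait_on_rate_limit=True,wait_on_rate_limit_notify=True)

# chunk the acquisition by day, starting from the date on the command line
start = datetime.datetime.strptime(sys.argv[1],"%Y-%m-%d")
for n in range(7):
    day = start + datetime.timedelta(days=n)
    until = day + datetime.timedelta(days=1)
    # both hash tags in one query, restricted to a single day
    q = "#microsoft OR #mojang since:{0:%Y-%m-%d} until:{1:%Y-%m-%d}".format(day,until)
    out = open("tweets-{0:%Y-%m-%d}.json".format(day),"w")
    for tweet in tweepy.Cursor(api.search,q=q).items(1000):
        # one JSON object per line, ready for later word counting
        out.write(json.dumps(tweet._json))
        out.write("\n")
    out.close()
```

Each day's file can then be copied to S3 with the data-munging/s3copy.py script and word-counted in the same way as the emr-tweet-wordcount activity.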
/activities/decision-trees/regions.py:
--------------------------------------------------------------------------------
1 | from matplotlib.colors import ListedColormap
2 | import matplotlib.pyplot as plt
3 | import numpy as np
4 |
5 | def plot(X, y, classifier,test_idx=None, resolution=0.02):
6 |
7 |     # setup marker generator and color map
8 |     markers = ('s', 'x', 'o', '^', 'v')
9 |     colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan')
10 |     cmap = ListedColormap(colors[:len(np.unique(y))])
11 |
12 |     # plot the decision surface
13 |     x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
14 |     x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
15 |     xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
16 |                            np.arange(x2_min, x2_max, resolution))
17 |     Z = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T)
18 |     Z = Z.reshape(xx1.shape)
19 |     plt.contourf(xx1, xx2, Z, alpha=0.4, cmap=cmap)
20 |     plt.xlim(xx1.min(), xx1.max())
21 |     plt.ylim(xx2.min(), xx2.max())
22 |
23 |     # plot all samples
24 |     for idx, cl in enumerate(np.unique(y)):
25 |         plt.scatter(x=X[y == cl, 0], y=X[y == cl, 1],
26 |                     alpha=0.8, c=cmap(idx),
27 |                     marker=markers[idx], label=cl)
28 |
29 |     # highlight test samples
30 |     if test_idx:
31 |         X_test, y_test = X[test_idx, :], y[test_idx]
32 |         plt.scatter(X_test[:, 0], X_test[:, 1], c='',
33 |                     alpha=1.0, linewidths=1, marker='o',
34 |                     s=55, label='test set')
35 |
--------------------------------------------------------------------------------
/activities/emr-prime-multiplier/prime-factors.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | import sys
3 |
4 | # Appends a single next prime
5 | def appendPrime(primes):
6 |     p = primes[-1]
7 |     prime = False
8 |     while not prime:
9 |         p += 1
10 |         divisor = False
11 |         for i in range(0,len(primes)):
12 |             if p % primes[i] == 0:
13 |                 divisor = True
14 |                 break
15 |         if not divisor:
16 |             prime = True
17 |             primes.append(p)
18 |     return primes
19 |
20 |
21 | # an initial set of primes
22 | primes = [2, 3, 5]
23 | # an initial array of zeros of the same length
24 | counts = [0 for i in range(len(primes))]
25 |
26 | # for each line of input, factor the input
27 | for line in sys.stdin:
28 |
29 |     # Parse the integer and skip zeros
30 |     i = int(line)
31 |     if (i==0):
32 |         continue
33 |
34 |     # Factor until we reach 1
35 |     p = 0
36 |     while i!=1:
37 |         #print i,p,i % primes[p],counts
38 |
39 |         # compute exponent for current prime
40 |         while i!=1 and i % primes[p] == 0:
41 |             i = i / primes[p]
42 |             counts[p] += 1
43 |
44 |         # increment prime
45 |         p += 1
46 |
47 |         # if we aren't at one but have run out of primes, find the next prime to factor
48 |         if i!=1 and p==len(primes):
49 |             appendPrime(primes)
50 |             counts.append(0)
51 |
52 | # Output the counts for each prime
53 | for i in range(len(primes)):
54 |     if counts[i]>0:
55 |         print "LongValueSum:",primes[i],"\t",counts[i]
--------------------------------------------------------------------------------
/activities/emr-opennex-climate-model/average.py:
--------------------------------------------------------------------------------
1 | from mrjob.job import MRJob
2 | import sys
3 | import os
4 | import json
5 | import math
6 |
7 | # Computes the average correctly for a given input dataset
8 | class Average(MRJob):
9 |
10 |     # Loads the JSON object and yields yearMonth -> (length,average)
11 |     def average_partition(self, _, line):
12 |         obj = json.loads(line)
13 |         #print obj["yearMonth"],(len(obj["data"]),sum(obj["data"])/len(obj["data"]))
14 |         yield obj["yearMonth"],(len(obj["data"]),sum(obj["data"])/len(obj["data"]))
15 |
16 |     # Combines sequence number averages for particular year+month
17 |     def average_month(self, yearMonth, countAverage):
18 |         sum = 0
19 |         total = 0
20 |         for count,value in countAverage:
21 |             sum += count*value
22 |             total += count
23 |         #print yearMonth,(total,sum/total)
24 |         yield "month",(total,sum/total)
25 |
26 |     # Computes the average over the year/month data keeping track of counts
27 |     def average(self,_,averageData):
28 |         sum = 0
29 |         total = 0
30 |         for count,average in averageData:
31 |             sum += count*average
32 |             total += count
33 |         #print "average",sum/total
34 |         yield "average",sum/total
35 |
36 |     # Define a 1-step job with a mapper, combiner, and reducer
37 |     def steps(self):
38 |         return [
39 |             self.mr(mapper=self.average_partition,
40 |                     combiner=self.average_month,
41 |                     reducer=self.average)
42 |         ]
43 |
44 |
45 | if __name__ == '__main__':
46 |     Average.run()
47 |
--------------------------------------------------------------------------------
/activities/common-crawl/mrcc.py:
--------------------------------------------------------------------------------
1 | import gzip
2 | #
3 | import boto
4 | import warc
5 | #
6 | from boto.s3.key import Key
7 | from gzipstream import GzipStreamFile
8 | from mrjob.job import MRJob
9 |
10 |
11 | class CCJob(MRJob):
12 |     def configure_options(self):
13 |         super(CCJob, self).configure_options()
14 |         self.add_passthrough_option('--source',help="Source location of the common crawl data (s3 or file)")
15 |
16 |     def process_record(self, record):
17 |         """
18 |         Override process_record with your mapper
19 |         """
20 |         raise NotImplementedError('Process record needs to be customized')
21 |
22 |     def mapper(self, _, line):
23 |         f = None
24 |         ## If we're on EC2 or running on a Hadoop cluster, pull files via S3
25 |         if self.options.source in ['s3' ]:
26 |             print 'Downloading ...'
27 |             # Connect to Amazon S3 using anonymous credentials
28 |             conn = boto.connect_s3(anon=True)
29 |             pds = conn.get_bucket('aws-publicdatasets')
30 |             # Start a connection to one of the WARC files
31 |             k = Key(pds, line)
32 |             f = warc.WARCFile(fileobj=GzipStreamFile(k))
33 |         ## If we're local, use files on the local file system
34 |         else:
35 |             print 'Loading local file {}'.format(line)
36 |             f = warc.WARCFile(fileobj=gzip.open(line))
37 |         ###
38 |         for i, record in enumerate(f):
39 |             for key, value in self.process_record(record):
40 |                 yield key, value
41 |             self.increment_counter('commoncrawl', 'processed_records', 1)
42 |
43 |     # TODO: Make the combiner use the reducer by default
44 |     def combiner(self, key, value):
45 |         yield key, sum(value)
46 |
47 |     def reducer(self, key, value):
48 |         yield key, sum(value)
49 |
--------------------------------------------------------------------------------
/activities/emr-opennex-climate-model/date_partitions.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import datetime
3 |
4 | # A date partition library
5 |
6 | xsdDatetimeFormat = "%Y-%m-%dT%H:%M:%S"
7 | xsdDateFormat = "%Y-%m-%d"
8 |
9 | # A generator for date/times based on durations
10 | #
11 | def datetime_partition(start,end,duration):
12 |     current = start
13 |     while start==current or (end-current).days > 0 or ((end-current).days==0 and (end-current).seconds>0):
14 |         yield current
15 |         current = current + duration
16 |
17 | # A generator for months given a start and end month.
18 | #
19 | # Example: Generates the months from 2015-03 to 2016-03
20 | #
21 | # months = month_partition(datetime.datetime(2015,3,1),datetime.datetime(2016,3,1))
22 | #
23 | def month_partition(start,end):
24 |     current = datetime.datetime(start.year,start.month,1)
25 |     while current.year<end.year or (current.year==end.year and current.month<=end.month):
26 |         yield current
27 |         current = datetime.datetime(current.year,current.month+1,1) if current.month<12 else datetime.datetime(current.year+1,1,1)
--------------------------------------------------------------------------------
/activities/sentiment-analysis/candy-corn.py:
--------------------------------------------------------------------------------
39 |     words = [e.lower() for e in sentence.split() if len(e) >= 3 and not e.lower() in stopWords]
40 |     input.append((words, sentiment))
41 |
42 | print input
43 |
44 |
45 | # Get an ordered list of most frequently used words
46 | def getAllWords(input):
47 |     all = []
48 |     for (words, sentiment) in input:
49 |         all.extend(words)
50 |     return all
51 |
52 | print
53 |
54 | wordlist = nltk.FreqDist(getAllWords(input))
55 | print wordlist.pprint(100)
56 | wordFeatures = wordlist.keys()
57 |
58 | def extractFeatures(document):
59 |     words = set(document)
60 |     features = {}
61 |     for word in wordFeatures:
62 |         features['contains(%s)' % word] = (word in words)
63 |     return features
64 |
65 | trainingSet = nltk.classify.apply_features(extractFeatures, input)
66 |
67 | classifier = nltk.NaiveBayesClassifier.train(trainingSet)
68 |
69 | print
70 | for sentence in data:
71 |     print classifier.classify(extractFeatures(sentence.split())),": ",sentence
72 |
73 |
74 |
75 |
76 |
77 |
78 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Introduction to Data Science
2 |
3 | Data Science is a process of acquiring, organizing, analyzing, and representing information. The activities in the process focus primarily on learning various technologies in support of each of the major stages of any data science process.
4 |
5 | 
6 |
7 | ## Activities
8 |
9 | Guided activities provide you with examples of various technologies and how they can be used for particular tasks. Each activity can be completed as a self-study.
10 |
11 | See the [Activities](./activities/) section for a list of various self-guided studies.
12 |
13 | ## Sessions
14 |
15 | An agenda for each [session](./sessions/) is available.
16 |
17 | ## Assignments
18 |
19 | If you are a student in one of my classes, you'll be using github to turn in assignments.
20 |
21 | ### Setup
22 |
23 | You'll need to create a repository for your assignments. You can [request a discount](https://education.github.com/discount_requests/new) as a
24 | student so that you can have a private repository. Otherwise, all your assignments will
25 | be publicly accessible.
26 |
27 | Once you've set up your repository, you'll want to add your instructor as a collaborator. That way they can merge your
28 | pull requests when they are graded.
29 |
30 | ### Turning in Assignments
31 |
32 | 1. [Create a branch](https://help.github.com/articles/creating-and-deleting-branches-within-your-repository/) of your repository for the homework and make your changes in that branch.
33 | 2. Commit and push your changes to the branch.
34 | 3. [Create a pull request](https://help.github.com/articles/creating-a-pull-request/) for the code you'd like to turn in.
35 | 4. Your instructor can now view the pull request and grade the assignment.
36 | 5. Once your instructor has graded the assignment, they can merge the pull request as a final notification.
37 | 6. You can now delete the branch as the changes have been merged with the master.
38 |
--------------------------------------------------------------------------------
/assignments/getting-started/README.md:
--------------------------------------------------------------------------------
1 | # Getting Started #
2 |
3 | This assignment will step you through the process of running a simple computation over a data set using Map/Reduce via mrjob. The goal
4 | of the assignment is to have you walk through the process of using git, github, python, mrjob, and AWS and ensure you are set up with
5 | all the various tools and services.
6 |
7 | ## Recommended Readings ##
8 |
9 | * [Getting started with Amazon AWS video tutorials](http://aws.amazon.com/getting-started/)
10 | * [Introduction to AWS training](https://www.youtube.com/playlist?list=PLhr1KZpdzukcMmx04RbtWuQ0yYOp1vQi4)
11 | * [A Comparison of Clouds: Amazon Web Services, Windows Azure, Google Cloud Platform, VMWare and Others](http://pages.cs.wisc.edu/~akella/CS838/F12/notes/Cloud_Providers_Comparison.pdf)
12 | * [A Survey on Cloud Provider Security Measures](http://www.cs.ucsb.edu/~koc/ns/projects/12Reports/PucherDimopoulos.pdf)
13 |
14 | ## Tasks ##
15 |
16 | ### Part 1 ###
17 |
18 | Note: Keep track of the time necessary to run the process locally. For Linux/Mac users, you can use the `time` command to compute this.
19 |
20 | 1. Follow the instructions at https://github.com/alexmilowski/data-science/tree/master/activities/common-crawl to get setup with the tools and code.
21 | 2. Run the process locally on your computer.
22 |
23 | ### Part 2 ###
24 |
25 | 1. Follow the process for running the tag counter on AWS EMR.
26 | 2. Download the output from S3.
27 |
28 | ## What to Turn In ##
29 |
30 | You must turn in a pull request containing the following:
31 |
32 | 1. A copy of the output directory for the tag counter running locally (name the directory 'out').
33 | 2. A copy of the output from S3 for the tag counter running on AWS (name the directory 'emr-out').
34 | 3. How long did it take to run the process for each of these?
35 | 4. How many `address` tags are there in the input?
36 | 5. Does the local version and EMR version give the same answer?
37 |
38 | Please submit the answers to 3-5 in a text file called `answers.txt`.
39 |
40 |
--------------------------------------------------------------------------------
/activities/common-crawl/test-15.warc:
--------------------------------------------------------------------------------
1 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00000-ip-10-180-136-8.ec2.internal.warc.gz
2 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00001-ip-10-180-136-8.ec2.internal.warc.gz
3 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00002-ip-10-180-136-8.ec2.internal.warc.gz
4 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00003-ip-10-180-136-8.ec2.internal.warc.gz
5 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00004-ip-10-180-136-8.ec2.internal.warc.gz
6 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00005-ip-10-180-136-8.ec2.internal.warc.gz
7 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00006-ip-10-180-136-8.ec2.internal.warc.gz
8 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00007-ip-10-180-136-8.ec2.internal.warc.gz
9 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00008-ip-10-180-136-8.ec2.internal.warc.gz
10 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00009-ip-10-180-136-8.ec2.internal.warc.gz
11 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00010-ip-10-180-136-8.ec2.internal.warc.gz
12 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00011-ip-10-180-136-8.ec2.internal.warc.gz
13 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00012-ip-10-180-136-8.ec2.internal.warc.gz
14 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00013-ip-10-180-136-8.ec2.internal.warc.gz
15 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00014-ip-10-180-136-8.ec2.internal.warc.gz
16 |
--------------------------------------------------------------------------------
/activities/crawling-the-crawl/mrcc.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 |
3 | import gzip
4 | import sys
5 | #
6 | import boto
7 | import warc
8 | #
9 | from boto.s3.key import Key
10 | from gzipstream import GzipStreamFile
11 | from mrjob.job import MRJob
12 |
13 |
14 | class CCJob(MRJob):
15 |
16 | def process_record(self, record):
17 | """
18 | Override process_record with your mapper
19 | """
20 | raise NotImplementedError('Process record needs to be customized')
21 |
22 | def mapper(self, _, line):
23 | f = None
24 | ## If we're on EC2 or running on a Hadoop cluster, pull files via S3
25 | if line.startswith("s3://"):
26 |
27 | print('Downloading ...',file=sys.stderr)
28 | key = None
29 |
30 | # Connect to Amazon S3 using anonymous credentials
31 | conn = boto.connect_s3(anon=True)
32 | if line.startswith("s3://"):
33 | pathStart = line.index('/',5)
34 | bucketName = line[5:pathStart]
35 | keyPath = line[pathStart+1:]
36 | print("Bucket: "+bucketName,file=sys.stderr)
37 | print("Key: "+keyPath,file=sys.stderr)
38 | bucket = conn.get_bucket(bucketName)
39 | key = Key(bucket,keyPath)
40 | else:
41 | print("Bucket: aws-publicdatasets",file=sys.stderr)
42 | print("Key: "+line,file=sys.stderr)
43 | bucket = conn.get_bucket("aws-publicdatasets")
44 | key = Key(bucket,line)
45 | # Start a connection to one of the WARC files
46 | f = warc.WARCFile(fileobj=GzipStreamFile(key))
47 |
48 | ## If we're local, use files on the local file system
49 | else:
50 | if line.startswith("file:///"):
51 | line = line[7:]
52 | print("Local: {}".format(line),file=sys.stderr)
53 | f = warc.WARCFile(fileobj=gzip.open(line))
54 | ###
55 | for i, record in enumerate(f):
56 | for key, value in self.process_record(record):
57 | yield key, value
58 | self.increment_counter('commoncrawl', 'processed_records', 1)
59 |
60 | def reducer(self, key, value):
61 | yield key, sum(value)
62 |
--------------------------------------------------------------------------------
/activities/emr-tweet-wordcount/README.md:
--------------------------------------------------------------------------------
1 |
2 | # Word Counts for Tweets #
3 |
4 | This example shows running the word count example over tweets.
5 |
6 | # Setup #
7 |
8 | If you don't have a cluster running, you'll need to start one (see main setup page). You also need a bucket for the code, input, and output.
9 |
10 | # Running the Example #
11 |
12 | In this example, we'll use a sample set of 1995 tweets with the word 'Microsoft' in them.
13 |
14 | ## Step 1 ##
15 |
16 | The tweets are stored as JSON and we'll need to extract the tweet text and create an input with one line per tweet. The `format-tweets.py` program
17 | does this:
18 |
19 | mkdir -p tweet-wc/input
20 | python format-tweets.py microsoft-2014-10-07.json > tweet-wc/input/tweets.txt
21 |
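The exact extraction depends on how the tweets were serialized; assuming the input file is a JSON array of raw tweet objects (an assumption; the repository's `format-tweets.py` may differ), a minimal sketch looks like:

    import json
    import sys

    # a minimal sketch, assuming the file is a JSON array of raw tweet objects;
    # the repository's format-tweets.py may differ
    with open(sys.argv[1]) as f:
        tweets = json.load(f)

    for tweet in tweets:
        # one tweet per output line, with embedded newlines flattened
        print tweet["text"].replace("\n", " ").encode("utf-8")
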
22 | Now we need to store the input:
23 |
24 | aws s3 sync tweet-wc s3://mybucket/tweet-wc/
25 |
26 | ## Step 2 ##
27 |
28 | We need to store the word count program:
29 |
30 | aws s3 cp tweetSplitter.py s3://mybucket/
31 |
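A streaming mapper that works with the built-in `aggregate` reducer only needs to emit `LongValueSum:` keys; here is a minimal sketch in the spirit of `tweetSplitter.py` (the repository's version may differ):

    #!/usr/bin/env python
    # read tweet text from stdin and emit one LongValueSum key per word so
    # that the 'aggregate' reducer sums the counts
    import sys

    for line in sys.stdin:
        for word in line.strip().lower().split():
            print "LongValueSum:%s\t1" % word
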
32 | ## Step 3 ##
33 |
34 | Now we add the streaming step to do the work:
35 |
36 | aws emr add-steps --cluster-id --steps Type=STREAMING,Name='Tweet Word Count',ActionOnFailure=CONTINUE,Args=--files,s3://mybucket/tweetSplitter.py,-mapper,tweetSplitter.py,-reducer,aggregate,-input,s3://mybucket/tweet-wc/input,-output,s3://mybucket/tweet-wc/output
37 |
38 | Note: don't forget to use your cluster id and bucket name in the above.
39 |
40 | This command returns the step id that you can use for further monitoring. If you use an 'm1.medium' instance type, this job should take 1 minute to process and 3 minutes of elapsed time.
41 |
42 | You can monitor its progress from the console or via:
43 |
44 | aws emr describe-step --cluster-id --step-id
45 |
46 | ## Step 4 ##
47 |
48 | Sync the output:
49 |
50 | aws s3 sync s3://mybucket/tweet-wc/output/ tweet-wc/output/
51 |
52 | You should now have 4 files:
53 |
54 | tweet-wc/output/_SUCCESS
55 | tweet-wc/output/part-00000
56 | tweet-wc/output/part-00001
57 | tweet-wc/output/part-00002
58 |
59 | The output is a list of word counts split amongst the part-nnnnn files.
60 |
--------------------------------------------------------------------------------
/activities/emr-map-only/README.md:
--------------------------------------------------------------------------------
1 | # Map Task Input Splitting #
2 |
3 | This example shows how map tasks get their input from splitting the input files. In this example, we'll
4 | just count the lines received via a map-only step (i.e., no reduce step) and the output will just consist
5 | of that count. You'll see the output of each map task and how much of the input it received.
6 |
7 | # Setup #
8 |
9 | If you don't have a cluster running, you'll need to start one (see main setup page). You also need a bucket for the code, input, and output.
10 |
11 | # Running the Example #
12 |
13 | ## Step 1 ##
14 |
15 | You'll need to set up input to run the job and so we'll create a directory with some input:
16 |
17 | mkdir -p job/input
18 | python generate-input.py job/input/test 3 1000
19 |
20 | This will create three test files. We will process this with a simple map-only task to show you how input
21 | is split.
22 |
23 | Now we need to store the input:
24 |
25 | aws s3 sync job s3://mybucket/job/
26 |
27 | ## Step 2 ##
28 |
29 | We need to store the line count program:
30 |
31 | aws s3 cp line-count.py s3://mybucket/
32 |
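A map-only line counter is tiny; here is a sketch of what such a mapper could look like (the repository's `line-count.py` may differ):

    #!/usr/bin/env python
    # count the lines this map task receives on stdin; with -reducer NONE,
    # each map task's count appears directly in the output
    import sys

    count = 0
    for line in sys.stdin:
        count += 1

    print count
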
33 | ## Step 3 ##
34 |
35 | Now we add the streaming step to do the work:
36 |
37 | aws emr add-steps --cluster-id --steps Type=STREAMING,Name='Map Line Count',ActionOnFailure=CONTINUE,Args=--files,s3://mybucket/line-count.py,-mapper,line-count.py,-reducer,NONE,-input,s3://mybucket/job/input,-output,s3://mybucket/job/output
38 |
39 | Note: don't forget to use your cluster id and bucket name in the above.
40 |
41 | This command returns the step id that you can use for further monitoring. If you use an 'm1.medium' instance type, this job should take 1 minute to process and 3 minutes of elapsed time.
42 |
43 | You can monitor its progress from the console or via:
44 |
45 | aws emr describe-step --cluster-id --step-id
46 |
47 | ## Step 4 ##
48 |
49 | Sync the output:
50 |
51 | aws s3 sync s3://mybucket/job/output/ job/output/
52 |
53 | You should now have 7 files:
54 |
55 | job/output/_SUCCESS
56 | job/output/part-00000
57 | job/output/part-00001
58 | job/output/part-00002
59 | job/output/part-00003
60 | job/output/part-00004
61 | job/output/part-00005
62 |
63 |
--------------------------------------------------------------------------------
/activities/emr-opennex-climate-model/by-sequences.py:
--------------------------------------------------------------------------------
1 | from mrjob.job import MRJob
2 | import sys
3 | import os
4 | import json
5 | import math
6 | import datetime
7 |
8 | import seqs
9 | import date_partitions as partitions
10 |
11 | # Gathers data based on traversing sequence numbers for a given region and period of time
12 | class ListSequences(MRJob):
13 |
14 | # Add a data directory for the data on disk
15 | def configure_options(self):
16 | super(ListSequences, self).configure_options()
17 | self.add_passthrough_option('--data-dir',help="The directory where the data is stored.")
18 |
19 | # Yields the set of sequence numbers for each year/month for the requested region
20 | def year_seq(self,_,line):
21 | if line[0] == '#':
22 | return
23 |
24 | args = line.rstrip().split(",");
25 |
26 | quad = [ float(args[0]), float(args[1]),
27 | float(args[2]), float(args[3]) ]
28 | size = int(args[4])
29 | startYear = int(args[5])
30 | startMonth = int(args[6])
31 | endYear = int(args[7])
32 | endMonth = int(args[8])
33 |
34 | for month in partitions.month_partition(datetime.datetime(startYear,startMonth,1),datetime.datetime(endYear,endMonth,1)):
35 | for seq in seqs.sequencesFromQuadrangle(size / 120.0,quad):
36 | yield "{}-{:02d}".format(month.year,month.month),(size,seq)
37 |
38 | # Computes the average for a year/month + quadrangle + sequence number by loading the data (JSON)
39 | def average_quadrangle(self, yearMonth, quadSpec):
40 | size,seq = quadSpec
41 | fileName = self.options.data_dir+(os.sep if self.options.data_dir[-1]!=os.sep else "")+yearMonth+"-"+str(size)+"-"+str(seq)+".json"
42 | if os.path.exists(fileName):
43 | f = open(fileName,"r")
44 | obj = json.load(f)
45 | f.close()
46 | yield yearMonth,(1,len(obj["data"]))
47 |
48 | # Defines the job as a 2-step map-only job
49 | def steps(self):
50 | return [
51 | self.mr(mapper=self.year_seq,
52 | reducer=None),
53 | self.mr(mapper=self.average_quadrangle,
54 | reducer=None)
55 | ]
56 |
57 |
58 | if __name__ == '__main__':
59 | ListSequences.run()
60 |
61 |
--------------------------------------------------------------------------------
/assignments/organizing-tweets/README.md:
--------------------------------------------------------------------------------
1 | # Organizing Acquired Data #
2 |
3 | In this assignment we will be organizing the information like that acquired in
4 | [Acquiring and Storing Social Media Data](../tweet-acquisition). In fact,
5 | we will be organizing it in three different ways and contrasting how the various
6 | storage systems can be used to accomplish a particular task.
7 |
8 | The subject of this assignment is the tweet data that was acquired from a conference:
9 |
10 | * [prague-2015-02-14.json](prague-2015-02-14.json)
11 | * [prague-2015-02-15.json](prague-2015-02-15.json)
12 |
13 | Note: The time of the conference is CET (+01:00) timezone.
14 |
15 | We need to answer the following questions by "querying" the data:
16 |
17 | 1. Who tweeted the most during the conference?
18 | 2. What were the top 10 hash tags used?
19 | 3. For a particular hour, how many tweets were produced?
20 |
21 | We are going to answer these questions using three different database storage technologies:
22 |
23 | * Key/Value — [AWS S3](http://aws.amazon.com/s3/)
24 | * NoSQL Database — [Mongo](https://www.mongodb.org) or [MarkLogic](http://www.marklogic.com)
25 | * Relational Database — SQLite, MySQL, etc.
26 |
27 | ## Tasks ##
28 |
29 | As you look at the following tasks, keep in mind that you don't need all the raw information from the tweet
30 | data as provided from Twitter. That is, you do not need to model or store all the raw information but just
31 | that which is sufficient to answer the three questions.
32 |
33 | 1. Draw a UML ER diagram of how you would model the information extracted from the raw tweet data.
34 | 2. For each database category of Key/Value, NoSQL, and Relational, describe a systems architecture that contains:
35 | 1. Your implementation model of how data is actually organized (e.g. a schema, collection structure, etc.).
36 | 2. The process necessary to store the information into your implementation model.
37 | 3. Pseudo-code / procedures that describe how you would answer each of the questions.
38 | 3. For just one of the database categories, implement your architecture.
39 |
40 | ## What to turn in ##
41 |
42 | 1. Your UML ER diagram.
43 | 2. A document for each of the database categories for task #2.
44 | 3. Your implementation code for task #3.
45 | 4. The answers for each of the three questions. Please provide answers for the hours 9:00+01:00 through 16:00+01:00 on both days.
--------------------------------------------------------------------------------
/activities/sentiment-analysis/candy-corn.py:
--------------------------------------------------------------------------------
1 | import nltk
2 |
3 | # negative
4 | negative = [
5 | ("We're all aware by now that Candy corn is evil","nasty"),
6 | ("Candy corn is so bad for you","nasty"),
7 | ("If you eat candy corn... I guess you would eat crayons, candles and ear wax too","nasty"),
8 | ("Candy corn is nasty","nasty"),
9 | ("Never not horrified by candy corn.","nasty")
10 | ]
11 |
12 | # positive
13 | positive = [
14 | ("I'm craving candy corn","best"),
15 | ("I still love candy corn","best"),
16 | ("Yes, I tweet candy corn and not broccoli. You know why? Because candy corn is more exciting.","best"),
17 | ("Autumn candy corn. So sweet; so good; so sticky. I taste no regrets.","best"),
18 | ("I love candy corn","best"),
19 | ("Candy corn is good","best")
20 | ]
21 |
22 | # Test
23 | tests = [
24 | "Now's as good a time as any to remind you candy corn is the worst and if you like it you have a deep personal failing that needs examining.", #nasty
25 | "Candy corn is my favorite candy on Halloween", #best
26 | "Candy corn is sugar and wax - nasty", #nasty
27 | "Can't get enough candy corn love", #best
28 | "Candy corn is evil", #nasty
29 | "Candy corn is bad candy" # nasty
30 | ]
31 |
32 | # words we will exclude
33 | stopWords = [
34 | "candy",
35 | "corn",
36 | "and",
37 | "not",
38 | "the",
39 | "...",
40 | "'re"
41 | ]
42 |
43 | # process the texts into a training set of words
44 | texts = []
45 | for (tweet, sentiment) in positive + negative:
46 | words = [e.lower() for e in nltk.word_tokenize(tweet) if len(e) >= 3 and not e.lower() in stopWords]
47 | texts.append((words, sentiment))
48 |
49 | print texts
50 |
51 |
52 | # Get an ordered list of most frequently used words
53 | def getAllWords(texts):
54 | all = []
55 | for (words, sentiment) in texts:
56 | all.extend(words)
57 | return all
58 |
59 | print
60 |
61 | wordlist = nltk.FreqDist(getAllWords(texts))
62 | wordlist.pprint(100)
63 | wordFeatures = wordlist.keys()
64 |
65 | def extractFeatures(document):
66 | words = set(document)
67 | features = {}
68 | for word in wordFeatures:
69 | features['contains(%s)' % word] = (word in words)
70 | return features
71 |
72 | trainingSet = nltk.classify.apply_features(extractFeatures, texts)
73 |
74 | classifier = nltk.NaiveBayesClassifier.train(trainingSet)
75 |
76 | print
77 | for tweet in tests:
78 | print classifier.classify(extractFeatures(tweet.split())),": ",tweet
79 |
80 |
81 |
82 |
83 |
84 |
85 |
--------------------------------------------------------------------------------
/activities/README.md:
--------------------------------------------------------------------------------
1 | # Activities
2 |
3 |
4 | ## Examples
5 |
6 | ### Common Crawl Exemplar
7 |
8 | The [Common Crawl Exemplar](common-crawl/) is a fully worked example of running Map/Reduce via Hadoop on AWS EMR for textual analysis.
9 |
10 | ### Processing the NASA OpenNEX model in EMR
11 |
12 | The [Processing the NASA OpenNEX model in EMR](emr-opennex-climate-model/) activity processes climate model data using AWS EMR.
13 |
14 | ### Multiplying Many Integers via Prime Factorization using EMR
15 |
16 | The [Multiplying Many Integers via Prime Factorization using EMR](emr-prime-multiplier/) activity is a simple example of using Map/Reduce to perform a computation.
17 |
18 |
19 | ## Acquiring Data
20 |
21 | ### Acquiring Data from Twitter
22 |
23 | The [Acquiring Data from Twitter](twitter-acquisition/) activity demonstrates how to acquire data from an API.
24 |
25 | ### Scraping the Web
26 |
27 | The [Scraping the Web](web-scraping/) activity demonstrates gathering information from the web.
28 |
29 | ### Crawling the Common Crawl
30 |
31 | The [Crawling the Common Crawl](crawling-the-crawl/) activity demonstrates using prefetched web content from the [Common Crawl](http://commoncrawl.org).
32 |
33 |
34 | ## Organizing
35 |
36 | ### Data Munging - Processing JSON, XML, and CSV Data
37 |
38 | The [Data Munging](data-munging/) activity demonstrates processing various data formats in Python.
39 |
40 | ### NoSQL Databases
41 |
42 | The [NoSQL Databases](nosql-databases/) activity demonstrates using different NoSQL databases.
43 |
44 | ### Relational Databases
45 |
46 | The [Relational Databases](relational-databases/) activity demonstrates using a relational database from Python.
47 |
48 |
49 | ## Analyzing
50 |
51 | ### Creating Clusters for EMR
52 |
53 | The [Creating Clusters for EMR](emr-cluster/) activity steps through setting up an EMR cluster for Map/Reduce (Hadoop) on AWS.
54 |
55 | ### Word Counts for Tweets
56 |
57 | The [Word Counts for Tweets](emr-tweet-wordcount/) activity steps through the infamous word count example on AWS EMR using tweet data.
58 |
59 | ### Map Task Input Splitting
60 |
61 | The [Map Task Input Splitting](emr-map-only/) activity demonstrates how input is split by Hadoop on AWS EMR.
62 |
63 | ### Introduction to Spark
64 |
65 | The [Introduction to Spark](intro-to-spark/) activity introduces [Spark](http://spark.apache.org) and steps through reproducing various previous activities.
66 |
67 | ### NLP - Text Processing with NLTK
68 |
69 | The [Text Processing with NLTK](text-processing-with-nltk/) activity introduces how text can be processed with NLTK in Python.
70 |
71 | ### NLP - Sentiment Analysis (NLTK)
72 |
73 | The [Sentiment Analysis](sentiment-analysis/) activity introduces Sentiment Analysis and steps through using it via Python.
74 |
75 |
--------------------------------------------------------------------------------
/activities/common-crawl/test-20.warc:
--------------------------------------------------------------------------------
1 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00000-ip-10-180-136-8.ec2.internal.warc.gz
2 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00001-ip-10-180-136-8.ec2.internal.warc.gz
3 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00002-ip-10-180-136-8.ec2.internal.warc.gz
4 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00003-ip-10-180-136-8.ec2.internal.warc.gz
5 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00004-ip-10-180-136-8.ec2.internal.warc.gz
6 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00005-ip-10-180-136-8.ec2.internal.warc.gz
7 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00006-ip-10-180-136-8.ec2.internal.warc.gz
8 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00007-ip-10-180-136-8.ec2.internal.warc.gz
9 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00008-ip-10-180-136-8.ec2.internal.warc.gz
10 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00009-ip-10-180-136-8.ec2.internal.warc.gz
11 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00010-ip-10-180-136-8.ec2.internal.warc.gz
12 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00011-ip-10-180-136-8.ec2.internal.warc.gz
13 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00012-ip-10-180-136-8.ec2.internal.warc.gz
14 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00013-ip-10-180-136-8.ec2.internal.warc.gz
15 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00014-ip-10-180-136-8.ec2.internal.warc.gz
16 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00015-ip-10-180-136-8.ec2.internal.warc.gz
17 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00016-ip-10-180-136-8.ec2.internal.warc.gz
18 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00017-ip-10-180-136-8.ec2.internal.warc.gz
19 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00018-ip-10-180-136-8.ec2.internal.warc.gz
20 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00019-ip-10-180-136-8.ec2.internal.warc.gz
21 |
--------------------------------------------------------------------------------
/activities/emr-prime-multiplier/README.md:
--------------------------------------------------------------------------------
1 | # Multiplying Many Integers via Prime Factorization using EMR #
2 |
3 | This example demonstrates how Map/Reduce keys and values work by multiplying a large
4 | number of integers. In this case, the output keys are prime numbers and the value is
5 | the exponent of the prime factorization (e.g. 12 produces 2 → 2, 3 → 1).
6 |
7 | It also uses the built-in aggregator as the reducer step and so the output is prefixed with `LongValueSum:`.
8 |
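A streaming mapper for this job factors each integer and emits one `LongValueSum:` key per prime with the exponent as the value; here is a minimal sketch in the spirit of `prime-factors.py` (the repository's version may differ):

    #!/usr/bin/env python
    # factor each integer read from stdin and emit prime -> exponent pairs;
    # the 'aggregate' reducer sums the exponents per prime
    import sys

    def factor(n):
        d = 2
        while d * d <= n:
            while n % d == 0:
                yield d
                n //= d
            d += 1
        if n > 1:
            yield n

    for line in sys.stdin:
        for value in line.split():
            exponents = {}
            for p in factor(int(value)):
                exponents[p] = exponents.get(p, 0) + 1
            for p, e in exponents.items():
                print "LongValueSum:%d\t%d" % (p, e)
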
9 | ## Setup ##
10 |
11 | If you don't have a cluster running, you'll need to start one (see main setup page). You also need a bucket for the code, input, and output.
12 |
13 | ## Running the Example ##
14 |
15 | In this example, we'll multiply 1,000,000 integers between 1 and 1,000.
16 |
17 | ### Step 1 ###
18 |
19 | You'll need to set up input to run the job and so we'll create a directory with some input:
20 |
21 | mkdir -p multiply/input
22 | python generate-input.py 1000 1000000 > multiply/input/numbers.txt
23 |
24 | Now we need to store the input:
25 |
26 | aws s3 sync multiply s3://mybucket/multiply/
27 |
28 | ### Step 2 ###
29 |
30 | We need to store the prime factorization program:
31 |
32 | aws s3 cp prime-factors.py s3://mybucket/
33 |
34 | ### Step 3 ###
35 |
36 | Now we add the streaming step to do the work (shorthand)
37 |
38 | aws emr add-steps --cluster-id --steps Type=STREAMING,Name='Multiply',ActionOnFailure=CONTINUE,Args=--files,s3://mybucket/prime-factors.py,-mapper,prime-factors.py,-reducer,aggregate,-input,s3://mybucket/multiply/input,-output,s3://mybucket/multiply/output
39 |
40 | or using JSON:
41 |
42 | aws emr add-steps --cluster-id --steps file://./step.json
43 |
44 | where `step.json` is:
45 |
46 | {
47 | "Type" : "STREAMING",
48 | "Name" : "Multiply",
49 | "ActionOnFailure" : "CONTINUE",
50 | "Args" : [
51 | "-files","s3://mybucket/prime-factors.py",
52 | "-mapper","prime-factors.py",
53 | "-reducer","aggregate",
54 | "-input","s3://mybucket/multiply/input",
55 | "-output","s3://mybucket/multiply/output"
56 | ]
57 | }
58 |
59 | Note: don't forget to use your cluster id and bucket name in the above.
60 |
61 | This command returns the step id that you can use for further monitoring. If you use an 'm1.medium' instance type, this job should take 1 minute to process and 2 minutes of elapsed time.
62 |
63 | You can monitor its progress from the console or via:
64 |
65 | aws emr describe-step --cluster-id --step-id
66 |
67 | ### Step 4 ###
68 |
69 | Sync the output:
70 |
71 | aws s3 sync s3://mybucket/multiply/output/ multiply/output/
72 |
73 | You should now have 4 files:
74 |
75 | multiply/output/_SUCCESS
76 | multiply/output/part-00000
77 | multiply/output/part-00001
78 | multiply/output/part-00002
79 |
80 | The output is a list of primes and exponents for a very large number!
81 |
--------------------------------------------------------------------------------
/activities/emr-opennex-climate-model/acquire.py:
--------------------------------------------------------------------------------
1 | import urllib2, gzip, StringIO
2 | from xml.dom import pulldom
3 | from xml import sax
4 | import json
5 | import math
6 | import sys
7 | import datetime
8 |
9 | import seqs
10 | import date_partitions as partitions
11 |
12 | # Service URI for data set
13 | serviceURI = "http://data.pantabular.org/opennex/data/"
14 |
15 | # Fetches a sequence number data give the facets
16 |
17 | def fetchQuadrangle(dataset,yearMonth,resolution,sequence):
18 |
19 | # Format a URI
20 | strYearMonth = "{}-{:02d}".format(yearMonth.year,yearMonth.month)
21 | url = serviceURI+dataset+"/"+strYearMonth+"/"+str(resolution)+"/"+str(sequence);
22 | print url
23 |
24 | # Open an HTTP Request
25 | response = None
26 | try:
27 | response = urllib2.urlopen(url)
28 | except urllib2.HTTPError as e:
29 | return None
30 |
31 | html = None
32 |
33 | # Unpack the response
34 | if response.headers.get('content-encoding', '') == 'gzip':
35 | data = response.read()
36 | compressedstream = StringIO.StringIO(data)
37 | gzipper = gzip.GzipFile(fileobj=compressedstream)
38 | html = gzipper.read()
39 | else:
40 | html = response.read()
41 |
42 | # Parse the markup
43 | parser = sax.make_parser()
44 | parser.setFeature(sax.handler.feature_namespaces, 1)
45 | doc = pulldom.parseString(html,parser)
46 |
47 | inTable = False
48 |
49 | def textContent(parent):
50 | s = "";
51 | for n in parent.childNodes:
52 | if n.data != None:
53 | s += n.data
54 | return s
55 |
56 | # Process the markup as a stream and detect the table of data
57 | data = []
58 | for event, node in doc:
59 | if event == pulldom.START_ELEMENT and node.tagName == 'table':
60 | if node.getAttribute("typeof") == "IndexedTable":
61 | inTable = True
62 | if event == pulldom.END_ELEMENT and node.tagName == 'table':
63 | inTable = False
64 | if inTable and event == pulldom.START_ELEMENT and node.tagName == 'td':
65 | doc.expandNode(node)
66 | if len(node.childNodes) > 0:
67 | data.append(float(textContent(node)))
68 |
69 | if len(data) == 0:
70 | return None
71 |
72 | # Return the sequence number data object
73 | return {"dataset": dataset, "yearMonth": strYearMonth, "resolution" : resolution, "sequence": sequence, "data": data }
74 |
75 | # The data set name
76 | dataset = sys.argv[1]
77 |
78 | # The resolution in 1/120 degree counts
79 | resolution = int(sys.argv[2])
80 |
81 | # The quadrangle to cover
82 | quad = json.loads(sys.argv[3])
83 |
84 | # The start and end year/month
85 | start = datetime.datetime.strptime(sys.argv[4],"%Y-%m") # start month
86 | end = datetime.datetime.strptime(sys.argv[5],"%Y-%m") # end month
87 |
88 | # The prefix for the output files
89 | prefix = sys.argv[6]
90 |
91 | # Compute the degree size of the quadrangles
92 | size = resolution / 120.0
93 |
94 | # Iterate over the months
95 | for yearMonth in partitions.month_partition(start,end):
96 |
97 | # Iterate over the sequence numbers for the quadrangle
98 | for seq in seqs.sequencesFromQuadrangle(size,quad):
99 |
100 | # Fetch a sequence number's data
101 | obj = fetchQuadrangle(dataset,yearMonth,resolution,seq)
102 | if obj != None:
103 |
104 | # Serialize the data as JSON
105 | fileName = "{}{}-{:02d}-{}-{}.json".format(prefix,yearMonth.year,yearMonth.month,resolution,seq)
106 | f = open(fileName,"w")
107 | json.dump(obj,f)
108 | f.write("\n")
109 | f.close()
110 |
111 |
--------------------------------------------------------------------------------
/activities/sentiment-analysis/README.md:
--------------------------------------------------------------------------------
1 | # Sentiment Analysis #
2 |
3 | ## Setup ##
4 |
5 | Please make sure you have nltk installed:
6 |
7 | pip install nltk
8 | python -m nltk.downloader all
9 |
10 | Things you might review:
11 |
12 | * A [short set of slides](http://courses.ischool.berkeley.edu/ds205/f14/sentiment-analysis.xhtml) (also found [here](sentiment-analysis.xhtml)) that will walk you through the [Candy Corn example](candy-corn.py).
13 | * A nice blog post on [using NLTK for sentiment analysis](http://www.laurentluce.com/posts/twitter-sentiment-analysis-using-python-and-nltk/)
14 | * A short article on [Bag-of-words model on Wikipedia](http://en.wikipedia.org/wiki/Bag-of-words_model)
15 |
16 |
17 | ## Overview ##
18 |
19 | We're going to work our way through training a classifier to detect positive or negative sentiment. This activity
20 | will not make you an expert. Instead, it is designed to give you a sense of the steps and data pipeline
21 | necessary to run such a classifier.
22 |
23 | We have a set of movie review data gathered from the ["Rotten Tomatoes" website by Pang/Lee in 2005](http://www.cs.cornell.edu/People/pabo/movie-review-data/). Each review has
24 | been extracted from the page and turned into a single line of text that is categorized as positive or negative.
25 |
26 | The data is found in the [rt-polaritydata](rt-polaritydata/) directory:
27 |
28 | * [rt-polarity.neg](rt-polarity.neg) — the original negative reviews in Windows 1252 text encoding
29 | * [rt-polarity.neg.utf8](rt-polarity.neg.utf8) — the negative reviews in UTF-8 text encoding
30 | * [rt-polarity.pos](rt-polarity.pos) — the original positive reviews in Windows 1252 text encoding
31 | * [rt-polarity.pos.utf8](rt-polarity.pos.utf8) — the positive reviews in UTF-8 text encoding
32 |
33 | To apply the bag-of-words model, we must:
34 |
35 | 1. Decide on a set of "feature words" for the model. These might be words like "bad", "good", "excellent", "horrible".
36 | 2. Process our data to produce a feature vector for each review text.
37 | 3. Train a classifier (e.g. a [Naive Bayes classifier](http://en.wikipedia.org/wiki/Naive_Bayes_classifier)) on the data.
38 | 4. Apply the classifier to non-annotated data (i.e. new reviews).
39 |
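Put together, these four steps look roughly like the following sketch (the word list and training pairs here are purely illustrative; see [candy-corn.py](candy-corn.py) for a complete, runnable example):

    import nltk

    # 1. a hand-picked set of feature words (illustrative only)
    wordFeatures = ["bad", "good", "excellent", "horrible"]

    # 2. turn a tokenized review into a feature vector
    def extractFeatures(document):
        words = set(document)
        return dict(('contains(%s)' % w, w in words) for w in wordFeatures)

    # labeled training data: (tokens, label) pairs
    labeled = [(["good", "fun", "movie"], "pos"),
               (["horrible", "waste", "of", "time"], "neg")]

    # 3. train a Naive Bayes classifier on the feature vectors
    trainingSet = nltk.classify.apply_features(extractFeatures, labeled)
    classifier = nltk.NaiveBayesClassifier.train(trainingSet)

    # 4. apply the classifier to new, unlabeled text
    print classifier.classify(extractFeatures("an excellent film".split()))
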
40 | There are two simple examples of this process:
41 |
42 | * [candy-corn.py](candy-corn.py) — an example of positive/negative sentiment (2-way classifier)
43 | * [n-way.py](n-way.py) — an example of a multiple category (>2) classifier
44 |
45 | ## Activity ##
46 |
47 | ### (A) Generate a word list and histogram ###
48 |
49 | Use nltk and the supporting code in [featureset.py](featureset.py) and [wordcounts.py](wordcounts.py) to generate a word count and histogram from the dataset.
50 |
51 | Use this to inform the choice of "features" (words) for your bag-of-words model.
52 |
53 | ### (B) Train a classifier ###
54 |
55 | Use or modify the sample code in [train.py](train.py) to train a classifier and store it into a "pickled" object.
56 |
57 | ### (C) Test a classifier ###
58 |
59 | Test the classifier on various input data (see sample code [test.py](test.py)).
60 |
61 | ### (D) Model Questions ###
62 |
63 | 1. How can you improve the accuracy?
64 | 2. Are there less often used words that are more characteristic of positive or negative reviews?
65 | 3. Does including such words (less used) improve the accuracy?
66 | 4. What happens to sentences that exhibit no features?
67 | 5. Does changing the stemmer or lemmatizer improve the accuracy?
68 |
69 | ### (E) Scale-up Questions ###
70 |
71 | 1. How would you apply a classifier to a large amount of data?
72 | 2. Given a raw newly acquired data set, what is the data pipeline necessary to apply such a classifier?
73 | 3. How do you organize the input and output of running such a classifier on AWS S3 (or other key/value storage such as HDFS)?
74 |
--------------------------------------------------------------------------------
/activities/relational-databases/README.md:
--------------------------------------------------------------------------------
1 | # Relational Databases #
2 |
3 | There are a number of relational databases with a great variety of features. In this activity, we'll use the
4 | popular [SQLite database](http://www.sqlite.org/) as a local embedded database. This will avoid the need to configure
5 | remote connections. Given the core interoperability of SQL, most of the activity can easily be ported to
6 | other databases once the connection has been established.
7 |
8 | ## Setup ##
9 |
10 | SQLite3 comes packaged with python. You may also want the sqlite3 command-line tools. If you do not have the
11 | command-line shell for sqlite, you may have only the supporting libraries for the python interface. Additional
12 | tools can be installed via the [SQLite website](http://www.sqlite.org/).
13 |
14 | You can test whether you have a SQLite command-line shell by:
15 |
16 | $ sqlite3
17 | SQLite version 3.8.7.4 2014-12-09 01:34:36
18 | Enter ".help" for usage hints.
19 | Connected to a transient in-memory database.
20 | Use ".open FILENAME" to reopen on a persistent database.
21 | sqlite>
22 |
23 | ## ER Models to SQLite Tables ##
24 |
25 | Once you have an Entity-Relationship Model (ER model), you'll need to translate the model into
26 | a set of table definitions. For SQLite, simple primary/foreign key relationships can be created
27 | by use of integer row identifiers.
28 |
29 | A primary key is simply labeled with `INTEGER PRIMARY KEY` and this enables SQLite to automatically create
30 | integer values for primary keys.
31 |
32 | A foreign key is a specially labeled column that references another table's column:
33 |
34 | FOREIGN KEY(user) REFERENCES users(id)
35 |
36 | You can create a set of tables either by executing SQL statements via python or the command-line shell:
37 |
38 | CREATE TABLE users (
39 | id INTEGER PRIMARY KEY,
40 | alias TEXT UNIQUE NOT NULL,
41 | name TEXT
42 | );
43 | CREATE TABLE tweets (
44 | user INTEGER NOT NULL,
45 | tweet TEXT NOT NULL,
46 | FOREIGN KEY(user) REFERENCES users(id)
47 | );
48 |
49 | Note: See the syntax of [CREATE TABLE](https://www.sqlite.org/lang_createtable.html) for more information on the possibilities and the [datatypes supported by SQLite](http://www.sqlite.org/datatype3.html).
50 |
51 | Try this now by running the sqlite3 command-line tool and pasting in the above definitions:
52 |
53 | $ sqlite3 test.db
54 | SQLite version 3.8.7.4 2014-12-09 01:34:36
55 | Enter ".help" for usage hints.
56 | sqlite> CREATE TABLE users (
57 | ...> id INTEGER PRIMARY KEY,
58 | ...> alias TEXT UNIQUE NOT NULL,
59 | ...> name TEXT
60 | ...> );
61 | sqlite> CREATE TABLE tweets (
62 | ...> user INTEGER NOT NULL,
63 | ...> tweet TEXT NOT NULL,
64 | ...> FOREIGN KEY(user) REFERENCES users(id)
65 | ...> );
66 | sqlite>
67 |
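The same tables can also be created from Python with the standard `sqlite3` module; a minimal sketch:

    import sqlite3

    conn = sqlite3.connect('test.db')

    # create the same two tables programmatically; IF NOT EXISTS keeps this
    # from failing if the tables were already created in the shell
    conn.executescript('''
    CREATE TABLE IF NOT EXISTS users (
      id INTEGER PRIMARY KEY,
      alias TEXT UNIQUE NOT NULL,
      name TEXT
    );
    CREATE TABLE IF NOT EXISTS tweets (
      user INTEGER NOT NULL,
      tweet TEXT NOT NULL,
      FOREIGN KEY(user) REFERENCES users(id)
    );
    ''')
    conn.commit()
    conn.close()
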
68 | ## Inserting Data ##
69 |
70 | You can insert data into tables via simple SQL commands. SQLite will handle row identifiers for primary keys if you've defined them to be integers:
71 |
72 | sqlite> insert into users(alias,name) values ('alexmilowski','Alex Milowski');
73 | sqlite> insert into users(alias,name) values ('ghopper','Grace Hopper');
74 | sqlite> select * from users;
75 | 1|alexmilowski|Alex Milowski
76 | 2|ghopper|Grace Hopper
77 |
78 | If we know the user's primary key, we can insert tweet text:
79 |
80 | sqlite> insert into tweets values (1,"Hello World!");
81 | sqlite> select * from tweets where user=(select id from users where alias='alexmilowski');
82 | 1|Hello World!
83 |
84 | ## SQLite in Python ##
85 |
86 | Connecting to a database is simple. Given the previous example database, we can do:
87 |
88 | >>> import sqlite3
89 | >>> conn = sqlite3.connect('test.db')
90 |
91 | and execute a query:
92 |
93 | >>> c = conn.cursor()
94 | >>> c.execute('SELECT * FROM users')
95 | >>> c.fetchone()
96 | (1, u'alexmilowski', u'Alex Milowski')
97 | >>> c.fetchone()
98 | (2, u'ghopper', u'Grace Hopper')
99 | >>> c.fetchone()
100 |
101 | We can also bind values in queries:
102 |
103 | >>> c.execute('SELECT * FROM users WHERE alias=?', ['alexmilowski'])
104 | >>> c.fetchone()
105 | (1, u'alexmilowski', u'Alex Milowski')
106 |
107 | Or iterate results:
108 |
109 | >>> for row in c.execute('SELECT * FROM users'):
110 | ... print row
111 | ...
112 | (1, u'alexmilowski', u'Alex Milowski')
113 | (2, u'ghopper', u'Grace Hopper')
114 |
115 | Inserting data requires both a query (insert statement) and a commit:
116 |
117 | >>> users=[('mariecurie',"Marie Curie"),
118 | ... ('albert',"Albert Einstein")]
119 | >>> c.executemany("INSERT INTO users(alias,name) VALUES(?,?)",users)
120 |
121 | >>> conn.commit()
122 |
123 | Finally, don't forget to close the connection:
124 |
125 | >>> conn.close()
126 |
127 |
128 |
129 |
130 |
--------------------------------------------------------------------------------
/activities/decision-trees/tree-example.svg:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/activities/emr-opennex-climate-model/README.md:
--------------------------------------------------------------------------------
1 | # Processing the NASA OpenNEX model in EMR #
2 |
3 | NASA produced a climate model which has been made available as web resources via the PAN methodology
4 | at http://data.pantabular.org/opennex/ with a demonstration application at http://data.pantabular.org/opennex/explore/.
5 |
6 | In this activity, we will be using this climate model data as an example data set to process numerical information in MRJob
7 | and produce various averages and summaries.
8 |
9 | ## Acquiring the Data ##
10 |
11 | To begin with, we will run the Map/Reduce processes locally on a data set rather than off S3 and via EMR. All of the
12 | examples can easily be translated by storing data in S3 and using the EMR runner to deploy the MRJob code on a cluster.
13 |
14 | The file [acquire.py](acquire.py) is a program that will download the data in PAN format and output the data as JSON
15 | data files. The program has the arguments:
16 |
17 | * the data set name
18 | * a resolution as a count of 1/120° arc lengths
19 | * a quadrangle in JSON array syntax
20 | * a start year/month (e.g., 2015-03)
21 | * an end year/month
22 | * an output directory
23 |
24 | For example:
25 |
26 | python acquire.py avg-rcp85 60 "[40,-125,35,-120]" 2015-03 2015-03 dataset/
27 |
28 | downloads data for the month of March, 2015 for 0.5° partitions for the data set `avg-rcp85` and stores it into the directory `dataset/`.
29 |
30 | The output is a set of files based on sequence numbers that cover the requested geospatial region. They are named `{year}-{month}-{resolution}-{sequence#}.json` and
31 | stored in the given directory as one JSON object per file, without newlines in the formatting.
32 |
33 | For this activity, acquire the first three months of data for 2015:
34 |
35 | python acquire.py avg-rcp85 60 "[40,-125,35,-120]" 2015-01 2015-03 dataset/
36 |
37 | This should take about 2-3 minutes.
38 |
39 | ## Understanding the Data ##
40 |
41 | The JSON object has the format:
42 |
43 | {
44 | "data" : [ 286.19, 286.19, 286.18, ... ],
45 | "yearMonth" : "2015-01",
46 | "sequence": 74634,
47 | "resolution": 60,
48 | "dataset": "avg-rcp85"
49 | }
50 |
51 | The array value for "data" is a set of temperature values in Kelvins from the model associated with the geospatial region for the sequence number.
52 |
53 | ## Supporting Code ##
54 |
55 | There are two supporting libraries:
56 |
57 | * seqs.py — a library for generating sequence numbers from latitude/longitude
58 | * date_partitions.py — a library for generating sequences of dates for partitioning time
59 |
60 | A set of sequence numbers for a given geospatial region can be enumerated by giving the quadrangle and the size (in degrees):
61 |
62 | seqs.sequencesFromQuadrangle(0.5,[40,-125,35,-120])
63 |
64 | where `0.5` is for half-degree quadrangles covering the region defined by the two points (40°,-125°) and (35°,-120°).
65 |
66 | The two supporting libraries can be put together:
67 |
68 | import datetime
69 | import seqs
70 | import date_partitions as partitions
71 |
72 | for month in partitions.month_partition(datetime.datetime(2015,3,1),datetime.datetime(2015,5,1)):
73 | for seq in seqs.sequencesFromQuadrangle(0.5,[40,-125,35,-120]):
74 | print "{}-{:02d},{}".format(month.year,month.month,seq)
75 |
76 | ## Input Example ##
77 |
78 | Because we'll be running the example locally, we can just create input from each of the data files where each line contains a single
79 | JSON object. The example code [input-example.py](input-example.py) produces an average via map/reduce (mrjob) over the data loaded.
80 |
81 | To run the example on the first three months we acquired:
82 |
83 | cat dataset/2015-0[1-3]*.json | python input-example.py
84 |
85 | The mapper loads the data from the line given and computes an average:
86 |
87 | def mapper(self, _, line):
88 | obj = json.loads(line)
89 | yield "average",sum(obj["data"])/len(obj["data"])
90 |
91 | ## Average Example ##
92 |
93 | A more complicated example in [average.py](average.py) computes an average by month and keeps track of the counts. It uses a combiner
94 | to collect the sequence numbers associated with the month and then does the reduce step to compute the overall average. It uses the
95 | counts to make sure the average is calculated correctly.
96 |
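The essential idea is to carry (count, sum) pairs through the combiner and reducer so the final division is weighted correctly; here is a sketch of that pattern with mrjob (not necessarily the exact code in `average.py`):

    import json
    from mrjob.job import MRJob

    class MonthAverage(MRJob):

        # one (count, sum) pair per JSON data object
        def mapper(self, _, line):
            obj = json.loads(line)
            yield obj["yearMonth"], (len(obj["data"]), sum(obj["data"]))

        # add counts and sums so each quadrangle contributes by its size
        def combiner(self, yearMonth, pairs):
            counts, totals = zip(*pairs)
            yield yearMonth, (sum(counts), sum(totals))

        def reducer(self, yearMonth, pairs):
            counts, totals = zip(*pairs)
            yield yearMonth, sum(totals) / sum(counts)

    if __name__ == '__main__':
        MonthAverage.run()
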
97 | To run the example on the first three months we acquired:
98 |
99 | cat dataset/2015-0[1-3]*.json | python average.py
100 |
101 | Note that the average is not quite the same.
102 |
103 | ## Activity ##
104 |
105 | We'd like to take these simple examples and compute over a more generic input. We can transition our code to run over a local dataset (or one in S3)
106 | by using a setup like [by-sequences.py](by-sequences.py) where the data is retrieved from a data set and the input is a specification of what to process.
107 |
108 | This program assumes input in a CSV format with the columns:
109 |
110 | * lat1 — the NW latitude of the quadrangle
111 | * lon1 — the NW longitude of the quadrangle
112 | * lat2 — the SE latitude of the quadrangle
113 | * lon2 — the SE longitude of the quadrangle
114 | * size — the count of 1/120° arc lengths of the resolution (usually 60)
115 | * startYear - the year to start
116 | * startMonth - the month to start
117 | * endYear — the year to end
118 | * endMonth — the month to end
119 |
120 | An input might look like:
121 |
122 | #lat1,lon1,lat2,lon2,size,startYear,startMonth,endYear,endMonth
123 | 40,-125,35,-120,60,2015,02,2015,03
124 |
125 | and you can run the sample code like:
126 |
127 | python by-sequences.py --data-dir `pwd`/dataset/ < input-sequences.txt
128 |
129 | The sample code is a two-step map/reduce job. Your task is to modify it so that it correctly computes an average for the given input line. Take a look
130 | at `average.py` and see how you might modify the various methods and add/replace them in `by-sequences.py`.
131 |
132 |
--------------------------------------------------------------------------------
/activities/crawling-the-crawl/README.md:
--------------------------------------------------------------------------------
1 | # Crawling the Common Crawl #
2 |
3 | The common crawl is a data set hosted by AWS that represents a crawl of the Web. The data set contains the raw web pages as well as
4 | metadata and text extracts that are smaller in size.
5 |
6 | The dataset is stored in [WARC format](http://en.wikipedia.org/wiki/Web_ARChive) (ISO 28500:2009) and consists of a textual stream of
7 | records. Each record contains a header of name/value pairs followed by an entity body (and encoded payload).
8 |
9 | The [Common Crawl stores its data](http://commoncrawl.org/the-data/get-started/) in this format on S3 as hosted by AWS in the bucket
10 | and prefix of `s3://aws-publicdatasets/common-crawl/`. Crawl data from 2013 onward has the key structure of `crawl-data/CC-MAIN-YYYY-DD/`
11 | and so, for example, the latest is stored at `s3://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2015-06/`.
12 |
13 | ## Activity — Exploring the Data Set ##
14 |
15 | ### How is it stored and partitioned? ###
16 |
17 | Use the AWS CLI to explore the data set (`s3://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2015-06/`) by using the `aws s3 ...` command and answer the following:
18 |
19 | 1. What is stored at the root?
20 | 2. What summary metadata can you retrieve?
21 | 3. What are the various data formats you can process?
22 | 4. How is the data set partitioned?
23 |
24 | ### WARC, WET, WAT ###
25 |
26 | 1. There are three data resources stored by the common crawl: raw pages, metadata, and textual extraction. Are they all stored in the same format?
27 |
28 | 2. What do you need to process them?
29 |
30 | 3. Retrieve a sample being careful not to download the whole dataset (it is large).
31 |
32 | 4. Examine a sample WAT file.
33 |
34 |
35 | ## Activity — Extracting Domain Coverage ##
36 |
37 | First you need to create (or reuse) an S3 bucket for this activity. Throughout this activity, we will use the name `mybucket` for the bucket
38 | name and you should replace that with your bucket name.
39 |
40 | Also, you'll need your AWS key name so that you have SSH access to the cluster.
41 |
42 | ### 1. Start a Cluster ###
43 |
44 | First, copy the bootstrapping script [cc-bootstrap.sh](cc-bootstrap.sh) to the root of your bucket (e.g. to s3://mybucket/cc-bootstrap.sh):
45 |
46 | aws s3 cp cc-bootstrap.sh s3://mybucket/cc-bootstrap.sh
47 |
48 | This script installs python 2.7 and various packages used by the WARC python modules.
49 |
50 | There is a script in the code called [start.sh](start.sh) that uses the AWS CLI to start a basic cluster for this activity. It takes a key name (for ssh) and bucket name as arguments:
51 |
52 | ./start.sh mykey mybucket
53 |
54 | It will start the cluster defined in [cluster.json](cluster.json).
55 |
56 | *You'll need this cluster at the end. Don't start the cluster until you need it and save yourself a bit of money.*
57 |
58 | ### 2. Get the manifest ###
59 |
60 | At the root of the crawl there should be several compressed manifest files that have paths to the data. Retrieve these files from S3 and examine the WAT manifest (`wat.paths.gz`).
61 |
62 | The manifest contains a set of paths into the S3 bucket. You can convert these to S3 URIs by:
63 |
64 | gzip -dc wat.paths.gz | python s3.py
65 |
66 | ### 3. Retrieve sample data ###
67 |
68 | We will be working with the WAT metadata from here forward. You may want to retrieve some sample data to work locally and then test your code on a cluster afterwards.
69 |
70 | You can get the very first partition by:
71 |
72 | gzip -dc wat.paths.gz | python s3.py | head -n 1
73 |
74 | You can use the AWS CLI to download this locally from S3. Be warned that the data file is about 400MB in size.
75 |
76 | Alternatively, you can use the `extract-CC-MAIN-20150124161055-00000-ip-10-180-212-252.ec2.internal.warc.wat.gz` file that is an extract of the first 907 records of the first partition.
77 |
78 | ### 4. View the data ###
79 |
80 | Just take a peek:
81 |
82 | gzip -dc extract-CC-MAIN-20150124161055-00000-ip-10-180-212-252.ec2.internal.warc.wat.gz | more
83 |
84 | What's in there? Looks like JSON data ...
85 |
86 | ### 5. Run the example MRJob Locally ###
87 |
88 | There is sample code in [mrcc.py](mrcc.py) and [ccex.py](ccex.py).
89 |
90 | Run the example on the extract:
91 |
92 | echo `pwd`/extract-CC-MAIN-20150124161055-00000-ip-10-180-212-252.ec2.internal.warc.wat.gz | python ccex.py
93 |
94 | What does that command do?
95 |
96 | What did the program do? Do you know how this works?
97 |
98 | Notice something funny about the output? Explain your observation based on the input data.
99 |
100 |
101 | ### 6. Modify the example ###
102 |
103 | One basic issue with using the common crawl is determining whether your target sites are in there. Thus, one simple task is to count the domains crawled within
104 | a particular data set.
105 |
106 | Can you modify [ccex.py](ccex.py) to count domains?
107 |
108 | The WAT data in WARC format contains metadata extracted from the crawl for each page. Process the data to extract and count the domain names. Be careful to remove sub-domains
109 | so that variants like `www1.hp.com` and `www2.hp.com` reduce to `hp.com`.
110 |
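A naive way to normalize a host name is to keep only its last two labels; here is a sketch of such a helper (it ignores multi-part TLDs like `co.uk`, which would need a public-suffix list):

    from urlparse import urlparse

    def base_domain(url):
        # www1.hp.com and www2.hp.com both reduce to hp.com
        host = urlparse(url).netloc.lower()
        return ".".join(host.split(".")[-2:])
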
111 | ### 7. Run it on a cluster ###
112 |
113 | Once you have your script ready, you can run it directly on the dataset hosted in AWS. All you need to do is provide a list of the S3 URIs you want to process as the input.
114 |
115 | One simple way to do that is from the path metadata. For example, the first 10 listed are:
116 |
117 | gzip -dc wat.paths.gz | python s3.py | head -n 10
118 |
119 | There is a script called [run-step.sh](run-step.sh) that will launch your job on your cluster and it takes the script, the bucket, and the cluster identifier as parameters:
120 |
121 | gzip -dc wat.paths.gz | python s3.py | head -n 10 | ./run-step.sh myscript.py mybucket j-xxxxxxxxxxxxx
122 |
123 | where `j-xxxxxxxxxxxxx` is your cluster identifier.
124 |
125 | ### 8. Discussion ###
126 |
127 | How long will it take to compute the domains for a partition? For the whole crawl date? For the whole data set?
128 |
129 | Does it scale?
130 |
131 | What do you need to do to make it scale?
132 |
133 |
--------------------------------------------------------------------------------
/activities/twitter-acquisition/README.md:
--------------------------------------------------------------------------------
1 | # Acquiring Data from Twitter #
2 |
3 | This activity will step you through the process of acquiring data from Twitter and applying different acquisition strategies.
4 |
5 | ## Setup ##
6 |
7 | ### Install Tweepy ###
8 |
9 | The code provided and activities will use the [tweepy](https://github.com/tweepy/tweepy) module. You should install this package:
10 |
11 | pip install tweepy
12 |
13 | ### Create an Application ###
14 |
15 | Twitter data can be accessed over the Web by creating an application on their site and then using the access keys
16 | they provide for the application in your program.
17 |
18 | Note: You will need to have a Twitter account to create an application.
19 |
20 | To create an application, follow this procedure:
21 |
22 | 1. Login to Twitter (https://www.twitter.com/).
23 | 2. Visit https://apps.twitter.com and click on "Create New App".
24 | 3. Fill in the application name, description, and Website. The name will be listed in your application list when you return to this Website.
25 | 4. Agree to the terms and agreements and click on "Create your Twitter Application"
26 |
27 | Once you have successfully created an application, it should take you to the newly created application. Here you must create access keys for
28 | subsequent operations by your application. To do so, use the following procedure:
29 |
30 | 1. Click on the "Keys and Access Tokens" tab.
31 | 2. Click on "Create my Access Token" near the bottom of the page.
32 |
33 | The response should be relatively immediate.
34 |
35 | Now you have four things:
36 |
37 | 1. A consumer key that identifies your application.
38 | 2. A consumer secret that acts as a "password" for your application.
39 | 3. An access token that identifies your authorized access.
40 | 4. An access token secret that acts as a "password" for that authorized access.
41 |
42 | At any point, you can revoke the access key or regenerate any of these values.
43 |
44 | To completely disable the application, you must delete the application. Doing so removes the consumer key, secret, and access tokens from
45 | Twitter's system and any program using them will immediately stop working.
46 |
47 | ### Test your Application ###
48 |
49 | Use the `hello-twitter.py` program to test your application. Change the code and insert your consumer key, consumer secret, access token, and
50 | access token secret. You should then be able to just run the program and get a few tweets:
51 |
52 | python hello-twitter.py
53 |
54 | ## Data Collection Activities ##
55 |
56 | While real-time data collection is interesting, if you are researching data provided by tweets, search is the simplest way to
57 | collect information - even from the recent past. Instead of collecting information and sorting it ourselves, we'll use
58 | the Twitter search API to partition the collected data by date/time and other facets.
59 |
60 | Also, the [Twitter API is rate limited](https://dev.twitter.com/rest/public/rate-limiting) and so you can't make more than
61 | 180 requests per 15 minutes. Fortunately, the tweepy library that we'll be using handles pausing automatically. With the
62 | partitioning and the automatic handling of rate limiting against the [Twitter REST API](https://dev.twitter.com/rest/public),
63 | we'll be able to just write our code normally and the calls will pause until requests can be made again.
64 |
65 | ### The Tweepy Library ###
66 |
67 | The Tweepy library handles talking directly to the various REST Web services provided by Twitter. Many of the calls
68 | have practical limits to the amount of data that is returned. If you are trying to gather large amounts of data from
69 | Twitter, you'll need to navigate the paged results.
70 |
71 | Tweepy provides a "cursor" functionality that handles the navigation of paged results for you. You simply
72 | wrap your call in a Cursor object:
73 |
74 | for tweet in tweepy.Cursor(api.search,q=q).items(200):
75 | print tweet.text
76 |
77 | In the above example, the first 200 tweets are returned from the generator regardless of how many are returned from
78 | each call to a Twitter REST API.
79 |
80 | An example of this is shown in `search.py` where the first 200 tweets are collected for a search term. You'll need to modify
81 | the code to add your consumer key/secret and access token/secret.
82 |
83 | ### Activity: Chunking ###
84 |
85 | Suppose you are going to collect information about a particular topic (e.g. a hash tag) from Twitter and you'll be using code
86 | similar to `search.py` to do so. If you remove the `200` parameter to `items()`, you'll be accessing as many search results
87 | as Twitter will give you over time, subject to rate limiting.
88 |
89 | Change the `search.py` code to output data to files, limiting the number of tweets per file.
90 |
91 | Here are some things to consider:
92 |
93 | * What information will you store?
94 | * Tweets are actually complex JSON objects accessible as the '_json' member on the object returned by the tweepy API. Maybe
95 | you should store the JSON?
96 | * What is a syntactically correct json file (see http://www.json.org)?
97 | * Maybe you'll want a nice handler class for the data?
98 | * How do you cancel this possibly long running process and still have the last chunk be syntactically valid?
99 |
100 | Here is some helper code for serialization that relies on the `json` python module:
101 |
102 | class TweetSerializer:
103 | out = None
104 | first = True
105 | count = 0
106 | def start(self):
107 | self.count += 1
108 | fname = "tweets-"+str(self.count)+".json"
109 | self.out = open(fname,"w")
110 | self.out.write("[\n")
111 | self.first = True
112 |
113 | def end(self):
114 | if self.out is not None:
115 | self.out.write("\n]\n")
116 | self.out.close()
117 | self.out = None
118 |
119 | def write(self,tweet):
120 | if not self.first:
121 | self.out.write(",\n")
122 | self.first = False
123 | self.out.write(json.dumps(tweet._json).encode('utf8'))
124 |
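One way to put the serializer to work is sketched below. It assumes an `api` object constructed as in `search.py` and an illustrative chunk size; the query is just an example:

    import json
    import tweepy

    TWEETS_PER_FILE = 1000   # illustrative chunk size

    s = TweetSerializer()
    s.start()
    count = 0
    for tweet in tweepy.Cursor(api.search, q="#minecraft").items():
        if count > 0 and count % TWEETS_PER_FILE == 0:
            s.end()      # close the current chunk file ...
            s.start()    # ... and open the next one
        s.write(tweet)
        count += 1
    s.end()
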
125 | ### Activity: Interrupts and Resilience ###
126 |
127 | If you need to shut down your data collection, you can define an interrupt handler using the `signal` module:
128 |
129 | def interrupt(signum, frame):
130 | print "Interrupted, closing ..."
131 | # magic goes here
132 | exit(1)
133 |
134 | signal.signal(signal.SIGINT, interrupt)
135 |
136 | Things to consider:
137 |
138 | * What would you add to your chunking tweet acquisition code to handle interrupts?
139 | * What kind of exceptions might be thrown?
140 | * What kinds of errors might Tweepy or Twitter give you?
141 | * How do you make your process resilient?
142 |
143 |
144 | ### Activity: Partitioning Data on Facets ###
145 |
146 | While it may be convenient for the programmer to write out a fixed number of tweets per file, it might be more
147 | useful to partition the tweets on facets of the collected data. For example, if you are collecting tweets over
148 | a specific period of time, treating the data as a time-series data set might make sense. As such, the partitioning or API use
149 | would use time to limit the results stored in each file.
150 |
151 | Twitter has two useful [search query operators](https://dev.twitter.com/rest/public/search):
152 |
153 | * until:{date} - limits the results to those up to a specific date
154 | * since:{date} - limits the results to those after a specific date
155 |
156 | These two operators can be used together to define a particular day. For example:
157 |
158 | minecraft since:2015-01-10 until:2015-01-11
159 |
160 | which you can view on the [twitter website](https://twitter.com/search?q=minecraft%20since%3A2015-01-10%20until%3A2015-01-11).
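
A sketch of how these operators can drive a day-by-day partition (the dates simply extend the example above; the actual search and serialization are left out):

    from datetime import date, timedelta

    start = date(2015, 1, 10)
    for n in range(7):                      # one week of one-day partitions
        since = start + timedelta(days=n)
        until = since + timedelta(days=1)
        q = "minecraft since:%s until:%s" % (since.isoformat(), until.isoformat())
        # run the search for this query and write the results to a file
        # named for the day, e.g. tweets-2015-01-10.json
        print q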
161 |
162 | Questions to consider:
163 |
164 | * How would you change your search program to use facets of the tweets for partitioning to retrieve data for a specific time period (e.g. a week)?
165 | * What duration of time would you use to store a "reasonable" number of tweets per chunked file?
166 | * What other criteria would you use to chunk data beyond a day?
167 | * How are the files named consistently to match the facet ranges?
168 | * Are the facet ranges in the JSON in each output file?
169 |
170 |
--------------------------------------------------------------------------------
/activities/common-crawl/README.md:
--------------------------------------------------------------------------------
1 | # Common Crawl Exemplar #
2 |
3 | This activity will step you through the process of running various Map/Reduce (MR) processes
4 | on the [Common Crawl](http://commoncrawl.org/) data set hosted by [AWS](http://aws.amazon.com).
5 |
6 | In this activity you will:
7 |
8 | 1. Install various supporting tools for running MR processes via [mrjob](https://github.com/Yelp/mrjob) and [AWS EMR](http://aws.amazon.com/elasticmapreduce/).
9 | 2. Process data locally using mrjob.
10 | 3. Run the same process on AWS EMR.
11 | 4. Ensure you have the correct development environment to do all of the above.
12 |
13 | This activity is divided into two parts. In the first part, you'll run the example code locally. Afterwards, you can set up an AWS account and role so that you can run the same
14 | process on AWS in the cloud.
15 |
16 | We will be running the "[Tag Counter](https://github.com/commoncrawl/cc-mrjob#running-the-code)" over portions of the Common Crawl data set.
17 |
18 | # General Setup #
19 |
20 | ## Shell Access ##
21 |
22 | Most of the following code uses shell commands. You should become familiar with running commands from the shell and make sure you have an environment that
23 | matches your deployment environment (likely Linux). You can run a Linux OS locally via technology like [Virtual Box](https://www.virtualbox.org).
24 |
25 | ## Get the Code via Git ##
26 |
27 | You need to install git from [github](http://github.com) and you're already on their site. If you haven't already done so, sign up for an account and clone the
28 | code for the [Common Crawl - mrjob starter kit](https://github.com/commoncrawl/cc-mrjob):
29 |
30 | git clone https://github.com/commoncrawl/cc-mrjob.git
31 |
32 | This will download the code into whatever directory you are in when you issue that command. You should then have a directory called 'cc-mrjob'. The setup from now on
33 | will assume you are in the same directory.
34 |
35 | If you do not have this repository, clone this into a parallel directory:
36 |
37 | git clone https://github.com/alexmilowski/data-science.git
38 |
39 | You should now have two parallel directories:
40 |
41 | .../cc-mrjob/
42 | .../data-science/
43 |
44 | Copy these files from `data-science/activities/common-crawl` to the `cc-mrjob` directory :
45 |
46 | mrcc.py
47 | mrcc.py.tar.gz
48 | mrjob.conf
49 |
50 | Note: The modified code just fixes issues with pulling the Common Crawl data from S3, and `mrjob.conf` is an EMR configuration a bit more specific to this activity.
51 |
52 |
53 | ## Setup Python ##
54 |
55 | You should install [Python 2.7.8](https://www.python.org/download/releases/2.7.8/) locally so you can run this example. If you have previous versions
56 | of Python, you may run into compatibility issues (e.g. don't use 2.6.x). In addition, Python 3 has many changes that may also be problematic.
57 |
58 | You may find a Python IDE useful but you should ensure you can run Python from the command line properly. Also, installing multiple versions of Python is not recommended.
59 |
60 | Once you've gotten your Python install sorted, load the packages for the activity via pip:
61 |
62 | pip install -r requirements.txt
63 |
64 | Note: Depending on how you have installed various bits, you may need a "sudo" in front of that.
65 |
66 | # Run it Locally #
67 |
68 | ## Requirements ##
69 |
70 | You'll need good bandwidth to download the various data.
71 |
72 | ## Get the Data ##
73 |
74 | There is a script that uses `wget` to download various content from the hosted dataset on S3:
75 |
76 | ./get-data.sh
77 |
78 | If you are on a Mac or Windows, you'll likely need to install wget. If you use MacPorts, you can install wget via:
79 |
80 | sudo port install wget
81 |
82 | Otherwise, the datasets for this activity are located at:
83 |
84 | https://aws-publicdatasets.s3.amazonaws.com/common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00000-ip-10-180-136-8.ec2.internal.warc.gz
85 | https://aws-publicdatasets.s3.amazonaws.com/common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/wat/CC-MAIN-20140820021320-00000-ip-10-180-136-8.ec2.internal.warc.wat.gz
86 | https://aws-publicdatasets.s3.amazonaws.com/common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/wet/CC-MAIN-20140820021320-00000-ip-10-180-136-8.ec2.internal.warc.wet.gz
87 |
88 | The various subsequent scripts expect a subdirectory structure of:
89 |
90 | common-crawl/
91 | crawl-data/
92 | CC-MAIN-2014-35/
93 | segments/
94 | 1408500800168.29/
95 | warc/
96 | CC-MAIN-20140820021320-00000-ip-10-180-136-8.ec2.internal.warc.gz
97 | wat/
98 | CC-MAIN-20140820021320-00000-ip-10-180-136-8.ec2.internal.warc.wat.gz
99 | wet/
100 | CC-MAIN-20140820021320-00000-ip-10-180-136-8.ec2.internal.warc.wet.gz
101 |
102 | ## Run the Code ##
103 |
104 | To run the code, do the following:
105 |
106 | python absolutize_path.py < input/test-1.warc | python tag_counter.py -r local --conf-path mrjob.conf --no-output --output-dir out
107 |
108 | The first script just turns a relative path into an absolute path. The second script uses that path as input via stdin and then runs the Map/Reduce process locally via mrjob.
109 |
110 | The output is in the file `out/part-00000`.
111 |
112 |
113 | # Run it on AWS EMR #
114 |
115 | If you have not signed up for AWS, you'll need to do that first by visiting http://aws.amazon.com/
116 |
117 | ## AWS Setup ##
118 |
119 | If you do not have a user/group with access to EMR, you'll need to do the following procedure.
120 |
121 | First, you need to setup a user to run EMR:
122 |
123 | 1. Visit http://aws.amazon.com/ and sign up for an account.
124 | 2. Select the "Identity and Access Management" (or IAM) from your console or visit https://console.aws.amazon.com/iam/home
125 | 3. Select "Users" from the list on the left.
126 | 4. Click on "Create New Users".
127 | 5. Enter a user name for yourself and create the user.
128 | 6. The next screen will give you an option to download the credentials for this user. Do so and store them in a safe place. You will not be able to retrieve them again.
129 |
130 | Second, you need to create a group with the right roles:
131 |
132 | 1. Select "Groups" from the list on the left.
133 | 2. Click on "Create New Group".
134 | 3. Enter a name and click on "Next Step".
135 | 4. Scroll down to "Amazon Elastic MapReduce Full Access" and click on "Select".
136 | 5. Once the policy document is displayed, click on "Next Step".
137 | 6. Click on "Create Group" to create the group.
138 |
139 | Third, you need to assign your user to the group:
140 |
141 | 1. Select the check box next to your group.
142 | 2. Click on the "Group Actions" drop-down menu and click on "Add Users to Group".
143 | 3. Select your user by clicking on the check box.
144 | 4. Click on "Add Users".
145 |
146 | ## Configure mrjob ##
147 |
148 | You need to configure mrjob to access your AWS account:
149 |
150 | 1. Edit the mrjob.conf
151 | 2. Locate the `#aws_access_key_id:` and `#aws_secret_access_key:` lines.
152 | 3. Remove the hash (#) and add your AWS key and secret after the colon (:); the result should look like the snippet below. You should have these from previously creating the user.
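
After the edit, the two lines should look something like this (the values are placeholders; keep whatever indentation the provided `mrjob.conf` uses):

    aws_access_key_id: AKIAXXXXXXXXXXXXXXXX
    aws_secret_access_key: xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx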
153 |
154 | ## Setup an Output Bucket on S3 ##
155 |
156 | You need to create an output bucket on S3 for the results of your computation:
157 |
158 | 1. Go to https://aws.amazon.com/ in your browser.
159 | 2. Click on the 'S3' service link.
160 | 3. Click on the 'Create Bucket' button.
161 | 4. Enter a name and hit create.
162 |
163 | Keep in mind that bucket names are unique across all of Amazon S3. If you use some common name, it is likely to clash with other
164 | users. One suggestion is to use a common prefix (e.g. a domain name) for all your bucket names.
165 |
166 | ## Run the Code on EMR ##
167 |
168 | In the previous step, you created an output bucket. In the example below, replace `{your-bucket-name}` with the name of the bucket you created.
169 |
170 | To run the tag count on EMR for one input, do the following:
171 |
172 | time python tag_counter.py -r emr --conf-path mrjob.conf --python-archive mrcc.py.tar.gz --no-output --output-dir s3://{your-bucket-name}/cc-test-1 --source s3 input/test-1.warc
173 |
--------------------------------------------------------------------------------
/activities/intro-to-spark/README.md:
--------------------------------------------------------------------------------
1 | # Introduction to Spark #
2 |
3 | ## Setup ##
4 |
5 | ### Installing Spark ###
6 |
7 | 1. Visit the Spark [Downloads](https://spark.apache.org/downloads.html) page.
8 | 2. Select "1.3.0" from the first list box.
9 | 3. Select "Pre-built for Hadoop 2.4 and later".
10 | 4. Leave "Select Apache Mirror" alone.
11 | 5. Click on the link in #4
12 | 6. When the result page loads, click on the suggested mirror to download Spark.
13 |
14 | Once you have downloaded Spark, just unpack the directory somewhere convenient. We'll be using the executable directly from the distribution.
15 |
16 | We'll use the environment variable `$SPARK_HOME` throughout this example. You should define it to be where you unpacked the Spark distribution:
17 |
18 | export SPARK_HOME=~/workspace/spark-1.3.0-bin-hadoop2.4/
19 |
20 | You should install psutil as well:
21 |
22 | pip install psutil
23 |
24 | ### Preparing Sample Data ###
25 |
26 | We'll be using the same conference data from the [Organizing Acquired Data](../../assignments/organizing-tweets/) assignment. We will prepare the data by writing each tweet onto a single line:
27 |
28 | python one-line-json.py < ../../assignments/organizing-tweets/prague-2015-02-14.json > 2015-02-14.txt
29 | python one-line-json.py < ../../assignments/organizing-tweets/prague-2015-02-15.json > 2015-02-15.txt
30 |
31 | We'll also use some randomly generated files:
32 |
33 | curl "http://svnweb.freebsd.org/csrg/share/dict/words?view=co&content-type=text/plain" > words
34 | mkdir random
35 | python random-text.py 1000000 10 < words > random/random-0.txt
36 | python random-text.py 1000000 10 < words > random/random-1.txt
37 | python random-text.py 1000000 10 < words > random/random-2.txt
38 | python random-text.py 1000000 10 < words > random/random-3.txt
39 | python random-text.py 1000000 10 < words > random/random-4.txt
40 | python random-text.py 1000000 10 < words > random/random-5.txt
41 | python random-text.py 1000000 10 < words > random/random-6.txt
42 | python random-text.py 1000000 10 < words > random/random-7.txt
43 | python random-text.py 1000000 10 < words > random/random-8.txt
44 | python random-text.py 1000000 10 < words > random/random-9.txt
45 |
46 |
47 | ## Activity - Run some example ##
48 |
49 | ### Hello World - Word Count ###
50 |
51 | The classic "hello world" of map/reduce is a simple word count. An example implementation is in [wordcount.py](wordcount.py) and can be run as follows:
52 |
53 | $SPARK_HOME/bin/spark-submit wordcount.py "random/random-*.txt"
54 |
55 | This will run the word count over the randomly generated data (from the setup) of 100 million words.
56 |
57 | The RDD's input path contains a wild card and is effectively the same as:
58 |
59 | lines = sc.textFile("random/random-*.txt", 1)
60 |
61 | and the wild card allows Spark to access all the generated data files.
62 |
63 | The code is straightforward and starts by splitting the lines of text into words:
64 |
65 | lines.flatMap(lambda x: x.split())
66 |
67 | then mapping each word to a pair of the word and a count of one:
68 |
69 | .map(lambda word: (word, 1))
70 |
71 | and finally reducing the pairs by key using summation:
72 |
73 | .reduceByKey(lambda a,b : a + b)
74 |
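Putting the three steps together, a minimal self-contained sketch looks like the following (the repository's `wordcount.py` may differ in details such as argument and output handling):

    import sys
    from pyspark import SparkContext

    if __name__ == "__main__":
        sc = SparkContext(appName="wordcount")
        lines = sc.textFile(sys.argv[1], 1)
        counts = lines.flatMap(lambda x: x.split()) \
                      .map(lambda word: (word, 1)) \
                      .reduceByKey(lambda a, b: a + b)
        # bring the (word, count) pairs back to the driver and print them
        for word, count in counts.collect():
            print word, count
        sc.stop()
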
75 | ### Word Count over Tweets ###
76 |
77 | We can change the first actions on the RDD in the word count example and have it operate on tweet text. The tweet data has been prepared with one
78 | JSON tweet object per line in `2015-02-14.txt` and `2015-02-15.txt` (see Setup).
79 |
80 | The first lines look something like:
81 |
82 | lines.map(lambda line: json.loads(line)) \
83 | .flatMap(lambda tweet: tweet["text"].split())
84 |
85 | which loads the JSON object and splits the "text" property instead of the whole line.
86 |
87 | The code is in [tweet-wordcount.py](tweet-wordcount.py) and can be run by:
88 |
89 | $SPARK_HOME/bin/spark-submit tweet-wordcount.py "2015-02-*.txt"
90 |
91 | ### Understanding Scaling ###
92 |
93 | By default, you are running Spark locally. You can specify the "master" by the `--master` option which takes a URI.
94 |
95 | A special value of "local[n]" allows you to control the number of workers in your local cluster and can give you an
96 | idea of "speed-up via parallelization" (within the limits of your hardware).
97 |
98 | Try the following experiment:
99 |
100 | time $SPARK_HOME/bin/spark-submit --master local[1] wordcount.py "random-large-*.txt"
101 |
102 | and note the time. Now remove the `--master` option and do the same. The `local[1]` run should take longer, as without the option Spark will attempt
103 | to guess the correct number of local resources for your hardware.
104 |
105 | Now, try increasing `local[1]` to `local[2]` through `local[6]` and note the times. Is there a limit to the
106 | increase in speed as you add more workers?
107 |
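One way to run the whole sweep in one go (this uses the random data generated during setup; substitute whatever input pattern you are experimenting with):

    for n in 1 2 3 4 5 6; do
        time $SPARK_HOME/bin/spark-submit --master "local[$n]" wordcount.py "random/random-*.txt"
    done
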
108 | You can try the same experiments later by creating actual clusters of various sizes. The only change would be
109 | the value for the `--master` option.
110 |
111 |
112 | ## Activity - Problem Solving ##
113 |
114 | The tweet data we prepared is from a conference. How can we use Spark to answer the following questions?
115 |
116 | 1. Who tweeted the most during the conference?
117 | 2. What were the top 10 hash tags used?
118 | 3. For a particular hour, how many tweets were produced?
119 |
120 | ## Activity - Deploying to Clusters ##
121 |
122 | ### Spark on EC2 ###
123 |
124 | #### Overview ####
125 |
126 | You can start a standalone Spark cluster on EC2 using the program `spark-ec2` located in the `ec2` directory of the spark distribution. You'll need:
127 |
128 | * your key name
129 | * your local key (e.g. .pem file)
130 | * a preferred zone
131 | * your AWS key and secret
132 |
133 | You'll need to setup two environment variables to contain your AWS credentials:
134 |
135 | export AWS_SECRET_ACCESS_KEY=xxxxxxxxx
136 | export AWS_ACCESS_KEY_ID=xxxxxxxx
137 |
138 | You will need to make sure your access key is allowed to start EC2 instances. You may need to modify the policy for the access key in "Identity and Access Management" and at minimum you'll
139 | want:
140 |
141 | {
142 | "Version": "2012-10-17",
143 | "Statement": [
144 | {
145 | "Sid": "Stmtnnnnnn",
146 | "Effect": "Allow",
147 | "Action": [
148 | "ec2:*"
149 | ],
150 | "Resource": [
151 | "*"
152 | ]
153 | }
154 | ]
155 | }
156 |
157 | You can create this policy by clicking on "Create Another Policy" when viewing the group. Use the policy generator and select "Amazon EC2" from the "AWS Service",
158 | select "All Actions" for "Actions", and enter "*" for "Amazon Resource Name (ARN)". This is the most liberal policy and you can certainly restrict it from there.
159 |
160 | A simple cluster can then be launched as follows:
161 |
162 | $SPARK_HOME/ec2/spark-ec2 -k yourkey -i yourkey.pem -s 3 -t m3.medium -z us-east-1c --copy-aws-credentials launch "Spark Test"
163 |
164 | At the very end you'll see the master hostname and you can visit this in your browser:
165 |
166 | http://ec2-nn-nn-nn-nn.compute-1.amazonaws.com:8080/
167 |
168 | Spark jobs are run from the master node of the cluster. You can login (ssh) via:
169 |
170 | $SPARK_HOME/ec2/spark-ec2 -k yourkey -i yourkey.pem login "Spark Test"
171 |
172 | Finally, you can terminate your cluster:
173 |
174 | $SPARK_HOME/ec2/spark-ec2 -k yourkey -i yourkey.pem destroy "Spark Test"
175 |
176 | Running a job requires two things:
177 |
178 | 1. Your code (driver) must be transferred to the master node.
179 | 2. Your data must be accessible by all nodes (copied to each node, put into HDFS or S3, etc.)
180 |
181 | #### Testing ####
182 |
183 | First let's try transferring our data and code to the master node:
184 |
185 | scp -i yourkey.pem wordcount.py root@ec2-nn-nn-nn-nn.compute-1.amazonaws.com:~
186 | scp -i yourkey.pem random/random-0.txt root@ec2-nn-nn-nn-nn.compute-1.amazonaws.com:~
187 |
188 | Note: We'll only use the first file of random words to minimize network bandwidth use.
189 |
190 | Then login:
191 |
192 | $SPARK_HOME/ec2/spark-ec2 -k yourkey -i yourkey.pem login "Spark Test"
193 |
194 | Run a job:
195 |
196 | time spark/bin/spark-submit --master spark://ec2-nn-nn-nn-nn.compute-1.amazonaws.com:7077 wordcount.py random-0.txt > /dev/null
197 |
198 | Now we can copy that same file to S3 from your local machine:
199 |
200 | aws s3 cp random/random-0.txt s3://mybucket/random/random-0.txt
201 |
202 | and try the same job with an S3 URI (note the use of the `s3n` scheme):
203 |
204 | time spark/bin/spark-submit --master spark://ec2-nn-nn-nn-nn.compute-1.amazonaws.com:7077 wordcount.py s3n://mybucket/random/random-0.txt > /dev/null
205 |
206 | You should see a notable difference in processing time as S3 is far slower than local files.
207 |
208 | ### Spark on EMR ###
209 |
210 | TBD ... yarn, yarn, yarn
211 |
--------------------------------------------------------------------------------
/activities/decision-trees/decision-tree.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
113 |
--------------------------------------------------------------------------------
/activities/text-processing-with-nltk/README.md:
--------------------------------------------------------------------------------
1 | # Text Processing with NLTK #
2 |
3 | ## Setup ##
4 |
5 | NLTK is a Python module for processing "natural languages". It also contains supporting data files
6 | (e.g., stop word lists by language) necessary for some of the algorithms to function.
7 |
8 | To install NLTK for yourself, do the following:
9 |
10 | pip install nltk
11 | python -m nltk.downloader all
12 |
13 | If you are on a Mac OS X / Linux system, you may want to install the NLTK module for everyone:
14 |
15 | sudo pip install nltk
16 | sudo python -m nltk.downloader -d /usr/share/nltk_data all
17 |
18 | To test that you've got everything installed:
19 |
20 | from nltk.book import *
21 | text1.concordance("whale")
22 |
23 | should print a list of phrases in Moby Dick that contain the word 'whale'.
24 |
25 | ## Basics of Tokenization ##
26 |
27 | Many algorithms for processing text require taking passages of text and turning them into sentences and words. The process of doing so is very
28 | specific to the language being processed and possibly influenced by how the text was collected or the genre of communication.
29 |
30 | In general, languages like English, Spanish, and other modern European languages are directly supported by the corpus of configuration data
31 | provided by NLTK. These languages also share a common mechanism for simple tokenization into sentences and words.
32 |
33 | A passage of text, like the above, can first be broken down into sentences and then into words:
34 | ```
35 | import nltk
36 |
37 | text = '''Many algorithms for processing text require taking passages of text and turning them into sentences
38 | and words. The process of doing so is very specific to the language being processed and possibly influenced
39 | by how the text was collected or the genre of communication.'''
40 |
41 | sentences = nltk.tokenize.sent_tokenize(text)
42 |
43 | for s in sentences:
44 | words = nltk.tokenize.word_tokenize(s)
45 | print words
46 | ```
47 | Notice how the punctuation of the sentences is mixed in with the words. Tokenization doesn't take into account any
48 | syntax that might be present. As such, text that contains any kind of annotation, URLs, etc. may need to be filtered
49 | when turned into word tokens.
50 |
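For example, one crude but common filter is to keep only purely alphabetic tokens; what you keep depends entirely on your data:

    import nltk

    s = "Check out http://example.com -- it's great!"
    words = nltk.tokenize.word_tokenize(s)
    filtered = [w for w in words if w.isalpha()]   # drops punctuation and URL fragments
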
51 | Further, words can be annotated independently for their "parts of speech" (POS):
52 |
53 | import nltk
54 |
55 | s = "The quick brown fox jumped over the fence."
56 | words = nltk.tokenize.word_tokenize(s)
57 | nltk.pos_tag(words)
58 |
59 | which should produce:
60 |
61 | [('The', 'DT'), ('quick', 'NN'), ('brown', 'NN'), ('fox', 'NN'), ('jumped', 'VBD'), ('over', 'IN'), ('the', 'DT'), ('fence', 'NN'), ('.', '.')]
62 |
63 | Each of the codes can be looked up in the help:
64 |
65 | >>> nltk.help.upenn_tagset('DT')
66 | DT: determiner
67 | all an another any both del each either every half la many much nary
68 | neither no some such that the them these this those
69 | >>> nltk.help.upenn_tagset('NN')
70 | NN: noun, common, singular or mass
71 | common-carrier cabbage knuckle-duster Casino afghan shed thermostat
72 | investment slide humour falloff slick wind hyena override subhumanity
73 | machinist ...
74 |
75 | ## Stopwords ##
76 |
77 | Many languages contain words that occur very often (e.g., "the" or "a" in English) and their frequent use will
78 | overwhelm more interesting words useful in analysis. A common technique is to use a stop word list to exclude
79 | such common words from further processing.
80 |
81 | NLTK supports stop words for a number of languages and they are accessed as:
82 |
83 | stopWords = nltk.corpus.stopwords.words('english')
84 | >>> stopWords
85 | [u'i', u'me', u'my', u'myself', u'we', u'our', u'ours', u'ourselves', u'you', u'your', u'yours',
86 | u'yourself', u'yourselves', u'he', u'him', u'his', u'himself', u'she', u'her', u'hers', u'herself',
87 | u'it', u'its', u'itself', u'they', u'them', u'their', u'theirs', u'themselves', u'what', u'which',
88 | u'who', u'whom', u'this', u'that', u'these', u'those', u'am', u'is', u'are', u'was', u'were', u'be',
89 | u'been', u'being', u'have', u'has', u'had', u'having', u'do', u'does', u'did', u'doing', u'a', u'an',
90 | u'the', u'and', u'but', u'if', u'or', u'because', u'as', u'until', u'while', u'of', u'at', u'by',
91 | u'for', u'with', u'about', u'against', u'between', u'into', u'through', u'during', u'before',
92 | u'after', u'above', u'below', u'to', u'from', u'up', u'down', u'in', u'out', u'on', u'off', u'over',
93 | u'under', u'again', u'further', u'then', u'once', u'here', u'there', u'when', u'where', u'why', u'how',
94 | u'all', u'any', u'both', u'each', u'few', u'more', u'most', u'other', u'some', u'such', u'no', u'nor',
95 | u'not', u'only', u'own', u'same', u'so', u'than', u'too', u'very', u's', u't', u'can', u'will', u'just',
96 | u'don', u'should', u'now']
97 |
98 | The `nltk.corpus.stopwords` module just returns a simple list of words you can use in your own code. For example,
99 | a simple list comprehension can be used to filter a list of words:
100 |
101 | stopWords = nltk.corpus.stopwords.words('english')
102 | filtered = [e.lower() for e in words if not e.lower() in stopWords]
103 |
104 | and another trick is to add your list of punctuation to the stop word list:
105 |
106 | stopWords = nltk.corpus.stopwords.words('english') + ['.',',']
107 | filtered = [e.lower() for e in words if not e.lower() in stopWords]
108 |
109 | The languages supported by NLTK can be discovered by inspecting the `nltk.corpus.stopwords` object:
110 |
111 | >>> nltk.corpus.stopwords
112 |     <WordListCorpusReader in '/usr/share/nltk_data/corpora/stopwords'>
113 |
114 | The reader outputs the directory in which the stop words are stored. You can list the supported languages:
115 |
116 | $ ls /usr/share/nltk_data/corpora/stopwords
117 | README english german norwegian spanish
118 | danish finnish hungarian portuguese swedish
119 | dutch french italian russian turkish
120 | $ head -n 10 /usr/share/nltk_data/corpora/stopwords/english
121 | i
122 | me
123 | my
124 | myself
125 | we
126 | our
127 | ours
128 | ourselves
129 | you
130 | your
131 |
132 | The files contain a single word per line. As such, you can create or modify a stop word list for any language and add it to NLTK.
133 |
134 | ## Frequency Distributions ##
135 |
136 | A frequency distribution can be constructed from a list passed as a constructor parameter:
137 |
138 | import nltk
139 | words = [ 'A', 'A', 'B', 'B', 'B', 'C']
140 |     fd = nltk.FreqDist(words)
141 | fd.tabulate()
142 |
143 | produces the output:
144 |
145 | B A C
146 | 3 2 1
147 |
148 | You can also produce a visual plot by calling `plot()`.
149 |
150 | A frequency distribution can be constructed iteratively as well:
151 |
152 |     fd = nltk.FreqDist()
153 | for w in words:
154 | fd[w.lower()] += 1
155 |
156 | or via a comprehension:
157 |
158 |     fd = nltk.FreqDist(w.lower() for w in words)
159 |
160 | ## Stemming and Lemmatization ##
161 |
162 | Stemming: the process for reducing inflected (or sometimes derived) words to their stem, base or root form.
163 |
164 | Lemmatization: the process of grouping together the different inflected forms of a word so they can be analysed as a single item.
165 |
166 | NLTK supports:
167 |
168 | * [Porter Stemming](http://tartarus.org/martin/PorterStemmer/)
169 | * [Lancaster Stemming](http://www.comp.lancs.ac.uk/computing/research/stemming/)
170 | * [Snowball Stemming](http://snowball.tartarus.org)
171 | * Lemmatization based on [WordNet’s built-in morphy function](http://wordnet.princeton.edu)
172 |
173 | For stemming, you construct a stemmer and then call `stem()` on the word:
174 |
175 | from nltk.stem.lancaster import LancasterStemmer
176 | stemmer = LancasterStemmer()
177 |     w = stemmer.stem('presumably') # returns u'presum'
178 |
179 | In the above, you can use `nltk.stem.porter.PorterStemmer`, `nltk.stem.lancaster.LancasterStemmer`, or `nltk.stem.SnowballStemmer`.
180 |
181 | Lemmatization is similar:
182 |
183 | from nltk.stem import WordNetLemmatizer
184 | lemmatizer = WordNetLemmatizer()
185 |     lemmatizer.lemmatize('dogs') # returns u'dog'
186 |
187 | but the lemmatizer assumes by default everything is a noun. For verbs, this means that results are not lemmatized
188 | properly (e.g., "are" and "is" do not become "be").
189 |
190 | For example, try:
191 |
192 | from nltk.stem import WordNetLemmatizer
193 | lemmatizer = WordNetLemmatizer()
194 | lemmatizer.lemmatize('is',pos='v')
195 | lemmatizer.lemmatize('are',pos='v')
196 |
197 | The `pos` argument can have the following values:
198 |
199 | * 'a' - adjective
200 | * 'r' - adverb
201 | * 'n' - noun
202 | * 'v' - verb
203 |
204 | ## Activity ##
205 |
206 | Pick a passage of text and:
207 |
208 | 1. Tokenize the text.
209 | 2. List all the nouns in the passage.
210 | 3. Apply a stop word filter to the tokenized text.
211 | 4. Compute and plot a frequency distribution of the top 50 words.
212 | 5. Apply a lemmatization algorithm with the pos argument set to 'n' and recompute your frequency distribution.
213 |
214 |
--------------------------------------------------------------------------------
/activities/data-munging/README.md:
--------------------------------------------------------------------------------
1 | # Data Munging - Processing JSON, XML, and CSV Data #
2 |
3 | ## CWOP Data Set ##
4 |
5 | The Citizen Weather Observation Program (CWOP) collects weather data from a variety of citizen, business, and government
6 | sources over the Internet. It collects over 75,000 weather reports an hour from 10,000+ weather stations located all over
7 | the world but mostly concentrated in North America.
8 |
9 | The data collected is transmitted as APRS weather reports (need ref) in a coded format that is eventually disseminated
10 | via a real-time peer-to-peer network using a system called APRS-IS (need ref). This information can be received and decoded
11 | by attaching to several of the servers associated with the CWOP program and aggregating the results.
12 |
13 | The [mesonet.info](http://www.mesonet.info) site collects and aggregates this data. The data acquisition process first
14 | serializes the data collected from each server into 5 minute segments stored in a custom XML format, in which each weather report is an element whose measurements are encoded as attributes.
15 |
16 |
17 |
18 |
19 |
20 | ...
21 |
22 |
23 | Each weather report has an identifier (@from), a location (@latitude and @longitude), a received time (@received), a generation time from the weather station (@at), and a
24 | variety of weather report facets (e.g., @temperature). These facets for the weather reports and their units of measure are listed below:
25 |
26 | wind-dir
27 | wind-speed
28 | wind-gust
29 | temperature
30 | rain-hour
31 | rain-24hours
32 | rain-midnight
33 | humidity
34 | pressure
35 |
36 | An excerpt of this data for 2014-12-26 has been stored on AWS S3 in the public bucket `milowski-cwop-data`. It is organized first by date (e.g., 2014-12-26) and then by format and hour. The
37 | raw XML data has been transformed into JSON ([GeoJSON](http://geojson.org)) and CSV data formats as well. Each of the variations is located in the 'xml', 'json', or 'csv' "directories" in S3.
38 |
39 | For example, 2014-12-26 from 13:00 to 14:00 in JSON is located in:
40 |
41 | s3://milowski-cwop-data/2014-12-26/json/13:00/
42 |
43 | Each key (file) represents a 5 minute segment of data partitioned only by time. Reports for a particular location can only be selected by filtering the subset of information
44 | amongst all the various sources stored under the same key (directory).
45 |
46 | For example, you'll find the full keys for the data set as follows:
47 |
48 | s3://milowski-cwop-data/2014-12-26/json/13:00/weather-cwop1-2014-12-26T13:00:00Z.json
49 | s3://milowski-cwop-data/2014-12-26/json/13:00/weather-cwop1-2014-12-26T13:05:00Z.json
50 | s3://milowski-cwop-data/2014-12-26/json/13:00/weather-cwop1-2014-12-26T13:10:00Z.json
51 | ...
52 | s3://milowski-cwop-data/2014-12-26/json/13:00/weather-cwop2-2014-12-26T13:00:00Z.json
53 | s3://milowski-cwop-data/2014-12-26/json/13:00/weather-cwop2-2014-12-26T13:05:00Z.json
54 | s3://milowski-cwop-data/2014-12-26/json/13:00/weather-cwop2-2014-12-26T13:10:00Z.json
55 | ...
56 |
57 | The names of the keys encode the source server and start of the time segment. This information is only repeated in the XML source and not in the JSON or CSV formats.
58 |
59 | ## Activities ##
60 |
61 | In this activity, we will be:
62 |
63 | * downloading copies of the data via S3
64 | * processing a variety of data formats (i.e., XML, JSON, and CSV)
65 | * computing simple statistics or subsets
66 | * accessing data directly via S3 via boto
67 |
68 | In general, we'll be computing two things:
69 |
70 | * an average (e.g., average temperature)
71 | * geospatial subsets for rectangular areas (quadrangles)
72 |
73 | ### A. Making a Copy ###
74 |
75 | #### Description ####
76 |
77 | The data is available on S3 and you can download a copy (or a subset) easily via the AWS CLI. Keep in mind that S3 is a key/value store. All the data is associated with
78 | the full path of the key. The concept of a "directory" and contained "files" is only implied by the "/" in the key and so is an interpretation of the tool being used.
79 |
80 | Fortunately, the AWS CLI interprets directories in keys as you might expect. Try the following:
81 |
82 | aws s3 ls s3://milowski-cwop-data/2014-12-26/json/13:00/
83 |
84 | When you run that command, you should see the complete listing of 79 keys (files).
85 |
86 | You can copy a single file or directory to your local drive via the same base command. To copy a file locally, try:
87 |
88 | aws s3 cp s3://milowski-cwop-data/2014-12-26/json/13:00/weather-cwop1-2014-12-26T13:00:00Z.json .
89 |
90 | If you want to copy a whole directory, try:
91 |
92 | aws s3 cp s3://milowski-cwop-data/2014-12-26/json/13:00 . --recursive
93 |
94 | #### Tasks ####
95 |
96 | 1. Pick a particular hour (e.g., 13:00).
97 | 2. Copy the remote buckets for all the formats (i.e., 'xml', 'json', 'csv') to your local disk.
98 |
99 |
100 | ### B. Parsing XML: Computing an Average ###
101 |
102 | #### Description ####
103 |
104 | In this activity you'll be parsing XML data sources and computing an average temperature. You will want to iterate over a set of XML documents in a directory, parse each XML source,
105 | and interpret the @temperature attribute as a real number measuring temperature in Fahrenheit. You should compute an average over all weather reports in all the documents you process.
106 |
107 | You can parse XML using Python's built in [xml.etree module](https://docs.python.org/2/library/xml.etree.elementtree.html); see [xml-parse.py](xml-parse.py).
108 |
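A minimal sketch of the shape of this computation is below. It assumes the XML files for your chosen hour have been copied into a local directory (adjust the glob to match where you put them), and it simply reads the `temperature` attribute wherever it appears; see [xml-parse.py](xml-parse.py) for the intended parsing approach:

    import glob
    import xml.etree.ElementTree as ET

    total = 0.0
    count = 0
    for path in glob.glob("xml/*.xml"):
        tree = ET.parse(path)
        for elem in tree.iter():
            t = elem.get("temperature")   # @temperature, in Fahrenheit
            if t is not None:
                total += float(t)
                count += 1

    if count > 0:
        print "average temperature:", total / count
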
109 | #### Tasks ####
110 |
111 | 1. Pick a particular hour.
112 | 2. Parse all the XML files in python and sum the temperature values for every observed weather report.
113 | 3. Calculate the average temperature for that hour for all the CWOP data received.
114 |
115 | ### C. Parsing JSON: Geospatial Partitioning ###
116 |
117 | #### Description ####
118 |
119 | The CWOP XML data has been translated into [geojson](http://geojson.org). The data is received in whatever order the weather stations report it, but it can be filtered for a specific region.
120 | We'll parse the weather data as JSON and select only those reports that occur within a specific quadrangle.
121 |
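A sketch of the bounding-box filter is below. The GeoJSON layout and property names here are assumptions; inspect one of the downloaded files to see exactly where the coordinates and temperature live:

    import glob
    import json

    # quadrangle from the task below: upper left (-125, 40), lower right (-120, 35)
    min_lon, max_lat, max_lon, min_lat = -125, 40, -120, 35

    for path in glob.glob("json/*.json"):
        with open(path) as f:
            data = json.load(f)
        for feature in data.get("features", []):             # assumes a FeatureCollection
            lon, lat = feature["geometry"]["coordinates"][:2]
            if min_lon <= lon <= max_lon and min_lat <= lat <= max_lat:
                # e.g. accumulate feature["properties"].get("temperature") here
                pass
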
122 | #### Tasks ####
123 |
124 | 1. Pick a particular hour.
125 | 2. Parse all the JSON files and select the temperature values that occur within the quadrangle \[-125, 40, -120, 35 \] (upper left, lower right).
126 | 3. Calculate the average temperature for that hour for that region.
127 |
128 | ### D. Parsing CSV: Grid Averages ###
129 |
130 | #### Description ####
131 |
132 | Comma Separated Values (CSV) is a very common but non-standardized data format. The CWOP data set has been transformed into a simple set of CSV data files. You should attempt to partition the data
133 | by quadrangles and produce a temperature summary for each quadrangle covering the continental USA (i.e., \[-125, 45, -65, 25\]). A partitioning by 5° quadrangles will produce a
134 | 12 by 4 grid over the region.
135 |
136 | CSV data can be easily parsed in Python using the [csv module](https://docs.python.org/2/library/csv.html); see [csv-dump.py](csv-dump.py).
137 |
138 | #### Tasks ####
139 |
140 | 1. Pick a particular hour.
141 | 2. Parse all the CSV files and select the subset within the region. Assign each report to a grid cell.
142 | 3. Calculate the average temperature for each grid cell.
143 |
144 |
145 | ### E. Direct Access to S3 via boto ###
146 |
147 | #### Description ####
148 |
149 | You can access S3 in Python via the [boto module](http://boto.readthedocs.org/en/latest/s3_tut.html). There are samples for outputting a key's value ([s3cat.py](s3cat.py)),
150 | copying a file into S3 ([s3copy.py](s3copy.py)), and listing the keys in a bucket ([s3list.py](s3list.py)).
151 |
152 | You need to set environment variables for the code to work as it needs your AWS key and secret:
153 |
154 | export AWS_ACCESS_KEY_ID=...
155 | export AWS_SECRET_ACCESS_KEY=...
156 |
157 | The documentation is [available online](http://boto.readthedocs.org/en/latest/ref/s3.html).
158 |
159 | #### Activity ####
160 |
161 | You can repeat any of the activities above by accessing the data directly.
162 |
163 | 1. Pick a previous activity for which you have working code.
164 | 2. Modify the activity to read the list of files out of the bucket.
165 | 3. Process the data directly by either temporarily storing the files locally or loading the contents into strings.
166 |
167 | Note: You can list a subset of keys in a bucket by using the `prefix` parameter. See [s3list.py](s3list.py) for an example.
168 |
169 |
170 |
171 |
--------------------------------------------------------------------------------
/activities/common-crawl/tag-count-mr.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
155 |
--------------------------------------------------------------------------------
/activities/emr-cluster/README.md:
--------------------------------------------------------------------------------
1 | # Creating Clusters for EMR #
2 |
3 | ## Setup ##
4 |
5 | Install the [AWS CLI](http://docs.aws.amazon.com/cli/latest/userguide/) with
6 |
7 | sudo pip install awscli
8 |
9 | You may need to link to the executable:
10 |
11 | sudo ln -s /opt/local/Library/Frameworks/Python.framework/Versions/2.7/bin/aws /opt/local/bin/aws
12 |
13 | Then configure your system with the AWS key, secret, and default region (e.g. us-east-1). You can leave the default output format blank.
14 |
15 | aws configure
16 |
17 | You can re-run this command at any time to change the values.
18 |
19 | Now you can test it by asking about your running EC2 instances (you may have none):
20 |
21 | aws ec2 describe-instances
22 |
23 | You'll need two things to run any of the EMR activities:
24 |
25 | 1. An S3 bucket to store logs, code, input, and output data.
26 | 2. An EMR cluster to run the examples.
27 |
28 | ## Basics of EMR Clusters ##
29 |
30 | You can start a simple test cluster by doing the following:
31 |
32 | aws emr create-cluster --ami-version 3.4.0 --instance-groups InstanceGroupType=MASTER,InstanceCount=1,InstanceType=m1.medium InstanceGroupType=CORE,InstanceCount=2,InstanceType=m1.medium --name "Test Cluster" --log-uri s3://mybucket/logs/ --enable-debugging --tags Name=emr
33 |
34 | The --instance-groups option contains a set of triples in the shorthand syntax for `InstanceGroupType` (one of "MASTER", "CORE", or "TASK"),
35 | `InstanceType` (an EC2 instance type), and `InstanceCount` (the number of instances to start). Alternatively, you can use JSON to describe the
36 | cluster instance groups.
37 |
38 | For example, in a file [cluster.json](cluster.json):
39 |
40 | [
41 | {
42 | "InstanceGroupType": "MASTER",
43 | "InstanceCount": 1,
44 | "InstanceType": "m1.medium"
45 | },
46 | {
47 | "InstanceGroupType": "CORE",
48 | "InstanceCount": 2,
49 | "InstanceType": "m1.medium"
50 | }
51 | ]
52 |
53 | and then the command:
54 |
55 | aws emr create-cluster --ami-version 3.4.0 --instance-groups file://./cluster.json --name "Test Cluster" --log-uri s3://mybucket/logs/ --enable-debugging --tags Name=emr
56 |
57 | The command will return the "Cluster ID" that you will need for further manipulations, including terminating the cluster. You can always find this via the command:
58 |
59 | aws emr list-clusters --active
60 |
61 | You can terminate a cluster by:
62 |
63 |     aws emr terminate-clusters --cluster-ids your-cluster-id
64 |
65 | The documentation examples consistently use the bucket name 'mybucket'. You'll need to replace that with your bucket name to get the commands to work.
66 |
67 | ## Resizing a Cluster ##
68 |
69 | You can add core or task nodes to a running cluster via the cluster details. Clicking on "Resize" next to "Network and Hardware" will give you the ability to add Core and Task nodes
70 | whilst choosing the instance type. Clicking on "Resize" in the "Hardware" section only allows you to change the number of nodes of a given category with the same instance type.
71 |
72 | Both of these are useful techniques to adjust your running cluster once you have found it to be insufficient for processing data. The adjustment only happens after the currently
73 | running step completes. As such, you may need to kill a running step if you know it will take too long to complete, in order to adjust the size of your running cluster.
74 |
75 | ## Bootstrap Actions ##
76 |
77 | Once a generic cluster instance has been started, you may need to install specialized software (e.g. python packages). You can specify a set of one-time actions
78 | called "Bootstrap Actions" when you create the cluster using the `--bootstrap-actions` option. Like the --instance-groups option, you can use the shorthand syntax or JSON.
79 |
80 | Each action must contain three things:
81 |
82 | * Path — the path to a script (typically in S3)
83 | * Args - any arguments to the script
84 | * Name — a name to show in the console
85 |
86 | The shorthand is:
87 |
88 | --bootstrap-actions Path=s3://mybucket/python.sh,Name="Install python packages",Args=[numpy,nltk]
89 |
90 | The JSON in `bootstrap.json`:
91 |
92 | [
93 | {
94 | "Path" : "s3://mybucket/python.sh",
95 | "Name" : "Install python packages",
96 | "Args" : ["numpy","nltk"]
97 | }
98 | ]
99 |
100 | with the option:
101 |
102 | --bootstrap-actions file://./bootstrap.json
103 |
104 | The script stored at s3://mybucket/python.sh might be something like:
105 |
106 | #!/bin/bash
107 | sudo pip install $*
108 |
109 | ### Testing Bootstrap Actions ###
110 |
111 | In general, if your script runs on a like operating system (e.g. Linux of the same flavor), you'll be in good shape. AWS EMR's AMIs are based on RedHat/CentOS and
112 | so scripts that work on those particular flavors may work. The right way to test bootstrapping is to use the specific AMI for the EMR version, start an EC2
113 | instance, and test on that machine.
114 |
115 | ### Testing Bootstrapping using EMR AMIs ###
116 |
117 | You can test your bootstrapping commands by just starting the exact AMI used by EMR. When you start a cluster, you can look up the
118 | AMI used by EMR in your EC2 console. Under the details of a running or terminated instance associated with your cluster, you'll see
119 | the AMI listed. It should be an identifier formatted like "ami-xxxxxxxx".
120 |
121 | For example, ami-2e88aa46 is the identifier for AMI version 3.6.0 that you can select when you start your cluster. You can then
122 | start an EC2 instance using that AMI using the CLI:
123 |
124 | aws ec2 run-instances --image-id ami-2e88aa46 --key-name your-key-name --instance-type m1.medium --placement AvailabilityZone=us-east-1c
125 |
126 | In the above, you'll want to list your actual key name in place of `your-key-name` and adjust the `AvailabilityZone` value to your preference.
127 |
128 | Now you can ssh into the machine using your key and the user `hadoop`. This user has sudo privileges and so should be able to run your script exactly
129 | as EMR would during cluster bootstrapping.
130 |
131 | Once you are done, you can shutdown the instance via the console in the browser or use the instance ID returned from the `run-instances` command in the following:
132 |
133 | aws ec2 terminate-instances --instance-ids i-3259abcf
134 |
135 | ## Running "Steps" ##
136 |
137 | A step is a unit of work. You can add steps to your cluster via the AWS CLI or via libraries like MRJob.
138 |
139 | A step contains a set of jobs and a job contains a set of tasks (e.g. mappers and reducers).
140 |
141 | Often, a single step contains a single job that contains a map/reduce process. That map/reduce process is turned into a set of map tasks based on the input size. The control over
142 | that process is handled by the input splitter used in Hadoop. Subsequently, the number of reduce tasks depends on the number of map tasks. These are all things you can control when you
143 | configure Hadoop.
144 |
145 |
146 | ### Running Steps via AWS CLI ###
147 |
148 | The AWS CLI command `aws emr add-steps` is used to add steps to your cluster. The cluster identifier is necessary and you can find this in the cluster details.
149 |
150 | The step is described by a set of metadata:
151 |
152 | * Name — A descriptive name.
153 | * Type — the type of Hadoop job (i.e. one of "CUSTOM_JAR", "STREAMING", "HIVE", "PIG", "IMPALA")
154 | * Args - a set of arguments to pass to the step
155 | * Jar — a location of a jar implementing the step (only for "CUSTOM_JAR"). This location must be accessible to the Hadoop cluster and may be an S3 URI.
156 | * ActionOnFailure — One of "TERMINATE_CLUSTER", "CANCEL_AND_WAIT" (pause the step queue), "CONTINUE"
157 | * MainClass — the main class to use (only for CUSTOM_JAR)
158 |
159 | The shorthand syntax can be used to specify all of the above, but the JSON syntax is more useful.
160 |
161 | For example, a Hadoop streaming job might be specified as:
162 |
163 | [ {
164 | "Type" : "STREAMING",
165 | "Name" : "Multiply",
166 | "ActionOnFailure" : "CONTINUE",
167 | "Args" : [
168 | "-files","s3://mybucket/prime-factors.py",
169 | "-mapper","prime-factors.py",
170 | "-reducer","aggregate",
171 | "-input","s3://mybucket/multiply/input",
172 | "-output","s3://mybucket/multiply/output"
173 | ]
174 | } ]
175 |
176 | The arguments are all specific to the [Hadoop Streaming program](http://hadoop.apache.org/docs/r2.6.0/hadoop-mapreduce-client/hadoop-mapreduce-client-core/HadoopStreaming.html). Similarly,
177 | any other program (including your own custom jar) would have its own argument definition.
178 |
179 | Once you have your JSON definition (`step.json` in this case), you can add it to your running cluster by:
180 |
181 |     aws emr add-steps --cluster-id your-cluster-id --steps file://./step.json
182 |
183 | ### Running Steps via MRJob ###
184 |
185 | [MRJob](http://mrjob.readthedocs.org) is a very useful abstraction and has the ability to run jobs directly on EMR. While you can use MRJob to start a cluster,
186 | a more useful technique is to run your MRJob program on an already started cluster.
187 |
188 | Running on an existing cluster is easily done by two extra parameters:
189 |
190 | 1. Add the `-r emr` option to select the EMR runner.
191 | 2. Add the `--emr-job-flow-id your-cluster-id` to specify your existing cluster.
192 |
193 | Since you are running on the cluster, there are some additional life-cycle options you may want to control. First, by default, MRJob will upload your
194 | input (e.g. stdin) to S3 and download the output. You'll probably want to run everything from S3 and this is easily done:
195 |
196 | 1. Specify your input bucket as an extra argument to your program, just as you might give it a file name, but instead give it the S3 bucket URI.
197 | 2. Use `--no-output` to turn off downloading the result and `--output-dir s3://yourbucket/yourpath` to specify the output S3 bucket.
198 |
199 | If you have supporting code for your program, you'll need to package it into an archive in tar/gz format. Then just specify that on the command-line using `--python-archive code.tar.gz`
200 |
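Putting those options together, a run against an existing cluster looks something like the following (the script name, bucket paths, and cluster id are placeholders):

    python your_job.py -r emr --emr-job-flow-id your-cluster-id \
        --conf-path mrjob.conf \
        --no-output --output-dir s3://mybucket/output/ \
        --python-archive code.tar.gz \
        s3://mybucket/input/
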
201 | You may have changed the version of Python on your cluster via a bootstrap action. If so, you can specify the Python command via `--python-bin`. That option expects a command (or full path)
202 | that will run the Python interpreter.
203 |
204 | When you use `--no-output` and `--output-dir` together with MRJob, the results are stored on AWS S3. You can interrupt the local MRJob process after the step has started
205 | and it will continue to run on your cluster. This allows you to terminate the local process and continue other work. You will have to check the cluster interface online to
206 | see the status of your job.
207 |
208 | ### Killing Steps ###
209 |
210 | There is no easy way to kill a running step via the AWS CLI or the browser interface. If you terminate the cluster, the step will be killed first, but that is a
211 | draconian way to kill a step. If you stop the cluster, you will have to restart the cluster, and that can take quite a while.
212 |
213 | The way you kill a step is to talk to Hadoop directly, as follows:
214 |
215 | 1. SSH into the master node. You'll find the connection information in the cluster details and then you'll do something like:
216 |
217 | `ssh hadoop@ec2-nn-nn-nn-nn.compute-1.amazonaws.com -i ~/your-identity.pem`
218 |
219 | 2. Once you are connected, list the jobs with `mapred job -list`
220 |
221 | 3. Locate the row that represents the step you'd like to kill. At this point, a step has turned into a set of jobs. If you only have one job, there will be only one row.
222 |
223 | 4. The first column is labeled `JobId`. Use that identifier to kill the job with `mapred job -kill id` where `id` is the value in that column.
224 |
225 | ## Manipulating S3 Buckets via AWS CLI ##
226 |
227 | You can create a bucket by:
228 |
229 | aws s3 mb s3://mybucket/
230 |
231 | Listing a bucket:
232 |
233 | aws s3 ls s3://mybucket/
234 |
235 | Copying a file to a path:
236 |
237 | aws s3 cp file.txt s3://mybucket/somewhere/
238 |
239 | Removing a key:
240 |
241 | aws s3 rm s3://mybucket/somewhere/file.txt
242 |
243 | Syncing a directory to s3 (both ways):
244 |
245 | aws s3 sync somewhere s3://mybucket/somewhere
246 | aws s3 sync s3://mybucket/somewhere somewhere
247 |
248 |
249 | Removing a set of keys via a prefix:
250 |
251 | aws s3 rm s3://mybucket/somewhere/ --recursive
252 |
--------------------------------------------------------------------------------
/activities/common-crawl/test-100.warc:
--------------------------------------------------------------------------------
1 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00000-ip-10-180-136-8.ec2.internal.warc.gz
2 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00001-ip-10-180-136-8.ec2.internal.warc.gz
3 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00002-ip-10-180-136-8.ec2.internal.warc.gz
4 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00003-ip-10-180-136-8.ec2.internal.warc.gz
5 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00004-ip-10-180-136-8.ec2.internal.warc.gz
6 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00005-ip-10-180-136-8.ec2.internal.warc.gz
7 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00006-ip-10-180-136-8.ec2.internal.warc.gz
8 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00007-ip-10-180-136-8.ec2.internal.warc.gz
9 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00008-ip-10-180-136-8.ec2.internal.warc.gz
10 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00009-ip-10-180-136-8.ec2.internal.warc.gz
11 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00010-ip-10-180-136-8.ec2.internal.warc.gz
12 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00011-ip-10-180-136-8.ec2.internal.warc.gz
13 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00012-ip-10-180-136-8.ec2.internal.warc.gz
14 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00013-ip-10-180-136-8.ec2.internal.warc.gz
15 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00014-ip-10-180-136-8.ec2.internal.warc.gz
16 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00015-ip-10-180-136-8.ec2.internal.warc.gz
17 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00016-ip-10-180-136-8.ec2.internal.warc.gz
18 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00017-ip-10-180-136-8.ec2.internal.warc.gz
19 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00018-ip-10-180-136-8.ec2.internal.warc.gz
20 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00019-ip-10-180-136-8.ec2.internal.warc.gz
21 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00020-ip-10-180-136-8.ec2.internal.warc.gz
22 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00021-ip-10-180-136-8.ec2.internal.warc.gz
23 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00022-ip-10-180-136-8.ec2.internal.warc.gz
24 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00023-ip-10-180-136-8.ec2.internal.warc.gz
25 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00024-ip-10-180-136-8.ec2.internal.warc.gz
26 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00025-ip-10-180-136-8.ec2.internal.warc.gz
27 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00026-ip-10-180-136-8.ec2.internal.warc.gz
28 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00027-ip-10-180-136-8.ec2.internal.warc.gz
29 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00028-ip-10-180-136-8.ec2.internal.warc.gz
30 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00029-ip-10-180-136-8.ec2.internal.warc.gz
31 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00030-ip-10-180-136-8.ec2.internal.warc.gz
32 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00031-ip-10-180-136-8.ec2.internal.warc.gz
33 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00032-ip-10-180-136-8.ec2.internal.warc.gz
34 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00033-ip-10-180-136-8.ec2.internal.warc.gz
35 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00034-ip-10-180-136-8.ec2.internal.warc.gz
36 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00035-ip-10-180-136-8.ec2.internal.warc.gz
37 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00036-ip-10-180-136-8.ec2.internal.warc.gz
38 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00037-ip-10-180-136-8.ec2.internal.warc.gz
39 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00038-ip-10-180-136-8.ec2.internal.warc.gz
40 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00039-ip-10-180-136-8.ec2.internal.warc.gz
41 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00040-ip-10-180-136-8.ec2.internal.warc.gz
42 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00041-ip-10-180-136-8.ec2.internal.warc.gz
43 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00042-ip-10-180-136-8.ec2.internal.warc.gz
44 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00043-ip-10-180-136-8.ec2.internal.warc.gz
45 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00044-ip-10-180-136-8.ec2.internal.warc.gz
46 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00045-ip-10-180-136-8.ec2.internal.warc.gz
47 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00046-ip-10-180-136-8.ec2.internal.warc.gz
48 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00047-ip-10-180-136-8.ec2.internal.warc.gz
49 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00048-ip-10-180-136-8.ec2.internal.warc.gz
50 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00049-ip-10-180-136-8.ec2.internal.warc.gz
51 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00050-ip-10-180-136-8.ec2.internal.warc.gz
52 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00051-ip-10-180-136-8.ec2.internal.warc.gz
53 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00052-ip-10-180-136-8.ec2.internal.warc.gz
54 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00053-ip-10-180-136-8.ec2.internal.warc.gz
55 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00054-ip-10-180-136-8.ec2.internal.warc.gz
56 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00055-ip-10-180-136-8.ec2.internal.warc.gz
57 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00056-ip-10-180-136-8.ec2.internal.warc.gz
58 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00057-ip-10-180-136-8.ec2.internal.warc.gz
59 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00058-ip-10-180-136-8.ec2.internal.warc.gz
60 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00059-ip-10-180-136-8.ec2.internal.warc.gz
61 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00060-ip-10-180-136-8.ec2.internal.warc.gz
62 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00061-ip-10-180-136-8.ec2.internal.warc.gz
63 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00062-ip-10-180-136-8.ec2.internal.warc.gz
64 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00063-ip-10-180-136-8.ec2.internal.warc.gz
65 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00064-ip-10-180-136-8.ec2.internal.warc.gz
66 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00065-ip-10-180-136-8.ec2.internal.warc.gz
67 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00066-ip-10-180-136-8.ec2.internal.warc.gz
68 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00067-ip-10-180-136-8.ec2.internal.warc.gz
69 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00068-ip-10-180-136-8.ec2.internal.warc.gz
70 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00069-ip-10-180-136-8.ec2.internal.warc.gz
71 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00070-ip-10-180-136-8.ec2.internal.warc.gz
72 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00071-ip-10-180-136-8.ec2.internal.warc.gz
73 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00072-ip-10-180-136-8.ec2.internal.warc.gz
74 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00073-ip-10-180-136-8.ec2.internal.warc.gz
75 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00074-ip-10-180-136-8.ec2.internal.warc.gz
76 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00075-ip-10-180-136-8.ec2.internal.warc.gz
77 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00076-ip-10-180-136-8.ec2.internal.warc.gz
78 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00077-ip-10-180-136-8.ec2.internal.warc.gz
79 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00078-ip-10-180-136-8.ec2.internal.warc.gz
80 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00079-ip-10-180-136-8.ec2.internal.warc.gz
81 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00080-ip-10-180-136-8.ec2.internal.warc.gz
82 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00081-ip-10-180-136-8.ec2.internal.warc.gz
83 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00082-ip-10-180-136-8.ec2.internal.warc.gz
84 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00083-ip-10-180-136-8.ec2.internal.warc.gz
85 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00084-ip-10-180-136-8.ec2.internal.warc.gz
86 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00085-ip-10-180-136-8.ec2.internal.warc.gz
87 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00086-ip-10-180-136-8.ec2.internal.warc.gz
88 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00087-ip-10-180-136-8.ec2.internal.warc.gz
89 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00088-ip-10-180-136-8.ec2.internal.warc.gz
90 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00089-ip-10-180-136-8.ec2.internal.warc.gz
91 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00090-ip-10-180-136-8.ec2.internal.warc.gz
92 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00091-ip-10-180-136-8.ec2.internal.warc.gz
93 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00092-ip-10-180-136-8.ec2.internal.warc.gz
94 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00093-ip-10-180-136-8.ec2.internal.warc.gz
95 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00094-ip-10-180-136-8.ec2.internal.warc.gz
96 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00095-ip-10-180-136-8.ec2.internal.warc.gz
97 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00096-ip-10-180-136-8.ec2.internal.warc.gz
98 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00097-ip-10-180-136-8.ec2.internal.warc.gz
99 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00098-ip-10-180-136-8.ec2.internal.warc.gz
100 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00099-ip-10-180-136-8.ec2.internal.warc.gz
101 |
--------------------------------------------------------------------------------
/data-science.svg:
--------------------------------------------------------------------------------
(SVG markup omitted from this dump; the source file spans 196 lines.)
--------------------------------------------------------------------------------