├── activities ├── nosql-databases │ ├── mongo │ │ ├── setup.sh │ │ ├── forkdb.sh │ │ ├── mongo.conf │ │ └── test-insert.js │ └── rest-api.json ├── common-crawl │ ├── mrcc.py.tar.gz │ ├── tag-count-mr.idraw │ ├── test-1.warc │ ├── test-5.warc │ ├── mrjob.conf │ ├── test-10.warc │ ├── mrcc.py │ ├── test-15.warc │ ├── test-20.warc │ ├── README.md │ ├── tag-count-mr.svg │ └── test-100.warc ├── crawling-the-crawl │ ├── mrcc.tar.gz │ ├── cc-bootstrap.sh │ ├── extract-CC-MAIN-20150124161055-00000-ip-10-180-212-252.ec2.internal.warc.wat.gz │ ├── cluster.json │ ├── s3.py │ ├── run-step.sh │ ├── ccex.py │ ├── start.sh │ ├── mrcc.py │ └── README.md ├── decision-trees │ ├── Bias_Variance.jpg │ ├── decision-tree.idraw │ ├── tree-example.idraw │ ├── decision_tree.py │ ├── adaboost_classifier.py │ ├── random_forest.py │ ├── gradient_classifier.py │ ├── iris_tree.py │ ├── regions.py │ ├── tree-example.svg │ └── decision-tree.svg ├── emr-opennex-climate-model │ ├── input-sequences.txt │ ├── input-example.py │ ├── seqs.py │ ├── average.py │ ├── date_partitions.py │ ├── by-sequences.py │ ├── acquire.py │ └── README.md ├── intro-to-spark │ ├── one-line-json.py │ ├── random-text.py │ ├── rdd-map.py │ ├── rdd-flatmap.py │ ├── rdd-reduce.py │ ├── wordcount.py │ ├── tweet-wordcount.py │ └── README.md ├── web-scraping │ ├── urllib2-get.py │ ├── soup.py │ ├── urllib2-headers.py │ └── README.md ├── emr-map-only │ ├── line-count.py │ ├── generate-input.py │ └── README.md ├── sentiment-analysis │ ├── annotate.py │ ├── rt-polaritydata │ │ ├── rt-polarity.neg │ │ ├── rt-polarity.pos │ │ └── README.1.0.txt │ ├── wordcounts.py │ ├── train.py │ ├── test.py │ ├── featureset.py │ ├── n-way.py │ ├── candy-corn.py │ └── README.md ├── data-munging │ ├── pipeline-input.py │ ├── csv-dump.py │ ├── s3list.py │ ├── s3copy.py │ ├── s3cat.py │ ├── xml-parse.py │ └── README.md ├── emr-cluster │ ├── cluster.json │ └── README.md ├── emr-tweet-wordcount │ ├── format-tweets.py │ ├── tweetSplitter.py │ └── README.md ├── emr-prime-multiplier │ ├── step.json │ ├── generate-input.py │ ├── prime-factors.py │ └── README.md ├── twitter-acquisition │ ├── hello-twitter.py │ ├── partitions.py │ ├── search.py │ └── README.md ├── README.md ├── relational-databases │ └── README.md └── text-processing-with-nltk │ └── README.md ├── data-science.png ├── sessions ├── session-8.md ├── session-7.md ├── session-9.md ├── session-12.md ├── session-10.md ├── session-11.md └── session-6.md ├── data-science.xpr ├── assignments ├── tweet-acquisition │ └── README.md ├── getting-started │ └── README.md └── organizing-tweets │ └── README.md ├── README.md └── data-science.svg /activities/nosql-databases/mongo/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | mkdir data 3 | mkdir log 4 | -------------------------------------------------------------------------------- /data-science.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexmilowski/data-science/HEAD/data-science.png -------------------------------------------------------------------------------- /activities/common-crawl/mrcc.py.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexmilowski/data-science/HEAD/activities/common-crawl/mrcc.py.tar.gz -------------------------------------------------------------------------------- /activities/crawling-the-crawl/mrcc.tar.gz: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexmilowski/data-science/HEAD/activities/crawling-the-crawl/mrcc.tar.gz -------------------------------------------------------------------------------- /activities/common-crawl/tag-count-mr.idraw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexmilowski/data-science/HEAD/activities/common-crawl/tag-count-mr.idraw -------------------------------------------------------------------------------- /activities/decision-trees/Bias_Variance.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexmilowski/data-science/HEAD/activities/decision-trees/Bias_Variance.jpg -------------------------------------------------------------------------------- /activities/decision-trees/decision-tree.idraw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexmilowski/data-science/HEAD/activities/decision-trees/decision-tree.idraw -------------------------------------------------------------------------------- /activities/decision-trees/tree-example.idraw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexmilowski/data-science/HEAD/activities/decision-trees/tree-example.idraw -------------------------------------------------------------------------------- /activities/nosql-databases/rest-api.json: -------------------------------------------------------------------------------- 1 | { "rest-api": { 2 | "name": "RESTstop", 3 | "database": "tweets", 4 | "port": "8888" 5 | } } 6 | -------------------------------------------------------------------------------- /activities/emr-opennex-climate-model/input-sequences.txt: -------------------------------------------------------------------------------- 1 | #lat1,lon1,lat2,lon2,size,startYear,startMonth,endYear,endMonth 2 | 40,-125,35,-120,60,2015,01,2015,03 3 | -------------------------------------------------------------------------------- /activities/intro-to-spark/one-line-json.py: -------------------------------------------------------------------------------- 1 | import json 2 | import sys 3 | 4 | data = json.load(sys.stdin) 5 | 6 | for tweet in data: 7 | print json.dumps(tweet) -------------------------------------------------------------------------------- /activities/web-scraping/urllib2-get.py: -------------------------------------------------------------------------------- 1 | import urllib2 2 | 3 | response = urllib2.urlopen("http://www.ischool.berkeley.edu/") 4 | html = response.read() 5 | print html -------------------------------------------------------------------------------- /activities/emr-map-only/line-count.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import sys 3 | 4 | count = 0 5 | for line in sys.stdin: 6 | count += 1 7 | 8 | print "lines: ", count 9 | -------------------------------------------------------------------------------- /activities/common-crawl/test-1.warc: -------------------------------------------------------------------------------- 1 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00000-ip-10-180-136-8.ec2.internal.warc.gz 2 | -------------------------------------------------------------------------------- 
/activities/sentiment-analysis/annotate.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | for line in sys.stdin: 4 | sys.stdout.write(sys.argv[1]) 5 | sys.stdout.write('\t') 6 | sys.stdout.write(line) -------------------------------------------------------------------------------- /activities/sentiment-analysis/rt-polaritydata/rt-polarity.neg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexmilowski/data-science/HEAD/activities/sentiment-analysis/rt-polaritydata/rt-polarity.neg -------------------------------------------------------------------------------- /activities/sentiment-analysis/rt-polaritydata/rt-polarity.pos: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexmilowski/data-science/HEAD/activities/sentiment-analysis/rt-polaritydata/rt-polarity.pos -------------------------------------------------------------------------------- /activities/nosql-databases/mongo/forkdb.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | MONGO=$HOME/workspace/mongodb-osx-x86_64-2.6.5/ 3 | 4 | $MONGO/bin/mongod --config mongo.conf --pidfilepath `pwd`/mongo.pid 5 | -------------------------------------------------------------------------------- /activities/data-munging/pipeline-input.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import json 3 | 4 | for line in sys.stdin: 5 | print line 6 | f = open(line.strip(),"r") 7 | # process data 8 | f.close() 9 | -------------------------------------------------------------------------------- /activities/web-scraping/soup.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import bs4 3 | import urllib2 4 | 5 | resource = urllib2.urlopen(sys.argv[1]) 6 | html = bs4.BeautifulSoup(resource.read().decode('utf-8')) 7 | print "".join(html.title.strings) -------------------------------------------------------------------------------- /activities/web-scraping/urllib2-headers.py: -------------------------------------------------------------------------------- 1 | import urllib2 2 | 3 | response = urllib2.urlopen("http://www.ischool.berkeley.edu/") 4 | headers = dict(response.info()) 5 | 6 | for name in headers: 7 | print name,": ",headers[name] 8 | -------------------------------------------------------------------------------- /activities/decision-trees/decision_tree.py: -------------------------------------------------------------------------------- 1 | import iris_tree as iris 2 | from sklearn.tree import DecisionTreeClassifier 3 | 4 | 5 | dtree = DecisionTreeClassifier(criterion='gini',max_depth=3,random_state=0) 6 | 7 | iris.tree(dtree) 8 | 9 | -------------------------------------------------------------------------------- /activities/decision-trees/adaboost_classifier.py: -------------------------------------------------------------------------------- 1 | import iris_tree as iris 2 | from sklearn.ensemble import AdaBoostClassifier 3 | 4 | boosting = AdaBoostClassifier(n_estimators=10, learning_rate=1.0,random_state=1) 5 | 6 | iris.tree(boosting) 7 | 8 | -------------------------------------------------------------------------------- /activities/crawling-the-crawl/cc-bootstrap.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | sudo yum install -y python27 
python27-devel python27-pip gcc-c++ 4 | sudo pip-2.7 install boto mrjob warc 5 | sudo pip-2.7 install https://github.com/commoncrawl/gzipstream/archive/master.zip 6 | -------------------------------------------------------------------------------- /activities/decision-trees/random_forest.py: -------------------------------------------------------------------------------- 1 | import iris_tree as iris 2 | from sklearn.ensemble import RandomForestClassifier 3 | 4 | forest = RandomForestClassifier(criterion='gini',n_estimators=10,max_depth=3,random_state=1,n_jobs=2) 5 | 6 | iris.tree(forest) 7 | 8 | -------------------------------------------------------------------------------- /activities/crawling-the-crawl/extract-CC-MAIN-20150124161055-00000-ip-10-180-212-252.ec2.internal.warc.wat.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexmilowski/data-science/HEAD/activities/crawling-the-crawl/extract-CC-MAIN-20150124161055-00000-ip-10-180-212-252.ec2.internal.warc.wat.gz -------------------------------------------------------------------------------- /activities/decision-trees/gradient_classifier.py: -------------------------------------------------------------------------------- 1 | import iris_tree as iris 2 | from sklearn.ensemble import GradientBoostingClassifier 3 | 4 | boosting = GradientBoostingClassifier(n_estimators=10, learning_rate=1.0,max_depth=3,random_state=1) 5 | 6 | iris.tree(boosting) 7 | 8 | -------------------------------------------------------------------------------- /activities/emr-cluster/cluster.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "InstanceGroupType": "MASTER", 4 | "InstanceCount": 1, 5 | "InstanceType": "m1.medium" 6 | }, 7 | { 8 | "InstanceGroupType": "CORE", 9 | "InstanceCount": 2, 10 | "InstanceType": "m1.medium" 11 | } 12 | ] 13 | -------------------------------------------------------------------------------- /activities/data-munging/csv-dump.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import csv 3 | 4 | for line in sys.stdin: 5 | f = open(line.strip(),"r") 6 | # process data 7 | reader = csv.reader(f,delimiter=',',quotechar='"') 8 | for row in reader: 9 | print ','.join(row) 10 | 11 | f.close() 12 | -------------------------------------------------------------------------------- /activities/crawling-the-crawl/cluster.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "InstanceGroupType": "MASTER", 4 | "InstanceCount": 1, 5 | "InstanceType": "m1.medium" 6 | }, 7 | { 8 | "InstanceGroupType": "CORE", 9 | "InstanceCount": 2, 10 | "InstanceType": "m1.medium" 11 | } 12 | ] -------------------------------------------------------------------------------- /activities/intro-to-spark/random-text.py: -------------------------------------------------------------------------------- 1 | import random 2 | import sys 3 | 4 | words = sys.stdin.read().splitlines() 5 | 6 | for i in range(int(sys.argv[1])): 7 | for j in range(int(sys.argv[2])): 8 | sys.stdout.write(random.choice(words)) 9 | sys.stdout.write(" ") 10 | sys.stdout.write("\n") 11 | 12 | 13 | -------------------------------------------------------------------------------- /activities/data-munging/s3list.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from boto.s3.connection import S3Connection 3 | from boto.s3.key import 
Key 4 | 5 | conn = S3Connection() 6 | bucket = conn.get_bucket(sys.argv[1]) 7 | 8 | subset = sys.argv[2] if len(sys.argv)>2 else "" 9 | 10 | for key in bucket.list(prefix=subset): 11 | print key.key 12 | 13 | -------------------------------------------------------------------------------- /activities/intro-to-spark/rdd-map.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import json 3 | from pyspark import SparkContext 4 | 5 | sc = SparkContext(appName="TweetLoader") 6 | tweetData = sc.textFile("2015-02*.txt") 7 | tweets = tweetData.map(lambda line: json.loads(line)) 8 | 9 | output = tweets.collect() 10 | for (tweet) in output: 11 | print tweet 12 | -------------------------------------------------------------------------------- /sessions/session-8.md: -------------------------------------------------------------------------------- 1 | # Session Schedule - Week 8 # 2 | 3 | * [5] Intro to models over data 4 | * [10] [Bag-of-words model](http://en.wikipedia.org/wiki/Bag-of-words_model) 5 | * [10] Feature extraction from movie reviews 6 | * [10] Candy Corn Example 7 | * [25] Project Feedback / Activity 8 | * [20] Training Classifiers 9 | * [10] Discussion -------------------------------------------------------------------------------- /activities/crawling-the-crawl/s3.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import sys 3 | from signal import signal, SIGPIPE, SIG_DFL 4 | 5 | #Ignore SIG_PIPE and don't throw exceptions on it... (http://docs.python.org/library/signal.html) 6 | signal(SIGPIPE,SIG_DFL) 7 | 8 | for line in sys.stdin: 9 | print("s3://aws-publicdatasets/"+line[0:-1]) -------------------------------------------------------------------------------- /activities/nosql-databases/mongo/mongo.conf: -------------------------------------------------------------------------------- 1 | systemLog: 2 | destination: file 3 | path: "log/mongodb.log" 4 | logAppend: true 5 | processManagement: 6 | fork: true 7 | storage: 8 | dbPath: "data" 9 | directoryPerDB: true 10 | journal: 11 | enabled: true 12 | net: 13 | bindIp: 127.0.0.1 14 | port: 27017 15 | 16 | -------------------------------------------------------------------------------- /activities/emr-map-only/generate-input.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import random 3 | import os 4 | 5 | prefix = sys.argv[1] 6 | files = int(sys.argv[2]) 7 | max = int(sys.argv[3]) 8 | 9 | for n in range(files): 10 | f = open(prefix+"-"+str(n+1)+".txt","w") 11 | for i in range(max): 12 | f.write(str(n+1)+" "+str(i+1)+"\n") 13 | f.close(); 14 | 15 | -------------------------------------------------------------------------------- /data-science.xpr: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /activities/emr-tweet-wordcount/format-tweets.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import json 3 | 4 | f = open(sys.argv[1],"r") 5 | data = json.load(f) 6 | f.close() 7 | 8 | for tweet in data["tweets"]: 9 | language = tweet["metadata"]["iso_language_code"].encode('utf-8') 10 | text = tweet["text"].replace("\n"," ") 11 | print "#iso-"+language+" "+text.encode('utf-8') 12 | 
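# Usage (see the emr-tweet-wordcount README in this repository):
#   python format-tweets.py microsoft-2014-10-07.json > tweet-wc/input/tweets.txt
# Each output line has the form "#iso-<language> <tweet text>", one tweet per line,
# which the streaming word-count mapper (tweetSplitter.py) then splits into words.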
-------------------------------------------------------------------------------- /activities/data-munging/s3copy.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from boto.s3.connection import S3Connection 3 | from boto.s3.key import Key 4 | 5 | conn = S3Connection() 6 | bucket = conn.get_bucket(sys.argv[1]) 7 | prefix = sys.argv[2] 8 | for i in range(3,len(sys.argv)): 9 | print sys.argv[i] 10 | k = Key(bucket) 11 | k.key = prefix+"/"+sys.argv[i] 12 | k.set_contents_from_filename(sys.argv[i]) 13 | -------------------------------------------------------------------------------- /activities/emr-prime-multiplier/step.json: -------------------------------------------------------------------------------- 1 | { 2 | "Type" : "STREAMING", 3 | "Name" : "Multiply", 4 | "ActionOnFailure" : "CONTINUE", 5 | "Args" : [ 6 | "-files","s3://mybucket/prime-factors.py", 7 | "-mapper","prime-factors.py", 8 | "-reducer","aggregate", 9 | "-input","s3://mybucket/multiply/input", 10 | "-output","s3://mybucket/multiply/output" 11 | ] 12 | } -------------------------------------------------------------------------------- /activities/data-munging/s3cat.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import re 3 | from boto.s3.connection import S3Connection 4 | from boto.s3.key import Key 5 | 6 | conn = S3Connection() 7 | for uri in sys.argv[1:]: 8 | m = re.match(r"s3://([\w\-]+)/(.*)",uri) 9 | if m: 10 | bucket = conn.get_bucket(m.group(1)) 11 | k = Key(bucket) 12 | k.key = m.group(2) 13 | print k.get_contents_as_string() 14 | -------------------------------------------------------------------------------- /sessions/session-7.md: -------------------------------------------------------------------------------- 1 | # Session Schedule - Week 7 # 2 | 3 | * [5] Setup (if you haven't done so) 4 | * [20] Introduction to NoSQL 5 | * what really is a NoSQL database? 
6 | * market players / gartner report 7 | * two significant players: Mongo and MarkLogic 8 | * [10] Mongo introduction 9 | * [15] Data storage in Mongo activity 10 | * [10] MarkLogic introduction 11 | * [15] Data storage in MarkLogic activity 12 | * [15] Q&A and wrap-up 13 | -------------------------------------------------------------------------------- /activities/intro-to-spark/rdd-flatmap.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import json 3 | from pyspark import SparkContext 4 | 5 | sc = SparkContext(appName="TweetLoader") 6 | tweetData = sc.textFile("2015-02*.txt") 7 | users = tweetData.map(lambda line: json.loads(line)) \ 8 | .flatMap(lambda tweet: [tweet["user"]["screen_name"]] + map(lambda u : u["screen_name"],tweet["entities"]["user_mentions"])).distinct() 9 | 10 | output = users.collect() 11 | for user in output: 12 | print user 13 | -------------------------------------------------------------------------------- /activities/twitter-acquisition/hello-twitter.py: -------------------------------------------------------------------------------- 1 | import tweepy 2 | import json; 3 | 4 | # Don't forget to install tweepy 5 | # pip install tweepy 6 | 7 | consumer_key = "..."; 8 | consumer_secret = "..."; 9 | 10 | access_token = "..."; 11 | access_token_secret = "..."; 12 | 13 | auth = tweepy.OAuthHandler(consumer_key, consumer_secret) 14 | auth.set_access_token(access_token, access_token_secret) 15 | 16 | api = tweepy.API(auth) 17 | 18 | for tweet in api.search(q="minecraft"): 19 | print tweet.text -------------------------------------------------------------------------------- /activities/sentiment-analysis/wordcounts.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import nltk 3 | import sets 4 | import operator 5 | 6 | import featureset 7 | 8 | words = {} 9 | 10 | for line in sys.stdin: 11 | for word in featureset.wordlist(line.decode('utf-8')): 12 | words[word] = words[word] + 1 if word in words else 1 13 | 14 | wordsSorted = sorted(words.items(), key=operator.itemgetter(1),reverse=True) 15 | 16 | for w in wordsSorted: 17 | sys.stdout.write("{0}\t{1}\n".format(w[0].encode('utf-8'),w[1])) 18 | 19 | -------------------------------------------------------------------------------- /activities/emr-prime-multiplier/generate-input.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import random 3 | 4 | # The maximum magnitude of the numbers 5 | max = int(sys.argv[1]) 6 | # The number to generate 7 | count = int(sys.argv[2]) 8 | 9 | def positiveRandom(max): 10 | n = random.random() 11 | while n==0: 12 | n = random.random() 13 | r = int(n*max) 14 | return 1 if r==0 else r 15 | 16 | # Generate a positive random number for the count up to the given maximum 17 | for i in range(count): 18 | print positiveRandom(max) -------------------------------------------------------------------------------- /activities/intro-to-spark/rdd-reduce.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import json 3 | from pyspark import SparkContext 4 | 5 | sc = SparkContext(appName="TweetLoader") 6 | tweetData = sc.textFile("2015-02*.txt") 7 | counts = tweetData.map(lambda line: json.loads(line)) \ 8 | .map(lambda tweet: (tweet["user"]["screen_name"],1)) \ 9 | .reduceByKey(lambda a,b: a + b) 10 | 11 | output = sorted(counts.collect(),lambda a,b: b[1] - a[1]) 12 | for (user,count) in output: 
13 | print "{}: {}".format(user,count) 14 | -------------------------------------------------------------------------------- /activities/crawling-the-crawl/run-step.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This script expects the path list as input for the MRJob 4 | 5 | SCRIPT=$1 6 | BUCKET=$2 7 | FLOWID=$3 8 | 9 | if [ -z $SCRIPT ] || [ -z $BUCKET ] || [ -z $FLOWID ] ; then 10 | echo "Usage: $(basename $0) script.py bucket-name job-flow-id" 11 | exit 1 12 | fi 13 | 14 | shift 3 15 | OUTDIR=s3://$BUCKET/common-crawl/wat/domains/$$ 16 | echo "Output: $OUTDIR" 17 | python $SCRIPT -r emr --python-bin python2.7 --python-archive mrcc.tar.gz --no-output --output-dir $OUTDIR --emr-job-flow-id $FLOWID $* 18 | -------------------------------------------------------------------------------- /activities/crawling-the-crawl/ccex.py: -------------------------------------------------------------------------------- 1 | import re 2 | import json 3 | from urlparse import urlparse 4 | 5 | # 6 | from mrcc import CCJob 7 | 8 | class Example(CCJob): 9 | def process_record(self, record): 10 | # Some header readers aren't for Web resources 11 | if "warc-target-uri" in record.header: 12 | 13 | uri = record.header["warc-target-uri"] 14 | print uri 15 | 16 | # load the payload into a string 17 | payload = record.payload.read() 18 | 19 | yield uri,1 20 | yield "zzzz-count",1 21 | 22 | if __name__ == '__main__': 23 | Example.run() -------------------------------------------------------------------------------- /activities/nosql-databases/mongo/test-insert.js: -------------------------------------------------------------------------------- 1 | var MongoClient = require('mongodb').MongoClient, 2 | ObjectID = require('mongodb').ObjectID; 3 | 4 | // The database connection URI 5 | var url = 'mongodb://localhost:27017/test'; 6 | 7 | // Connect to the database and provide a callback function 8 | MongoClient.connect(url, function(err, db) { 9 | console.log("Connected!"); 10 | var collection = db.collection("conference"); 11 | collection.insert( 12 | [{ test: "A" },{ test: "B" },{ test: "C" },{ test: "A" }], 13 | function() { 14 | console.log("done!") 15 | db.close(); 16 | } 17 | ); 18 | }); 19 | -------------------------------------------------------------------------------- /activities/data-munging/xml-parse.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from xml.etree import ElementTree 3 | 4 | # See: https://docs.python.org/2/library/xml.etree.elementtree.html 5 | 6 | # This will parse the document from a file. If the handle was elsewhere, you can give it an open stream too. 
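# For example, to parse from an already-open stream you could write doc = ElementTree.parse(sys.stdin),
# and ElementTree.fromstring(...) parses XML that is already held in a string.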
7 | doc = ElementTree.parse(sys.argv[1]) 8 | 9 | # iteration is a lot like //report in XPath 10 | for report in doc.getroot().iter('{http://weather.milowski.com/V/APRS/}report'): 11 | # If the attribute isn't available, we'll get a dictionary key exception 12 | # so we check for its existence 13 | if "temperature" in report.attrib: 14 | print report.attrib["temperature"] -------------------------------------------------------------------------------- /sessions/session-9.md: -------------------------------------------------------------------------------- 1 | # Session Schedule - Week 9 # 2 | 3 | * [5] Admin / Project Update Next Week - 3 slides needed - architecture, update on info organization, top issues 4 | * [10] Assignment Q&A 5 | * [15] Introduction to Map/Reduce 6 | * [10] Hadoop / YARN 7 | * [10] [Starting a cluster on EMR](https://github.com/alexmilowski/data-science/tree/master/activities/emr-cluster) 8 | * [15] Input Splitting - [Map Only Example](https://github.com/alexmilowski/data-science/tree/master/activities/emr-map-only) 9 | * [15] Reducing - [Tweet Word Count Example](https://github.com/alexmilowski/data-science/tree/master/activities/emr-tweet-wordcount) 10 | * [10] Wrap-up -------------------------------------------------------------------------------- /activities/intro-to-spark/wordcount.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from pyspark import SparkContext 3 | 4 | sc = SparkContext(appName="PythonWordCount") 5 | 6 | # Load the data from the file (or wildcard) on the command line 7 | lines = sc.textFile(sys.argv[1], 1) 8 | 9 | # count the words: split each line, output a key/value pair (count of 1), reduce by summation 10 | counts = lines.flatMap(lambda x: x.split()) \ 11 | .map(lambda word: (word, 1)) \ 12 | .reduceByKey(lambda a,b : a + b) 13 | 14 | # output the results (unsorted) 15 | output = counts.collect() 16 | for (word, count) in output: 17 | print "{}: {}".format(word.encode('utf-8'), count) 18 | -------------------------------------------------------------------------------- /activities/emr-opennex-climate-model/input-example.py: -------------------------------------------------------------------------------- 1 | from mrjob.job import MRJob 2 | import sys 3 | import os 4 | import json 5 | import math 6 | 7 | # A simple example of processing JSON input to compute an average (w/o counts) 8 | class InputExample(MRJob): 9 | 10 | # Yields an average for an input line (same key) 11 | def mapper(self, _, line): 12 | obj = json.loads(line) 13 | yield "average",sum(obj["data"])/len(obj["data"]) 14 | 15 | # Computes the average over all the values 16 | def reducer(self, key, values): 17 | data = list(values) 18 | yield key, sum(data) / len(data) 19 | 20 | 21 | if __name__ == '__main__': 22 | InputExample.run() 23 | -------------------------------------------------------------------------------- /sessions/session-12.md: -------------------------------------------------------------------------------- 1 | # Session Schedule - Week 12 # 2 | 3 | ## Spark ## 4 | 5 | * [5-10] Udpate / Q & A 6 | * [20] [Introduction to Spark](https://docs.google.com/presentation/d/1vgDuqCsbugrsw2W99ak70HVu9TFsiybDP8vAo3E6758/edit?usp=sharing) 7 | * [15] Activity - [Run some examples](https://github.com/alexmilowski/data-science/tree/master/activities/intro-to-spark#activity---run-some-example) 8 | * [40] Activity - [Problem Solving with 
Spark](https://github.com/alexmilowski/data-science/tree/master/activities/intro-to-spark#activity---problem-solving) 9 | * Extra - [Spark on EC2 / EMR](https://github.com/alexmilowski/data-science/tree/master/activities/intro-to-spark#activity---deploying-to-clusters) 10 | -------------------------------------------------------------------------------- /activities/common-crawl/test-5.warc: -------------------------------------------------------------------------------- 1 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00000-ip-10-180-136-8.ec2.internal.warc.gz 2 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00001-ip-10-180-136-8.ec2.internal.warc.gz 3 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00002-ip-10-180-136-8.ec2.internal.warc.gz 4 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00003-ip-10-180-136-8.ec2.internal.warc.gz 5 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00004-ip-10-180-136-8.ec2.internal.warc.gz 6 | -------------------------------------------------------------------------------- /activities/twitter-acquisition/partitions.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import datetime 3 | 4 | xsdDatetimeFormat = "%Y-%m-%dT%H:%M:%S" 5 | xsdDateFormat = "%Y-%m-%d" 6 | 7 | def datetime_partition(start,end,duration): 8 | current = start 9 | while start==current or (end-current).days > 0 or ((end-current).days==0 and (end-current).seconds>0): 10 | yield current 11 | current = current + duration 12 | 13 | def date_partition(start,end): 14 | return datetime_partition(start,end,datetime.timedelta(days=1)) 15 | 16 | if __name__ == "__main__": 17 | start = datetime.datetime.strptime(sys.argv[1],xsdDateFormat) # start date 18 | end = datetime.datetime.strptime(sys.argv[2],xsdDateFormat) # end date 19 | 20 | for d in date_partition(start,end): 21 | print d -------------------------------------------------------------------------------- /activities/intro-to-spark/tweet-wordcount.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import json 3 | from pyspark import SparkContext 4 | 5 | sc = SparkContext(appName="TweetWordCount") 6 | 7 | # Load the JSON data from the file (or wildcard) on the command line 8 | lines = sc.textFile(sys.argv[1], 1) 9 | 10 | # count the words: load each line into a JSON object, split each text property, output a key/value pair (count of 1), reduce by summation 11 | counts = lines.map(lambda line: json.loads(line)) \ 12 | .flatMap(lambda tweet: tweet["text"].split()) \ 13 | .map(lambda word: (word, 1)) \ 14 | .reduceByKey(lambda a,b : a + b) 15 | 16 | # output the results (unsorted) 17 | output = counts.collect() 18 | for (word, count) in output: 19 | print "{0}: {1}".format(word.encode("utf-8"), count) 20 | -------------------------------------------------------------------------------- /sessions/session-10.md: -------------------------------------------------------------------------------- 1 | # Session Schedule - Week 10 # 2 | 3 | ## More about AWS, EMR, and working with Hadoop ## 4 | 5 | * [10] Admin / Q & A 6 | * [10] [Configuring a cluster with JSON + tiny bit on bootstrapping scripts](https://github.com/alexmilowski/data-science/tree/master/activities/emr-cluster) 7 | * [20] Project Status Sharing 
8 | * [10] Making an AMI 9 | * [15] [Prime Multiplier Example](https://github.com/alexmilowski/data-science/tree/master/activities/emr-prime-multiplier) - scaling simple computations 10 | * [10] [Introduction to mrjob](https://docs.google.com/a/milowski.com/presentation/d/1ZUCg4oPnHYbRXNOMLE6NmUl1af8X0Q1DUVMT-iFEjMw/edit?usp=sharing) 11 | * [20] [OpenNEX climate data](https://github.com/alexmilowski/data-science/tree/master/activities/emr-opennex-climate-model) - mrjob from scratch 12 | * [5] Wrap-up 13 | -------------------------------------------------------------------------------- /activities/crawling-the-crawl/start.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | KEYNAME=$1 3 | BUCKET=$2 4 | if [ -z $KEYNAME ] || [ -z $BUCKET ]; then 5 | echo "Usage: $(basename $0) key-name bucket-name" 6 | exit 1 7 | fi 8 | tmpname="/tmp/$(basename $0).bootstrap.$$.json" 9 | echo "[{\"Path\" : \"s3://$BUCKET/cc-bootstrap.sh\", \"Name\" : \"Common Crawl Bootstrap\", \"Args\" : [] }, { \"Path\":\"s3://elasticmapreduce/bootstrap-actions/configure-hadoop\",\"Args\":[\"-m\",\"mapred.map.max.attempts=1\"]} ]" > $tmpname 10 | 11 | AMI_VERSION=3.6.0 12 | CLUSTER=file://./cluster.json 13 | LOG_PATH=logs/ 14 | TAG=emr 15 | 16 | aws emr create-cluster --ami-version $AMI_VERSION --ec2-attributes KeyName=$KEYNAME --instance-groups $CLUSTER --name "Crawl The Crawl Cluster" --log-uri s3://$BUCKET/$LOG_PATH --enable-debugging --tags Name=$TAG --bootstrap-actions file://$tmpname --applications "[]" 17 | rm -f $tmpname -------------------------------------------------------------------------------- /activities/twitter-acquisition/search.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import tweepy 3 | import datetime 4 | import urllib 5 | import signal 6 | import json 7 | 8 | # Don't forget to install tweepy 9 | # pip install tweepy 10 | 11 | consumer_key = "" 12 | consumer_secret = "" 13 | 14 | access_token = "" 15 | access_token_secret = "" 16 | 17 | auth = tweepy.OAuthHandler(consumer_key, consumer_secret) 18 | auth.set_access_token(access_token, access_token_secret) 19 | 20 | api = tweepy.API(auth_handler=auth,wait_on_rate_limit=True,wait_on_rate_limit_notify=True) 21 | 22 | q = urllib.quote_plus(sys.argv[1]) # URL encoded query 23 | 24 | # Additional query parameters: 25 | # since: {date} 26 | # until: {date} 27 | # Just add them to the 'q' variable: q+" since: 2014-01-01 until: 2014-01-02" 28 | for tweet in tweepy.Cursor(api.search,q=q).items(200): 29 | # FYI: JSON is in tweet._json 30 | print tweet._json 31 | -------------------------------------------------------------------------------- /activities/emr-tweet-wordcount/tweetSplitter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import sys 3 | import re 4 | 5 | def main(argv): 6 | pattern = re.compile("[a-zA-Z][a-zA-Z0-9]*") 7 | for line in sys.stdin: 8 | line = line.replace("..."," ") 9 | line.replace("("," ") 10 | line.replace(")"," ") 11 | for word in line.split(): 12 | if len(word)<3 or word[0:5] == "http:" or word[0:6] == "https:" or word == "-": 13 | continue 14 | if word[0] == "." or word[0] == "\"" or word[0] == "(": 15 | word = word[1:] 16 | if word[-1] == "." or word[-1] == "," or word[-1] == "!" 
or word[-1] == ":" or word[-1] == "\"" or word[-1] == ")": 17 | word = word[0:-1] 18 | if len(word)<3: 19 | continue 20 | print "LongValueSum:" + word.lower() + "\t" + "1" 21 | 22 | 23 | if __name__ == "__main__": 24 | main(sys.argv) 25 | -------------------------------------------------------------------------------- /activities/sentiment-analysis/train.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import nltk 3 | import sets 4 | import pickle 5 | 6 | # Local 7 | import featureset 8 | 9 | wordlistFilename = sys.argv[1] 10 | rangeSpec = sys.argv[2].split(",") 11 | wordStart = int(rangeSpec[0]) 12 | wordEnd = int(rangeSpec[1]) 13 | outputFilename = sys.argv[3] 14 | 15 | featureWords = featureset.load(wordlistFilename,wordStart,wordEnd) 16 | print featureWords 17 | 18 | sys.stderr.write("Loading training data..."); 19 | 20 | texts = [] 21 | 22 | for line in sys.stdin: 23 | parts = line.decode('utf-8').split("\n")[0].split("\t") 24 | wordlist = list(featureset.wordlist(parts[1])) 25 | texts.append((wordlist,parts[0])) 26 | 27 | extractFeatures = featureset.makeExtractor(featureWords) 28 | 29 | sys.stderr.write(" applying features ..."); 30 | trainingSet = nltk.classify.apply_features(extractFeatures, texts) 31 | 32 | sys.stderr.write(" training classifier ..."); 33 | classifier = nltk.NaiveBayesClassifier.train(trainingSet) 34 | sys.stderr.write(" done\n"); 35 | 36 | f = open(outputFilename, 'wb') 37 | pickle.dump(classifier, f) 38 | f.close() 39 | -------------------------------------------------------------------------------- /activities/sentiment-analysis/test.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import pickle 3 | import sets 4 | import nltk 5 | 6 | # Local 7 | import featureset 8 | 9 | classifierFilename = sys.argv[1] 10 | wordlistFilename = sys.argv[2] 11 | rangeSpec = sys.argv[3].split(",") 12 | wordStart = int(rangeSpec[0]) 13 | wordEnd = int(rangeSpec[1]) 14 | 15 | f = open(classifierFilename,"rb") 16 | classifier = pickle.load(f) 17 | f.close() 18 | 19 | featureWords = featureset.load(wordlistFilename,wordStart,wordEnd) 20 | 21 | reviews = [] 22 | 23 | extractFeatures = featureset.makeExtractor(featureWords) 24 | 25 | count = 0 26 | missed = 0 27 | variance = 0; 28 | for line in sys.stdin: 29 | parts = line.decode('utf-8').split("\n")[0].split("\t") 30 | wordlist = list(featureset.wordlist(parts[1])) 31 | c = classifier.classify(extractFeatures(wordlist)) 32 | a = parts[0] 33 | count += 1 34 | if c != a: 35 | missed += 1 36 | print str(count)+"\t"+a+"\t"+c+"\t"+(",".join(reduce(lambda l,w: l+[w] if w in featureWords else l,wordlist,[]))) 37 | 38 | if count>0: 39 | print "{0} % correct, {1}/{2} ".format(100* ((count-missed)*1.0 / count), (count-missed),count) -------------------------------------------------------------------------------- /activities/emr-opennex-climate-model/seqs.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import math 3 | 4 | # This library calculates sequence numbers for quadrangles of a given size in degrees. 
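# For example, with size=5, lat=49.5, lon=-126.0, the computation in sequenceNumber() below gives
#   nlat = 90 - 49.5 = 40.5, nlon = 360 + (-126.0) = 234.0, lonMax = int(360/5) = 72,
#   s = floor(40.5/5)*72 + int(234.0/5) + 1 = 8*72 + 46 + 1 = 623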
5 | 6 | # Example: 5° quadrangle sequence number 7 | # s = sequenceNumber(5,49.5,-126.0) 8 | # 9 | def sequenceNumber(size,lat,lon): 10 | latMax = int(180 / size) 11 | lonMax = int(360 / size) 12 | nlat = 90 - lat 13 | nlon = 360 + lon if lon < 0 else lon 14 | s = int(math.floor(nlat/size)) * int(lonMax) + int(nlon / size) + 1 15 | return s 16 | 17 | # Example: 5° quandrangle sequence numbers for a large rectangular region 18 | # quad = sequencesFromQuadrangle(5,[40,-125,35,-120]) 19 | # 20 | def sequencesFromQuadrangle(size,quad): 21 | quadBounds = [ sequenceNumber(size,quad[0],quad[1]), sequenceNumber(size,quad[0],quad[3]), 22 | sequenceNumber(size,quad[2],quad[3])] 23 | width = int(quadBounds[1] - quadBounds[0] + 1) 24 | lonMax = int(360 / size) 25 | 26 | s = quadBounds[0] 27 | while s= 3 and not e.lower() in stopWords]: 10 | if word == "n't": 11 | word = "not" 12 | if word == "'re": 13 | word = "are" 14 | if word == "'ve": 15 | word = "have" 16 | if word == "'ll": 17 | word = "will" 18 | word = lemmatizer.lemmatize(word) 19 | yield word 20 | 21 | 22 | def load(filename,start,end): 23 | featureWords = sets.Set() 24 | input = open(filename,"r") 25 | count = 0 26 | for line in input: 27 | count += 1 28 | if count < start: 29 | continue 30 | if end>start and count > end: 31 | break 32 | parts = line.decode('utf-8').split("\n")[0].split("\t") 33 | featureWords.add(parts[0]) 34 | input.close() 35 | return featureWords 36 | 37 | def makeExtractor(featureWords): 38 | def extractFeatures(document): 39 | words = set(document) 40 | features = {} 41 | for word in featureWords: 42 | features['contains(%s)' % word] = (word in words) 43 | return features 44 | return extractFeatures 45 | -------------------------------------------------------------------------------- /sessions/session-6.md: -------------------------------------------------------------------------------- 1 | # Session Schedule - Week 6 # 2 | 3 | * [5] intro 4 | * [10] project wrangling (joint) 5 | * [20] project group discussion (breakout) 6 | 7 | 1. define the goals project in more detail 8 | 2. details of data acquisition 9 | 3. proposed analytics and tools 10 | 4. prepare a short slide deck of issues/concerns to share when you return 11 | 12 | * [10 - 20] project sharing / planning (joint) 13 | 14 | * proposal due the following week (week 7) 15 | * include details from the breakout 16 | * 2 pages: problem, data sources, proposed analytics, may include technical architecture 17 | 18 | * [5] ER diagrams in Gliffy 19 | * [10] db ER activity (breakout) 20 | 21 | * How would you store tweet data in a relational database? 22 | * Consider storing hash tags, text tweet, language, handle, user information 23 | * Draw an ER diagram of your proposed model 24 | 25 | * [10] ER share / discussion (joint) 26 | * [10] sqlite activity (breakout) 27 | 28 | * Create a set of table definitions from your ER model for the tweet data. 29 | * Create a database and load your sample data. 30 | * Execute some sample queries: 31 | 1. all tweets for a particular user 32 | 2. 
all users who used a particular hashtag 33 | * Create a histogram of hashtags used by querying the database 34 | 35 | * [10] class planning / wrap-up (joint) 36 | 37 | -------------------------------------------------------------------------------- /activities/common-crawl/test-10.warc: -------------------------------------------------------------------------------- 1 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00000-ip-10-180-136-8.ec2.internal.warc.gz 2 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00001-ip-10-180-136-8.ec2.internal.warc.gz 3 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00002-ip-10-180-136-8.ec2.internal.warc.gz 4 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00003-ip-10-180-136-8.ec2.internal.warc.gz 5 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00004-ip-10-180-136-8.ec2.internal.warc.gz 6 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00005-ip-10-180-136-8.ec2.internal.warc.gz 7 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00006-ip-10-180-136-8.ec2.internal.warc.gz 8 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00007-ip-10-180-136-8.ec2.internal.warc.gz 9 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00008-ip-10-180-136-8.ec2.internal.warc.gz 10 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00009-ip-10-180-136-8.ec2.internal.warc.gz 11 | -------------------------------------------------------------------------------- /assignments/tweet-acquisition/README.md: -------------------------------------------------------------------------------- 1 | # Acquiring and Storing Social Media Data # 2 | 3 | ## A Hypothetical Scenario ## 4 | 5 | Minecraft is a popular game throughout the world that was 6 | [acquired last year (2014) by Microsoft](https://mojang.com/2014/09/yes-were-being-bought-by-microsoft/). We'd like to 7 | assess the current sentiment of the acquisition by examining social media data. Twitter is an obvious and easy choice 8 | as a place to start. 9 | 10 | 11 | ## Acquisition Task ## 12 | 13 | Acquire relevant data around the Microsoft / Mojang acquisition for a recent week. To accomplish this, do the following: 14 | 15 | 1. Write an acquisition program that can acquire tweets for a specific date using the Tweepy Python package. The program should pull tweets 16 | for the #microsoft and #mojang hash tags simultaneously. 17 | 18 | 2. Run your data acquisition over a one-week period. You should chunk your data as appropriate and give yourself the ability to re-run the process reliably in case of failures. 19 | 20 | 3. Organize the resulting raw data into a set of tweets and store these tweets in S3. 21 | 22 | 4. Analyze the tweets by producing a histogram (a graph) of the words; a minimal starting-point sketch is included at the end of this file. 23 | 24 | 25 | ## What to Turn In ## 26 | 27 | 1. A link to your S3 bucket documented in your README.md file. Make sure to make it publicly accessible. 28 | 29 | 2. Your Twitter acquisition code. 30 | 31 | 3. The histogram. 
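## Histogram Starting Point ##

As a starting point for the histogram in task 4 above, the sketch below counts words across the stored tweets and plots the most frequent ones. It assumes the tweets were saved locally, one JSON object per line, in files matching `tweets-*.txt` (adjust the glob pattern and JSON field to match your own layout), and that `matplotlib` is installed (`pip install matplotlib`).

    import glob
    import json
    import collections
    import matplotlib.pyplot as plt

    # Count words across the stored tweets (one JSON tweet per line is assumed)
    counts = collections.Counter()
    for filename in glob.glob("tweets-*.txt"):
        f = open(filename, "r")
        for line in f:
            tweet = json.loads(line)
            for word in tweet["text"].lower().split():
                counts[word] += 1
        f.close()

    # Plot the 30 most frequent words as a bar chart and save the graph
    top = counts.most_common(30)
    plt.bar(range(len(top)), [count for (word, count) in top])
    plt.xticks(range(len(top)), [word for (word, count) in top], rotation=90)
    plt.tight_layout()
    plt.savefig("histogram.png")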
32 | 33 | 34 | -------------------------------------------------------------------------------- /activities/decision-trees/regions.py: -------------------------------------------------------------------------------- 1 | from matplotlib.colors import ListedColormap 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | 5 | def plot(X, y, classifier,test_idx=None, resolution=0.02): 6 | 7 | # setup marker generator and color map 8 | markers = ('s', 'x', 'o', '^', 'v') 9 | colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan') 10 | cmap = ListedColormap(colors[:len(np.unique(y))]) 11 | 12 | # plot the decision surface 13 | x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1 14 | x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1 15 | xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution), 16 | np.arange(x2_min, x2_max, resolution)) 17 | Z = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T) 18 | Z = Z.reshape(xx1.shape) 19 | plt.contourf(xx1, xx2, Z, alpha=0.4, cmap=cmap) 20 | plt.xlim(xx1.min(), xx1.max()) 21 | plt.ylim(xx2.min(), xx2.max()) 22 | 23 | # plot all samples 24 | for idx, cl in enumerate(np.unique(y)): 25 | plt.scatter(x=X[y == cl, 0], y=X[y == cl, 1], 26 | alpha=0.8, c=cmap(idx), 27 | marker=markers[idx], label=cl) 28 | 29 | # highlight test samples 30 | if test_idx: 31 | X_test, y_test = X[test_idx, :], y[test_idx] 32 | plt.scatter(X_test[:, 0], X_test[:, 1], c='', 33 | alpha=1.0, linewidths=1, marker='o', 34 | s=55, label='test set') 35 | -------------------------------------------------------------------------------- /activities/emr-prime-multiplier/prime-factors.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import sys 3 | 4 | # Appends a single next prime 5 | def appendPrime(primes): 6 | p = primes[-1] 7 | prime = False 8 | while not prime: 9 | p += 1 10 | divisor = False 11 | for i in range(0,len(primes)): 12 | if p % primes[i] == 0: 13 | divisor = True 14 | break 15 | if not divisor: 16 | prime = True 17 | primes.append(p) 18 | return primes 19 | 20 | 21 | # an initial set of primes 22 | primes = [2, 3, 5] 23 | # an initial array of zeros of the same length 24 | counts = [0 for i in range(len(primes))] 25 | 26 | # for each line of input, factor the input 27 | for line in sys.stdin: 28 | 29 | # Parse the integer and skip zeros 30 | i = int(line) 31 | if (i==0): 32 | continue 33 | 34 | # Factor until we reach 1 35 | p = 0; 36 | while i!=1: 37 | #print i,p,i % primes[p],counts 38 | 39 | # compute exponent for current prime 40 | while i!=1 and i % primes[p] == 0: 41 | i = i / primes[p] 42 | counts[p] += 1 43 | 44 | # increment prime 45 | p += 1 46 | 47 | # if we aren't at zero but have run out of primes, find the next prime to factor 48 | if i!=1 and p==len(primes): 49 | appendPrime(primes) 50 | counts.append(0) 51 | 52 | # Output the counts for each prime 53 | for i in range(len(primes)): 54 | if counts[i]>0: 55 | print "LongValueSum:",primes[i],"\t",counts[i] -------------------------------------------------------------------------------- /activities/emr-opennex-climate-model/average.py: -------------------------------------------------------------------------------- 1 | from mrjob.job import MRJob 2 | import sys 3 | import os 4 | import json 5 | import math 6 | 7 | # Computes the average correctly for a given input dataset 8 | class Average(MRJob): 9 | 10 | # Loads the JSON object and yields yearMonth -> (length,average) 11 | def average_partition(self, _, line): 12 | obj = 
json.loads(line) 13 | #print obj["yearMonth"],(len(obj["data"]),sum(obj["data"])/len(obj["data"])) 14 | yield obj["yearMonth"],(len(obj["data"]),sum(obj["data"])/len(obj["data"])) 15 | 16 | # Combines sequence number averages for particular year+month 17 | def average_month(self, yearMonth, countAverage): 18 | sum = 0 19 | total = 0 20 | for count,value in countAverage: 21 | sum += count*value 22 | total += count 23 | #print yearMonth,(total,sum/total) 24 | yield "month",(total,sum/total) 25 | 26 | # Computes the average over the year/month data keeping track of counts 27 | def average(self,_,averageData): 28 | sum = 0 29 | total = 0 30 | for count,average in averageData: 31 | sum += count*average 32 | total += count 33 | #print "average",sum/total 34 | yield "average",sum/total 35 | 36 | # Define a 1-step job with a mapper, combiner, and reducer 37 | def steps(self): 38 | return [ 39 | self.mr(mapper=self.average_partition, 40 | combiner=self.average_month, 41 | reducer=self.average) 42 | ] 43 | 44 | 45 | if __name__ == '__main__': 46 | Average.run() 47 | -------------------------------------------------------------------------------- /activities/common-crawl/mrcc.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | # 3 | import boto 4 | import warc 5 | # 6 | from boto.s3.key import Key 7 | from gzipstream import GzipStreamFile 8 | from mrjob.job import MRJob 9 | 10 | 11 | class CCJob(MRJob): 12 | def configure_options(self): 13 | super(CCJob, self).configure_options() 14 | self.add_passthrough_option('--source',help="Source location of the common crawl data (s3 or file)") 15 | 16 | def process_record(self, record): 17 | """ 18 | Override process_record with your mapper 19 | """ 20 | raise NotImplementedError('Process record needs to be customized') 21 | 22 | def mapper(self, _, line): 23 | f = None 24 | ## If we're on EC2 or running on a Hadoop cluster, pull files via S3 25 | if self.options.source in ['s3' ]: 26 | print 'Downloading ...' 
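# Each input 'line' is a WARC file path (see the test-*.warc lists in this directory),
# resolved below as a key in the public 'aws-publicdatasets' bucket.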
27 | # Connect to Amazon S3 using anonymous credentials 28 | conn = boto.connect_s3(anon=True) 29 | pds = conn.get_bucket('aws-publicdatasets') 30 | # Start a connection to one of the WARC files 31 | k = Key(pds, line) 32 | f = warc.WARCFile(fileobj=GzipStreamFile(k)) 33 | ## If we're local, use files on the local file system 34 | else: 35 | print 'Loading local file {}'.format(line) 36 | f = warc.WARCFile(fileobj=gzip.open(line)) 37 | ### 38 | for i, record in enumerate(f): 39 | for key, value in self.process_record(record): 40 | yield key, value 41 | self.increment_counter('commoncrawl', 'processed_records', 1) 42 | 43 | # TODO: Make the combiner use the reducer by default 44 | def combiner(self, key, value): 45 | yield key, sum(value) 46 | 47 | def reducer(self, key, value): 48 | yield key, sum(value) 49 | -------------------------------------------------------------------------------- /activities/emr-opennex-climate-model/date_partitions.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import datetime 3 | 4 | # A date partition library 5 | 6 | xsdDatetimeFormat = "%Y-%m-%dT%H:%M:%S" 7 | xsdDateFormat = "%Y-%m-%d" 8 | 9 | # A generator for date/times based on durations 10 | # 11 | def datetime_partition(start,end,duration): 12 | current = start 13 | while start==current or (end-current).days > 0 or ((end-current).days==0 and (end-current).seconds>0): 14 | yield current 15 | current = current + duration 16 | 17 | # A generator for months given a start and end month. 18 | # 19 | # Example: Generates the months from 2015-03 to 2016-03 20 | # 21 | # months = month_partition(datetime.datetime(2015,3,1),datetime.datetime(2016,3,1)) 22 | # 23 | def month_partition(start,end): 24 | current = datetime.datetime(start.year,start.month,1) 25 | while current.year= 3 and not e.lower() in stopWords] 40 | input.append((words, sentiment)) 41 | 42 | print input 43 | 44 | 45 | # Get an ordered list of most frequently used words 46 | def getAllWords(input): 47 | all = [] 48 | for (words, sentiment) in input: 49 | all.extend(words) 50 | return all 51 | 52 | print 53 | 54 | wordlist = nltk.FreqDist(getAllWords(input)) 55 | print wordlist.pprint(100) 56 | wordFeatures = wordlist.keys() 57 | 58 | def extractFeatures(document): 59 | words = set(document) 60 | features = {} 61 | for word in wordFeatures: 62 | features['contains(%s)' % word] = (word in words) 63 | return features 64 | 65 | trainingSet = nltk.classify.apply_features(extractFeatures, input) 66 | 67 | classifier = nltk.NaiveBayesClassifier.train(trainingSet) 68 | 69 | print 70 | for sentence in data: 71 | print classifier.classify(extractFeatures(sentence.split())),": ",sentence 72 | 73 | 74 | 75 | 76 | 77 | 78 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Introduction to Data Science 2 | 3 | Data Science is a process of acquiring, organizing, analyzing, and representing information. The activities in the process focus primarily on learning various technologies in support of each of the major stages of any data science process. 4 | 5 | ![Data Science Process](data-science.png) 6 | 7 | ## Activities 8 | 9 | Guided activities provide you examples of various technologies and how they can be used for particular tasks. Each activity should be able to be accomplished as a self-study. 
10 | 11 | See [Activities](./activities/) section for a list of various self-guided studies. 12 | 13 | ## Sessions 14 | 15 | An agenda for each [session](./sessions/) is available. 16 | 17 | ## Assignments 18 | 19 | If you are a student in one of my classes, you'll be using github to turn in assignments. 20 | 21 | ### Setup 22 | 23 | You'll need to create a repository for your assignments. You can [request a discount](https://education.github.com/discount_requests/new) as a 24 | student so that you can have a private repository. Otherwise, all your assignments will 25 | be publically accessible. 26 | 27 | Once you've setup your repository, you'll want to add your instructor as a collaborator. That way they can merge your 28 | pull requests when they are graded. 29 | 30 | ### Turning in Assignments 31 | 32 | 1. [Create a branch](https://help.github.com/articles/creating-and-deleting-branches-within-your-repository/) of your repository for the homework and make your changes in that branch. 33 | 2. Commit and push your changes to the branch. 34 | 3. [Create a pull request](https://help.github.com/articles/creating-a-pull-request/) for the code you'd like to turn in. 35 | 4. Your instructor can now view the pull request and grade the assignment. 36 | 5. Once your instructor has graded the assignment, they can merge the pull request as a final notification. 37 | 6. You can now delete the branch as the changes have been merged with the master. 38 | -------------------------------------------------------------------------------- /assignments/getting-started/README.md: -------------------------------------------------------------------------------- 1 | # Getting Started # 2 | 3 | This assignment will step you through the process of running a simple computation over a data set using Map/Reduce via mrjob. The goal 4 | of the assignment is to have you walk through the process of using git, github, python, mrjob, and AWS and ensure you are setup with 5 | all the various tools and services. 6 | 7 | ## Recommended Readings ## 8 | 9 | * [Getting started with Amazon AWS video tutorials](http://aws.amazon.com/getting-started/) 10 | * [Introduction to AWS training](https://www.youtube.com/playlist?list=PLhr1KZpdzukcMmx04RbtWuQ0yYOp1vQi4) 11 | * [A Comparison of Clouds: Amazon Web Services, Windows Azure, Google Cloud Platform, VMWare and Others](http://pages.cs.wisc.edu/~akella/CS838/F12/notes/Cloud_Providers_Comparison.pdf) 12 | * [A Survey on Cloud Provider Security Measures](http://www.cs.ucsb.edu/~koc/ns/projects/12Reports/PucherDimopoulos.pdf) 13 | 14 | ## Tasks ## 15 | 16 | ### Part 1 ### 17 | 18 | Note: Keep track of the time necessary to run the process locally. For Linux/Mac users, you can use the `time` command to compute this. 19 | 20 | 1. Follow the instructions at https://github.com/alexmilowski/data-science/tree/master/activities/common-crawl to get setup with the tools and code. 21 | 2. Run the process locally on your computer. 22 | 23 | ### Part 2 ### 24 | 25 | 1. Follow the process for running the tag counter on AWS EMR. 26 | 2. Download the output from S3. 27 | 28 | ## What to Turn In ## 29 | 30 | You must turn in a pull request containing the following: 31 | 32 | 1. A copy of the output directory for the tag counter running locally (name the directory 'out'). 33 | 2. A copy of the output from S3 for the tag counter running on AWS (name the directory 'emr-out'). 34 | 3. How long did it take to run the process for each of these? 35 | 4. How many `address` tags are there in the input? 
36 | 5. Does the local version and EMR version give the same answer? 37 | 38 | Please submit the answers to 3-5 in a text file called `answers.txt` 39 | 40 | -------------------------------------------------------------------------------- /activities/common-crawl/test-15.warc: -------------------------------------------------------------------------------- 1 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00000-ip-10-180-136-8.ec2.internal.warc.gz 2 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00001-ip-10-180-136-8.ec2.internal.warc.gz 3 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00002-ip-10-180-136-8.ec2.internal.warc.gz 4 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00003-ip-10-180-136-8.ec2.internal.warc.gz 5 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00004-ip-10-180-136-8.ec2.internal.warc.gz 6 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00005-ip-10-180-136-8.ec2.internal.warc.gz 7 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00006-ip-10-180-136-8.ec2.internal.warc.gz 8 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00007-ip-10-180-136-8.ec2.internal.warc.gz 9 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00008-ip-10-180-136-8.ec2.internal.warc.gz 10 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00009-ip-10-180-136-8.ec2.internal.warc.gz 11 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00010-ip-10-180-136-8.ec2.internal.warc.gz 12 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00011-ip-10-180-136-8.ec2.internal.warc.gz 13 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00012-ip-10-180-136-8.ec2.internal.warc.gz 14 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00013-ip-10-180-136-8.ec2.internal.warc.gz 15 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00014-ip-10-180-136-8.ec2.internal.warc.gz 16 | -------------------------------------------------------------------------------- /activities/crawling-the-crawl/mrcc.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import gzip 4 | import sys 5 | # 6 | import boto 7 | import warc 8 | # 9 | from boto.s3.key import Key 10 | from gzipstream import GzipStreamFile 11 | from mrjob.job import MRJob 12 | 13 | 14 | class CCJob(MRJob): 15 | 16 | def process_record(self, record): 17 | """ 18 | Override process_record with your mapper 19 | """ 20 | raise NotImplementedError('Process record needs to be customized') 21 | 22 | def mapper(self, _, line): 23 | f = None 24 | ## If we're on EC2 or running on a Hadoop cluster, pull files via S3 25 | if line.startswith("s3://"): 26 | 27 | print('Downloading ...',file=sys.stderr) 28 | key = None 29 | 30 | # Connect to Amazon S3 using anonymous credentials 31 | conn = boto.connect_s3(anon=True) 32 | if line.startswith("s3://"): 33 | pathStart = line.index('/',5) 34 | bucketName = 
line[5:pathStart] 35 | keyPath = line[pathStart+1:] 36 | print("Bucket: "+bucketName,file=sys.stderr) 37 | print("Key: "+keyPath,file=sys.stderr) 38 | bucket = conn.get_bucket(bucketName) 39 | key = Key(bucket,keyPath) 40 | else: 41 | print("Bucket: aws-publicdatasets",file=sys.stderr) 42 | print("Key: "+line,file=sys.stderr) 43 | bucket = conn.get_bucket("aws-publicdatasets") 44 | key = Key(bucket,line) 45 | # Start a connection to one of the WARC files 46 | f = warc.WARCFile(fileobj=GzipStreamFile(key)) 47 | 48 | ## If we're local, use files on the local file system 49 | else: 50 | if line.startswith("file:///"): 51 | line = line[7:] 52 | print("Local: {}".format(line),file=sys.stderr) 53 | f = warc.WARCFile(fileobj=gzip.open(line)) 54 | ### 55 | for i, record in enumerate(f): 56 | for key, value in self.process_record(record): 57 | yield key, value 58 | self.increment_counter('commoncrawl', 'processed_records', 1) 59 | 60 | def reducer(self, key, value): 61 | yield key, sum(value) 62 | -------------------------------------------------------------------------------- /activities/emr-tweet-wordcount/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Word Counts for Tweets # 3 | 4 | This example shows running the word count example over tweets. 5 | 6 | # Setup # 7 | 8 | If you don't have a cluster running, you'll need to start one (see main setup page). You also need a bucket for the code, input, and output. 9 | 10 | # Running the Example # 11 | 12 | In this example, we'll use a sample set of 1995 tweets with the word 'Microsoft' in them. 13 | 14 | ## Step 1 ## 15 | 16 | The tweets are stored as JSON and We'll need to extract the tweet text and create an input with one line per tweet. The `format-tweets.py` program 17 | does this: 18 | 19 | mkdir -p tweet-wc/input 20 | python format-tweets.py microsoft-2014-10-07.json > tweet-wc/input/tweets.txt 21 | 22 | Now we need to store the input: 23 | 24 | aws s3 sync tweet-wc s3://mybucket/tweet-wc/ 25 | 26 | ## Step 2 ## 27 | 28 | We need to store the word count program: 29 | 30 | aws s3 cp tweetSplitter.py s3://mybucket/ 31 | 32 | ## Step 3 ## 33 | 34 | Now we add the streaming step to do the work: 35 | 36 | aws emr add-steps --cluster-id --steps Type=STREAMING,Name='Tweet Word Count',ActionOnFailure=CONTINUE,Args=--files,s3://mybucket/tweetSplitter.py,-mapper,tweetSplitter.py,-reducer,aggregate,-input,s3://mybucket/tweet-wc/input,-output,s3://mybucket/tweet-wc/output 37 | 38 | Note: don't forget to use your cluster id and bucket name in the above. 39 | 40 | This command returns the step id that you can use for further monitoring. If you use an 'm1.medium' instance type, this job should take 1 minute to process and 3 minutes of elapsed time. 41 | 42 | You can monitor its progress from the console or via: 43 | 44 | aws emr describe-step --cluster-id --step-id 45 | 46 | ## Step 4 ## 47 | 48 | Sync the output: 49 | 50 | aws s3 sync s3://mybucket/tweet-wc/output/ tweet-wc/output/ 51 | 52 | You should now have 4 files: 53 | 54 | tweet-wc/output/_SUCCESS 55 | tweet-wc/output/part-00000 56 | tweet-wc/output/part-00001 57 | tweet-wc/output/part-00002 58 | 59 | The output is a list of word counts split amongst the part-nnnnn files. 
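For reference, the extraction that `format-tweets.py` performs in Step 1 can be approximated with a few lines of Python. This is only a sketch: it assumes the input file holds a JSON array of tweet objects with a `text` field, and the actual script in this directory may differ in its details.

    import json
    import sys

    # Load the JSON array of tweets named on the command line
    with open(sys.argv[1]) as f:
        tweets = json.load(f)

    # Emit one tweet text per line; strip embedded newlines so each
    # tweet stays on a single line for the streaming word count
    for tweet in tweets:
        text = tweet["text"].replace("\n"," ").replace("\r"," ")
        print text.encode("utf-8")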
60 | -------------------------------------------------------------------------------- /activities/emr-map-only/README.md: -------------------------------------------------------------------------------- 1 | # Map Task Input Splitting # 2 | 3 | This example shows how map tasks get their input from splitting the input files. In this example, we'll 4 | just count the lines received via a map-only step (i.e., no reduce step) and the output will just consist 5 | of that count. You'll see the output of each map task and how much of the input it received. 6 | 7 | # Setup # 8 | 9 | If you don't have a cluster running, you'll need to start one (see main setup page). You also need a bucket for the code, input, and output. 10 | 11 | # Running the Example # 12 | 13 | ## Step 1 ## 14 | 15 | You'll need to setup input to run the job and so we'll create a directory with some input: 16 | 17 | mkdir -p job/input 18 | python generate-input.py job/input/test 3 1000 19 | 20 | This will create three test files. We will process this with a simple map-only task to show you how input 21 | is split. 22 | 23 | Now we need to store the input: 24 | 25 | aws s3 sync job s3://mybucket/job/ 26 | 27 | ## Step 2 ## 28 | 29 | We need to store the line count program: 30 | 31 | aws s3 cp line-count.py s3://mybucket/ 32 | 33 | ## Step 3 ## 34 | 35 | Now we add the streaming step to do the work: 36 | 37 | aws emr add-steps --cluster-id --steps Type=STREAMING,Name='Map Line Count',ActionOnFailure=CONTINUE,Args=--files,s3://mybucket/line-count.py,-mapper,line-count.py,-reducer,NONE,-input,s3://mybucket/job/input,-output,s3://mybucket/job/output 38 | 39 | Note: don't forget to use your cluster id and bucket name in the above. 40 | 41 | This command returns the step id that you can use for further monitoring. If you use an 'm1.medium' instance type, this job should take 1 minute to process and 3 minutes of elapsed time. 
42 | 43 | You can monitor its progress from the console or via: 44 | 45 | aws emr describe-step --cluster-id --step-id 46 | 47 | ## Step 4 ## 48 | 49 | Sync the output: 50 | 51 | aws s3 sync s3://mybucket/job/output/ job/output/ 52 | 53 | You should now have 7 files: 54 | 55 | job/output/_SUCCESS 56 | job/output/part-00000 57 | job/output/part-00001 58 | job/output/part-00002 59 | job/output/part-00003 60 | job/output/part-00004 61 | job/output/part-00005 62 | 63 | -------------------------------------------------------------------------------- /activities/emr-opennex-climate-model/by-sequences.py: -------------------------------------------------------------------------------- 1 | from mrjob.job import MRJob 2 | import sys 3 | import os 4 | import json 5 | import math 6 | import datetime 7 | 8 | import seqs 9 | import date_partitions as partitions 10 | 11 | # Gathers data based on traversing sequence numbers for a given region and period of time 12 | class ListSequences(MRJob): 13 | 14 | # Add a data directory for the data on disk 15 | def configure_options(self): 16 | super(ListSequences, self).configure_options() 17 | self.add_passthrough_option('--data-dir',help="The directory where the data is stored.") 18 | 19 | # Yields the set of sequence numbers for each year/month for the requested region 20 | def year_seq(self,_,line): 21 | if line[0] == '#': 22 | return 23 | 24 | args = line.rstrip().split(","); 25 | 26 | quad = [ float(args[0]), float(args[1]), 27 | float(args[2]), float(args[3]) ] 28 | size = int(args[4]) 29 | startYear = int(args[5]) 30 | startMonth = int(args[6]) 31 | endYear = int(args[7]) 32 | endMonth = int(args[8]) 33 | 34 | for month in partitions.month_partition(datetime.datetime(startYear,startMonth,1),datetime.datetime(endYear,endMonth,1)): 35 | for seq in seqs.sequencesFromQuadrangle(size / 120.0,quad): 36 | yield "{}-{:02d}".format(month.year,month.month),(size,seq) 37 | 38 | # Computes the average for a year/month + quadrangle + sequence number by loading the data (JSON) 39 | def average_quadrangle(self, yearMonth, quadSpec): 40 | size,seq = quadSpec 41 | fileName = self.options.data_dir+(os.sep if self.options.data_dir[-1]!=os.sep else "")+yearMonth+"-"+str(size)+"-"+str(seq)+".json" 42 | if os.path.exists(fileName): 43 | f = open(fileName,"r") 44 | obj = json.load(f) 45 | f.close() 46 | yield yearMonth,(1,len(obj["data"])) 47 | 48 | # Defines the job as a 2-step map-only job 49 | def steps(self): 50 | return [ 51 | self.mr(mapper=self.year_seq, 52 | reducer=None), 53 | self.mr(mapper=self.average_quadrangle, 54 | reducer=None) 55 | ] 56 | 57 | 58 | if __name__ == '__main__': 59 | ListSequences.run() 60 | 61 | -------------------------------------------------------------------------------- /assignments/organizing-tweets/README.md: -------------------------------------------------------------------------------- 1 | # Organizing Acquired Data # 2 | 3 | In this assignment we will be organizing the information like that acquired in 4 | [Acquiring and Storing Social Media Data](../tweet-acquisition). In fact, 5 | we will be organizing it in three different ways and contrasting how the various 6 | storage systems can be used to accomplish a particular task. 7 | 8 | The subject of the is the tweet data that was acquired from a conference: 9 | 10 | * [prague-2015-02-14.json](prague-2015-02-14.json) 11 | * [prague-2015-02-15.json](prague-2015-02-15.json) 12 | 13 | Note: The time of the conference is CET (+01:00) timezone. 
14 | 15 | We need to answer the following questions by "querying" the data: 16 | 17 | 1. Who tweeted the most during the conference? 18 | 2. What were the top 10 hash tags used? 19 | 3. For a particular hour, how many tweets were produced? 20 | 21 | We are going to answer these questions using three different database storage technologies: 22 | 23 | * Key/Value — [AWS S3](http://aws.amazon.com/s3/) 24 | * NoSQL Database — [Mongo](https://www.mongodb.org) or [MarkLogic](http://www.marklogic.com) 25 | * Relational Database — SQLite, MySQL, etc. 26 | 27 | ## Tasks ## 28 | 29 | As you look at the following tasks, keep in mind that you don't need all the raw information from the tweet 30 | data as provided from Twitter. That is, you do not need to model or store all the raw information but just 31 | that which is sufficient to answer the three questions. 32 | 33 | 1. Draw a UML ER diagram of how you would model the information extracted from the raw tweet data. 34 | 2. For each database category of Key/Value, NoSQL, and Relational, describe a systems architecture that contains: 35 | 1. Your implementation model of how data is actually organized (e.g. a schema, collection structure, etc.). 36 | 2. The process necessary to store the information into your implementation model. 37 | 3. Pseudo-code / procedures that describe how you would answer each of the questions. 38 | 3. For just one of the database categories, implement your architecture. 39 | 40 | ## What to turn in ## 41 | 42 | 1. Your UML ER diagram. 43 | 2. A document for each of the database categories for task #2. 44 | 3. Your implementation code for task #3. 45 | 4. The answers for each of the three questions. Please provide answers for the hours 9:00+01:00 through 16:00+01:00 on both days. -------------------------------------------------------------------------------- /activities/sentiment-analysis/candy-corn.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | 3 | # negative 4 | negative = [ 5 | ("We're all aware by now that Candy corn is evil","nasty"), 6 | ("Candy corn is so bad for you","nasty"), 7 | ("If you eat candy corn... I guess you would eat crayons, candles and ear wax too","nasty"), 8 | ("Candy corn is nasty","nasty"), 9 | ("Never not horrified by candy corn.","nasty") 10 | ] 11 | 12 | # positive 13 | positive = [ 14 | ("I'm craving candy corn","best"), 15 | ("I still love candy corn","best"), 16 | ("Yes, I tweet candy corn and not broccoli. You know why? Because candy corn is more exciting.","best"), 17 | ("Autumn candy corn. So sweet; so good; so sticky.
I taste no regrets.","best"), 18 | ("I love candy corn","best"), 19 | ("Candy corn is good","best") 20 | ] 21 | 22 | # Test 23 | tests = [ 24 | "Now's as good a time as any to remind you candy corn is the worst and if you like it you have a deep personal failing that needs examining.", #nasty 25 | "Candy corn is my favorite candy on Halloween", #best 26 | "Candy corn is sugar and wax - nasty", #nasty 27 | "Can't get enough candy corn love", #best 28 | "Candy corn is evil", #nasty 29 | "Candy corn is bad candy" # nasty 30 | ] 31 | 32 | # words we will exclude 33 | stopWords = [ 34 | "candy", 35 | "corn", 36 | "and", 37 | "not", 38 | "the", 39 | "...", 40 | "'re" 41 | ] 42 | 43 | # process the texts into a training set of words 44 | texts = [] 45 | for (tweet, sentiment) in positive + negative: 46 | words = [e.lower() for e in nltk.word_tokenize(tweet) if len(e) >= 3 and not e.lower() in stopWords] 47 | texts.append((words, sentiment)) 48 | 49 | print texts 50 | 51 | 52 | # Get an ordered list of most frequently used words 53 | def getAllWords(texts): 54 | all = [] 55 | for (words, sentiment) in texts: 56 | all.extend(words) 57 | return all 58 | 59 | print 60 | 61 | wordlist = nltk.FreqDist(getAllWords(texts)) 62 | print wordlist.pprint(100) 63 | wordFeatures = wordlist.keys() 64 | 65 | def extractFeatures(document): 66 | words = set(document) 67 | features = {} 68 | for word in wordFeatures: 69 | features['contains(%s)' % word] = (word in words) 70 | return features 71 | 72 | trainingSet = nltk.classify.apply_features(extractFeatures, texts) 73 | 74 | classifier = nltk.NaiveBayesClassifier.train(trainingSet) 75 | 76 | print 77 | for tweet in tests: 78 | print classifier.classify(extractFeatures(tweet.split())),": ",tweet 79 | 80 | 81 | 82 | 83 | 84 | 85 | -------------------------------------------------------------------------------- /activities/README.md: -------------------------------------------------------------------------------- 1 | # Activities 2 | 3 | 4 | ## Examples 5 | 6 | ### Common Crawl Exemplar 7 | 8 | The [Common Crawl Exemplar](common-crawl/) is a fully worked example of running Map/Reduce via Hadoop on AWS EMR for textual analysis. 9 | 10 | ### Processing the NASA OpenNEX model in EMR 11 | 12 | The [Processing the NASA OpenNEX model in EMR](emr-opennex-climate-model/) activity processes climate model data using AWS EMR. 13 | 14 | ### Multiplying Many Integers via Prime Factorization using EMR 15 | 16 | The [Multiplying Many Integers via Prime Factorization using EMR](emr-prime-multiplier/) activity is a simple example of using Map/Reduce to perform a computation. 17 | 18 | 19 | ## Acquiring Data 20 | 21 | ### Acquiring Data from Twitter 22 | 23 | The [Acquiring Data from Twitter](twitter-acquisition/) activity demonstrates how to acquire data from an API. 24 | 25 | ### Scraping the Web 26 | 27 | The [Scraping the Web](web-scraping/) activity demonstrates gather information from the web. 28 | 29 | ### Crawling the Common Crawl 30 | 31 | The [Crawling the Common Crawl](crawling-the-crawl/) activity demonstrates using prefetched web content from the [Common Crawl](http://commoncrawl.org). 32 | 33 | 34 | ## Organizing 35 | 36 | ### Data Munging - Processing JSON, XML, and CSV Data 37 | 38 | The [Data Munging](data-munging/) activity demonstrates processing various data formats in Python. 39 | 40 | ### NoSQL Databases 41 | 42 | The [NoSQL Databases](nosql-databases/) activity demonstrates using different NoSQL databases. 
43 | 44 | ### Relational Databases 45 | 46 | The [Relational Databases](relational-databases/) activity demonstrates using a relational database from Python. 47 | 48 | 49 | ## Analyzing 50 | 51 | ### Creating Clusters for EMR 52 | 53 | The [Creating Clusters for EMR](emr-cluster/) activity steps through setting up an EMR cluster for Map/Reduce (Hadoop) on AWS. 54 | 55 | ### Word Counts for Tweets 56 | 57 | The [Word Counts for Tweets](emr-tweet-wordcount/) activity steps through the infamous word count example on AWS EMR using tweet data. 58 | 59 | ### Map Task Input Splitting 60 | 61 | The [Map Task Input Splitting](emr-map-only/) activity demonstrates how input is split by Hadoop on AWS EMR. 62 | 63 | ### Introduction to Spark 64 | 65 | The [Introduction to Spark](intro-to-spark/) activity introduces [Spark](http://spark.apache.org) and steps through reproducing various previous activities. 66 | 67 | ### NLP - Text Processing with NLTK 68 | 69 | The [Text Processing with NLTK](text-processing-with-nltk/) activity introduces how text can be processed with NLTK in Python. 70 | 71 | ### NLP - Sentiment Analysis (NLTK) 72 | 73 | The [Sentiment Analysis](sentiment-analysis/) activity introduces Sentiment Analysis and steps through using it via Python. 74 | 75 | -------------------------------------------------------------------------------- /activities/common-crawl/test-20.warc: -------------------------------------------------------------------------------- 1 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00000-ip-10-180-136-8.ec2.internal.warc.gz 2 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00001-ip-10-180-136-8.ec2.internal.warc.gz 3 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00002-ip-10-180-136-8.ec2.internal.warc.gz 4 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00003-ip-10-180-136-8.ec2.internal.warc.gz 5 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00004-ip-10-180-136-8.ec2.internal.warc.gz 6 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00005-ip-10-180-136-8.ec2.internal.warc.gz 7 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00006-ip-10-180-136-8.ec2.internal.warc.gz 8 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00007-ip-10-180-136-8.ec2.internal.warc.gz 9 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00008-ip-10-180-136-8.ec2.internal.warc.gz 10 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00009-ip-10-180-136-8.ec2.internal.warc.gz 11 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00010-ip-10-180-136-8.ec2.internal.warc.gz 12 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00011-ip-10-180-136-8.ec2.internal.warc.gz 13 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00012-ip-10-180-136-8.ec2.internal.warc.gz 14 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00013-ip-10-180-136-8.ec2.internal.warc.gz 15 | 
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00014-ip-10-180-136-8.ec2.internal.warc.gz 16 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00015-ip-10-180-136-8.ec2.internal.warc.gz 17 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00016-ip-10-180-136-8.ec2.internal.warc.gz 18 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00017-ip-10-180-136-8.ec2.internal.warc.gz 19 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00018-ip-10-180-136-8.ec2.internal.warc.gz 20 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00019-ip-10-180-136-8.ec2.internal.warc.gz 21 | -------------------------------------------------------------------------------- /activities/emr-prime-multiplier/README.md: -------------------------------------------------------------------------------- 1 | # Multiplying Many Integers via Prime Factorization using EMR # 2 | 3 | This example demonstrates how Map/Reduce keys and values work by multiplying a large 4 | number of integers. In this case, the output keys are prime numbers and the value is 5 | the exponent of the prime factorization (e.g. 12 produces 2 → 2, 3 → 1). 6 | 7 | It also uses the built-in aggregator as the reducer step and so the output is prefixed with `LongValueSum:`. 8 | 9 | ## Setup ## 10 | 11 | If you don't have a cluster running, you'll need to start one (see main setup page). You also need a bucket for the code, input, and output. 12 | 13 | ## Running the Example ## 14 | 15 | In this example, we'll multiply 1,000,000 integers between 1 and 1000. 16 | 17 | ### Step 1 ### 18 | 19 | You'll need to set up input to run the job and so we'll create a directory with some input: 20 | 21 | mkdir -p multiply/input 22 | python generate-input.py 1000 1000000 > multiply/input/numbers.txt 23 | 24 | Now we need to store the input: 25 | 26 | aws s3 sync multiply s3://mybucket/multiply/ 27 | 28 | ### Step 2 ### 29 | 30 | We need to store the prime factorization program: 31 | 32 | aws s3 cp prime-factors.py s3://mybucket/ 33 | 34 | ### Step 3 ### 35 | 36 | Now we add the streaming step to do the work (shorthand): 37 | 38 | aws emr add-steps --cluster-id --steps Type=STREAMING,Name='Multiply',ActionOnFailure=CONTINUE,Args=--files,s3://mybucket/prime-factors.py,-mapper,prime-factors.py,-reducer,aggregate,-input,s3://mybucket/multiply/input,-output,s3://mybucket/multiply/output 39 | 40 | or using JSON: 41 | 42 | aws emr add-steps --cluster-id --steps file://./step.json 43 | 44 | where `step.json` is: 45 | 46 | { 47 | "Type" : "STREAMING", 48 | "Name" : "Multiply", 49 | "ActionOnFailure" : "CONTINUE", 50 | "Args" : [ 51 | "-files","s3://mybucket/prime-factors.py", 52 | "-mapper","prime-factors.py", 53 | "-reducer","aggregate", 54 | "-input","s3://mybucket/multiply/input", 55 | "-output","s3://mybucket/multiply/output" 56 | ] 57 | } 58 | 59 | Note: don't forget to use your cluster id and bucket name in the above. 60 | 61 | This command returns the step id that you can use for further monitoring. If you use an 'm1.medium' instance type, this job should take 1 minute to process and 2 minutes of elapsed time.
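As an aside, the mapper in this job only has to emit a contribution for each prime factor in a form that the `aggregate` reducer understands. The following is a minimal sketch of such a mapper; it assumes one integer per input line and is not necessarily how `prime-factors.py` is actually written:

    #!/usr/bin/python
    import sys

    def prime_factors(n):
        # Trial division: yield each prime factor as many times as it divides n
        d = 2
        while d * d <= n:
            while n % d == 0:
                yield d
                n //= d
            d += 1
        if n > 1:
            yield n

    for line in sys.stdin:
        line = line.strip()
        if not line:
            continue
        for p in prime_factors(int(line)):
            # 'aggregate' sums the values emitted for each LongValueSum: key
            print "LongValueSum:" + str(p) + "\t1"

Because `aggregate` sums the values for each key, the final count for a prime is its exponent in the factorization of the overall product.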
62 | 63 | You can monitor its progress from the console or via: 64 | 65 | aws emr describe-step --cluster-id --step-id 66 | 67 | ### Step 4 ### 68 | 69 | Sync the output: 70 | 71 | aws s3 sync s3://mybucket/multiply/output/multiply/output/ 72 | 73 | You should now have 4 files: 74 | 75 | job/output/_SUCCESS 76 | job/output/part-00000 77 | job/output/part-00001 78 | job/output/part-00002 79 | 80 | The output is a list of primes and exponents for a very large number! 81 | -------------------------------------------------------------------------------- /activities/emr-opennex-climate-model/acquire.py: -------------------------------------------------------------------------------- 1 | import urllib2, gzip, StringIO 2 | from xml.dom import pulldom 3 | from xml import sax 4 | import json 5 | import math 6 | import sys 7 | import datetime 8 | 9 | import seqs 10 | import date_partitions as partitions 11 | 12 | # Service URI for data set 13 | serviceURI = "http://data.pantabular.org/opennex/data/" 14 | 15 | # Fetches a sequence number data give the facets 16 | 17 | def fetchQuadrangle(dataset,yearMonth,resolution,sequence): 18 | 19 | # Format a URI 20 | strYearMonth = "{}-{:02d}".format(yearMonth.year,yearMonth.month) 21 | url = serviceURI+dataset+"/"+strYearMonth+"/"+str(resolution)+"/"+str(sequence); 22 | print url 23 | 24 | # Open an HTTP Request 25 | response = None 26 | try: 27 | response = urllib2.urlopen(url) 28 | except urllib2.HTTPError as e: 29 | return None 30 | 31 | html = None 32 | 33 | # Unpack the response 34 | if response.headers.get('content-encoding', '') == 'gzip': 35 | data = response.read() 36 | compressedstream = StringIO.StringIO(data) 37 | gzipper = gzip.GzipFile(fileobj=compressedstream) 38 | html = gzipper.read() 39 | else: 40 | html = response.read() 41 | 42 | # Parse the markup 43 | parser = sax.make_parser() 44 | parser.setFeature(sax.handler.feature_namespaces, 1) 45 | doc = pulldom.parseString(html,parser) 46 | 47 | inTable = False 48 | 49 | def textContent(parent): 50 | s = ""; 51 | for n in parent.childNodes: 52 | if n.data != None: 53 | s += n.data 54 | return s 55 | 56 | # Process the markup as a stream and detect the table of data 57 | data = [] 58 | for event, node in doc: 59 | if event == pulldom.START_ELEMENT and node.tagName == 'table': 60 | if node.getAttribute("typeof") == "IndexedTable": 61 | inTable = True 62 | if event == pulldom.END_ELEMENT and node.tagName == 'table': 63 | inTable = False 64 | if inTable and event == pulldom.START_ELEMENT and node.tagName == 'td': 65 | doc.expandNode(node) 66 | if len(node.childNodes) > 0: 67 | data.append(float(textContent(node))) 68 | 69 | if len(data) == 0: 70 | return None 71 | 72 | # Return the sequence number data object 73 | return {"dataset": dataset, "yearMonth": strYearMonth, "resolution" : resolution, "sequence": sequence, "data": data } 74 | 75 | # The data set name 76 | dataset = sys.argv[1] 77 | 78 | # The resolution in 1/120 degree counts 79 | resolution = int(sys.argv[2]) 80 | 81 | # The quadrangle to cover 82 | quad = json.loads(sys.argv[3]) 83 | 84 | # The start and end year/month 85 | start = datetime.datetime.strptime(sys.argv[4],"%Y-%m") # start month 86 | end = datetime.datetime.strptime(sys.argv[5],"%Y-%m") # end month 87 | 88 | # The prefix for the output files 89 | prefix = sys.argv[6] 90 | 91 | # Compute the degree size of the quadrangles 92 | size = resolution / 120.0 93 | 94 | # Iterate over the months 95 | for yearMonth in partitions.month_partition(start,end): 96 | 97 | # Iterate over the 
sequence numbers for the quadrangle 98 | for seq in seqs.sequencesFromQuadrangle(size,quad): 99 | 100 | # Fetch a sequence number's data 101 | obj = fetchQuadrangle(dataset,yearMonth,resolution,seq) 102 | if obj != None: 103 | 104 | # Serialize the data as JSON 105 | fileName = "{}{}-{:02d}-{}-{}.json".format(prefix,yearMonth.year,yearMonth.month,resolution,seq) 106 | f = open(fileName,"w") 107 | json.dump(obj,f) 108 | f.write("\n") 109 | f.close() 110 | 111 | -------------------------------------------------------------------------------- /activities/sentiment-analysis/README.md: -------------------------------------------------------------------------------- 1 | # Sentiment Analysis # 2 | 3 | ## Setup ## 4 | 5 | Please make sure you have nltk installed: 6 | 7 | pip install nltk 8 | python -m nltk.downloader all 9 | 10 | Things you might review: 11 | 12 | * A [short set of slides](http://courses.ischool.berkeley.edu/ds205/f14/sentiment-analysis.xhtml) (also found [here](sentiment-analysis.xhtml)) that will walk you through the [Candy Corn example](candy-corn.py). 13 | * A nice blog post on [using NLTK for sentiment analysis](http://www.laurentluce.com/posts/twitter-sentiment-analysis-using-python-and-nltk/) 14 | * A short article on [Bag-of-words model on Wikipedia](http://en.wikipedia.org/wiki/Bag-of-words_model) 15 | 16 | 17 | ## Overview ## 18 | 19 | We're going to work our way through training a classifier for to detect positive or negative sentiment. This activity 20 | will not make you an expert. Instead, it is designed to give you a sense of the steps and data pipeline 21 | necessary to run such a classifier. 22 | 23 | We have a set of movie review data gathered from the ["Rotten Tomatoes" website by Pang/Lee in 2005](http://www.cs.cornell.edu/People/pabo/movie-review-data/). Each review has 24 | been extracted from the page and turned into a single line of text that is categorized as positive or negative. 25 | 26 | The data is found in the [rt-polaritydata](rt-polaritydata/) directory: 27 | 28 | * [rt-polarity.neg](rt-polarity.neg) — the original negative reviews in Windows 1252 text encoding 29 | * [rt-polarity.neg.utf8](rt-polarity.neg.utf8) — the negative reviews in UTF-8 text encoding 30 | * [rt-polarity.pos](rt-polarity.pos) — the original positive reviews in Windows 1252 text encoding 31 | * [rt-polarity.pos.utf8](rt-polarity.pos.utf8) — the positive reviews in UTF-8 text encoding 32 | 33 | To apply the bag-of-words model, we must: 34 | 35 | 1. Decide on a set of "feature words" for the model. These might be words like "bad", "good", "excellent", "horrible". 36 | 2. Process our data to produce a feature vector for each review text. 37 | 3. Train a classifier (e.g. a [Naive Bayse classifier](http://en.wikipedia.org/wiki/Naive_Bayes_classifier) on the data. 38 | 4. Apply the classifier non-annotated data (i.e. new reviews). 39 | 40 | There are two simple examples of this process: 41 | 42 | * [candy-corn.py](candy-corn.py) — an example of positive/negative sentiment (2-way classifier) 43 | * [n-way.py](n-way.py) — an example of a multiple category (>2) classifier 44 | 45 | ## Activity ## 46 | 47 | ### (A) Generate a word list and histogram ### 48 | 49 | Use nltk and the supporting code in [featureset.py](featureset.py) and [wordcounts.py](wordcounts.py) to generate a word count and histogram from the dataset. 50 | 51 | Use this to inform the choice of "features" (words) for you bag-of-words model. 
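If you want to experiment outside the supplied scripts, the frequency distribution can also be built directly with NLTK. The following sketch assumes the UTF-8 versions of the polarity files and a simple tokenize/lowercase/length filter; `wordcounts.py` and `featureset.py` may do this differently:

    import codecs
    import nltk

    # Read one review per line from the UTF-8 polarity files
    def read_reviews(path):
        with codecs.open(path,"r","utf-8") as f:
            return [line.strip() for line in f if line.strip()]

    reviews = read_reviews("rt-polaritydata/rt-polarity.pos.utf8") + \
              read_reviews("rt-polaritydata/rt-polarity.neg.utf8")

    # Tokenize, lowercase, and drop very short tokens
    words = [w.lower() for review in reviews
                       for w in nltk.word_tokenize(review)
                       if len(w) >= 3]

    # The most common words are candidates for feature words
    freq = nltk.FreqDist(words)
    for word, count in freq.most_common(50):
        print word, count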
52 | 53 | ### (B) Train a classifier ### 54 | 55 | Use or modify the sample code in [train.py](train.py) to train a classifier and store it into a "pickled" object. 56 | 57 | ### (C) Test a classifier ### 58 | 59 | Test the classifier on various input data (see sample code [test.py](test.py)). 60 | 61 | ### (D) Model Questions ### 62 | 63 | 1. How can you improve the accuracy? 64 | 2. Are there less often used words that are more characteristic of positive or negative reviews? 65 | 3. Does including such words (less used) improve the accuracy? 66 | 4. What happens to sentences that exhibit no features? 67 | 5. Does changing the stemmer or lemmatizer improve the accuracy? 68 | 69 | ### (E) Scale-up Questions ### 70 | 71 | 1. How would you apply a classifier to a large amount of data? 72 | 2. Given a raw newly acquired data set, what is the data pipeline necessary to apply such a classifier? 73 | 3. How do you organize the input and output of running such a classifier on AWS S3 (or other key/value storage such as HDFS)? 74 | -------------------------------------------------------------------------------- /activities/relational-databases/README.md: -------------------------------------------------------------------------------- 1 | # Relational Databases # 2 | 3 | There are a number of relational databases with a great variety of features. In this activity, we'll use the 4 | popular [SQLite database](http://www.sqlite.org/) as a local embedded database. This will avoid the need to configure 5 | remote connections. Given the core interoperability of SQL, most of the activity can easily be ported to 6 | other databases once the connection has been established. 7 | 8 | ## Setup ## 9 | 10 | SQLite3 comes packaged with Python. You may also want the sqlite3 command-line tools. If you do not have the 11 | command-line shell for sqlite, you may have only the supporting libraries for the python interface. Additional 12 | tools can be installed via the [SQLite website](http://www.sqlite.org/). 13 | 14 | You can test whether you have a SQLite command-line shell by: 15 | 16 | $ sqlite3 17 | SQLite version 3.8.7.4 2014-12-09 01:34:36 18 | Enter ".help" for usage hints. 19 | Connected to a transient in-memory database. 20 | Use ".open FILENAME" to reopen on a persistent database. 21 | sqlite> 22 | 23 | ## ER Models to SQLite Tables ## 24 | 25 | Once you have an Entity-Relationship Model (ER model), you'll need to translate the model into 26 | a set of table definitions. For SQLite, simple primary/foreign key relationships can be created 27 | by use of integer row identifiers. 28 | 29 | A primary key is simply labeled with `INTEGER PRIMARY KEY` and this enables SQLite to automatically generate 30 | integer values for primary keys. 31 | 32 | A foreign key is a specially labeled column that references another table's column: 33 | 34 | FOREIGN KEY(user) REFERENCES users(id) 35 | 36 | You can create a set of tables either by executing SQL statements via python or the command-line shell: 37 | 38 | CREATE TABLE users ( 39 | id INTEGER PRIMARY KEY, 40 | alias TEXT UNIQUE NOT NULL, 41 | name TEXT 42 | ); 43 | CREATE TABLE tweets ( 44 | user INTEGER NOT NULL, 45 | tweet TEXT NOT NULL, 46 | FOREIGN KEY(user) REFERENCES users(id) 47 | ); 48 | 49 | Note: See the syntax of [CREATE TABLE](https://www.sqlite.org/lang_createtable.html) for more information on the possibilities and the [datatypes supported by SQLite](http://www.sqlite.org/datatype3.html).
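The same table definitions can also be created from Python, which is handy if you want to script the whole setup. A small sketch using the `test.db` file from the examples below:

    import sqlite3

    conn = sqlite3.connect('test.db')

    # executescript runs several SQL statements in a single call
    conn.executescript('''
    CREATE TABLE IF NOT EXISTS users (
      id INTEGER PRIMARY KEY,
      alias TEXT UNIQUE NOT NULL,
      name TEXT
    );
    CREATE TABLE IF NOT EXISTS tweets (
      user INTEGER NOT NULL,
      tweet TEXT NOT NULL,
      FOREIGN KEY(user) REFERENCES users(id)
    );
    ''')

    conn.commit()
    conn.close()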
50 | 51 | Try this now my running the sqlite3 command-line tool and just cut-n-past the above definitions: 52 | 53 | $ sqlite3 test.db 54 | SQLite version 3.8.7.4 2014-12-09 01:34:36 55 | Enter ".help" for usage hints. 56 | sqlite> CREATE TABLE users ( 57 | ...> id INTEGER PRIMARY KEY, 58 | ...> alias TEXT UNIQUE NOT NULL, 59 | ...> name TEXT 60 | ...> ); 61 | sqlite> CREATE TABLE tweets ( 62 | ...> user INTEGER NOT NULL, 63 | ...> tweet TEXT NOT NULL, 64 | ...> FOREIGN KEY(user) REFERENCES users(id) 65 | ...> ); 66 | sqlite> 67 | 68 | ## Inserting Data ## 69 | 70 | You can insert data into tables via simple SQL commands. SQLite will handle row identifiers for primary keys if you've defined them to be integers: 71 | 72 | sqlite> insert into users(alias,name) values ('alexmilowski','Alex Milowski'); 73 | sqlite> insert into users(alias,name) values ('ghopper','Grace Hopper'); 74 | sqlite> select * from users; 75 | 1|alexmilowski|Alex Milowski 76 | 2|ghopper|Grace Hopper 77 | 78 | If know the user's primary key, we can insert tweet text: 79 | 80 | sqlite> insert into tweets values (1,"Hello World!"); 81 | sqlite> select * from tweets where user=(select id from users where alias='alexmilowski'); 82 | 1|Hello World! 83 | 84 | ## SQLite in Python ## 85 | 86 | Connecting to a database is simple. Given the previous example database, we can do: 87 | 88 | >>> import sqlite3 89 | >>> conn = sqlite3.connect('test.db') 90 | 91 | and execute a query: 92 | 93 | >>> c = conn.cursor() 94 | >>> c.execute('SELECT * FROM users') 95 | >>> c.fetchone() 96 | (1, u'alexmilowski', u'Alex Milowski') 97 | >>> c.fetchone() 98 | (2, u'ghopper', u'Grace Hopper') 99 | >>> c.fetchone() 100 | 101 | We can also bind values in queries: 102 | 103 | >>> c.execute('SELECT * FROM users WHERE alias=?', ['alexmilowski']) 104 | >>> c.fetchone() 105 | (1, u'alexmilowski', u'Alex Milowski') 106 | 107 | Or iterate results: 108 | 109 | >>> for row in c.execute('SELECT * FROM users'): 110 | ... print row 111 | ... 112 | (1, u'alexmilowski', u'Alex Milowski') 113 | (2, u'ghopper', u'Grace Hopper') 114 | 115 | Inserting data requires both a query (insert statement) and a commit: 116 | 117 | >>> users=[('mariecurie',"Marie Curie"), 118 | ... ('albert',"Albert Einstein")] 119 | >>> c.executemany("INSERT INTO users(alias,name) VALUES(?,?)",users) 120 | 121 | >>> conn.commit() 122 | 123 | Finally, don't forget to close the connection: 124 | 125 | >>> conn.close() 126 | 127 | 128 | 129 | 130 | -------------------------------------------------------------------------------- /activities/decision-trees/tree-example.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Is it warm outside? 7 | 8 | 9 | San Francisco warm? 10 | 11 | 12 | Really hot? 13 | 14 | 15 | Skip lunch, 16 | get ice cream 17 | 18 | 19 | Outside, without 20 | a jacket!!! 21 | 22 | 23 | Okay, maybe 24 | outside ... 
bring 25 | a jacket 26 | 27 | 28 | Inside 29 | 30 | 31 | < 75℉ 32 | 33 | 34 | < 65℉ 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | yes 46 | 47 | 48 | no 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | < 85℉ 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | yes 71 | 72 | 73 | no 74 | 75 | 76 | yes 77 | 78 | 79 | no 80 | 81 | 82 | 83 | -------------------------------------------------------------------------------- /activities/emr-opennex-climate-model/README.md: -------------------------------------------------------------------------------- 1 | # Processing the NASA OpenNEX model in EMR # 2 | 3 | NASA produced a climate model which has been made available as web resources via the PAN methodology at 4 | as an example at http://data.pantabular.org/opennex/ with a demonstration application at http://data.pantabular.org/opennex/explore/ 5 | 6 | In this activity, we will be using this climate model data as an example data set to process numerical information in MRJob 7 | and produce various averages and summaries. 8 | 9 | ## Acquiring the Data ## 10 | 11 | To begin with, we will run the Map/Reduce processes locally on a data set rather than off S3 and via EMR. All of the 12 | examples can easily be translated by storing data in S3 and using the EMR runner to deploy the MRJob code on a cluster. 13 | 14 | The file [acquire.py](acquire.py) is a program that will download the data in PAN format and output the data as JSON 15 | data files. The program has the arguments: 16 | 17 | * the data set name 18 | * a resolution counts of a 1/120° 19 | * a quadrangle in JSON array syntax 20 | * a start year/month (e.g., 2015-03) 21 | * an end year/month 22 | * an output directory 23 | 24 | For example: 25 | 26 | python acquire.py avg-rcp85 60 "[40,-125,35,-120]" 2015-03 2015-03 dataset/ 27 | 28 | downloads data for the month of March, 2015 for 0.5° partitions for the data set `avg-rcp85` and stores it into the directory `dataset/`. 29 | 30 | The output is a set of files based on sequence numbers that cover the requested geospatial region. They are named `{year}-{month}-{resolution}-{sequence#}.json` and 31 | stored directory given as one JSON object per file without newlines in the formatting. 32 | 33 | For this activity, acquire the first three months of data for 2015: 34 | 35 | python acquire.py avg-rcp85 60 "[40,-125,35,-120]" 2015-01 2015-03 dataset/ 36 | 37 | This should take about 2-3 minutes. 38 | 39 | ## Understanding the Data ## 40 | 41 | The JSON object has the format: 42 | 43 | { 44 | "data" : [ 286.19, 286.19, 286.18, ... ], 45 | "yearMonth" : "2015-01", 46 | "sequence": 74634, 47 | "resolution": 60, 48 | "dataset": "avg-rcp85" 49 | } 50 | 51 | The array value for "data" is a set of temperature values in Kelvins from the model associated with the geospatial region for the sequence number. 52 | 53 | ## Supporting Code ## 54 | 55 | There are two supporting libraries: 56 | 57 | * seqs.py — a library supporting generating sequence numbers from latitude/longitude 58 | * date_partitions — a library supporting generating sequences of dates for partitioning time 59 | 60 | A set of sequence numbers given geospatial region can be enumerated giving the quadrangle and the size (in degrees): 61 | 62 | seqs.sequencesFromQuadrangle(0.5,[40,-125,35,-120]) 63 | 64 | where `0.5` is for half-degree quadrangles covering the region defined by the two points (40°,-125°) and (35°,-120°). 
65 | 66 | The two supporting libraries can be put together: 67 | 68 | import datetime 69 | import seqs 70 | import date_partitions as partitions 71 | 72 | for month in partitions.month_partition(datetime.datetime(2015,3,1),datetime.datetime(2015,5,1)): 73 | for seq in seqs.sequencesFromQuadrangle(0.5,[40,-125,35,-120]): 74 | print "{}-{:02d},{}".format(month.year,month.month,seq) 75 | 76 | ## Input Example ## 77 | 78 | Because we'll be running the example locally, we can just create input from each of the data files where each line contains a single 79 | JSON object. The example code [input-example.py](input-example.py) produces an average via map/reduce (mrjob) over the data loaded. 80 | 81 | To run the example on the first three months we acquired: 82 | 83 | cat dataset/2015-0[1-3]*.json | python input-example.py 84 | 85 | The mapper loads the data from the line given and computes an average: 86 | 87 | def mapper(self, _, line): 88 | obj = json.loads(line) 89 | yield "average",sum(obj["data"])/len(obj["data"]) 90 | 91 | ## Average Example ## 92 | 93 | A more complicated example in [average.py](average.py) computes an average by month and keeps track of the counts. It uses a combiner 94 | to collect the sequence numbers associated with the month and then does the reduce step to compute the overall average. It uses the 95 | counts to make sure the average is calculated correctly. 96 | 97 | To run the example on the first three months we acquired: 98 | 99 | cat dataset/2015-0[1-3]*.json | python average.py 100 | 101 | Note that the average is not quite the same. 102 | 103 | ## Activity ## 104 | 105 | We'd like to take these simple examples and compute over a more generic input. We can transition our code to run over a local dataset (or one in S3) 106 | by using a setup like [by-sequences.py] where the data is retrieved from a data set and the input is a specification of what to process. 107 | 108 | This program assumes input in a CSV format with the columns: 109 | 110 | * lat1 — the NW latitude of the quadrangle 111 | * lon1 — the NW longitude of the quadrangle 112 | * lat2 — the SE latitude of the quadrangle 113 | * lon2 — the SE longitude of the quadrangle 114 | * size — the count of 1/120° arc lengths of the resolution (usually 60) 115 | * startYear - the year to start 116 | * startMonth - the month to start 117 | * endYear — the year to end 118 | * endMonth — the month to end 119 | 120 | An input might look like: 121 | 122 | #lat1,lon1,lat2,lon2,size,startYear,startMonth,endYear,endMonth 123 | 40,-125,35,-120,60,2015,02,2015,03 124 | 125 | and you can run the sample code like: 126 | 127 | python by-sequences.py --data-dir `pwd`/dataset/ < input-sequences.txt 128 | 129 | The sample code is a two-step map/reduce job. Your task is to modify it so that it correcly computes an average for the given input line. Take a look 130 | at `average.py` and see how you might modify the various methods and add/replace them in `by-sequences.py`. 131 | 132 | -------------------------------------------------------------------------------- /activities/crawling-the-crawl/README.md: -------------------------------------------------------------------------------- 1 | # Crawling the Common Crawl # 2 | 3 | The common crawl is a data set hosted by AWS that represents a crawl of the Web. The data set contains the raw web pages as well as 4 | metadata and text extracts that are smaller in size. 
5 | 6 | The dataset is stored in [WARC format](http://en.wikipedia.org/wiki/Web_ARChive) (ISO 28500:2009) and consists of a textual stream of 7 | records. Each record contains a header of name/value pairs followed by an entity body (and encoded payload). 8 | 9 | The [Common Crawl stores its data](http://commoncrawl.org/the-data/get-started/) in these format on S3 as hosted by AWS in the bucket 10 | and prefix of `s3://aws-publicdatasets/common-crawl/`. Crawl data from 2013 onward has the key structure of `crawl-data/CC-MAIN-YYYY-DD/ 11 | and so, for example, the latest is stored at `s3://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2015-06/`. 12 | 13 | ## Activity — Exploring the Data Set ## 14 | 15 | ### How is it stored and partitioned? ### 16 | 17 | Use the AWS CLI to explore the data set (`s3://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2015-06/`) by using the `aws s3 ...` command and answer the following: 18 | 19 | 1. What is stored at the root? 20 | 2. What summary metadata can you retrieve? 21 | 3. What are the various data formats you can process? 22 | 4. How is the data set partitioned? 23 | 24 | ### WARC, WET, WAT ### 25 | 26 | 1. There are three data resources stored by the common crawl: raw pages, metadata, and textual extraction. Are they all stored in the same format? 27 | 28 | 2. What do you need to process them? 29 | 30 | 3. Retrieve a sample being careful not to download the whole dataset (it is large). 31 | 32 | 4. Examine a sample WAT file. 33 | 34 | 35 | ## Activity — Extracting Domain Coverage ## 36 | 37 | First you need to create (or reuse) an S3 bucket for this activity. Throughout this activity, we will use the name `mybucket` for the bucket 38 | name and you should replace that with your bucket name. 39 | 40 | Also, you'll need your AWS key name so that you have SSH access to the cluster. 41 | 42 | ### 1. Start a Cluster ### 43 | 44 | First, copy the bootstrapping script [cc-bootstrap.sh](cc-bootstrap.sh) to the root of your bucket (e.g. to s3://mybucket/cc-bootstrap.sh): 45 | 46 | aws s3 cp cc-bootstrap.sh s3://mybucket/cc-bootstrap.sh 47 | 48 | This script installs python 2.7 and various packages use by the WARC python modules. 49 | 50 | There is a script in the code called [start.sh](start.sh) that uses the AWS CLI to start a basic cluster for this activity. It takes a key name (for ssh) and bucket name as arguments: 51 | 52 | ./start.sh mykey mybucket 53 | 54 | It will start the cluster defined in [cluster.json](cluster.json). 55 | 56 | *You'll need this cluster at the end. Don't start the cluster until you need it a save yourself a bit a money.* 57 | 58 | ### 2. Get the manifest ### 59 | 60 | At the root of the crawl there should be several compressed manifest files that have paths to the data. Retrieve these files from S3 and examine the WAT file. 61 | 62 | The manifest contains a set of paths into the S3 bucket. You can convert these to S3 URIs by: 63 | 64 | gzip -dc wat.paths.gz | python s3.py 65 | 66 | ### 3. Retrieve sample data ### 67 | 68 | We will be working with the WAT metadata from here forward. You may want to retrieve some sample data to work locally and then test your code on a cluster afterwards. 69 | 70 | You can get the very first partition by: 71 | 72 | gzip -dc wat.paths.gz | python s3.py | head -n 1 73 | 74 | You can use the AWS CLI to download this locally from S3. Be warned that the data file is about 400MB in size. 
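As an aside, the conversion that `s3.py` performs is essentially just prefixing each manifest path with the public data set bucket. A sketch (the actual script in this directory may differ):

    import sys

    # Each manifest line is a key within the aws-publicdatasets bucket;
    # turn it into a full S3 URI suitable as Map/Reduce input.
    for line in sys.stdin:
        path = line.strip()
        if path:
            print "s3://aws-publicdatasets/" + path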
75 | 76 | Alternatively, you can use the `extract-CC-MAIN-20150124161055-00000-ip-10-180-212-252.ec2.internal.warc.wat.gz` file that is an extract of the first 907 records of the first partition. 77 | 78 | ### 4. View the data ### 79 | 80 | Just take a peek: 81 | 82 | gzip -dc extract-CC-MAIN-20150124161055-00000-ip-10-180-212-252.ec2.internal.warc.wat.gz | more 83 | 84 | What's in there? Looks like JSON data ... 85 | 86 | ### 5. Run the example MRJob Locally ### 87 | 88 | There is sample code in [mrcc.py](mrcc.py) and [ccex.py](ccex.py). 89 | 90 | Run the example on the extract: 91 | 92 | echo `pwd`/extract-CC-MAIN-20150124161055-00000-ip-10-180-212-252.ec2.internal.warc.wat.gz | python ccex.py 93 | 94 | What does that command do? 95 | 96 | What did the program do? Do you know how this works? 97 | 98 | Notice something funny about the output? Explain your observation based on the input data. 99 | 100 | 101 | ### 6. Modify the example ### 102 | 103 | One basic issue with using the common crawl is to determine whether your target sites are in there. Thus, one simple task is to count the domains crawled within 104 | a particular data set. 105 | 106 | Can you modify [ccex.py](ccex.py) to count domains? 107 | 108 | The WAT data in WARC format contains metadata extracted from the crawl for each page. Process the data to extract and count the domain names. Be careful to remove sub-domains 109 | so that variants like `www1.hp.com` and `www2.hp.com` reduce to `hp.com`. 110 | 111 | ### 7. Run it on a cluster ### 112 | 113 | Once you have your script read, you can run it directly on the dataset hosted in AWS. All you need to do is provide a list of the S3 URIs you want to process as the input. 114 | 115 | One simple way to do that is from the path metadata. For example, the first 10 listed is: 116 | 117 | gzip -dc wat.paths.gz | python s3.py | head -n 10 118 | 119 | There is a script called [run.sh](run.sh) that will launch your job on your cluster and it takes the script, the bucket, and the cluster identifier as parameters: 120 | 121 | gzip -dc wat.paths.gz | python s3.py | head -n 10 | ./run-step.sh myscript.py mybucket j-xxxxxxxxxxxxx 122 | 123 | where `j-xxxxxxxxxxxxx` is your cluster identifier. 124 | 125 | ### 8. Discussion ### 126 | 127 | How long will it take to compute the domains for a partition? For the whole crawl date? For the whole data set? 128 | 129 | Does it scale? 130 | 131 | What do you need to do to make it scale? 132 | 133 | -------------------------------------------------------------------------------- /activities/twitter-acquisition/README.md: -------------------------------------------------------------------------------- 1 | # Acquiring Data from Twitter # 2 | 3 | This activity will step you through the process of acquiring data from Twitter and applying different acquisition strategies. 4 | 5 | ## Setup ## 6 | 7 | ### Install Tweepy ### 8 | 9 | The code provided and activities will use the [tweepy](https://github.com/tweepy/tweepy) module. You should install this package: 10 | 11 | pip install tweepy 12 | 13 | ### Create an Application ### 14 | 15 | Twitter data can be accessed over the Web by creating an application on their site and then using the access keys 16 | they provide for the application in your program. 17 | 18 | Note: You will need to have a Twitter account to create an application. 19 | 20 | To create an application, follow this procedure: 21 | 22 | 1. Login to Twitter (https://www.twitter.com/). 23 | 2. 
Visit https://apps.twitter.com and click on "Create New App". 24 | 3. Fill in the application name, description, and Website. The name will be listed in your application list when you return to this Website. 25 | 4. Agree to the terms and agreements and click on "Create your Twitter Application" 26 | 27 | Once you have successfully created an application, it should take you to the newly created application. Here you must create access keys for 28 | subsequent operations by your application. To do so, use the following procedure: 29 | 30 | 1. Click on the "Keys and Access Tokens" tab. 31 | 2. Click on "Create my Access Token" near the bottom of the page. 32 | 33 | The response should be relatively immediate. 34 | 35 | Now you have four things: 36 | 37 | 1. A consumer key that identifies your application. 38 | 2. A consumer secret that acts as a "password" for your application. 39 | 3. An access token that identifies your authorized access. 40 | 4. An access token secret that acts as a "password" for that authorized access. 41 | 42 | At any point, you can revoke the access keys or regenerate any of these values. 43 | 44 | To completely disable the application, you must delete the application. Doing so removes the consumer key, secret, and access tokens from 45 | Twitter's system, and any program using them will immediately stop working. 46 | 47 | ### Test your Application ### 48 | 49 | Use the `hello-twitter.py` program to test your application. Change the code and insert your consumer key, consumer secret, access token, and 50 | access token secret. You should then be able to just run the program and get a few tweets: 51 | 52 | python hello-twitter.py 53 | 54 | ## Data Collection Activities ## 55 | 56 | While real-time data collection is interesting, if you are researching data provided by tweets, search is the simplest way to 57 | collect information - even from the recent past. Instead of collecting information and sorting it ourselves, we'll use 58 | the Twitter search API and partition the collected data by date/time and other facets. 59 | 60 | Also, the [Twitter API is rate limited](https://dev.twitter.com/rest/public/rate-limiting) and so you can't make more than 61 | 180 requests per 15 minutes. Fortunately, the tweepy library that we'll be using handles pausing automatically. With the 62 | partitioning and the automatic handling of rate limiting against the [Twitter REST API](https://dev.twitter.com/rest/public), 63 | we'll be able to just write our code normally and the calls will pause until requests can be made again. 64 | 65 | ### The Tweepy Library ### 66 | 67 | The Tweepy library handles talking directly to the various REST Web services provided by Twitter. Many of the calls 68 | have practical limits to the amount of data that is returned. If you are trying to gather large amounts of data from 69 | Twitter, you'll need to navigate the paged results. 70 | 71 | Tweepy provides a "cursor" functionality that handles the navigation of paged results for you. You simply 72 | wrap your call in a Cursor object: 73 | 74 | for tweet in tweepy.Cursor(api.search,q=q).items(200): 75 | print tweet.text 76 | 77 | In the above example, the 200 tweets are returned from the generator regardless of how many are returned from 78 | each call to a Twitter REST API. 79 | 80 | An example of this is shown in `search.py` where the first 200 tweets are collected for a search term. You'll need to modify 81 | the code to add your consumer key/secret and access token/secret.
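Putting the pieces together, a minimal search script along the lines of `search.py` might look like the sketch below. The key values are placeholders you must replace, and the actual `search.py` may be organized differently:

    import tweepy

    # Placeholders: use the values from your Twitter application
    consumer_key = "YOUR_CONSUMER_KEY"
    consumer_secret = "YOUR_CONSUMER_SECRET"
    access_token = "YOUR_ACCESS_TOKEN"
    access_token_secret = "YOUR_ACCESS_TOKEN_SECRET"

    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)

    # wait_on_rate_limit pauses the calls when the request quota is exhausted
    api = tweepy.API(auth, wait_on_rate_limit=True)

    q = "minecraft"
    for tweet in tweepy.Cursor(api.search, q=q).items(200):
        print tweet.text.encode("utf-8")

The `wait_on_rate_limit` option is what lets the loop pause instead of failing when the 15-minute request quota is used up.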
82 | 83 | ### Activity: Chunking ### 84 | 85 | Suppose you are going to collect information about a particular topic (e.g. a hash tag) from Twitter and you'll be using code 86 | similar to `search.py` to do so. If you remove the `200` parameter to `items()` you'll be accessing all the search results in 87 | as much as Twitter will give you over time via the rate limiting. 88 | 89 | Change the search.py code to output data to a file and limiting the amount of tweets per file. 90 | 91 | Here are some things to consider: 92 | 93 | * What information will you store? 94 | * Tweets are actually complex JSON objects accessible as the '_json' member on the object returned by the tweepy API. Maybe 95 | you should store the JSON? 96 | * What is a syntactically correct json file (see http://www.json.org)? 97 | * Maybe you'll want a nice handler class for the data? 98 | * How do you cancel this possibly long running process and still have the last chunk be syntactically valid? 99 | 100 | Here is some helper code for serialization that relies on the `json` python module: 101 | 102 | class TweetSerializer: 103 | out = None 104 | first = True 105 | count = 0 106 | def start(self): 107 | self.count += 1 108 | fname = "tweets-"+str(self.count)+".json" 109 | self.out = open(fname,"w") 110 | self.out.write("[\n") 111 | self.first = True 112 | 113 | def end(self): 114 | if self.out is not None: 115 | self.out.write("\n]\n") 116 | self.out.close() 117 | self.out = None 118 | 119 | def write(self,tweet): 120 | if not self.first: 121 | self.out.write(",\n") 122 | self.first = False 123 | self.out.write(json.dumps(tweet._json).encode('utf8')) 124 | 125 | ### Activity: Interrupts and Resilience ### 126 | 127 | If you need to shutdown your data collection, you can define an interrupt handler: 128 | 129 | def interrupt(signum, frame): 130 | print "Interrupted, closing ..." 131 | # magic goes here 132 | exit(1) 133 | 134 | signal.signal(signal.SIGINT, interrupt) 135 | 136 | Things to consider: 137 | 138 | * What would you add to your chunking tweet acquisition code to handle interrupts? 139 | * What kind of exceptions might be thrown? 140 | * What kinds of errors might Tweepy or Twitter give you? 141 | * How do you make your process resilient? 142 | 143 | 144 | ### Activity: Partitioning Data on Facets ### 145 | 146 | While it may be convenient for the programmer to write out a fixed number of tweets per file, it might be more 147 | useful to partition the tweets on facets based on your data collection. For example, if you are collecting tweets over 148 | a specific period of time, treating the data as a time-series data set might make sense. As such, the partition or API use 149 | would use time to limit the results stored in each file. 150 | 151 | Twitter has two useful [search query operators](https://dev.twitter.com/rest/public/search): 152 | 153 | * until:{date} - limits the result to those up to a specific date 154 | * since:{date} - limits the results to those after a specific date 155 | 156 | These two operators can be used together to define a particular day. For example: 157 | 158 | minecraft since:2015-01-10 until:2015-01-11 159 | 160 | which you can view on the [twitter website](https://twitter.com/search?q=minecraft%20since%3A2015-01-10%20until%3A2015-01-11). 161 | 162 | Questions to consider: 163 | 164 | * How would you change your search program to use facets of the tweets for partitioning to retrieve data for a specific time period (e.g. a week)? 
169 | 170 | -------------------------------------------------------------------------------- /activities/common-crawl/README.md: -------------------------------------------------------------------------------- 1 | # Common Crawl Exemplar # 2 | 3 | This activity will step you through the process of running various Map/Reduce (MR) processes 4 | on the [Common Crawl](http://commoncrawl.org/) data set hosted by [AWS](http://aws.amazon.com). 5 | 6 | In this activity you will: 7 | 8 | 1. Install various supporting tools for running MR processes via [mrjob](https://github.com/Yelp/mrjob) and [AWS EMR](http://aws.amazon.com/elasticmapreduce/). 9 | 2. Process data locally using mrjob. 10 | 3. Run the same process on AWS EMR. 11 | 4. Ensure you have the correct development environment to do all of the above. 12 | 13 | This activity is divided into two parts. In the first part, you'll run the example code locally. Afterwards, you can set up an AWS account and role so that you can run the same 14 | process on AWS in the cloud. 15 | 16 | We will be running the "[Tag Counter](https://github.com/commoncrawl/cc-mrjob#running-the-code)" over portions of the Common Crawl data set. 17 | 18 | # General Setup # 19 | 20 | ## Shell Access ## 21 | 22 | Most of the following code uses shell commands. You should become familiar with running commands from the shell and make sure you have an environment that 23 | matches your deployment environment (likely Linux). You can run a Linux OS locally via technology like [Virtual Box](https://www.virtualbox.org). 24 | 25 | ## Get the Code via Git ## 26 | 27 | You need git installed and a [github](http://github.com) account - and you're already on their site, so sign up if you haven't already done so. Then clone the 28 | code for the [Common Crawl - mrjob starter kit](https://github.com/commoncrawl/cc-mrjob): 29 | 30 | git clone https://github.com/commoncrawl/cc-mrjob.git 31 | 32 | This will download the code into whatever directory you are in when you issue that command. You should then have a directory called 'cc-mrjob'. The setup from now on 33 | will assume you are in the same directory. 34 | 35 | If you do not have this repository, clone it into a parallel directory: 36 | 37 | git clone https://github.com/alexmilowski/data-science.git 38 | 39 | You should now have two parallel directories: 40 | 41 | .../cc-mrjob/ 42 | .../data-science/ 43 | 44 | Copy these files from `data-science/activities/common-crawl` to the `cc-mrjob` directory: 45 | 46 | mrcc.py 47 | mrcc.py.tar.gz 48 | mrjob.conf 49 | 50 | Note: The modified code just fixes issues with pulling the Common Crawl data from S3, and the `mrjob.conf` is an EMR configuration that is a bit more specific to this activity. 51 | 52 | 53 | ## Setup Python ## 54 | 55 | You should install [Python 2.7.8](https://www.python.org/download/releases/2.7.8/) locally so you can run this example. If you have previous versions 56 | of Python, you may run into compatibility issues (e.g. don't use 2.6.x). In addition, Python 3.0 has many changes that also may be problematic. 57 | 58 | You may find a Python IDE useful but you should ensure you can run Python from the command line properly.
Also, installing multiple versions of Python is not recommended. 59 | 60 | Once you've gotten your Python install sorted, load the packages for the activity via pip: 61 | 62 | pip install -r requirements.txt 63 | 64 | Note: Depending on how you have install various bits, you may need a "sudo" in front of that. 65 | 66 | # Run it Locally # 67 | 68 | ## Requirements ## 69 | 70 | You'll need good bandwidth to download the various data. 71 | 72 | ## Get the Data ## 73 | 74 | There is a script that uses `wget` to download various content from the hosted dataset on S3: 75 | 76 | ./get-data.sh 77 | 78 | If you are on a Mac or Windows, you'll likely need to install wget. If you use Mac Ports, you can install wget via: 79 | 80 | sudo port install wget 81 | 82 | Otherwise, the datasets for this activity are located at: 83 | 84 | https://aws-publicdatasets.s3.amazonaws.com/common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00000-ip-10-180-136-8.ec2.internal.warc.gz 85 | https://aws-publicdatasets.s3.amazonaws.com/common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/wat/CC-MAIN-20140820021320-00000-ip-10-180-136-8.ec2.internal.warc.wat.gz 86 | https://aws-publicdatasets.s3.amazonaws.com/common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/wet/CC-MAIN-20140820021320-00000-ip-10-180-136-8.ec2.internal.warc.wet.gz 87 | 88 | The various subsequent scripts expect a subdirectory structure of: 89 | 90 | common-crawl/ 91 | crawl-data/ 92 | CC-MAIN-2014-35/ 93 | segments/ 94 | 1408500800168.29/ 95 | warc/ 96 | CC-MAIN-20140820021320-00000-ip-10-180-136-8.ec2.internal.warc.gz 97 | wat/ 98 | CC-MAIN-20140820021320-00000-ip-10-180-136-8.ec2.internal.warc.wat.gz 99 | wet/ 100 | CC-MAIN-20140820021320-00000-ip-10-180-136-8.ec2.internal.warc.wet.gz 101 | 102 | ## Run the Code ## 103 | 104 | To run the code, do the following: 105 | 106 | python absolutize_path.py < input/test-1.warc | python tag_counter.py -r local --conf-path mrjob.conf --no-output --output-dir out 107 | 108 | The first python script just turns a relative path into an absolute path. The second python uses that path as input via stdin and then runs the Map/Reduce process locally via mrjob. 109 | 110 | The output is in the file `out/part-00000`. 111 | 112 | 113 | # Run it on AWS EMR # 114 | 115 | If you have not signed up for AWS, you'll need to do that first by visiting http://aws.amazon.com/ 116 | 117 | ## AWS Setup ## 118 | 119 | If you do not have a user/group with access to EMR, you'll need to do the following procedure. 120 | 121 | First, you need to setup a user to run EMR: 122 | 123 | 1. Visit http://aws.amazon.com/ and sign up for an account. 124 | 2. Select the "Identity and Access Management" (or IAM) from your console or visit https://console.aws.amazon.com/iam/home 125 | 3. Select "Users" from the list on the left. 126 | 3. Click on the "Create New Users" 127 | 4. Enter a user name for yourself and create the user. 128 | 5. The next screen will give you an option to download the credentials for this user. Do so and store them in a safe place. You will not be able to retrieve them again. 129 | 130 | Second, you need to create a group with the right roles: 131 | 132 | 1. Select "Groups" from the list on the left. 133 | 2. Click on "Create New Group". 134 | 3. Enter a name and click on "Next Step". 135 | 4. Scroll down to "Amazon Elastic MapReduce Full Access" click on "Select". 136 | 5. Once the policy document is displayed, click on "Next Step". 137 | 6. 
Click on "Create Group" to create the group. 138 | 139 | Third, you need to assign your user to the group: 140 | 141 | 1. Select the check box next to your group. 142 | 2. Click on the "Group Actions" drop-down menu and click on "Add Users to Group". 143 | 3. Select your user by clicking on the check box. 144 | 4. Click on "Add Users". 145 | 146 | ## Configure mrjob ## 147 | 148 | You need to configure mrjob to access your AWS account: 149 | 150 | 1. Edit the mrjob.conf 151 | 2. Locate the `#aws_access_key_id:` and `#aws_secret_access_key:` lines. 152 | 3. Remove the hash (#) and add your AWS key and secret after the colon (:). You should have these from previously creating the user. 153 | 154 | ## Setup an Output Bucket on S3 ## 155 | 156 | You need to create an output bucket on S3 for the results of your computation: 157 | 158 | 1. Go to https://aws.amazon.com/ in your browser. 159 | 2. Click on the 'S3' service link. 160 | 3. Click on the 'Create Bucket' button. 161 | 4. Enter a name and hit create. 162 | 163 | Keep in mind that the bucket name is unique to all of Amazon. If you use some common name, it is likely to clash with other 164 | users. One suggestion is to use a common prefix (e.g. a domain name) for all your bucket names. 165 | 166 | ## Run the Code on EMR ## 167 | 168 | In the previous step, you created an output bucket. In the example below, replace `{your-bucket-name}` with the name of the bucket you created. 169 | 170 | To run the tag count on EMR for one input, do the following: 171 | 172 | time python tag_counter.py -r emr --conf-path mrjob.conf --python-archive mrcc.py.tar.gz --no-output --output-dir s3://{your-bucket-name}/cc-test-1 --source s3 input/test-1.warc 173 | -------------------------------------------------------------------------------- /activities/intro-to-spark/README.md: -------------------------------------------------------------------------------- 1 | # Introduction to Spark # 2 | 3 | ## Setup ## 4 | 5 | ### Installing Spark ### 6 | 7 | 1. Visit the Spark [Downloads](https://spark.apache.org/downloads.html) page. 8 | 2. Select "1.3.0" from the first list box. 9 | 3. Select "Pre-built for Hadoop 2.4 and later". 10 | 4. Leave "Select Apache Mirror" alone. 11 | 5. Click on the link in #4 12 | 6. When the result page loads, click on the suggested mirror to download Spark. 13 | 14 | Once you have downloaded Spark, just unpack the directory somewhere convenient. We'll be using the executable directly from the distribution. 15 | 16 | We'll use the environment variable `$SPARK_HOME` throughout this example. You should define it to be where you unpacked the Spark distribution: 17 | 18 | export SPARK_HOME=~/workspace/spark-1.3.0-bin-hadoop2.4/ 19 | 20 | You should install psutil as well: 21 | 22 | pip install psutil 23 | 24 | ### Preparing Sample Data ### 25 | 26 | We'll be using the same conference data from the [Organizing Acquired Data](../../assignments/organizing-tweets/) assignment. 
We will prepare the data by writing each tweet onto a single line: 27 | 28 | python one-line-json.py < ../../assignments/organizing-tweets/prague-2015-02-14.json > 2015-02-14.txt 29 | python one-line-json.py < ../../assignments/organizing-tweets/prague-2015-02-15.json > 2015-02-15.txt 30 | 31 | We'll also use some randomly generated files: 32 | 33 | curl "http://svnweb.freebsd.org/csrg/share/dict/words?view=co&content-type=text/plain" > words 34 | mkdir random 35 | python random-text.py 1000000 10 < words > random/random-0.txt 36 | python random-text.py 1000000 10 < words > random/random-1.txt 37 | python random-text.py 1000000 10 < words > random/random-2.txt 38 | python random-text.py 1000000 10 < words > random/random-3.txt 39 | python random-text.py 1000000 10 < words > random/random-4.txt 40 | python random-text.py 1000000 10 < words > random/random-5.txt 41 | python random-text.py 1000000 10 < words > random/random-6.txt 42 | python random-text.py 1000000 10 < words > random/random-7.txt 43 | python random-text.py 1000000 10 < words > random/random-8.txt 44 | python random-text.py 1000000 10 < words > random/random-9.txt 45 | 46 | 47 | ## Activity - Run some examples ## 48 | 49 | ### Hello World - Word Count ### 50 | 51 | The classic "hello world" of map/reduce is a simple word count. An example implementation is in [wordcount.py](wordcount.py) and can be run as follows: 52 | 53 | $SPARK_HOME/bin/spark-submit wordcount.py "random/random-*.txt" 54 | 55 | This will run the word count over the randomly generated data (from the setup) of 100 million words. 56 | 57 | The path used to create the RDD contains a wild card and is effectively the same as: 58 | 59 | lines = sc.textFile("random/random-*.txt", 1) 60 | 61 | and the wild card allows Spark to access all the generated data files. 62 | 63 | The code is straightforward and starts with splitting the lines of text into words: 64 | 65 | lines.flatMap(lambda x: x.split()) 66 | 67 | then mapping each word to a pair of the word and a count of one: 68 | 69 | .map(lambda word: (word, 1)) 70 | 71 | and finally reducing the pairs by key using summation: 72 | 73 | .reduceByKey(lambda a,b : a + b) 74 | 75 | ### Word Count over Tweets ### 76 | 77 | We can change the first actions on the RDD in the word count example and have it operate on tweet text. The tweet data has been prepared with one 78 | JSON tweet object per line in `2015-02-14.txt` and `2015-02-15.txt` (see Setup). 79 | 80 | The first lines look something like: 81 | 82 | lines.map(lambda line: json.loads(line)) \ 83 | .flatMap(lambda tweet: tweet["text"].split()) 84 | 85 | which loads the JSON object and splits the "text" property instead of the whole line. 86 | 87 | The code is in [tweet-wordcount.py](tweet-wordcount.py) and can be run by: 88 | 89 | $SPARK_HOME/bin/spark-submit tweet-wordcount.py "2015-02-*.txt" 90 | 91 | ### Understanding Scaling ### 92 | 93 | By default, you are running Spark locally. You can specify the "master" by the `--master` option which takes a URI. 94 | 95 | A special value of "local[n]" allows you to control the number of workers in your local cluster and can give you an 96 | idea of "speed-up via parallelization" (within the limits of your hardware). 97 | 98 | Try the following experiment: 99 | 100 | time $SPARK_HOME/bin/spark-submit --master local[1] wordcount.py "random-large-*.txt" 101 | 102 | and note the time. Now remove the `--master` option and do the same. It should take less time as Spark will attempt 103 | to guess the correct number of local resources for your hardware.
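To make the comparison systematic, a small shell loop can run the same job with an increasing number of local workers (a sketch; it assumes the same input pattern used above and that `$SPARK_HOME` is set):

    for n in 1 2 3 4 5 6; do
        echo "workers: local[$n]"
        time $SPARK_HOME/bin/spark-submit --master local[$n] wordcount.py "random-large-*.txt"
    done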
104 | 105 | As you increase `local[1]` to `local[2]` through `local[6]`, note the times. Is there a limit to the 106 | increase in speed as you add more workers? 107 | 108 | You can try the same experiments later by creating actual clusters of various sizes. The only change would be 109 | the value for the `--master` option. 110 | 111 | 112 | ## Activity - Problem Solving ## 113 | 114 | The tweet data we prepared is from a conference. How can we use Spark to answer the following questions? 115 | 116 | 1. Who tweeted the most during the conference? 117 | 2. What were the top 10 hash tags used? 118 | 3. For a particular hour, how many tweets were produced? 119 | 120 | ## Activity - Deploying to Clusters ## 121 | 122 | ### Spark on EC2 ### 123 | 124 | #### Overview #### 125 | 126 | You can start a standalone Spark cluster on EC2 using the program `spark-ec2` located in the `ec2` directory of the spark distribution. You'll need: 127 | 128 | * your key name 129 | * your local key (e.g. .pem file) 130 | * a preferred zone 131 | * your AWS key and secret 132 | 133 | You'll need to set up two environment variables to contain your AWS credentials: 134 | 135 | export AWS_SECRET_ACCESS_KEY=xxxxxxxxx 136 | export AWS_ACCESS_KEY_ID=xxxxxxxx 137 | 138 | You will need to make sure your access key is allowed to start EC2 instances. You may need to modify the policy for the access key in "Identity and Access Management" and at minimum you'll 139 | want: 140 | 141 | { 142 | "Version": "2012-10-17", 143 | "Statement": [ 144 | { 145 | "Sid": "Stmtnnnnnn", 146 | "Effect": "Allow", 147 | "Action": [ 148 | "ec2:*" 149 | ], 150 | "Resource": [ 151 | "*" 152 | ] 153 | } 154 | ] 155 | } 156 | 157 | You can create this policy by clicking on "Create Another Policy" when viewing the group. Use the policy generator and select "Amazon EC2" from the "AWS Service", 158 | select "All Actions" for "Actions", and enter "*" for "Amazon Resource Name (ARN)". This is the most liberal policy and you can certainly restrict it from there. 159 | 160 | A simple cluster can then be launched as follows: 161 | 162 | $SPARK_HOME/ec2/spark-ec2 -k yourkey -i yourkey.pem -s 3 -t m3.medium -z us-east-1c --copy-aws-credentials launch "Spark Test" 163 | 164 | At the very end you'll see the master hostname and you can visit this in your browser: 165 | 166 | http://ec2-nn-nn-nn-nn.compute-1.amazonaws.com:8080/ 167 | 168 | Spark jobs are run from the master node of the cluster. You can log in (ssh) via: 169 | 170 | $SPARK_HOME/ec2/spark-ec2 -k yourkey -i yourkey.pem login "Spark Test" 171 | 172 | Finally, you can terminate your cluster: 173 | 174 | $SPARK_HOME/ec2/spark-ec2 -k yourkey -i yourkey.pem destroy "Spark Test" 175 | 176 | Running a job requires two things: 177 | 178 | 1. Your code (driver) must be transferred to the master node. 179 | 2. Your data must be accessible by all nodes (copied to each node, put into HDFS or S3, etc.) 180 | 181 | #### Testing #### 182 | 183 | First let's try transferring our data and code to the master node: 184 | 185 | scp -i yourkey.pem wordcount.py root@ec2-nn-nn-nn-nn.compute-1.amazonaws.com:~ 186 | scp -i yourkey.pem random/random-0.txt root@ec2-nn-nn-nn-nn.compute-1.amazonaws.com:~ 187 | 188 | Note: We'll only use the first set of random words to minimize network bandwidth use.
189 | 190 | Then log in: 191 | 192 | $SPARK_HOME/ec2/spark-ec2 -k yourkey -i yourkey.pem login "Spark Test" 193 | 194 | Run a job: 195 | 196 | time spark/bin/spark-submit --master spark://ec2-nn-nn-nn-nn.compute-1.amazonaws.com:7077 wordcount.py random-0.txt > /dev/null 197 | 198 | Now we can copy that same file to S3 from your local machine: 199 | 200 | aws s3 cp random/random-0.txt s3://mybucket/random/random-0.txt 201 | 202 | and try the same job with an S3 URI (note the use of s3n): 203 | 204 | time spark/bin/spark-submit --master spark://ec2-nn-nn-nn-nn.compute-1.amazonaws.com:7077 wordcount.py s3n://mybucket/random/random-0.txt > /dev/null 205 | 206 | You should see a notable difference in processing time as S3 is far slower than local files. 207 | 208 | ### Spark on EMR ### 209 | 210 | TBD ... yarn, yarn, yarn 211 | -------------------------------------------------------------------------------- /activities/decision-trees/decision-tree.svg: -------------------------------------------------------------------------------- (SVG diagram; only stray text labels - X, A, B, C, and D - survive in this text dump, so the figure content is omitted here.) -------------------------------------------------------------------------------- /activities/text-processing-with-nltk/README.md: -------------------------------------------------------------------------------- 1 | # Text Processing with NLTK # 2 | 3 | ## Setup ## 4 | 5 | NLTK is a Python module for processing "natural languages". It also contains supporting data files 6 | (e.g., stop word lists by language) necessary for some of the algorithms to function. 7 | 8 | To install NLTK for yourself, do the following: 9 | 10 | pip install nltk 11 | python -m nltk.downloader all 12 | 13 | If you are on a Mac OS X / Linux system, you may want to install the NLTK module for everyone: 14 | 15 | sudo pip install nltk 16 | sudo python -m nltk.downloader -d /usr/share/nltk_data all 17 | 18 | To test that you've got everything installed: 19 | 20 | from nltk.book import * 21 | text1.concordance("whale") 22 | 23 | should print a list of phrases in Moby Dick that contain the word 'whale'. 24 | 25 | ## Basics of Tokenization ## 26 | 27 | Many algorithms for processing text require taking passages of text and turning them into sentences and words. The process of doing so is very 28 | specific to the language being processed and possibly influenced by how the text was collected or the genre of communication. 29 | 30 | In general, languages like English, Spanish, and other modern European languages are directly supported by the corpus of configuration data 31 | provided by NLTK. These languages also share common mechanisms for simple tokenization into sentences and words. 32 | 33 | A passage of text, like the above, can first be broken down into sentences and then into words: 34 | ``` 35 | import nltk 36 | 37 | text = '''Many algorithms for processing text require taking passages of text and turning them into sentences 38 | and words.
The process of doing so is very specific to the language being processed and possibly influenced 39 | by how the text was collected or the genre of communication.''' 40 | 41 | sentences = nltk.tokenize.sent_tokenize(text) 42 | 43 | for s in sentences: 44 | words = nltk.tokenize.word_tokenize(s) 45 | print words 46 | ``` 47 | Notice how the punctuation of the sentences is mixed in with the words. Tokenization doesn't take into account any 48 | syntax that might be present. As such, text that contains any kind of annotation, URLs, etc. may need to be filtered 49 | when turned into word tokens. 50 | 51 | Further, words can be annotated independently for their "parts of speech" (POS): 52 | 53 | import nltk 54 | 55 | s = "The quick brown fox jumped over the fence." 56 | words = nltk.tokenize.word_tokenize(s) 57 | nltk.pos_tag(words) 58 | 59 | which should produce: 60 | 61 | [('The', 'DT'), ('quick', 'NN'), ('brown', 'NN'), ('fox', 'NN'), ('jumped', 'VBD'), ('over', 'IN'), ('the', 'DT'), ('fence', 'NN'), ('.', '.')] 62 | 63 | Each of the codes can be looked up in the help: 64 | 65 | >>> nltk.help.upenn_tagset('DT') 66 | DT: determiner 67 | all an another any both del each either every half la many much nary 68 | neither no some such that the them these this those 69 | >>> nltk.help.upenn_tagset('NN') 70 | NN: noun, common, singular or mass 71 | common-carrier cabbage knuckle-duster Casino afghan shed thermostat 72 | investment slide humour falloff slick wind hyena override subhumanity 73 | machinist ... 74 | 75 | ## Stopwords ## 76 | 77 | Many languages contain words that occur very often (e.g., "the" or "a" in English) and their frequent use will 78 | overwhelm more interesting words useful in analysis. A common technique is to use a stop word list to exclude 79 | such common words from further processing. 80 | 81 | NLTK supports stop words for a number of languages and they are accessed as: 82 | 83 | stopWords = nltk.corpus.stopwords.words('english') 84 | >>> stopWords 85 | [u'i', u'me', u'my', u'myself', u'we', u'our', u'ours', u'ourselves', u'you', u'your', u'yours', 86 | u'yourself', u'yourselves', u'he', u'him', u'his', u'himself', u'she', u'her', u'hers', u'herself', 87 | u'it', u'its', u'itself', u'they', u'them', u'their', u'theirs', u'themselves', u'what', u'which', 88 | u'who', u'whom', u'this', u'that', u'these', u'those', u'am', u'is', u'are', u'was', u'were', u'be', 89 | u'been', u'being', u'have', u'has', u'had', u'having', u'do', u'does', u'did', u'doing', u'a', u'an', 90 | u'the', u'and', u'but', u'if', u'or', u'because', u'as', u'until', u'while', u'of', u'at', u'by', 91 | u'for', u'with', u'about', u'against', u'between', u'into', u'through', u'during', u'before', 92 | u'after', u'above', u'below', u'to', u'from', u'up', u'down', u'in', u'out', u'on', u'off', u'over', 93 | u'under', u'again', u'further', u'then', u'once', u'here', u'there', u'when', u'where', u'why', u'how', 94 | u'all', u'any', u'both', u'each', u'few', u'more', u'most', u'other', u'some', u'such', u'no', u'nor', 95 | u'not', u'only', u'own', u'same', u'so', u'than', u'too', u'very', u's', u't', u'can', u'will', u'just', 96 | u'don', u'should', u'now'] 97 | 98 | The `nltk.corpus.stopwords` module just returns a simple list of words you can use in your own code.
For example, 99 | a simple list comprehension can be used to filter a list of words: 100 | 101 | stopWords = nltk.corpus.stopwords.words('english') 102 | filtered = [e.lower() for e in words if not e.lower() in stopWords] 103 | 104 | and another trick is to add your list of punctuation to the stop word list: 105 | 106 | stopWords = nltk.corpus.stopwords.words('english') + ['.',','] 107 | filtered = [e.lower() for e in words if not e.lower() in stopWords] 108 | 109 | The languages supported by NLTK can be discovered by inspecting the `nltk.corpus.stopwords` object: 110 | 111 | >>> nltk.corpus.stopwords 112 | 113 | 114 | The corpus reader's output shows the directory in which the stop words are stored. You can list the supported languages: 115 | 116 | $ ls /usr/share/nltk_data/corpora/stopwords 117 | README english german norwegian spanish 118 | danish finnish hungarian portuguese swedish 119 | dutch french italian russian turkish 120 | $ head -n 10 /usr/share/nltk_data/corpora/stopwords/english 121 | i 122 | me 123 | my 124 | myself 125 | we 126 | our 127 | ours 128 | ourselves 129 | you 130 | your 131 | 132 | The files contain a single word per line. As such, you can create or modify a stop word list for any language and add it to NLTK. 133 | 134 | ## Frequency Distributions ## 135 | 136 | A frequency distribution can be constructed by passing a list as a constructor parameter: 137 | 138 | import nltk 139 | words = [ 'A', 'A', 'B', 'B', 'B', 'C'] 140 | fd = nltk.FreqDist(words) 141 | fd.tabulate() 142 | 143 | produces the output: 144 | 145 | B A C 146 | 3 2 1 147 | 148 | You can also produce a visual plot by calling `plot()`. 149 | 150 | A frequency distribution can be constructed iteratively as well: 151 | 152 | fd = nltk.FreqDist() 153 | for w in words: 154 | fd[w.lower()] += 1 155 | 156 | or via a comprehension: 157 | 158 | fd = nltk.FreqDist(w.lower() for w in words) 159 | 160 | ## Stemming and Lemmatization ## 161 | 162 | Stemming: the process for reducing inflected (or sometimes derived) words to their stem, base or root form. 163 | 164 | Lemmatization: the process of grouping together the different inflected forms of a word so they can be analysed as a single item. 165 | 166 | NLTK supports: 167 | 168 | * [Porter Stemming](http://tartarus.org/martin/PorterStemmer/) 169 | * [Lancaster Stemming](http://www.comp.lancs.ac.uk/computing/research/stemming/) 170 | * [Snowball Stemming](http://snowball.tartarus.org) 171 | * Lemmatization based on [WordNet’s built-in morphy function](http://wordnet.princeton.edu) 172 | 173 | For stemming, you construct a stemmer and then call `stem()` on the word: 174 | 175 | from nltk.stem.lancaster import LancasterStemmer 176 | stemmer = LancasterStemmer() 177 | w = stemmer.stem('presumably') # returns u'presum' 178 | 179 | In the above, you can use `nltk.stem.porter.PorterStemmer`, `nltk.stem.lancaster.LancasterStemmer`, or `nltk.stem.SnowballStemmer`. 180 | 181 | Lemmatization is similar: 182 | 183 | from nltk.stem import WordNetLemmatizer 184 | lemmatizer = WordNetLemmatizer() 185 | lemmatizer.lemmatize('dogs') # returns u'dog' 186 | 187 | but the lemmatizer assumes by default everything is a noun. For verbs, this means that results are not lemmatized 188 | properly (e.g., "are" and "is" do not become "be").
189 | 190 | For example, try: 191 | 192 | from nltk.stem import WordNetLemmatizer 193 | lemmatizer = WordNetLemmatizer() 194 | lemmatizer.lemmatize('is',pos='v') 195 | lemmatizer.lemmatize('are',pos='v') 196 | 197 | The `pos` argument can have the following values: 198 | 199 | * 'a' - adjective 200 | * 'r' - adverb 201 | * 'n' - noun 202 | * 'v' - verb 203 | 204 | ## Activity ## 205 | 206 | Pick a passage of text and: 207 | 208 | 1. Tokenize the text. 209 | 2. List all the nouns in the passage. 210 | 3. Apply a stop word filter to the tokenized text. 211 | 4. Compute and plot a frequency distribution of the top 50 words. 212 | 5. Apply a lemmatization algorithm with the pos argument set to 'n' and recompute your frequency distribution. 213 | 214 | -------------------------------------------------------------------------------- /activities/data-munging/README.md: -------------------------------------------------------------------------------- 1 | # Data Munging - Processing JSON, XML, and CSV Data # 2 | 3 | ## CWOP Data Set ## 4 | 5 | The Citizen Weather Observation Program (CWOP) collects weather data from a variety of citizen, business, and government 6 | sources over the Internet. It collects over 75,000 weather reports an hour from 10,000+ weather stations located all over 7 | the world but mostly concentrated in North America. 8 | 9 | The data collected is transmitted as APRS weather reports (need ref) in a coded format that is eventually disseminated 10 | via a real-time peer-to-peer network using a system called APRS-IS (need ref). This information can be received and decoded 11 | by attaching to several of the servers associated with the CWOP program and aggregating the results. 12 | 13 | The [mesonet.info](http://www.mesonet.info) collects and aggregates this data. The data acquisition process first 14 | serializes the data collected from each server into 5 minute segments stored in a custom XML format: 15 | 16 | 17 | 18 | 19 | 20 | ... 21 | 22 | 23 | Each weather report has an identifier (@from), a location (@latitude and @longitude), a received time (@received), a generation time from the weather station (@at), and a 24 | variety of weather report facets (e.g., @temperature). These facets for the weather reports and their units of measure are listed below: 25 | 26 | wind-dir 27 | wind-speed 28 | wind-gust 29 | temperature 30 | rain-hour 31 | rain-24hours 32 | rain-midnight 33 | humidity 34 | pressure 35 | 36 | An excerpt of this data for 2014-12-26 has been stored on AWS S3 in the public bucket `milowski-cwop-data`. It is organized first by date (e.g., 2014-12-26) and then by format and hour. The 37 | raw XML data has been transformed into JSON (geo JSON?) and CSV data formats as well. Each of the variations is located in the 'xml', 'json', or 'csv' "directories" in S3. 38 | 39 | For example, 2014-12-26 from 13:00 to 14:00 in JSON is located in: 40 | 41 | s3://milowski-cwop-data/2014-12-26/json/13:00/ 42 | 43 | Each key (file) represents a 5 minute segment of data partitioned only by time. The reports can only be organized by location by selecting the subset of information 44 | amongst all the various sources stored under the same key (directory).
45 | 46 | For example, you'll find the full keys for the data set as follows: 47 | 48 | s3://milowski-cwop-data/2014-12-26/json/13:00/weather-cwop1-2014-12-26T13:00:00Z.json 49 | s3://milowski-cwop-data/2014-12-26/json/13:00/weather-cwop1-2014-12-26T13:05:00Z.json 50 | s3://milowski-cwop-data/2014-12-26/json/13:00/weather-cwop1-2014-12-26T13:10:00Z.json 51 | ... 52 | s3://milowski-cwop-data/2014-12-26/json/13:00/weather-cwop2-2014-12-26T13:00:00Z.json 53 | s3://milowski-cwop-data/2014-12-26/json/13:00/weather-cwop2-2014-12-26T13:05:00Z.json 54 | s3://milowski-cwop-data/2014-12-26/json/13:00/weather-cwop2-2014-12-26T13:10:00Z.json 55 | ... 56 | 57 | The names of the keys encode the source server and start of the time segment. This information is only repeated in the XML source and not in the JSON or CSV formats. 58 | 59 | ## Activities ## 60 | 61 | In this activity, we will be: 62 | 63 | * downloading copies of the data via S3 64 | * processing a variety of data formats (i.e., XML, JSON, and CSV) 65 | * computing simple statistics or subsets 66 | * accessing data directly via S3 via boto 67 | 68 | In general, we'll be computing two things: 69 | 70 | * an average (e.g., average temperature) 71 | * geospatial subsets for rectangular areas (quadrangles) 72 | 73 | ### A. Making a Copy ### 74 | 75 | #### Description #### 76 | 77 | The data is available on S3 and you can download a copy (or a subset) easily via the AWS CLI. Keep in mind that S3 is a key/value store. All the data is associated with 78 | the full path of the key. The concept of a "directory" and contained "files" is only implied by the "/" in the key and so is an interpretation of the tool being used. 79 | 80 | Fortunately, the AWS CLI interprets directories in keys as you might expect. Try the following: 81 | 82 | aws s3 ls s3://milowski-cwop-data/2014-12-26/json/13:00/ 83 | 84 | When you run that command, you should see the complete listing of 79 keys (files). 85 | 86 | You can copy a single file or directory to your local drive via the same base command. To copy a file locally, try: 87 | 88 | aws s3 cp s3://milowski-cwop-data/2014-12-26/json/13:00/weather-cwop1-2014-12-26T13:00:00Z.json . 89 | 90 | If you want to copy a whole directory, try: 91 | 92 | aws s3 cp s3://milowski-cwop-data/2014-12-26/json/13:00 . --recursive 93 | 94 | #### Tasks #### 95 | 96 | 1. Pick an particular hour (e.g., 13:00) 97 | 2. Copy the remote buckets for all the formats (i.e., 'xml', 'json', 'csv') to your local disk. 98 | 99 | 100 | ### B. Parsing XML: Computing an Average ### 101 | 102 | #### Description #### 103 | 104 | In this activity you'll be parsing XML data sources and computing an average temperature. You will want to iterate a set of XML documents in a directory, parsing each XML source, 105 | and interpret the @temperature attribute as a real number measuring temperature in Fahrenheit. You should compute an average over all weather reports in all the documents you process. 106 | 107 | You can parse XML using Python's built in [xml.etree module](https://docs.python.org/2/library/xml.etree.elementtree.html); see [xml-parse.py](xml-parse.py). 108 | 109 | #### Tasks #### 110 | 111 | 1. Pick a particular hour. 112 | 2. Parse all the XML files in python and sum the temperature values for every observed weather report. 113 | 3. Calculate the average temperature for that hour for all the CWOP data received. 114 | 115 | ### C. 
Parsing JSON: Geospatial Partitioning ### 116 | 117 | #### Description #### 118 | 119 | The CWOP XML data has been translated into [geojson](http://geojson.org). The data is received in whatever order the weather stations report it, but it can be filtered for a specific region. 120 | We'll parse the weather data as JSON and select only those reports that occur within a specific quadrangle. 121 | 122 | #### Tasks #### 123 | 124 | 1. Pick a particular hour. 125 | 2. Parse all the JSON files and select the temperature values that occur within the quadrangle \[-125, 40, -120, 35 \] (upper left, lower right). 126 | 3. Calculate the average temperature for that hour for that region. 127 | 128 | ### D. Parsing CSV: Grid Averages ### 129 | 130 | #### Description #### 131 | 132 | Comma Separated Values (CSV) is a very common but non-standardized data format. The CWOP data set has been transformed into a simple set of CSV data files. You should attempt to partition the data 133 | by quadrangles and produce a temperature summary for each quadrangle covering the continental USA (i.e., \[-125, 45, -65, 25\]). A partitioning by 5° quadrangles will produce a 134 | 12 by 4 grid over the region. 135 | 136 | CSV data can be easily parsed in Python using the [csv module](https://docs.python.org/2/library/csv.html); see [csv-dump.py](csv-dump.py). 137 | 138 | #### Tasks #### 139 | 140 | 1. Pick a particular hour. 141 | 2. Parse all the CSV files and select the subset within the region. Assign reports to grid cells. 142 | 3. Calculate the average temperature for each grid cell. 143 | 144 | 145 | ### E. Direct Access to S3 via boto ### 146 | 147 | #### Description #### 148 | 149 | You can access S3 in python via the [boto module](http://boto.readthedocs.org/en/latest/s3_tut.html). There are samples for outputting a key value ([s3cat.py](s3cat.py)), 150 | copying a file into s3 ([s3copy.py](s3copy.py)), and listing the keys in a bucket ([s3list.py](s3list.py)). 151 | 152 | You need to set environment variables for the code to work as it needs your AWS key and secret: 153 | 154 | export AWS_ACCESS_KEY_ID=... 155 | export AWS_SECRET_ACCESS_KEY=... 156 | 157 | The documentation is [available online](http://boto.readthedocs.org/en/latest/ref/s3.html). 158 | 159 | #### Activity #### 160 | 161 | You can repeat any of the activities above by accessing the data directly. 162 | 163 | 1. Pick a previous activity for which you have working code. 164 | 2. Modify the activity to read the list of files out of the bucket. 165 | 3. Process the data directly by either temporarily storing the files locally or loading the contents into strings. 166 | 167 | Note: You can list a subset of keys in a bucket by using the `prefix` parameter. See [s3list.py](s3list.py) for an example. 168 | 169 | 170 | 171 | -------------------------------------------------------------------------------- /activities/common-crawl/tag-count-mr.svg: -------------------------------------------------------------------------------- (SVG diagram of the tag-count Map/Reduce flow; the surviving text labels are "list of files", "...file.warc.gz", "web resource + headers", "parse + tag count", "reduce tag count", and "List of tag/count pairs". The SVG markup itself is not preserved in this text dump.)
-------------------------------------------------------------------------------- /activities/emr-cluster/README.md: -------------------------------------------------------------------------------- 1 | # Creating Clusters for EMR # 2 | 3 | ## Setup ## 4 | 5 | Install the [AWS CLI](http://docs.aws.amazon.com/cli/latest/userguide/) with 6 | 7 | sudo pip install awscli 8 | 9 | You may need to link to the executable: 10 | 11 | sudo ln -s /opt/local/Library/Frameworks/Python.framework/Versions/2.7/bin/aws /opt/local/bin/aws 12 | 13 | Then configure your system with the AWS key, secret, and default region (e.g. us-east-1). You can leave the default output format blank. 14 | 15 | aws configure 16 | 17 | You can re-run this command at any time to change the values. 18 | 19 | Now you can test it by asking about your running EC2 instances (you may have none): 20 | 21 | aws ec2 describe-instances 22 | 23 | You'll need two things to run any of the EMR activities: 24 | 25 | 1. An S3 bucket to store logs, code, input, and output data. 26 | 2. An EMR cluster to run the examples. 27 | 28 | ## Basics of EMR Clusters ## 29 | 30 | You can start a simple test cluster by doing the following: 31 | 32 | aws emr create-cluster --ami-version 3.4.0 --instance-groups InstanceGroupType=MASTER,InstanceCount=1,InstanceType=m1.medium InstanceGroupType=CORE,InstanceCount=2,InstanceType=m1.medium --name "Test Cluster" --log-uri s3://mybucket/logs/ --enable-debugging --tags Name=emr 33 | 34 | The --instance-groups option contains a set of triples in the shorthand syntax for `InstanceGroupType` (one of "MASTER", "CORE", or "TASK"), 35 | `InstanceType` (an EC2 instance type), and `InstanceCount` (the number of instances to start). Alternatively, you can use JSON to describe the 36 | cluster instance groups. 37 | 38 | For example, in a file [cluster.json](cluster.json): 39 | 40 | [ 41 | { 42 | "InstanceGroupType": "MASTER", 43 | "InstanceCount": 1, 44 | "InstanceType": "m1.medium" 45 | }, 46 | { 47 | "InstanceGroupType": "CORE", 48 | "InstanceCount": 2, 49 | "InstanceType": "m1.medium" 50 | } 51 | ] 52 | 53 | and then the command: 54 | 55 | aws emr create-cluster --ami-version 3.4.0 --instance-groups file://./cluster.json --name "Test Cluster" --log-uri s3://mybucket/logs/ --enable-debugging --tags Name=emr 56 | 57 | The command will return the "Cluster ID" that you will need for further manipulations, including terminating the cluster. You can always find this via the command: 58 | 59 | aws emr list-clusters --active 60 | 61 | You can terminate a cluster by: 62 | 63 | aws emr terminate-clusters --cluster-ids your-cluster-id 64 | 65 | The documentation examples consistently use the bucket name 'mybucket'. You'll need to replace that with your bucket name to get the commands to work. 66 | 67 | ## Resizing a Cluster ## 68 | 69 | You can add core or task nodes to a running cluster via the cluster details. Clicking on "Resize" next to "Network and Hardware" will give you the ability to add Core and Task nodes 70 | whilst choosing the instance type.
Clicking on "Resize" in the "Hardware" section only allows you to change the number of nodes of a given category with the same instance type. 71 | 72 | Both of these are useful techniques to adjust your running cluster once you have found it to be insufficient for processing data. The adjustment only happens after the currently 73 | running step completes. As such, you may need to kill a running step if you know it will take too long to complete to adjust the size your running cluster. 74 | 75 | ## Bootstrap Actions ## 76 | 77 | Once a generic cluster instance has been started, you may need to install specialized software (e.g. python packages). You can specify a set of one-time actions 78 | called "Bootstrap Actions" when you create the cluster using the `--bootstrap-actions` option. Like the --instance-groups option, you can use the shorthand syntax or JSON. 79 | 80 | Each action must contain three things: 81 | 82 | * Path — the path to a script (typically in S3) 83 | * Args - any arguments to the script 84 | * Name — a name to show in the console 85 | 86 | The shorthand is: 87 | 88 | --bootstrap-actions Path=s3://mybucket/python.sh,Name="Install python packages",Args=[numpy,nltk] 89 | 90 | The JSON in `bootstrap.json`: 91 | 92 | [ 93 | { 94 | "Path" : "s3://mybucket/python.sh", 95 | "Name" : "Install python packages", 96 | "Args" : ["numpy","nltk"] 97 | } 98 | ] 99 | 100 | with the option: 101 | 102 | --bootstrap-actions file://./bootstrap.json 103 | 104 | The script stored at s3://mybucket/python.sh might be something like: 105 | 106 | #!/bin/bash 107 | sudo pip install $* 108 | 109 | ### Testing Bootstrap Actions ### 110 | 111 | In general, if you script runs on a like-operating system (e.g. linux of the same flavor), you'll be in good shape. AWS EMR's AMI are based on RedHat/CentOS and 112 | so scripts that work on those particular flavors may work. The right way to test bootstrapping is to use the specific AMI for the EMR version, start an EC2 113 | instance, and test on that machine. 114 | 115 | ### Testing Bootstrapping using EMR AMIs ### 116 | 117 | You can test your bootstrapping commands by just starting the exact AMI used by EMR. When you start a cluster, you can look up the 118 | AMI used by EMR in your EC2 console. Under the details of a running or terminate instance associated with your cluster, you'll see 119 | the AMI listed. It should be a identifier formatted like "ami-xxxxxxxx". 120 | 121 | For example, ami-2e88aa46 is the identifier for AMI version 3.6.0 that you can select when you start your cluster. You can then 122 | start an EC2 instance using that AMI using the CLI: 123 | 124 | aws ec2 run-instances --image-id ami-2e88aa46 --key-name your-key-name --instance-type m1.medium --placement AvailabilityZone=us-east-1c 125 | 126 | In the above, you'll want to list your actual key name in place of `your-key-name` and adjust the `AvailabilityZone` value to your preference. 127 | 128 | Now you can ssh into the machine using your key and the user `hadoop`. This user has sudo privileges and so should be able to run your script exactly 129 | as EMR would during cluster bootstrapping. 130 | 131 | Once you are done, you can shutdown the instance via the console in the browser or use the instance ID returned from the `run-instances` command in the following: 132 | 133 | aws ec2 terminate-instances --instance-ids i-3259abcf 134 | 135 | ## Running "Steps" ## 136 | 137 | A step is a unit of work. You add can steps to your cluster via the AWS CLI or via libraries like MRJob. 
138 | 139 | A step contains a set of jobs and a job contains a set of tasks (e.g. mappers and reducers). 140 | 141 | Often, a single step contains a single job that contains a map/reduce process. That map/reduce process is turned into a set of map tasks based on the input size. The control over 142 | that process is handled by the input splitter used in Hadoop. Subsequently, the number of reduce tasks depends on the number of map tasks. These are all things you can control when you 143 | configure Hadoop. 144 | 145 | 146 | ### Running Steps via AWS CLI ### 147 | 148 | The AWS CLI command `aws emr add-steps` is used to add steps to your cluster. The cluster identifier is necessary and you can find this in the cluster details. 149 | 150 | The step is described by a set of metadata: 151 | 152 | * Name — A descriptive name. 153 | * Type — the type of Hadoop job (i.e. one of "CUSTOM_JAR", "STREAMING", "HIVE", "PIG", "IMPALA") 154 | * Args - a set of arguments to pass to the step 155 | * Jar — a location of a jar implementing the step (only for "CUSTOM_JAR"). This location must be accessible to the Hadoop cluster and may be an S3 URI. 156 | * ActionOnFailure — One of "TERMINATE_CLUSTER", "CANCEL_AND_WAIT" (pause the step queue), "CONTINUE" 157 | * MainClass — the main class to use (only for CUSTOM_JAR) 158 | 159 | The shorthand syntax can be used to specify all of the above but the JSON syntax is more useful. 160 | 161 | For example, a Hadoop streaming job might be specified as: 162 | 163 | [ { 164 | "Type" : "STREAMING", 165 | "Name" : "Multiply", 166 | "ActionOnFailure" : "CONTINUE", 167 | "Args" : [ 168 | "-files","s3://mybucket/prime-factors.py", 169 | "-mapper","prime-factors.py", 170 | "-reducer","aggregate", 171 | "-input","s3://mybucket/multiply/input", 172 | "-output","s3://mybucket/multiply/output" 173 | ] 174 | } ] 175 | 176 | The arguments are all specific to the [Hadoop Streaming program](http://hadoop.apache.org/docs/r2.6.0/hadoop-mapreduce-client/hadoop-mapreduce-client-core/HadoopStreaming.html). Similarly, 177 | any other program (including your own custom jar) would have its own argument definition. 178 | 179 | Once you have your JSON definition (`step.json` in this case), you can add it to your running cluster by: 180 | 181 | aws emr add-steps --cluster-id your-cluster-id --steps file://./step.json 182 | 183 | ### Running Steps via MRJob ### 184 | 185 | [MRJob](http://mrjob.readthedocs.org) is a very useful abstraction and has the ability to run jobs directly on EMR. While you can use MRJob to start a cluster, 186 | a more useful technique is to run your MRJob program on an already started cluster. 187 | 188 | Running on an existing cluster is easily done by two extra parameters: 189 | 190 | 1. Add the `-r emr` option to select the EMR runner. 191 | 2. Add the `--emr-job-flow-id your-cluster-id` to specify your existing cluster. 192 | 193 | Since you are running on the cluster, there are some additional life-cycle options you may want to control. First, by default, MRJob will upload your 194 | input (e.g. stdin) to S3 and download the output. You'll probably want to run everything from S3 and this is easily done: 195 | 196 | 1. Specify your input via an extra argument to your program, just as you might give it a file name, but instead give it the S3 bucket URI. 197 | 2. Use `--no-output` to turn off downloading the result and `--output-dir s3://yourbucket/yourpath` to specify the output S3 bucket.
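Putting these options together, a typical invocation might look something like the following (a sketch only: the script name, bucket paths, and cluster id are placeholders):

    python your_job.py -r emr --emr-job-flow-id your-cluster-id \
        --conf-path mrjob.conf --no-output --output-dir s3://yourbucket/output \
        s3://yourbucket/input/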
198 | 199 | If you have supporting code for your program, you'll need to package it into an archive in tar/gz format. Then just specify that on the command-line using `--python-archive code.tar.gz` 200 | 201 | You may have changed the version of python on your cluster via a bootstrap action. If so, you can specify the python command via `--python-bin`. That command expects a command (or full path) 202 | that will run the python interpreter. 203 | 204 | When you use the `--no-output` and `--output-dir` together with MRJob the results are stored on AWS S3. You can interrupt the local MRJob process after the step has started 205 | and it will continue to run on your cluster. This allows you to terminate the local process and continue other work. You will have to check the cluster interface online to 206 | see the status of your job. 207 | 208 | ### Killing Steps ### 209 | 210 | There is no easy way to kill a running step via the AWS CLI or the browser interface. If you terminate the cluster, the step will be killed first but that is a 211 | draconian way to kill a step. If you stop the cluster, you will have restart the cluster and that can take quite awhile. 212 | 213 | The way you kill the step is to talk to Hadoop directly by the following: 214 | 215 | 1. SSH into the master node. You'll find the connection information in the cluster details and then you'll do something like: 216 | 217 | `ssh hadoop@ec2-nn-nn-nn-nn.compute-1.amazonaws.com -i ~/your-identity.pem` 218 | 219 | 2. Once you are connected, list the jobs with `mapred job -list` 220 | 221 | 3. Locate the row that represents the step you'd like to kill. At this point, a step has turned into a set of jobs. If you only have one job, there will be only one row. 222 | 223 | 4. The first column is labeled `JobId`. Use that identifier to kill the job with `mapred job -kill id` where `id` is the value in that column. 
224 | 225 | ## Manipulating S3 Buckets via AWS CLI ## 226 | 227 | You can create a bucket by: 228 | 229 | aws s3 mb s3://mybucket/ 230 | 231 | Listing a bucket: 232 | 233 | aws s3 ls s3://mybucket/ 234 | 235 | Copying a file to a path: 236 | 237 | aws s3 cp file.txt s3://mybucket/somewhere/ 238 | 239 | Removing a key: 240 | 241 | aws s3 rm s3://mybucket/somewhere/file.txt 242 | 243 | Syncing a directory to s3 (both ways): 244 | 245 | aws s3 sync somewhere s3://mybucket/somewhere 246 | aws s3 sync s3://mybucket/somewhere somewhere 247 | 248 | 249 | Removing a set of keys via a prefix: 250 | 251 | aws s3 rm s3://mybucket/somewhere/ --recursive 252 | -------------------------------------------------------------------------------- /activities/common-crawl/test-100.warc: -------------------------------------------------------------------------------- 1 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00000-ip-10-180-136-8.ec2.internal.warc.gz 2 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00001-ip-10-180-136-8.ec2.internal.warc.gz 3 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00002-ip-10-180-136-8.ec2.internal.warc.gz 4 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00003-ip-10-180-136-8.ec2.internal.warc.gz 5 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00004-ip-10-180-136-8.ec2.internal.warc.gz 6 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00005-ip-10-180-136-8.ec2.internal.warc.gz 7 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00006-ip-10-180-136-8.ec2.internal.warc.gz 8 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00007-ip-10-180-136-8.ec2.internal.warc.gz 9 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00008-ip-10-180-136-8.ec2.internal.warc.gz 10 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00009-ip-10-180-136-8.ec2.internal.warc.gz 11 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00010-ip-10-180-136-8.ec2.internal.warc.gz 12 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00011-ip-10-180-136-8.ec2.internal.warc.gz 13 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00012-ip-10-180-136-8.ec2.internal.warc.gz 14 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00013-ip-10-180-136-8.ec2.internal.warc.gz 15 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00014-ip-10-180-136-8.ec2.internal.warc.gz 16 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00015-ip-10-180-136-8.ec2.internal.warc.gz 17 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00016-ip-10-180-136-8.ec2.internal.warc.gz 18 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00017-ip-10-180-136-8.ec2.internal.warc.gz 19 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00018-ip-10-180-136-8.ec2.internal.warc.gz 20 | 
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00019-ip-10-180-136-8.ec2.internal.warc.gz 21 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00020-ip-10-180-136-8.ec2.internal.warc.gz 22 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00021-ip-10-180-136-8.ec2.internal.warc.gz 23 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00022-ip-10-180-136-8.ec2.internal.warc.gz 24 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00023-ip-10-180-136-8.ec2.internal.warc.gz 25 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00024-ip-10-180-136-8.ec2.internal.warc.gz 26 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00025-ip-10-180-136-8.ec2.internal.warc.gz 27 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00026-ip-10-180-136-8.ec2.internal.warc.gz 28 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00027-ip-10-180-136-8.ec2.internal.warc.gz 29 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00028-ip-10-180-136-8.ec2.internal.warc.gz 30 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00029-ip-10-180-136-8.ec2.internal.warc.gz 31 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00030-ip-10-180-136-8.ec2.internal.warc.gz 32 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00031-ip-10-180-136-8.ec2.internal.warc.gz 33 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00032-ip-10-180-136-8.ec2.internal.warc.gz 34 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00033-ip-10-180-136-8.ec2.internal.warc.gz 35 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00034-ip-10-180-136-8.ec2.internal.warc.gz 36 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00035-ip-10-180-136-8.ec2.internal.warc.gz 37 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00036-ip-10-180-136-8.ec2.internal.warc.gz 38 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00037-ip-10-180-136-8.ec2.internal.warc.gz 39 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00038-ip-10-180-136-8.ec2.internal.warc.gz 40 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00039-ip-10-180-136-8.ec2.internal.warc.gz 41 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00040-ip-10-180-136-8.ec2.internal.warc.gz 42 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00041-ip-10-180-136-8.ec2.internal.warc.gz 43 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00042-ip-10-180-136-8.ec2.internal.warc.gz 44 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00043-ip-10-180-136-8.ec2.internal.warc.gz 45 | 
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00044-ip-10-180-136-8.ec2.internal.warc.gz 46 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00045-ip-10-180-136-8.ec2.internal.warc.gz 47 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00046-ip-10-180-136-8.ec2.internal.warc.gz 48 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00047-ip-10-180-136-8.ec2.internal.warc.gz 49 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00048-ip-10-180-136-8.ec2.internal.warc.gz 50 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00049-ip-10-180-136-8.ec2.internal.warc.gz 51 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00050-ip-10-180-136-8.ec2.internal.warc.gz 52 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00051-ip-10-180-136-8.ec2.internal.warc.gz 53 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00052-ip-10-180-136-8.ec2.internal.warc.gz 54 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00053-ip-10-180-136-8.ec2.internal.warc.gz 55 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00054-ip-10-180-136-8.ec2.internal.warc.gz 56 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00055-ip-10-180-136-8.ec2.internal.warc.gz 57 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00056-ip-10-180-136-8.ec2.internal.warc.gz 58 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00057-ip-10-180-136-8.ec2.internal.warc.gz 59 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00058-ip-10-180-136-8.ec2.internal.warc.gz 60 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00059-ip-10-180-136-8.ec2.internal.warc.gz 61 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00060-ip-10-180-136-8.ec2.internal.warc.gz 62 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00061-ip-10-180-136-8.ec2.internal.warc.gz 63 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00062-ip-10-180-136-8.ec2.internal.warc.gz 64 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00063-ip-10-180-136-8.ec2.internal.warc.gz 65 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00064-ip-10-180-136-8.ec2.internal.warc.gz 66 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00065-ip-10-180-136-8.ec2.internal.warc.gz 67 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00066-ip-10-180-136-8.ec2.internal.warc.gz 68 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00067-ip-10-180-136-8.ec2.internal.warc.gz 69 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00068-ip-10-180-136-8.ec2.internal.warc.gz 70 | 
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00069-ip-10-180-136-8.ec2.internal.warc.gz 71 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00070-ip-10-180-136-8.ec2.internal.warc.gz 72 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00071-ip-10-180-136-8.ec2.internal.warc.gz 73 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00072-ip-10-180-136-8.ec2.internal.warc.gz 74 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00073-ip-10-180-136-8.ec2.internal.warc.gz 75 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00074-ip-10-180-136-8.ec2.internal.warc.gz 76 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00075-ip-10-180-136-8.ec2.internal.warc.gz 77 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00076-ip-10-180-136-8.ec2.internal.warc.gz 78 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00077-ip-10-180-136-8.ec2.internal.warc.gz 79 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00078-ip-10-180-136-8.ec2.internal.warc.gz 80 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00079-ip-10-180-136-8.ec2.internal.warc.gz 81 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00080-ip-10-180-136-8.ec2.internal.warc.gz 82 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00081-ip-10-180-136-8.ec2.internal.warc.gz 83 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00082-ip-10-180-136-8.ec2.internal.warc.gz 84 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00083-ip-10-180-136-8.ec2.internal.warc.gz 85 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00084-ip-10-180-136-8.ec2.internal.warc.gz 86 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00085-ip-10-180-136-8.ec2.internal.warc.gz 87 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00086-ip-10-180-136-8.ec2.internal.warc.gz 88 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00087-ip-10-180-136-8.ec2.internal.warc.gz 89 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00088-ip-10-180-136-8.ec2.internal.warc.gz 90 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00089-ip-10-180-136-8.ec2.internal.warc.gz 91 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00090-ip-10-180-136-8.ec2.internal.warc.gz 92 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00091-ip-10-180-136-8.ec2.internal.warc.gz 93 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00092-ip-10-180-136-8.ec2.internal.warc.gz 94 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00093-ip-10-180-136-8.ec2.internal.warc.gz 95 | 
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00094-ip-10-180-136-8.ec2.internal.warc.gz 96 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00095-ip-10-180-136-8.ec2.internal.warc.gz 97 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00096-ip-10-180-136-8.ec2.internal.warc.gz 98 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00097-ip-10-180-136-8.ec2.internal.warc.gz 99 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00098-ip-10-180-136-8.ec2.internal.warc.gz 100 | common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00099-ip-10-180-136-8.ec2.internal.warc.gz 101 | -------------------------------------------------------------------------------- /data-science.svg: -------------------------------------------------------------------------------- [SVG markup not preserved in this export; recoverable text labels from the data-science workflow diagram: Hypothesis, Hypothesis Formation, Evaluation, Experiment, Acquisition, Organization, Analysis, Representation, Observations, Data, Sampling, Evaluation, Outcome, Implementation Iteration, data artifacts] --------------------------------------------------------------------------------