├── .gitignore ├── README.md ├── flu_news ├── get_news_data.sh ├── news_clustering │ ├── build.sbt │ ├── project │ │ └── build.properties │ └── src │ │ └── main │ │ └── scala │ │ └── com │ │ └── oracle │ │ └── newscluster │ │ └── NewsClustering.scala ├── news_rss_collector.py └── news_streaming │ ├── flu_news_flume_config │ ├── news_rss_collector.py │ ├── news_streamer │ ├── build.sbt │ ├── project │ │ ├── assembly.sbt │ │ └── build.properties │ └── src │ │ └── main │ │ └── scala │ │ └── com │ │ └── oracle │ │ └── newsstream │ │ └── NewsStreamer.scala │ └── start_flume.sh ├── flu_shots └── flu_shots_to_db.py ├── flu_statistics ├── OIE_Pathogenic_Flu.py ├── country_chop_who.py ├── get_flu_summary_data.sh ├── state_populations.csv ├── us_chop_ilinet.py └── us_chop_who.py ├── notebooks ├── 01 WHO US Simple CSV Loading.ipynb ├── 02 WHO Country-Level Flu Data.ipynb ├── 03 HHS_Flu_Vaccination_Data.ipynb ├── 04 OIE_Pathogenic_Flu.ipynb ├── 05 Collecting Web Data With Pandas.ipynb ├── 06 Does Ethnicity Impact Vaccination Rates?.ipynb ├── 07 Do vaccination rates impact flu rates?.ipynb ├── 08 Does GDP explain flu rates?.ipynb ├── 09 Does living in cities influence flu rates?.ipynb ├── 10 How do sanitation and clean water effect flu outbreaks?.ipynb ├── 11 Basic Big Data with PySpark.ipynb ├── 12 Moving and Clustering Data with Sqoop and Spark.ipynb ├── 13 Finding Important Words With Spark.ipynb ├── 14 Trend Search with Big Data and SQL.ipynb ├── 15 Clustering the News with Spark and MLlib.ipynb ├── 16 Collecting Streaming News with Flume and Spark.ipynb ├── Video Opportunities.ipynb └── Visual Data Inspection.ipynb ├── setup ├── 00-pyspark-setup.py ├── data_science_bootcamp_setup.sh ├── download_data.sh ├── fludb.sql ├── ipython_notebook_config_spark.py └── setup_pyspark_notebook.sh └── templates ├── pandas_oracle_template.py ├── raw_oracle_template.py └── sql_alchemy_oracle_template.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *.*~ 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | notebooks/.ipynb_checkpoints/* 25 | # PyInstaller 26 | # Usually these files are written by a python script from a template 27 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 28 | *.manifest 29 | *.spec 30 | 31 | # Installer logs 32 | pip-log.txt 33 | pip-delete-this-directory.txt 34 | 35 | # Unit test / coverage reports 36 | htmlcov/ 37 | .tox/ 38 | .coverage 39 | .cache 40 | nosetests.xml 41 | coverage.xml 42 | 43 | # Translations 44 | *.mo 45 | *.pot 46 | 47 | # Django stuff: 48 | *.log 49 | 50 | # Sphinx documentation 51 | docs/_build/ 52 | 53 | # PyBuilder 54 | target/ 55 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | odsb2014 2 | ======== 3 | 4 | Oracle Data Science Bootcamp 2014 5 | 6 | A series of workshops to explain both basic and advanced data science concepts using SQL, Python, Matplotlib, and Apache Spark. 
7 | 8 | Getting Started 9 | ---------------- 10 | 11 | * Download the Oracle Big Data Lite VM at http://www.oracle.com/technetwork/database/bigdata-appliance/oracle-bigdatalite-2104726.html 12 | This requires VirtualBox. All username/password pairs for the VM are `oracle/welcome1` unless stated otherwise. 13 | * Clone this git repository: `git clone https://github.com/dwmclary/odsb2014` 14 | * Change into the setup directory: `cd odsb2014/setup` 15 | * Run the setup script: `./data_science_bootcamp_setup.sh` 16 | * Run the data download script: `./download_data.sh` 17 | * Run the pyspark installation script: `./setup_pyspark_notebook.sh` 18 | * Run the database setup script: `sqlplus sys/welcome1 as sysdba @fludb.sql` 19 | * Start the database listener: `lsnrctl start` 20 | * Source `~/.bashrc` or open a new terminal window 21 | * Change to the `odsb2014/notebooks` directory and start ipython: `ipython notebook --profile pyspark` 22 | 23 | Loading Data 24 | ---------------- 25 | The `flu_statistics` and `flu_news` directories contain data download scripts that must be run 26 | in order to complete the workshop. These can be run standalone, or by running `setup/download_data.sh`. 27 | The `flu_shots` directory contains a script for fetching data from the US Dept. of Health and Human Services (HHS), but collection 28 | of this data is included as part of the series of notebooks. 29 | -------------------------------------------------------------------------------- /flu_news/get_news_data.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | mkdir data 4 | cd data 5 | wget https://s3.amazonaws.com/orcl-dsb-fludata/wikinews/wikinews.json 6 | split --lines=10000 wikinews.json wikinews_data 7 | wget http://mattmahoney.net/dc/text8.zip 8 | unzip text8.zip 9 | tr -s '[[:punct:][:space:]]' '\n' < text8 > linewise_text_8 10 | cd ..
11 | wget https://s3.amazonaws.com/orcl-dsb-fludata/wikinews/wikinews_builder.py 12 | -------------------------------------------------------------------------------- /flu_news/news_clustering/build.sbt: -------------------------------------------------------------------------------- 1 | name := "NewsClustering" 2 | version := "0.1" 3 | scalaVersion := "2.10.4" 4 | libraryDependencies += "org.apache.spark" %% "spark-core" % "1.1.0" 5 | libraryDependencies += "org.apache.spark" % "spark-mllib_2.10" % "1.1.0" 6 | libraryDependencies += "org.apache.hadoop" % "hadoop-client" % "2.3.0-cdh5.1.2" 7 | libraryDependencies += "org.json4s" %% "json4s-jackson" % "3.2.11" 8 | libraryDependencies += "org.json4s" %% "json4s-native" % "3.2.11" 9 | resolvers += "Akka Repository" at "http://repo.akka.io/releases/" 10 | resolvers += "Cloudera Repository" at "https://repository.cloudera.com/artifactory/cloudera-repos/" -------------------------------------------------------------------------------- /flu_news/news_clustering/project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=0.13.7 -------------------------------------------------------------------------------- /flu_news/news_clustering/src/main/scala/com/oracle/newscluster/NewsClustering.scala: -------------------------------------------------------------------------------- 1 | package com.oracle.newscluster 2 | 3 | import org.json4s._ 4 | import org.json4s.jackson.Serialization.{read,write} 5 | import org.apache.spark._ 6 | import org.apache.spark.SparkContext._ 7 | import org.apache.spark.rdd.RDD 8 | import org.apache.spark.rdd._ 9 | import org.apache.spark.mllib.clustering.KMeans 10 | import org.apache.spark.mllib.feature.Word2Vec 11 | import org.apache.spark.mllib.feature.Word2VecModel 12 | import org.apache.spark.mllib.linalg._ 13 | 14 | case class NewsArticle(date : String, title : String, byline : String, fulltext : String) 15 | 16 | object NewsClustering { 17 | def sumArray (m: Array[Double], n: Array[Double]): Array[Double] = { 18 | for (i <- 0 until m.length) {m(i) += n(i)} 19 | return m 20 | } 21 | 22 | def divArray (m: Array[Double], divisor: Double) : Array[Double] = { 23 | for (i <- 0 until m.length) {m(i) /= divisor} 24 | return m 25 | } 26 | 27 | def wordToVector (w:String, m: Word2VecModel): Vector = { 28 | try { 29 | return m.transform(w) 30 | } catch { 31 | case e: Exception => return Vectors.zeros(100) 32 | } 33 | } 34 | 35 | def main(args : Array[String]) = { 36 | val sc = new SparkContext(new SparkConf().setAppName("News Clustering")) 37 | 38 | val news_rdd = sc.textFile("hdfs://localhost:8020/user/oracle/flu_news") 39 | 40 | val news_json = news_rdd.map(record => { 41 | implicit val formats = DefaultFormats 42 | read[NewsArticle](record) 43 | }) 44 | 45 | val news_titles = news_json.map(_.title.split(" ").toSeq) 46 | val news_title_words = news_titles.flatMap(x => x).map(x => Seq(x)) 47 | 48 | val w2v_input = sc.textFile("file:///home/oracle/odsb2014/flu_news/data/linewise_text_8").sample(false, 0.25,2).map(x => Seq(x)) 49 | val all_input = w2v_input ++ news_title_words 50 | 51 | val word2vec = new Word2Vec() 52 | val model = word2vec.fit(all_input) 53 | 54 | val title_vectors = news_titles.map(x => new DenseVector(divArray(x.map(m => wordToVector(m, model).toArray).reduceLeft(sumArray),x.length)).asInstanceOf[Vector]) 55 | val title_pairs = news_titles.map(x => (x,new DenseVector(divArray(x.map(m => wordToVector(m, 
model).toArray).reduceLeft(sumArray),x.length)).asInstanceOf[Vector])) 56 | 57 | var numClusters = 100 58 | val numIterations = 25 59 | 60 | var clusters = KMeans.train(title_vectors, numClusters, numIterations) 61 | var wssse = clusters.computeCost(title_vectors) 62 | println("WSSSE for clusters:"+wssse) 63 | 64 | val article_membership = title_pairs.map(x => (clusters.predict(x._2), x._1)) 65 | val cluster_centers = sc.parallelize(clusters.clusterCenters.zipWithIndex.map{ e => (e._2,e._1)}) 66 | val cluster_topics = cluster_centers.mapValues(x => model.findSynonyms(x,5).map(x => x._1)) 67 | 68 | var sample_topic = cluster_topics.take(12) 69 | var sample_members = article_membership.filter(x => x._1 == 6).take(10) 70 | for (i <- 6 until 12) { 71 | println("Topic Group #"+i) 72 | println(sample_topic(i)._2.mkString(",")) 73 | println("-----------------------------") 74 | sample_members = article_membership.filter(x => x._1 == i).take(10) 75 | sample_members.foreach{x => println(x._2.mkString(" "))} 76 | println("-----------------------------") 77 | } 78 | 79 | article_membership.map{x => x._1.toString+","+x._2.mkString(" ")}.saveAsTextFile("/user/oracle/flu_news_categorization") 80 | cluster_topics.map{x => x._1+","+x._2.mkString(" ")}.saveAsTextFile("/user/oracle/flu_news_categories") 81 | 82 | } 83 | 84 | } 85 | 86 | 87 | -------------------------------------------------------------------------------- /flu_news/news_rss_collector.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | 3 | import feedparser 4 | from bs4 import BeautifulSoup 5 | import json 6 | import time 7 | 8 | urls = {"top_news":"http://feeds.reuters.com/reuters/topNews", \ 9 | "health": "http://feeds.reuters.com/reuters/healthNews", \ 10 | "healthcare":"http://feeds.reuters.com/reuters/UShealthcareNews", \ 11 | "science":"http://feeds.reuters.com/reuters/scienceNews"} 12 | 13 | etags = {"top_news": None, "health": None, "healthcare": None, "science": None} 14 | 15 | done = False 16 | 17 | while not done: 18 | for k, v in urls.items(): 19 | if etags[k]: 20 | d = feedparser.parse(v, etag=etags[k]) 21 | else: 22 | d = feedparser.parse(v) 23 | for e in d.entries: 24 | doc = json.dumps({"category":k, "title":e.title.strip(), "summary":BeautifulSoup(e.summary).text.strip()}) 25 | print doc 26 | etags[k] = d.etag 27 | time.sleep(60) 28 | -------------------------------------------------------------------------------- /flu_news/news_streaming/flu_news_flume_config: -------------------------------------------------------------------------------- 1 | # example.conf: A single-node Flume configuration 2 | 3 | # Name the components on this agent 4 | newsAgent.sources = r1 5 | newsAgent.sinks = k1 6 | newsAgent.channels = c1 7 | 8 | # Describe/configure the source 9 | newsAgent.sources.r1.type = exec 10 | newsAgent.sources.r1.command = ./news_rss_collector.py 11 | 12 | # Describe the sink 13 | newsAgent.sinks.k1.type = avro 14 | newsAgent.sinks.k1.channel = c1 15 | newsAgent.sinks.k1.hostname = localhost 16 | newsAgent.sinks.k1.port = 44444 17 | 18 | # Use a channel which buffers events in memory 19 | newsAgent.channels.c1.type = memory 20 | newsAgent.channels.c1.capacity = 1000 21 | newsAgent.channels.c1.transactionCapacity = 100 22 | 23 | # Bind the source and sink to the channel 24 | newsAgent.sources.r1.channels = c1 25 | newsAgent.sinks.k1.channel = c1 -------------------------------------------------------------------------------- 
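The agent's exec source simply reads the collector's stdout, so each printed line must be a self-contained JSON document. A minimal sketch for checking that before wiring the collector into Flume (the `check_events.py` file name is hypothetical, not part of this repository):

```python
#! /usr/bin/env python
# Hypothetical helper: pipe the collector into it and confirm every line parses.
#   ./news_rss_collector.py | python check_events.py
import json
import sys

def is_valid_event(line):
    """Return True if the line has the shape the downstream consumers expect."""
    try:
        doc = json.loads(line)
    except ValueError:
        return False
    return all(k in doc for k in ("category", "title", "summary"))

if __name__ == "__main__":
    for line in sys.stdin:
        line = line.strip()
        if line:
            print is_valid_event(line), line[:60]
```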
/flu_news/news_streaming/news_rss_collector.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | 3 | import feedparser 4 | from bs4 import BeautifulSoup 5 | import json 6 | import time 7 | import sys 8 | 9 | urls = {"top_news":"http://feeds.reuters.com/reuters/topNews", \ 10 | "health": "http://feeds.reuters.com/reuters/healthNews", \ 11 | "healthcare":"http://feeds.reuters.com/reuters/UShealthcareNews", \ 12 | "science":"http://feeds.reuters.com/reuters/scienceNews"} 13 | 14 | etags = {"top_news": None, "health": None, "healthcare": None, "science": None} 15 | 16 | done = False 17 | 18 | while not done: 19 | for k, v in urls.items(): 20 | if etags[k]: 21 | d = feedparser.parse(v, etag=etags[k]) 22 | else: 23 | d = feedparser.parse(v) 24 | for e in d.entries: 25 | doc = json.dumps({"category":k, "title":e.title.strip(), "summary":BeautifulSoup(e.summary).text.strip()}) 26 | print doc 27 | sys.stdout.flush() 28 | etags[k] = d.etag 29 | time.sleep(30) 30 | -------------------------------------------------------------------------------- /flu_news/news_streaming/news_streamer/build.sbt: -------------------------------------------------------------------------------- 1 | name := "NewsStreamer" 2 | version := "0.1" 3 | scalaVersion := "2.10.4" 4 | libraryDependencies ++= Seq( 5 | "org.apache.spark" %% "spark-core" % "1.1.0" % "provided", 6 | "org.apache.spark" %% "spark-streaming" % "1.1.0" % "provided", 7 | "org.apache.spark" % "spark-streaming-flume_2.10" % "1.1.0" , 8 | "org.apache.hadoop" % "hadoop-client" % "2.3.0-cdh5.1.2", 9 | "org.json4s" %% "json4s-jackson" % "3.2.11", 10 | "org.json4s" %% "json4s-native" % "3.2.11").map({dep => 11 | dep.exclude("org.mortbay.jetty", "servlet-api"). 12 | exclude("commons-beanutils", "commons-beanutils-core"). 13 | exclude("commons-collections", "commons-collections"). 14 | exclude("commons-collections", "commons-collections"). 15 | exclude("commons-logging", "commons-logging"). 16 | exclude("com.esotericsoftware.minlog", "minlog"). 17 | exclude("asm", "asm"). 
18 | exclude("org.apache.hadoop", "hadoop-yarn-common") 19 | }) 20 | resolvers += "Akka Repository" at "http://repo.akka.io/releases/" 21 | resolvers += "Cloudera Repository" at "https://repository.cloudera.com/artifactory/cloudera-repos/" 22 | 23 | mainClass in assembly := Some("com.oracle.newsstream.NewsStreamer") 24 | 25 | mergeStrategy in assembly <<= (mergeStrategy in assembly) { (old) => 26 | { 27 | case x if x.startsWith("META-INF/ECLIPSEF.RSA") => MergeStrategy.last 28 | case x if x.startsWith("META-INF/mailcap") => MergeStrategy.last 29 | case x if x.startsWith("META-INF/mimetypes") => MergeStrategy.last 30 | case x if x.startsWith("plugin.properties") => MergeStrategy.last 31 | case x if x.startsWith("javax") => MergeStrategy.first 32 | case x => old(x) 33 | } 34 | } -------------------------------------------------------------------------------- /flu_news/news_streaming/news_streamer/project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.12.0") -------------------------------------------------------------------------------- /flu_news/news_streaming/news_streamer/project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=0.13.7 -------------------------------------------------------------------------------- /flu_news/news_streaming/news_streamer/src/main/scala/com/oracle/newsstream/NewsStreamer.scala: -------------------------------------------------------------------------------- 1 | package com.oracle.newsstream 2 | 3 | import org.json4s._ 4 | import org.json4s.jackson.Serialization.{read,write} 5 | import org.apache.spark._ 6 | import org.apache.spark.rdd.RDD 7 | import org.apache.spark.rdd._ 8 | import org.apache.spark.streaming._ 9 | import org.apache.spark.streaming.StreamingContext._ 10 | import org.apache.spark.streaming.flume._ 11 | import scala.collection.immutable.StringOps 12 | 13 | 14 | case class RSSItem(category : String, title : String, summary : String) 15 | 16 | 17 | object NewsStreamer { 18 | def containsFlu(x : String): Boolean = x match { 19 | case x if x contains " flu " => true 20 | case x if x contains " influenza " => true 21 | case x if x contains " disease " => true 22 | case x if x contains " outbreak " => true 23 | case x if x contains " H1N1 " => true 24 | case x if x contains " H5N1 " => true 25 | case x if x contains " sick " => true 26 | case _ => false 27 | } 28 | 29 | def main(args : Array[String]) = { 30 | val conf = new SparkConf().setAppName("NewsStreamer") 31 | val ssc = new StreamingContext(conf, Seconds(30)) 32 | 33 | val flumeStream = FlumeUtils.createStream(ssc, "localhost", 44444) 34 | val rssData = flumeStream.map(record => { 35 | implicit val formats = DefaultFormats 36 | read[RSSItem](new String(record.event.getBody().array())) 37 | }) 38 | val healthSummaries = rssData.filter(x => containsFlu(x.summary)) 39 | 40 | // print batch summaries to the screen 41 | rssData.count().map(cnt => "rss recv " + cnt + " events").print() 42 | 43 | val hsc = healthSummaries.count() 44 | hsc.map(cnt => "health summaries recv " + cnt + " events").print() 45 | 46 | //write health data out to HDFS 47 | val now: Long = System.currentTimeMillis 48 | healthSummaries.foreachRDD(r => { 49 | if (r.count() > 0) { 50 | r.map(item => { 51 | implicit val formats = DefaultFormats 52 | write(item) 53 | }).saveAsTextFile("/user/oracle/flu_streaming/flu_stream-"+now.toString()) 54 | } 55 | }) 56 | 
ssc.start() 57 | ssc.awaitTermination() 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /flu_news/news_streaming/start_flume.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | flume-ng agent --name newsAgent --conf-file ./flu_news_flume_config --conf /usr/lib/flume-ng/conf -Dflume.root.logger=DEBUG,console 3 | -------------------------------------------------------------------------------- /flu_shots/flu_shots_to_db.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | """ Working version of a parser/db-writer for US HHS flu vaccination JSON data. Will be reformatted as an iPython Notebook as well.""" 3 | 4 | import sys 5 | import cx_Oracle 6 | import pprint 7 | import re 8 | import string 9 | import json 10 | import urllib2 11 | 12 | def jsonify_data(d): 13 | return(json.dumps(d),) 14 | 15 | 16 | 17 | def write_to_db(db, data): 18 | cursor = db.cursor() 19 | try: 20 | cursor.prepare("INSERT INTO flu_shot_json(doc) VALUES (:1)") 21 | cursor.executemany(None, map(jsonify_data, data['results'])) 22 | db.commit() 23 | except Exception as e: 24 | print e 25 | 26 | 27 | def main(): 28 | db = cx_Oracle.connect('fludb', 'flushot', 'localhost:1521/orcl') 29 | drop_table = 'drop table flu_shot_json' 30 | ddl = 'create table flu_shot_json (doc varchar2(4000), CONSTRAINT "ENSURE_JSON" CHECK (doc IS JSON))' 31 | cursor = db.cursor() 32 | try: 33 | cursor.execute(drop_table) 34 | except: 35 | pass 36 | cursor.execute(ddl) 37 | cursor.close() 38 | print "parsing dataset..." 39 | for e in ["T","W","A","B","H"]: 40 | url = "http://flu-vaccination-map.hhs.gov/api/v1/states.json?ethnicity="+e+"&year=lte:2014" 41 | data = json.load(urllib2.urlopen(url)) 42 | print "writing to DB..."
43 | write_to_db(db, data) 44 | 45 | view_ddl = """CREATE OR REPLACE VIEW FLUSHOTS 46 | AS SELECT 47 | CAST(j.doc.count AS NUMBER) eligible, 48 | CAST(j.doc.week AS NUMBER) week, 49 | CAST(j.doc.name AS VARCHAR2(20)) state_name, 50 | CAST(j.doc.short_name AS VARCHAR2(2)) state, 51 | CAST(j.doc.fips_id\tAS NUMBER) fips_id, 52 | CAST(j.doc.disparity as VARCHAR2(20)) disparity, 53 | CAST(j.doc.medicare_status as VARCHAR2(20)) medicare_status, 54 | CAST(j.doc.year as NUMBER) year, 55 | CAST(j.doc.percentage AS NUMBER) percentage_claimed, 56 | CAST(j.doc.ethnicity AS VARCHAR2(20)) ethnicity 57 | FROM flu_shot_json j""" 58 | cursor = db.cursor() 59 | cursor.execute(view_ddl) 60 | cursor.close() 61 | 62 | 63 | if __name__ == "__main__": 64 | main() 65 | 66 | 67 | 68 | -------------------------------------------------------------------------------- /flu_statistics/OIE_Pathogenic_Flu.py: -------------------------------------------------------------------------------- 1 | 2 | import requests 3 | from bs4 import BeautifulSoup 4 | import cx_Oracle 5 | import datetime 6 | 7 | 8 | base_year = 2004 9 | years = 10 10 | base_url = "http://www.oie.int/animal-health-in-the-world/update-on-avian-influenza/" 11 | report = requests.get(base_url+str(base_year)) 12 | 13 | searchable_report = BeautifulSoup(report.text) 14 | flu_table = searchable_report.table 15 | 16 | row_tags = flu_table.find_all("tr")[1:] 17 | 18 | def make_db_row(r): 19 | #newer reports have an extra year column 20 | if len(r) > 4: 21 | r.pop(2) 22 | 23 | r[-1]=r[-1].findChild() 24 | #try to get the report url 25 | url = None 26 | try: 27 | url = r[-1]['href'] 28 | except: 29 | pass 30 | row_text = map(lambda x: x.text.encode('ascii', 'ignore'), r)+[url] 31 | try: 32 | row_text[2] = datetime.datetime.strptime(row_text[2], "%d/%m/%y").date() 33 | except Exception as e: 34 | print r, e 35 | return tuple(row_text) 36 | 37 | data_to_insert = map(lambda x: make_db_row(x.find_all("td")), row_tags) 38 | for i in range(1,years+1): 39 | print base_year+i 40 | report = requests.get(base_url+str(base_year+i)) 41 | searchable_report = BeautifulSoup(report.text) 42 | flu_table = searchable_report.table 43 | row_tags = flu_table.find_all("tr")[1:] 44 | data_to_insert += map(lambda x: make_db_row(x.find_all("td")), row_tags) 45 | create_table = """CREATE TABLE PATHOGENIC_FLU ( 46 | INCIDENT_LOCATION VARCHAR2(100), 47 | INCIDENT_TYPE VARCHAR2(50), 48 | INCIDENT_DATE DATE, 49 | INCIDENT_REPORT VARCHAR2(100), 50 | REPORT_LINK VARCHAR2(200) 51 | ) 52 | """ 53 | db = cx_Oracle.connect("fludb", "flushot", "localhost:1521/orcl") 54 | cursor = db.cursor() 55 | cursor.execute(create_table) 56 | 57 | 58 | cursor.prepare("""INSERT INTO PATHOGENIC_FLU 59 | (INCIDENT_LOCATION,INCIDENT_TYPE,INCIDENT_DATE, INCIDENT_REPORT, REPORT_LINK) 60 | VALUES 61 | (:1, :2, :3, :4, :5) 62 | """) 63 | cursor.executemany(None, data_to_insert) 64 | db.commit() 65 | 66 | 67 | 68 | -------------------------------------------------------------------------------- /flu_statistics/country_chop_who.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | """ Working version of a parser/db-writer for WHO Continent-Level data. 
Will be reformatted as an iPython Notebook as well.""" 3 | 4 | import sys 5 | import cx_Oracle 6 | import pprint 7 | import re 8 | import string 9 | 10 | def write_to_db(db, dataset): 11 | dt = "DROP TABLE %s" % dataset['title'] 12 | cursor = db.cursor() 13 | #try: 14 | # cursor.execute(dt) 15 | #except: 16 | # pass 17 | ctine = """CREATE TABLE %s ( 18 | region VARCHAR(12), 19 | country VARCHAR(500), 20 | year NUMBER, 21 | week NUMBER, 22 | measure NUMBER, 23 | CONSTRAINT %s_id PRIMARY KEY (country, year, week) 24 | )""" % (dataset['title'], dataset['title']) 25 | try: 26 | cursor.execute(ctine) 27 | except Exception as e: 28 | print e 29 | print "failed to create table", dataset['title'] 30 | pass 31 | for country in dataset['data']: 32 | try: 33 | cursor.prepare("INSERT INTO %s (region, country, year, week, measure) VALUES (:1, :2, :3, :4, :5)" % dataset['title']) 34 | cursor.executemany(None, country) 35 | db.commit() 36 | except Exception as e: 37 | print e 38 | print country[0] 39 | 40 | 41 | def make_tablename(d): 42 | title_string = d.split("|")[0] 43 | if "specimen" in title_string: 44 | title = title_string.split("->")[-1].strip()+"_Specimens" 45 | if "processed" in title_string: 46 | title += "_proc" 47 | else: 48 | title += "_recv" 49 | else: 50 | title = title_string.split("->")[-1].strip()+"_Infections" 51 | title = re.sub("\(", "", title) 52 | title = re.sub("\)", "", title) 53 | title = re.sub(" ","_", title) 54 | return title[:25] 55 | 56 | def makerows(region_code, data): 57 | dateprefix = data[0] 58 | print dateprefix 59 | dates = map(lambda x: x.split(), dateprefix.strip().split("|")[1:]) 60 | print dates 61 | dates = map(lambda x: [x[0], x[-1]], dates) 62 | print dates 63 | data = map(lambda x: x.strip().split("|"), data[1:]) 64 | # what we're really doing here is pivoting the data so that we can have country, year, week, value 65 | # for each row of raw data, we want a list of tuple (country, year, week, value) 66 | def row_to_tuple(dates, r): 67 | t = [] 68 | for i in range(1,len(r)): 69 | t.append((region_code, r[0], int(dates[i-1][0]), int(dates[i-1][1]), int(re.sub(",","",r[i])))) 70 | return t 71 | data = map(lambda x: row_to_tuple(dates, x), data) 72 | return data 73 | 74 | def parseWHOCountryFile(filename): 75 | region = re.sub("WHO+", "",filename) 76 | region = re.sub(".psv", "", region) 77 | region = re.sub("\+","", region) 78 | region = region.split("/")[-1] 79 | raw = open(filename).readlines() 80 | big_splits = [] 81 | for i in range(len(raw)): 82 | if len(raw[i].split("|")) == 2: 83 | big_splits.append(i) 84 | datasets = [] 85 | #pp = pprint.PrettyPrinter(indent=4) 86 | for i in range(0,len(big_splits),2): 87 | ds = {} 88 | ds['title'] = raw[big_splits[i]] 89 | ds['title'] = make_tablename(ds['title']) 90 | ds['period'] = raw[big_splits[i+1]] 91 | if i < len(big_splits)-2: 92 | ds['starts'] = big_splits[i+1]+1 93 | ds['ends'] = big_splits[i+2]-1 94 | else: 95 | ds['starts'] = big_splits[i+1]+1 96 | ds['ends'] = len(raw)-1 97 | ds['data'] = raw[ds['starts']:ds['ends']] 98 | ds['data'] = makerows(region, ds['data']) 99 | datasets.append(ds) 100 | #pp.pprint(ds) 101 | return datasets 102 | 103 | def build_view(datasets): 104 | view = """CREATE OR REPLACE VIEW flu_statistics AS 105 | SELECT a.region, a.country, a.year, a.week, \n""" 106 | from_clause = " from \n" 107 | where_clause = " where \n" 108 | column_creation = "" 109 | table_creation = "" 110 | join_creation = "" 111 | for i in range(len(datasets)): 112 | column_creation += "{0}.measure as 
{1}".format(string.ascii_lowercase[i], datasets[i]['title']) 113 | 114 | table_creation += "{0} {1}".format(datasets[i]['title'], string.ascii_lowercase[i]) 115 | 116 | if (i < len(datasets)-1): 117 | column_creation += ",\n" 118 | table_creation += ",\n" 119 | join_creation += "{1}.country = {0}.country and {1}.year = {0}.year and {1}.week = {0}.week \n".format(string.ascii_lowercase[i], string.ascii_lowercase[i+1]) 120 | if (i < len(datasets)-2): 121 | join_creation += "and\n" 122 | else: 123 | column_creation += "\n" 124 | table_creation += "\n" 125 | 126 | 127 | view += column_creation + from_clause + table_creation + where_clause + join_creation 128 | return view 129 | 130 | def main(filename): 131 | db = cx_Oracle.connect('fludb', 'flushot', 'localhost:1521/orcl') 132 | print "parsing datasets..." 133 | datasets = parseWHOCountryFile(filename) 134 | print "writing to DB..." 135 | for dataset in datasets: 136 | write_to_db(db, dataset) 137 | print "creating view..." 138 | c = db.cursor() 139 | c.execute(build_view(datasets)) 140 | 141 | if __name__ == "__main__": 142 | main(sys.argv[1]) 143 | 144 | 145 | 146 | -------------------------------------------------------------------------------- /flu_statistics/get_flu_summary_data.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | mkdir data 4 | cd data 5 | wget https://s3.amazonaws.com/orcl-dsb-fludata/WHO-data/WHO+AFRO.psv 6 | wget https://s3.amazonaws.com/orcl-dsb-fludata/WHO-data/WHO+EURO.psv 7 | wget https://s3.amazonaws.com/orcl-dsb-fludata/WHO-data/WHO+PAHO.psv 8 | wget https://s3.amazonaws.com/orcl-dsb-fludata/WHO-data/WHO+WEST+ASIA.psv 9 | wget https://s3.amazonaws.com/orcl-dsb-fludata/WHO-data/WHO+US+comprehensive.psv 10 | wget https://s3.amazonaws.com/orcl-dsb-fludata/FluNet/WHO_NREVSS.csv 11 | wget https://s3.amazonaws.com/orcl-dsb-fludata/FluNet/ILINet.csv 12 | cd .. 
-------------------------------------------------------------------------------- /flu_statistics/state_populations.csv: -------------------------------------------------------------------------------- 1 | state_name,state,region,population 2 | Alabama,AL,East South Central,4779736 3 | Alaska,AK,Pacific,710231 4 | Arizona,AZ,Mountain,6392017 5 | Arkansas,AR,West South Central,2915918 6 | California,CA,Pacific,37253956 7 | Colorado,CO,Mountain,5029196 8 | Connecticut,CT,New England,3574097 9 | Delaware,DE,South Atlantic,897934 10 | Florida,FL,South Atlantic,18801310 11 | Georgia,GA,South Atlantic,9687653 12 | Hawaii,HI,Pacific,1360301 13 | Idaho,ID,Mountain,1567582 14 | Illinois,IL,East North Central,12830632 15 | Indiana,IN,East North Central,6483802 16 | Iowa,IA,West North Central,3046355 17 | Kansas,KS,West North Central,2853118 18 | Kentucky,KY,East South Central,4339367 19 | Louisiana,LA,West South Central,4533372 20 | Maine,ME,New England,1328361 21 | Maryland,MD,South Atlantic,5773552 22 | Massachusetts,MA,New England,6547629 23 | Michigan,MI,East North Central,9883640 24 | Minnesota,MN,East North Central,5303925 25 | Mississippi,MS,East South Central,2967297 26 | Missouri,MO,West North Central,5988927 27 | Montana,MT,Mountain,989415 28 | Nebraska,NE,West North Central,1826341 29 | Nevada,NV,Mountain,2700551 30 | New Hampshire,NH,New England,1316470 31 | New Jersey,NJ,Mid-Atlantic,8791894 32 | New Mexico,NM,Mountain,2059179 33 | New York,NY,Mid-Atlantic,19378102 34 | North Carolina,NC,South Atlantic,9535483 35 | North Dakota,ND,West North Central,672591 36 | Ohio,OH,East North Central,11536504 37 | Oklahoma,OK,West South Central,3751351 38 | Oregon,OR,Pacific,3831074 39 | Pennsylvania,PA,Mid-Atlantic,12702379 40 | Rhode Island,RI,New England,1052567 41 | South Carolina,SC,South Atlantic,4625364 42 | South Dakota,SD,West North Central,814180 43 | Tennessee,TN,East South Central,6346105 44 | Texas,TX,West South Central,25145561 45 | Utah,UT,Mountain,2763885 46 | Vermont,VT,New England,625741 47 | Virginia,VA,South Atlantic,8001024 48 | Washington,WA,Pacific,6724540 49 | West Virginia,WV,South Atlantic,1852994 50 | Wisconsin,WI,East North Central,5686986 51 | Wyoming,WY,Mountain,563626 -------------------------------------------------------------------------------- /flu_statistics/us_chop_ilinet.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | """ Working version of a parser/db-writer for WHO Continent-Level data. 
Will be reformatted as an iPython Notebook as well.""" 3 | 4 | import sys 5 | import cx_Oracle 6 | import pprint 7 | import re 8 | import string 9 | 10 | def parseILIFile(filename): 11 | raw = map(lambda x: x.strip().split(","),open(filename).readlines()) 12 | data = raw[1:] 13 | return data 14 | 15 | def columns_to_type(row): 16 | try: 17 | for i in range(len(row)): 18 | if (row[i] == "X"): 19 | row[i] = None 20 | elif (i > 1 and i != 7 and i != 8): 21 | row[i] = int(row[i]) 22 | elif (i == 7 or i==8): 23 | row[i] = float(row[i]) 24 | return tuple(row[1:]) 25 | except: 26 | return tuple() 27 | 28 | def write_to_db(db, data): 29 | create_table = """CREATE TABLE US_FLU_DEMOGRAPHICS ( 30 | REGION VARCHAR2(50), 31 | YEAR NUMBER(10,0), 32 | WEEK NUMBER(10,0), 33 | TOTAL_SICK NUMBER(10,0), 34 | TOTAL_PATIENTS NUMBER(10,0), 35 | TOTAL_PROVIDERS NUMBER(10,0), 36 | WEIGHTED_SICK NUMBER, 37 | UNWEIGHTED_SICK NUMBER, 38 | AGE_0_4 NUMBER(10,0), 39 | AGE_5_24 NUMBER(10,0), 40 | AGE_25_64 NUMBER(10,0), 41 | AGE_25_49 NUMBER(10,0), 42 | AGE_50_64 NUMBER(10,0), 43 | AGE_65_PLUS NUMBER(10,0))""" 44 | rows_to_insert = filter(lambda x: len(x)> 0, map(columns_to_type, data)) 45 | cursor = db.cursor() 46 | try: 47 | cursor.execute("drop table us_flu_demographics") 48 | except: 49 | pass 50 | cursor.execute(create_table) 51 | try: 52 | cursor.prepare("""INSERT INTO US_FLU_DEMOGRAPHICS ( 53 | REGION, YEAR, WEEK, TOTAL_SICK, 54 | TOTAL_PATIENTS, TOTAL_PROVIDERS, WEIGHTED_SICK, UNWEIGHTED_SICK, 55 | AGE_0_4, AGE_5_24, AGE_25_64, AGE_25_49, AGE_50_64, AGE_65_PLUS) 56 | VALUES 57 | (:1, :2, :3, :4, :5, :6, :7, :8, :9, :10, :11, :12, :13, :14)""") 58 | cursor.executemany(None, rows_to_insert) 59 | db.commit() 60 | except Exception as e: 61 | print e 62 | 63 | 64 | def main(filename): 65 | db = cx_Oracle.connect('fludb', 'flushot', 'localhost:1521/orcl') 66 | print "parsing datasets..." 67 | datasets = parseILIFile(filename) 68 | print "writing to DB..." 69 | write_to_db(db, datasets) 70 | 71 | if __name__ == "__main__": 72 | main(sys.argv[1]) 73 | -------------------------------------------------------------------------------- /flu_statistics/us_chop_who.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | """ Working version of a parser/db-writer for WHO Continent-Level data. 
Will be reformatted as an iPython Notebook as well.""" 3 | 4 | import sys 5 | import cx_Oracle 6 | import pprint 7 | import re 8 | import string 9 | 10 | def parseWHOFile(filename): 11 | raw = map(lambda x: x.strip().split(","),open(filename).readlines()) 12 | data = raw[1:] 13 | return data 14 | 15 | def columns_to_type(row): 16 | try: 17 | for i in range(len(row)): 18 | if (i > 1 and i != 5): 19 | row[i] = int(row[i]) 20 | if (i == 5): 21 | row[i] = float(row[i]) 22 | return tuple(row[1:]) 23 | except: 24 | return tuple() 25 | 26 | def write_to_db(db, data): 27 | create_table = """CREATE TABLE US_WHO_FLU_STATS ( 28 | REGION VARCHAR2(50), 29 | YEAR NUMBER(10,0), 30 | WEEK NUMBER(10,0), 31 | TOTAL_SPECIMENS NUMBER(10,0), 32 | PERCENT_POSITIVE NUMBER, 33 | A_H1 NUMBER(10,0), 34 | A_NO_SUBTYPE NUMBER(10,0), 35 | A_H3 NUMBER(10,0), 36 | H1N1 NUMBER(10,0), 37 | A_TOTAL NUMBER(10,0), 38 | B NUMBER(10,0), 39 | H3N2v NUMBER(10,0))""" 40 | rows_to_insert = filter(lambda x: len(x)> 0, map(columns_to_type, data)) 41 | cursor = db.cursor() 42 | cursor.execute("drop table us_who_flu_stats") 43 | cursor.execute(create_table) 44 | try: 45 | cursor.prepare("""INSERT INTO US_WHO_FLU_STATS ( 46 | REGION, YEAR, WEEK, TOTAL_SPECIMENS, 47 | PERCENT_POSITIVE, A_H1, A_NO_SUBTYPE, 48 | A_H3, H1N1, A_TOTAL, B, H3N2v) VALUES 49 | (:1, :2, :3, :4, :5, :6, :7, :8, :9, :10, :11, :12)""") 50 | cursor.executemany(None, rows_to_insert) 51 | db.commit() 52 | except Exception as e: 53 | print e 54 | 55 | 56 | def main(filename): 57 | db = cx_Oracle.connect('fludb', 'flushot', 'localhost:1521/orcl') 58 | print "parsing datasets..." 59 | datasets = parseWHOFile(filename) 60 | print "writing to DB..." 61 | write_to_db(db, datasets) 62 | 63 | if __name__ == "__main__": 64 | main(sys.argv[1]) 65 | -------------------------------------------------------------------------------- /notebooks/01 WHO US Simple CSV Loading.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "", 4 | "signature": "sha256:d48ee330d31bd89a1d9a6618b4ec0ae255df9f196a5935716a4aa2a2e15b8dd8" 5 | }, 6 | "nbformat": 3, 7 | "nbformat_minor": 0, 8 | "worksheets": [ 9 | { 10 | "cells": [ 11 | { 12 | "cell_type": "heading", 13 | "level": 1, 14 | "metadata": {}, 15 | "source": [ 16 | "Loading and Sharing Simple CSV Data" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "Sometimes we get lucky with the datasets we find: they're formatted just right, and in a format everyone can read. The most common of these would be a CSV or delimited-text file. All the columns are present on each row, delimited by the same character: all we need to do is parse and load the file. The [Centers for Disease Control](http://www.cdc.gov/flu/weekly/fluviewinteractive.htm) provides a way to download just this kind of read-to-consume data about flu rates in the US.\n", 24 | "\n", 25 | "There are lots of tools to do this with most databases. Oracle provides `SQL*Loader` as well as external table capabilities for this. MySQL can use the `LOAD DATA INFILE` directive to quickly load CSV data. However, we're going to use Python to illustrate how we can quickly connect this analysis-ready scripting languge to our database.\n", 26 | "\n", 27 | "We'll focus on cx-Oracle, a python module designed to connect to Oracle database." 
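Before attempting the full load, it can help to confirm that the driver and the database listener are reachable. A minimal sketch, using the same `fludb/flushot` account the rest of the workshop assumes (adjust the DSN if your VM differs):

```python
import cx_Oracle

db = cx_Oracle.connect('fludb', 'flushot', 'localhost:1521/orcl')
cursor = db.cursor()
cursor.execute("SELECT sysdate FROM dual")   # simplest possible round trip
print cursor.fetchone()
cursor.close()
db.close()
```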
28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "collapsed": false, 33 | "input": [ 34 | "import sys\n", 35 | "import cx_Oracle\n", 36 | "import pprint\n", 37 | "import re\n", 38 | "import string" 39 | ], 40 | "language": "python", 41 | "metadata": {}, 42 | "outputs": [], 43 | "prompt_number": 1 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "Since the data is delimited by commas, parsing out the data we want is simple. We'll put it in a function for later use." 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "collapsed": false, 55 | "input": [ 56 | "def parseWHOFile(filename):\n", 57 | " raw = map(lambda x: x.strip().split(\",\"),open(filename).readlines())\n", 58 | " data = raw[1:]\n", 59 | " return data" 60 | ], 61 | "language": "python", 62 | "metadata": {}, 63 | "outputs": [], 64 | "prompt_number": 2 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "metadata": {}, 69 | "source": [ 70 | "Our data isn't all strings or numbers, so we'll have to write a quick function to type-convert the rows. We'll write the function to handle a single row, then rely on python's map operator to convert the whole dataset at once. " 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "collapsed": false, 76 | "input": [ 77 | "def columns_to_type(row):\n", 78 | " try:\n", 79 | " for i in range(len(row)):\n", 80 | " if (i > 1 and i != 5):\n", 81 | " row[i] = int(row[i])\n", 82 | " if (i == 5):\n", 83 | " row[i] = float(row[i])\n", 84 | " return tuple(row[1:])\n", 85 | " except:\n", 86 | " return tuple()" 87 | ], 88 | "language": "python", 89 | "metadata": {}, 90 | "outputs": [], 91 | "prompt_number": 3 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "metadata": {}, 96 | "source": [ 97 | "*write_to_db* is the most interesting part of our work. We're going to need to create a table in Oracle 12c and then fill it with rows. We do this by passing SQL language statements through cx-Oracle to the database. Notice we create a **cursor** in the method. A cursor is the structure which allows us to traverse over records in a database and execute commands. 
Any time we use cx-Oracle, we'll create a cursor.\n", 98 | "\n", 99 | "Once we have a cursor, we execute a few statements\n", 100 | "\n", 101 | "* We drop US_WHO_FLU_STATS to make sure there's no stale data\n", 102 | "* We create the US_WHO_FLU_STATS table, whcih tracks statistics for census regions of the US\n", 103 | "* We insert our dataset into the table using the `executemany` statement\n", 104 | "* We save our work to the database by calling `db.commit`" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "collapsed": false, 110 | "input": [ 111 | "def write_to_db(db, data):\n", 112 | " create_table = \"\"\"CREATE TABLE US_WHO_FLU_STATS (\n", 113 | " REGION VARCHAR2(50),\n", 114 | " YEAR NUMBER(10,0),\n", 115 | " WEEK NUMBER(10,0),\n", 116 | " TOTAL_SPECIMENS NUMBER(10,0),\n", 117 | " PERCENT_POSITIVE NUMBER,\n", 118 | " A_H1 NUMBER(10,0),\n", 119 | " A_NO_SUBTYPE NUMBER(10,0),\n", 120 | " A_H3 NUMBER(10,0),\n", 121 | " H1N1 NUMBER(10,0),\n", 122 | " A_TOTAL NUMBER(10,0),\n", 123 | " B NUMBER(10,0),\n", 124 | " H3N2v NUMBER(10,0))\"\"\"\n", 125 | " rows_to_insert = filter(lambda x: len(x)> 0, map(columns_to_type, data))\n", 126 | " cursor = db.cursor()\n", 127 | " try:\n", 128 | " cursor.execute(\"drop table us_who_flu_stats\")\n", 129 | " except Exception:\n", 130 | " pass\n", 131 | " cursor.execute(create_table)\n", 132 | " try:\n", 133 | " cursor.prepare(\"\"\"INSERT INTO US_WHO_FLU_STATS (\n", 134 | " REGION, YEAR, WEEK, TOTAL_SPECIMENS,\n", 135 | " PERCENT_POSITIVE, A_H1, A_NO_SUBTYPE,\n", 136 | " A_H3, H1N1, A_TOTAL, B, H3N2v) VALUES \n", 137 | " (:1, :2, :3, :4, :5, :6, :7, :8, :9, :10, :11, :12)\"\"\")\n", 138 | " cursor.executemany(None, rows_to_insert)\n", 139 | " db.commit()\n", 140 | " except Exception as e:\n", 141 | " print e\n" 142 | ], 143 | "language": "python", 144 | "metadata": {}, 145 | "outputs": [], 146 | "prompt_number": 4 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "metadata": {}, 151 | "source": [ 152 | "With all of our methods complete, parsing the data is as simple as parsing the file and passing the results to our writer function." 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "collapsed": false, 158 | "input": [ 159 | "db = cx_Oracle.connect('fludb', 'flushot', 'localhost:1521/orcl')" 160 | ], 161 | "language": "python", 162 | "metadata": {}, 163 | "outputs": [], 164 | "prompt_number": 2 165 | }, 166 | { 167 | "cell_type": "code", 168 | "collapsed": false, 169 | "input": [ 170 | "print \"parsing datasets...\"\n", 171 | "datasets = parseWHOFile('../flu_statistics/data/WHO_NREVSS.csv')\n", 172 | "print \"writing to DB...\"\n", 173 | "write_to_db(db, datasets)\n" 174 | ], 175 | "language": "python", 176 | "metadata": {}, 177 | "outputs": [] 178 | }, 179 | { 180 | "cell_type": "markdown", 181 | "metadata": {}, 182 | "source": [ 183 | "We have another CSV file which might help us. It contains mappings of states to their populations and flu surveillance regions. Let's load it in a similar fashion." 
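A bare `split(",")` works for these files because none of the fields contain embedded commas; if that ever changes, the standard `csv` module is a drop-in alternative. A sketch of the same load with `csv.reader`:

```python
import csv

with open("../flu_statistics/state_populations.csv") as f:
    reader = csv.reader(f)           # handles quoted fields and embedded commas
    header = next(reader)
    state_data = [tuple(row) for row in reader]
print header
print state_data[:3]
```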
184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "collapsed": false, 189 | "input": [ 190 | "state_mappings = map(lambda x: x.strip().split(\",\"), open(\"../flu_statistics/state_populations.csv\").readlines())\n", 191 | "header = state_mappings[0]\n", 192 | "state_data = map(lambda x: tuple(x), state_mappings[1:])\n", 193 | "state_data[:10]" 194 | ], 195 | "language": "python", 196 | "metadata": {}, 197 | "outputs": [ 198 | { 199 | "metadata": {}, 200 | "output_type": "pyout", 201 | "prompt_number": 3, 202 | "text": [ 203 | "[('Alabama', 'AL', 'East South Central', '4779736'),\n", 204 | " ('Alaska', 'AK', 'Pacific', '710231'),\n", 205 | " ('Arizona', 'AZ', 'Mountain', '6392017'),\n", 206 | " ('Arkansas', 'AR', 'West South Central', '2915918'),\n", 207 | " ('California', 'CA', 'Pacific', '37253956'),\n", 208 | " ('Colorado', 'CO', 'Mountain', '5029196'),\n", 209 | " ('Connecticut', 'CT', 'New England', '3574097'),\n", 210 | " ('Delaware', 'DE', 'South Atlantic', '897934'),\n", 211 | " ('Florida', 'FL', 'South Atlantic', '18801310'),\n", 212 | " ('Georgia', 'GA', 'South Atlantic', '9687653')]" 213 | ] 214 | } 215 | ], 216 | "prompt_number": 3 217 | }, 218 | { 219 | "cell_type": "code", 220 | "collapsed": false, 221 | "input": [ 222 | "create_state_table = \"\"\"CREATE TABLE state_stats(\n", 223 | " state_name varchar2(26),\n", 224 | " state varchar2(2),\n", 225 | " region_name varchar2(26),\n", 226 | " population number,\n", 227 | " primary key (state)\n", 228 | " )\"\"\"\n", 229 | "cursor = db.cursor()\n", 230 | "cursor.execute(create_state_table)" 231 | ], 232 | "language": "python", 233 | "metadata": {}, 234 | "outputs": [ 235 | { 236 | "ename": "DatabaseError", 237 | "evalue": "ORA-00955: name is already used by an existing object\n", 238 | "output_type": "pyerr", 239 | "traceback": [ 240 | "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[1;31mDatabaseError\u001b[0m Traceback (most recent call last)", 241 | "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 7\u001b[0m )\"\"\"\n\u001b[0;32m 8\u001b[0m \u001b[0mcursor\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdb\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcursor\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 9\u001b[1;33m \u001b[0mcursor\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexecute\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcreate_state_table\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", 242 | "\u001b[1;31mDatabaseError\u001b[0m: ORA-00955: name is already used by an existing object\n" 243 | ] 244 | } 245 | ], 246 | "prompt_number": 4 247 | }, 248 | { 249 | "cell_type": "code", 250 | "collapsed": false, 251 | "input": [ 252 | "cursor.prepare(\"INSERT INTO state_stats (state_name, state, region_name, population) values (:1, :2, :3, :4)\")\n", 253 | "cursor.executemany(None, state_data)\n" 254 | ], 255 | "language": "python", 256 | "metadata": {}, 257 | "outputs": [], 258 | "prompt_number": 16 259 | }, 260 | { 261 | "cell_type": "code", 262 | "collapsed": false, 263 | "input": [ 264 | "db.commit()" 265 | ], 266 | "language": "python", 267 | "metadata": {}, 268 | "outputs": [], 269 | "prompt_number": 17 270 | } 271 | ], 272 | "metadata": {} 273 | } 274 | ] 275 | } -------------------------------------------------------------------------------- /notebooks/02 WHO Country-Level Flu Data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 
| "metadata": { 3 | "name": "", 4 | "signature": "sha256:7bca72e089995043bc31f094a29934ac96b1a43fa92cca8e9caa02736dbb9c7a" 5 | }, 6 | "nbformat": 3, 7 | "nbformat_minor": 0, 8 | "worksheets": [ 9 | { 10 | "cells": [ 11 | { 12 | "cell_type": "heading", 13 | "level": 1, 14 | "metadata": {}, 15 | "source": [ 16 | "Loading Simple Delimited Data" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "There are lots of tools to load delimited data in to a database for sharing. But very often, the delimited data we're given doesn't fit the form of the tables we'd like to make. In our case, the [WHO](http://www.who.int/influenza/gisrs_laboratory/flunet/en/) makes available country-level data for influenza surveillance: what strain, how many samples, and so on for every week of the year. The data is free to download, but just a raw dump of delimited strings.\n", 24 | "\n", 25 | "In this exercise, we'll use basic python and the cx-Oracle to clean up the data and form it into useful tables. To begin, let's import cx-Oracle, which we'll use to communicate with our database, and a few standard python libraries." 26 | ] 27 | }, 28 | { 29 | "cell_type": "heading", 30 | "level": 2, 31 | "metadata": {}, 32 | "source": [ 33 | "Munging and Pivoting Data" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "collapsed": false, 39 | "input": [ 40 | "import sys\n", 41 | "import cx_Oracle\n", 42 | "import re\n", 43 | "import string\n", 44 | "from glob import glob" 45 | ], 46 | "language": "python", 47 | "metadata": {}, 48 | "outputs": [], 49 | "prompt_number": 10 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "First thing's first: we need a method that will parse one of the `.psv` files which our WHO data comes in. Take a minute to look at `WHO+EURO.psv`. How many potential tables do you see in the data?\n", 56 | "\n", 57 | "Unfortunately, lines in the files aren't all the same length. Some rows have 1 delimiter, some have more than 50. In our method, we've decided that the split between tables occurs where the rows have 2 \"columns.\" Can you see why?\n", 58 | "\n", 59 | "We're going to break our file up into logical chunks, and each chunk will become a table. For each of these chunks, we'll need to get the name of the table and the data for the table. We'll also extract the region of the world this data belongs to from the filename.\n", 60 | "\n", 61 | "Finding the title and the dataset inside each chunk is fairly simple, but we'll need to do more to create well-formed data we can share with a database. For this, we'll define a few functions: one for creating the table name, the other for formatting rows. Then, for each logical chunk, we'll add it to a list of datasets to be inserted into the database." 
62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "collapsed": false, 67 | "input": [ 68 | "def parseWHOCountryFile(filename):\n", 69 | " region = re.sub(\"WHO+\", \"\",filename)\n", 70 | " region = re.sub(\".psv\", \"\", region)\n", 71 | " region = re.sub(\"\\+\",\"\", region)\n", 72 | " region = region.split(\"/\")[-1]\n", 73 | " raw = open(filename).readlines()\n", 74 | " big_splits = []\n", 75 | " for i in range(len(raw)):\n", 76 | " if len(raw[i].split(\"|\")) == 2:\n", 77 | " big_splits.append(i)\n", 78 | " datasets = []\n", 79 | " for i in range(0,len(big_splits),2):\n", 80 | " ds = {}\n", 81 | " ds['title'] = raw[big_splits[i]]\n", 82 | " ds['title'] = make_tablename(ds['title'])\n", 83 | " ds['period'] = raw[big_splits[i+1]]\n", 84 | " if i < len(big_splits)-2:\n", 85 | " ds['starts'] = big_splits[i+1]+1\n", 86 | " ds['ends'] = big_splits[i+2]-1\n", 87 | " else:\n", 88 | " ds['starts'] = big_splits[i+1]+1\n", 89 | " ds['ends'] = len(raw)-1\n", 90 | " ds['data'] = raw[ds['starts']:ds['ends']]\n", 91 | " ds['data'] = makerows(region, ds['data'])\n", 92 | " datasets.append(ds)\n", 93 | " return datasets" 94 | ], 95 | "language": "python", 96 | "metadata": {}, 97 | "outputs": [], 98 | "prompt_number": 5 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": {}, 103 | "source": [ 104 | "Making the name for the table is pretty easy. We know we can't have special characters like `(` or `)` in our table name. We also can't have spaces. So, with a little chopping up of the string and a few calls to the re module, we've got a clean table name." 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "collapsed": false, 110 | "input": [ 111 | "def make_tablename(d):\n", 112 | " title_string = d.split(\"|\")[0]\n", 113 | " if \"specimen\" in title_string:\n", 114 | " title = title_string.split(\"->\")[-1].strip()+\"_Specimens\"\n", 115 | " if \"processed\" in title_string:\n", 116 | " title += \"_proc\"\n", 117 | " else:\n", 118 | " title += \"_recv\"\n", 119 | " else:\n", 120 | " title = title_string.split(\"->\")[-1].strip()+\"_Infections\"\n", 121 | " title = re.sub(\"\\(\", \"\", title)\n", 122 | " title = re.sub(\"\\)\", \"\", title)\n", 123 | " title = re.sub(\" \",\"_\", title)\n", 124 | " return title[:25]" 125 | ], 126 | "language": "python", 127 | "metadata": {}, 128 | "outputs": [], 129 | "prompt_number": 3 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "metadata": {}, 134 | "source": [ 135 | "Separating the lines of data into rows and columns for our database insert is easy: everything is delimited by the `|` character. However, we'd like to have a table of tuples like this (region, country, year, week, measurement) and instead we've got all the measures for every week of the year on a single line. Fortunately, this sort of *en masse* string manipulation is easy with python's map function. By defining an inner function and using the map operator, we can quickly pivot all the weeks into the 4-tuple for our database table." 
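A toy version of that pivot, with made-up numbers, shows how one wide row becomes one narrow (region, country, year, week, value) tuple per week:

```python
dates = [["2013", "1"], ["2013", "2"]]   # (year, week) pairs taken from the header row
row = ["Austria", "5", "9"]              # country followed by one value per week
pivoted = [("EURO", row[0], int(dates[i - 1][0]), int(dates[i - 1][1]), int(row[i]))
           for i in range(1, len(row))]
print pivoted
# [('EURO', 'Austria', 2013, 1, 5), ('EURO', 'Austria', 2013, 2, 9)]
```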
136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "collapsed": false, 141 | "input": [ 142 | "def makerows(region_code, data):\n", 143 | " dateprefix = data[0]\n", 144 | " dates = map(lambda x: x.split(), dateprefix.strip().split(\"|\")[1:])\n", 145 | " dates = map(lambda x: [x[0], x[-1]], dates)\n", 146 | " data = map(lambda x: x.strip().split(\"|\"), data[1:])\n", 147 | " # what we're really doing here is pivoting the data so that we can have country, year, week, value\n", 148 | " # for each row of raw data, we want a list of tuple (country, year, week, value)\n", 149 | " def row_to_tuple(dates, r):\n", 150 | " t = []\n", 151 | " for i in range(1,len(r)):\n", 152 | " t.append((region_code, r[0], int(dates[i-1][0]), int(dates[i-1][1]), int(re.sub(\",\",\"\",r[i]))))\n", 153 | " return t\n", 154 | " data = map(lambda x: row_to_tuple(dates, x), data)\n", 155 | " return data" 156 | ], 157 | "language": "python", 158 | "metadata": {}, 159 | "outputs": [], 160 | "prompt_number": 4 161 | }, 162 | { 163 | "cell_type": "heading", 164 | "level": 2, 165 | "metadata": {}, 166 | "source": [ 167 | "Writing Tables and Views" 168 | ] 169 | }, 170 | { 171 | "cell_type": "markdown", 172 | "metadata": {}, 173 | "source": [ 174 | "We now have a list of datasets, each with a table name and a set of 4-tuples. In order to write this to the database, we'll need a method which does bulk inserts into Oracle database. Our *write_to_db* function does just that. Notice that because we don't know the name of the table, we use string substitution to automatically create a table for each of the logical chunks in a `psv` file." 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "collapsed": false, 180 | "input": [ 181 | "def write_to_db(db, dataset):\n", 182 | " dt = \"DROP TABLE %s\" % dataset['title']\n", 183 | " cursor = db.cursor()\n", 184 | " ctine = \"\"\"CREATE TABLE %s (\n", 185 | "\t region VARCHAR(12),\n", 186 | "\t country VARCHAR(500),\n", 187 | "\t year NUMBER,\n", 188 | "\t week NUMBER,\n", 189 | "\t measure NUMBER,\n", 190 | " CONSTRAINT %s_id PRIMARY KEY (country, year, week)\n", 191 | " )\"\"\" % (dataset['title'], dataset['title'])\n", 192 | " try:\n", 193 | " cursor.execute(ctine)\n", 194 | " except Exception as e:\n", 195 | " print e\n", 196 | " print \"failed to create table\", dataset['title']\n", 197 | " pass\n", 198 | " for country in dataset['data']:\n", 199 | " try:\n", 200 | " cursor.prepare(\"INSERT INTO %s (region, country, year, week, measure) VALUES (:1, :2, :3, :4, :5)\" % dataset['title'])\n", 201 | " cursor.executemany(None, country)\n", 202 | " db.commit()\n", 203 | " except Exception as e:\n", 204 | " print e\n", 205 | " print country[0]" 206 | ], 207 | "language": "python", 208 | "metadata": {}, 209 | "outputs": [], 210 | "prompt_number": 2 211 | }, 212 | { 213 | "cell_type": "markdown", 214 | "metadata": {}, 215 | "source": [ 216 | "While the *write_to_db* method will rapidly insert a dataset into the database, the tables it creates only show us one measure at a time. When we think about the flu, we would like to look at measures for different strains side-by-side. More importantly, when we expose that data to external tools, we'd like to present a single dataset. Fortunately, database *views* make it easy to \"publish\" a particular query for others to quickly access.\n", 217 | "\n", 218 | "The view we want needs to do the following: for each instance of (country, year, week), produce all the measurements from the tables we created from the raw data. 
That means our SQL will need to:\n", 219 | "\n", 220 | "* SELECT region, country, year, and week from **one** table\n", 221 | "* SELECT the measurement from **each** table with a new column name (say, the table name)\n", 222 | "* JOIN the tables together so that there is one row with all measures for each (region, country, year, week) tuple.\n", 223 | "\n", 224 | "Can you write the SQL query yourself? Can you see how the *build_view* method assembles the query automatically?" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "collapsed": false, 230 | "input": [ 231 | "def build_view(datasets):\n", 232 | " view = \"\"\"CREATE OR REPLACE VIEW flu_statistics AS\n", 233 | " SELECT a.region, a.country, a.year, a.week, \\n\"\"\"\n", 234 | " from_clause = \" from \\n\"\n", 235 | " where_clause = \" where \\n\"\n", 236 | " column_creation = \"\"\n", 237 | " table_creation = \"\"\n", 238 | " join_creation = \"\"\n", 239 | " for i in range(len(datasets)):\n", 240 | " column_creation += \"{0}.measure as {1}\".format(string.ascii_lowercase[i], datasets[i]['title'])\n", 241 | " \n", 242 | " table_creation += \"{0} {1}\".format(datasets[i]['title'], string.ascii_lowercase[i])\n", 243 | " \n", 244 | " if (i < len(datasets)-1):\n", 245 | " column_creation += \",\\n\"\n", 246 | " table_creation += \",\\n\"\n", 247 | " join_creation += \"{1}.country = {0}.country and {1}.year = {0}.year and {1}.week = {0}.week \\n\".format(string.ascii_lowercase[i], string.ascii_lowercase[i+1])\n", 248 | " if (i < len(datasets)-2):\n", 249 | " join_creation += \"and\\n\"\n", 250 | " else:\n", 251 | " column_creation += \"\\n\"\n", 252 | " table_creation += \"\\n\"\n", 253 | " \n", 254 | "\n", 255 | " view += column_creation + from_clause + table_creation + where_clause + join_creation\n", 256 | " return view" 257 | ], 258 | "language": "python", 259 | "metadata": {}, 260 | "outputs": [], 261 | "prompt_number": 6 262 | }, 263 | { 264 | "cell_type": "markdown", 265 | "metadata": {}, 266 | "source": [ 267 | "With our methods complete, we can finally list the files and get to processing. The glob function allows us a wildcard search of the flu statistics data we downloaded." 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "collapsed": false, 273 | "input": [ 274 | "files = glob(\"../flu_statistics/data/*.psv\")\n", 275 | "print files\n", 276 | "files = files[:-1]" 277 | ], 278 | "language": "python", 279 | "metadata": {}, 280 | "outputs": [ 281 | { 282 | "output_type": "stream", 283 | "stream": "stdout", 284 | "text": [ 285 | "['/home/oracle/odsb2014/flu_statistics/data/WHO+WEST+ASIA.psv', '/home/oracle/odsb2014/flu_statistics/data/WHO+EURO.psv', '/home/oracle/odsb2014/flu_statistics/data/WHO+PAHO.psv', '/home/oracle/odsb2014/flu_statistics/data/WHO+AFRO.psv', '/home/oracle/odsb2014/flu_statistics/data/WHO+US+comprehensive.psv']\n" 286 | ] 287 | } 288 | ], 289 | "prompt_number": 9 290 | }, 291 | { 292 | "cell_type": "markdown", 293 | "metadata": {}, 294 | "source": [ 295 | "And a simple for-loop will load the data into our database and create a view to share with our teammembers." 
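If you want to see the shape of the generated statement before creating the view, note that `build_view` only reads the `title` key of each dataset, so a couple of fake titles are enough. This sketch assumes the `build_view` cell above has been run; the table names are made up:

```python
fake = [{'title': 'A_H1_INFECTIONS'}, {'title': 'A_H3_INFECTIONS'}]
print build_view(fake)
# Expect roughly:
#   CREATE OR REPLACE VIEW flu_statistics AS
#   SELECT a.region, a.country, a.year, a.week,
#   a.measure as A_H1_INFECTIONS,
#   b.measure as A_H3_INFECTIONS
#   from
#   A_H1_INFECTIONS a,
#   A_H3_INFECTIONS b
#   where
#   b.country = a.country and b.year = a.year and b.week = a.week
```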
296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "collapsed": false, 301 | "input": [ 302 | "db = cx_Oracle.connect('fludb', 'flushot', 'localhost:1521/orcl')\n", 303 | "print \"parsing datasets...\"\n", 304 | "for filename in files:\n", 305 | " datasets = parseWHOCountryFile(filename)\n", 306 | " print \"writing to DB...\"\n", 307 | " for dataset in datasets:\n", 308 | " write_to_db(db, dataset)\n", 309 | " print \"creating view...\"\n", 310 | " c = db.cursor()\n", 311 | " c.execute(build_view(datasets))" 312 | ], 313 | "language": "python", 314 | "metadata": {}, 315 | "outputs": [] 316 | } 317 | ], 318 | "metadata": {} 319 | } 320 | ] 321 | } -------------------------------------------------------------------------------- /notebooks/03 HHS_Flu_Vaccination_Data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "", 4 | "signature": "sha256:39723e6a76e2b17dd6babf6931598ccb7c0d08ec6fea51f65e4bf875ab75348a" 5 | }, 6 | "nbformat": 3, 7 | "nbformat_minor": 0, 8 | "worksheets": [ 9 | { 10 | "cells": [ 11 | { 12 | "cell_type": "heading", 13 | "level": 1, 14 | "metadata": {}, 15 | "source": [ 16 | "Loading HHS Flu Vaccination JSON Data" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "Sometimes data is already in a handy document store for us. JSON (JavaScript Object Notation) data is in the form of a list of objects composed of a set of key-value pairs. \n", 24 | "\n", 25 | "In this exercise, we'll use basic python and the cx-Oracle to pull Flu Vaccination Data from HHS.gov and store it in a table in our database. The data and API are provided by the [US HHS](http://flu-vaccination-map.hhs.gov/). Since the data we want is already in JSON format, we won't need to do much parsing, we'll mostly just \"dump\" it into the database. As usual, we begin by importing the libraries we'll need." 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "collapsed": false, 31 | "input": [ 32 | "import sys\n", 33 | "import cx_Oracle\n", 34 | "import pprint\n", 35 | "import re\n", 36 | "import string\n", 37 | "import json\n", 38 | "import urllib2" 39 | ], 40 | "language": "python", 41 | "metadata": {}, 42 | "outputs": [] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "First thing we'll need to do is write a quick helper function to \"jsonify\" our data. This will turn our data into separate JSON strings and return them as a list of tuples. " 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "collapsed": false, 54 | "input": [ 55 | "def jsonify_data(d):\n", 56 | " return(json.dumps(d),)" 57 | ], 58 | "language": "python", 59 | "metadata": {}, 60 | "outputs": [] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "First, we'll need to connect to the database and make a table to store the data in. Because the data we're collecting is JSON, we can save time by assigning the whole document to a single column. For this exercise we'll just need one table, call it flu_shot_json with, one column, call it doc. We'll pull the 'results' from our data, jsonify it, then insert it into our table in the doc column." 
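Before creating the table, it can help to see exactly what *jsonify_data* will hand to the database driver. A quick sketch (the record shown is a made-up fragment of one HHS result):

```python
# Each record becomes a one-element tuple holding its JSON text -- the shape
# cursor.executemany expects when the INSERT has a single bind variable (:1).
record = {"short_name": "VA"}      # made-up fragment of a result record
print jsonify_data(record)
# ('{"short_name": "VA"}',)
```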
67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "collapsed": false, 72 | "input": [ 73 | "db = cx_Oracle.connect('fludb', 'flushot', 'localhost:1521/orcl')\n", 74 | "drop_table = 'drop table flu_shot_json'\n", 75 | "ddl = 'create table flu_shot_json (doc varchar2(4000), CONSTRAINT \"ENSURE_JSON\" CHECK (doc IS JSON))'\n", 76 | "cursor = db.cursor()\n", 77 | "try:\n", 78 | " cursor.execute(drop_table)\n", 79 | "except:\n", 80 | " pass\n", 81 | "cursor.execute(ddl)\n", 82 | "cursor.close()" 83 | ], 84 | "language": "python", 85 | "metadata": {}, 86 | "outputs": [] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "That's pretty much all the setup we need to do, so now we'll go ahead and create a `write to db` function. \n", 93 | "\n", 94 | "As with most database operations, we need a cursor. Don't forget to commit the inserts after they've executed!" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "collapsed": false, 100 | "input": [ 101 | "def write_to_db(db, data):\n", 102 | " cursor = db.cursor()\n", 103 | " try:\n", 104 | " cursor.prepare(\"INSERT INTO flu_shot_json(doc) VALUES (:1)\")\n", 105 | " cursor.executemany(None, map(jsonify_data, data['results']))\n", 106 | " db.commit()\n", 107 | " except Exception as e:\n", 108 | " print e" 109 | ], 110 | "language": "python", 111 | "metadata": {}, 112 | "outputs": [] 113 | }, 114 | { 115 | "cell_type": "markdown", 116 | "metadata": {}, 117 | "source": [ 118 | "Now all we need to do is pull the data from HHS.gov and write it to the db. There are a number of ethnicities, so we need to collect all of them." 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "collapsed": false, 124 | "input": [ 125 | "print \"parsing dataset...\"\n", 126 | "for eth in [\"T\",\"A\",\"W\",\"B\",\"H\"]:\n", 127 | " url = \"http://flu-vaccination-map.hhs.gov/api/v1/states.json?ethnicity=\"+eth+\"{&year=lte:2014}\"\n", 128 | " data = json.load(urllib2.urlopen(url))\n", 129 | " print \"writing to DB...\"\n", 130 | " write_to_db(db, data)" 131 | ], 132 | "language": "python", 133 | "metadata": {}, 134 | "outputs": [] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": {}, 139 | "source": [ 140 | "Finally, we'll make a database view on the data so that we don't have to write JSON access paths to get at individual fields." 
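For comparison, this is the kind of dot-notation path expression we would have to repeat in every query without the view. A sketch, relying on the `ENSURE_JSON` constraint we placed on the table above (Oracle 12c only allows this shorthand on columns it knows contain JSON):

```python
# Without a view, each query spells out a JSON path for every field it needs.
cursor = db.cursor()
cursor.execute("""SELECT j.doc.short_name, j.doc.percentage
                  FROM flu_shot_json j
                  WHERE j.doc.ethnicity = 'T'""")
print cursor.fetchmany(5)
cursor.close()
```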
141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "collapsed": false, 146 | "input": [ 147 | "view_ddl = \"\"\"CREATE OR REPLACE VIEW FLUSHOTS \n", 148 | "AS SELECT\n", 149 | "CAST(j.doc.count AS NUMBER) eligible,\n", 150 | "CAST(j.doc.week AS NUMBER) week,\n", 151 | "CAST(j.doc.name AS VARCHAR2(20)) state_name,\n", 152 | "CAST(j.doc.short_name AS VARCHAR2(2)) state,\n", 153 | "CAST(j.doc.fips_id\tAS NUMBER) fips_id,\n", 154 | "CAST(j.doc.disparity as VARCHAR2(20)) disparity,\n", 155 | "CAST(j.doc.medicare_status as VARCHAR2(20)) medicare_status,\n", 156 | "CAST(j.doc.year as NUMBER) year,\n", 157 | "CAST(j.doc.percentage AS NUMBER) percentage_claimed,\n", 158 | "CAST(j.doc.ethnicity AS VARCHAR2(20)) ethnicity\n", 159 | "FROM flu_shot_json j;\"\"\"\n", 160 | "cursor = db.cursor()\n", 161 | "cursor.execute(view_ddl)\n", 162 | "cursor.close()" 163 | ], 164 | "language": "python", 165 | "metadata": {}, 166 | "outputs": [] 167 | } 168 | ], 169 | "metadata": {} 170 | } 171 | ] 172 | } -------------------------------------------------------------------------------- /notebooks/04 OIE_Pathogenic_Flu.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "", 4 | "signature": "sha256:f589cc76360d648a1ea77317abc619ebb9e34ba612bf4545f47efc777dc82060" 5 | }, 6 | "nbformat": 3, 7 | "nbformat_minor": 0, 8 | "worksheets": [ 9 | { 10 | "cells": [ 11 | { 12 | "cell_type": "heading", 13 | "level": 1, 14 | "metadata": {}, 15 | "source": [ 16 | "Gathering OIE Pathogenic Flu Data: Scraping the Web" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "Sometimes the data we want to analyze is available on the web, but isn't as conveniently accessed via an API or direct download. In some cases, the data is embedded in web pages and needs to be scraped out. In this exercise, we'll collect data on pathogenic strains of influenza in animals, provided by the [World Organization for Animal Health](http://www.oie.int/animal-health-in-the-world/update-on-avian-influenza/2013/).\n", 24 | "\n", 25 | "To get this data we'll use a number of pythonic tools\n", 26 | "\n", 27 | "- Requests: a library which simplifies making web requests\n", 28 | "- Beautiful Soup: a library designed to parse and extract information from HTML pages\n", 29 | "- cx-Oracle: our standard library for making bulk inserts into Oracle 12c\n", 30 | "- ipython-sql: an iPython extension that allows us to write SQL directly in our notebook " 31 | ] 32 | }, 33 | { 34 | "cell_type": "heading", 35 | "level": 2, 36 | "metadata": {}, 37 | "source": [ 38 | "Exploring Web Data" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "collapsed": false, 44 | "input": [ 45 | "import requests\n", 46 | "from bs4 import BeautifulSoup\n", 47 | "import cx_Oracle\n", 48 | "import datetime" 49 | ], 50 | "language": "python", 51 | "metadata": {}, 52 | "outputs": [], 53 | "prompt_number": 94 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": {}, 58 | "source": [ 59 | "By looking at the OIE site, we can tell there are reports for the years 2004-2014, all with the same base URL. Since we want to create a table of all data for all years, let's keep the base URL and year in a pair of variables." 
60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "collapsed": false, 65 | "input": [ 66 | "base_year = 2004\n", 67 | "years = 10\n", 68 | "base_url = \"http://www.oie.int/animal-health-in-the-world/update-on-avian-influenza/\"" 69 | ], 70 | "language": "python", 71 | "metadata": {}, 72 | "outputs": [], 73 | "prompt_number": 95 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "The requests library will let us quickly get the report for 2004. However, if we look at the first few lines, we have a whole page of HTML, not just the table we want. How can we quickly get just the information in the table?" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "collapsed": false, 85 | "input": [ 86 | "report = requests.get(base_url+str(base_year))\n", 87 | "print report.text[:1000]" 88 | ], 89 | "language": "python", 90 | "metadata": {}, 91 | "outputs": [ 92 | { 93 | "output_type": "stream", 94 | "stream": "stdout", 95 | "text": [ 96 | "\n", 99 | "\n", 100 | "\n", 101 | "\t\n", 102 | "\n", 103 | "\n", 109 | "\n", 110 | "\t\n", 111 | "\t\n", 112 | "\t\n", 113 | "\t\n", 114 | "\t` elements to find out what the column names are. We may not use these names, but it's handy to know what data to expect." 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "collapsed": false, 175 | "input": [ 176 | "columns = map(lambda x: x.text, flu_table.find_all(\"th\"))\n", 177 | "print columns" 178 | ], 179 | "language": "python", 180 | "metadata": {}, 181 | "outputs": [ 182 | { 183 | "output_type": "stream", 184 | "stream": "stdout", 185 | "text": [ 186 | "[u'Location', u'Virus Type', u'Date', u'Link']\n" 187 | ] 188 | } 189 | ], 190 | "prompt_number": 98 191 | }, 192 | { 193 | "cell_type": "markdown", 194 | "metadata": {}, 195 | "source": [ 196 | "What we *really* want are the rows with data. They're marked with `` tags, so they're easy to find. We should exclude the first row, since it only contains the column names." 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "collapsed": false, 202 | "input": [ 203 | "row_tags = flu_table.find_all(\"tr\")[1:]\n", 204 | "row_tags[0].find_all(\"td\")" 205 | ], 206 | "language": "python", 207 | "metadata": {}, 208 | "outputs": [ 209 | { 210 | "metadata": {}, 211 | "output_type": "pyout", 212 | "prompt_number": 99, 213 | "text": [ 214 | "[Vietnam,\n", 215 | " H5N1,\n", 216 | " 08/01/04,\n", 217 | " Emergency report\u00a0\u00a02053]" 218 | ] 219 | } 220 | ], 221 | "prompt_number": 99 222 | }, 223 | { 224 | "cell_type": "heading", 225 | "level": 2, 226 | "metadata": {}, 227 | "source": [ 228 | "Fetching and Storing Web Data" 229 | ] 230 | }, 231 | { 232 | "cell_type": "markdown", 233 | "metadata": {}, 234 | "source": [ 235 | "Now that we understand how to find the row data in each OIE report, we'll need a function to convert the contents of a `` tag into a tuple we can store in a database. The *make_db_row* function will do this. In the function we need to do several things, including\n", 236 | "\n", 237 | "* extracting the `href` field from the link field\n", 238 | "* converting the date field into a python date object\n", 239 | "* dealing with inconsistencies in the data\n", 240 | "\n", 241 | "Look at the series of OIE reports. Does the structure change over time? How does our method need to adapt to those changes?" 
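One way to answer those questions is to peek at a later report with the same calls we used above. This sketch assumes the 2014 page still responds the way it did when the workshop was written:

```python
# Compare a later report's column headers against the 2004 layout.
report_2014 = requests.get(base_url + "2014")
table_2014 = BeautifulSoup(report_2014.text).table
print map(lambda x: x.text, table_2014.find_all("th"))
```

Newer reports carry an extra year column, which is why *make_db_row* below simply drops it whenever a row arrives with more than four cells.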
242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "collapsed": false, 247 | "input": [ 248 | "def make_db_row(r):\n", 249 | " #newer reports have an extra year column\n", 250 | " if len(r) > 4:\n", 251 | " r.pop(2)\n", 252 | " \n", 253 | " r[-1]=r[-1].findChild()\n", 254 | " #try to get the report url\n", 255 | " url = None\n", 256 | " try:\n", 257 | " url = r[-1]['href']\n", 258 | " except:\n", 259 | " pass\n", 260 | " row_text = map(lambda x: x.text.encode('ascii', 'ignore'), r)+[url]\n", 261 | " try:\n", 262 | " row_text[2] = datetime.datetime.strptime(row_text[2], \"%d/%m/%y\").date()\n", 263 | " except Exception as e:\n", 264 | " print r, e\n", 265 | " return tuple(row_text)" 266 | ], 267 | "language": "python", 268 | "metadata": {}, 269 | "outputs": [], 270 | "prompt_number": 121 271 | }, 272 | { 273 | "cell_type": "markdown", 274 | "metadata": {}, 275 | "source": [ 276 | "Let's test our method by passing the 2004 HTML data into it. We should get back a list of tuples." 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "collapsed": false, 282 | "input": [ 283 | "data_to_insert = map(lambda x: make_db_row(x.find_all(\"td\")), row_tags)\n", 284 | "data_to_insert[:5]" 285 | ], 286 | "language": "python", 287 | "metadata": {}, 288 | "outputs": [ 289 | { 290 | "metadata": {}, 291 | "output_type": "pyout", 292 | "prompt_number": 101, 293 | "text": [ 294 | "[('Vietnam',\n", 295 | " 'H5N1',\n", 296 | " datetime.date(2004, 1, 8),\n", 297 | " 'Emergency report',\n", 298 | " 'ftp://ftp.oie.int/infos_san_archives/eng/2004/en_040109v17n02.pdf'),\n", 299 | " ('Japan',\n", 300 | " 'H5N1',\n", 301 | " datetime.date(2004, 1, 12),\n", 302 | " 'Emergency report',\n", 303 | " 'ftp://ftp.oie.int/infos_san_archives/eng/2004/en_040116v17n03.pdf'),\n", 304 | " ('Japan',\n", 305 | " 'H5N1',\n", 306 | " datetime.date(2004, 1, 13),\n", 307 | " 'Follow up report No.1',\n", 308 | " 'ftp://ftp.oie.int/infos_san_archives/eng/2004/en_040116v17n03.pdf'),\n", 309 | " ('Chinese Taipei',\n", 310 | " 'H5N2',\n", 311 | " datetime.date(2004, 1, 20),\n", 312 | " 'Emergency report',\n", 313 | " 'ftp://ftp.oie.int/infos_san_archives/eng/2004/en_040123v17n04.pdf'),\n", 314 | " ('Japan',\n", 315 | " 'H5N1',\n", 316 | " datetime.date(2004, 1, 20),\n", 317 | " 'Follow up report No.2',\n", 318 | " 'ftp://ftp.oie.int/infos_san_archives/eng/2004/en_040123v17n04.pdf')]" 319 | ] 320 | } 321 | ], 322 | "prompt_number": 101 323 | }, 324 | { 325 | "cell_type": "markdown", 326 | "metadata": {}, 327 | "source": [ 328 | "Now that we know our method works, all we need to do is step through the years and build up our dataset." 
329 | ] 330 | }, 331 | { 332 | "cell_type": "code", 333 | "collapsed": false, 334 | "input": [ 335 | "for i in range(1,years+1):\n", 336 | " print base_year+i\n", 337 | " report = requests.get(base_url+str(base_year+i))\n", 338 | " searchable_report = BeautifulSoup(report.text)\n", 339 | " flu_table = searchable_report.table\n", 340 | " row_tags = flu_table.find_all(\"tr\")[1:]\n", 341 | " data_to_insert += map(lambda x: make_db_row(x.find_all(\"td\")), row_tags)\n", 342 | "print len(data_to_insert)" 343 | ], 344 | "language": "python", 345 | "metadata": {}, 346 | "outputs": [ 347 | { 348 | "output_type": "stream", 349 | "stream": "stdout", 350 | "text": [ 351 | "2005\n", 352 | "2006" 353 | ] 354 | }, 355 | { 356 | "output_type": "stream", 357 | "stream": "stdout", 358 | "text": [ 359 | "\n", 360 | "2007" 361 | ] 362 | }, 363 | { 364 | "output_type": "stream", 365 | "stream": "stdout", 366 | "text": [ 367 | "\n", 368 | "2008" 369 | ] 370 | }, 371 | { 372 | "output_type": "stream", 373 | "stream": "stdout", 374 | "text": [ 375 | "\n", 376 | "2009" 377 | ] 378 | }, 379 | { 380 | "output_type": "stream", 381 | "stream": "stdout", 382 | "text": [ 383 | "\n", 384 | "2010" 385 | ] 386 | }, 387 | { 388 | "output_type": "stream", 389 | "stream": "stdout", 390 | "text": [ 391 | "\n", 392 | "2011" 393 | ] 394 | }, 395 | { 396 | "output_type": "stream", 397 | "stream": "stdout", 398 | "text": [ 399 | "\n", 400 | "2012" 401 | ] 402 | }, 403 | { 404 | "output_type": "stream", 405 | "stream": "stdout", 406 | "text": [ 407 | "\n", 408 | "2013" 409 | ] 410 | }, 411 | { 412 | "output_type": "stream", 413 | "stream": "stdout", 414 | "text": [ 415 | "\n", 416 | "2014" 417 | ] 418 | }, 419 | { 420 | "output_type": "stream", 421 | "stream": "stdout", 422 | "text": [ 423 | "\n", 424 | "1350" 425 | ] 426 | }, 427 | { 428 | "output_type": "stream", 429 | "stream": "stdout", 430 | "text": [ 431 | "\n" 432 | ] 433 | } 434 | ], 435 | "prompt_number": 102 436 | }, 437 | { 438 | "cell_type": "markdown", 439 | "metadata": {}, 440 | "source": [ 441 | "Now that we have all the data, we can insert it into Oracle 12c just like our other data sets. First, we create the table." 442 | ] 443 | }, 444 | { 445 | "cell_type": "code", 446 | "collapsed": false, 447 | "input": [ 448 | "create_table = \"\"\"CREATE TABLE PATHOGENIC_FLU (\n", 449 | " INCIDENT_LOCATION VARCHAR2(100),\n", 450 | " INCIDENT_TYPE VARCHAR2(50),\n", 451 | " INCIDENT_DATE DATE,\n", 452 | " INCIDENT_REPORT VARCHAR2(100),\n", 453 | " REPORT_LINK VARCHAR2(200)\n", 454 | " )\n", 455 | " \"\"\"\n", 456 | "db = cx_Oracle.connect(\"fludb\", \"flushot\", \"localhost:1521/orcl\")\n", 457 | "cursor = db.cursor()\n", 458 | "cursor.execute(create_table)" 459 | ], 460 | "language": "python", 461 | "metadata": {}, 462 | "outputs": [], 463 | "prompt_number": 110 464 | }, 465 | { 466 | "cell_type": "markdown", 467 | "metadata": {}, 468 | "source": [ 469 | "Then we insert the rows." 
470 | ] 471 | }, 472 | { 473 | "cell_type": "code", 474 | "collapsed": false, 475 | "input": [ 476 | "cursor.prepare(\"\"\"INSERT INTO PATHOGENIC_FLU\n", 477 | " (INCIDENT_LOCATION,INCIDENT_TYPE,INCIDENT_DATE, INCIDENT_REPORT, REPORT_LINK)\n", 478 | " VALUES\n", 479 | " (:1, :2, :3, :4, :5)\n", 480 | " \"\"\")\n", 481 | "cursor.executemany(None, data_to_insert)\n", 482 | "db.commit()" 483 | ], 484 | "language": "python", 485 | "metadata": {}, 486 | "outputs": [], 487 | "prompt_number": 111 488 | }, 489 | { 490 | "cell_type": "markdown", 491 | "metadata": {}, 492 | "source": [ 493 | "Once our rows are inserted, we can use ipython-sql to connect to the database and query our new table directly." 494 | ] 495 | }, 496 | { 497 | "cell_type": "code", 498 | "collapsed": false, 499 | "input": [ 500 | "%load_ext sql" 501 | ], 502 | "language": "python", 503 | "metadata": {}, 504 | "outputs": [], 505 | "prompt_number": 114 506 | }, 507 | { 508 | "cell_type": "code", 509 | "collapsed": false, 510 | "input": [ 511 | "%sql oracle://fludb:flushot@localhost:1521/orcl" 512 | ], 513 | "language": "python", 514 | "metadata": {}, 515 | "outputs": [ 516 | { 517 | "metadata": {}, 518 | "output_type": "pyout", 519 | "prompt_number": 117, 520 | "text": [ 521 | "u'Connected: fludb@orcl'" 522 | ] 523 | } 524 | ], 525 | "prompt_number": 117 526 | }, 527 | { 528 | "cell_type": "code", 529 | "collapsed": false, 530 | "input": [ 531 | "%sql select * from pathogenic_flu where rownum < 5 order by incident_date" 532 | ], 533 | "language": "python", 534 | "metadata": {}, 535 | "outputs": [ 536 | { 537 | "output_type": "stream", 538 | "stream": "stdout", 539 | "text": [ 540 | "0 rows affected.\n" 541 | ] 542 | }, 543 | { 544 | "html": [ 545 | "\n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | "
incident_location | incident_type | incident_date | incident_report | report_link
Vietnam | H5N1 | 2004-01-08 00:00:00 | Emergency report | ftp://ftp.oie.int/infos_san_archives/eng/2004/en_040109v17n02.pdf
Japan | H5N1 | 2004-01-12 00:00:00 | Emergency report | ftp://ftp.oie.int/infos_san_archives/eng/2004/en_040116v17n03.pdf
Japan | H5N1 | 2004-01-13 00:00:00 | Follow up report No.1 | ftp://ftp.oie.int/infos_san_archives/eng/2004/en_040116v17n03.pdf
Chinese Taipei | H5N2 | 2004-01-20 00:00:00 | Emergency report | ftp://ftp.oie.int/infos_san_archives/eng/2004/en_040123v17n04.pdf
" 582 | ], 583 | "metadata": {}, 584 | "output_type": "pyout", 585 | "prompt_number": 120, 586 | "text": [ 587 | "[('Vietnam', 'H5N1', datetime.datetime(2004, 1, 8, 0, 0), 'Emergency report', 'ftp://ftp.oie.int/infos_san_archives/eng/2004/en_040109v17n02.pdf'),\n", 588 | " ('Japan', 'H5N1', datetime.datetime(2004, 1, 12, 0, 0), 'Emergency report', 'ftp://ftp.oie.int/infos_san_archives/eng/2004/en_040116v17n03.pdf'),\n", 589 | " ('Japan', 'H5N1', datetime.datetime(2004, 1, 13, 0, 0), 'Follow up report No.1', 'ftp://ftp.oie.int/infos_san_archives/eng/2004/en_040116v17n03.pdf'),\n", 590 | " ('Chinese Taipei', 'H5N2', datetime.datetime(2004, 1, 20, 0, 0), 'Emergency report', 'ftp://ftp.oie.int/infos_san_archives/eng/2004/en_040123v17n04.pdf')]" 591 | ] 592 | } 593 | ], 594 | "prompt_number": 120 595 | }, 596 | { 597 | "cell_type": "code", 598 | "collapsed": false, 599 | "input": [], 600 | "language": "python", 601 | "metadata": {}, 602 | "outputs": [] 603 | } 604 | ], 605 | "metadata": {} 606 | } 607 | ] 608 | } -------------------------------------------------------------------------------- /notebooks/05 Collecting Web Data With Pandas.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "" 4 | }, 5 | "nbformat": 3, 6 | "nbformat_minor": 0, 7 | "worksheets": [ 8 | { 9 | "cells": [ 10 | { 11 | "cell_type": "heading", 12 | "level": 1, 13 | "metadata": {}, 14 | "source": [ 15 | "Collecting Web Data with Pandas" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "As we explore how flu interacts with the world around us, it would be useful to have data about countries' populations, GDP, and other indicators of economic and healthcare development. One place we might look is in the [World Bank's collection of development indicators](http://data.worldbank.org/data-catalog/world-development-indicators/).\n", 23 | "\n", 24 | "We've looked at a number of ways to collect data from the web, but sometimes useful data is included in the tools that we use. We could go directly to the World Bank site, download information and then parse it, but it's actually included in a tool we've been using: Pandas. Pandas includes [remote data access](http://pandas.pydata.org/pandas-docs/stable/remote_data.html) for quickly grabbing data about world development, finance, and web traffic.\n", 25 | "\n", 26 | "In this short workshop, we'll quickly collect data via Pandas and include it in our database.\n", 27 | "\n", 28 | "We start with what is becoming a common set of `import`s" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "collapsed": false, 34 | "input": [ 35 | "import numpy as np\n", 36 | "import pandas as pd\n", 37 | "import matplotlib as mpl\n", 38 | "import matplotlib.pyplot as plt\n", 39 | "%matplotlib inline\n", 40 | "%load_ext sql\n", 41 | "%sql oracle://fludb:flushot@localhost:1521/orcl" 42 | ], 43 | "language": "python", 44 | "metadata": {}, 45 | "outputs": [ 46 | { 47 | "metadata": {}, 48 | "output_type": "pyout", 49 | "prompt_number": 1, 50 | "text": [ 51 | "u'Connected: fludb@orcl'" 52 | ] 53 | } 54 | ], 55 | "prompt_number": 1 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": {}, 60 | "source": [ 61 | "Since we're looking for data about countries, we should start with the list of countries we have data about. They're easy to get using SQL." 
62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "collapsed": false, 67 | "input": [ 68 | "countries = %sql select distinct(country) from flu_statistics\n", 69 | "countries = countries.DataFrame()\n", 70 | "countries[:5]" 71 | ], 72 | "language": "python", 73 | "metadata": {}, 74 | "outputs": [ 75 | { 76 | "output_type": "stream", 77 | "stream": "stdout", 78 | "text": [ 79 | "0 rows affected.\n" 80 | ] 81 | }, 82 | { 83 | "html": [ 84 | "
\n", 85 | "\n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | "
  | country
0 | Ireland
1 | Algeria
2 | Brazil
3 | Lao People's Democratic Republic
4 | Croatia
\n", 115 | "
" 116 | ], 117 | "metadata": {}, 118 | "output_type": "pyout", 119 | "prompt_number": 2, 120 | "text": [ 121 | " country\n", 122 | "0 Ireland\n", 123 | "1 Algeria\n", 124 | "2 Brazil\n", 125 | "3 Lao People's Democratic Republic\n", 126 | "4 Croatia" 127 | ] 128 | } 129 | ], 130 | "prompt_number": 2 131 | }, 132 | { 133 | "cell_type": "markdown", 134 | "metadata": {}, 135 | "source": [ 136 | "We have the country names, but not the ISO codes by which they are most commonly identified. Let's start by using pandas to get the list of country codes from the world bank." 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "collapsed": false, 142 | "input": [ 143 | "from pandas.io import wb\n", 144 | "wb_country_codes = wb.get_countries()[['name', 'iso2c']]\n", 145 | "wb_country_codes[:5]" 146 | ], 147 | "language": "python", 148 | "metadata": {}, 149 | "outputs": [ 150 | { 151 | "html": [ 152 | "
\n", 153 | "\n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | "
  | name | iso2c
0 | Aruba | AW
1 | Afghanistan | AF
2 | Africa | A9
3 | Angola | AO
4 | Albania | AL
\n", 189 | "
" 190 | ], 191 | "metadata": {}, 192 | "output_type": "pyout", 193 | "prompt_number": 3, 194 | "text": [ 195 | " name iso2c\n", 196 | "0 Aruba AW\n", 197 | "1 Afghanistan AF\n", 198 | "2 Africa A9\n", 199 | "3 Angola AO\n", 200 | "4 Albania AL" 201 | ] 202 | } 203 | ], 204 | "prompt_number": 3 205 | }, 206 | { 207 | "cell_type": "markdown", 208 | "metadata": {}, 209 | "source": [ 210 | "In order to create a frame which has only the countries (and codes) that we have flu data about, we'll use pandas `merge` function. This is akin to writing a right join in SQL." 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "collapsed": false, 216 | "input": [ 217 | "flu_country_codes = pd.merge(countries, wb_country_codes, how=\"right\", left_on=\"country\", right_on=\"name\")[['country','iso2c']]\n", 218 | "flu_country_codes[:10]" 219 | ], 220 | "language": "python", 221 | "metadata": {}, 222 | "outputs": [ 223 | { 224 | "html": [ 225 | "
\n", 226 | "\n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | "
  | country | iso2c
0 | Ireland | IE
1 | Algeria | DZ
2 | Brazil | BR
3 | Croatia | HR
4 | Denmark | DK
5 | Kenya | KE
6 | South Africa | ZA
7 | Serbia | RS
8 | Poland | PL
9 | Bosnia and Herzegovina | BA
\n", 287 | "
" 288 | ], 289 | "metadata": {}, 290 | "output_type": "pyout", 291 | "prompt_number": 4, 292 | "text": [ 293 | " country iso2c\n", 294 | "0 Ireland IE\n", 295 | "1 Algeria DZ\n", 296 | "2 Brazil BR\n", 297 | "3 Croatia HR\n", 298 | "4 Denmark DK\n", 299 | "5 Kenya KE\n", 300 | "6 South Africa ZA\n", 301 | "7 Serbia RS\n", 302 | "8 Poland PL\n", 303 | "9 Bosnia and Herzegovina BA" 304 | ] 305 | } 306 | ], 307 | "prompt_number": 4 308 | }, 309 | { 310 | "cell_type": "heading", 311 | "level": 2, 312 | "metadata": {}, 313 | "source": [ 314 | "World Population" 315 | ] 316 | }, 317 | { 318 | "cell_type": "markdown", 319 | "metadata": {}, 320 | "source": [ 321 | "Now that we have our ISO codes, we can use pandas to get data on the world's population. We'll need this to normalize things like the number of flu cases in each country." 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "collapsed": false, 327 | "input": [ 328 | "world_population = wb.download(indicator='sp.pop.totl', country=flu_country_codes['iso2c'], start=2013, end=2013)\n", 329 | "world_population[:5]" 330 | ], 331 | "language": "python", 332 | "metadata": {}, 333 | "outputs": [ 334 | { 335 | "output_type": "stream", 336 | "stream": "stderr", 337 | "text": [ 338 | "/usr/lib64/python2.6/site-packages/pandas/io/wb.py:128: UserWarning: Non-standard ISO country codes: 1A, 1W, 4E, 7E, 8S, A4, A5, A9, B8, C4, C5, C6, C7, C8, C9, EU, F1, JG, KV, L4, L5, L6, L7, M2, OE, S1, S2, S3, S4, XC, XD, XE, XJ, XL, XM, XN, XO, XP, XQ, XR, XS, XT, XU, XY, Z4, Z7, ZF, ZG, ZJ, ZQ\n", 339 | " warnings.warn('Non-standard ISO country codes: %s' % tmp)\n" 340 | ] 341 | }, 342 | { 343 | "html": [ 344 | "
\n", 345 | "\n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | "
country | year | sp.pop.totl
Aruba | 2013 | 102911
Andorra | 2013 | 79218
Afghanistan | 2013 | 30551674
Angola | 2013 | 21471618
Albania | 2013 | 2773620
\n", 386 | "
" 387 | ], 388 | "metadata": {}, 389 | "output_type": "pyout", 390 | "prompt_number": 37, 391 | "text": [ 392 | " sp.pop.totl\n", 393 | "country year \n", 394 | "Aruba 2013 102911\n", 395 | "Andorra 2013 79218\n", 396 | "Afghanistan 2013 30551674\n", 397 | "Angola 2013 21471618\n", 398 | "Albania 2013 2773620" 399 | ] 400 | } 401 | ], 402 | "prompt_number": 37 403 | }, 404 | { 405 | "cell_type": "markdown", 406 | "metadata": {}, 407 | "source": [ 408 | "We might also care about the percentage of a country's population that lives in cities. There's a World Bank Development Indicator for that as well." 409 | ] 410 | }, 411 | { 412 | "cell_type": "code", 413 | "collapsed": false, 414 | "input": [ 415 | "urban_population = wb.download(indicator='SP.URB.TOTL.IN.ZS', country=flu_country_codes['iso2c'], start=2013, end=2013)\n", 416 | "urban_population[:5]" 417 | ], 418 | "language": "python", 419 | "metadata": {}, 420 | "outputs": [ 421 | { 422 | "html": [ 423 | "
\n", 424 | "\n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | "
country | year | SP.URB.TOTL.IN.ZS
Aruba | 2013 | 42.058
Andorra | 2013 | 86.165
Afghanistan | 2013 | 25.871
Angola | 2013 | 42.490
Albania | 2013 | 55.383
\n", 465 | "
" 466 | ], 467 | "metadata": {}, 468 | "output_type": "pyout", 469 | "prompt_number": 38, 470 | "text": [ 471 | " SP.URB.TOTL.IN.ZS\n", 472 | "country year \n", 473 | "Aruba 2013 42.058\n", 474 | "Andorra 2013 86.165\n", 475 | "Afghanistan 2013 25.871\n", 476 | "Angola 2013 42.490\n", 477 | "Albania 2013 55.383" 478 | ] 479 | } 480 | ], 481 | "prompt_number": 38 482 | }, 483 | { 484 | "cell_type": "markdown", 485 | "metadata": {}, 486 | "source": [ 487 | "If we're getting urban population, we should probably grab the percentage which is rural and the growth in population as well. Each of these is just a download away." 488 | ] 489 | }, 490 | { 491 | "cell_type": "code", 492 | "collapsed": false, 493 | "input": [ 494 | "rural_population = wb.download(indicator='SP.RUR.TOTL.ZS', country=flu_country_codes['iso2c'], start=2013, end=2013)\n", 495 | "population_growth = wb.download(indicator='SP.POP.GROW', country=flu_country_codes['iso2c'], start=2013, end=2013)" 496 | ], 497 | "language": "python", 498 | "metadata": {}, 499 | "outputs": [], 500 | "prompt_number": 39 501 | }, 502 | { 503 | "cell_type": "markdown", 504 | "metadata": {}, 505 | "source": [ 506 | "With all of this population-related data, it makes sense to put them together into a single DataFrame. We can use `pd.merge` again to do this; this time we're performing the equivalent of a left-join in SQL. The result is a single DataFrame with all of our data." 507 | ] 508 | }, 509 | { 510 | "cell_type": "code", 511 | "collapsed": false, 512 | "input": [ 513 | "population_frame = pd.merge(world_population, urban_population, how=\"left\", left_index=True, right_index=True)\n", 514 | "population_frame = pd.merge(population_frame, rural_population, how=\"left\", left_index=True, right_index=True)\n", 515 | "population_frame = pd.merge(population_frame, population_growth, how=\"left\", left_index=True, right_index=True)\n", 516 | "population_frame.columns = ['total_population', 'urban_pop_percent', 'rural_pop_percent', 'pop_grow']\n", 517 | "population_frame[:10]" 518 | ], 519 | "language": "python", 520 | "metadata": {}, 521 | "outputs": [ 522 | { 523 | "html": [ 524 | "
\n", 525 | "\n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | "
country | year | total_population | urban_pop_percent | rural_pop_percent | pop_grow
Aruba | 2013 | 102911 | 42.058000 | 57.942000 | 0.513409
Andorra | 2013 | 79218 | 86.165000 | 13.835000 | 1.088995
Afghanistan | 2013 | 30551674 | 25.871000 | 74.129000 | 2.408807
Angola | 2013 | 21471618 | 42.490000 | 57.510000 | 3.079269
Albania | 2013 | 2773620 | 55.383000 | 44.617000 | -1.006627
Arab World | 2013 | 369761523 | 57.339136 | 42.660864 | 2.012570
United Arab Emirates | 2013 | 9346129 | 84.981000 | 15.019000 | 1.514471
Argentina | 2013 | 41446246 | 91.452000 | 8.548000 | 0.870732
Armenia | 2013 | 2976566 | 62.975000 | 37.025000 | 0.251781
American Samoa | 2013 | 55165 | 87.334000 | 12.666000 | 0.067094
\n", 627 | "
" 628 | ], 629 | "metadata": {}, 630 | "output_type": "pyout", 631 | "prompt_number": 46, 632 | "text": [ 633 | " total_population urban_pop_percent \\\n", 634 | "country year \n", 635 | "Aruba 2013 102911 42.058000 \n", 636 | "Andorra 2013 79218 86.165000 \n", 637 | "Afghanistan 2013 30551674 25.871000 \n", 638 | "Angola 2013 21471618 42.490000 \n", 639 | "Albania 2013 2773620 55.383000 \n", 640 | "Arab World 2013 369761523 57.339136 \n", 641 | "United Arab Emirates 2013 9346129 84.981000 \n", 642 | "Argentina 2013 41446246 91.452000 \n", 643 | "Armenia 2013 2976566 62.975000 \n", 644 | "American Samoa 2013 55165 87.334000 \n", 645 | "\n", 646 | " rural_pop_percent pop_grow \n", 647 | "country year \n", 648 | "Aruba 2013 57.942000 0.513409 \n", 649 | "Andorra 2013 13.835000 1.088995 \n", 650 | "Afghanistan 2013 74.129000 2.408807 \n", 651 | "Angola 2013 57.510000 3.079269 \n", 652 | "Albania 2013 44.617000 -1.006627 \n", 653 | "Arab World 2013 42.660864 2.012570 \n", 654 | "United Arab Emirates 2013 15.019000 1.514471 \n", 655 | "Argentina 2013 8.548000 0.870732 \n", 656 | "Armenia 2013 37.025000 0.251781 \n", 657 | "American Samoa 2013 12.666000 0.067094 " 658 | ] 659 | } 660 | ], 661 | "prompt_number": 46 662 | }, 663 | { 664 | "cell_type": "markdown", 665 | "metadata": {}, 666 | "source": [ 667 | "We'd like to put this information in our database, both to avoid re-downloading it and to enable SQL access over it. To do that quickly, we'll transform the frame into delimite data, then use ipython-sql to insert the data into our database." 668 | ] 669 | }, 670 | { 671 | "cell_type": "code", 672 | "collapsed": false, 673 | "input": [ 674 | "from StringIO import StringIO" 675 | ], 676 | "language": "python", 677 | "metadata": {}, 678 | "outputs": [], 679 | "prompt_number": 5 680 | }, 681 | { 682 | "cell_type": "code", 683 | "collapsed": false, 684 | "input": [ 685 | "output = StringIO()\n", 686 | "population_frame.to_csv(output, sep=\"|\")\n", 687 | "\n", 688 | "population_data = output.getvalue().split(\"\\n\")[1:]\n", 689 | "output.close()\n", 690 | "population_data[:5]" 691 | ], 692 | "language": "python", 693 | "metadata": {}, 694 | "outputs": [ 695 | { 696 | "ename": "NameError", 697 | "evalue": "name 'population_frame' is not defined", 698 | "output_type": "pyerr", 699 | "traceback": [ 700 | "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)", 701 | "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0mStringIO\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mStringIO\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2\u001b[0m \u001b[0moutput\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mStringIO\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 3\u001b[1;33m \u001b[0mpopulation_frame\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mto_csv\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0moutput\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0msep\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m\"|\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 4\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[0mpopulation_data\u001b[0m \u001b[1;33m=\u001b[0m 
\u001b[0moutput\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mgetvalue\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"\\n\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 702 | "\u001b[1;31mNameError\u001b[0m: name 'population_frame' is not defined" 703 | ] 704 | } 705 | ], 706 | "prompt_number": 10 707 | }, 708 | { 709 | "cell_type": "markdown", 710 | "metadata": {}, 711 | "source": [ 712 | "With the data ready for insert, we now create a table." 713 | ] 714 | }, 715 | { 716 | "cell_type": "code", 717 | "collapsed": false, 718 | "input": [ 719 | "%%sql create table population_info (country VARCHAR2(50), year NUMBER, total_pop NUMBER, \n", 720 | " urban_pop NUMBER, rural_pop NUMBER,\n", 721 | " pop_grow NUMBER, primary key(country))" 722 | ], 723 | "language": "python", 724 | "metadata": {}, 725 | "outputs": [ 726 | { 727 | "metadata": {}, 728 | "output_type": "pyout", 729 | "prompt_number": 90, 730 | "text": [ 731 | "[]" 732 | ] 733 | } 734 | ], 735 | "prompt_number": 90 736 | }, 737 | { 738 | "cell_type": "markdown", 739 | "metadata": {}, 740 | "source": [ 741 | "And then insert the data" 742 | ] 743 | }, 744 | { 745 | "cell_type": "code", 746 | "collapsed": false, 747 | "input": [ 748 | " %config SqlMagic.feedback = False" 749 | ], 750 | "language": "python", 751 | "metadata": {}, 752 | "outputs": [] 753 | }, 754 | { 755 | "cell_type": "code", 756 | "collapsed": false, 757 | "input": [ 758 | "for d in population_data:\n", 759 | " try:\n", 760 | " country, year, total_pop, urban_pop, rural_pop, pop_grow = d.split(\"|\")\n", 761 | " %sql insert into population_info (country, year, total_pop, urban_pop, rural_pop, pop_grow) values (:country, :year, :total_pop, :urban_pop, :rural_pop, :pop_grow)\n", 762 | " except:\n", 763 | " pass" 764 | ], 765 | "language": "python", 766 | "metadata": {}, 767 | "outputs": [], 768 | "prompt_number": 91 769 | }, 770 | { 771 | "cell_type": "heading", 772 | "level": 2, 773 | "metadata": {}, 774 | "source": [ 775 | "GDP Data" 776 | ] 777 | }, 778 | { 779 | "cell_type": "markdown", 780 | "metadata": {}, 781 | "source": [ 782 | "We can follow exactly the same procedure to get economic data from the World Bank. Economic data is essential if we want to understand how national or individual wealth impacts flu susceptability." 783 | ] 784 | }, 785 | { 786 | "cell_type": "code", 787 | "collapsed": false, 788 | "input": [ 789 | "per_cap_gdp = wb.download(indicator='NY.GDP.PCAP.CD', country=flu_country_codes['iso2c'], start=2013, end=2013)\n", 790 | "gdp = wb.download(indicator='NY.GDP.MKTP.CD', country=flu_country_codes['iso2c'], start=2013, end=2013)\n", 791 | "gdp_frame = pd.merge(gdp, per_cap_gdp, how=\"left\",left_index=True, right_index=True)\n", 792 | "gdp_frame.columns = [\"GDP\", \"PerCapGDP\"]\n", 793 | "gdp_frame[:5]" 794 | ], 795 | "language": "python", 796 | "metadata": {}, 797 | "outputs": [ 798 | { 799 | "html": [ 800 | "
\n", 801 | "\n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " \n", 819 | " \n", 820 | " \n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | " \n", 826 | " \n", 827 | " \n", 828 | " \n", 829 | " \n", 830 | " \n", 831 | " \n", 832 | " \n", 833 | " \n", 834 | " \n", 835 | " \n", 836 | " \n", 837 | " \n", 838 | " \n", 839 | " \n", 840 | " \n", 841 | " \n", 842 | " \n", 843 | " \n", 844 | " \n", 845 | " \n", 846 | " \n", 847 | " \n", 848 | "
country | year | GDP | PerCapGDP
Aruba | 2013 | NaN | NaN
Andorra | 2013 | NaN | NaN
Afghanistan | 2013 | 2.030967e+10 | 664.764589
Angola | 2013 | 1.241782e+11 | 5783.366760
Albania | 2013 | 1.292324e+10 | 4659.340601
\n", 849 | "
" 850 | ], 851 | "metadata": {}, 852 | "output_type": "pyout", 853 | "prompt_number": 93, 854 | "text": [ 855 | " GDP PerCapGDP\n", 856 | "country year \n", 857 | "Aruba 2013 NaN NaN\n", 858 | "Andorra 2013 NaN NaN\n", 859 | "Afghanistan 2013 2.030967e+10 664.764589\n", 860 | "Angola 2013 1.241782e+11 5783.366760\n", 861 | "Albania 2013 1.292324e+10 4659.340601" 862 | ] 863 | } 864 | ], 865 | "prompt_number": 93 866 | }, 867 | { 868 | "cell_type": "markdown", 869 | "metadata": {}, 870 | "source": [ 871 | "As before, we'll create a table, dump the output to text, and insert it." 872 | ] 873 | }, 874 | { 875 | "cell_type": "code", 876 | "collapsed": false, 877 | "input": [ 878 | "%%sql create table gdp_data (country varchar2(50), year number, \n", 879 | " gdp number, percapgdp number, primary key(country))" 880 | ], 881 | "language": "python", 882 | "metadata": {}, 883 | "outputs": [ 884 | { 885 | "metadata": {}, 886 | "output_type": "pyout", 887 | "prompt_number": 106, 888 | "text": [ 889 | "[]" 890 | ] 891 | } 892 | ], 893 | "prompt_number": 106 894 | }, 895 | { 896 | "cell_type": "code", 897 | "collapsed": false, 898 | "input": [ 899 | "output = StringIO()\n", 900 | "gdp_frame.to_csv(output, sep=\"|\")\n", 901 | "\n", 902 | "gdp_data = output.getvalue().split(\"\\n\")[1:]\n", 903 | "output.close()\n", 904 | "for d in gdp_data:\n", 905 | " try:\n", 906 | " country, year, gdp, percapgdp = d.split(\"|\")\n", 907 | " %sql insert into gdp_data (country, year, gdp, percapgdp) values (:country, :year, :gdp, :percapgdp)\n", 908 | " except:\n", 909 | " pass" 910 | ], 911 | "language": "python", 912 | "metadata": {}, 913 | "outputs": [], 914 | "prompt_number": 107 915 | }, 916 | { 917 | "cell_type": "heading", 918 | "level": 2, 919 | "metadata": {}, 920 | "source": [ 921 | "Sanitation, Water, Cellular and Education" 922 | ] 923 | }, 924 | { 925 | "cell_type": "markdown", 926 | "metadata": {}, 927 | "source": [ 928 | "Lastly, we're going to grab some information about other indicators of development:\n", 929 | "\n", 930 | "* sanitation improvement\n", 931 | "* rural improved access to clean water\n", 932 | "* urban improved access to clean water\n", 933 | "* access to cellular phone service\n", 934 | "* percentage of the population with a primary school education or better\n", 935 | "\n", 936 | "The data here opens up many questions. Does worse sanitation correlate with higher numbers of flu cases? Does better education reduce flu cases?\n", 937 | "\n", 938 | "Just like population and GDP, these indicators are easy to grab." 
939 | ] 940 | }, 941 | { 942 | "cell_type": "code", 943 | "collapsed": false, 944 | "input": [ 945 | "sanitation = wb.download(indicator='SH.STA.ACSN', country=flu_country_codes['iso2c'], start=2012, end=2012)\n", 946 | "safe_rural_water = wb.download(indicator='SH.H2O.SAFE.RU.ZS', country=flu_country_codes['iso2c'], start=2012, end=2012)\n", 947 | "safe_urb_water = wb.download(indicator='SH.H2O.SAFE.UR.ZS', country=flu_country_codes['iso2c'], start=2012, end=2012)\n", 948 | "cellular = wb.download(indicator='IT.CEL.SETS.P2', country=flu_country_codes['iso2c'], start=2012, end=2012)\n", 949 | "primary_school = wb.download(indicator='SE.PRM.CMPT.ZS', country=flu_country_codes['iso2c'], start=2012, end=2012)" 950 | ], 951 | "language": "python", 952 | "metadata": {}, 953 | "outputs": [ 954 | { 955 | "output_type": "stream", 956 | "stream": "stderr", 957 | "text": [ 958 | "/usr/lib64/python2.6/site-packages/pandas/io/wb.py:128: UserWarning: Non-standard ISO country codes: 1A, 1W, 4E, 7E, 8S, A4, A5, A9, B8, C4, C5, C6, C7, C8, C9, EU, F1, JG, KV, L4, L5, L6, L7, M2, OE, S1, S2, S3, S4, XC, XD, XE, XJ, XL, XM, XN, XO, XP, XQ, XR, XS, XT, XU, XY, Z4, Z7, ZF, ZG, ZJ, ZQ\n", 959 | " warnings.warn('Non-standard ISO country codes: %s' % tmp)\n" 960 | ] 961 | } 962 | ], 963 | "prompt_number": 6 964 | }, 965 | { 966 | "cell_type": "code", 967 | "collapsed": false, 968 | "input": [ 969 | "development_frame = pd.merge(sanitation, safe_rural_water, how='left', left_index=True, right_index=True)\n", 970 | "development_frame = pd.merge(development_frame, safe_urb_water, how='left', left_index=True, right_index=True)\n", 971 | "development_frame = pd.merge(development_frame, cellular, how='left', left_index=True, right_index=True)\n", 972 | "development_frame = pd.merge(development_frame, primary_school, how='left', left_index=True, right_index=True)\n", 973 | "development_frame.columns = [\"sanitation\", \"safe_r_h2o\", \"safe_urb_h20\", \"cellular\", \"primary_ed\"]" 974 | ], 975 | "language": "python", 976 | "metadata": {}, 977 | "outputs": [], 978 | "prompt_number": 7 979 | }, 980 | { 981 | "cell_type": "code", 982 | "collapsed": false, 983 | "input": [ 984 | "%%sql create table dev_indc (country varchar2(50), year number, \n", 985 | " sanitation number, rh2o number, uh2o number, cellular number, \n", 986 | " primary_ed number, primary key(country))" 987 | ], 988 | "language": "python", 989 | "metadata": {}, 990 | "outputs": [ 991 | { 992 | "output_type": "stream", 993 | "stream": "stdout", 994 | "text": [ 995 | "Done.\n" 996 | ] 997 | }, 998 | { 999 | "metadata": {}, 1000 | "output_type": "pyout", 1001 | "prompt_number": 8, 1002 | "text": [ 1003 | "[]" 1004 | ] 1005 | } 1006 | ], 1007 | "prompt_number": 8 1008 | }, 1009 | { 1010 | "cell_type": "code", 1011 | "collapsed": false, 1012 | "input": [ 1013 | "output = StringIO()\n", 1014 | "development_frame.to_csv(output, sep=\"|\")\n", 1015 | "\n", 1016 | "dev_data = output.getvalue().split(\"\\n\")[1:]\n", 1017 | "output.close()\n", 1018 | "for d in dev_data:\n", 1019 | " try:\n", 1020 | " country, year, san, rh2o, uh2o, cell, prim = d.split(\"|\")\n", 1021 | " %sql insert into dev_indc (country, year, sanitation, rh2o, uh2o, cellular, primary_ed) values (:country, :year, :san, :rh2o, :uh2o, :cell, :prim)\n", 1022 | " except:\n", 1023 | " pass" 1024 | ], 1025 | "language": "python", 1026 | "metadata": {}, 1027 | "outputs": [ 1028 | { 1029 | "output_type": "stream", 1030 | "stream": "stdout", 1031 | "text": [ 1032 | "1 rows affected.\n", 1033 | "1 rows 
affected.\n", 1034 | "1 rows affected.\n", 1035 | "1 rows affected.\n", 1036 | "1 rows affected.\n", 1037 | "1 rows affected.\n", 1038 | "1 rows affected.\n", 1039 | "1 rows affected.\n", 1040 | "1 rows affected.\n", 1041 | "1 rows affected." 1042 | ] 1043 | }, 1044 | { 1045 | "output_type": "stream", 1046 | "stream": "stdout", 1047 | "text": [ 1048 | "\n", 1049 | "1 rows affected.\n", 1050 | "1 rows affected.\n", 1051 | "1 rows affected.\n", 1052 | "1 rows affected.\n", 1053 | "1 rows affected.\n", 1054 | "1 rows affected.\n", 1055 | "1 rows affected.\n", 1056 | "1 rows affected." 1057 | ] 1058 | }, 1059 | { 1060 | "output_type": "stream", 1061 | "stream": "stdout", 1062 | "text": [ 1063 | "\n", 1064 | "1 rows affected.\n", 1065 | "1 rows affected.\n", 1066 | "1 rows affected.\n", 1067 | "1 rows affected.\n", 1068 | "1 rows affected.\n", 1069 | "1 rows affected.\n", 1070 | "1 rows affected.\n", 1071 | "1 rows affected.\n", 1072 | "1 rows affected." 1073 | ] 1074 | }, 1075 | { 1076 | "output_type": "stream", 1077 | "stream": "stdout", 1078 | "text": [ 1079 | "\n", 1080 | "1 rows affected.\n", 1081 | "1 rows affected.\n", 1082 | "1 rows affected.\n", 1083 | "1 rows affected.\n", 1084 | "1 rows affected.\n", 1085 | "1 rows affected.\n", 1086 | "1 rows affected." 1087 | ] 1088 | }, 1089 | { 1090 | "output_type": "stream", 1091 | "stream": "stdout", 1092 | "text": [ 1093 | "\n", 1094 | "1 rows affected.\n", 1095 | "1 rows affected.\n", 1096 | "1 rows affected.\n", 1097 | "1 rows affected.\n", 1098 | "1 rows affected.\n", 1099 | "1 rows affected.\n", 1100 | "1 rows affected.\n", 1101 | "1 rows affected.\n", 1102 | "1 rows affected." 1103 | ] 1104 | }, 1105 | { 1106 | "output_type": "stream", 1107 | "stream": "stdout", 1108 | "text": [ 1109 | "\n", 1110 | "1 rows affected.\n", 1111 | "1 rows affected.\n", 1112 | "1 rows affected.\n", 1113 | "1 rows affected.\n", 1114 | "1 rows affected.\n", 1115 | "1 rows affected.\n", 1116 | "1 rows affected.\n", 1117 | "1 rows affected." 1118 | ] 1119 | }, 1120 | { 1121 | "output_type": "stream", 1122 | "stream": "stdout", 1123 | "text": [ 1124 | "\n", 1125 | "1 rows affected.\n", 1126 | "1 rows affected.\n", 1127 | "1 rows affected.\n", 1128 | "1 rows affected.\n", 1129 | "1 rows affected.\n", 1130 | "1 rows affected.\n", 1131 | "1 rows affected.\n", 1132 | "1 rows affected." 1133 | ] 1134 | }, 1135 | { 1136 | "output_type": "stream", 1137 | "stream": "stdout", 1138 | "text": [ 1139 | "\n", 1140 | "1 rows affected.\n", 1141 | "1 rows affected.\n", 1142 | "1 rows affected.\n", 1143 | "1 rows affected.\n", 1144 | "1 rows affected.\n", 1145 | "1 rows affected.\n", 1146 | "1 rows affected.\n", 1147 | "1 rows affected." 1148 | ] 1149 | }, 1150 | { 1151 | "output_type": "stream", 1152 | "stream": "stdout", 1153 | "text": [ 1154 | "\n", 1155 | "1 rows affected.\n", 1156 | "1 rows affected.\n", 1157 | "1 rows affected.\n", 1158 | "1 rows affected.\n", 1159 | "1 rows affected.\n", 1160 | "1 rows affected.\n", 1161 | "1 rows affected.\n", 1162 | "1 rows affected." 1163 | ] 1164 | }, 1165 | { 1166 | "output_type": "stream", 1167 | "stream": "stdout", 1168 | "text": [ 1169 | "\n", 1170 | "1 rows affected.\n", 1171 | "1 rows affected.\n", 1172 | "1 rows affected.\n", 1173 | "1 rows affected.\n", 1174 | "1 rows affected.\n", 1175 | "1 rows affected.\n", 1176 | "1 rows affected.\n", 1177 | "1 rows affected." 
1178 | ] 1179 | }, 1180 | { 1181 | "output_type": "stream", 1182 | "stream": "stdout", 1183 | "text": [ 1184 | "\n", 1185 | "1 rows affected.\n", 1186 | "1 rows affected.\n", 1187 | "1 rows affected.\n", 1188 | "1 rows affected.\n", 1189 | "1 rows affected.\n", 1190 | "1 rows affected.\n", 1191 | "1 rows affected.\n", 1192 | "1 rows affected.\n", 1193 | "1 rows affected." 1194 | ] 1195 | }, 1196 | { 1197 | "output_type": "stream", 1198 | "stream": "stdout", 1199 | "text": [ 1200 | "\n", 1201 | "1 rows affected.\n", 1202 | "1 rows affected.\n", 1203 | "1 rows affected.\n", 1204 | "1 rows affected.\n", 1205 | "1 rows affected.\n", 1206 | "1 rows affected.\n", 1207 | "1 rows affected.\n", 1208 | "1 rows affected.\n", 1209 | "1 rows affected." 1210 | ] 1211 | }, 1212 | { 1213 | "output_type": "stream", 1214 | "stream": "stdout", 1215 | "text": [ 1216 | "\n", 1217 | "1 rows affected.\n", 1218 | "1 rows affected.\n", 1219 | "1 rows affected.\n", 1220 | "1 rows affected.\n", 1221 | "1 rows affected.\n", 1222 | "1 rows affected.\n", 1223 | "1 rows affected.\n", 1224 | "1 rows affected.\n", 1225 | "1 rows affected." 1226 | ] 1227 | }, 1228 | { 1229 | "output_type": "stream", 1230 | "stream": "stdout", 1231 | "text": [ 1232 | "\n", 1233 | "1 rows affected.\n", 1234 | "1 rows affected.\n", 1235 | "1 rows affected.\n", 1236 | "1 rows affected.\n", 1237 | "1 rows affected.\n", 1238 | "1 rows affected.\n", 1239 | "1 rows affected.\n", 1240 | "1 rows affected." 1241 | ] 1242 | }, 1243 | { 1244 | "output_type": "stream", 1245 | "stream": "stdout", 1246 | "text": [ 1247 | "\n", 1248 | "1 rows affected.\n", 1249 | "1 rows affected.\n", 1250 | "1 rows affected.\n", 1251 | "1 rows affected.\n", 1252 | "1 rows affected.\n", 1253 | "1 rows affected.\n", 1254 | "1 rows affected.\n", 1255 | "1 rows affected.\n", 1256 | "1 rows affected." 1257 | ] 1258 | }, 1259 | { 1260 | "output_type": "stream", 1261 | "stream": "stdout", 1262 | "text": [ 1263 | "\n", 1264 | "1 rows affected.\n", 1265 | "1 rows affected.\n", 1266 | "1 rows affected.\n", 1267 | "1 rows affected.\n", 1268 | "1 rows affected.\n", 1269 | "1 rows affected.\n", 1270 | "1 rows affected.\n", 1271 | "1 rows affected.\n", 1272 | "1 rows affected." 1273 | ] 1274 | }, 1275 | { 1276 | "output_type": "stream", 1277 | "stream": "stdout", 1278 | "text": [ 1279 | "\n", 1280 | "1 rows affected.\n", 1281 | "1 rows affected.\n", 1282 | "1 rows affected.\n", 1283 | "1 rows affected.\n", 1284 | "1 rows affected.\n", 1285 | "1 rows affected.\n", 1286 | "1 rows affected." 1287 | ] 1288 | }, 1289 | { 1290 | "output_type": "stream", 1291 | "stream": "stdout", 1292 | "text": [ 1293 | "\n", 1294 | "1 rows affected.\n", 1295 | "1 rows affected.\n", 1296 | "1 rows affected.\n", 1297 | "1 rows affected.\n", 1298 | "1 rows affected.\n", 1299 | "1 rows affected.\n", 1300 | "1 rows affected.\n", 1301 | "1 rows affected.\n", 1302 | "1 rows affected." 1303 | ] 1304 | }, 1305 | { 1306 | "output_type": "stream", 1307 | "stream": "stdout", 1308 | "text": [ 1309 | "\n", 1310 | "1 rows affected.\n", 1311 | "1 rows affected.\n", 1312 | "1 rows affected.\n", 1313 | "1 rows affected.\n", 1314 | "1 rows affected.\n", 1315 | "1 rows affected.\n", 1316 | "1 rows affected." 1317 | ] 1318 | }, 1319 | { 1320 | "output_type": "stream", 1321 | "stream": "stdout", 1322 | "text": [ 1323 | "\n", 1324 | "1 rows affected.\n", 1325 | "1 rows affected.\n", 1326 | "1 rows affected.\n", 1327 | "1 rows affected.\n", 1328 | "1 rows affected.\n", 1329 | "1 rows affected.\n", 1330 | "1 rows affected." 
1331 | ] 1332 | }, 1333 | { 1334 | "output_type": "stream", 1335 | "stream": "stdout", 1336 | "text": [ 1337 | "\n", 1338 | "1 rows affected.\n", 1339 | "1 rows affected.\n", 1340 | "1 rows affected.\n", 1341 | "1 rows affected.\n", 1342 | "1 rows affected.\n", 1343 | "1 rows affected.\n", 1344 | "1 rows affected.\n", 1345 | "1 rows affected." 1346 | ] 1347 | }, 1348 | { 1349 | "output_type": "stream", 1350 | "stream": "stdout", 1351 | "text": [ 1352 | "\n", 1353 | "1 rows affected.\n", 1354 | "1 rows affected.\n", 1355 | "1 rows affected.\n", 1356 | "1 rows affected.\n", 1357 | "1 rows affected.\n", 1358 | "1 rows affected.\n", 1359 | "1 rows affected.\n", 1360 | "1 rows affected." 1361 | ] 1362 | }, 1363 | { 1364 | "output_type": "stream", 1365 | "stream": "stdout", 1366 | "text": [ 1367 | "\n", 1368 | "1 rows affected.\n", 1369 | "1 rows affected.\n", 1370 | "1 rows affected.\n", 1371 | "1 rows affected.\n", 1372 | "1 rows affected.\n", 1373 | "1 rows affected.\n", 1374 | "1 rows affected." 1375 | ] 1376 | }, 1377 | { 1378 | "output_type": "stream", 1379 | "stream": "stdout", 1380 | "text": [ 1381 | "\n", 1382 | "1 rows affected.\n", 1383 | "1 rows affected.\n", 1384 | "1 rows affected.\n", 1385 | "1 rows affected.\n", 1386 | "1 rows affected.\n", 1387 | "1 rows affected.\n", 1388 | "1 rows affected.\n", 1389 | "1 rows affected." 1390 | ] 1391 | }, 1392 | { 1393 | "output_type": "stream", 1394 | "stream": "stdout", 1395 | "text": [ 1396 | "\n", 1397 | "1 rows affected.\n", 1398 | "1 rows affected.\n", 1399 | "1 rows affected.\n", 1400 | "1 rows affected.\n", 1401 | "1 rows affected.\n", 1402 | "1 rows affected.\n", 1403 | "1 rows affected." 1404 | ] 1405 | }, 1406 | { 1407 | "output_type": "stream", 1408 | "stream": "stdout", 1409 | "text": [ 1410 | "\n", 1411 | "1 rows affected.\n", 1412 | "1 rows affected.\n", 1413 | "1 rows affected.\n", 1414 | "1 rows affected.\n", 1415 | "1 rows affected.\n", 1416 | "1 rows affected.\n", 1417 | "1 rows affected." 1418 | ] 1419 | }, 1420 | { 1421 | "output_type": "stream", 1422 | "stream": "stdout", 1423 | "text": [ 1424 | "\n", 1425 | "1 rows affected.\n", 1426 | "1 rows affected.\n", 1427 | "1 rows affected.\n", 1428 | "1 rows affected.\n", 1429 | "1 rows affected.\n", 1430 | "1 rows affected.\n", 1431 | "1 rows affected.\n", 1432 | "1 rows affected." 1433 | ] 1434 | }, 1435 | { 1436 | "output_type": "stream", 1437 | "stream": "stdout", 1438 | "text": [ 1439 | "\n", 1440 | "1 rows affected.\n", 1441 | "1 rows affected.\n", 1442 | "1 rows affected.\n", 1443 | "1 rows affected.\n", 1444 | "1 rows affected.\n", 1445 | "1 rows affected.\n", 1446 | "1 rows affected.\n", 1447 | "1 rows affected." 1448 | ] 1449 | }, 1450 | { 1451 | "output_type": "stream", 1452 | "stream": "stdout", 1453 | "text": [ 1454 | "\n", 1455 | "1 rows affected.\n", 1456 | "1 rows affected.\n", 1457 | "1 rows affected.\n", 1458 | "1 rows affected.\n", 1459 | "1 rows affected.\n", 1460 | "1 rows affected.\n", 1461 | "1 rows affected.\n", 1462 | "1 rows affected." 1463 | ] 1464 | }, 1465 | { 1466 | "output_type": "stream", 1467 | "stream": "stdout", 1468 | "text": [ 1469 | "\n", 1470 | "1 rows affected.\n", 1471 | "1 rows affected.\n", 1472 | "1 rows affected.\n", 1473 | "1 rows affected.\n", 1474 | "1 rows affected.\n", 1475 | "1 rows affected.\n", 1476 | "1 rows affected.\n", 1477 | "1 rows affected." 
1478 | ] 1479 | }, 1480 | { 1481 | "output_type": "stream", 1482 | "stream": "stdout", 1483 | "text": [ 1484 | "\n", 1485 | "1 rows affected.\n", 1486 | "1 rows affected.\n", 1487 | "1 rows affected.\n", 1488 | "1 rows affected.\n" 1489 | ] 1490 | } 1491 | ], 1492 | "prompt_number": 9 1493 | }, 1494 | { 1495 | "cell_type": "heading", 1496 | "level": 2, 1497 | "metadata": {}, 1498 | "source": [ 1499 | "Summary" 1500 | ] 1501 | }, 1502 | { 1503 | "cell_type": "markdown", 1504 | "metadata": {}, 1505 | "source": [ 1506 | "In this short exercise, we learned how to use Pandas built-in capabilities to quickly grab data from the web in an analyzable format. We now have data on:\n", 1507 | "\n", 1508 | "* World population, population growth, and rural/urban splits\n", 1509 | "* GDP and Per-capita GDP\n", 1510 | "* Sanitation, clean water, cellular and education data\n", 1511 | "\n", 1512 | "These data sets should help us to ask a number of questions about what influences flu rates around the world. " 1513 | ] 1514 | }, 1515 | { 1516 | "cell_type": "code", 1517 | "collapsed": false, 1518 | "input": [], 1519 | "language": "python", 1520 | "metadata": {}, 1521 | "outputs": [] 1522 | } 1523 | ], 1524 | "metadata": {} 1525 | } 1526 | ] 1527 | } -------------------------------------------------------------------------------- /notebooks/15 Clustering the News with Spark and MLlib.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "", 4 | "signature": "sha256:033999fe4f6eaa5796cb83f44c3efb5d8d4007d75dd6e35b6e33f7b8cec3ef37" 5 | }, 6 | "nbformat": 3, 7 | "nbformat_minor": 0, 8 | "worksheets": [ 9 | { 10 | "cells": [ 11 | { 12 | "cell_type": "heading", 13 | "level": 1, 14 | "metadata": {}, 15 | "source": [ 16 | "Clustering the News with Spark and MLLib" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "We've previously looked at using Spark for both the analysis of text and some machine learning tasks via the PySpark interface. Through this, we've learned about what words are important over time, and what articles are about. However, what if we wanted to understand what sort of categories the news breaks into? This might mean that we'd have to use both our text processing skills and some machine learning.\n", 24 | "\n", 25 | "In this lesson, we'll do just that: we'll use a simple unsupervised machine learning method, k-means clustering, to determine what broad categories the news fits into. To do this, we'll use Spark and it's MLLib libraries via the Scala programming language. This means the following notebook is **not** interactive. All of the commands can be copied into Spark's interactive Scala shell (launch it by typing `spark-shell`) or by building a standalone application.\n", 26 | "\n", 27 | "We'll discuss building and running a standalone app at the end of the lesson." 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "We'll begin with our imports. We'll need a few things: the MLLib classes that we require and the json4s package for parsing JSON in Scala." 
35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "collapsed": false, 40 | "input": [ 41 | "import org.json4s._\n", 42 | "import org.json4s.jackson.Serialization.{read,write}\n", 43 | "import org.apache.spark.rdd.RDD\n", 44 | "import org.apache.spark.mllib.clustering.KMeans\n", 45 | "import org.apache.spark.mllib.feature.Word2Vec\n", 46 | "import org.apache.spark.mllib.feature.Word2VecModel\n", 47 | "import org.apache.spark.mllib.linalg._" 48 | ], 49 | "language": "python", 50 | "metadata": {}, 51 | "outputs": [] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": {}, 56 | "source": [ 57 | "Before we parse our JSON, we're going to want a class to put it in. Rather than treating it like a Python dictionary, we're going to use a Scala case class. This lets us get a full Scala class with just a single-line declaration." 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "collapsed": false, 63 | "input": [ 64 | "case class NewsArticle(date : String, title : String, byline : String, fulltext : String)" 65 | ], 66 | "language": "python", 67 | "metadata": {}, 68 | "outputs": [] 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "metadata": {}, 73 | "source": [ 74 | "We need a number of helper functions for our lesson, so we'll define them here. Don't worry about what they do yet; we'll cover them in a moment." 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "collapsed": false, 80 | "input": [ 81 | "def sumArray (m: Array[Double], n: Array[Double]): Array[Double] = {\n", 82 | " for (i <- 0 until m.length) {m(i) += n(i)}\n", 83 | " return m\n", 84 | "}\n", 85 | "\n", 86 | "def divArray (m: Array[Double], divisor: Double) : Array[Double] = {\n", 87 | " for (i <- 0 until m.length) {m(i) /= divisor}\n", 88 | " return m\n", 89 | "}\n", 90 | "\n", 91 | "def wordToVector (w:String, m: Word2VecModel): Vector = {\n", 92 | " try {\n", 93 | " return m.transform(w)\n", 94 | " } catch {\n", 95 | " case e: Exception => return Vectors.zeros(100)\n", 96 | " } \n", 97 | "}" 98 | ], 99 | "language": "python", 100 | "metadata": {}, 101 | "outputs": [] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "metadata": {}, 106 | "source": [ 107 | "Now we're ready to get started analyzing data. Let's load up our news data using, as before, `sc.textFile`." 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "collapsed": false, 113 | "input": [ 114 | "val news_rdd = sc.textFile(\"hdfs://localhost:8020/user/oracle/flu_news\")" 115 | ], 116 | "language": "python", 117 | "metadata": {}, 118 | "outputs": [] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "metadata": {}, 123 | "source": [ 124 | "We need to parse the JSON data into objects, so, as with our PySpark work, we'll use the `map` function. However, we're using json4s' mechanisms. This means we'll use the `read` operation and provide it a *type* of `NewsArticle`. Unlike Python, Scala is a strongly-typed language. If the distinction is new to you, try to read up a bit on either Scala basics or on the importance of type to programming languages."
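(A quick aside for readers new to json4s: the `read[NewsArticle]` call in the next cell is easier to follow if you first try it on a single string. Below is a minimal, self-contained sketch you can paste into `spark-shell`; the sample record is made up for illustration and only mirrors the shape of the wikinews JSON.)

import org.json4s._
import org.json4s.jackson.Serialization.read

case class NewsArticle(date : String, title : String, byline : String, fulltext : String)

// json4s needs an implicit Formats value in scope to drive deserialization
implicit val formats = DefaultFormats

// A made-up record with the same fields as our NewsArticle case class
val sample = """{"date":"2014-01-15","title":"Flu season arrives early","byline":"Example Reporter","fulltext":"..."}"""

// read[T] parses the JSON string and returns a populated NewsArticle
val article = read[NewsArticle](sample)
println(article.title)   // prints: Flu season arrives early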
125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "collapsed": false, 130 | "input": [ 131 | "val news_json = news_rdd.map(record => {\n", 132 | " implicit val formats = DefaultFormats\n", 133 | " read[NewsArticle](record)\n", 134 | "})" 135 | ], 136 | "language": "python", 137 | "metadata": {}, 138 | "outputs": [] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": {}, 143 | "source": [ 144 | "We're planning to use [k-means](http://en.wikipedia.org/wiki/K-means_clustering) clustering to determine automatically which news articles belong to which clusters. However, we have to deal with a bit of an inconsistency first. K-means operates on numeric vectors (i.e., points in space), but we have words, not vectors in our articles. One way to treat this would be to compute TF-IDF for each article and treat that as a point in space. Each word would be a dimension, and each each TF-IDF score would be the value in that dimension.\n", 145 | "\n", 146 | "Ask yourself, how big would that vector be? Would it only include the dimensions for the words in the article? For the words in all articles? Maybe the vectors need to be as big as all the words in the English language!\n", 147 | "\n", 148 | "In fact, MLLib has a built-in TF-IDF transform which produces, by default, vectors that are 2^20 long. That's way too big to deal with in our virtual machine.\n", 149 | "\n", 150 | "What would happen if we did the following\n", 151 | "\n", 152 | "+ Made each article's vector include only the dimensions of the words in each title?\n", 153 | "+ Made each article's vector include the dimensions of only the words in all titles?\n", 154 | "+ Made each article's vector a dimensional reduction of all the words in all titles?\n", 155 | "\n", 156 | "For simplicity, we're only going to deal with the titles, as opposed to all the words in the articles. We're also not going to use TF-IDF, for reasons that will become apparent if you answer the questions above. Instead, we're going to rely on a method called [Word2Vec](https://code.google.com/p/word2vec/). Originated at Google, word2vec does a remarkably good job of transforming single words into reasonably-sized vectors. When generated from a large corpus, these vectors allow us to find synonyms with surprising accuracy.\n", 157 | "\n", 158 | "So, the first thing we'll need is a corpus of words. Let's make one from our titles." 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "collapsed": false, 164 | "input": [ 165 | "val news_titles = news_json.map(_.title.split(\" \").toSeq)\n", 166 | "val news_title_words = news_titles.flatMap(x => x).map(x => Seq(x))" 167 | ], 168 | "language": "python", 169 | "metadata": {}, 170 | "outputs": [] 171 | }, 172 | { 173 | "cell_type": "markdown", 174 | "metadata": {}, 175 | "source": [ 176 | "In order to find better synonyms, we should add more words to our corpus. Let's do just that by grabbing a sample from the `linewise_text_8` file included in `flu_news/data`." 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "collapsed": false, 182 | "input": [ 183 | "val w2v_input = sc.textFile(\"file:///home/oracle/odsb2014/flu_news/data/linewise_text_8\").sample(false, 0.25,2).map(x => Seq(x))\n", 184 | "val all_input = w2v_input ++ news_title_words" 185 | ], 186 | "language": "python", 187 | "metadata": {}, 188 | "outputs": [] 189 | }, 190 | { 191 | "cell_type": "markdown", 192 | "metadata": {}, 193 | "source": [ 194 | "Now we're ready to build a word2vec model from our corpus. 
Constructing this model using Spark is easy!" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "collapsed": false, 200 | "input": [ 201 | "val word2vec = new Word2Vec()\n", 202 | "val model = word2vec.fit(all_input)" 203 | ], 204 | "language": "python", 205 | "metadata": {}, 206 | "outputs": [] 207 | }, 208 | { 209 | "cell_type": "markdown", 210 | "metadata": {}, 211 | "source": [ 212 | "Now we've got a model which can compute synonyms, but we have another problem. Titles have many words, and word2vec only operates on one of two things: words or vectors. How can we find the synonyms for a whole title?!\n", 213 | "\n", 214 | "One of the interesting features of word2vec is that it displays reasonably good synonym prediction when the vectors for words are added together or subtracted. That is, `v(king) - v(man) ~= v(queen)`. Thus, we could rationalize that a title is just the average vector of all the words in the title. Let's give that a try.\n", 215 | "\n", 216 | "For this, we'll need to use a couple of our helper functions. \n", 217 | "\n", 218 | "+ Inside our Spark RDD's `map` operation, we're going to call Scala's `map` to apply the word2vec model to each word. \n", 219 | "+ That gives us a Sequence of Arrays for each title, which we need to\n", 220 | " * Sum up\n", 221 | " * Divide by the total number of words in the title\n", 222 | "+ The summing can be handled by using the `reduceLeft` Scala operator. Look at the helper function and see if you can determine what is happening.\n", 223 | "+ The dividing is taken care of by the `divArray` helper function.\n", 224 | "\n", 225 | "Once this is done, we have RDDs which contain the average vector for each title. We're ready to cluster!" 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "collapsed": false, 231 | "input": [ 232 | "val title_vectors = news_titles.map(x => new DenseVector(divArray(x.map(m => wordToVector(m, model).toArray).reduceLeft(sumArray),x.length)).asInstanceOf[Vector])\n", 233 | "\n", 234 | "val title_pairs = news_titles.map(x => (x,new DenseVector(divArray(x.map(m => wordToVector(m, model).toArray).reduceLeft(sumArray),x.length)).asInstanceOf[Vector]))" 235 | ], 236 | "language": "python", 237 | "metadata": {}, 238 | "outputs": [] 239 | }, 240 | { 241 | "cell_type": "markdown", 242 | "metadata": {}, 243 | "source": [ 244 | "As with word2vec, Spark's MLLib makes k-means clustering easy. All we need to do is specify the number of clusters and iterations." 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "collapsed": false, 250 | "input": [ 251 | "var numClusters = 100\n", 252 | "val numIterations = 25\n", 253 | "var clusters = KMeans.train(title_vectors, numClusters, numIterations)\n", 254 | "var wssse = clusters.computeCost(title_vectors)" 255 | ], 256 | "language": "python", 257 | "metadata": {}, 258 | "outputs": [] 259 | }, 260 | { 261 | "cell_type": "markdown", 262 | "metadata": {}, 263 | "source": [ 264 | "With our cluster model complete, we can assign article titles to clusters. We can also create RDDs for each of the cluster centers and produce words for their vectors (i.e., make titles for these purely numerical cluster centers)."
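(One aside before we assign titles to clusters: the cell above fixes `numClusters` at 100 and computes `wssse`, the within-set sum of squared errors, without doing anything further with it. A common heuristic for choosing the number of clusters is to train models for several values of k and look for the "elbow" where the error stops dropping quickly. The rough sketch below reuses `title_vectors` and `numIterations` from the cells above; the candidate values of k are arbitrary, and on the VM you may want to try fewer of them, since each value means another full k-means run.)

// Train one model per candidate k and print its within-set sum of squared errors.
// Smaller is better, but past the "elbow" additional clusters buy very little.
val candidateKs = Seq(10, 25, 50, 100, 200)
val costs = candidateKs.map { k =>
  val m = KMeans.train(title_vectors, k, numIterations)
  (k, m.computeCost(title_vectors))
}
costs.foreach { case (k, cost) => println(s"k = $k  WSSSE = $cost") }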
265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "collapsed": false, 270 | "input": [ 271 | "val article_membership = title_pairs.mapValues(x => clusters.predict(x))\n", 272 | "val cluster_centers = sc.parallelize(clusters.clusterCenters.zipWithIndex.map{ e => (e._2,e._1)})\n", 273 | "val cluster_topics = cluster_centers.mapValues(x => model.findSynonyms(x,5).map(x => x(0)))" 274 | ], 275 | "language": "python", 276 | "metadata": {}, 277 | "outputs": [] 278 | }, 279 | { 280 | "cell_type": "markdown", 281 | "metadata": {}, 282 | "source": [ 283 | "Taking a look at the cluster membership, we can see not everything is a perfect match. But on the whole more articles make sense in the cluster than do not. It seems we've done a reasonable job classifying the types of stories in the news." 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "collapsed": false, 289 | "input": [ 290 | "var sample_topic = cluster_topics.take(10)(6)\n", 291 | "println(sample_topic._2.mkString(\",\"))\n", 292 | "\n", 293 | "var sample_members = article_membership.filter(x => x._2 == 6).take(100)\n", 294 | "sample_members.foreach{x => println(x._1.mkString(\",\"))}" 295 | ], 296 | "language": "python", 297 | "metadata": {}, 298 | "outputs": [] 299 | }, 300 | { 301 | "cell_type": "markdown", 302 | "metadata": {}, 303 | "source": [ 304 | "Let's consider some of the output from a sample run of the method. We'll start by looking at the cluster categories that have small membership. In this example, cluster 15 was small, and had word2vec synonyms of \"rugby soccer baseball basketball hockey.\" Let's look at what was in its cluster:\n", 305 | "\n", 306 | "###rugby soccer baseball basketball hockey\n", 307 | "\n", 308 | "* Thunderbird wins MBA rugby tournament\n", 309 | "* Florida wins NCAA basketball championship\n", 310 | "* NHL rival teams fight during hockey game\n", 311 | "* Ice hockey\n", 312 | "* American football\n", 313 | "* Scottish football team Hibernian appoint new manager\n", 314 | "* Wheelchair basketball\n", 315 | "* Australia men's national wheelchair basketball team\n", 316 | "* Australia women's national wheelchair basketball team\n", 317 | "* Australian women's national wheelchair basketball team\n", 318 | "* Wheelchair rugby\n", 319 | "* Women's sports\n", 320 | "* Australian rules football\n", 321 | "* Australian football\n", 322 | "* Association football\n", 323 | "* Scotland national football team\n", 324 | "* Scottish national football team\n", 325 | "* Women's association football\n", 326 | "* Sledge hockey\n", 327 | "* Field hockey\n", 328 | "* Wheelchair curling\n", 329 | "* Japan women's national wheelchair basketball team\n", 330 | "* Germany women's national wheelchair basketball team\n", 331 | "* China women's national wheelchair basketball team\n", 332 | "* Canada women's national wheelchair basketball team\n", 333 | "* Rugby league" 334 | ] 335 | }, 336 | { 337 | "cell_type": "markdown", 338 | "metadata": {}, 339 | "source": [ 340 | "That's a very small, specific cluster. But what about larger clusters? Some of these are good fits, but others are not. 
Let's take a look at a sample of a cluster that appears to be about the Internet or Nations.\n", 341 | "\n", 342 | "### Internet Google Nations Manchester Africa\n", 343 | "* Colombia releases official notice in response to Venezuela\n", 344 | "* Spanish government to hold ISPs responsible for web content\n", 345 | "* Spanish government to enforce ISP's to censor web content\n", 346 | "* FBI places limitation on public viewing of files\n", 347 | "* Google releases test of mapping service\n", 348 | "* Google offers to help Wikipedia\n", 349 | "* ABC to move Internet news network back to U.S. TVs\n", 350 | "* ABC to move successful Internet news network to U.S. TVs\n", 351 | "* Separatists fail to stop re-opening of Kashmir bus service\n", 352 | "* German Wikipedia DVD on P2P networks\n", 353 | "* IBM and National Geographic to launch DNA database project" 354 | ] 355 | }, 356 | { 357 | "cell_type": "markdown", 358 | "metadata": {}, 359 | "source": [ 360 | "What about the flu in the news? One category we came up with was \"Security UK - H5N1 High,\" which suggests it has something to do with the flu. Let's look at some of the articles that belong to that cluster:\n", 361 | "\n", 362 | "* Premature aging disease reversed in cells\n", 363 | "* Robot Zoe finds life in Atacama Desert\n", 364 | "* Fire in Tema\n", 365 | "* Cure for cat allergies may be close\n", 366 | "* Talk-therapy can make a difference in early treatment of severe depression\n", 367 | "* Drug-resistant infections on the rise\n", 368 | "* Deadly virus samples missing in Mexico/Lebanon\n", 369 | "* Australian blitz on fish poaching\n", 370 | "* Partnership for a Drug-Free America study finds 1 in 5 teens abused prescription drugs\n", 371 | "* U.S. EPA submits 2003 greenhouse gas inventory to U.N.\n", 372 | "* United States begins testing equipment for demolition of a major VX nerve gas stockpile\n", 373 | "* Nuclear fuel leaks at Sellafield facility on Cumbrian coast\n", 374 | "* Red and processed meats linked to bowel cancer\n", 375 | "* No H5N1 virus found in blood tests of suspected human Bird Flu cluster\n", 376 | "* Swan in German zoo tests positive for H5N1 virus\n", 377 | "* Swan in German zoo tests positive for H5N1virus\n", 378 | "* American cyclist Floyd Landis tested positive for excessive levels of testosterone in second test\n", 379 | "* Vaccine targets obesity in rats\n", 380 | "* Suspected low pathogenic H5N1 Bird Flu virus found in the United States\n", 381 | "* Possible low pathogenic H5N1 Bird Flu virus discovered in the United States\n", 382 | "\n", 383 | "Many of the entries in the cluster have to do with national security, but many, like those above, deal with health and disease. Given that word2vec finds synonyms, it's possible that H5N1 maps very closely with other words about disease." 384 | ] 385 | }, 386 | { 387 | "cell_type": "heading", 388 | "level": 2, 389 | "metadata": {}, 390 | "source": [ 391 | "Building a Stand-Alone Application" 392 | ] 393 | }, 394 | { 395 | "cell_type": "markdown", 396 | "metadata": {}, 397 | "source": [ 398 | "To really get the most out of this, we need to build a stand-alone Spark application. To do this, we'll need to do a few things. We've provided the framework for the standalone application (and the code) in the `flu_news/news_clustering` directory.\n", 399 | "\n", 400 | "First, we'll need to set up a directory structure for the project.
You can see the directory structure here:" 401 | ] 402 | }, 403 | { 404 | "cell_type": "code", 405 | "collapsed": false, 406 | "input": [ 407 | "!ls -R ../flu_news/news_clustering/*" 408 | ], 409 | "language": "python", 410 | "metadata": {}, 411 | "outputs": [ 412 | { 413 | "output_type": "stream", 414 | "stream": "stdout", 415 | "text": [ 416 | "../flu_news/news_clustering/build.sbt\r\n", 417 | "\r\n", 418 | "../flu_news/news_clustering/project:\r\n", 419 | "build.properties\r\n", 420 | "\r\n", 421 | "../flu_news/news_clustering/src:\r\n", 422 | "\u001b[34mmain\u001b[m\u001b[m\r\n", 423 | "\r\n", 424 | "../flu_news/news_clustering/src/main:\r\n", 425 | "\u001b[34mscala\u001b[m\u001b[m\r\n", 426 | "\r\n", 427 | "../flu_news/news_clustering/src/main/scala:\r\n", 428 | "\u001b[34mcom\u001b[m\u001b[m\r\n", 429 | "\r\n", 430 | "../flu_news/news_clustering/src/main/scala/com:\r\n", 431 | "\u001b[34moracle\u001b[m\u001b[m\r\n", 432 | "\r\n", 433 | "../flu_news/news_clustering/src/main/scala/com/oracle:\r\n", 434 | "\u001b[34mnewscluster\u001b[m\u001b[m\r\n", 435 | "\r\n", 436 | "../flu_news/news_clustering/src/main/scala/com/oracle/newscluster:\r\n", 437 | "NewsClustering.scala\r\n" 438 | ] 439 | } 440 | ], 441 | "prompt_number": 1 442 | }, 443 | { 444 | "cell_type": "markdown", 445 | "metadata": {}, 446 | "source": [ 447 | "The application consists of three main parts:\n", 448 | "\n", 449 | "* The build.sbt file\n", 450 | "* The build.properties file\n", 451 | "* The source code (`NewsClustering.scala`)\n", 452 | "\n", 453 | "The `build.sbt` contains the library dependencies and build instructions for our application. The `build.properties` specifies the version of the `sbt` program we're using. Of course, all of the hard work is in `NewsClustering.scala`\n", 454 | "\n", 455 | "Once these pieces are setup, we simply need to change to the `news_clustering` directory, and build a JAR which contains our application. We build and package with the following command:" 456 | ] 457 | }, 458 | { 459 | "cell_type": "code", 460 | "collapsed": false, 461 | "input": [ 462 | "sbt package" 463 | ], 464 | "language": "python", 465 | "metadata": {}, 466 | "outputs": [] 467 | }, 468 | { 469 | "cell_type": "markdown", 470 | "metadata": {}, 471 | "source": [ 472 | "This gives us a JAR under the `target` directory. We can submit this to the spark cluster using the `spark-submit` command." 473 | ] 474 | }, 475 | { 476 | "cell_type": "code", 477 | "collapsed": false, 478 | "input": [ 479 | "spark-submit --class com.oracle.newscluster.NewsClustering target/scala-2.10/newsclustering_2.10-0.1.jar" 480 | ], 481 | "language": "python", 482 | "metadata": {}, 483 | "outputs": [] 484 | }, 485 | { 486 | "cell_type": "heading", 487 | "level": 2, 488 | "metadata": {}, 489 | "source": [ 490 | "Summary" 491 | ] 492 | }, 493 | { 494 | "cell_type": "markdown", 495 | "metadata": {}, 496 | "source": [ 497 | "In this exploration, we considered the question \"What is the news about?\" using machine learning. Along the way, we added some valuable skills to our Big Data toolkit. 
We learned to:\n", 498 | "\n", 499 | "* Use Spark MLlib's Word2Vec to turn raw text into feature vectors\n", 500 | "* Use Apache Spark's core API to perform simple algebra on those vectors\n", 501 | "* Use MLlib's k-means clustering to categorize text in an unsupervised fashion\n", 502 | "* Build and submit a standalone Spark application\n", 503 | "\n", 504 | "At this point, you should be more than ready to write other standalone Spark applications. What sort of applications could you build to apply machine learning to the term-frequency data from previous sections?" 505 | ] 506 | }, 507 | { 508 | "cell_type": "code", 509 | "collapsed": false, 510 | "input": [], 511 | "language": "python", 512 | "metadata": {}, 513 | "outputs": [] 514 | } 515 | ], 516 | "metadata": {} 517 | } 518 | ] 519 | } -------------------------------------------------------------------------------- /notebooks/Video Opportunities.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "", 4 | "signature": "sha256:b46ff24614ac36ce277e9e3f9e20f9bf1c864750c507c1907249c31e6d2bdebc" 5 | }, 6 | "nbformat": 3, 7 | "nbformat_minor": 0, 8 | "worksheets": [ 9 | { 10 | "cells": [ 11 | { 12 | "cell_type": "heading", 13 | "level": 2, 14 | "metadata": {}, 15 | "source": [ 16 | "Video Opportunities: each notebook 10-20 min of video" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "Introduction:\n", 24 | "\n", 25 | " - purpose of the course\n", 26 | " - summary of the story we'll tell\n", 27 | " - what we're doing, why, and how\n", 28 | " \n", 29 | " * we need to think carefully about defining an overarching question that will guide our course\n", 30 | " \n", 31 | " * we should also talk in overview about data frames, RDBMS, databases, Python, etc." 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": {}, 37 | "source": [ 38 | "Part I: Loading Data\n", 39 | "\n", 40 | " Q: What kind of data are we looking for?\n", 41 | " A: Flu statistics. Who is getting the flu, where, when and what kind?\n", 42 | " \n", 43 | " Q: Where might we find this data?\n", 44 | " A: CDC, WHO, HHS, OIE\n", 45 | " \n", 46 | "Notebook 1: Loading and Sharing Simple CSV Data\n", 47 | " - What data is in this file?\n", 48 | " -- mappings of states to their populations and flu surveillance regions, ????\n", 49 | " - How do we plan on using this data?\n", 50 | " - Overview of loading a CSV file\n", 51 | " -- what's the file type? what tools?\n", 52 | " -- process overview: import, parse file, convert, write to DB\n", 53 | " - Putting stuff into the DB (cursors, tables, etc.)\n", 54 | " \n", 55 | "Notebook 2: Loading Simple Delimited Data\n", 56 | " - What data is in this file?\n", 57 | " -- country-level data for influenza surveillance: what strain, how many samples, etc.\n", 58 | " - How are we going to split this data?\n", 59 | " - What does it mean to put the data into tables in the DB?\n", 60 | " - What's a view?\n", 61 | "\n", 62 | "Notebook 3: Loading HHS Flu Vaccination JSON Data\n", 63 | " - What data is in this file?\n", 64 | " - What is JSON? Why do people use it?\n", 65 | "\n", 66 | "Notebook 4: Gathering OIE Pathogenic Flu Data from the Web\n", 67 | " - What data is in this file?\n", 68 | " -- pathogenic strains of influenza in animals\n", 69 | " - Pythonic tools --> what are they?
why?\n", 70 | " - Extracting useful data from HTML pages\n", 71 | " - Dealing with inconsistencies in the data\n", 72 | "\n", 73 | " * are we interested in combining these first few notebooks into one video since they are simpler in content?\n", 74 | " \n" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "Part II: Analyzing and Visualizing Data with Databases\n", 82 | "\n", 83 | " Q: Now that we have some data, what's our next step?\n", 84 | " A: One next step is to use our data to give visual answers to our questions. In this next part, we'll ask four broad questions about the flu and use different data visualization tools to test hypotheses, and then visualize the results.\n", 85 | " \n", 86 | "Notebook 1: Does Ethnicity Impact Vaccination Rates?\n", 87 | " - What is hypothesis testing? Overview of formulating a testable hypothesis\n", 88 | " - T-tests (**** Is this one or two tailed?)\n", 89 | " - Conclusions --> only group that did not differ is African-American vs. Hispanic (**** Do we want to speculate on more conclusions about this?)\n", 90 | " \n", 91 | "Notebook 2: Do Vaccination Rates Impact Flu Rates?\n", 92 | " - Analytical SQL? Ordinary Least Squares regression models? Why?\n", 93 | " - What's statsmodel Python library? Why are we using it?\n", 94 | " - What manipulations do we need to do to the data to get it to something we can analyze?\n", 95 | " - Why should we \"start by taking a look at it?\" Shouldn't we have a hypothesis first?\n", 96 | " -- Using linear models when results aren't obvious visually\n", 97 | " - Why our linear model isn't working\n", 98 | " -- what is this \"print shot_to_sick_model.summary()\" business? nonrobust?\n", 99 | " - New model explanation; theories about why it's better; checking goodness of fit\n", 100 | " - Summary + food for thought\n", 101 | " \n", 102 | " * yikes to \"year-over-year change in flu is CAUSED\" ... we always learned to be about making claims about causation --> correlation does not imply causation\n", 103 | " \n", 104 | "Notebook 3: Does GDP explain flu rates?\n", 105 | " - What's the difference between per capita and total GDP? Why might wealthier countries ahve lower rates of infection?\n", 106 | " - What's a linear model and why is it a good way to answer this question? Correlation between GDP and flu rates? Whats a linear correlation? What's a linear regression model and why would knowing about linear correlation help us decide if it's an appropriate model?\n", 107 | " - Results of type-A and type-B regressions, next steps? \n", 108 | " - Speculations on minor effect of GDP on the flu?\n", 109 | " \n", 110 | "Notebook 4: Does Living in Cities Influence Flu Rates?\n", 111 | " * is it fair to generalize a country as \"largely urban\" or \"largely agrarian\"?\n", 112 | " - Explanation of analysis 'per capita'\n", 113 | " - Quick scatter plot to look for obvious relationships --> why do this before forming a hypothesis? How is poking around different from hypothesis testing? 
How are they related?\n", 114 | " - Results table: p value vs r-squared\n", 115 | " - Set of residual plots\n", 116 | " - \"Notice that as urban population percentage increases, the model explains fewer of the data points\" ** this conclusion needs more explanation, it's not clear\n", 117 | " - A follow-up approach: population and GDP \n", 118 | " - Two-factor linear models\n", 119 | " - Results explain almost 20% more of the data\n", 120 | " - Interaction plot\n", 121 | " \n", 122 | " * more rhetorical questions in the notebooks\n", 123 | " * factor interaction and residual plotting needs more explanation in general, it's not obvious\n", 124 | " \n", 125 | "Notebook 5: How Do Sanitation a\n", 126 | "nd Clean Water Effect Flu Transmission? \n", 127 | " - Thinking about other factors that effect our health: sanitation + clean water\n", 128 | " - Where/what data can we get to look at this?\n", 129 | " - Hypothesis: compare improvements in sanitation to flu cases\n", 130 | " - What does cellular phone access and primary education have to do with anything?\n", 131 | " - Modeling data in parallel\n", 132 | " - Are there noticable differences in sanitation across the WHO regions? \n", 133 | " - What are our factors of interest?\n", 134 | " - Follow-up: modeling the AFRO region\n", 135 | " - Handling sample size\n", 136 | " - Binary vs. linear effects; logistic regression model; testing and training sets\n", 137 | " - Actual classification of the training set? (the graph is confusing)\n", 138 | " - Predictions: **** this whole section of the notebook is confusing\n", 139 | " - Handling an inconclusive analysis\n", 140 | " " 141 | ] 142 | }, 143 | { 144 | "cell_type": "markdown", 145 | "metadata": {}, 146 | "source": [ 147 | "Part III: Analyzing Big Data with Apache Spark\n", 148 | "\n", 149 | "Introduction:\n", 150 | " - What is...\n", 151 | " - Big Data?\n", 152 | " - Apache Spark?\n", 153 | " - Apache Hadoop?\n", 154 | " \n", 155 | "Notebook 1: Basic Big Data Manipulation with PySpark\n", 156 | " - Loading data into a Hadoop file system\n", 157 | " - What's an RDD? Explain the Spark Context\n", 158 | " - What data are we going to work with now? What kinds of 'big data' would help us with our flu investigation?\n", 159 | " - What news stories mention the flu? What are the stories that mention the flu about? How could we go about finding out? What's wrong with our first-pass method?\n", 160 | " \n", 161 | "Notebook 2: Moving and Clustering Data with Sqoop and Spark\n", 162 | "\t- flu types a + b (http://www.cdc.gov/flu/about/viruses/types.htm)\n", 163 | "\t- integrating sql\n", 164 | "\t- deciding how many clusters with k means (within set sum of squared error, elbow in the graph, point of diminishing returns)\n", 165 | "\t- flatten_cluster_data\n", 166 | "\t- how good is the flu reporting? how large is the population?\n", 167 | " \n", 168 | "Notebook 3: Finding Important Words with Spark\n", 169 | " - Guiding questions: How do we figure out what articles are \"about\"? How important is news about the flu over time?\n", 170 | " - TF-IDF transformation\n", 171 | " - Cleaning and normalizing the text\n", 172 | " - PairRDD\n", 173 | " - Scale term frequency by how common that word is across all docs\n", 174 | " - Reframing the problem: Why would we want to know about importance over time? 
Is there a flu season?\n", 175 | " - Incorporating all the data\n", 176 | " - Real trends\n", 177 | "\n", 178 | " * Is it robust to consider words inside of date ranges without considering them inside of articles?\n", 179 | " \n", 180 | " * I think this notebook needs more explanation\n", 181 | " \n", 182 | "Notebook 4: Building a Trend Search with Big Data and Big Data SQL\n", 183 | " - Preliminary question unclear -- does the flu impact the news?\n", 184 | " - Pairing term-trend data with statistics data\n", 185 | " - Big Data SQL, what is it and how is it different from what we just did with batch-based copies in Hadoop\n", 186 | " - First-pass comparison\n", 187 | " - No correlated rise between H5N1 and countries with the most outbreaks --> what's another approach?\n", 188 | " - The SQL 'like' operator\n", 189 | " - Summary, what is causing what here? \n", 190 | " \n", 191 | " * possible error in [36]? should there be a graph here?\n", 192 | " * it's a little fuzzy in the logic, may need more explanation step by step\n", 193 | " * what is the Out[40] graph?\n", 194 | " \n", 195 | "Notebok 5: Clustering the news with Spark and MLLib\n", 196 | " - Categorizing the news: text processing + machine learning\n", 197 | " - k-means clustering, brief overview of supervised vs. unsupervised machine learning techniques, importance of clustering\n", 198 | " - Scala Case Class?\n", 199 | " - JSON4s? Read operation with type\n", 200 | " - How to get numeric vectors out of words (word2vec)\n", 201 | " - Building a stand-alone application\n", 202 | " - Follow-up questions, summary of machine learning techniques with Big Data\n", 203 | " \n", 204 | " * this notebook needs graphs??\n", 205 | " * why randomly talk about a stand-alone application here? this should be a separate, supplementary notebook with more meat or more suggestions for future work\n", 206 | " * the material in this notebook is already difficult, we probably won't have time to cover the stand-alone part sufficiently\n", 207 | " \n", 208 | "Notebook 6: Collecting Streaming News with Flume and Spark Streaming\n", 209 | " - How do we keep up with news happening in real time? \n", 210 | " - Explanation of a \"stream\" of data\n", 211 | " - End-to-end stream processing system, Reuters news wire\n", 212 | " - Collecting RSS data in python\n", 213 | " - Apache Flume -- distributed data transfer system\n", 214 | " - Spark streaming, spark applications, value of processing incoming data in real time (how is this different from analyzing historical data?)\n", 215 | " \n", 216 | " - E-Tag data, high-water mark for news we've seen\n", 217 | " - Source, channel, sink\n", 218 | " - Using Spark Streaming to Search Streams\n", 219 | " - processs the JSON records, keep track of how many articles we've seen, and write any articles about the flu to disk for later use\n", 220 | " - If we've found flu data, we want to write it to HDFS --> how do we know we've found flu data?\n", 221 | " \n", 222 | " - Providing libraries in a JAR file\n", 223 | " - Summary of end-to-end pipeline that can monitor the news\n", 224 | " - Follow-up projects: Add some of the machine learning approaches we applied to clustering the news to the streaming case. Could we detect a flu outbreak as it's happening?\n", 225 | " \n", 226 | " * should we cut down the output In[52]?" 
227 | ] 228 | }, 229 | { 230 | "cell_type": "markdown", 231 | "metadata": {}, 232 | "source": [ 233 | "Conclusion:\n", 234 | " - Restate purpose of the workshop\n", 235 | " - Who we are\n", 236 | " - Where to learn more, how to ask questions, social media, etc.\n", 237 | "\n", 238 | " * How are we going to handle ours/Oracle's affiliation with this project?" 239 | ] 240 | }, 241 | { 242 | "cell_type": "markdown", 243 | "metadata": {}, 244 | "source": [ 245 | "FAQ's\n", 246 | " - Should we have a list of all the developer/data tools we use with links to their documentation ('for more information')\n", 247 | " - How to contact us/report bugs\n", 248 | " - Where'd we get our data?\n", 249 | " - Suggested follow-up projects\n", 250 | " - Github repo, video links" 251 | ] 252 | } 253 | ], 254 | "metadata": {} 255 | } 256 | ] 257 | } -------------------------------------------------------------------------------- /setup/00-pyspark-setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | spark_home = os.environ.get('SPARK_HOME', None) 5 | if not spark_home: 6 | raise ValueError('SPARK_HOME environment variable is not set') 7 | sys.path.insert(0, os.path.join(spark_home, 'python')) 8 | sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.8.2.1-src.zip')) 9 | execfile(os.path.join(spark_home, 'python/pyspark/shell.py')) 10 | -------------------------------------------------------------------------------- /setup/data_science_bootcamp_setup.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | #we need a newer version of numpy, so we'll get it from pip 4 | echo "Fixing setuptools" 5 | sudo -E yum remove numpy python-setuptools 6 | #install the necessary libraries 7 | echo "installing BLAS and LAPACK" 8 | sudo -E yum install blas blas-devel lapack lapack-devel 9 | wget --no-check-certificate https://bootstrap.pypa.io/ez_setup.py 10 | sudo -E python ez_setup.py --insecure 11 | sudo -E easy_install nose 12 | sudo yum install impala-shell 13 | #fix pip and easy_install 14 | echo "Installing pip" 15 | sudo -E easy_install pip 16 | sudo -E easy_install -U distribute 17 | #pip install all our basic python modules 18 | echo "installing numpy" 19 | sudo -E easy_install numpy 20 | echo "installing scipy" 21 | sudo -E easy_install scipy 22 | echo "installing pandas" 23 | sudo -E pip install pandas 24 | echo "installing cx_Oracle and SQLAlchemy" 25 | sudo -E pip install cx_Oracle 26 | sudo -E pip install SQLAlchemy 27 | echo "installing pandasql vincent and seaborn" 28 | sudo -E pip install pandasql vincent seaborn 29 | echo "installing bs4 requests and feedparser" 30 | sudo -E pip install beautifulsoup4 requests feedparser 31 | echo "installing statsmodels and scikit-learn" 32 | sudo -E pip install statsmodels scikit-learn 33 | #upgrade spark to a reasonable version 34 | echo "installing spark 1.2" 35 | sudo -E yum install spark-core spark-master spark-worker spark-history-server spark-python 36 | #python 2.6 requires ipython 1.x, so we need to git clone 37 | echo "installing ipython" 38 | git clone https://github.com/ipython/ipython.git 39 | cd ipython 40 | git checkout 1.x 41 | git pull origin 1.x 42 | sudo -E python setup.py install 43 | sudo -E pip install pyzmq jinja2 tornado ipython-sql 44 | #set up the pyspark profile 45 | echo "installing pyspark profile for ipython" 46 | ipython profile create pyspark 47 | cp ipython_notebook_config_spark.py 
$HOME/.config/ipython/profile_pyspark/ipython_notebook_config.py 48 | cp 00-pyspark-setup.py $HOME/.config/ipython/profile_pyspark/startup/ 49 | echo "installing SBT" 50 | #install sbt 51 | wget -O sbt-0.13.7.rpm https://dl.bintray.com/sbt/rpm/sbt-0.13.7.rpm 52 | sudo -E yum localinstall sbt-0.13.7.rpm 53 | #run the get-data scripts 54 | ./download_data.sh 55 | #run the database setup script 56 | cat fludb.sql | sqlplus sys/welcome1 as sysdba 57 | #install rvm and rubies 58 | echo "installing RVM and Ruby" 59 | gpg2 --keyserver hkp://keys.gnupg.net --recv-keys D39DC0E3 60 | \curl -sSL https://get.rvm.io | bash -s stable 61 | source /home/oracle/.rvm/scripts/rvm 62 | rvm install jruby 63 | echo "setting environment and loading data" 64 | echo "export SPARK_HOME=/usr/lib/spark" >> ~/.bashrc 65 | #finished -------------------------------------------------------------------------------- /setup/download_data.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | #run the get-data scripts 4 | cd ../flu_statistics 5 | ./get_flu_summary_data.sh 6 | cd ../flu_news 7 | ./get_news_data.sh 8 | cd ../setup -------------------------------------------------------------------------------- /setup/fludb.sql: -------------------------------------------------------------------------------- 1 | prompt >> Starting up 2 | startup 3 | prompt >> Dropping fludb user 4 | drop user fludb cascade; 5 | prompt >> Creating fludb... 6 | 7 | prompt >> Creating tablespace 8 | CREATE TABLESPACE FLUDB DATAFILE 'fludb.dbf' SIZE 1G reuse AUTOEXTEND ON nologging; 9 | 10 | prompt >> Creating user 11 | CREATE USER fludb IDENTIFIED BY flushot 12 | DEFAULT TABLESPACE FLUDB 13 | QUOTA UNLIMITED ON FLUDB; 14 | 15 | prompt >> Assiging privileges 16 | grant dba to fludb; 17 | grant ALTER ANY PROCEDURE to fludb; 18 | grant ALTER SYSTEM to fludb; 19 | grant CREATE ANY PROCEDURE to fludb; 20 | grant CREATE PROCEDURE to fludb; 21 | grant CREATE TABLE to fludb; 22 | grant DEBUG ANY PROCEDURE to fludb; 23 | grant DEBUG CONNECT SESSION to fludb; 24 | grant EXECUTE ANY PROCEDURE to fludb; 25 | grant UNLIMITED TABLESPACE to fludb; 26 | 27 | prompt >> creating staging directory 28 | !mkdir /home/oracle/fludb_staging 29 | create or replace directory fludb_directory as '/home/oracle/fludb_staging'; 30 | grant all on directory fludb_directory to fludb; 31 | quit 32 | -------------------------------------------------------------------------------- /setup/ipython_notebook_config_spark.py: -------------------------------------------------------------------------------- 1 | # Configuration file for ipython-notebook. 2 | 3 | c = get_config() 4 | 5 | #------------------------------------------------------------------------------ 6 | # NotebookApp configuration 7 | #------------------------------------------------------------------------------ 8 | 9 | # NotebookApp will inherit config from: BaseIPythonApplication, Application 10 | 11 | # The url for MathJax.js. 12 | # c.NotebookApp.mathjax_url = '' 13 | 14 | # The IP address the notebook server will listen on. 15 | # c.NotebookApp.ip = '127.0.0.1' 16 | 17 | # The base URL for the notebook server. 18 | # 19 | # Leading and trailing slashes can be omitted, and will automatically be added. 20 | # c.NotebookApp.base_project_url = '/' 21 | 22 | # Create a massive crash report when IPython encounters what may be an internal 23 | # error. 
The default is to append a short message to the usual traceback 24 | # c.NotebookApp.verbose_crash = False 25 | 26 | # The random bytes used to secure cookies. By default this is a new random 27 | # number every time you start the Notebook. Set it to a value in a config file 28 | # to enable logins to persist across server sessions. 29 | # 30 | # Note: Cookie secrets should be kept private, do not share config files with 31 | # cookie_secret stored in plaintext (you can read the value from a file). 32 | # c.NotebookApp.cookie_secret = '' 33 | 34 | # The number of additional ports to try if the specified port is not available. 35 | # c.NotebookApp.port_retries = 50 36 | 37 | # Whether to open in a browser after starting. The specific browser used is 38 | # platform dependent and determined by the python standard library `webbrowser` 39 | # module, unless it is overridden using the --browser (NotebookApp.browser) 40 | # configuration option. 41 | # c.NotebookApp.open_browser = True 42 | 43 | # The notebook manager class to use. 44 | # c.NotebookApp.notebook_manager_class = 'IPython.html.services.notebooks.filenbmanager.FileNotebookManager' 45 | 46 | # The date format used by logging formatters for %(asctime)s 47 | # c.NotebookApp.log_datefmt = '%Y-%m-%d %H:%M:%S' 48 | 49 | # The base URL for the kernel server 50 | # 51 | # Leading and trailing slashes can be omitted, and will automatically be added. 52 | # c.NotebookApp.base_kernel_url = '/' 53 | 54 | # The port the notebook server will listen on. 55 | c.NotebookApp.port = 8880 56 | 57 | # Whether to overwrite existing config files when copying 58 | # c.NotebookApp.overwrite = False 59 | 60 | # Whether to enable MathJax for typesetting math/TeX 61 | # 62 | # MathJax is the javascript library IPython uses to render math/LaTeX. It is 63 | # very large, so you may want to disable it if you have a slow internet 64 | # connection, or for offline use of the notebook. 65 | # 66 | # When disabled, equations etc. will appear as their untransformed TeX source. 67 | # c.NotebookApp.enable_mathjax = True 68 | 69 | # The full path to an SSL/TLS certificate file. 70 | # c.NotebookApp.certfile = u'' 71 | 72 | # Path to an extra config file to load. 73 | # 74 | # If specified, load this config file in addition to any other IPython config. 75 | # c.NotebookApp.extra_config_file = u'' 76 | 77 | # The IPython profile to use. 78 | # c.NotebookApp.profile = u'default' 79 | 80 | # The base URL for the websocket server, if it differs from the HTTP server 81 | # (hint: it almost certainly doesn't). 82 | # 83 | # Should be in the form of an HTTP origin: ws[s]://hostname[:port] 84 | # c.NotebookApp.websocket_url = '' 85 | 86 | # The name of the IPython directory. This directory is used for logging 87 | # configuration (through profiles), history storage, etc. The default is usually 88 | # $HOME/.ipython. This options can also be specified through the environment 89 | # variable IPYTHONDIR. 90 | # c.NotebookApp.ipython_dir = u'/home/oracle/.config/ipython' 91 | 92 | # Set the log level by value or name. 93 | # c.NotebookApp.log_level = 30 94 | 95 | # Hashed password to use for web authentication. 96 | # 97 | # To generate, type in a python/IPython shell: 98 | # 99 | # from IPython.lib import passwd; passwd() 100 | # 101 | # The string should be of the form type:salt:hashed-password. 
102 | # c.NotebookApp.password = u'' 103 | 104 | # The Logging format template 105 | # c.NotebookApp.log_format = '[%(name)s]%(highlevel)s %(message)s' 106 | 107 | # Wether to use Browser Side less-css parsing instead of compiled css version in 108 | # templates that allows it. This is mainly convenient when working on the less 109 | # file to avoid a build step, or if user want to overwrite some of the less 110 | # variables without having to recompile everything. 111 | # 112 | # You will need to install the less.js component in the static directory either 113 | # in the source tree or in your profile folder. 114 | # c.NotebookApp.use_less = False 115 | 116 | # Extra paths to search for serving static files. 117 | # 118 | # This allows adding javascript/css to be available from the notebook server 119 | # machine, or overriding individual files in the IPython 120 | # c.NotebookApp.extra_static_paths = [] 121 | 122 | # Whether to trust or not X-Scheme/X-Forwarded-Proto and X-Real-Ip/X-Forwarded- 123 | # For headerssent by the upstream reverse proxy. Neccesary if the proxy handles 124 | # SSL 125 | # c.NotebookApp.trust_xheaders = False 126 | 127 | # Whether to install the default config files into the profile dir. If a new 128 | # profile is being created, and IPython contains config files for that profile, 129 | # then they will be staged into the new directory. Otherwise, default config 130 | # files will be automatically generated. 131 | # c.NotebookApp.copy_config_files = False 132 | 133 | # The full path to a private key file for usage with SSL/TLS. 134 | # c.NotebookApp.keyfile = u'' 135 | 136 | # Supply overrides for the tornado.web.Application that the IPython notebook 137 | # uses. 138 | # c.NotebookApp.webapp_settings = {} 139 | 140 | # Specify what command to use to invoke a web browser when opening the notebook. 141 | # If not specified, the default browser will be determined by the `webbrowser` 142 | # standard library module, which allows setting of the BROWSER environment 143 | # variable to override it. 144 | # c.NotebookApp.browser = u'' 145 | 146 | #------------------------------------------------------------------------------ 147 | # IPKernelApp configuration 148 | #------------------------------------------------------------------------------ 149 | 150 | # IPython: an enhanced interactive Python shell. 151 | 152 | # IPKernelApp will inherit config from: BaseIPythonApplication, Application, 153 | # InteractiveShellApp 154 | 155 | # The importstring for the DisplayHook factory 156 | # c.IPKernelApp.displayhook_class = 'IPython.kernel.zmq.displayhook.ZMQDisplayHook' 157 | 158 | # Set the IP or interface on which the kernel will listen. 159 | # c.IPKernelApp.ip = u'' 160 | 161 | # Pre-load matplotlib and numpy for interactive use, selecting a particular 162 | # matplotlib backend and loop integration. 163 | # c.IPKernelApp.pylab = None 164 | 165 | # Create a massive crash report when IPython encounters what may be an internal 166 | # error. The default is to append a short message to the usual traceback 167 | # c.IPKernelApp.verbose_crash = False 168 | 169 | # The Kernel subclass to be used. 170 | # 171 | # This should allow easy re-use of the IPKernelApp entry point to configure and 172 | # launch kernels other than IPython's own. 173 | # c.IPKernelApp.kernel_class = 'IPython.kernel.zmq.ipkernel.Kernel' 174 | 175 | # Run the module as a script. 
176 | # c.IPKernelApp.module_to_run = '' 177 | 178 | # The date format used by logging formatters for %(asctime)s 179 | # c.IPKernelApp.log_datefmt = '%Y-%m-%d %H:%M:%S' 180 | 181 | # set the shell (ROUTER) port [default: random] 182 | # c.IPKernelApp.shell_port = 0 183 | 184 | # set the control (ROUTER) port [default: random] 185 | # c.IPKernelApp.control_port = 0 186 | 187 | # Whether to overwrite existing config files when copying 188 | # c.IPKernelApp.overwrite = False 189 | 190 | # Execute the given command string. 191 | # c.IPKernelApp.code_to_run = '' 192 | 193 | # set the stdin (ROUTER) port [default: random] 194 | # c.IPKernelApp.stdin_port = 0 195 | 196 | # Set the log level by value or name. 197 | # c.IPKernelApp.log_level = 30 198 | 199 | # lines of code to run at IPython startup. 200 | # c.IPKernelApp.exec_lines = [] 201 | 202 | # Path to an extra config file to load. 203 | # 204 | # If specified, load this config file in addition to any other IPython config. 205 | # c.IPKernelApp.extra_config_file = u'' 206 | 207 | # The importstring for the OutStream factory 208 | # c.IPKernelApp.outstream_class = 'IPython.kernel.zmq.iostream.OutStream' 209 | 210 | # Whether to create profile dir if it doesn't exist 211 | # c.IPKernelApp.auto_create = False 212 | 213 | # set the heartbeat port [default: random] 214 | # c.IPKernelApp.hb_port = 0 215 | 216 | # 217 | # c.IPKernelApp.transport = 'tcp' 218 | 219 | # redirect stdout to the null device 220 | # c.IPKernelApp.no_stdout = False 221 | 222 | # dotted module name of an IPython extension to load. 223 | # c.IPKernelApp.extra_extension = '' 224 | 225 | # A file to be run 226 | # c.IPKernelApp.file_to_run = '' 227 | 228 | # The IPython profile to use. 229 | # c.IPKernelApp.profile = u'default' 230 | 231 | # 232 | # c.IPKernelApp.parent_appname = u'' 233 | 234 | # kill this process if its parent dies. On Windows, the argument specifies the 235 | # HANDLE of the parent process, otherwise it is simply boolean. 236 | # c.IPKernelApp.parent_handle = 0 237 | 238 | # JSON file in which to store connection info [default: kernel-.json] 239 | # 240 | # This file will contain the IP, ports, and authentication key needed to connect 241 | # clients to this kernel. By default, this file will be created in the security 242 | # dir of the current profile, but can be specified by absolute path. 243 | # c.IPKernelApp.connection_file = '' 244 | 245 | # If true, IPython will populate the user namespace with numpy, pylab, etc. and 246 | # an 'import *' is done from numpy and pylab, when using pylab mode. 247 | # 248 | # When False, pylab mode should not import any names into the user namespace. 249 | # c.IPKernelApp.pylab_import_all = True 250 | 251 | # The name of the IPython directory. This directory is used for logging 252 | # configuration (through profiles), history storage, etc. The default is usually 253 | # $HOME/.ipython. This options can also be specified through the environment 254 | # variable IPYTHONDIR. 255 | # c.IPKernelApp.ipython_dir = u'/home/oracle/.config/ipython' 256 | 257 | # Configure matplotlib for interactive use with the default matplotlib backend. 258 | # c.IPKernelApp.matplotlib = None 259 | 260 | # ONLY USED ON WINDOWS Interrupt this process when the parent is signaled. 261 | # c.IPKernelApp.interrupt = 0 262 | 263 | # Whether to install the default config files into the profile dir. If a new 264 | # profile is being created, and IPython contains config files for that profile, 265 | # then they will be staged into the new directory. 
Otherwise, default config 266 | # files will be automatically generated. 267 | # c.IPKernelApp.copy_config_files = False 268 | 269 | # List of files to run at IPython startup. 270 | # c.IPKernelApp.exec_files = [] 271 | 272 | # Enable GUI event loop integration with any of ('glut', 'gtk', 'gtk3', 'none', 273 | # 'osx', 'pyglet', 'qt', 'qt4', 'tk', 'wx'). 274 | # c.IPKernelApp.gui = None 275 | 276 | # A list of dotted module names of IPython extensions to load. 277 | # c.IPKernelApp.extensions = [] 278 | 279 | # redirect stderr to the null device 280 | # c.IPKernelApp.no_stderr = False 281 | 282 | # The Logging format template 283 | # c.IPKernelApp.log_format = '[%(name)s]%(highlevel)s %(message)s' 284 | 285 | # set the iopub (PUB) port [default: random] 286 | # c.IPKernelApp.iopub_port = 0 287 | 288 | #------------------------------------------------------------------------------ 289 | # ZMQInteractiveShell configuration 290 | #------------------------------------------------------------------------------ 291 | 292 | # A subclass of InteractiveShell for ZMQ. 293 | 294 | # ZMQInteractiveShell will inherit config from: InteractiveShell 295 | 296 | # Use colors for displaying information about objects. Because this information 297 | # is passed through a pager (like 'less'), and some pagers get confused with 298 | # color codes, this capability can be turned off. 299 | # c.ZMQInteractiveShell.color_info = True 300 | 301 | # A list of ast.NodeTransformer subclass instances, which will be applied to 302 | # user input before code is run. 303 | # c.ZMQInteractiveShell.ast_transformers = [] 304 | 305 | # 306 | # c.ZMQInteractiveShell.history_length = 10000 307 | 308 | # Don't call post-execute functions that have failed in the past. 309 | # c.ZMQInteractiveShell.disable_failing_post_execute = False 310 | 311 | # Show rewritten input, e.g. for autocall. 312 | # c.ZMQInteractiveShell.show_rewritten_input = True 313 | 314 | # Set the color scheme (NoColor, Linux, or LightBG). 315 | # c.ZMQInteractiveShell.colors = 'Linux' 316 | 317 | # 318 | # c.ZMQInteractiveShell.separate_in = '\n' 319 | 320 | # Deprecated, use PromptManager.in2_template 321 | # c.ZMQInteractiveShell.prompt_in2 = ' .\\D.: ' 322 | 323 | # 324 | # c.ZMQInteractiveShell.separate_out = '' 325 | 326 | # Deprecated, use PromptManager.in_template 327 | # c.ZMQInteractiveShell.prompt_in1 = 'In [\\#]: ' 328 | 329 | # Enable deep (recursive) reloading by default. IPython can use the deep_reload 330 | # module which reloads changes in modules recursively (it replaces the reload() 331 | # function, so you don't need to change anything to use it). deep_reload() 332 | # forces a full reload of modules whose code may have changed, which the default 333 | # reload() function does not. When deep_reload is off, IPython will use the 334 | # normal reload(), but deep_reload will still be available as dreload(). 335 | # c.ZMQInteractiveShell.deep_reload = False 336 | 337 | # Make IPython automatically call any callable object even if you didn't type 338 | # explicit parentheses. For example, 'str 43' becomes 'str(43)' automatically. 339 | # The value can be '0' to disable the feature, '1' for 'smart' autocall, where 340 | # it is not applied if there are no more arguments on the line, and '2' for 341 | # 'full' autocall, where all callable objects are automatically called (even if 342 | # no arguments are present). 
343 | # c.ZMQInteractiveShell.autocall = 0 344 | 345 | # 346 | # c.ZMQInteractiveShell.separate_out2 = '' 347 | 348 | # Deprecated, use PromptManager.justify 349 | # c.ZMQInteractiveShell.prompts_pad_left = True 350 | 351 | # 352 | # c.ZMQInteractiveShell.readline_parse_and_bind = ['tab: complete', '"\\C-l": clear-screen', 'set show-all-if-ambiguous on', '"\\C-o": tab-insert', '"\\C-r": reverse-search-history', '"\\C-s": forward-search-history', '"\\C-p": history-search-backward', '"\\C-n": history-search-forward', '"\\e[A": history-search-backward', '"\\e[B": history-search-forward', '"\\C-k": kill-line', '"\\C-u": unix-line-discard'] 353 | 354 | # Enable magic commands to be called without the leading %. 355 | # c.ZMQInteractiveShell.automagic = True 356 | 357 | # 358 | # c.ZMQInteractiveShell.debug = False 359 | 360 | # 361 | # c.ZMQInteractiveShell.object_info_string_level = 0 362 | 363 | # 364 | # c.ZMQInteractiveShell.ipython_dir = '' 365 | 366 | # 367 | # c.ZMQInteractiveShell.readline_remove_delims = '-/~' 368 | 369 | # Start logging to the default log file. 370 | # c.ZMQInteractiveShell.logstart = False 371 | 372 | # The name of the logfile to use. 373 | # c.ZMQInteractiveShell.logfile = '' 374 | 375 | # 376 | # c.ZMQInteractiveShell.wildcards_case_sensitive = True 377 | 378 | # Save multi-line entries as one entry in readline history 379 | # c.ZMQInteractiveShell.multiline_history = True 380 | 381 | # Start logging to the given file in append mode. 382 | # c.ZMQInteractiveShell.logappend = '' 383 | 384 | # 385 | # c.ZMQInteractiveShell.xmode = 'Context' 386 | 387 | # 388 | # c.ZMQInteractiveShell.quiet = False 389 | 390 | # Deprecated, use PromptManager.out_template 391 | # c.ZMQInteractiveShell.prompt_out = 'Out[\\#]: ' 392 | 393 | # Set the size of the output cache. The default is 1000, you can change it 394 | # permanently in your config file. Setting it to 0 completely disables the 395 | # caching system, and the minimum value accepted is 20 (if you provide a value 396 | # less than 20, it is reset to 0 and a warning is issued). This limit is 397 | # defined because otherwise you'll spend more time re-flushing a too small cache 398 | # than working 399 | # c.ZMQInteractiveShell.cache_size = 1000 400 | 401 | # 'all', 'last', 'last_expr' or 'none', specifying which nodes should be run 402 | # interactively (displaying output from expressions). 403 | # c.ZMQInteractiveShell.ast_node_interactivity = 'last_expr' 404 | 405 | # Automatically call the pdb debugger after every exception. 406 | # c.ZMQInteractiveShell.pdb = False 407 | 408 | #------------------------------------------------------------------------------ 409 | # KernelManager configuration 410 | #------------------------------------------------------------------------------ 411 | 412 | # Manages a single kernel in a subprocess on this host. 413 | # 414 | # This version starts kernels with Popen. 415 | 416 | # KernelManager will inherit config from: ConnectionFileMixin 417 | 418 | # The Popen Command to launch the kernel. Override this if you have a custom 419 | # c.KernelManager.kernel_cmd = [] 420 | 421 | # Set the kernel's IP address [default localhost]. If the IP address is 422 | # something other than localhost, then Consoles on other machines will be able 423 | # to connect to the Kernel, so be careful! 424 | # c.KernelManager.ip = '127.0.0.1' 425 | 426 | # 427 | # c.KernelManager.transport = 'tcp' 428 | 429 | # Should we autorestart the kernel if it dies. 
430 | # c.KernelManager.autorestart = False 431 | 432 | #------------------------------------------------------------------------------ 433 | # ProfileDir configuration 434 | #------------------------------------------------------------------------------ 435 | 436 | # An object to manage the profile directory and its resources. 437 | # 438 | # The profile directory is used by all IPython applications, to manage 439 | # configuration, logging and security. 440 | # 441 | # This object knows how to find, create and manage these directories. This 442 | # should be used by any code that wants to handle profiles. 443 | 444 | # Set the profile location directly. This overrides the logic used by the 445 | # `profile` option. 446 | # c.ProfileDir.location = u'' 447 | 448 | #------------------------------------------------------------------------------ 449 | # Session configuration 450 | #------------------------------------------------------------------------------ 451 | 452 | # Object for handling serialization and sending of messages. 453 | # 454 | # The Session object handles building messages and sending them with ZMQ sockets 455 | # or ZMQStream objects. Objects can communicate with each other over the 456 | # network via Session objects, and only need to work with the dict-based IPython 457 | # message spec. The Session will handle serialization/deserialization, security, 458 | # and metadata. 459 | # 460 | # Sessions support configurable serialiization via packer/unpacker traits, and 461 | # signing with HMAC digests via the key/keyfile traits. 462 | # 463 | # Parameters ---------- 464 | # 465 | # debug : bool 466 | # whether to trigger extra debugging statements 467 | # packer/unpacker : str : 'json', 'pickle' or import_string 468 | # importstrings for methods to serialize message parts. If just 469 | # 'json' or 'pickle', predefined JSON and pickle packers will be used. 470 | # Otherwise, the entire importstring must be used. 471 | # 472 | # The functions must accept at least valid JSON input, and output *bytes*. 473 | # 474 | # For example, to use msgpack: 475 | # packer = 'msgpack.packb', unpacker='msgpack.unpackb' 476 | # pack/unpack : callables 477 | # You can also set the pack/unpack callables for serialization directly. 478 | # session : bytes 479 | # the ID of this Session object. The default is to generate a new UUID. 480 | # username : unicode 481 | # username added to message headers. The default is to ask the OS. 482 | # key : bytes 483 | # The key used to initialize an HMAC signature. If unset, messages 484 | # will not be signed or checked. 485 | # keyfile : filepath 486 | # The file containing a key. If this is set, `key` will be initialized 487 | # to the contents of the file. 488 | 489 | # Username for the Session. Default is your system username. 490 | # c.Session.username = u'oracle' 491 | 492 | # The name of the unpacker for unserializing messages. Only used with custom 493 | # functions for `packer`. 494 | # c.Session.unpacker = 'json' 495 | 496 | # Threshold (in bytes) beyond which a buffer should be sent without copying. 497 | # c.Session.copy_threshold = 65536 498 | 499 | # The name of the packer for serializing messages. Should be one of 'json', 500 | # 'pickle', or an import name for a custom callable serializer. 501 | # c.Session.packer = 'json' 502 | 503 | # The maximum number of digests to remember. 504 | # 505 | # The digest history will be culled when it exceeds this value. 
506 | # c.Session.digest_history_size = 65536 507 | 508 | # The UUID identifying this session. 509 | # c.Session.session = u'' 510 | 511 | # The digest scheme used to construct the message signatures. Must have the form 512 | # 'hmac-HASH'. 513 | # c.Session.signature_scheme = 'hmac-sha256' 514 | 515 | # execution key, for extra authentication. 516 | # c.Session.key = '' 517 | 518 | # Debug output in the Session 519 | # c.Session.debug = False 520 | 521 | # The maximum number of items for a container to be introspected for custom 522 | # serialization. Containers larger than this are pickled outright. 523 | # c.Session.item_threshold = 64 524 | 525 | # path to file containing execution key. 526 | # c.Session.keyfile = '' 527 | 528 | # Threshold (in bytes) beyond which an object's buffer should be extracted to 529 | # avoid pickling. 530 | # c.Session.buffer_threshold = 1024 531 | 532 | # Metadata dictionary, which serves as the default top-level metadata dict for 533 | # each message. 534 | # c.Session.metadata = {} 535 | 536 | #------------------------------------------------------------------------------ 537 | # InlineBackend configuration 538 | #------------------------------------------------------------------------------ 539 | 540 | # An object to store configuration of the inline backend. 541 | 542 | # The image format for figures with the inline backend. 543 | # c.InlineBackend.figure_format = 'png' 544 | 545 | # Close all figures at the end of each cell. 546 | # 547 | # When True, ensures that each cell starts with no active figures, but it also 548 | # means that one must keep track of references in order to edit or redraw 549 | # figures in subsequent cells. This mode is ideal for the notebook, where 550 | # residual plots from other cells might be surprising. 551 | # 552 | # When False, one must call figure() to create new figures. This means that 553 | # gcf() and getfigs() can reference figures created in other cells, and the 554 | # active figure can continue to be edited with pylab/pyplot methods that 555 | # reference the current active figure. This mode facilitates iterative editing 556 | # of figures, and behaves most consistently with other matplotlib backends, but 557 | # figure barriers between cells must be explicit. 558 | # c.InlineBackend.close_figures = True 559 | 560 | # Subset of matplotlib rcParams that should be different for the inline backend. 561 | # c.InlineBackend.rc = {'font.size': 10, 'figure.figsize': (6.0, 4.0), 'figure.facecolor': 'white', 'savefig.dpi': 72, 'figure.subplot.bottom': 0.125, 'figure.edgecolor': 'white'} 562 | 563 | #------------------------------------------------------------------------------ 564 | # MappingKernelManager configuration 565 | #------------------------------------------------------------------------------ 566 | 567 | # A KernelManager that handles notebook mapping and HTTP error handling 568 | 569 | # MappingKernelManager will inherit config from: MultiKernelManager 570 | 571 | # The kernel manager class. This is configurable to allow subclassing of the 572 | # KernelManager for customized behavior. 573 | # c.MappingKernelManager.kernel_manager_class = 'IPython.kernel.ioloop.IOLoopKernelManager' 574 | 575 | #------------------------------------------------------------------------------ 576 | # NotebookManager configuration 577 | #------------------------------------------------------------------------------ 578 | 579 | # The directory to use for notebooks. 
580 | # c.NotebookManager.notebook_dir = u'/home/oracle' 581 | 582 | #------------------------------------------------------------------------------ 583 | # FileNotebookManager configuration 584 | #------------------------------------------------------------------------------ 585 | 586 | # FileNotebookManager will inherit config from: NotebookManager 587 | 588 | # The location in which to keep notebook checkpoints 589 | # 590 | # By default, it is notebook-dir/.ipynb_checkpoints 591 | # c.FileNotebookManager.checkpoint_dir = u'' 592 | 593 | # Automatically create a Python script when saving the notebook. 594 | # 595 | # For easier use of import, %run and %load across notebooks, a .py script 596 | # will be created next to any .ipynb on each save. This can also be set with 597 | # the short `--script` flag. 598 | # c.FileNotebookManager.save_script = False 599 | 600 | # The directory to use for notebooks. 601 | # c.FileNotebookManager.notebook_dir = u'/home/oracle' 602 | -------------------------------------------------------------------------------- /setup/setup_pyspark_notebook.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | echo "installing pyspark profile for ipython" 4 | ipython profile create pyspark 5 | cp ipython_notebook_config_spark.py $HOME/.config/ipython/profile_pyspark/ipython_notebook_config.py 6 | cp 00-pyspark-setup.py $HOME/.config/ipython/profile_pyspark/startup/ 7 | -------------------------------------------------------------------------------- /templates/pandas_oracle_template.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | 3 | """ Template for connecting pandas to Oracle 12c""" 4 | import sys 5 | from sqlalchemy import create_engine 6 | import pandas as pd 7 | 8 | def main(username, password, hoststring, table): 9 | engine = create_engine("oracle://{0}:{1}@{2}".format(username, password, hoststring)) 10 | data = pd.read_sql_table(table, engine) 11 | print data.head() 12 | 13 | if __name__ == "__main__": 14 | if len(sys.argv) != 5: 15 | print "Usage: pandas_oracle_template.py username password hoststring table" 16 | else: 17 | main(*sys.argv[1:]) 18 | -------------------------------------------------------------------------------- /templates/raw_oracle_template.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | 3 | """ A template for python programs connecting to 4 | Oracle DB 12c using cx-Oracle""" 5 | 6 | import sys 7 | import cx_Oracle as cx 8 | 9 | def main(username, password, hoststring): 10 | # connect to the db 11 | db = cx.connect(username, password, hoststring) 12 | # get a cursor 13 | c = db.cursor() 14 | # do stuff! 15 | 16 | if __name__ == "__main__": 17 | if len(sys.argv) != 4: 18 | print "Usage: raw_oracle_template.py username password hoststring" 19 | else: 20 | main(*sys.argv[1:]) 21 | 22 | -------------------------------------------------------------------------------- /templates/sql_alchemy_oracle_template.py: -------------------------------------------------------------------------------- 1 | #!
/usr/bin/env python 2 | 3 | """ Rough template for using SQLAlchemy with Oracle DB 12c""" 4 | 5 | import sys 6 | from sqlalchemy import * 7 | from sqlalchemy.orm import * 8 | 9 | def main(username, password, hoststring): 10 | engine = create_engine("oracle://{0}:{1}@{2}".format(username, password, hoststring)) 11 | metadata = MetaData(engine) 12 | Session = sessionmaker(engine) 13 | session = Session() 14 | 15 | 16 | if __name__ == "__main__": 17 | if len(sys.argv) != 4: 18 | print "Usage: sql_alchemy_oracle_template.py username password hoststring" 19 | else: 20 | main(*sys.argv[1:]) 21 | --------------------------------------------------------------------------------
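The templates above deliberately stop short of running a query (raw_oracle_template.py ends at `# do stuff!`). The sketch below is illustrative only and is not a file in this repository: it shows one way the cx_Oracle pattern from raw_oracle_template.py might be completed. The table name flu_example and the script name example_query.py are placeholders (fludb.sql defines the actual schema), and the snippet keeps the same Python 2 print style as the templates.

#! /usr/bin/env python

"""Illustrative sketch: completing the '# do stuff!' step of
raw_oracle_template.py. The table name is a placeholder."""

import sys
import cx_Oracle as cx

def main(username, password, hoststring):
    # connect and get a cursor, exactly as in raw_oracle_template.py
    db = cx.connect(username, password, hoststring)
    c = db.cursor()
    # fetch a handful of rows from a placeholder table
    c.execute("SELECT * FROM flu_example WHERE ROWNUM <= 5")
    for row in c:
        print row
    c.close()
    db.close()

if __name__ == "__main__":
    if len(sys.argv) != 4:
        print "Usage: example_query.py username password hoststring"
    else:
        main(*sys.argv[1:])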