├── .gitignore ├── README.md ├── flu_news ├── get_news_data.sh ├── news_clustering │ ├── build.sbt │ ├── project │ │ └── build.properties │ └── src │ │ └── main │ │ └── scala │ │ └── com │ │ └── oracle │ │ └── newscluster │ │ └── NewsClustering.scala ├── news_rss_collector.py └── news_streaming │ ├── flu_news_flume_config │ ├── news_rss_collector.py │ ├── news_streamer │ ├── build.sbt │ ├── project │ │ ├── assembly.sbt │ │ └── build.properties │ └── src │ │ └── main │ │ └── scala │ │ └── com │ │ └── oracle │ │ └── newsstream │ │ └── NewsStreamer.scala │ └── start_flume.sh ├── flu_shots └── flu_shots_to_db.py ├── flu_statistics ├── OIE_Pathogenic_Flu.py ├── country_chop_who.py ├── get_flu_summary_data.sh ├── state_populations.csv ├── us_chop_ilinet.py └── us_chop_who.py ├── notebooks ├── 01 WHO US Simple CSV Loading.ipynb ├── 02 WHO Country-Level Flu Data.ipynb ├── 03 HHS_Flu_Vaccination_Data.ipynb ├── 04 OIE_Pathogenic_Flu.ipynb ├── 05 Collecting Web Data With Pandas.ipynb ├── 06 Does Ethnicity Impact Vaccination Rates?.ipynb ├── 07 Do vaccination rates impact flu rates?.ipynb ├── 08 Does GDP explain flu rates?.ipynb ├── 09 Does living in cities influence flu rates?.ipynb ├── 10 How do sanitation and clean water effect flu outbreaks?.ipynb ├── 11 Basic Big Data with PySpark.ipynb ├── 12 Moving and Clustering Data with Sqoop and Spark.ipynb ├── 13 Finding Important Words With Spark.ipynb ├── 14 Trend Search with Big Data and SQL.ipynb ├── 15 Clustering the News with Spark and MLlib.ipynb ├── 16 Collecting Streaming News with Flume and Spark.ipynb ├── Video Opportunities.ipynb └── Visual Data Inspection.ipynb ├── setup ├── 00-pyspark-setup.py ├── data_science_bootcamp_setup.sh ├── download_data.sh ├── fludb.sql ├── ipython_notebook_config_spark.py └── setup_pyspark_notebook.sh └── templates ├── pandas_oracle_template.py ├── raw_oracle_template.py └── sql_alchemy_oracle_template.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *.*~ 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | notebooks/.ipynb_checkpoints/* 25 | # PyInstaller 26 | # Usually these files are written by a python script from a template 27 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 28 | *.manifest 29 | *.spec 30 | 31 | # Installer logs 32 | pip-log.txt 33 | pip-delete-this-directory.txt 34 | 35 | # Unit test / coverage reports 36 | htmlcov/ 37 | .tox/ 38 | .coverage 39 | .cache 40 | nosetests.xml 41 | coverage.xml 42 | 43 | # Translations 44 | *.mo 45 | *.pot 46 | 47 | # Django stuff: 48 | *.log 49 | 50 | # Sphinx documentation 51 | docs/_build/ 52 | 53 | # PyBuilder 54 | target/ 55 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | odsb2014 2 | ======== 3 | 4 | Oracle Data Science Bootcamp 2014 5 | 6 | A series of workshops to explain both basic and advanced data science concepts using SQL, Python, Matplotlib, and Apache Spark. 
7 | 8 | Getting Started 9 | ---------------- 10 | 11 | * Download the Oracle Big Data Lite VM at http://www.oracle.com/technetwork/database/bigdata-appliance/oracle-bigdatalite-2104726.html 12 | This requires VirtualBox. All username/password pairs for the VM are `oracle/welcome1` unless stated otherwise. 13 | * Clone this git repository: `git clone https://github.com/dwmclary/odsb2014` 14 | * Change into the setup directory: `cd odsb2014/setup` 15 | * Run the setup script: `./data_science_bootcamp_setup.sh` 16 | * Run the data download script: `./download_data.sh` 17 | * Run the pyspark installation script: `./setup_pyspark_notebook.sh` 18 | * Run the database setup script: `sqlplus sys/welcome1 as sysdba @fludb.sql` 19 | * Start the database listener: `lsnrctl start` 20 | * Source `~/.bashrc` or open a new terminal window 21 | * Change to the `odsb2014/notebooks` directory and start ipython: `ipython notebook --profile pyspark` 22 | 23 | Loading Data 24 | ---------------- 25 | The `flu_statistics` and `flu_news` directories contain data download scripts that must be run 26 | in order to complete the workshop. These can be run standalone, or by running `setup/download_data.sh`. 27 | The `flu_shots` directory contains a script for fetching data from the US Dept. of Health and Human Services (HHS), but collection 28 | of this data is included as part of the series of notebooks. 29 | -------------------------------------------------------------------------------- /flu_news/get_news_data.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | mkdir data 4 | cd data 5 | wget https://s3.amazonaws.com/orcl-dsb-fludata/wikinews/wikinews.json 6 | split --lines=10000 wikinews.json wikinews_data 7 | wget http://mattmahoney.net/dc/text8.zip 8 | unzip text8.zip 9 | tr -s '[[:punct:][:space:]]' '\n' < text8 > linewise_text_8 10 | cd ..
11 | wget https://s3.amazonaws.com/orcl-dsb-fludata/wikinews/wikinews_builder.py 12 | -------------------------------------------------------------------------------- /flu_news/news_clustering/build.sbt: -------------------------------------------------------------------------------- 1 | name := "NewsClustering" 2 | version := "0.1" 3 | scalaVersion := "2.10.4" 4 | libraryDependencies += "org.apache.spark" %% "spark-core" % "1.1.0" 5 | libraryDependencies += "org.apache.spark" % "spark-mllib_2.10" % "1.1.0" 6 | libraryDependencies += "org.apache.hadoop" % "hadoop-client" % "2.3.0-cdh5.1.2" 7 | libraryDependencies += "org.json4s" %% "json4s-jackson" % "3.2.11" 8 | libraryDependencies += "org.json4s" %% "json4s-native" % "3.2.11" 9 | resolvers += "Akka Repository" at "http://repo.akka.io/releases/" 10 | resolvers += "Cloudera Repository" at "https://repository.cloudera.com/artifactory/cloudera-repos/" -------------------------------------------------------------------------------- /flu_news/news_clustering/project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=0.13.7 -------------------------------------------------------------------------------- /flu_news/news_clustering/src/main/scala/com/oracle/newscluster/NewsClustering.scala: -------------------------------------------------------------------------------- 1 | package com.oracle.newscluster 2 | 3 | import org.json4s._ 4 | import org.json4s.jackson.Serialization.{read,write} 5 | import org.apache.spark._ 6 | import org.apache.spark.SparkContext._ 7 | import org.apache.spark.rdd.RDD 8 | import org.apache.spark.rdd._ 9 | import org.apache.spark.mllib.clustering.KMeans 10 | import org.apache.spark.mllib.feature.Word2Vec 11 | import org.apache.spark.mllib.feature.Word2VecModel 12 | import org.apache.spark.mllib.linalg._ 13 | 14 | case class NewsArticle(date : String, title : String, byline : String, fulltext : String) 15 | 16 | object NewsClustering { 17 | def sumArray (m: Array[Double], n: Array[Double]): Array[Double] = { 18 | for (i <- 0 until m.length) {m(i) += n(i)} 19 | return m 20 | } 21 | 22 | def divArray (m: Array[Double], divisor: Double) : Array[Double] = { 23 | for (i <- 0 until m.length) {m(i) /= divisor} 24 | return m 25 | } 26 | 27 | def wordToVector (w:String, m: Word2VecModel): Vector = { 28 | try { 29 | return m.transform(w) 30 | } catch { 31 | case e: Exception => return Vectors.zeros(100) 32 | } 33 | } 34 | 35 | def main(args : Array[String]) = { 36 | val sc = new SparkContext(new SparkConf().setAppName("News Clustering")) 37 | 38 | val news_rdd = sc.textFile("hdfs://localhost:8020/user/oracle/flu_news") 39 | 40 | val news_json = news_rdd.map(record => { 41 | implicit val formats = DefaultFormats 42 | read[NewsArticle](record) 43 | }) 44 | 45 | val news_titles = news_json.map(_.title.split(" ").toSeq) 46 | val news_title_words = news_titles.flatMap(x => x).map(x => Seq(x)) 47 | 48 | val w2v_input = sc.textFile("file:///home/oracle/odsb2014/flu_news/data/linewise_text_8").sample(false, 0.25,2).map(x => Seq(x)) 49 | val all_input = w2v_input ++ news_title_words 50 | 51 | val word2vec = new Word2Vec() 52 | val model = word2vec.fit(all_input) 53 | 54 | val title_vectors = news_titles.map(x => new DenseVector(divArray(x.map(m => wordToVector(m, model).toArray).reduceLeft(sumArray),x.length)).asInstanceOf[Vector]) 55 | val title_pairs = news_titles.map(x => (x,new DenseVector(divArray(x.map(m => wordToVector(m, 
model).toArray).reduceLeft(sumArray),x.length)).asInstanceOf[Vector])) 56 | 57 | var numClusters = 100 58 | val numIterations = 25 59 | 60 | var clusters = KMeans.train(title_vectors, numClusters, numIterations) 61 | var wssse = clusters.computeCost(title_vectors) 62 | println("WSSSE for clusters:"+wssse) 63 | 64 | val article_membership = title_pairs.map(x => (clusters.predict(x._2), x._1)) 65 | val cluster_centers = sc.parallelize(clusters.clusterCenters.zipWithIndex.map{ e => (e._2,e._1)}) 66 | val cluster_topics = cluster_centers.mapValues(x => model.findSynonyms(x,5).map(x => x._1)) 67 | 68 | var sample_topic = cluster_topics.take(12) 69 | var sample_members = article_membership.filter(x => x._1 == 6).take(10) 70 | for (i <- 6 until 12) { 71 | println("Topic Group #"+i) 72 | println(sample_topic(i)._2.mkString(",")) 73 | println("-----------------------------") 74 | sample_members = article_membership.filter(x => x._1 == i).take(10) 75 | sample_members.foreach{x => println(x._2.mkString(" "))} 76 | println("-----------------------------") 77 | } 78 | 79 | article_membership.map{x => x._1.toString+","+x._2.mkString(" ")}.saveAsTextFile("/user/oracle/flu_news_categorization") 80 | cluster_topics.map{x => x._1+","+x._2.mkString(" ")}.saveAsTextFile("/user/oracle/flu_news_categories") 81 | 82 | } 83 | 84 | } 85 | 86 | 87 | -------------------------------------------------------------------------------- /flu_news/news_rss_collector.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | 3 | import feedparser 4 | from bs4 import BeautifulSoup 5 | import json 6 | import time 7 | 8 | urls = {"top_news":"http://feeds.reuters.com/reuters/topNews", \ 9 | "health": "http://feeds.reuters.com/reuters/healthNews", \ 10 | "healthcare":"http://feeds.reuters.com/reuters/UShealthcareNews", \ 11 | "science":"http://feeds.reuters.com/reuters/scienceNews"} 12 | 13 | etags = {"top_news": None, "health": None, "healthcare": None, "science": None} 14 | 15 | done = False 16 | 17 | while not done: 18 | for k, v in urls.items(): 19 | if etags[k]: 20 | d = feedparser.parse(v, etag=etags[k]) 21 | else: 22 | d = feedparser.parse(v) 23 | for e in d.entries: 24 | doc = json.dumps({"category":k, "title":e.title.strip(), "summary":BeautifulSoup(e.summary).text.strip()}) 25 | print doc 26 | etags[k] = d.etag 27 | time.sleep(60) 28 | -------------------------------------------------------------------------------- /flu_news/news_streaming/flu_news_flume_config: -------------------------------------------------------------------------------- 1 | # example.conf: A single-node Flume configuration 2 | 3 | # Name the components on this agent 4 | newsAgent.sources = r1 5 | newsAgent.sinks = k1 6 | newsAgent.channels = c1 7 | 8 | # Describe/configure the source 9 | newsAgent.sources.r1.type = exec 10 | newsAgent.sources.r1.command = ./news_rss_collector.py 11 | 12 | # Describe the sink 13 | newsAgent.sinks.k1.type = avro 14 | newsAgent.sinks.k1.channel = c1 15 | newsAgent.sinks.k1.hostname = localhost 16 | newsAgent.sinks.k1.port = 44444 17 | 18 | # Use a channel which buffers events in memory 19 | newsAgent.channels.c1.type = memory 20 | newsAgent.channels.c1.capacity = 1000 21 | newsAgent.channels.c1.transactionCapacity = 100 22 | 23 | # Bind the source and sink to the channel 24 | newsAgent.sources.r1.channels = c1 25 | newsAgent.sinks.k1.channel = c1 -------------------------------------------------------------------------------- 
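The agent's exec source simply reads the collector's stdout, so each printed line must be a self-contained JSON document. A minimal sketch for checking that before wiring the collector into Flume (the `check_events.py` file name is hypothetical, not part of this repository):

```python
#! /usr/bin/env python
# Hypothetical helper: pipe the collector into it and confirm every line parses.
#   ./news_rss_collector.py | python check_events.py
import json
import sys

def is_valid_event(line):
    """Return True if the line has the shape the downstream consumers expect."""
    try:
        doc = json.loads(line)
    except ValueError:
        return False
    return all(k in doc for k in ("category", "title", "summary"))

if __name__ == "__main__":
    for line in sys.stdin:
        line = line.strip()
        if line:
            print is_valid_event(line), line[:60]
```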
/flu_news/news_streaming/news_rss_collector.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | 3 | import feedparser 4 | from bs4 import BeautifulSoup 5 | import json 6 | import time 7 | import sys 8 | 9 | urls = {"top_news":"http://feeds.reuters.com/reuters/topNews", \ 10 | "health": "http://feeds.reuters.com/reuters/healthNews", \ 11 | "healthcare":"http://feeds.reuters.com/reuters/UShealthcareNews", \ 12 | "science":"http://feeds.reuters.com/reuters/scienceNews"} 13 | 14 | etags = {"top_news": None, "health": None, "healthcare": None, "science": None} 15 | 16 | done = False 17 | 18 | while not done: 19 | for k, v in urls.items(): 20 | if etags[k]: 21 | d = feedparser.parse(v, etag=etags[k]) 22 | else: 23 | d = feedparser.parse(v) 24 | for e in d.entries: 25 | doc = json.dumps({"category":k, "title":e.title.strip(), "summary":BeautifulSoup(e.summary).text.strip()}) 26 | print doc 27 | sys.stdout.flush() 28 | etags[k] = d.etag 29 | time.sleep(30) 30 | -------------------------------------------------------------------------------- /flu_news/news_streaming/news_streamer/build.sbt: -------------------------------------------------------------------------------- 1 | name := "NewsStreamer" 2 | version := "0.1" 3 | scalaVersion := "2.10.4" 4 | libraryDependencies ++= Seq( 5 | "org.apache.spark" %% "spark-core" % "1.1.0" % "provided", 6 | "org.apache.spark" %% "spark-streaming" % "1.1.0" % "provided", 7 | "org.apache.spark" % "spark-streaming-flume_2.10" % "1.1.0" , 8 | "org.apache.hadoop" % "hadoop-client" % "2.3.0-cdh5.1.2", 9 | "org.json4s" %% "json4s-jackson" % "3.2.11", 10 | "org.json4s" %% "json4s-native" % "3.2.11").map({dep => 11 | dep.exclude("org.mortbay.jetty", "servlet-api"). 12 | exclude("commons-beanutils", "commons-beanutils-core"). 13 | exclude("commons-collections", "commons-collections"). 14 | exclude("commons-collections", "commons-collections"). 15 | exclude("commons-logging", "commons-logging"). 16 | exclude("com.esotericsoftware.minlog", "minlog"). 17 | exclude("asm", "asm"). 
18 | exclude("org.apache.hadoop", "hadoop-yarn-common") 19 | }) 20 | resolvers += "Akka Repository" at "http://repo.akka.io/releases/" 21 | resolvers += "Cloudera Repository" at "https://repository.cloudera.com/artifactory/cloudera-repos/" 22 | 23 | mainClass in assembly := Some("com.oracle.newsstream.NewsStreamer") 24 | 25 | mergeStrategy in assembly <<= (mergeStrategy in assembly) { (old) => 26 | { 27 | case x if x.startsWith("META-INF/ECLIPSEF.RSA") => MergeStrategy.last 28 | case x if x.startsWith("META-INF/mailcap") => MergeStrategy.last 29 | case x if x.startsWith("META-INF/mimetypes") => MergeStrategy.last 30 | case x if x.startsWith("plugin.properties") => MergeStrategy.last 31 | case x if x.startsWith("javax") => MergeStrategy.first 32 | case x => old(x) 33 | } 34 | } -------------------------------------------------------------------------------- /flu_news/news_streaming/news_streamer/project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.12.0") -------------------------------------------------------------------------------- /flu_news/news_streaming/news_streamer/project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=0.13.7 -------------------------------------------------------------------------------- /flu_news/news_streaming/news_streamer/src/main/scala/com/oracle/newsstream/NewsStreamer.scala: -------------------------------------------------------------------------------- 1 | package com.oracle.newsstream 2 | 3 | import org.json4s._ 4 | import org.json4s.jackson.Serialization.{read,write} 5 | import org.apache.spark._ 6 | import org.apache.spark.rdd.RDD 7 | import org.apache.spark.rdd._ 8 | import org.apache.spark.streaming._ 9 | import org.apache.spark.streaming.StreamingContext._ 10 | import org.apache.spark.streaming.flume._ 11 | import scala.collection.immutable.StringOps 12 | 13 | 14 | case class RSSItem(category : String, title : String, summary : String) 15 | 16 | 17 | object NewsStreamer { 18 | def containsFlu(x : String): Boolean = x match { 19 | case x if x contains " flu " => true 20 | case x if x contains " influenza " => true 21 | case x if x contains " disease " => true 22 | case x if x contains " outbreak " => true 23 | case x if x contains " H1N1 " => true 24 | case x if x contains " H5N1 " => true 25 | case x if x contains " sick " => true 26 | case _ => false 27 | } 28 | 29 | def main(args : Array[String]) = { 30 | val conf = new SparkConf().setAppName("NewsStreamer") 31 | val ssc = new StreamingContext(conf, Seconds(30)) 32 | 33 | val flumeStream = FlumeUtils.createStream(ssc, "localhost", 44444) 34 | val rssData = flumeStream.map(record => { 35 | implicit val formats = DefaultFormats 36 | read[RSSItem](new String(record.event.getBody().array())) 37 | }) 38 | val healthSummaries = rssData.filter(x => containsFlu(x.summary)) 39 | 40 | // print batch summaries to the screen 41 | rssData.count().map(cnt => "rss recv " + cnt + " events").print() 42 | 43 | val hsc = healthSummaries.count() 44 | hsc.map(cnt => "health summaries recv " + cnt + " events").print() 45 | 46 | //write health data out to HDFS 47 | val now: Long = System.currentTimeMillis 48 | healthSummaries.foreachRDD(r => { 49 | if (r.count() > 0) { 50 | r.map(item => { 51 | implicit val formats = DefaultFormats 52 | write(item) 53 | }).saveAsTextFile("/user/oracle/flu_streaming/flu_stream-"+now.toString()) 54 | } 55 | }) 56 | 
ssc.start() 57 | ssc.awaitTermination() 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /flu_news/news_streaming/start_flume.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | flume-ng agent --name newsAgent --conf-file ./flu_news_flume_config --conf /usr/lib/flume-ng/conf -Dflume.root.logger=DEBUG,console 3 | -------------------------------------------------------------------------------- /flu_shots/flu_shots_to_db.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | """ Working version of a parser/db-writer for US HHS flu vaccination JSON data. Will be reformatted as an iPython Notebook as well.""" 3 | 4 | import sys 5 | import cx_Oracle 6 | import pprint 7 | import re 8 | import string 9 | import json 10 | import urllib2 11 | 12 | def jsonify_data(d): 13 | return(json.dumps(d),) 14 | 15 | 16 | 17 | def write_to_db(db, data): 18 | cursor = db.cursor() 19 | try: 20 | cursor.prepare("INSERT INTO flu_shot_json(doc) VALUES (:1)") 21 | cursor.executemany(None, map(jsonify_data, data['results'])) 22 | db.commit() 23 | except Exception as e: 24 | print e 25 | 26 | 27 | def main(): 28 | db = cx_Oracle.connect('fludb', 'flushot', 'localhost:1521/orcl') 29 | drop_table = 'drop table flu_shot_json' 30 | ddl = 'create table flu_shot_json (doc varchar2(4000), CONSTRAINT "ENSURE_JSON" CHECK (doc IS JSON))' 31 | cursor = db.cursor() 32 | try: 33 | cursor.execute(drop_table) 34 | except: 35 | pass 36 | cursor.execute(ddl) 37 | cursor.close() 38 | print "parsing dataset..." 39 | for e in ["T","W","A","B","H"]: 40 | url = "http://flu-vaccination-map.hhs.gov/api/v1/states.json?ethnicity="+e+"&year=lte:2014" 41 | data = json.load(urllib2.urlopen(url)) 42 | print "writing to DB..."
43 | write_to_db(db, data) 44 | 45 | view_ddl = """CREATE OR REPLACE VIEW FLUSHOTS 46 | AS SELECT 47 | CAST(j.doc.count AS NUMBER) eligible, 48 | CAST(j.doc.week AS NUMBER) week, 49 | CAST(j.doc.name AS VARCHAR2(20)) state_name, 50 | CAST(j.doc.short_name AS VARCHAR2(2)) state, 51 | CAST(j.doc.fips_id\tAS NUMBER) fips_id, 52 | CAST(j.doc.disparity as VARCHAR2(20)) disparity, 53 | CAST(j.doc.medicare_status as VARCHAR2(20)) medicare_status, 54 | CAST(j.doc.year as NUMBER) year, 55 | CAST(j.doc.percentage AS NUMBER) percentage_claimed, 56 | CAST(j.doc.ethnicity AS VARCHAR2(20)) ethnicity 57 | FROM flu_shot_json j""" 58 | cursor = db.cursor() 59 | cursor.execute(view_ddl) 60 | cursor.close() 61 | 62 | 63 | if __name__ == "__main__": 64 | main() 65 | 66 | 67 | 68 | -------------------------------------------------------------------------------- /flu_statistics/OIE_Pathogenic_Flu.py: -------------------------------------------------------------------------------- 1 | 2 | import requests 3 | from bs4 import BeautifulSoup 4 | import cx_Oracle 5 | import datetime 6 | 7 | 8 | base_year = 2004 9 | years = 10 10 | base_url = "http://www.oie.int/animal-health-in-the-world/update-on-avian-influenza/" 11 | report = requests.get(base_url+str(base_year)) 12 | 13 | searchable_report = BeautifulSoup(report.text) 14 | flu_table = searchable_report.table 15 | 16 | row_tags = flu_table.find_all("tr")[1:] 17 | 18 | def make_db_row(r): 19 | #newer reports have an extra year column 20 | if len(r) > 4: 21 | r.pop(2) 22 | 23 | r[-1]=r[-1].findChild() 24 | #try to get the report url 25 | url = None 26 | try: 27 | url = r[-1]['href'] 28 | except: 29 | pass 30 | row_text = map(lambda x: x.text.encode('ascii', 'ignore'), r)+[url] 31 | try: 32 | row_text[2] = datetime.datetime.strptime(row_text[2], "%d/%m/%y").date() 33 | except Exception as e: 34 | print r, e 35 | return tuple(row_text) 36 | 37 | data_to_insert = map(lambda x: make_db_row(x.find_all("td")), row_tags) 38 | for i in range(1,years+1): 39 | print base_year+i 40 | report = requests.get(base_url+str(base_year+i)) 41 | searchable_report = BeautifulSoup(report.text) 42 | flu_table = searchable_report.table 43 | row_tags = flu_table.find_all("tr")[1:] 44 | data_to_insert += map(lambda x: make_db_row(x.find_all("td")), row_tags) 45 | create_table = """CREATE TABLE PATHOGENIC_FLU ( 46 | INCIDENT_LOCATION VARCHAR2(100), 47 | INCIDENT_TYPE VARCHAR2(50), 48 | INCIDENT_DATE DATE, 49 | INCIDENT_REPORT VARCHAR2(100), 50 | REPORT_LINK VARCHAR2(200) 51 | ) 52 | """ 53 | db = cx_Oracle.connect("fludb", "flushot", "localhost:1521/orcl") 54 | cursor = db.cursor() 55 | cursor.execute(create_table) 56 | 57 | 58 | cursor.prepare("""INSERT INTO PATHOGENIC_FLU 59 | (INCIDENT_LOCATION,INCIDENT_TYPE,INCIDENT_DATE, INCIDENT_REPORT, REPORT_LINK) 60 | VALUES 61 | (:1, :2, :3, :4, :5) 62 | """) 63 | cursor.executemany(None, data_to_insert) 64 | db.commit() 65 | 66 | 67 | 68 | -------------------------------------------------------------------------------- /flu_statistics/country_chop_who.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | """ Working version of a parser/db-writer for WHO Continent-Level data. 
Will be reformatted as an iPython Notebook as well.""" 3 | 4 | import sys 5 | import cx_Oracle 6 | import pprint 7 | import re 8 | import string 9 | 10 | def write_to_db(db, dataset): 11 | dt = "DROP TABLE %s" % dataset['title'] 12 | cursor = db.cursor() 13 | #try: 14 | # cursor.execute(dt) 15 | #except: 16 | # pass 17 | ctine = """CREATE TABLE %s ( 18 | region VARCHAR(12), 19 | country VARCHAR(500), 20 | year NUMBER, 21 | week NUMBER, 22 | measure NUMBER, 23 | CONSTRAINT %s_id PRIMARY KEY (country, year, week) 24 | )""" % (dataset['title'], dataset['title']) 25 | try: 26 | cursor.execute(ctine) 27 | except Exception as e: 28 | print e 29 | print "failed to create table", dataset['title'] 30 | pass 31 | for country in dataset['data']: 32 | try: 33 | cursor.prepare("INSERT INTO %s (region, country, year, week, measure) VALUES (:1, :2, :3, :4, :5)" % dataset['title']) 34 | cursor.executemany(None, country) 35 | db.commit() 36 | except Exception as e: 37 | print e 38 | print country[0] 39 | 40 | 41 | def make_tablename(d): 42 | title_string = d.split("|")[0] 43 | if "specimen" in title_string: 44 | title = title_string.split("->")[-1].strip()+"_Specimens" 45 | if "processed" in title_string: 46 | title += "_proc" 47 | else: 48 | title += "_recv" 49 | else: 50 | title = title_string.split("->")[-1].strip()+"_Infections" 51 | title = re.sub("\(", "", title) 52 | title = re.sub("\)", "", title) 53 | title = re.sub(" ","_", title) 54 | return title[:25] 55 | 56 | def makerows(region_code, data): 57 | dateprefix = data[0] 58 | print dateprefix 59 | dates = map(lambda x: x.split(), dateprefix.strip().split("|")[1:]) 60 | print dates 61 | dates = map(lambda x: [x[0], x[-1]], dates) 62 | print dates 63 | data = map(lambda x: x.strip().split("|"), data[1:]) 64 | # what we're really doing here is pivoting the data so that we can have country, year, week, value 65 | # for each row of raw data, we want a list of tuple (country, year, week, value) 66 | def row_to_tuple(dates, r): 67 | t = [] 68 | for i in range(1,len(r)): 69 | t.append((region_code, r[0], int(dates[i-1][0]), int(dates[i-1][1]), int(re.sub(",","",r[i])))) 70 | return t 71 | data = map(lambda x: row_to_tuple(dates, x), data) 72 | return data 73 | 74 | def parseWHOCountryFile(filename): 75 | region = re.sub("WHO+", "",filename) 76 | region = re.sub(".psv", "", region) 77 | region = re.sub("\+","", region) 78 | region = region.split("/")[-1] 79 | raw = open(filename).readlines() 80 | big_splits = [] 81 | for i in range(len(raw)): 82 | if len(raw[i].split("|")) == 2: 83 | big_splits.append(i) 84 | datasets = [] 85 | #pp = pprint.PrettyPrinter(indent=4) 86 | for i in range(0,len(big_splits),2): 87 | ds = {} 88 | ds['title'] = raw[big_splits[i]] 89 | ds['title'] = make_tablename(ds['title']) 90 | ds['period'] = raw[big_splits[i+1]] 91 | if i < len(big_splits)-2: 92 | ds['starts'] = big_splits[i+1]+1 93 | ds['ends'] = big_splits[i+2]-1 94 | else: 95 | ds['starts'] = big_splits[i+1]+1 96 | ds['ends'] = len(raw)-1 97 | ds['data'] = raw[ds['starts']:ds['ends']] 98 | ds['data'] = makerows(region, ds['data']) 99 | datasets.append(ds) 100 | #pp.pprint(ds) 101 | return datasets 102 | 103 | def build_view(datasets): 104 | view = """CREATE OR REPLACE VIEW flu_statistics AS 105 | SELECT a.region, a.country, a.year, a.week, \n""" 106 | from_clause = " from \n" 107 | where_clause = " where \n" 108 | column_creation = "" 109 | table_creation = "" 110 | join_creation = "" 111 | for i in range(len(datasets)): 112 | column_creation += "{0}.measure as 
{1}".format(string.ascii_lowercase[i], datasets[i]['title']) 113 | 114 | table_creation += "{0} {1}".format(datasets[i]['title'], string.ascii_lowercase[i]) 115 | 116 | if (i < len(datasets)-1): 117 | column_creation += ",\n" 118 | table_creation += ",\n" 119 | join_creation += "{1}.country = {0}.country and {1}.year = {0}.year and {1}.week = {0}.week \n".format(string.ascii_lowercase[i], string.ascii_lowercase[i+1]) 120 | if (i < len(datasets)-2): 121 | join_creation += "and\n" 122 | else: 123 | column_creation += "\n" 124 | table_creation += "\n" 125 | 126 | 127 | view += column_creation + from_clause + table_creation + where_clause + join_creation 128 | return view 129 | 130 | def main(filename): 131 | db = cx_Oracle.connect('fludb', 'flushot', 'localhost:1521/orcl') 132 | print "parsing datasets..." 133 | datasets = parseWHOCountryFile(filename) 134 | print "writing to DB..." 135 | for dataset in datasets: 136 | write_to_db(db, dataset) 137 | print "creating view..." 138 | c = db.cursor() 139 | c.execute(build_view(datasets)) 140 | 141 | if __name__ == "__main__": 142 | main(sys.argv[1]) 143 | 144 | 145 | 146 | -------------------------------------------------------------------------------- /flu_statistics/get_flu_summary_data.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | mkdir data 4 | cd data 5 | wget https://s3.amazonaws.com/orcl-dsb-fludata/WHO-data/WHO+AFRO.psv 6 | wget https://s3.amazonaws.com/orcl-dsb-fludata/WHO-data/WHO+EURO.psv 7 | wget https://s3.amazonaws.com/orcl-dsb-fludata/WHO-data/WHO+PAHO.psv 8 | wget https://s3.amazonaws.com/orcl-dsb-fludata/WHO-data/WHO+WEST+ASIA.psv 9 | wget https://s3.amazonaws.com/orcl-dsb-fludata/WHO-data/WHO+US+comprehensive.psv 10 | wget https://s3.amazonaws.com/orcl-dsb-fludata/FluNet/WHO_NREVSS.csv 11 | wget https://s3.amazonaws.com/orcl-dsb-fludata/FluNet/ILINet.csv 12 | cd .. 
-------------------------------------------------------------------------------- /flu_statistics/state_populations.csv: -------------------------------------------------------------------------------- 1 | state_name,state,region,population 2 | Alabama,AL,East South Central,4779736 3 | Alaska,AK,Pacific,710231 4 | Arizona,AZ,Mountain,6392017 5 | Arkansas,AR,West South Central,2915918 6 | California,CA,Pacific,37253956 7 | Colorado,CO,Mountain,5029196 8 | Connecticut,CT,New England,3574097 9 | Delaware,DE,South Atlantic,897934 10 | Florida,FL,South Atlantic,18801310 11 | Georgia,GA,South Atlantic,9687653 12 | Hawaii,HI,Pacific,1360301 13 | Idaho,ID,Mountain,1567582 14 | Illinois,IL,East North Central,12830632 15 | Indiana,IN,East North Central,6483802 16 | Iowa,IA,West North Central,3046355 17 | Kansas,KS,West North Central,2853118 18 | Kentucky,KY,East South Central,4339367 19 | Louisiana,LA,West South Central,4533372 20 | Maine,ME,New England,1328361 21 | Maryland,MD,South Atlantic,5773552 22 | Massachusetts,MA,New England,6547629 23 | Michigan,MI,East North Central,9883640 24 | Minnesota,MN,East North Central,5303925 25 | Mississippi,MS,East South Central,2967297 26 | Missouri,MO,West North Central,5988927 27 | Montana,MT,Mountain,989415 28 | Nebraska,NE,West North Central,1826341 29 | Nevada,NV,Mountain,2700551 30 | New Hampshire,NH,New England,1316470 31 | New Jersey,NJ,Mid-Atlantic,8791894 32 | New Mexico,NM,Mountain,2059179 33 | New York,NY,Mid-Atlantic,19378102 34 | North Carolina,NC,South Atlantic,9535483 35 | North Dakota,ND,West North Central,672591 36 | Ohio,OH,East North Central,11536504 37 | Oklahoma,OK,West South Central,3751351 38 | Oregon,OR,Pacific,3831074 39 | Pennsylvania,PA,Mid-Atlantic,12702379 40 | Rhode Island,RI,New England,1052567 41 | South Carolina,SC,South Atlantic,4625364 42 | South Dakota,SD,West North Central,814180 43 | Tennessee,TN,East South Central,6346105 44 | Texas,TX,West South Central,25145561 45 | Utah,UT,Mountain,2763885 46 | Vermont,VT,New England,625741 47 | Virginia,VA,South Atlantic,8001024 48 | Washington,WA,Pacific,6724540 49 | West Virginia,WV,South Atlantic,1852994 50 | Wisconsin,WI,East North Central,5686986 51 | Wyoming,WY,Mountain,563626 -------------------------------------------------------------------------------- /flu_statistics/us_chop_ilinet.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | """ Working version of a parser/db-writer for WHO Continent-Level data. 
Will be reformatted as an iPython Notebook as well.""" 3 | 4 | import sys 5 | import cx_Oracle 6 | import pprint 7 | import re 8 | import string 9 | 10 | def parseILIFile(filename): 11 | raw = map(lambda x: x.strip().split(","),open(filename).readlines()) 12 | data = raw[1:] 13 | return data 14 | 15 | def columns_to_type(row): 16 | try: 17 | for i in range(len(row)): 18 | if (row[i] == "X"): 19 | row[i] = None 20 | elif (i > 1 and i != 7 and i != 8): 21 | row[i] = int(row[i]) 22 | elif (i == 7 or i==8): 23 | row[i] = float(row[i]) 24 | return tuple(row[1:]) 25 | except: 26 | return tuple() 27 | 28 | def write_to_db(db, data): 29 | create_table = """CREATE TABLE US_FLU_DEMOGRAPHICS ( 30 | REGION VARCHAR2(50), 31 | YEAR NUMBER(10,0), 32 | WEEK NUMBER(10,0), 33 | TOTAL_SICK NUMBER(10,0), 34 | TOTAL_PATIENTS NUMBER(10,0), 35 | TOTAL_PROVIDERS NUMBER(10,0), 36 | WEIGHTED_SICK NUMBER, 37 | UNWEIGHTED_SICK NUMBER, 38 | AGE_0_4 NUMBER(10,0), 39 | AGE_5_24 NUMBER(10,0), 40 | AGE_25_64 NUMBER(10,0), 41 | AGE_25_49 NUMBER(10,0), 42 | AGE_50_64 NUMBER(10,0), 43 | AGE_65_PLUS NUMBER(10,0))""" 44 | rows_to_insert = filter(lambda x: len(x)> 0, map(columns_to_type, data)) 45 | cursor = db.cursor() 46 | try: 47 | cursor.execute("drop table us_flu_demographics") 48 | except: 49 | pass 50 | cursor.execute(create_table) 51 | try: 52 | cursor.prepare("""INSERT INTO US_FLU_DEMOGRAPHICS ( 53 | REGION, YEAR, WEEK, TOTAL_SICK, 54 | TOTAL_PATIENTS, TOTAL_PROVIDERS, WEIGHTED_SICK, UNWEIGHTED_SICK, 55 | AGE_0_4, AGE_5_24, AGE_25_64, AGE_25_49, AGE_50_64, AGE_65_PLUS) 56 | VALUES 57 | (:1, :2, :3, :4, :5, :6, :7, :8, :9, :10, :11, :12, :13, :14)""") 58 | cursor.executemany(None, rows_to_insert) 59 | db.commit() 60 | except Exception as e: 61 | print e 62 | 63 | 64 | def main(filename): 65 | db = cx_Oracle.connect('fludb', 'flushot', 'localhost:1521/orcl') 66 | print "parsing datasets..." 67 | datasets = parseILIFile(filename) 68 | print "writing to DB..." 69 | write_to_db(db, datasets) 70 | 71 | if __name__ == "__main__": 72 | main(sys.argv[1]) 73 | -------------------------------------------------------------------------------- /flu_statistics/us_chop_who.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | """ Working version of a parser/db-writer for WHO Continent-Level data. 
Will be reformatted as an iPython Notebook as well.""" 3 | 4 | import sys 5 | import cx_Oracle 6 | import pprint 7 | import re 8 | import string 9 | 10 | def parseWHOFile(filename): 11 | raw = map(lambda x: x.strip().split(","),open(filename).readlines()) 12 | data = raw[1:] 13 | return data 14 | 15 | def columns_to_type(row): 16 | try: 17 | for i in range(len(row)): 18 | if (i > 1 and i != 5): 19 | row[i] = int(row[i]) 20 | if (i == 5): 21 | row[i] = float(row[i]) 22 | return tuple(row[1:]) 23 | except: 24 | return tuple() 25 | 26 | def write_to_db(db, data): 27 | create_table = """CREATE TABLE US_WHO_FLU_STATS ( 28 | REGION VARCHAR2(50), 29 | YEAR NUMBER(10,0), 30 | WEEK NUMBER(10,0), 31 | TOTAL_SPECIMENS NUMBER(10,0), 32 | PERCENT_POSITIVE NUMBER, 33 | A_H1 NUMBER(10,0), 34 | A_NO_SUBTYPE NUMBER(10,0), 35 | A_H3 NUMBER(10,0), 36 | H1N1 NUMBER(10,0), 37 | A_TOTAL NUMBER(10,0), 38 | B NUMBER(10,0), 39 | H3N2v NUMBER(10,0))""" 40 | rows_to_insert = filter(lambda x: len(x)> 0, map(columns_to_type, data)) 41 | cursor = db.cursor() 42 | cursor.execute("drop table us_who_flu_stats") 43 | cursor.execute(create_table) 44 | try: 45 | cursor.prepare("""INSERT INTO US_WHO_FLU_STATS ( 46 | REGION, YEAR, WEEK, TOTAL_SPECIMENS, 47 | PERCENT_POSITIVE, A_H1, A_NO_SUBTYPE, 48 | A_H3, H1N1, A_TOTAL, B, H3N2v) VALUES 49 | (:1, :2, :3, :4, :5, :6, :7, :8, :9, :10, :11, :12)""") 50 | cursor.executemany(None, rows_to_insert) 51 | db.commit() 52 | except Exception as e: 53 | print e 54 | 55 | 56 | def main(filename): 57 | db = cx_Oracle.connect('fludb', 'flushot', 'localhost:1521/orcl') 58 | print "parsing datasets..." 59 | datasets = parseWHOFile(filename) 60 | print "writing to DB..." 61 | write_to_db(db, datasets) 62 | 63 | if __name__ == "__main__": 64 | main(sys.argv[1]) 65 | -------------------------------------------------------------------------------- /notebooks/01 WHO US Simple CSV Loading.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "", 4 | "signature": "sha256:d48ee330d31bd89a1d9a6618b4ec0ae255df9f196a5935716a4aa2a2e15b8dd8" 5 | }, 6 | "nbformat": 3, 7 | "nbformat_minor": 0, 8 | "worksheets": [ 9 | { 10 | "cells": [ 11 | { 12 | "cell_type": "heading", 13 | "level": 1, 14 | "metadata": {}, 15 | "source": [ 16 | "Loading and Sharing Simple CSV Data" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "Sometimes we get lucky with the datasets we find: they're formatted just right, and in a format everyone can read. The most common of these would be a CSV or delimited-text file. All the columns are present on each row, delimited by the same character: all we need to do is parse and load the file. The [Centers for Disease Control](http://www.cdc.gov/flu/weekly/fluviewinteractive.htm) provides a way to download just this kind of read-to-consume data about flu rates in the US.\n", 24 | "\n", 25 | "There are lots of tools to do this with most databases. Oracle provides `SQL*Loader` as well as external table capabilities for this. MySQL can use the `LOAD DATA INFILE` directive to quickly load CSV data. However, we're going to use Python to illustrate how we can quickly connect this analysis-ready scripting languge to our database.\n", 26 | "\n", 27 | "We'll focus on cx-Oracle, a python module designed to connect to Oracle database." 
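Before attempting the full load, it can help to confirm that the driver and the database listener are reachable. A minimal sketch, using the same `fludb/flushot` account the rest of the workshop assumes (adjust the DSN if your VM differs):

```python
import cx_Oracle

db = cx_Oracle.connect('fludb', 'flushot', 'localhost:1521/orcl')
cursor = db.cursor()
cursor.execute("SELECT sysdate FROM dual")   # simplest possible round trip
print cursor.fetchone()
cursor.close()
db.close()
```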
28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "collapsed": false, 33 | "input": [ 34 | "import sys\n", 35 | "import cx_Oracle\n", 36 | "import pprint\n", 37 | "import re\n", 38 | "import string" 39 | ], 40 | "language": "python", 41 | "metadata": {}, 42 | "outputs": [], 43 | "prompt_number": 1 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "Since the data is delimited by commas, parsing out the data we want is simple. We'll put it in a function for later use." 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "collapsed": false, 55 | "input": [ 56 | "def parseWHOFile(filename):\n", 57 | " raw = map(lambda x: x.strip().split(\",\"),open(filename).readlines())\n", 58 | " data = raw[1:]\n", 59 | " return data" 60 | ], 61 | "language": "python", 62 | "metadata": {}, 63 | "outputs": [], 64 | "prompt_number": 2 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "metadata": {}, 69 | "source": [ 70 | "Our data isn't all strings or numbers, so we'll have to write a quick function to type-convert the rows. We'll write the function to handle a single row, then rely on python's map operator to convert the whole dataset at once. " 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "collapsed": false, 76 | "input": [ 77 | "def columns_to_type(row):\n", 78 | " try:\n", 79 | " for i in range(len(row)):\n", 80 | " if (i > 1 and i != 5):\n", 81 | " row[i] = int(row[i])\n", 82 | " if (i == 5):\n", 83 | " row[i] = float(row[i])\n", 84 | " return tuple(row[1:])\n", 85 | " except:\n", 86 | " return tuple()" 87 | ], 88 | "language": "python", 89 | "metadata": {}, 90 | "outputs": [], 91 | "prompt_number": 3 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "metadata": {}, 96 | "source": [ 97 | "*write_to_db* is the most interesting part of our work. We're going to need to create a table in Oracle 12c and then fill it with rows. We do this by passing SQL language statements through cx-Oracle to the database. Notice we create a **cursor** in the method. A cursor is the structure which allows us to traverse over records in a database and execute commands. 
Any time we use cx-Oracle, we'll create a cursor.\n", 98 | "\n", 99 | "Once we have a cursor, we execute a few statements\n", 100 | "\n", 101 | "* We drop US_WHO_FLU_STATS to make sure there's no stale data\n", 102 | "* We create the US_WHO_FLU_STATS table, whcih tracks statistics for census regions of the US\n", 103 | "* We insert our dataset into the table using the `executemany` statement\n", 104 | "* We save our work to the database by calling `db.commit`" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "collapsed": false, 110 | "input": [ 111 | "def write_to_db(db, data):\n", 112 | " create_table = \"\"\"CREATE TABLE US_WHO_FLU_STATS (\n", 113 | " REGION VARCHAR2(50),\n", 114 | " YEAR NUMBER(10,0),\n", 115 | " WEEK NUMBER(10,0),\n", 116 | " TOTAL_SPECIMENS NUMBER(10,0),\n", 117 | " PERCENT_POSITIVE NUMBER,\n", 118 | " A_H1 NUMBER(10,0),\n", 119 | " A_NO_SUBTYPE NUMBER(10,0),\n", 120 | " A_H3 NUMBER(10,0),\n", 121 | " H1N1 NUMBER(10,0),\n", 122 | " A_TOTAL NUMBER(10,0),\n", 123 | " B NUMBER(10,0),\n", 124 | " H3N2v NUMBER(10,0))\"\"\"\n", 125 | " rows_to_insert = filter(lambda x: len(x)> 0, map(columns_to_type, data))\n", 126 | " cursor = db.cursor()\n", 127 | " try:\n", 128 | " cursor.execute(\"drop table us_who_flu_stats\")\n", 129 | " except Exception:\n", 130 | " pass\n", 131 | " cursor.execute(create_table)\n", 132 | " try:\n", 133 | " cursor.prepare(\"\"\"INSERT INTO US_WHO_FLU_STATS (\n", 134 | " REGION, YEAR, WEEK, TOTAL_SPECIMENS,\n", 135 | " PERCENT_POSITIVE, A_H1, A_NO_SUBTYPE,\n", 136 | " A_H3, H1N1, A_TOTAL, B, H3N2v) VALUES \n", 137 | " (:1, :2, :3, :4, :5, :6, :7, :8, :9, :10, :11, :12)\"\"\")\n", 138 | " cursor.executemany(None, rows_to_insert)\n", 139 | " db.commit()\n", 140 | " except Exception as e:\n", 141 | " print e\n" 142 | ], 143 | "language": "python", 144 | "metadata": {}, 145 | "outputs": [], 146 | "prompt_number": 4 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "metadata": {}, 151 | "source": [ 152 | "With all of our methods complete, parsing the data is as simple as parsing the file and passing the results to our writer function." 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "collapsed": false, 158 | "input": [ 159 | "db = cx_Oracle.connect('fludb', 'flushot', 'localhost:1521/orcl')" 160 | ], 161 | "language": "python", 162 | "metadata": {}, 163 | "outputs": [], 164 | "prompt_number": 2 165 | }, 166 | { 167 | "cell_type": "code", 168 | "collapsed": false, 169 | "input": [ 170 | "print \"parsing datasets...\"\n", 171 | "datasets = parseWHOFile('../flu_statistics/data/WHO_NREVSS.csv')\n", 172 | "print \"writing to DB...\"\n", 173 | "write_to_db(db, datasets)\n" 174 | ], 175 | "language": "python", 176 | "metadata": {}, 177 | "outputs": [] 178 | }, 179 | { 180 | "cell_type": "markdown", 181 | "metadata": {}, 182 | "source": [ 183 | "We have another CSV file which might help us. It contains mappings of states to their populations and flu surveillance regions. Let's load it in a similar fashion." 
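A bare `split(",")` works for these files because none of the fields contain embedded commas; if that ever changes, the standard `csv` module is a drop-in alternative. A sketch of the same load with `csv.reader`:

```python
import csv

with open("../flu_statistics/state_populations.csv") as f:
    reader = csv.reader(f)           # handles quoted fields and embedded commas
    header = next(reader)
    state_data = [tuple(row) for row in reader]
print header
print state_data[:3]
```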
184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "collapsed": false, 189 | "input": [ 190 | "state_mappings = map(lambda x: x.strip().split(\",\"), open(\"../flu_statistics/state_populations.csv\").readlines())\n", 191 | "header = state_mappings[0]\n", 192 | "state_data = map(lambda x: tuple(x), state_mappings[1:])\n", 193 | "state_data[:10]" 194 | ], 195 | "language": "python", 196 | "metadata": {}, 197 | "outputs": [ 198 | { 199 | "metadata": {}, 200 | "output_type": "pyout", 201 | "prompt_number": 3, 202 | "text": [ 203 | "[('Alabama', 'AL', 'East South Central', '4779736'),\n", 204 | " ('Alaska', 'AK', 'Pacific', '710231'),\n", 205 | " ('Arizona', 'AZ', 'Mountain', '6392017'),\n", 206 | " ('Arkansas', 'AR', 'West South Central', '2915918'),\n", 207 | " ('California', 'CA', 'Pacific', '37253956'),\n", 208 | " ('Colorado', 'CO', 'Mountain', '5029196'),\n", 209 | " ('Connecticut', 'CT', 'New England', '3574097'),\n", 210 | " ('Delaware', 'DE', 'South Atlantic', '897934'),\n", 211 | " ('Florida', 'FL', 'South Atlantic', '18801310'),\n", 212 | " ('Georgia', 'GA', 'South Atlantic', '9687653')]" 213 | ] 214 | } 215 | ], 216 | "prompt_number": 3 217 | }, 218 | { 219 | "cell_type": "code", 220 | "collapsed": false, 221 | "input": [ 222 | "create_state_table = \"\"\"CREATE TABLE state_stats(\n", 223 | " state_name varchar2(26),\n", 224 | " state varchar2(2),\n", 225 | " region_name varchar2(26),\n", 226 | " population number,\n", 227 | " primary key (state)\n", 228 | " )\"\"\"\n", 229 | "cursor = db.cursor()\n", 230 | "cursor.execute(create_state_table)" 231 | ], 232 | "language": "python", 233 | "metadata": {}, 234 | "outputs": [ 235 | { 236 | "ename": "DatabaseError", 237 | "evalue": "ORA-00955: name is already used by an existing object\n", 238 | "output_type": "pyerr", 239 | "traceback": [ 240 | "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[1;31mDatabaseError\u001b[0m Traceback (most recent call last)", 241 | "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 7\u001b[0m )\"\"\"\n\u001b[0;32m 8\u001b[0m \u001b[0mcursor\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdb\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcursor\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 9\u001b[1;33m \u001b[0mcursor\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexecute\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcreate_state_table\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", 242 | "\u001b[1;31mDatabaseError\u001b[0m: ORA-00955: name is already used by an existing object\n" 243 | ] 244 | } 245 | ], 246 | "prompt_number": 4 247 | }, 248 | { 249 | "cell_type": "code", 250 | "collapsed": false, 251 | "input": [ 252 | "cursor.prepare(\"INSERT INTO state_stats (state_name, state, region_name, population) values (:1, :2, :3, :4)\")\n", 253 | "cursor.executemany(None, state_data)\n" 254 | ], 255 | "language": "python", 256 | "metadata": {}, 257 | "outputs": [], 258 | "prompt_number": 16 259 | }, 260 | { 261 | "cell_type": "code", 262 | "collapsed": false, 263 | "input": [ 264 | "db.commit()" 265 | ], 266 | "language": "python", 267 | "metadata": {}, 268 | "outputs": [], 269 | "prompt_number": 17 270 | } 271 | ], 272 | "metadata": {} 273 | } 274 | ] 275 | } -------------------------------------------------------------------------------- /notebooks/02 WHO Country-Level Flu Data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 
| "metadata": { 3 | "name": "", 4 | "signature": "sha256:7bca72e089995043bc31f094a29934ac96b1a43fa92cca8e9caa02736dbb9c7a" 5 | }, 6 | "nbformat": 3, 7 | "nbformat_minor": 0, 8 | "worksheets": [ 9 | { 10 | "cells": [ 11 | { 12 | "cell_type": "heading", 13 | "level": 1, 14 | "metadata": {}, 15 | "source": [ 16 | "Loading Simple Delimited Data" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "There are lots of tools to load delimited data in to a database for sharing. But very often, the delimited data we're given doesn't fit the form of the tables we'd like to make. In our case, the [WHO](http://www.who.int/influenza/gisrs_laboratory/flunet/en/) makes available country-level data for influenza surveillance: what strain, how many samples, and so on for every week of the year. The data is free to download, but just a raw dump of delimited strings.\n", 24 | "\n", 25 | "In this exercise, we'll use basic python and the cx-Oracle to clean up the data and form it into useful tables. To begin, let's import cx-Oracle, which we'll use to communicate with our database, and a few standard python libraries." 26 | ] 27 | }, 28 | { 29 | "cell_type": "heading", 30 | "level": 2, 31 | "metadata": {}, 32 | "source": [ 33 | "Munging and Pivoting Data" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "collapsed": false, 39 | "input": [ 40 | "import sys\n", 41 | "import cx_Oracle\n", 42 | "import re\n", 43 | "import string\n", 44 | "from glob import glob" 45 | ], 46 | "language": "python", 47 | "metadata": {}, 48 | "outputs": [], 49 | "prompt_number": 10 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "First thing's first: we need a method that will parse one of the `.psv` files which our WHO data comes in. Take a minute to look at `WHO+EURO.psv`. How many potential tables do you see in the data?\n", 56 | "\n", 57 | "Unfortunately, lines in the files aren't all the same length. Some rows have 1 delimiter, some have more than 50. In our method, we've decided that the split between tables occurs where the rows have 2 \"columns.\" Can you see why?\n", 58 | "\n", 59 | "We're going to break our file up into logical chunks, and each chunk will become a table. For each of these chunks, we'll need to get the name of the table and the data for the table. We'll also extract the region of the world this data belongs to from the filename.\n", 60 | "\n", 61 | "Finding the title and the dataset inside each chunk is fairly simple, but we'll need to do more to create well-formed data we can share with a database. For this, we'll define a few functions: one for creating the table name, the other for formatting rows. Then, for each logical chunk, we'll add it to a list of datasets to be inserted into the database." 
62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "collapsed": false, 67 | "input": [ 68 | "def parseWHOCountryFile(filename):\n", 69 | " region = re.sub(\"WHO+\", \"\",filename)\n", 70 | " region = re.sub(\".psv\", \"\", region)\n", 71 | " region = re.sub(\"\\+\",\"\", region)\n", 72 | " region = region.split(\"/\")[-1]\n", 73 | " raw = open(filename).readlines()\n", 74 | " big_splits = []\n", 75 | " for i in range(len(raw)):\n", 76 | " if len(raw[i].split(\"|\")) == 2:\n", 77 | " big_splits.append(i)\n", 78 | " datasets = []\n", 79 | " for i in range(0,len(big_splits),2):\n", 80 | " ds = {}\n", 81 | " ds['title'] = raw[big_splits[i]]\n", 82 | " ds['title'] = make_tablename(ds['title'])\n", 83 | " ds['period'] = raw[big_splits[i+1]]\n", 84 | " if i < len(big_splits)-2:\n", 85 | " ds['starts'] = big_splits[i+1]+1\n", 86 | " ds['ends'] = big_splits[i+2]-1\n", 87 | " else:\n", 88 | " ds['starts'] = big_splits[i+1]+1\n", 89 | " ds['ends'] = len(raw)-1\n", 90 | " ds['data'] = raw[ds['starts']:ds['ends']]\n", 91 | " ds['data'] = makerows(region, ds['data'])\n", 92 | " datasets.append(ds)\n", 93 | " return datasets" 94 | ], 95 | "language": "python", 96 | "metadata": {}, 97 | "outputs": [], 98 | "prompt_number": 5 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": {}, 103 | "source": [ 104 | "Making the name for the table is pretty easy. We know we can't have special characters like `(` or `)` in our table name. We also can't have spaces. So, with a little chopping up of the string and a few calls to the re module, we've got a clean table name." 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "collapsed": false, 110 | "input": [ 111 | "def make_tablename(d):\n", 112 | " title_string = d.split(\"|\")[0]\n", 113 | " if \"specimen\" in title_string:\n", 114 | " title = title_string.split(\"->\")[-1].strip()+\"_Specimens\"\n", 115 | " if \"processed\" in title_string:\n", 116 | " title += \"_proc\"\n", 117 | " else:\n", 118 | " title += \"_recv\"\n", 119 | " else:\n", 120 | " title = title_string.split(\"->\")[-1].strip()+\"_Infections\"\n", 121 | " title = re.sub(\"\\(\", \"\", title)\n", 122 | " title = re.sub(\"\\)\", \"\", title)\n", 123 | " title = re.sub(\" \",\"_\", title)\n", 124 | " return title[:25]" 125 | ], 126 | "language": "python", 127 | "metadata": {}, 128 | "outputs": [], 129 | "prompt_number": 3 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "metadata": {}, 134 | "source": [ 135 | "Separating the lines of data into rows and columns for our database insert is easy: everything is delimited by the `|` character. However, we'd like to have a table of tuples like this (region, country, year, week, measurement) and instead we've got all the measures for every week of the year on a single line. Fortunately, this sort of *en masse* string manipulation is easy with python's map function. By defining an inner function and using the map operator, we can quickly pivot all the weeks into the 4-tuple for our database table." 
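A toy version of that pivot, with made-up numbers, shows how one wide row becomes one narrow (region, country, year, week, value) tuple per week:

```python
dates = [["2013", "1"], ["2013", "2"]]   # (year, week) pairs taken from the header row
row = ["Austria", "5", "9"]              # country followed by one value per week
pivoted = [("EURO", row[0], int(dates[i - 1][0]), int(dates[i - 1][1]), int(row[i]))
           for i in range(1, len(row))]
print pivoted
# [('EURO', 'Austria', 2013, 1, 5), ('EURO', 'Austria', 2013, 2, 9)]
```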
136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "collapsed": false, 141 | "input": [ 142 | "def makerows(region_code, data):\n", 143 | " dateprefix = data[0]\n", 144 | " dates = map(lambda x: x.split(), dateprefix.strip().split(\"|\")[1:])\n", 145 | " dates = map(lambda x: [x[0], x[-1]], dates)\n", 146 | " data = map(lambda x: x.strip().split(\"|\"), data[1:])\n", 147 | " # what we're really doing here is pivoting the data so that we can have country, year, week, value\n", 148 | " # for each row of raw data, we want a list of tuple (country, year, week, value)\n", 149 | " def row_to_tuple(dates, r):\n", 150 | " t = []\n", 151 | " for i in range(1,len(r)):\n", 152 | " t.append((region_code, r[0], int(dates[i-1][0]), int(dates[i-1][1]), int(re.sub(\",\",\"\",r[i]))))\n", 153 | " return t\n", 154 | " data = map(lambda x: row_to_tuple(dates, x), data)\n", 155 | " return data" 156 | ], 157 | "language": "python", 158 | "metadata": {}, 159 | "outputs": [], 160 | "prompt_number": 4 161 | }, 162 | { 163 | "cell_type": "heading", 164 | "level": 2, 165 | "metadata": {}, 166 | "source": [ 167 | "Writing Tables and Views" 168 | ] 169 | }, 170 | { 171 | "cell_type": "markdown", 172 | "metadata": {}, 173 | "source": [ 174 | "We now have a list of datasets, each with a table name and a set of 4-tuples. In order to write this to the database, we'll need a method which does bulk inserts into Oracle database. Our *write_to_db* function does just that. Notice that because we don't know the name of the table, we use string substitution to automatically create a table for each of the logical chunks in a `psv` file." 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "collapsed": false, 180 | "input": [ 181 | "def write_to_db(db, dataset):\n", 182 | " dt = \"DROP TABLE %s\" % dataset['title']\n", 183 | " cursor = db.cursor()\n", 184 | " ctine = \"\"\"CREATE TABLE %s (\n", 185 | "\t region VARCHAR(12),\n", 186 | "\t country VARCHAR(500),\n", 187 | "\t year NUMBER,\n", 188 | "\t week NUMBER,\n", 189 | "\t measure NUMBER,\n", 190 | " CONSTRAINT %s_id PRIMARY KEY (country, year, week)\n", 191 | " )\"\"\" % (dataset['title'], dataset['title'])\n", 192 | " try:\n", 193 | " cursor.execute(ctine)\n", 194 | " except Exception as e:\n", 195 | " print e\n", 196 | " print \"failed to create table\", dataset['title']\n", 197 | " pass\n", 198 | " for country in dataset['data']:\n", 199 | " try:\n", 200 | " cursor.prepare(\"INSERT INTO %s (region, country, year, week, measure) VALUES (:1, :2, :3, :4, :5)\" % dataset['title'])\n", 201 | " cursor.executemany(None, country)\n", 202 | " db.commit()\n", 203 | " except Exception as e:\n", 204 | " print e\n", 205 | " print country[0]" 206 | ], 207 | "language": "python", 208 | "metadata": {}, 209 | "outputs": [], 210 | "prompt_number": 2 211 | }, 212 | { 213 | "cell_type": "markdown", 214 | "metadata": {}, 215 | "source": [ 216 | "While the *write_to_db* method will rapidly insert a dataset into the database, the tables it creates only show us one measure at a time. When we think about the flu, we would like to look at measures for different strains side-by-side. More importantly, when we expose that data to external tools, we'd like to present a single dataset. Fortunately, database *views* make it easy to \"publish\" a particular query for others to quickly access.\n", 217 | "\n", 218 | "The view we want needs to do the following: for each instance of (country, year, week), produce all the measurements from the tables we created from the raw data. 
That means our SQL will need to:\n", 219 | "\n", 220 | "* SELECT region, country, year, and week from **one** table\n", 221 | "* SELECT the measurement from **each** table with a new column name (say, the table name)\n", 222 | "* JOIN the tables together so that there is one row with all measures for each (region, country, year, week) tuple.\n", 223 | "\n", 224 | "Can you write the SQL query yourself? Can you see how the *build_view* method assembles the query automatically?" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "collapsed": false, 230 | "input": [ 231 | "def build_view(datasets):\n", 232 | " view = \"\"\"CREATE OR REPLACE VIEW flu_statistics AS\n", 233 | " SELECT a.region, a.country, a.year, a.week, \\n\"\"\"\n", 234 | " from_clause = \" from \\n\"\n", 235 | " where_clause = \" where \\n\"\n", 236 | " column_creation = \"\"\n", 237 | " table_creation = \"\"\n", 238 | " join_creation = \"\"\n", 239 | " for i in range(len(datasets)):\n", 240 | " column_creation += \"{0}.measure as {1}\".format(string.ascii_lowercase[i], datasets[i]['title'])\n", 241 | " \n", 242 | " table_creation += \"{0} {1}\".format(datasets[i]['title'], string.ascii_lowercase[i])\n", 243 | " \n", 244 | " if (i < len(datasets)-1):\n", 245 | " column_creation += \",\\n\"\n", 246 | " table_creation += \",\\n\"\n", 247 | " join_creation += \"{1}.country = {0}.country and {1}.year = {0}.year and {1}.week = {0}.week \\n\".format(string.ascii_lowercase[i], string.ascii_lowercase[i+1])\n", 248 | " if (i < len(datasets)-2):\n", 249 | " join_creation += \"and\\n\"\n", 250 | " else:\n", 251 | " column_creation += \"\\n\"\n", 252 | " table_creation += \"\\n\"\n", 253 | " \n", 254 | "\n", 255 | " view += column_creation + from_clause + table_creation + where_clause + join_creation\n", 256 | " return view" 257 | ], 258 | "language": "python", 259 | "metadata": {}, 260 | "outputs": [], 261 | "prompt_number": 6 262 | }, 263 | { 264 | "cell_type": "markdown", 265 | "metadata": {}, 266 | "source": [ 267 | "With our methods complete, we can finally list the files and get to processing. The glob function allows us a wildcard search of the flu statistics data we downloaded." 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "collapsed": false, 273 | "input": [ 274 | "files = glob(\"../flu_statistics/data/*.psv\")\n", 275 | "print files\n", 276 | "files = files[:-1]" 277 | ], 278 | "language": "python", 279 | "metadata": {}, 280 | "outputs": [ 281 | { 282 | "output_type": "stream", 283 | "stream": "stdout", 284 | "text": [ 285 | "['/home/oracle/odsb2014/flu_statistics/data/WHO+WEST+ASIA.psv', '/home/oracle/odsb2014/flu_statistics/data/WHO+EURO.psv', '/home/oracle/odsb2014/flu_statistics/data/WHO+PAHO.psv', '/home/oracle/odsb2014/flu_statistics/data/WHO+AFRO.psv', '/home/oracle/odsb2014/flu_statistics/data/WHO+US+comprehensive.psv']\n" 286 | ] 287 | } 288 | ], 289 | "prompt_number": 9 290 | }, 291 | { 292 | "cell_type": "markdown", 293 | "metadata": {}, 294 | "source": [ 295 | "And a simple for-loop will load the data into our database and create a view to share with our teammembers." 
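If you want to see the shape of the generated statement before creating the view, note that `build_view` only reads the `title` key of each dataset, so a couple of fake titles are enough. This sketch assumes the `build_view` cell above has been run; the table names are made up:

```python
fake = [{'title': 'A_H1_INFECTIONS'}, {'title': 'A_H3_INFECTIONS'}]
print build_view(fake)
# Expect roughly:
#   CREATE OR REPLACE VIEW flu_statistics AS
#   SELECT a.region, a.country, a.year, a.week,
#   a.measure as A_H1_INFECTIONS,
#   b.measure as A_H3_INFECTIONS
#   from
#   A_H1_INFECTIONS a,
#   A_H3_INFECTIONS b
#   where
#   b.country = a.country and b.year = a.year and b.week = a.week
```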
296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "collapsed": false, 301 | "input": [ 302 | "db = cx_Oracle.connect('fludb', 'flushot', 'localhost:1521/orcl')\n", 303 | "print \"parsing datasets...\"\n", 304 | "for filename in files:\n", 305 | " datasets = parseWHOCountryFile(filename)\n", 306 | " print \"writing to DB...\"\n", 307 | " for dataset in datasets:\n", 308 | " write_to_db(db, dataset)\n", 309 | " print \"creating view...\"\n", 310 | " c = db.cursor()\n", 311 | " c.execute(build_view(datasets))" 312 | ], 313 | "language": "python", 314 | "metadata": {}, 315 | "outputs": [] 316 | } 317 | ], 318 | "metadata": {} 319 | } 320 | ] 321 | } -------------------------------------------------------------------------------- /notebooks/03 HHS_Flu_Vaccination_Data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "", 4 | "signature": "sha256:39723e6a76e2b17dd6babf6931598ccb7c0d08ec6fea51f65e4bf875ab75348a" 5 | }, 6 | "nbformat": 3, 7 | "nbformat_minor": 0, 8 | "worksheets": [ 9 | { 10 | "cells": [ 11 | { 12 | "cell_type": "heading", 13 | "level": 1, 14 | "metadata": {}, 15 | "source": [ 16 | "Loading HHS Flu Vaccination JSON Data" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "Sometimes data is already in a handy document store for us. JSON (JavaScript Object Notation) data is in the form of a list of objects composed of a set of key-value pairs. \n", 24 | "\n", 25 | "In this exercise, we'll use basic python and the cx-Oracle to pull Flu Vaccination Data from HHS.gov and store it in a table in our database. The data and API are provided by the [US HHS](http://flu-vaccination-map.hhs.gov/). Since the data we want is already in JSON format, we won't need to do much parsing, we'll mostly just \"dump\" it into the database. As usual, we begin by importing the libraries we'll need." 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "collapsed": false, 31 | "input": [ 32 | "import sys\n", 33 | "import cx_Oracle\n", 34 | "import pprint\n", 35 | "import re\n", 36 | "import string\n", 37 | "import json\n", 38 | "import urllib2" 39 | ], 40 | "language": "python", 41 | "metadata": {}, 42 | "outputs": [] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "First thing we'll need to do is write a quick helper function to \"jsonify\" our data. This will turn our data into separate JSON strings and return them as a list of tuples. " 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "collapsed": false, 54 | "input": [ 55 | "def jsonify_data(d):\n", 56 | " return(json.dumps(d),)" 57 | ], 58 | "language": "python", 59 | "metadata": {}, 60 | "outputs": [] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "First, we'll need to connect to the database and make a table to store the data in. Because the data we're collecting is JSON, we can save time by assigning the whole document to a single column. For this exercise we'll just need one table, call it flu_shot_json with, one column, call it doc. We'll pull the 'results' from our data, jsonify it, then insert it into our table in the doc column." 
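Before creating the table, it can help to see exactly what *jsonify_data* will hand to the database driver. A quick sketch (the record shown is a made-up fragment of one HHS result):

```python
# Each record becomes a one-element tuple holding its JSON text -- the shape
# cursor.executemany expects when the INSERT has a single bind variable (:1).
record = {"short_name": "VA"}      # made-up fragment of a result record
print jsonify_data(record)
# ('{"short_name": "VA"}',)
```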
67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "collapsed": false, 72 | "input": [ 73 | "db = cx_Oracle.connect('fludb', 'flushot', 'localhost:1521/orcl')\n", 74 | "drop_table = 'drop table flu_shot_json'\n", 75 | "ddl = 'create table flu_shot_json (doc varchar2(4000), CONSTRAINT \"ENSURE_JSON\" CHECK (doc IS JSON))'\n", 76 | "cursor = db.cursor()\n", 77 | "try:\n", 78 | " cursor.execute(drop_table)\n", 79 | "except:\n", 80 | " pass\n", 81 | "cursor.execute(ddl)\n", 82 | "cursor.close()" 83 | ], 84 | "language": "python", 85 | "metadata": {}, 86 | "outputs": [] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "That's pretty much all the setup we need to do, so now we'll go ahead and create a `write to db` function. \n", 93 | "\n", 94 | "As with most database operations, we need a cursor. Don't forget to commit the inserts after they've executed!" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "collapsed": false, 100 | "input": [ 101 | "def write_to_db(db, data):\n", 102 | " cursor = db.cursor()\n", 103 | " try:\n", 104 | " cursor.prepare(\"INSERT INTO flu_shot_json(doc) VALUES (:1)\")\n", 105 | " cursor.executemany(None, map(jsonify_data, data['results']))\n", 106 | " db.commit()\n", 107 | " except Exception as e:\n", 108 | " print e" 109 | ], 110 | "language": "python", 111 | "metadata": {}, 112 | "outputs": [] 113 | }, 114 | { 115 | "cell_type": "markdown", 116 | "metadata": {}, 117 | "source": [ 118 | "Now all we need to do is pull the data from HHS.gov and write it to the db. There are a number of ethnicities, so we need to collect all of them." 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "collapsed": false, 124 | "input": [ 125 | "print \"parsing dataset...\"\n", 126 | "for eth in [\"T\",\"A\",\"W\",\"B\",\"H\"]:\n", 127 | " url = \"http://flu-vaccination-map.hhs.gov/api/v1/states.json?ethnicity=\"+eth+\"{&year=lte:2014}\"\n", 128 | " data = json.load(urllib2.urlopen(url))\n", 129 | " print \"writing to DB...\"\n", 130 | " write_to_db(db, data)" 131 | ], 132 | "language": "python", 133 | "metadata": {}, 134 | "outputs": [] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": {}, 139 | "source": [ 140 | "Finally, we'll make a database view on the data so that we don't have to write JSON access paths to get at individual fields." 
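For comparison, this is the kind of dot-notation path expression we would have to repeat in every query without the view. A sketch, relying on the `ENSURE_JSON` constraint we placed on the table above (Oracle 12c only allows this shorthand on columns it knows contain JSON):

```python
# Without a view, each query spells out a JSON path for every field it needs.
cursor = db.cursor()
cursor.execute("""SELECT j.doc.short_name, j.doc.percentage
                  FROM flu_shot_json j
                  WHERE j.doc.ethnicity = 'T'""")
print cursor.fetchmany(5)
cursor.close()
```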
141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "collapsed": false, 146 | "input": [ 147 | "view_ddl = \"\"\"CREATE OR REPLACE VIEW FLUSHOTS \n", 148 | "AS SELECT\n", 149 | "CAST(j.doc.count AS NUMBER) eligible,\n", 150 | "CAST(j.doc.week AS NUMBER) week,\n", 151 | "CAST(j.doc.name AS VARCHAR2(20)) state_name,\n", 152 | "CAST(j.doc.short_name AS VARCHAR2(2)) state,\n", 153 | "CAST(j.doc.fips_id\tAS NUMBER) fips_id,\n", 154 | "CAST(j.doc.disparity as VARCHAR2(20)) disparity,\n", 155 | "CAST(j.doc.medicare_status as VARCHAR2(20)) medicare_status,\n", 156 | "CAST(j.doc.year as NUMBER) year,\n", 157 | "CAST(j.doc.percentage AS NUMBER) percentage_claimed,\n", 158 | "CAST(j.doc.ethnicity AS VARCHAR2(20)) ethnicity\n", 159 | "FROM flu_shot_json j;\"\"\"\n", 160 | "cursor = db.cursor()\n", 161 | "cursor.execute(view_ddl)\n", 162 | "cursor.close()" 163 | ], 164 | "language": "python", 165 | "metadata": {}, 166 | "outputs": [] 167 | } 168 | ], 169 | "metadata": {} 170 | } 171 | ] 172 | } -------------------------------------------------------------------------------- /notebooks/04 OIE_Pathogenic_Flu.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "", 4 | "signature": "sha256:f589cc76360d648a1ea77317abc619ebb9e34ba612bf4545f47efc777dc82060" 5 | }, 6 | "nbformat": 3, 7 | "nbformat_minor": 0, 8 | "worksheets": [ 9 | { 10 | "cells": [ 11 | { 12 | "cell_type": "heading", 13 | "level": 1, 14 | "metadata": {}, 15 | "source": [ 16 | "Gathering OIE Pathogenic Flu Data: Scraping the Web" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "Sometimes the data we want to analyze is available on the web, but isn't as conveniently accessed via an API or direct download. In some cases, the data is embedded in web pages and needs to be scraped out. In this exercise, we'll collect data on pathogenic strains of influenza in animals, provided by the [World Organization for Animal Health](http://www.oie.int/animal-health-in-the-world/update-on-avian-influenza/2013/).\n", 24 | "\n", 25 | "To get this data we'll use a number of pythonic tools\n", 26 | "\n", 27 | "- Requests: a library which simplifies making web requests\n", 28 | "- Beautiful Soup: a library designed to parse and extract information from HTML pages\n", 29 | "- cx-Oracle: our standard library for making bulk inserts into Oracle 12c\n", 30 | "- ipython-sql: an iPython extension that allows us to write SQL directly in our notebook " 31 | ] 32 | }, 33 | { 34 | "cell_type": "heading", 35 | "level": 2, 36 | "metadata": {}, 37 | "source": [ 38 | "Exploring Web Data" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "collapsed": false, 44 | "input": [ 45 | "import requests\n", 46 | "from bs4 import BeautifulSoup\n", 47 | "import cx_Oracle\n", 48 | "import datetime" 49 | ], 50 | "language": "python", 51 | "metadata": {}, 52 | "outputs": [], 53 | "prompt_number": 94 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": {}, 58 | "source": [ 59 | "By looking at the OIE site, we can tell there are reports for the years 2004-2014, all with the same base URL. Since we want to create a table of all data for all years, let's keep the base URL and year in a pair of variables." 
60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "collapsed": false, 65 | "input": [ 66 | "base_year = 2004\n", 67 | "years = 10\n", 68 | "base_url = \"http://www.oie.int/animal-health-in-the-world/update-on-avian-influenza/\"" 69 | ], 70 | "language": "python", 71 | "metadata": {}, 72 | "outputs": [], 73 | "prompt_number": 95 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "The requests library will let us quickly get the report for 2004. However, if we look at the first few lines, we have a whole page of HTML, not just the table we want. How can we quickly get just the information in the table?" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "collapsed": false, 85 | "input": [ 86 | "report = requests.get(base_url+str(base_year))\n", 87 | "print report.text[:1000]" 88 | ], 89 | "language": "python", 90 | "metadata": {}, 91 | "outputs": [ 92 | { 93 | "output_type": "stream", 94 | "stream": "stdout", 95 | "text": [ 96 | "\n", 99 | "\n", 100 | "\n", 101 | "\t\n", 102 | "\n", 103 | "\n", 109 | "\n", 110 | "\t\n", 111 | "\t\n", 112 | "\t\n", 113 | "\t\n", 114 | "\t` elements to find out what the column names are. We may not use these names, but it's handy to know what data to expect." 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "collapsed": false, 175 | "input": [ 176 | "columns = map(lambda x: x.text, flu_table.find_all(\"th\"))\n", 177 | "print columns" 178 | ], 179 | "language": "python", 180 | "metadata": {}, 181 | "outputs": [ 182 | { 183 | "output_type": "stream", 184 | "stream": "stdout", 185 | "text": [ 186 | "[u'Location', u'Virus Type', u'Date', u'Link']\n" 187 | ] 188 | } 189 | ], 190 | "prompt_number": 98 191 | }, 192 | { 193 | "cell_type": "markdown", 194 | "metadata": {}, 195 | "source": [ 196 | "What we *really* want are the rows with data. They're marked with `` tags, so they're easy to find. We should exclude the first row, since it only contains the column names." 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "collapsed": false, 202 | "input": [ 203 | "row_tags = flu_table.find_all(\"tr\")[1:]\n", 204 | "row_tags[0].find_all(\"td\")" 205 | ], 206 | "language": "python", 207 | "metadata": {}, 208 | "outputs": [ 209 | { 210 | "metadata": {}, 211 | "output_type": "pyout", 212 | "prompt_number": 99, 213 | "text": [ 214 | "[Vietnam,\n", 215 | " H5N1,\n", 216 | " 08/01/04,\n", 217 | " Emergency report\u00a0\u00a02053]" 218 | ] 219 | } 220 | ], 221 | "prompt_number": 99 222 | }, 223 | { 224 | "cell_type": "heading", 225 | "level": 2, 226 | "metadata": {}, 227 | "source": [ 228 | "Fetching and Storing Web Data" 229 | ] 230 | }, 231 | { 232 | "cell_type": "markdown", 233 | "metadata": {}, 234 | "source": [ 235 | "Now that we understand how to find the row data in each OIE report, we'll need a function to convert the contents of a `` tag into a tuple we can store in a database. The *make_db_row* function will do this. In the function we need to do several things, including\n", 236 | "\n", 237 | "* extracting the `href` field from the link field\n", 238 | "* converting the date field into a python date object\n", 239 | "* dealing with inconsistencies in the data\n", 240 | "\n", 241 | "Look at the series of OIE reports. Does the structure change over time? How does our method need to adapt to those changes?" 
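One way to answer those questions is to peek at a later report with the same calls we used above. This sketch assumes the 2014 page still responds the way it did when the workshop was written:

```python
# Compare a later report's column headers against the 2004 layout.
report_2014 = requests.get(base_url + "2014")
table_2014 = BeautifulSoup(report_2014.text).table
print map(lambda x: x.text, table_2014.find_all("th"))
```

Newer reports carry an extra year column, which is why *make_db_row* below simply drops it whenever a row arrives with more than four cells.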
242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "collapsed": false, 247 | "input": [ 248 | "def make_db_row(r):\n", 249 | " #newer reports have an extra year column\n", 250 | " if len(r) > 4:\n", 251 | " r.pop(2)\n", 252 | " \n", 253 | " r[-1]=r[-1].findChild()\n", 254 | " #try to get the report url\n", 255 | " url = None\n", 256 | " try:\n", 257 | " url = r[-1]['href']\n", 258 | " except:\n", 259 | " pass\n", 260 | " row_text = map(lambda x: x.text.encode('ascii', 'ignore'), r)+[url]\n", 261 | " try:\n", 262 | " row_text[2] = datetime.datetime.strptime(row_text[2], \"%d/%m/%y\").date()\n", 263 | " except Exception as e:\n", 264 | " print r, e\n", 265 | " return tuple(row_text)" 266 | ], 267 | "language": "python", 268 | "metadata": {}, 269 | "outputs": [], 270 | "prompt_number": 121 271 | }, 272 | { 273 | "cell_type": "markdown", 274 | "metadata": {}, 275 | "source": [ 276 | "Let's test our method by passing the 2004 HTML data into it. We should get back a list of tuples." 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "collapsed": false, 282 | "input": [ 283 | "data_to_insert = map(lambda x: make_db_row(x.find_all(\"td\")), row_tags)\n", 284 | "data_to_insert[:5]" 285 | ], 286 | "language": "python", 287 | "metadata": {}, 288 | "outputs": [ 289 | { 290 | "metadata": {}, 291 | "output_type": "pyout", 292 | "prompt_number": 101, 293 | "text": [ 294 | "[('Vietnam',\n", 295 | " 'H5N1',\n", 296 | " datetime.date(2004, 1, 8),\n", 297 | " 'Emergency report',\n", 298 | " 'ftp://ftp.oie.int/infos_san_archives/eng/2004/en_040109v17n02.pdf'),\n", 299 | " ('Japan',\n", 300 | " 'H5N1',\n", 301 | " datetime.date(2004, 1, 12),\n", 302 | " 'Emergency report',\n", 303 | " 'ftp://ftp.oie.int/infos_san_archives/eng/2004/en_040116v17n03.pdf'),\n", 304 | " ('Japan',\n", 305 | " 'H5N1',\n", 306 | " datetime.date(2004, 1, 13),\n", 307 | " 'Follow up report No.1',\n", 308 | " 'ftp://ftp.oie.int/infos_san_archives/eng/2004/en_040116v17n03.pdf'),\n", 309 | " ('Chinese Taipei',\n", 310 | " 'H5N2',\n", 311 | " datetime.date(2004, 1, 20),\n", 312 | " 'Emergency report',\n", 313 | " 'ftp://ftp.oie.int/infos_san_archives/eng/2004/en_040123v17n04.pdf'),\n", 314 | " ('Japan',\n", 315 | " 'H5N1',\n", 316 | " datetime.date(2004, 1, 20),\n", 317 | " 'Follow up report No.2',\n", 318 | " 'ftp://ftp.oie.int/infos_san_archives/eng/2004/en_040123v17n04.pdf')]" 319 | ] 320 | } 321 | ], 322 | "prompt_number": 101 323 | }, 324 | { 325 | "cell_type": "markdown", 326 | "metadata": {}, 327 | "source": [ 328 | "Now that we know our method works, all we need to do is step through the years and build up our dataset." 
329 | ] 330 | }, 331 | { 332 | "cell_type": "code", 333 | "collapsed": false, 334 | "input": [ 335 | "for i in range(1,years+1):\n", 336 | " print base_year+i\n", 337 | " report = requests.get(base_url+str(base_year+i))\n", 338 | " searchable_report = BeautifulSoup(report.text)\n", 339 | " flu_table = searchable_report.table\n", 340 | " row_tags = flu_table.find_all(\"tr\")[1:]\n", 341 | " data_to_insert += map(lambda x: make_db_row(x.find_all(\"td\")), row_tags)\n", 342 | "print len(data_to_insert)" 343 | ], 344 | "language": "python", 345 | "metadata": {}, 346 | "outputs": [ 347 | { 348 | "output_type": "stream", 349 | "stream": "stdout", 350 | "text": [ 351 | "2005\n", 352 | "2006" 353 | ] 354 | }, 355 | { 356 | "output_type": "stream", 357 | "stream": "stdout", 358 | "text": [ 359 | "\n", 360 | "2007" 361 | ] 362 | }, 363 | { 364 | "output_type": "stream", 365 | "stream": "stdout", 366 | "text": [ 367 | "\n", 368 | "2008" 369 | ] 370 | }, 371 | { 372 | "output_type": "stream", 373 | "stream": "stdout", 374 | "text": [ 375 | "\n", 376 | "2009" 377 | ] 378 | }, 379 | { 380 | "output_type": "stream", 381 | "stream": "stdout", 382 | "text": [ 383 | "\n", 384 | "2010" 385 | ] 386 | }, 387 | { 388 | "output_type": "stream", 389 | "stream": "stdout", 390 | "text": [ 391 | "\n", 392 | "2011" 393 | ] 394 | }, 395 | { 396 | "output_type": "stream", 397 | "stream": "stdout", 398 | "text": [ 399 | "\n", 400 | "2012" 401 | ] 402 | }, 403 | { 404 | "output_type": "stream", 405 | "stream": "stdout", 406 | "text": [ 407 | "\n", 408 | "2013" 409 | ] 410 | }, 411 | { 412 | "output_type": "stream", 413 | "stream": "stdout", 414 | "text": [ 415 | "\n", 416 | "2014" 417 | ] 418 | }, 419 | { 420 | "output_type": "stream", 421 | "stream": "stdout", 422 | "text": [ 423 | "\n", 424 | "1350" 425 | ] 426 | }, 427 | { 428 | "output_type": "stream", 429 | "stream": "stdout", 430 | "text": [ 431 | "\n" 432 | ] 433 | } 434 | ], 435 | "prompt_number": 102 436 | }, 437 | { 438 | "cell_type": "markdown", 439 | "metadata": {}, 440 | "source": [ 441 | "Now that we have all the data, we can insert it into Oracle 12c just like our other data sets. First, we create the table." 442 | ] 443 | }, 444 | { 445 | "cell_type": "code", 446 | "collapsed": false, 447 | "input": [ 448 | "create_table = \"\"\"CREATE TABLE PATHOGENIC_FLU (\n", 449 | " INCIDENT_LOCATION VARCHAR2(100),\n", 450 | " INCIDENT_TYPE VARCHAR2(50),\n", 451 | " INCIDENT_DATE DATE,\n", 452 | " INCIDENT_REPORT VARCHAR2(100),\n", 453 | " REPORT_LINK VARCHAR2(200)\n", 454 | " )\n", 455 | " \"\"\"\n", 456 | "db = cx_Oracle.connect(\"fludb\", \"flushot\", \"localhost:1521/orcl\")\n", 457 | "cursor = db.cursor()\n", 458 | "cursor.execute(create_table)" 459 | ], 460 | "language": "python", 461 | "metadata": {}, 462 | "outputs": [], 463 | "prompt_number": 110 464 | }, 465 | { 466 | "cell_type": "markdown", 467 | "metadata": {}, 468 | "source": [ 469 | "Then we insert the rows." 
470 | ] 471 | }, 472 | { 473 | "cell_type": "code", 474 | "collapsed": false, 475 | "input": [ 476 | "cursor.prepare(\"\"\"INSERT INTO PATHOGENIC_FLU\n", 477 | " (INCIDENT_LOCATION,INCIDENT_TYPE,INCIDENT_DATE, INCIDENT_REPORT, REPORT_LINK)\n", 478 | " VALUES\n", 479 | " (:1, :2, :3, :4, :5)\n", 480 | " \"\"\")\n", 481 | "cursor.executemany(None, data_to_insert)\n", 482 | "db.commit()" 483 | ], 484 | "language": "python", 485 | "metadata": {}, 486 | "outputs": [], 487 | "prompt_number": 111 488 | }, 489 | { 490 | "cell_type": "markdown", 491 | "metadata": {}, 492 | "source": [ 493 | "Once our rows are inserted, we can use ipython-sql to connect to the database and query our new table directly." 494 | ] 495 | }, 496 | { 497 | "cell_type": "code", 498 | "collapsed": false, 499 | "input": [ 500 | "%load_ext sql" 501 | ], 502 | "language": "python", 503 | "metadata": {}, 504 | "outputs": [], 505 | "prompt_number": 114 506 | }, 507 | { 508 | "cell_type": "code", 509 | "collapsed": false, 510 | "input": [ 511 | "%sql oracle://fludb:flushot@localhost:1521/orcl" 512 | ], 513 | "language": "python", 514 | "metadata": {}, 515 | "outputs": [ 516 | { 517 | "metadata": {}, 518 | "output_type": "pyout", 519 | "prompt_number": 117, 520 | "text": [ 521 | "u'Connected: fludb@orcl'" 522 | ] 523 | } 524 | ], 525 | "prompt_number": 117 526 | }, 527 | { 528 | "cell_type": "code", 529 | "collapsed": false, 530 | "input": [ 531 | "%sql select * from pathogenic_flu where rownum < 5 order by incident_date" 532 | ], 533 | "language": "python", 534 | "metadata": {}, 535 | "outputs": [ 536 | { 537 | "output_type": "stream", 538 | "stream": "stdout", 539 | "text": [ 540 | "0 rows affected.\n" 541 | ] 542 | }, 543 | { 544 | "html": [ 545 | "\n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | "
incident_location | incident_type | incident_date | incident_report | report_link
Vietnam | H5N1 | 2004-01-08 00:00:00 | Emergency report | ftp://ftp.oie.int/infos_san_archives/eng/2004/en_040109v17n02.pdf
Japan | H5N1 | 2004-01-12 00:00:00 | Emergency report | ftp://ftp.oie.int/infos_san_archives/eng/2004/en_040116v17n03.pdf
Japan | H5N1 | 2004-01-13 00:00:00 | Follow up report No.1 | ftp://ftp.oie.int/infos_san_archives/eng/2004/en_040116v17n03.pdf
Chinese Taipei | H5N2 | 2004-01-20 00:00:00 | Emergency report | ftp://ftp.oie.int/infos_san_archives/eng/2004/en_040123v17n04.pdf
" 582 | ], 583 | "metadata": {}, 584 | "output_type": "pyout", 585 | "prompt_number": 120, 586 | "text": [ 587 | "[('Vietnam', 'H5N1', datetime.datetime(2004, 1, 8, 0, 0), 'Emergency report', 'ftp://ftp.oie.int/infos_san_archives/eng/2004/en_040109v17n02.pdf'),\n", 588 | " ('Japan', 'H5N1', datetime.datetime(2004, 1, 12, 0, 0), 'Emergency report', 'ftp://ftp.oie.int/infos_san_archives/eng/2004/en_040116v17n03.pdf'),\n", 589 | " ('Japan', 'H5N1', datetime.datetime(2004, 1, 13, 0, 0), 'Follow up report No.1', 'ftp://ftp.oie.int/infos_san_archives/eng/2004/en_040116v17n03.pdf'),\n", 590 | " ('Chinese Taipei', 'H5N2', datetime.datetime(2004, 1, 20, 0, 0), 'Emergency report', 'ftp://ftp.oie.int/infos_san_archives/eng/2004/en_040123v17n04.pdf')]" 591 | ] 592 | } 593 | ], 594 | "prompt_number": 120 595 | }, 596 | { 597 | "cell_type": "code", 598 | "collapsed": false, 599 | "input": [], 600 | "language": "python", 601 | "metadata": {}, 602 | "outputs": [] 603 | } 604 | ], 605 | "metadata": {} 606 | } 607 | ] 608 | } -------------------------------------------------------------------------------- /notebooks/05 Collecting Web Data With Pandas.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "" 4 | }, 5 | "nbformat": 3, 6 | "nbformat_minor": 0, 7 | "worksheets": [ 8 | { 9 | "cells": [ 10 | { 11 | "cell_type": "heading", 12 | "level": 1, 13 | "metadata": {}, 14 | "source": [ 15 | "Collecting Web Data with Pandas" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "As we explore how flu interacts with the world around us, it would be useful to have data about countries' populations, GDP, and other indicators of economic and healthcare development. One place we might look is in the [World Bank's collection of development indicators](http://data.worldbank.org/data-catalog/world-development-indicators/).\n", 23 | "\n", 24 | "We've looked at a number of ways to collect data from the web, but sometimes useful data is included in the tools that we use. We could go directly to the World Bank site, download information and then parse it, but it's actually included in a tool we've been using: Pandas. Pandas includes [remote data access](http://pandas.pydata.org/pandas-docs/stable/remote_data.html) for quickly grabbing data about world development, finance, and web traffic.\n", 25 | "\n", 26 | "In this short workshop, we'll quickly collect data via Pandas and include it in our database.\n", 27 | "\n", 28 | "We start with what is becoming a common set of `import`s" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "collapsed": false, 34 | "input": [ 35 | "import numpy as np\n", 36 | "import pandas as pd\n", 37 | "import matplotlib as mpl\n", 38 | "import matplotlib.pyplot as plt\n", 39 | "%matplotlib inline\n", 40 | "%load_ext sql\n", 41 | "%sql oracle://fludb:flushot@localhost:1521/orcl" 42 | ], 43 | "language": "python", 44 | "metadata": {}, 45 | "outputs": [ 46 | { 47 | "metadata": {}, 48 | "output_type": "pyout", 49 | "prompt_number": 1, 50 | "text": [ 51 | "u'Connected: fludb@orcl'" 52 | ] 53 | } 54 | ], 55 | "prompt_number": 1 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": {}, 60 | "source": [ 61 | "Since we're looking for data about countries, we should start with the list of countries we have data about. They're easy to get using SQL." 
62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "collapsed": false, 67 | "input": [ 68 | "countries = %sql select distinct(country) from flu_statistics\n", 69 | "countries = countries.DataFrame()\n", 70 | "countries[:5]" 71 | ], 72 | "language": "python", 73 | "metadata": {}, 74 | "outputs": [ 75 | { 76 | "output_type": "stream", 77 | "stream": "stdout", 78 | "text": [ 79 | "0 rows affected.\n" 80 | ] 81 | }, 82 | { 83 | "html": [ 84 | "
\n", 85 | "\n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | "
  | country
0 | Ireland
1 | Algeria
2 | Brazil
3 | Lao People's Democratic Republic
4 | Croatia
\n", 115 | "
" 116 | ], 117 | "metadata": {}, 118 | "output_type": "pyout", 119 | "prompt_number": 2, 120 | "text": [ 121 | " country\n", 122 | "0 Ireland\n", 123 | "1 Algeria\n", 124 | "2 Brazil\n", 125 | "3 Lao People's Democratic Republic\n", 126 | "4 Croatia" 127 | ] 128 | } 129 | ], 130 | "prompt_number": 2 131 | }, 132 | { 133 | "cell_type": "markdown", 134 | "metadata": {}, 135 | "source": [ 136 | "We have the country names, but not the ISO codes by which they are most commonly identified. Let's start by using pandas to get the list of country codes from the world bank." 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "collapsed": false, 142 | "input": [ 143 | "from pandas.io import wb\n", 144 | "wb_country_codes = wb.get_countries()[['name', 'iso2c']]\n", 145 | "wb_country_codes[:5]" 146 | ], 147 | "language": "python", 148 | "metadata": {}, 149 | "outputs": [ 150 | { 151 | "html": [ 152 | "
\n", 153 | "\n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | "
  | name | iso2c
0 | Aruba | AW
1 | Afghanistan | AF
2 | Africa | A9
3 | Angola | AO
4 | Albania | AL
\n", 189 | "
" 190 | ], 191 | "metadata": {}, 192 | "output_type": "pyout", 193 | "prompt_number": 3, 194 | "text": [ 195 | " name iso2c\n", 196 | "0 Aruba AW\n", 197 | "1 Afghanistan AF\n", 198 | "2 Africa A9\n", 199 | "3 Angola AO\n", 200 | "4 Albania AL" 201 | ] 202 | } 203 | ], 204 | "prompt_number": 3 205 | }, 206 | { 207 | "cell_type": "markdown", 208 | "metadata": {}, 209 | "source": [ 210 | "In order to create a frame which has only the countries (and codes) that we have flu data about, we'll use pandas `merge` function. This is akin to writing a right join in SQL." 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "collapsed": false, 216 | "input": [ 217 | "flu_country_codes = pd.merge(countries, wb_country_codes, how=\"right\", left_on=\"country\", right_on=\"name\")[['country','iso2c']]\n", 218 | "flu_country_codes[:10]" 219 | ], 220 | "language": "python", 221 | "metadata": {}, 222 | "outputs": [ 223 | { 224 | "html": [ 225 | "
\n", 226 | "\n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | "
  | country | iso2c
0 | Ireland | IE
1 | Algeria | DZ
2 | Brazil | BR
3 | Croatia | HR
4 | Denmark | DK
5 | Kenya | KE
6 | South Africa | ZA
7 | Serbia | RS
8 | Poland | PL
9 | Bosnia and Herzegovina | BA
\n", 287 | "
" 288 | ], 289 | "metadata": {}, 290 | "output_type": "pyout", 291 | "prompt_number": 4, 292 | "text": [ 293 | " country iso2c\n", 294 | "0 Ireland IE\n", 295 | "1 Algeria DZ\n", 296 | "2 Brazil BR\n", 297 | "3 Croatia HR\n", 298 | "4 Denmark DK\n", 299 | "5 Kenya KE\n", 300 | "6 South Africa ZA\n", 301 | "7 Serbia RS\n", 302 | "8 Poland PL\n", 303 | "9 Bosnia and Herzegovina BA" 304 | ] 305 | } 306 | ], 307 | "prompt_number": 4 308 | }, 309 | { 310 | "cell_type": "heading", 311 | "level": 2, 312 | "metadata": {}, 313 | "source": [ 314 | "World Population" 315 | ] 316 | }, 317 | { 318 | "cell_type": "markdown", 319 | "metadata": {}, 320 | "source": [ 321 | "Now that we have our ISO codes, we can use pandas to get data on the world's population. We'll need this to normalize things like the number of flu cases in each country." 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "collapsed": false, 327 | "input": [ 328 | "world_population = wb.download(indicator='sp.pop.totl', country=flu_country_codes['iso2c'], start=2013, end=2013)\n", 329 | "world_population[:5]" 330 | ], 331 | "language": "python", 332 | "metadata": {}, 333 | "outputs": [ 334 | { 335 | "output_type": "stream", 336 | "stream": "stderr", 337 | "text": [ 338 | "/usr/lib64/python2.6/site-packages/pandas/io/wb.py:128: UserWarning: Non-standard ISO country codes: 1A, 1W, 4E, 7E, 8S, A4, A5, A9, B8, C4, C5, C6, C7, C8, C9, EU, F1, JG, KV, L4, L5, L6, L7, M2, OE, S1, S2, S3, S4, XC, XD, XE, XJ, XL, XM, XN, XO, XP, XQ, XR, XS, XT, XU, XY, Z4, Z7, ZF, ZG, ZJ, ZQ\n", 339 | " warnings.warn('Non-standard ISO country codes: %s' % tmp)\n" 340 | ] 341 | }, 342 | { 343 | "html": [ 344 | "
\n", 345 | "\n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | "
country | year | sp.pop.totl
Aruba | 2013 | 102911
Andorra | 2013 | 79218
Afghanistan | 2013 | 30551674
Angola | 2013 | 21471618
Albania | 2013 | 2773620
\n", 386 | "
" 387 | ], 388 | "metadata": {}, 389 | "output_type": "pyout", 390 | "prompt_number": 37, 391 | "text": [ 392 | " sp.pop.totl\n", 393 | "country year \n", 394 | "Aruba 2013 102911\n", 395 | "Andorra 2013 79218\n", 396 | "Afghanistan 2013 30551674\n", 397 | "Angola 2013 21471618\n", 398 | "Albania 2013 2773620" 399 | ] 400 | } 401 | ], 402 | "prompt_number": 37 403 | }, 404 | { 405 | "cell_type": "markdown", 406 | "metadata": {}, 407 | "source": [ 408 | "We might also care about the percentage of a country's population that lives in cities. There's a World Bank Development Indicator for that as well." 409 | ] 410 | }, 411 | { 412 | "cell_type": "code", 413 | "collapsed": false, 414 | "input": [ 415 | "urban_population = wb.download(indicator='SP.URB.TOTL.IN.ZS', country=flu_country_codes['iso2c'], start=2013, end=2013)\n", 416 | "urban_population[:5]" 417 | ], 418 | "language": "python", 419 | "metadata": {}, 420 | "outputs": [ 421 | { 422 | "html": [ 423 | "
\n", 424 | "\n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | "
country | year | SP.URB.TOTL.IN.ZS
Aruba | 2013 | 42.058
Andorra | 2013 | 86.165
Afghanistan | 2013 | 25.871
Angola | 2013 | 42.490
Albania | 2013 | 55.383
\n", 465 | "
" 466 | ], 467 | "metadata": {}, 468 | "output_type": "pyout", 469 | "prompt_number": 38, 470 | "text": [ 471 | " SP.URB.TOTL.IN.ZS\n", 472 | "country year \n", 473 | "Aruba 2013 42.058\n", 474 | "Andorra 2013 86.165\n", 475 | "Afghanistan 2013 25.871\n", 476 | "Angola 2013 42.490\n", 477 | "Albania 2013 55.383" 478 | ] 479 | } 480 | ], 481 | "prompt_number": 38 482 | }, 483 | { 484 | "cell_type": "markdown", 485 | "metadata": {}, 486 | "source": [ 487 | "If we're getting urban population, we should probably grab the percentage which is rural and the growth in population as well. Each of these is just a download away." 488 | ] 489 | }, 490 | { 491 | "cell_type": "code", 492 | "collapsed": false, 493 | "input": [ 494 | "rural_population = wb.download(indicator='SP.RUR.TOTL.ZS', country=flu_country_codes['iso2c'], start=2013, end=2013)\n", 495 | "population_growth = wb.download(indicator='SP.POP.GROW', country=flu_country_codes['iso2c'], start=2013, end=2013)" 496 | ], 497 | "language": "python", 498 | "metadata": {}, 499 | "outputs": [], 500 | "prompt_number": 39 501 | }, 502 | { 503 | "cell_type": "markdown", 504 | "metadata": {}, 505 | "source": [ 506 | "With all of this population-related data, it makes sense to put them together into a single DataFrame. We can use `pd.merge` again to do this; this time we're performing the equivalent of a left-join in SQL. The result is a single DataFrame with all of our data." 507 | ] 508 | }, 509 | { 510 | "cell_type": "code", 511 | "collapsed": false, 512 | "input": [ 513 | "population_frame = pd.merge(world_population, urban_population, how=\"left\", left_index=True, right_index=True)\n", 514 | "population_frame = pd.merge(population_frame, rural_population, how=\"left\", left_index=True, right_index=True)\n", 515 | "population_frame = pd.merge(population_frame, population_growth, how=\"left\", left_index=True, right_index=True)\n", 516 | "population_frame.columns = ['total_population', 'urban_pop_percent', 'rural_pop_percent', 'pop_grow']\n", 517 | "population_frame[:10]" 518 | ], 519 | "language": "python", 520 | "metadata": {}, 521 | "outputs": [ 522 | { 523 | "html": [ 524 | "
\n", 525 | "\n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | "
country | year | total_population | urban_pop_percent | rural_pop_percent | pop_grow
Aruba | 2013 | 102911 | 42.058000 | 57.942000 | 0.513409
Andorra | 2013 | 79218 | 86.165000 | 13.835000 | 1.088995
Afghanistan | 2013 | 30551674 | 25.871000 | 74.129000 | 2.408807
Angola | 2013 | 21471618 | 42.490000 | 57.510000 | 3.079269
Albania | 2013 | 2773620 | 55.383000 | 44.617000 | -1.006627
Arab World | 2013 | 369761523 | 57.339136 | 42.660864 | 2.012570
United Arab Emirates | 2013 | 9346129 | 84.981000 | 15.019000 | 1.514471
Argentina | 2013 | 41446246 | 91.452000 | 8.548000 | 0.870732
Armenia | 2013 | 2976566 | 62.975000 | 37.025000 | 0.251781
American Samoa | 2013 | 55165 | 87.334000 | 12.666000 | 0.067094
\n", 627 | "
" 628 | ], 629 | "metadata": {}, 630 | "output_type": "pyout", 631 | "prompt_number": 46, 632 | "text": [ 633 | " total_population urban_pop_percent \\\n", 634 | "country year \n", 635 | "Aruba 2013 102911 42.058000 \n", 636 | "Andorra 2013 79218 86.165000 \n", 637 | "Afghanistan 2013 30551674 25.871000 \n", 638 | "Angola 2013 21471618 42.490000 \n", 639 | "Albania 2013 2773620 55.383000 \n", 640 | "Arab World 2013 369761523 57.339136 \n", 641 | "United Arab Emirates 2013 9346129 84.981000 \n", 642 | "Argentina 2013 41446246 91.452000 \n", 643 | "Armenia 2013 2976566 62.975000 \n", 644 | "American Samoa 2013 55165 87.334000 \n", 645 | "\n", 646 | " rural_pop_percent pop_grow \n", 647 | "country year \n", 648 | "Aruba 2013 57.942000 0.513409 \n", 649 | "Andorra 2013 13.835000 1.088995 \n", 650 | "Afghanistan 2013 74.129000 2.408807 \n", 651 | "Angola 2013 57.510000 3.079269 \n", 652 | "Albania 2013 44.617000 -1.006627 \n", 653 | "Arab World 2013 42.660864 2.012570 \n", 654 | "United Arab Emirates 2013 15.019000 1.514471 \n", 655 | "Argentina 2013 8.548000 0.870732 \n", 656 | "Armenia 2013 37.025000 0.251781 \n", 657 | "American Samoa 2013 12.666000 0.067094 " 658 | ] 659 | } 660 | ], 661 | "prompt_number": 46 662 | }, 663 | { 664 | "cell_type": "markdown", 665 | "metadata": {}, 666 | "source": [ 667 | "We'd like to put this information in our database, both to avoid re-downloading it and to enable SQL access over it. To do that quickly, we'll transform the frame into delimite data, then use ipython-sql to insert the data into our database." 668 | ] 669 | }, 670 | { 671 | "cell_type": "code", 672 | "collapsed": false, 673 | "input": [ 674 | "from StringIO import StringIO" 675 | ], 676 | "language": "python", 677 | "metadata": {}, 678 | "outputs": [], 679 | "prompt_number": 5 680 | }, 681 | { 682 | "cell_type": "code", 683 | "collapsed": false, 684 | "input": [ 685 | "output = StringIO()\n", 686 | "population_frame.to_csv(output, sep=\"|\")\n", 687 | "\n", 688 | "population_data = output.getvalue().split(\"\\n\")[1:]\n", 689 | "output.close()\n", 690 | "population_data[:5]" 691 | ], 692 | "language": "python", 693 | "metadata": {}, 694 | "outputs": [ 695 | { 696 | "ename": "NameError", 697 | "evalue": "name 'population_frame' is not defined", 698 | "output_type": "pyerr", 699 | "traceback": [ 700 | "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)", 701 | "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0mStringIO\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mStringIO\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2\u001b[0m \u001b[0moutput\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mStringIO\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 3\u001b[1;33m \u001b[0mpopulation_frame\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mto_csv\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0moutput\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0msep\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m\"|\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 4\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[0mpopulation_data\u001b[0m \u001b[1;33m=\u001b[0m 
\u001b[0moutput\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mgetvalue\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"\\n\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 702 | "\u001b[1;31mNameError\u001b[0m: name 'population_frame' is not defined" 703 | ] 704 | } 705 | ], 706 | "prompt_number": 10 707 | }, 708 | { 709 | "cell_type": "markdown", 710 | "metadata": {}, 711 | "source": [ 712 | "With the data ready for insert, we now create a table." 713 | ] 714 | }, 715 | { 716 | "cell_type": "code", 717 | "collapsed": false, 718 | "input": [ 719 | "%%sql create table population_info (country VARCHAR2(50), year NUMBER, total_pop NUMBER, \n", 720 | " urban_pop NUMBER, rural_pop NUMBER,\n", 721 | " pop_grow NUMBER, primary key(country))" 722 | ], 723 | "language": "python", 724 | "metadata": {}, 725 | "outputs": [ 726 | { 727 | "metadata": {}, 728 | "output_type": "pyout", 729 | "prompt_number": 90, 730 | "text": [ 731 | "[]" 732 | ] 733 | } 734 | ], 735 | "prompt_number": 90 736 | }, 737 | { 738 | "cell_type": "markdown", 739 | "metadata": {}, 740 | "source": [ 741 | "And then insert the data" 742 | ] 743 | }, 744 | { 745 | "cell_type": "code", 746 | "collapsed": false, 747 | "input": [ 748 | " %config SqlMagic.feedback = False" 749 | ], 750 | "language": "python", 751 | "metadata": {}, 752 | "outputs": [] 753 | }, 754 | { 755 | "cell_type": "code", 756 | "collapsed": false, 757 | "input": [ 758 | "for d in population_data:\n", 759 | " try:\n", 760 | " country, year, total_pop, urban_pop, rural_pop, pop_grow = d.split(\"|\")\n", 761 | " %sql insert into population_info (country, year, total_pop, urban_pop, rural_pop, pop_grow) values (:country, :year, :total_pop, :urban_pop, :rural_pop, :pop_grow)\n", 762 | " except:\n", 763 | " pass" 764 | ], 765 | "language": "python", 766 | "metadata": {}, 767 | "outputs": [], 768 | "prompt_number": 91 769 | }, 770 | { 771 | "cell_type": "heading", 772 | "level": 2, 773 | "metadata": {}, 774 | "source": [ 775 | "GDP Data" 776 | ] 777 | }, 778 | { 779 | "cell_type": "markdown", 780 | "metadata": {}, 781 | "source": [ 782 | "We can follow exactly the same procedure to get economic data from the World Bank. Economic data is essential if we want to understand how national or individual wealth impacts flu susceptability." 783 | ] 784 | }, 785 | { 786 | "cell_type": "code", 787 | "collapsed": false, 788 | "input": [ 789 | "per_cap_gdp = wb.download(indicator='NY.GDP.PCAP.CD', country=flu_country_codes['iso2c'], start=2013, end=2013)\n", 790 | "gdp = wb.download(indicator='NY.GDP.MKTP.CD', country=flu_country_codes['iso2c'], start=2013, end=2013)\n", 791 | "gdp_frame = pd.merge(gdp, per_cap_gdp, how=\"left\",left_index=True, right_index=True)\n", 792 | "gdp_frame.columns = [\"GDP\", \"PerCapGDP\"]\n", 793 | "gdp_frame[:5]" 794 | ], 795 | "language": "python", 796 | "metadata": {}, 797 | "outputs": [ 798 | { 799 | "html": [ 800 | "
\n", 801 | "\n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " \n", 819 | " \n", 820 | " \n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | " \n", 826 | " \n", 827 | " \n", 828 | " \n", 829 | " \n", 830 | " \n", 831 | " \n", 832 | " \n", 833 | " \n", 834 | " \n", 835 | " \n", 836 | " \n", 837 | " \n", 838 | " \n", 839 | " \n", 840 | " \n", 841 | " \n", 842 | " \n", 843 | " \n", 844 | " \n", 845 | " \n", 846 | " \n", 847 | " \n", 848 | "
country | year | GDP | PerCapGDP
Aruba | 2013 | NaN | NaN
Andorra | 2013 | NaN | NaN
Afghanistan | 2013 | 2.030967e+10 | 664.764589
Angola | 2013 | 1.241782e+11 | 5783.366760
Albania | 2013 | 1.292324e+10 | 4659.340601
\n", 849 | "
" 850 | ], 851 | "metadata": {}, 852 | "output_type": "pyout", 853 | "prompt_number": 93, 854 | "text": [ 855 | " GDP PerCapGDP\n", 856 | "country year \n", 857 | "Aruba 2013 NaN NaN\n", 858 | "Andorra 2013 NaN NaN\n", 859 | "Afghanistan 2013 2.030967e+10 664.764589\n", 860 | "Angola 2013 1.241782e+11 5783.366760\n", 861 | "Albania 2013 1.292324e+10 4659.340601" 862 | ] 863 | } 864 | ], 865 | "prompt_number": 93 866 | }, 867 | { 868 | "cell_type": "markdown", 869 | "metadata": {}, 870 | "source": [ 871 | "As before, we'll create a table, dump the output to text, and insert it." 872 | ] 873 | }, 874 | { 875 | "cell_type": "code", 876 | "collapsed": false, 877 | "input": [ 878 | "%%sql create table gdp_data (country varchar2(50), year number, \n", 879 | " gdp number, percapgdp number, primary key(country))" 880 | ], 881 | "language": "python", 882 | "metadata": {}, 883 | "outputs": [ 884 | { 885 | "metadata": {}, 886 | "output_type": "pyout", 887 | "prompt_number": 106, 888 | "text": [ 889 | "[]" 890 | ] 891 | } 892 | ], 893 | "prompt_number": 106 894 | }, 895 | { 896 | "cell_type": "code", 897 | "collapsed": false, 898 | "input": [ 899 | "output = StringIO()\n", 900 | "gdp_frame.to_csv(output, sep=\"|\")\n", 901 | "\n", 902 | "gdp_data = output.getvalue().split(\"\\n\")[1:]\n", 903 | "output.close()\n", 904 | "for d in gdp_data:\n", 905 | " try:\n", 906 | " country, year, gdp, percapgdp = d.split(\"|\")\n", 907 | " %sql insert into gdp_data (country, year, gdp, percapgdp) values (:country, :year, :gdp, :percapgdp)\n", 908 | " except:\n", 909 | " pass" 910 | ], 911 | "language": "python", 912 | "metadata": {}, 913 | "outputs": [], 914 | "prompt_number": 107 915 | }, 916 | { 917 | "cell_type": "heading", 918 | "level": 2, 919 | "metadata": {}, 920 | "source": [ 921 | "Sanitation, Water, Cellular and Education" 922 | ] 923 | }, 924 | { 925 | "cell_type": "markdown", 926 | "metadata": {}, 927 | "source": [ 928 | "Lastly, we're going to grab some information about other indicators of development:\n", 929 | "\n", 930 | "* sanitation improvement\n", 931 | "* rural improved access to clean water\n", 932 | "* urban improved access to clean water\n", 933 | "* access to cellular phone service\n", 934 | "* percentage of the population with a primary school education or better\n", 935 | "\n", 936 | "The data here opens up many questions. Does worse sanitation correlate with higher numbers of flu cases? Does better education reduce flu cases?\n", 937 | "\n", 938 | "Just like population and GDP, these indicators are easy to grab." 
939 | ] 940 | }, 941 | { 942 | "cell_type": "code", 943 | "collapsed": false, 944 | "input": [ 945 | "sanitation = wb.download(indicator='SH.STA.ACSN', country=flu_country_codes['iso2c'], start=2012, end=2012)\n", 946 | "safe_rural_water = wb.download(indicator='SH.H2O.SAFE.RU.ZS', country=flu_country_codes['iso2c'], start=2012, end=2012)\n", 947 | "safe_urb_water = wb.download(indicator='SH.H2O.SAFE.UR.ZS', country=flu_country_codes['iso2c'], start=2012, end=2012)\n", 948 | "cellular = wb.download(indicator='IT.CEL.SETS.P2', country=flu_country_codes['iso2c'], start=2012, end=2012)\n", 949 | "primary_school = wb.download(indicator='SE.PRM.CMPT.ZS', country=flu_country_codes['iso2c'], start=2012, end=2012)" 950 | ], 951 | "language": "python", 952 | "metadata": {}, 953 | "outputs": [ 954 | { 955 | "output_type": "stream", 956 | "stream": "stderr", 957 | "text": [ 958 | "/usr/lib64/python2.6/site-packages/pandas/io/wb.py:128: UserWarning: Non-standard ISO country codes: 1A, 1W, 4E, 7E, 8S, A4, A5, A9, B8, C4, C5, C6, C7, C8, C9, EU, F1, JG, KV, L4, L5, L6, L7, M2, OE, S1, S2, S3, S4, XC, XD, XE, XJ, XL, XM, XN, XO, XP, XQ, XR, XS, XT, XU, XY, Z4, Z7, ZF, ZG, ZJ, ZQ\n", 959 | " warnings.warn('Non-standard ISO country codes: %s' % tmp)\n" 960 | ] 961 | } 962 | ], 963 | "prompt_number": 6 964 | }, 965 | { 966 | "cell_type": "code", 967 | "collapsed": false, 968 | "input": [ 969 | "development_frame = pd.merge(sanitation, safe_rural_water, how='left', left_index=True, right_index=True)\n", 970 | "development_frame = pd.merge(development_frame, safe_urb_water, how='left', left_index=True, right_index=True)\n", 971 | "development_frame = pd.merge(development_frame, cellular, how='left', left_index=True, right_index=True)\n", 972 | "development_frame = pd.merge(development_frame, primary_school, how='left', left_index=True, right_index=True)\n", 973 | "development_frame.columns = [\"sanitation\", \"safe_r_h2o\", \"safe_urb_h20\", \"cellular\", \"primary_ed\"]" 974 | ], 975 | "language": "python", 976 | "metadata": {}, 977 | "outputs": [], 978 | "prompt_number": 7 979 | }, 980 | { 981 | "cell_type": "code", 982 | "collapsed": false, 983 | "input": [ 984 | "%%sql create table dev_indc (country varchar2(50), year number, \n", 985 | " sanitation number, rh2o number, uh2o number, cellular number, \n", 986 | " primary_ed number, primary key(country))" 987 | ], 988 | "language": "python", 989 | "metadata": {}, 990 | "outputs": [ 991 | { 992 | "output_type": "stream", 993 | "stream": "stdout", 994 | "text": [ 995 | "Done.\n" 996 | ] 997 | }, 998 | { 999 | "metadata": {}, 1000 | "output_type": "pyout", 1001 | "prompt_number": 8, 1002 | "text": [ 1003 | "[]" 1004 | ] 1005 | } 1006 | ], 1007 | "prompt_number": 8 1008 | }, 1009 | { 1010 | "cell_type": "code", 1011 | "collapsed": false, 1012 | "input": [ 1013 | "output = StringIO()\n", 1014 | "development_frame.to_csv(output, sep=\"|\")\n", 1015 | "\n", 1016 | "dev_data = output.getvalue().split(\"\\n\")[1:]\n", 1017 | "output.close()\n", 1018 | "for d in dev_data:\n", 1019 | " try:\n", 1020 | " country, year, san, rh2o, uh2o, cell, prim = d.split(\"|\")\n", 1021 | " %sql insert into dev_indc (country, year, sanitation, rh2o, uh2o, cellular, primary_ed) values (:country, :year, :san, :rh2o, :uh2o, :cell, :prim)\n", 1022 | " except:\n", 1023 | " pass" 1024 | ], 1025 | "language": "python", 1026 | "metadata": {}, 1027 | "outputs": [ 1028 | { 1029 | "output_type": "stream", 1030 | "stream": "stdout", 1031 | "text": [ 1032 | "1 rows affected.\n", 1033 | "1 rows 
affected.\n", 1034 | "1 rows affected.\n", 1035 | "1 rows affected.\n", 1036 | "1 rows affected.\n", 1037 | "1 rows affected.\n", 1038 | "1 rows affected.\n", 1039 | "1 rows affected.\n", 1040 | "1 rows affected.\n", 1041 | "1 rows affected." 1042 | ] 1043 | }, 1044 | { 1045 | "output_type": "stream", 1046 | "stream": "stdout", 1047 | "text": [ 1048 | "\n", 1049 | "1 rows affected.\n", 1050 | "1 rows affected.\n", 1051 | "1 rows affected.\n", 1052 | "1 rows affected.\n", 1053 | "1 rows affected.\n", 1054 | "1 rows affected.\n", 1055 | "1 rows affected.\n", 1056 | "1 rows affected." 1057 | ] 1058 | }, 1059 | { 1060 | "output_type": "stream", 1061 | "stream": "stdout", 1062 | "text": [ 1063 | "\n", 1064 | "1 rows affected.\n", 1065 | "1 rows affected.\n", 1066 | "1 rows affected.\n", 1067 | "1 rows affected.\n", 1068 | "1 rows affected.\n", 1069 | "1 rows affected.\n", 1070 | "1 rows affected.\n", 1071 | "1 rows affected.\n", 1072 | "1 rows affected." 1073 | ] 1074 | }, 1075 | { 1076 | "output_type": "stream", 1077 | "stream": "stdout", 1078 | "text": [ 1079 | "\n", 1080 | "1 rows affected.\n", 1081 | "1 rows affected.\n", 1082 | "1 rows affected.\n", 1083 | "1 rows affected.\n", 1084 | "1 rows affected.\n", 1085 | "1 rows affected.\n", 1086 | "1 rows affected." 1087 | ] 1088 | }, 1089 | { 1090 | "output_type": "stream", 1091 | "stream": "stdout", 1092 | "text": [ 1093 | "\n", 1094 | "1 rows affected.\n", 1095 | "1 rows affected.\n", 1096 | "1 rows affected.\n", 1097 | "1 rows affected.\n", 1098 | "1 rows affected.\n", 1099 | "1 rows affected.\n", 1100 | "1 rows affected.\n", 1101 | "1 rows affected.\n", 1102 | "1 rows affected." 1103 | ] 1104 | }, 1105 | { 1106 | "output_type": "stream", 1107 | "stream": "stdout", 1108 | "text": [ 1109 | "\n", 1110 | "1 rows affected.\n", 1111 | "1 rows affected.\n", 1112 | "1 rows affected.\n", 1113 | "1 rows affected.\n", 1114 | "1 rows affected.\n", 1115 | "1 rows affected.\n", 1116 | "1 rows affected.\n", 1117 | "1 rows affected." 1118 | ] 1119 | }, 1120 | { 1121 | "output_type": "stream", 1122 | "stream": "stdout", 1123 | "text": [ 1124 | "\n", 1125 | "1 rows affected.\n", 1126 | "1 rows affected.\n", 1127 | "1 rows affected.\n", 1128 | "1 rows affected.\n", 1129 | "1 rows affected.\n", 1130 | "1 rows affected.\n", 1131 | "1 rows affected.\n", 1132 | "1 rows affected." 1133 | ] 1134 | }, 1135 | { 1136 | "output_type": "stream", 1137 | "stream": "stdout", 1138 | "text": [ 1139 | "\n", 1140 | "1 rows affected.\n", 1141 | "1 rows affected.\n", 1142 | "1 rows affected.\n", 1143 | "1 rows affected.\n", 1144 | "1 rows affected.\n", 1145 | "1 rows affected.\n", 1146 | "1 rows affected.\n", 1147 | "1 rows affected." 1148 | ] 1149 | }, 1150 | { 1151 | "output_type": "stream", 1152 | "stream": "stdout", 1153 | "text": [ 1154 | "\n", 1155 | "1 rows affected.\n", 1156 | "1 rows affected.\n", 1157 | "1 rows affected.\n", 1158 | "1 rows affected.\n", 1159 | "1 rows affected.\n", 1160 | "1 rows affected.\n", 1161 | "1 rows affected.\n", 1162 | "1 rows affected." 1163 | ] 1164 | }, 1165 | { 1166 | "output_type": "stream", 1167 | "stream": "stdout", 1168 | "text": [ 1169 | "\n", 1170 | "1 rows affected.\n", 1171 | "1 rows affected.\n", 1172 | "1 rows affected.\n", 1173 | "1 rows affected.\n", 1174 | "1 rows affected.\n", 1175 | "1 rows affected.\n", 1176 | "1 rows affected.\n", 1177 | "1 rows affected." 
1178 | ] 1179 | }, 1180 | { 1181 | "output_type": "stream", 1182 | "stream": "stdout", 1183 | "text": [ 1184 | "\n", 1185 | "1 rows affected.\n", 1186 | "1 rows affected.\n", 1187 | "1 rows affected.\n", 1188 | "1 rows affected.\n", 1189 | "1 rows affected.\n", 1190 | "1 rows affected.\n", 1191 | "1 rows affected.\n", 1192 | "1 rows affected.\n", 1193 | "1 rows affected." 1194 | ] 1195 | }, 1196 | { 1197 | "output_type": "stream", 1198 | "stream": "stdout", 1199 | "text": [ 1200 | "\n", 1201 | "1 rows affected.\n", 1202 | "1 rows affected.\n", 1203 | "1 rows affected.\n", 1204 | "1 rows affected.\n", 1205 | "1 rows affected.\n", 1206 | "1 rows affected.\n", 1207 | "1 rows affected.\n", 1208 | "1 rows affected.\n", 1209 | "1 rows affected." 1210 | ] 1211 | }, 1212 | { 1213 | "output_type": "stream", 1214 | "stream": "stdout", 1215 | "text": [ 1216 | "\n", 1217 | "1 rows affected.\n", 1218 | "1 rows affected.\n", 1219 | "1 rows affected.\n", 1220 | "1 rows affected.\n", 1221 | "1 rows affected.\n", 1222 | "1 rows affected.\n", 1223 | "1 rows affected.\n", 1224 | "1 rows affected.\n", 1225 | "1 rows affected." 1226 | ] 1227 | }, 1228 | { 1229 | "output_type": "stream", 1230 | "stream": "stdout", 1231 | "text": [ 1232 | "\n", 1233 | "1 rows affected.\n", 1234 | "1 rows affected.\n", 1235 | "1 rows affected.\n", 1236 | "1 rows affected.\n", 1237 | "1 rows affected.\n", 1238 | "1 rows affected.\n", 1239 | "1 rows affected.\n", 1240 | "1 rows affected." 1241 | ] 1242 | }, 1243 | { 1244 | "output_type": "stream", 1245 | "stream": "stdout", 1246 | "text": [ 1247 | "\n", 1248 | "1 rows affected.\n", 1249 | "1 rows affected.\n", 1250 | "1 rows affected.\n", 1251 | "1 rows affected.\n", 1252 | "1 rows affected.\n", 1253 | "1 rows affected.\n", 1254 | "1 rows affected.\n", 1255 | "1 rows affected.\n", 1256 | "1 rows affected." 1257 | ] 1258 | }, 1259 | { 1260 | "output_type": "stream", 1261 | "stream": "stdout", 1262 | "text": [ 1263 | "\n", 1264 | "1 rows affected.\n", 1265 | "1 rows affected.\n", 1266 | "1 rows affected.\n", 1267 | "1 rows affected.\n", 1268 | "1 rows affected.\n", 1269 | "1 rows affected.\n", 1270 | "1 rows affected.\n", 1271 | "1 rows affected.\n", 1272 | "1 rows affected." 1273 | ] 1274 | }, 1275 | { 1276 | "output_type": "stream", 1277 | "stream": "stdout", 1278 | "text": [ 1279 | "\n", 1280 | "1 rows affected.\n", 1281 | "1 rows affected.\n", 1282 | "1 rows affected.\n", 1283 | "1 rows affected.\n", 1284 | "1 rows affected.\n", 1285 | "1 rows affected.\n", 1286 | "1 rows affected." 1287 | ] 1288 | }, 1289 | { 1290 | "output_type": "stream", 1291 | "stream": "stdout", 1292 | "text": [ 1293 | "\n", 1294 | "1 rows affected.\n", 1295 | "1 rows affected.\n", 1296 | "1 rows affected.\n", 1297 | "1 rows affected.\n", 1298 | "1 rows affected.\n", 1299 | "1 rows affected.\n", 1300 | "1 rows affected.\n", 1301 | "1 rows affected.\n", 1302 | "1 rows affected." 1303 | ] 1304 | }, 1305 | { 1306 | "output_type": "stream", 1307 | "stream": "stdout", 1308 | "text": [ 1309 | "\n", 1310 | "1 rows affected.\n", 1311 | "1 rows affected.\n", 1312 | "1 rows affected.\n", 1313 | "1 rows affected.\n", 1314 | "1 rows affected.\n", 1315 | "1 rows affected.\n", 1316 | "1 rows affected." 1317 | ] 1318 | }, 1319 | { 1320 | "output_type": "stream", 1321 | "stream": "stdout", 1322 | "text": [ 1323 | "\n", 1324 | "1 rows affected.\n", 1325 | "1 rows affected.\n", 1326 | "1 rows affected.\n", 1327 | "1 rows affected.\n", 1328 | "1 rows affected.\n", 1329 | "1 rows affected.\n", 1330 | "1 rows affected." 
1331 | ] 1332 | }, 1333 | { 1334 | "output_type": "stream", 1335 | "stream": "stdout", 1336 | "text": [ 1337 | "\n", 1338 | "1 rows affected.\n", 1339 | "1 rows affected.\n", 1340 | "1 rows affected.\n", 1341 | "1 rows affected.\n", 1342 | "1 rows affected.\n", 1343 | "1 rows affected.\n", 1344 | "1 rows affected.\n", 1345 | "1 rows affected." 1346 | ] 1347 | }, 1348 | { 1349 | "output_type": "stream", 1350 | "stream": "stdout", 1351 | "text": [ 1352 | "\n", 1353 | "1 rows affected.\n", 1354 | "1 rows affected.\n", 1355 | "1 rows affected.\n", 1356 | "1 rows affected.\n", 1357 | "1 rows affected.\n", 1358 | "1 rows affected.\n", 1359 | "1 rows affected.\n", 1360 | "1 rows affected." 1361 | ] 1362 | }, 1363 | { 1364 | "output_type": "stream", 1365 | "stream": "stdout", 1366 | "text": [ 1367 | "\n", 1368 | "1 rows affected.\n", 1369 | "1 rows affected.\n", 1370 | "1 rows affected.\n", 1371 | "1 rows affected.\n", 1372 | "1 rows affected.\n", 1373 | "1 rows affected.\n", 1374 | "1 rows affected." 1375 | ] 1376 | }, 1377 | { 1378 | "output_type": "stream", 1379 | "stream": "stdout", 1380 | "text": [ 1381 | "\n", 1382 | "1 rows affected.\n", 1383 | "1 rows affected.\n", 1384 | "1 rows affected.\n", 1385 | "1 rows affected.\n", 1386 | "1 rows affected.\n", 1387 | "1 rows affected.\n", 1388 | "1 rows affected.\n", 1389 | "1 rows affected." 1390 | ] 1391 | }, 1392 | { 1393 | "output_type": "stream", 1394 | "stream": "stdout", 1395 | "text": [ 1396 | "\n", 1397 | "1 rows affected.\n", 1398 | "1 rows affected.\n", 1399 | "1 rows affected.\n", 1400 | "1 rows affected.\n", 1401 | "1 rows affected.\n", 1402 | "1 rows affected.\n", 1403 | "1 rows affected." 1404 | ] 1405 | }, 1406 | { 1407 | "output_type": "stream", 1408 | "stream": "stdout", 1409 | "text": [ 1410 | "\n", 1411 | "1 rows affected.\n", 1412 | "1 rows affected.\n", 1413 | "1 rows affected.\n", 1414 | "1 rows affected.\n", 1415 | "1 rows affected.\n", 1416 | "1 rows affected.\n", 1417 | "1 rows affected." 1418 | ] 1419 | }, 1420 | { 1421 | "output_type": "stream", 1422 | "stream": "stdout", 1423 | "text": [ 1424 | "\n", 1425 | "1 rows affected.\n", 1426 | "1 rows affected.\n", 1427 | "1 rows affected.\n", 1428 | "1 rows affected.\n", 1429 | "1 rows affected.\n", 1430 | "1 rows affected.\n", 1431 | "1 rows affected.\n", 1432 | "1 rows affected." 1433 | ] 1434 | }, 1435 | { 1436 | "output_type": "stream", 1437 | "stream": "stdout", 1438 | "text": [ 1439 | "\n", 1440 | "1 rows affected.\n", 1441 | "1 rows affected.\n", 1442 | "1 rows affected.\n", 1443 | "1 rows affected.\n", 1444 | "1 rows affected.\n", 1445 | "1 rows affected.\n", 1446 | "1 rows affected.\n", 1447 | "1 rows affected." 1448 | ] 1449 | }, 1450 | { 1451 | "output_type": "stream", 1452 | "stream": "stdout", 1453 | "text": [ 1454 | "\n", 1455 | "1 rows affected.\n", 1456 | "1 rows affected.\n", 1457 | "1 rows affected.\n", 1458 | "1 rows affected.\n", 1459 | "1 rows affected.\n", 1460 | "1 rows affected.\n", 1461 | "1 rows affected.\n", 1462 | "1 rows affected." 1463 | ] 1464 | }, 1465 | { 1466 | "output_type": "stream", 1467 | "stream": "stdout", 1468 | "text": [ 1469 | "\n", 1470 | "1 rows affected.\n", 1471 | "1 rows affected.\n", 1472 | "1 rows affected.\n", 1473 | "1 rows affected.\n", 1474 | "1 rows affected.\n", 1475 | "1 rows affected.\n", 1476 | "1 rows affected.\n", 1477 | "1 rows affected." 
1478 | ] 1479 | }, 1480 | { 1481 | "output_type": "stream", 1482 | "stream": "stdout", 1483 | "text": [ 1484 | "\n", 1485 | "1 rows affected.\n", 1486 | "1 rows affected.\n", 1487 | "1 rows affected.\n", 1488 | "1 rows affected.\n" 1489 | ] 1490 | } 1491 | ], 1492 | "prompt_number": 9 1493 | }, 1494 | { 1495 | "cell_type": "heading", 1496 | "level": 2, 1497 | "metadata": {}, 1498 | "source": [ 1499 | "Summary" 1500 | ] 1501 | }, 1502 | { 1503 | "cell_type": "markdown", 1504 | "metadata": {}, 1505 | "source": [ 1506 | "In this short exercise, we learned how to use Pandas built-in capabilities to quickly grab data from the web in an analyzable format. We now have data on:\n", 1507 | "\n", 1508 | "* World population, population growth, and rural/urban splits\n", 1509 | "* GDP and Per-capita GDP\n", 1510 | "* Sanitation, clean water, cellular and education data\n", 1511 | "\n", 1512 | "These data sets should help us to ask a number of questions about what influences flu rates around the world. " 1513 | ] 1514 | }, 1515 | { 1516 | "cell_type": "code", 1517 | "collapsed": false, 1518 | "input": [], 1519 | "language": "python", 1520 | "metadata": {}, 1521 | "outputs": [] 1522 | } 1523 | ], 1524 | "metadata": {} 1525 | } 1526 | ] 1527 | } -------------------------------------------------------------------------------- /notebooks/15 Clustering the News with Spark and MLlib.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "", 4 | "signature": "sha256:033999fe4f6eaa5796cb83f44c3efb5d8d4007d75dd6e35b6e33f7b8cec3ef37" 5 | }, 6 | "nbformat": 3, 7 | "nbformat_minor": 0, 8 | "worksheets": [ 9 | { 10 | "cells": [ 11 | { 12 | "cell_type": "heading", 13 | "level": 1, 14 | "metadata": {}, 15 | "source": [ 16 | "Clustering the News with Spark and MLLib" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "We've previously looked at using Spark for both the analysis of text and some machine learning tasks via the PySpark interface. Through this, we've learned about what words are important over time, and what articles are about. However, what if we wanted to understand what sort of categories the news breaks into? This might mean that we'd have to use both our text processing skills and some machine learning.\n", 24 | "\n", 25 | "In this lesson, we'll do just that: we'll use a simple unsupervised machine learning method, k-means clustering, to determine what broad categories the news fits into. To do this, we'll use Spark and it's MLLib libraries via the Scala programming language. This means the following notebook is **not** interactive. All of the commands can be copied into Spark's interactive Scala shell (launch it by typing `spark-shell`) or by building a standalone application.\n", 26 | "\n", 27 | "We'll discuss building and running a standalone app at the end of the lesson." 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "We'll begin with our imports. We'll need a few things: the MLLib classes that we require and the json4s package for parsing JSON in Scala." 
35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "collapsed": false, 40 | "input": [ 41 | "import org.json4s._\n", 42 | "import org.json4s.jackson.Serialization.{read,write}\n", 43 | "import org.apache.spark.rdd.RDD\n", 44 | "import org.apache.spark.mllib.clustering.KMeans\n", 45 | "import org.apache.spark.mllib.feature.Word2Vec\n", 46 | "import org.apache.spark.mllib.feature.Word2VecModel\n", 47 | "import org.apache.spark.mllib.linalg._" 48 | ], 49 | "language": "python", 50 | "metadata": {}, 51 | "outputs": [] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": {}, 56 | "source": [ 57 | "Before we parse our JSON, we're going to want a class to put it in. Rather than treating it like a Python dictionary, we're going to use a Scala case class. This lets us get a full Scala class with just a single-line declaration." 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "collapsed": false, 63 | "input": [ 64 | "case class NewsArticle(date : String, title : String, byline : String, fulltext : String)" 65 | ], 66 | "language": "python", 67 | "metadata": {}, 68 | "outputs": [] 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "metadata": {}, 73 | "source": [ 74 | "We need a number of helper functions for our lesson, so we'll define them here. Don't worry about what they do yet; we'll cover them in a moment." 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "collapsed": false, 80 | "input": [ 81 | "def sumArray (m: Array[Double], n: Array[Double]): Array[Double] = {\n", 82 | " for (i <- 0 until m.length) {m(i) += n(i)}\n", 83 | " return m\n", 84 | "}\n", 85 | "\n", 86 | "def divArray (m: Array[Double], divisor: Double) : Array[Double] = {\n", 87 | " for (i <- 0 until m.length) {m(i) /= divisor}\n", 88 | " return m\n", 89 | "}\n", 90 | "\n", 91 | "def wordToVector (w:String, m: Word2VecModel): Vector = {\n", 92 | " try {\n", 93 | " return m.transform(w)\n", 94 | " } catch {\n", 95 | " case e: Exception => return Vectors.zeros(100)\n", 96 | " } \n", 97 | "}" 98 | ], 99 | "language": "python", 100 | "metadata": {}, 101 | "outputs": [] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "metadata": {}, 106 | "source": [ 107 | "Now we're ready to get started analyzing data. Let's load up our news data using, as before, `sc.textFile`." 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "collapsed": false, 113 | "input": [ 114 | "val news_rdd = sc.textFile(\"hdfs://localhost:8020/user/oracle/flu_news\")" 115 | ], 116 | "language": "python", 117 | "metadata": {}, 118 | "outputs": [] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "metadata": {}, 123 | "source": [ 124 | "We need to parse the JSON data into objects, so, as with our PySpark work, we'll use the `map` function. However, we're using json4s' mechanisms. This means we'll use the `read` operation and provide it a *type* of `NewsArticle`. Unlike Python, Scala is a strongly-typed language. If the distinction is new to you, try to read up a bit on either Scala basics or on the importance of type to programming languages."
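(A quick aside for readers new to json4s: the `read[NewsArticle]` call in the next cell is easier to follow if you first try it on a single string. Below is a minimal, self-contained sketch you can paste into `spark-shell`; the sample record is made up for illustration and only mirrors the shape of the wikinews JSON.)

import org.json4s._
import org.json4s.jackson.Serialization.read

case class NewsArticle(date : String, title : String, byline : String, fulltext : String)

// json4s needs an implicit Formats value in scope to drive deserialization
implicit val formats = DefaultFormats

// A made-up record with the same fields as our NewsArticle case class
val sample = """{"date":"2014-01-15","title":"Flu season arrives early","byline":"Example Reporter","fulltext":"..."}"""

// read[T] parses the JSON string and returns a populated NewsArticle
val article = read[NewsArticle](sample)
println(article.title)   // prints: Flu season arrives early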
125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "collapsed": false, 130 | "input": [ 131 | "val news_json = news_rdd.map(record => {\n", 132 | " implicit val formats = DefaultFormats\n", 133 | " read[NewsArticle](record)\n", 134 | "})" 135 | ], 136 | "language": "python", 137 | "metadata": {}, 138 | "outputs": [] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": {}, 143 | "source": [ 144 | "We're planning to use [k-means](http://en.wikipedia.org/wiki/K-means_clustering) clustering to determine automatically which news articles belong to which clusters. However, we have to deal with a bit of an inconsistency first. K-means operates on numeric vectors (i.e., points in space), but we have words, not vectors in our articles. One way to treat this would be to compute TF-IDF for each article and treat that as a point in space. Each word would be a dimension, and each each TF-IDF score would be the value in that dimension.\n", 145 | "\n", 146 | "Ask yourself, how big would that vector be? Would it only include the dimensions for the words in the article? For the words in all articles? Maybe the vectors need to be as big as all the words in the English language!\n", 147 | "\n", 148 | "In fact, MLLib has a built-in TF-IDF transform which produces, by default, vectors that are 2^20 long. That's way too big to deal with in our virtual machine.\n", 149 | "\n", 150 | "What would happen if we did the following\n", 151 | "\n", 152 | "+ Made each article's vector include only the dimensions of the words in each title?\n", 153 | "+ Made each article's vector include the dimensions of only the words in all titles?\n", 154 | "+ Made each article's vector a dimensional reduction of all the words in all titles?\n", 155 | "\n", 156 | "For simplicity, we're only going to deal with the titles, as opposed to all the words in the articles. We're also not going to use TF-IDF, for reasons that will become apparent if you answer the questions above. Instead, we're going to rely on a method called [Word2Vec](https://code.google.com/p/word2vec/). Originated at Google, word2vec does a remarkably good job of transforming single words into reasonably-sized vectors. When generated from a large corpus, these vectors allow us to find synonyms with surprising accuracy.\n", 157 | "\n", 158 | "So, the first thing we'll need is a corpus of words. Let's make one from our titles." 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "collapsed": false, 164 | "input": [ 165 | "val news_titles = news_json.map(_.title.split(\" \").toSeq)\n", 166 | "val news_title_words = news_titles.flatMap(x => x).map(x => Seq(x))" 167 | ], 168 | "language": "python", 169 | "metadata": {}, 170 | "outputs": [] 171 | }, 172 | { 173 | "cell_type": "markdown", 174 | "metadata": {}, 175 | "source": [ 176 | "In order to find better synonyms, we should add more words to our corpus. Let's do just that by grabbing a sample from the `linewise_text_8` file included in `flu_news/data`." 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "collapsed": false, 182 | "input": [ 183 | "val w2v_input = sc.textFile(\"file:///home/oracle/odsb2014/flu_news/data/linewise_text_8\").sample(false, 0.25,2).map(x => Seq(x))\n", 184 | "val all_input = w2v_input ++ news_title_words" 185 | ], 186 | "language": "python", 187 | "metadata": {}, 188 | "outputs": [] 189 | }, 190 | { 191 | "cell_type": "markdown", 192 | "metadata": {}, 193 | "source": [ 194 | "Now we're ready to build a word2vec model from our corpus. 
Constructing this model using Spark is easy!" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "collapsed": false, 200 | "input": [ 201 | "val word2vec = new Word2Vec()\n", 202 | "val model = word2vec.fit(all_input)" 203 | ], 204 | "language": "python", 205 | "metadata": {}, 206 | "outputs": [] 207 | }, 208 | { 209 | "cell_type": "markdown", 210 | "metadata": {}, 211 | "source": [ 212 | "Now we've got a model which can compute synonyms, but we have another problem. Titles have many words, and word2vec only operates on one of two things: words or vectors. How can we find the synonyms for a whole title?!\n", 213 | "\n", 214 | "One of the interesting features of word2vec is that it displays reasonably good synonym prediction when the vectors for words are added together or subtracted. That is, `v(king) - v(man) ~= v(queen)`. Thus, we could rationalize that a title is just the average vector of all the words in the title. Let's give that a try.\n", 215 | "\n", 216 | "For this, we'll need to use a couple of our helper functions. \n", 217 | "\n", 218 | "+ Inside our Spark RDD's `map` operation, we're going to call Scala's `map` to apply the word2vec model to each word. \n", 219 | "+ That gives us a Sequence of Arrays for each title, which we need to\n", 220 | " * Sum up\n", 221 | " * Divide by the total number of words in the title\n", 222 | "+ The summing can be handled by using the `reduceLeft` Scala operator. Look at the helper function and see if you can determine what is happening.\n", 223 | "+ The dividing is taken care of by the `divArray` helper function.\n", 224 | "\n", 225 | "Once this is done, we have RDDs which contain the average vector for each title. We're ready to cluster!" 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "collapsed": false, 231 | "input": [ 232 | "val title_vectors = news_titles.map(x => new DenseVector(divArray(x.map(m => wordToVector(m, model).toArray).reduceLeft(sumArray),x.length)).asInstanceOf[Vector])\n", 233 | "\n", 234 | "val title_pairs = news_titles.map(x => (x,new DenseVector(divArray(x.map(m => wordToVector(m, model).toArray).reduceLeft(sumArray),x.length)).asInstanceOf[Vector]))" 235 | ], 236 | "language": "python", 237 | "metadata": {}, 238 | "outputs": [] 239 | }, 240 | { 241 | "cell_type": "markdown", 242 | "metadata": {}, 243 | "source": [ 244 | "As with word2vec, Spark's MLLib makes k-means clustering easy. All we need to do is specify the number of clusters and iterations." 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "collapsed": false, 250 | "input": [ 251 | "var numClusters = 100\n", 252 | "val numIterations = 25\n", 253 | "var clusters = KMeans.train(title_vectors, numClusters, numIterations)\n", 254 | "var wssse = clusters.computeCost(title_vectors)" 255 | ], 256 | "language": "python", 257 | "metadata": {}, 258 | "outputs": [] 259 | }, 260 | { 261 | "cell_type": "markdown", 262 | "metadata": {}, 263 | "source": [ 264 | "With our cluster model complete, we can assign article titles to clusters. We can also create RDDs for each of the cluster centers and produce words for their vectors (i.e., make titles for these purely numerical cluster centers)."
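(One aside before we assign titles to clusters: the cell above fixes `numClusters` at 100 and computes `wssse`, the within-set sum of squared errors, without doing anything further with it. A common heuristic for choosing the number of clusters is to train models for several values of k and look for the "elbow" where the error stops dropping quickly. The rough sketch below reuses `title_vectors` and `numIterations` from the cells above; the candidate values of k are arbitrary, and on the VM you may want to try fewer of them, since each value means another full k-means run.)

// Train one model per candidate k and print its within-set sum of squared errors.
// Smaller is better, but past the "elbow" additional clusters buy very little.
val candidateKs = Seq(10, 25, 50, 100, 200)
val costs = candidateKs.map { k =>
  val m = KMeans.train(title_vectors, k, numIterations)
  (k, m.computeCost(title_vectors))
}
costs.foreach { case (k, cost) => println(s"k = $k  WSSSE = $cost") }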
265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "collapsed": false, 270 | "input": [ 271 | "val article_membership = title_pairs.mapValues(x => clusters.predict(x))\n", 272 | "val cluster_centers = sc.parallelize(clusters.clusterCenters.zipWithIndex.map{ e => (e._2,e._1)})\n", 273 | "val cluster_topics = cluster_centers.mapValues(x => model.findSynonyms(x,5).map(x => x(0)))" 274 | ], 275 | "language": "python", 276 | "metadata": {}, 277 | "outputs": [] 278 | }, 279 | { 280 | "cell_type": "markdown", 281 | "metadata": {}, 282 | "source": [ 283 | "Taking a look at the cluster membership, we can see not everything is a perfect match. But on the whole more articles make sense in the cluster than do not. It seems we've done a reasonable job classifying the types of stories in the news." 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "collapsed": false, 289 | "input": [ 290 | "var sample_topic = cluster_topics.take(10)(6)\n", 291 | "println(sample_topic._2.mkString(\",\"))\n", 292 | "\n", 293 | "var sample_members = article_membership.filter(x => x._2 == 6).take(100)\n", 294 | "sample_members.foreach{x => println(x._1.mkString(\",\"))}" 295 | ], 296 | "language": "python", 297 | "metadata": {}, 298 | "outputs": [] 299 | }, 300 | { 301 | "cell_type": "markdown", 302 | "metadata": {}, 303 | "source": [ 304 | "Let's consider some of the output from a sample run of the method. We'll start by looking at the cluster categories that have small membership. In this example, cluster 15 was small, and had word2vec synonyms of \"rugby soccer baseball basketball hockey.\" Let's look at what was in its cluster:\n", 305 | "\n", 306 | "###rugby soccer baseball basketball hockey\n", 307 | "\n", 308 | "* Thunderbird wins MBA rugby tournament\n", 309 | "* Florida wins NCAA basketball championship\n", 310 | "* NHL rival teams fight during hockey game\n", 311 | "* Ice hockey\n", 312 | "* American football\n", 313 | "* Scottish football team Hibernian appoint new manager\n", 314 | "* Wheelchair basketball\n", 315 | "* Australia men's national wheelchair basketball team\n", 316 | "* Australia women's national wheelchair basketball team\n", 317 | "* Australian women's national wheelchair basketball team\n", 318 | "* Wheelchair rugby\n", 319 | "* Women's sports\n", 320 | "* Australian rules football\n", 321 | "* Australian football\n", 322 | "* Association football\n", 323 | "* Scotland national football team\n", 324 | "* Scottish national football team\n", 325 | "* Women's association football\n", 326 | "* Sledge hockey\n", 327 | "* Field hockey\n", 328 | "* Wheelchair curling\n", 329 | "* Japan women's national wheelchair basketball team\n", 330 | "* Germany women's national wheelchair basketball team\n", 331 | "* China women's national wheelchair basketball team\n", 332 | "* Canada women's national wheelchair basketball team\n", 333 | "* Rugby league" 334 | ] 335 | }, 336 | { 337 | "cell_type": "markdown", 338 | "metadata": {}, 339 | "source": [ 340 | "That's a very small, specific cluster. But what about larger clusters? Some of these are good fits, but others are not. 
Let's take a look at a sample of a cluster that appears to be about the Internet or Nations.\n", 341 | "\n", 342 | "### Internet Google Nations Manchester Africa\n", 343 | "* Colombia releases official notice in response to Venezuela\n", 344 | "* Spanish government to hold ISPs responsible for web content\n", 345 | "* Spanish government to enforce ISP's to censor web content\n", 346 | "* FBI places limitation on public viewing of files\n", 347 | "* Google releases test of mapping service\n", 348 | "* Google offers to help Wikipedia\n", 349 | "* ABC to move Internet news network back to U.S. TVs\n", 350 | "* ABC to move successful Internet news network to U.S. TVs\n", 351 | "* Separatists fail to stop re-opening of Kashmir bus service\n", 352 | "* German Wikipedia DVD on P2P networks\n", 353 | "* IBM and National Geographic to launch DNA database project" 354 | ] 355 | }, 356 | { 357 | "cell_type": "markdown", 358 | "metadata": {}, 359 | "source": [ 360 | "What about the flu in the news? One category we came up with was \"Security UK - H5N1 High,\" which suggests it has something to do with the flu. Let's look at some of the articles that belong to that cluster:\n", 361 | "\n", 362 | "* Premature aging disease reversed in cells\n", 363 | "* Robot Zoe finds life in Atacama Desert\n", 364 | "* Fire in Tema\n", 365 | "* Cure for cat allergies may be close\n", 366 | "* Talk-therapy can make a difference in early treatment of severe depression\n", 367 | "* Drug-resistant infections on the rise\n", 368 | "* Deadly virus samples missing in Mexico/Lebanon\n", 369 | "* Australian blitz on fish poaching\n", 370 | "* Partnership for a Drug-Free America study finds 1 in 5 teens abused prescription drugs\n", 371 | "* U.S. EPA submits 2003 greenhouse gas inventory to U.N.\n", 372 | "* United States begins testing equipment for demolition of a major VX nerve gas stockpile\n", 373 | "* Nuclear fuel leaks at Sellafield facility on Cumbrian coast\n", 374 | "* Red and processed meats linked to bowel cancer\n", 375 | "* No H5N1 virus found in blood tests of suspected human Bird Flu cluster\n", 376 | "* Swan in German zoo tests positive for H5N1 virus\n", 377 | "* Swan in German zoo tests positive for H5N1virus\n", 378 | "* American cyclist Floyd Landis tested positive for excessive levels of testosterone in second test\n", 379 | "* Vaccine targets obesity in rats\n", 380 | "* Suspected low pathogenic H5N1 Bird Flu virus found in the United States\n", 381 | "* Possible low pathogenic H5N1 Bird Flu virus discovered in the United States\n", 382 | "\n", 383 | "Many of the entries in the cluster have to do with national security, but many, like those above, deal with health and disease. Given that word2vec finds synonyms, it's possible that H5N1 maps very closely with other words about disease." 384 | ] 385 | }, 386 | { 387 | "cell_type": "heading", 388 | "level": 2, 389 | "metadata": {}, 390 | "source": [ 391 | "Building a Stand-Alone Application" 392 | ] 393 | }, 394 | { 395 | "cell_type": "markdown", 396 | "metadata": {}, 397 | "source": [ 398 | "To really get the most out of this, we need to build a stand-alone Spark application. To do this, we'll need to do a few things. We've provided the framework for the standalone application (and the code) in the `flu_news/news_clustering` directory.\n", 399 | "\n", 400 | "First, we'll need to set up a directory structure for the project.
You can see the directory structure here:" 401 | ] 402 | }, 403 | { 404 | "cell_type": "code", 405 | "collapsed": false, 406 | "input": [ 407 | "!ls -R ../flu_news/news_clustering/*" 408 | ], 409 | "language": "python", 410 | "metadata": {}, 411 | "outputs": [ 412 | { 413 | "output_type": "stream", 414 | "stream": "stdout", 415 | "text": [ 416 | "../flu_news/news_clustering/build.sbt\r\n", 417 | "\r\n", 418 | "../flu_news/news_clustering/project:\r\n", 419 | "build.properties\r\n", 420 | "\r\n", 421 | "../flu_news/news_clustering/src:\r\n", 422 | "\u001b[34mmain\u001b[m\u001b[m\r\n", 423 | "\r\n", 424 | "../flu_news/news_clustering/src/main:\r\n", 425 | "\u001b[34mscala\u001b[m\u001b[m\r\n", 426 | "\r\n", 427 | "../flu_news/news_clustering/src/main/scala:\r\n", 428 | "\u001b[34mcom\u001b[m\u001b[m\r\n", 429 | "\r\n", 430 | "../flu_news/news_clustering/src/main/scala/com:\r\n", 431 | "\u001b[34moracle\u001b[m\u001b[m\r\n", 432 | "\r\n", 433 | "../flu_news/news_clustering/src/main/scala/com/oracle:\r\n", 434 | "\u001b[34mnewscluster\u001b[m\u001b[m\r\n", 435 | "\r\n", 436 | "../flu_news/news_clustering/src/main/scala/com/oracle/newscluster:\r\n", 437 | "NewsClustering.scala\r\n" 438 | ] 439 | } 440 | ], 441 | "prompt_number": 1 442 | }, 443 | { 444 | "cell_type": "markdown", 445 | "metadata": {}, 446 | "source": [ 447 | "The application consists of three main parts:\n", 448 | "\n", 449 | "* The build.sbt file\n", 450 | "* The build.properties file\n", 451 | "* The source code (`NewsClustering.scala`)\n", 452 | "\n", 453 | "The `build.sbt` contains the library dependencies and build instructions for our application. The `build.properties` specifies the version of the `sbt` program we're using. Of course, all of the hard work is in `NewsClustering.scala`\n", 454 | "\n", 455 | "Once these pieces are setup, we simply need to change to the `news_clustering` directory, and build a JAR which contains our application. We build and package with the following command:" 456 | ] 457 | }, 458 | { 459 | "cell_type": "code", 460 | "collapsed": false, 461 | "input": [ 462 | "sbt package" 463 | ], 464 | "language": "python", 465 | "metadata": {}, 466 | "outputs": [] 467 | }, 468 | { 469 | "cell_type": "markdown", 470 | "metadata": {}, 471 | "source": [ 472 | "This gives us a JAR under the `target` directory. We can submit this to the spark cluster using the `spark-submit` command." 473 | ] 474 | }, 475 | { 476 | "cell_type": "code", 477 | "collapsed": false, 478 | "input": [ 479 | "spark-submit --class com.oracle.newscluster.NewsClustering target/scala-2.10/newsclustering_2.10-0.1.jar" 480 | ], 481 | "language": "python", 482 | "metadata": {}, 483 | "outputs": [] 484 | }, 485 | { 486 | "cell_type": "heading", 487 | "level": 2, 488 | "metadata": {}, 489 | "source": [ 490 | "Summary" 491 | ] 492 | }, 493 | { 494 | "cell_type": "markdown", 495 | "metadata": {}, 496 | "source": [ 497 | "In this exploration, we considered the question \"What is the news about?\" using machine learning. Along the way, we added some valuable skills to our Big Data toolkit. 
We learned to:\n", 498 | "\n", 499 | "* Use Spark MLlib's Word2Vec to turn raw text into feature vectors\n", 500 | "* Use Apache Spark's core API to perform simple algebra on those vectors\n", 501 | "* Use MLlib's k-means clustering to categorize text in an unsupervised fashion\n", 502 | "* Build and submit a standalone Spark application\n", 503 | "\n", 504 | "At this point, you should be more than ready to write other standalone Spark applications. What sort of applications could you build to apply machine learning to the term-frequency data from previous sections?" 505 | ] 506 | }, 507 | { 508 | "cell_type": "code", 509 | "collapsed": false, 510 | "input": [], 511 | "language": "python", 512 | "metadata": {}, 513 | "outputs": [] 514 | } 515 | ], 516 | "metadata": {} 517 | } 518 | ] 519 | } -------------------------------------------------------------------------------- /notebooks/Video Opportunities.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "", 4 | "signature": "sha256:b46ff24614ac36ce277e9e3f9e20f9bf1c864750c507c1907249c31e6d2bdebc" 5 | }, 6 | "nbformat": 3, 7 | "nbformat_minor": 0, 8 | "worksheets": [ 9 | { 10 | "cells": [ 11 | { 12 | "cell_type": "heading", 13 | "level": 2, 14 | "metadata": {}, 15 | "source": [ 16 | "Video Opportunities: each notebook 10-20 min of video" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "Introduction:\n", 24 | "\n", 25 | " - purpose of the course\n", 26 | " - summary of the story we'll tell\n", 27 | " - what we're doing, why, and how\n", 28 | " \n", 29 | " * we need to think carefully about defining an overarching question that will guide our course\n", 30 | " \n", 31 | " * we should also talk in overview about data frames, RDBMS, databases, Python, etc." 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": {}, 37 | "source": [ 38 | "Part I: Loading Data\n", 39 | "\n", 40 | " Q: What kind of data are we looking for?\n", 41 | " A: Flu statistics. Who is getting the flu, where, when and what kind?\n", 42 | " \n", 43 | " Q: Where might we find this data?\n", 44 | " A: CDC, WHO, HHS, OIE\n", 45 | " \n", 46 | "Notebook 1: Loading and Sharing Simple CSV Data\n", 47 | " - What data is in this file?\n", 48 | " -- mappings of states to their populations and flu surveillance regions, ????\n", 49 | " - How do we plan on using this data?\n", 50 | " - Overview of loading a CSV file\n", 51 | " -- what's the file type? what tools?\n", 52 | " -- process overview: import, parse file, convert, write to DB\n", 53 | " - Putting stuff into the DB (cursors, tables, etc.)\n", 54 | " \n", 55 | "Notebook 2: Loading Simple Delimited Data\n", 56 | " - What data is in this file?\n", 57 | " -- country-level data for influenza surveillance: what strain, how many samples, etc.\n", 58 | " - How are we going to split this data?\n", 59 | " - What does it mean to put the data into tables in the DB?\n", 60 | " - What's a view?\n", 61 | "\n", 62 | "Notebook 3: Loading HHS Flu Vaccination JSON Data\n", 63 | " - What data is in this file?\n", 64 | " - What is JSON? Why do people use it?\n", 65 | "\n", 66 | "Notebook 4: Gathering OIE Pathogenic Flu Data from the Web\n", 67 | " - What data is in this file?\n", 68 | " -- pathogenic strains of influenza in animals\n", 69 | " - Pythonic tools --> what are they?
why?\n", 70 | " - Extracting useful data from HTML pages\n", 71 | " - Dealing with inconsistencies in the data\n", 72 | "\n", 73 | " * are we interested in combining these first few notebooks into one video since they are simpler in content?\n", 74 | " \n" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "Part II: Analyzing and Visualizing Data with Databases\n", 82 | "\n", 83 | " Q: Now that we have some data, what's our next step?\n", 84 | " A: One next step is to use our data to give visual answers to our questions. In this next part, we'll ask four broad questions about the flu and use different data visualization tools to test hypotheses, and then visualize the results.\n", 85 | " \n", 86 | "Notebook 1: Does Ethnicity Impact Vaccination Rates?\n", 87 | " - What is hypothesis testing? Overview of formulating a testable hypothesis\n", 88 | " - T-tests (**** Is this one or two tailed?)\n", 89 | " - Conclusions --> only group that did not differ is African-American vs. Hispanic (**** Do we want to speculate on more conclusions about this?)\n", 90 | " \n", 91 | "Notebook 2: Do Vaccination Rates Impact Flu Rates?\n", 92 | " - Analytical SQL? Ordinary Least Squares regression models? Why?\n", 93 | " - What's statsmodel Python library? Why are we using it?\n", 94 | " - What manipulations do we need to do to the data to get it to something we can analyze?\n", 95 | " - Why should we \"start by taking a look at it?\" Shouldn't we have a hypothesis first?\n", 96 | " -- Using linear models when results aren't obvious visually\n", 97 | " - Why our linear model isn't working\n", 98 | " -- what is this \"print shot_to_sick_model.summary()\" business? nonrobust?\n", 99 | " - New model explanation; theories about why it's better; checking goodness of fit\n", 100 | " - Summary + food for thought\n", 101 | " \n", 102 | " * yikes to \"year-over-year change in flu is CAUSED\" ... we always learned to be about making claims about causation --> correlation does not imply causation\n", 103 | " \n", 104 | "Notebook 3: Does GDP explain flu rates?\n", 105 | " - What's the difference between per capita and total GDP? Why might wealthier countries ahve lower rates of infection?\n", 106 | " - What's a linear model and why is it a good way to answer this question? Correlation between GDP and flu rates? Whats a linear correlation? What's a linear regression model and why would knowing about linear correlation help us decide if it's an appropriate model?\n", 107 | " - Results of type-A and type-B regressions, next steps? \n", 108 | " - Speculations on minor effect of GDP on the flu?\n", 109 | " \n", 110 | "Notebook 4: Does Living in Cities Influence Flu Rates?\n", 111 | " * is it fair to generalize a country as \"largely urban\" or \"largely agrarian\"?\n", 112 | " - Explanation of analysis 'per capita'\n", 113 | " - Quick scatter plot to look for obvious relationships --> why do this before forming a hypothesis? How is poking around different from hypothesis testing? 
How are they related?\n", 114 | " - Results table: p value vs r-squared\n", 115 | " - Set of residual plots\n", 116 | " - \"Notice that as urban population percentage increases, the model explains fewer of the data points\" ** this conclusion needs more explanation, it's not clear\n", 117 | " - A follow-up approach: population and GDP \n", 118 | " - Two-factor linear models\n", 119 | " - Results explain almost 20% more of the data\n", 120 | " - Interaction plot\n", 121 | " \n", 122 | " * more rhetorical questions in the notebooks\n", 123 | " * factor interaction and residual plotting needs more explanation in general, it's not obvious\n", 124 | " \n", 125 | "Notebook 5: How Do Sanitation a\n", 126 | "nd Clean Water Effect Flu Transmission? \n", 127 | " - Thinking about other factors that effect our health: sanitation + clean water\n", 128 | " - Where/what data can we get to look at this?\n", 129 | " - Hypothesis: compare improvements in sanitation to flu cases\n", 130 | " - What does cellular phone access and primary education have to do with anything?\n", 131 | " - Modeling data in parallel\n", 132 | " - Are there noticable differences in sanitation across the WHO regions? \n", 133 | " - What are our factors of interest?\n", 134 | " - Follow-up: modeling the AFRO region\n", 135 | " - Handling sample size\n", 136 | " - Binary vs. linear effects; logistic regression model; testing and training sets\n", 137 | " - Actual classification of the training set? (the graph is confusing)\n", 138 | " - Predictions: **** this whole section of the notebook is confusing\n", 139 | " - Handling an inconclusive analysis\n", 140 | " " 141 | ] 142 | }, 143 | { 144 | "cell_type": "markdown", 145 | "metadata": {}, 146 | "source": [ 147 | "Part III: Analyzing Big Data with Apache Spark\n", 148 | "\n", 149 | "Introduction:\n", 150 | " - What is...\n", 151 | " - Big Data?\n", 152 | " - Apache Spark?\n", 153 | " - Apache Hadoop?\n", 154 | " \n", 155 | "Notebook 1: Basic Big Data Manipulation with PySpark\n", 156 | " - Loading data into a Hadoop file system\n", 157 | " - What's an RDD? Explain the Spark Context\n", 158 | " - What data are we going to work with now? What kinds of 'big data' would help us with our flu investigation?\n", 159 | " - What news stories mention the flu? What are the stories that mention the flu about? How could we go about finding out? What's wrong with our first-pass method?\n", 160 | " \n", 161 | "Notebook 2: Moving and Clustering Data with Sqoop and Spark\n", 162 | "\t- flu types a + b (http://www.cdc.gov/flu/about/viruses/types.htm)\n", 163 | "\t- integrating sql\n", 164 | "\t- deciding how many clusters with k means (within set sum of squared error, elbow in the graph, point of diminishing returns)\n", 165 | "\t- flatten_cluster_data\n", 166 | "\t- how good is the flu reporting? how large is the population?\n", 167 | " \n", 168 | "Notebook 3: Finding Important Words with Spark\n", 169 | " - Guiding questions: How do we figure out what articles are \"about\"? How important is news about the flu over time?\n", 170 | " - TF-IDF transformation\n", 171 | " - Cleaning and normalizing the text\n", 172 | " - PairRDD\n", 173 | " - Scale term frequency by how common that word is across all docs\n", 174 | " - Reframing the problem: Why would we want to know about importance over time? 
Is there a flu season?\n", 175 | " - Incorporating all the data\n", 176 | " - Real trends\n", 177 | "\n", 178 | " * Is it robust to consider words inside of date ranges without considering them inside of articles?\n", 179 | " \n", 180 | " * I think this notebook needs more explanation\n", 181 | " \n", 182 | "Notebook 4: Building a Trend Search with Big Data and Big Data SQL\n", 183 | " - Preliminary question unclear -- does the flu impact the news?\n", 184 | " - Pairing term-trend data with statistics data\n", 185 | " - Big Data SQL, what is it and how is it different from what we just did with batch-based copies in Hadoop\n", 186 | " - First-pass comparison\n", 187 | " - No correlated rise between H5N1 and countries with the most outbreaks --> what's another approach?\n", 188 | " - The SQL 'like' operator\n", 189 | " - Summary, what is causing what here? \n", 190 | " \n", 191 | " * possible error in [36]? should there be a graph here?\n", 192 | " * it's a little fuzzy in the logic, may need more explanation step by step\n", 193 | " * what is the Out[40] graph?\n", 194 | " \n", 195 | "Notebok 5: Clustering the news with Spark and MLLib\n", 196 | " - Categorizing the news: text processing + machine learning\n", 197 | " - k-means clustering, brief overview of supervised vs. unsupervised machine learning techniques, importance of clustering\n", 198 | " - Scala Case Class?\n", 199 | " - JSON4s? Read operation with type\n", 200 | " - How to get numeric vectors out of words (word2vec)\n", 201 | " - Building a stand-alone application\n", 202 | " - Follow-up questions, summary of machine learning techniques with Big Data\n", 203 | " \n", 204 | " * this notebook needs graphs??\n", 205 | " * why randomly talk about a stand-alone application here? this should be a separate, supplementary notebook with more meat or more suggestions for future work\n", 206 | " * the material in this notebook is already difficult, we probably won't have time to cover the stand-alone part sufficiently\n", 207 | " \n", 208 | "Notebook 6: Collecting Streaming News with Flume and Spark Streaming\n", 209 | " - How do we keep up with news happening in real time? \n", 210 | " - Explanation of a \"stream\" of data\n", 211 | " - End-to-end stream processing system, Reuters news wire\n", 212 | " - Collecting RSS data in python\n", 213 | " - Apache Flume -- distributed data transfer system\n", 214 | " - Spark streaming, spark applications, value of processing incoming data in real time (how is this different from analyzing historical data?)\n", 215 | " \n", 216 | " - E-Tag data, high-water mark for news we've seen\n", 217 | " - Source, channel, sink\n", 218 | " - Using Spark Streaming to Search Streams\n", 219 | " - processs the JSON records, keep track of how many articles we've seen, and write any articles about the flu to disk for later use\n", 220 | " - If we've found flu data, we want to write it to HDFS --> how do we know we've found flu data?\n", 221 | " \n", 222 | " - Providing libraries in a JAR file\n", 223 | " - Summary of end-to-end pipeline that can monitor the news\n", 224 | " - Follow-up projects: Add some of the machine learning approaches we applied to clustering the news to the streaming case. Could we detect a flu outbreak as it's happening?\n", 225 | " \n", 226 | " * should we cut down the output In[52]?" 
227 | ] 228 | }, 229 | { 230 | "cell_type": "markdown", 231 | "metadata": {}, 232 | "source": [ 233 | "Conclusion:\n", 234 | " - Restate purpose of the workshop\n", 235 | " - Who we are\n", 236 | " - Where to learn more, how to ask questions, social media, etc.\n", 237 | "\n", 238 | " * How are we going to handle ours/Oracle's affiliation with this project?" 239 | ] 240 | }, 241 | { 242 | "cell_type": "markdown", 243 | "metadata": {}, 244 | "source": [ 245 | "FAQ's\n", 246 | " - Should we have a list of all the developer/data tools we use with links to their documentation ('for more information')\n", 247 | " - How to contact us/report bugs\n", 248 | " - Where'd we get our data?\n", 249 | " - Suggested follow-up projects\n", 250 | " - Github repo, video links" 251 | ] 252 | } 253 | ], 254 | "metadata": {} 255 | } 256 | ] 257 | } -------------------------------------------------------------------------------- /setup/00-pyspark-setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | spark_home = os.environ.get('SPARK_HOME', None) 5 | if not spark_home: 6 | raise ValueError('SPARK_HOME environment variable is not set') 7 | sys.path.insert(0, os.path.join(spark_home, 'python')) 8 | sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.8.2.1-src.zip')) 9 | execfile(os.path.join(spark_home, 'python/pyspark/shell.py')) 10 | -------------------------------------------------------------------------------- /setup/data_science_bootcamp_setup.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | #we need a newer version of numpy, so we'll get it from pip 4 | echo "Fixing setuptools" 5 | sudo -E yum remove numpy python-setuptools 6 | #install the necessary libraries 7 | echo "installing BLAS and LAPACK" 8 | sudo -E yum install blas blas-devel lapack lapack-devel 9 | wget --no-check-certificate https://bootstrap.pypa.io/ez_setup.py 10 | sudo -E python ez_setup.py --insecure 11 | sudo -E easy_install nose 12 | sudo yum install impala-shell 13 | #fix pip and easy_install 14 | echo "Installing pip" 15 | sudo -E easy_install pip 16 | sudo -E easy_install -U distribute 17 | #pip install all our basic python modules 18 | echo "installing numpy" 19 | sudo -E easy_install numpy 20 | echo "installing scipy" 21 | sudo -E easy_install scipy 22 | echo "installing pandas" 23 | sudo -E pip install pandas 24 | echo "installing cx_Oracle and SQLAlchemy" 25 | sudo -E pip install cx_Oracle 26 | sudo -E pip install SQLAlchemy 27 | echo "installing pandasql vincent and seaborn" 28 | sudo -E pip install pandasql vincent seaborn 29 | echo "installing bs4 requests and feedparser" 30 | sudo -E pip install beautifulsoup4 requests feedparser 31 | echo "installing statsmodels and scikit-learn" 32 | sudo -E pip install statsmodels scikit-learn 33 | #upgrade spark to a reasonable version 34 | echo "installing spark 1.2" 35 | sudo -E yum install spark-core spark-master spark-worker spark-history-server spark-python 36 | #python 2.6 requires ipython 1.x, so we need to git clone 37 | echo "installing ipython" 38 | git clone https://github.com/ipython/ipython.git 39 | cd ipython 40 | git checkout 1.x 41 | git pull origin 1.x 42 | sudo -E python setup.py install 43 | sudo -E pip install pyzmq jinja2 tornado ipython-sql 44 | #set up the pyspark profile 45 | echo "installing pyspark profile for ipython" 46 | ipython profile create pyspark 47 | cp ipython_notebook_config_spark.py 
$HOME/.config/ipython/profile_pyspark/ipython_notebook_config.py 48 | cp 00-pyspark-setup.py $HOME/.config/ipython/profile_pyspark/startup/ 49 | echo "installing SBT" 50 | #install sbt 51 | wget -O sbt-0.13.7.rpm https://dl.bintray.com/sbt/rpm/sbt-0.13.7.rpm 52 | sudo -E yum localinstall sbt-0.13.7.rpm 53 | #run the get-data scripts 54 | ./download_data.sh 55 | #run the database setup script 56 | cat fludb.sql | sqlplus sys/welcome1 as sysdba 57 | #install rvm and rubies 58 | echo "installing RVM and Ruby" 59 | gpg2 --keyserver hkp://keys.gnupg.net --recv-keys D39DC0E3 60 | \curl -sSL https://get.rvm.io | bash -s stable 61 | source /home/oracle/.rvm/scripts/rvm 62 | rvm install jruby 63 | echo "setting environment and loading data" 64 | echo "export SPARK_HOME=/usr/lib/spark" >> ~/.bashrc 65 | #finished -------------------------------------------------------------------------------- /setup/download_data.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | #run the get-data scripts 4 | cd ../flu_statistics 5 | ./get_flu_summary_data.sh 6 | cd ../flu_news 7 | ./get_news_data.sh 8 | cd ../setup -------------------------------------------------------------------------------- /setup/fludb.sql: -------------------------------------------------------------------------------- 1 | prompt >> Starting up 2 | startup 3 | prompt >> Dropping fludb user 4 | drop user fludb cascade; 5 | prompt >> Creating fludb... 6 | 7 | prompt >> Creating tablespace 8 | CREATE TABLESPACE FLUDB DATAFILE 'fludb.dbf' SIZE 1G reuse AUTOEXTEND ON nologging; 9 | 10 | prompt >> Creating user 11 | CREATE USER fludb IDENTIFIED BY flushot 12 | DEFAULT TABLESPACE FLUDB 13 | QUOTA UNLIMITED ON FLUDB; 14 | 15 | prompt >> Assiging privileges 16 | grant dba to fludb; 17 | grant ALTER ANY PROCEDURE to fludb; 18 | grant ALTER SYSTEM to fludb; 19 | grant CREATE ANY PROCEDURE to fludb; 20 | grant CREATE PROCEDURE to fludb; 21 | grant CREATE TABLE to fludb; 22 | grant DEBUG ANY PROCEDURE to fludb; 23 | grant DEBUG CONNECT SESSION to fludb; 24 | grant EXECUTE ANY PROCEDURE to fludb; 25 | grant UNLIMITED TABLESPACE to fludb; 26 | 27 | prompt >> creating staging directory 28 | !mkdir /home/oracle/fludb_staging 29 | create or replace directory fludb_directory as '/home/oracle/fludb_staging'; 30 | grant all on directory fludb_directory to fludb; 31 | quit 32 | -------------------------------------------------------------------------------- /setup/ipython_notebook_config_spark.py: -------------------------------------------------------------------------------- 1 | # Configuration file for ipython-notebook. 2 | 3 | c = get_config() 4 | 5 | #------------------------------------------------------------------------------ 6 | # NotebookApp configuration 7 | #------------------------------------------------------------------------------ 8 | 9 | # NotebookApp will inherit config from: BaseIPythonApplication, Application 10 | 11 | # The url for MathJax.js. 12 | # c.NotebookApp.mathjax_url = '' 13 | 14 | # The IP address the notebook server will listen on. 15 | # c.NotebookApp.ip = '127.0.0.1' 16 | 17 | # The base URL for the notebook server. 18 | # 19 | # Leading and trailing slashes can be omitted, and will automatically be added. 20 | # c.NotebookApp.base_project_url = '/' 21 | 22 | # Create a massive crash report when IPython encounters what may be an internal 23 | # error. 
The default is to append a short message to the usual traceback 24 | # c.NotebookApp.verbose_crash = False 25 | 26 | # The random bytes used to secure cookies. By default this is a new random 27 | # number every time you start the Notebook. Set it to a value in a config file 28 | # to enable logins to persist across server sessions. 29 | # 30 | # Note: Cookie secrets should be kept private, do not share config files with 31 | # cookie_secret stored in plaintext (you can read the value from a file). 32 | # c.NotebookApp.cookie_secret = '' 33 | 34 | # The number of additional ports to try if the specified port is not available. 35 | # c.NotebookApp.port_retries = 50 36 | 37 | # Whether to open in a browser after starting. The specific browser used is 38 | # platform dependent and determined by the python standard library `webbrowser` 39 | # module, unless it is overridden using the --browser (NotebookApp.browser) 40 | # configuration option. 41 | # c.NotebookApp.open_browser = True 42 | 43 | # The notebook manager class to use. 44 | # c.NotebookApp.notebook_manager_class = 'IPython.html.services.notebooks.filenbmanager.FileNotebookManager' 45 | 46 | # The date format used by logging formatters for %(asctime)s 47 | # c.NotebookApp.log_datefmt = '%Y-%m-%d %H:%M:%S' 48 | 49 | # The base URL for the kernel server 50 | # 51 | # Leading and trailing slashes can be omitted, and will automatically be added. 52 | # c.NotebookApp.base_kernel_url = '/' 53 | 54 | # The port the notebook server will listen on. 55 | c.NotebookApp.port = 8880 56 | 57 | # Whether to overwrite existing config files when copying 58 | # c.NotebookApp.overwrite = False 59 | 60 | # Whether to enable MathJax for typesetting math/TeX 61 | # 62 | # MathJax is the javascript library IPython uses to render math/LaTeX. It is 63 | # very large, so you may want to disable it if you have a slow internet 64 | # connection, or for offline use of the notebook. 65 | # 66 | # When disabled, equations etc. will appear as their untransformed TeX source. 67 | # c.NotebookApp.enable_mathjax = True 68 | 69 | # The full path to an SSL/TLS certificate file. 70 | # c.NotebookApp.certfile = u'' 71 | 72 | # Path to an extra config file to load. 73 | # 74 | # If specified, load this config file in addition to any other IPython config. 75 | # c.NotebookApp.extra_config_file = u'' 76 | 77 | # The IPython profile to use. 78 | # c.NotebookApp.profile = u'default' 79 | 80 | # The base URL for the websocket server, if it differs from the HTTP server 81 | # (hint: it almost certainly doesn't). 82 | # 83 | # Should be in the form of an HTTP origin: ws[s]://hostname[:port] 84 | # c.NotebookApp.websocket_url = '' 85 | 86 | # The name of the IPython directory. This directory is used for logging 87 | # configuration (through profiles), history storage, etc. The default is usually 88 | # $HOME/.ipython. This options can also be specified through the environment 89 | # variable IPYTHONDIR. 90 | # c.NotebookApp.ipython_dir = u'/home/oracle/.config/ipython' 91 | 92 | # Set the log level by value or name. 93 | # c.NotebookApp.log_level = 30 94 | 95 | # Hashed password to use for web authentication. 96 | # 97 | # To generate, type in a python/IPython shell: 98 | # 99 | # from IPython.lib import passwd; passwd() 100 | # 101 | # The string should be of the form type:salt:hashed-password. 
102 | # c.NotebookApp.password = u'' 103 | 104 | # The Logging format template 105 | # c.NotebookApp.log_format = '[%(name)s]%(highlevel)s %(message)s' 106 | 107 | # Wether to use Browser Side less-css parsing instead of compiled css version in 108 | # templates that allows it. This is mainly convenient when working on the less 109 | # file to avoid a build step, or if user want to overwrite some of the less 110 | # variables without having to recompile everything. 111 | # 112 | # You will need to install the less.js component in the static directory either 113 | # in the source tree or in your profile folder. 114 | # c.NotebookApp.use_less = False 115 | 116 | # Extra paths to search for serving static files. 117 | # 118 | # This allows adding javascript/css to be available from the notebook server 119 | # machine, or overriding individual files in the IPython 120 | # c.NotebookApp.extra_static_paths = [] 121 | 122 | # Whether to trust or not X-Scheme/X-Forwarded-Proto and X-Real-Ip/X-Forwarded- 123 | # For headerssent by the upstream reverse proxy. Neccesary if the proxy handles 124 | # SSL 125 | # c.NotebookApp.trust_xheaders = False 126 | 127 | # Whether to install the default config files into the profile dir. If a new 128 | # profile is being created, and IPython contains config files for that profile, 129 | # then they will be staged into the new directory. Otherwise, default config 130 | # files will be automatically generated. 131 | # c.NotebookApp.copy_config_files = False 132 | 133 | # The full path to a private key file for usage with SSL/TLS. 134 | # c.NotebookApp.keyfile = u'' 135 | 136 | # Supply overrides for the tornado.web.Application that the IPython notebook 137 | # uses. 138 | # c.NotebookApp.webapp_settings = {} 139 | 140 | # Specify what command to use to invoke a web browser when opening the notebook. 141 | # If not specified, the default browser will be determined by the `webbrowser` 142 | # standard library module, which allows setting of the BROWSER environment 143 | # variable to override it. 144 | # c.NotebookApp.browser = u'' 145 | 146 | #------------------------------------------------------------------------------ 147 | # IPKernelApp configuration 148 | #------------------------------------------------------------------------------ 149 | 150 | # IPython: an enhanced interactive Python shell. 151 | 152 | # IPKernelApp will inherit config from: BaseIPythonApplication, Application, 153 | # InteractiveShellApp 154 | 155 | # The importstring for the DisplayHook factory 156 | # c.IPKernelApp.displayhook_class = 'IPython.kernel.zmq.displayhook.ZMQDisplayHook' 157 | 158 | # Set the IP or interface on which the kernel will listen. 159 | # c.IPKernelApp.ip = u'' 160 | 161 | # Pre-load matplotlib and numpy for interactive use, selecting a particular 162 | # matplotlib backend and loop integration. 163 | # c.IPKernelApp.pylab = None 164 | 165 | # Create a massive crash report when IPython encounters what may be an internal 166 | # error. The default is to append a short message to the usual traceback 167 | # c.IPKernelApp.verbose_crash = False 168 | 169 | # The Kernel subclass to be used. 170 | # 171 | # This should allow easy re-use of the IPKernelApp entry point to configure and 172 | # launch kernels other than IPython's own. 173 | # c.IPKernelApp.kernel_class = 'IPython.kernel.zmq.ipkernel.Kernel' 174 | 175 | # Run the module as a script. 
176 | # c.IPKernelApp.module_to_run = '' 177 | 178 | # The date format used by logging formatters for %(asctime)s 179 | # c.IPKernelApp.log_datefmt = '%Y-%m-%d %H:%M:%S' 180 | 181 | # set the shell (ROUTER) port [default: random] 182 | # c.IPKernelApp.shell_port = 0 183 | 184 | # set the control (ROUTER) port [default: random] 185 | # c.IPKernelApp.control_port = 0 186 | 187 | # Whether to overwrite existing config files when copying 188 | # c.IPKernelApp.overwrite = False 189 | 190 | # Execute the given command string. 191 | # c.IPKernelApp.code_to_run = '' 192 | 193 | # set the stdin (ROUTER) port [default: random] 194 | # c.IPKernelApp.stdin_port = 0 195 | 196 | # Set the log level by value or name. 197 | # c.IPKernelApp.log_level = 30 198 | 199 | # lines of code to run at IPython startup. 200 | # c.IPKernelApp.exec_lines = [] 201 | 202 | # Path to an extra config file to load. 203 | # 204 | # If specified, load this config file in addition to any other IPython config. 205 | # c.IPKernelApp.extra_config_file = u'' 206 | 207 | # The importstring for the OutStream factory 208 | # c.IPKernelApp.outstream_class = 'IPython.kernel.zmq.iostream.OutStream' 209 | 210 | # Whether to create profile dir if it doesn't exist 211 | # c.IPKernelApp.auto_create = False 212 | 213 | # set the heartbeat port [default: random] 214 | # c.IPKernelApp.hb_port = 0 215 | 216 | # 217 | # c.IPKernelApp.transport = 'tcp' 218 | 219 | # redirect stdout to the null device 220 | # c.IPKernelApp.no_stdout = False 221 | 222 | # dotted module name of an IPython extension to load. 223 | # c.IPKernelApp.extra_extension = '' 224 | 225 | # A file to be run 226 | # c.IPKernelApp.file_to_run = '' 227 | 228 | # The IPython profile to use. 229 | # c.IPKernelApp.profile = u'default' 230 | 231 | # 232 | # c.IPKernelApp.parent_appname = u'' 233 | 234 | # kill this process if its parent dies. On Windows, the argument specifies the 235 | # HANDLE of the parent process, otherwise it is simply boolean. 236 | # c.IPKernelApp.parent_handle = 0 237 | 238 | # JSON file in which to store connection info [default: kernel-.json] 239 | # 240 | # This file will contain the IP, ports, and authentication key needed to connect 241 | # clients to this kernel. By default, this file will be created in the security 242 | # dir of the current profile, but can be specified by absolute path. 243 | # c.IPKernelApp.connection_file = '' 244 | 245 | # If true, IPython will populate the user namespace with numpy, pylab, etc. and 246 | # an 'import *' is done from numpy and pylab, when using pylab mode. 247 | # 248 | # When False, pylab mode should not import any names into the user namespace. 249 | # c.IPKernelApp.pylab_import_all = True 250 | 251 | # The name of the IPython directory. This directory is used for logging 252 | # configuration (through profiles), history storage, etc. The default is usually 253 | # $HOME/.ipython. This options can also be specified through the environment 254 | # variable IPYTHONDIR. 255 | # c.IPKernelApp.ipython_dir = u'/home/oracle/.config/ipython' 256 | 257 | # Configure matplotlib for interactive use with the default matplotlib backend. 258 | # c.IPKernelApp.matplotlib = None 259 | 260 | # ONLY USED ON WINDOWS Interrupt this process when the parent is signaled. 261 | # c.IPKernelApp.interrupt = 0 262 | 263 | # Whether to install the default config files into the profile dir. If a new 264 | # profile is being created, and IPython contains config files for that profile, 265 | # then they will be staged into the new directory. 
Otherwise, default config 266 | # files will be automatically generated. 267 | # c.IPKernelApp.copy_config_files = False 268 | 269 | # List of files to run at IPython startup. 270 | # c.IPKernelApp.exec_files = [] 271 | 272 | # Enable GUI event loop integration with any of ('glut', 'gtk', 'gtk3', 'none', 273 | # 'osx', 'pyglet', 'qt', 'qt4', 'tk', 'wx'). 274 | # c.IPKernelApp.gui = None 275 | 276 | # A list of dotted module names of IPython extensions to load. 277 | # c.IPKernelApp.extensions = [] 278 | 279 | # redirect stderr to the null device 280 | # c.IPKernelApp.no_stderr = False 281 | 282 | # The Logging format template 283 | # c.IPKernelApp.log_format = '[%(name)s]%(highlevel)s %(message)s' 284 | 285 | # set the iopub (PUB) port [default: random] 286 | # c.IPKernelApp.iopub_port = 0 287 | 288 | #------------------------------------------------------------------------------ 289 | # ZMQInteractiveShell configuration 290 | #------------------------------------------------------------------------------ 291 | 292 | # A subclass of InteractiveShell for ZMQ. 293 | 294 | # ZMQInteractiveShell will inherit config from: InteractiveShell 295 | 296 | # Use colors for displaying information about objects. Because this information 297 | # is passed through a pager (like 'less'), and some pagers get confused with 298 | # color codes, this capability can be turned off. 299 | # c.ZMQInteractiveShell.color_info = True 300 | 301 | # A list of ast.NodeTransformer subclass instances, which will be applied to 302 | # user input before code is run. 303 | # c.ZMQInteractiveShell.ast_transformers = [] 304 | 305 | # 306 | # c.ZMQInteractiveShell.history_length = 10000 307 | 308 | # Don't call post-execute functions that have failed in the past. 309 | # c.ZMQInteractiveShell.disable_failing_post_execute = False 310 | 311 | # Show rewritten input, e.g. for autocall. 312 | # c.ZMQInteractiveShell.show_rewritten_input = True 313 | 314 | # Set the color scheme (NoColor, Linux, or LightBG). 315 | # c.ZMQInteractiveShell.colors = 'Linux' 316 | 317 | # 318 | # c.ZMQInteractiveShell.separate_in = '\n' 319 | 320 | # Deprecated, use PromptManager.in2_template 321 | # c.ZMQInteractiveShell.prompt_in2 = ' .\\D.: ' 322 | 323 | # 324 | # c.ZMQInteractiveShell.separate_out = '' 325 | 326 | # Deprecated, use PromptManager.in_template 327 | # c.ZMQInteractiveShell.prompt_in1 = 'In [\\#]: ' 328 | 329 | # Enable deep (recursive) reloading by default. IPython can use the deep_reload 330 | # module which reloads changes in modules recursively (it replaces the reload() 331 | # function, so you don't need to change anything to use it). deep_reload() 332 | # forces a full reload of modules whose code may have changed, which the default 333 | # reload() function does not. When deep_reload is off, IPython will use the 334 | # normal reload(), but deep_reload will still be available as dreload(). 335 | # c.ZMQInteractiveShell.deep_reload = False 336 | 337 | # Make IPython automatically call any callable object even if you didn't type 338 | # explicit parentheses. For example, 'str 43' becomes 'str(43)' automatically. 339 | # The value can be '0' to disable the feature, '1' for 'smart' autocall, where 340 | # it is not applied if there are no more arguments on the line, and '2' for 341 | # 'full' autocall, where all callable objects are automatically called (even if 342 | # no arguments are present). 
343 | # c.ZMQInteractiveShell.autocall = 0 344 | 345 | # 346 | # c.ZMQInteractiveShell.separate_out2 = '' 347 | 348 | # Deprecated, use PromptManager.justify 349 | # c.ZMQInteractiveShell.prompts_pad_left = True 350 | 351 | # 352 | # c.ZMQInteractiveShell.readline_parse_and_bind = ['tab: complete', '"\\C-l": clear-screen', 'set show-all-if-ambiguous on', '"\\C-o": tab-insert', '"\\C-r": reverse-search-history', '"\\C-s": forward-search-history', '"\\C-p": history-search-backward', '"\\C-n": history-search-forward', '"\\e[A": history-search-backward', '"\\e[B": history-search-forward', '"\\C-k": kill-line', '"\\C-u": unix-line-discard'] 353 | 354 | # Enable magic commands to be called without the leading %. 355 | # c.ZMQInteractiveShell.automagic = True 356 | 357 | # 358 | # c.ZMQInteractiveShell.debug = False 359 | 360 | # 361 | # c.ZMQInteractiveShell.object_info_string_level = 0 362 | 363 | # 364 | # c.ZMQInteractiveShell.ipython_dir = '' 365 | 366 | # 367 | # c.ZMQInteractiveShell.readline_remove_delims = '-/~' 368 | 369 | # Start logging to the default log file. 370 | # c.ZMQInteractiveShell.logstart = False 371 | 372 | # The name of the logfile to use. 373 | # c.ZMQInteractiveShell.logfile = '' 374 | 375 | # 376 | # c.ZMQInteractiveShell.wildcards_case_sensitive = True 377 | 378 | # Save multi-line entries as one entry in readline history 379 | # c.ZMQInteractiveShell.multiline_history = True 380 | 381 | # Start logging to the given file in append mode. 382 | # c.ZMQInteractiveShell.logappend = '' 383 | 384 | # 385 | # c.ZMQInteractiveShell.xmode = 'Context' 386 | 387 | # 388 | # c.ZMQInteractiveShell.quiet = False 389 | 390 | # Deprecated, use PromptManager.out_template 391 | # c.ZMQInteractiveShell.prompt_out = 'Out[\\#]: ' 392 | 393 | # Set the size of the output cache. The default is 1000, you can change it 394 | # permanently in your config file. Setting it to 0 completely disables the 395 | # caching system, and the minimum value accepted is 20 (if you provide a value 396 | # less than 20, it is reset to 0 and a warning is issued). This limit is 397 | # defined because otherwise you'll spend more time re-flushing a too small cache 398 | # than working 399 | # c.ZMQInteractiveShell.cache_size = 1000 400 | 401 | # 'all', 'last', 'last_expr' or 'none', specifying which nodes should be run 402 | # interactively (displaying output from expressions). 403 | # c.ZMQInteractiveShell.ast_node_interactivity = 'last_expr' 404 | 405 | # Automatically call the pdb debugger after every exception. 406 | # c.ZMQInteractiveShell.pdb = False 407 | 408 | #------------------------------------------------------------------------------ 409 | # KernelManager configuration 410 | #------------------------------------------------------------------------------ 411 | 412 | # Manages a single kernel in a subprocess on this host. 413 | # 414 | # This version starts kernels with Popen. 415 | 416 | # KernelManager will inherit config from: ConnectionFileMixin 417 | 418 | # The Popen Command to launch the kernel. Override this if you have a custom 419 | # c.KernelManager.kernel_cmd = [] 420 | 421 | # Set the kernel's IP address [default localhost]. If the IP address is 422 | # something other than localhost, then Consoles on other machines will be able 423 | # to connect to the Kernel, so be careful! 424 | # c.KernelManager.ip = '127.0.0.1' 425 | 426 | # 427 | # c.KernelManager.transport = 'tcp' 428 | 429 | # Should we autorestart the kernel if it dies. 
430 | # c.KernelManager.autorestart = False 431 | 432 | #------------------------------------------------------------------------------ 433 | # ProfileDir configuration 434 | #------------------------------------------------------------------------------ 435 | 436 | # An object to manage the profile directory and its resources. 437 | # 438 | # The profile directory is used by all IPython applications, to manage 439 | # configuration, logging and security. 440 | # 441 | # This object knows how to find, create and manage these directories. This 442 | # should be used by any code that wants to handle profiles. 443 | 444 | # Set the profile location directly. This overrides the logic used by the 445 | # `profile` option. 446 | # c.ProfileDir.location = u'' 447 | 448 | #------------------------------------------------------------------------------ 449 | # Session configuration 450 | #------------------------------------------------------------------------------ 451 | 452 | # Object for handling serialization and sending of messages. 453 | # 454 | # The Session object handles building messages and sending them with ZMQ sockets 455 | # or ZMQStream objects. Objects can communicate with each other over the 456 | # network via Session objects, and only need to work with the dict-based IPython 457 | # message spec. The Session will handle serialization/deserialization, security, 458 | # and metadata. 459 | # 460 | # Sessions support configurable serialiization via packer/unpacker traits, and 461 | # signing with HMAC digests via the key/keyfile traits. 462 | # 463 | # Parameters ---------- 464 | # 465 | # debug : bool 466 | # whether to trigger extra debugging statements 467 | # packer/unpacker : str : 'json', 'pickle' or import_string 468 | # importstrings for methods to serialize message parts. If just 469 | # 'json' or 'pickle', predefined JSON and pickle packers will be used. 470 | # Otherwise, the entire importstring must be used. 471 | # 472 | # The functions must accept at least valid JSON input, and output *bytes*. 473 | # 474 | # For example, to use msgpack: 475 | # packer = 'msgpack.packb', unpacker='msgpack.unpackb' 476 | # pack/unpack : callables 477 | # You can also set the pack/unpack callables for serialization directly. 478 | # session : bytes 479 | # the ID of this Session object. The default is to generate a new UUID. 480 | # username : unicode 481 | # username added to message headers. The default is to ask the OS. 482 | # key : bytes 483 | # The key used to initialize an HMAC signature. If unset, messages 484 | # will not be signed or checked. 485 | # keyfile : filepath 486 | # The file containing a key. If this is set, `key` will be initialized 487 | # to the contents of the file. 488 | 489 | # Username for the Session. Default is your system username. 490 | # c.Session.username = u'oracle' 491 | 492 | # The name of the unpacker for unserializing messages. Only used with custom 493 | # functions for `packer`. 494 | # c.Session.unpacker = 'json' 495 | 496 | # Threshold (in bytes) beyond which a buffer should be sent without copying. 497 | # c.Session.copy_threshold = 65536 498 | 499 | # The name of the packer for serializing messages. Should be one of 'json', 500 | # 'pickle', or an import name for a custom callable serializer. 501 | # c.Session.packer = 'json' 502 | 503 | # The maximum number of digests to remember. 504 | # 505 | # The digest history will be culled when it exceeds this value. 
506 | # c.Session.digest_history_size = 65536 507 | 508 | # The UUID identifying this session. 509 | # c.Session.session = u'' 510 | 511 | # The digest scheme used to construct the message signatures. Must have the form 512 | # 'hmac-HASH'. 513 | # c.Session.signature_scheme = 'hmac-sha256' 514 | 515 | # execution key, for extra authentication. 516 | # c.Session.key = '' 517 | 518 | # Debug output in the Session 519 | # c.Session.debug = False 520 | 521 | # The maximum number of items for a container to be introspected for custom 522 | # serialization. Containers larger than this are pickled outright. 523 | # c.Session.item_threshold = 64 524 | 525 | # path to file containing execution key. 526 | # c.Session.keyfile = '' 527 | 528 | # Threshold (in bytes) beyond which an object's buffer should be extracted to 529 | # avoid pickling. 530 | # c.Session.buffer_threshold = 1024 531 | 532 | # Metadata dictionary, which serves as the default top-level metadata dict for 533 | # each message. 534 | # c.Session.metadata = {} 535 | 536 | #------------------------------------------------------------------------------ 537 | # InlineBackend configuration 538 | #------------------------------------------------------------------------------ 539 | 540 | # An object to store configuration of the inline backend. 541 | 542 | # The image format for figures with the inline backend. 543 | # c.InlineBackend.figure_format = 'png' 544 | 545 | # Close all figures at the end of each cell. 546 | # 547 | # When True, ensures that each cell starts with no active figures, but it also 548 | # means that one must keep track of references in order to edit or redraw 549 | # figures in subsequent cells. This mode is ideal for the notebook, where 550 | # residual plots from other cells might be surprising. 551 | # 552 | # When False, one must call figure() to create new figures. This means that 553 | # gcf() and getfigs() can reference figures created in other cells, and the 554 | # active figure can continue to be edited with pylab/pyplot methods that 555 | # reference the current active figure. This mode facilitates iterative editing 556 | # of figures, and behaves most consistently with other matplotlib backends, but 557 | # figure barriers between cells must be explicit. 558 | # c.InlineBackend.close_figures = True 559 | 560 | # Subset of matplotlib rcParams that should be different for the inline backend. 561 | # c.InlineBackend.rc = {'font.size': 10, 'figure.figsize': (6.0, 4.0), 'figure.facecolor': 'white', 'savefig.dpi': 72, 'figure.subplot.bottom': 0.125, 'figure.edgecolor': 'white'} 562 | 563 | #------------------------------------------------------------------------------ 564 | # MappingKernelManager configuration 565 | #------------------------------------------------------------------------------ 566 | 567 | # A KernelManager that handles notebook mapping and HTTP error handling 568 | 569 | # MappingKernelManager will inherit config from: MultiKernelManager 570 | 571 | # The kernel manager class. This is configurable to allow subclassing of the 572 | # KernelManager for customized behavior. 573 | # c.MappingKernelManager.kernel_manager_class = 'IPython.kernel.ioloop.IOLoopKernelManager' 574 | 575 | #------------------------------------------------------------------------------ 576 | # NotebookManager configuration 577 | #------------------------------------------------------------------------------ 578 | 579 | # The directory to use for notebooks. 
580 | # c.NotebookManager.notebook_dir = u'/home/oracle' 581 | 582 | #------------------------------------------------------------------------------ 583 | # FileNotebookManager configuration 584 | #------------------------------------------------------------------------------ 585 | 586 | # FileNotebookManager will inherit config from: NotebookManager 587 | 588 | # The location in which to keep notebook checkpoints 589 | # 590 | # By default, it is notebook-dir/.ipynb_checkpoints 591 | # c.FileNotebookManager.checkpoint_dir = u'' 592 | 593 | # Automatically create a Python script when saving the notebook. 594 | # 595 | # For easier use of import, %run and %load across notebooks, a .py script 596 | # will be created next to any .ipynb on each save. This can also be set with 597 | # the short `--script` flag. 598 | # c.FileNotebookManager.save_script = False 599 | 600 | # The directory to use for notebooks. 601 | # c.FileNotebookManager.notebook_dir = u'/home/oracle' 602 | -------------------------------------------------------------------------------- /setup/setup_pyspark_notebook.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | echo "installing pyspark profile for ipython" 4 | ipython profile create pyspark 5 | cp ipython_notebook_config_spark.py $HOME/.config/ipython/profile_pyspark/ipython_notebook_config.py 6 | cp 00-pyspark-setup.py $HOME/.config/ipython/profile_pyspark/startup/ 7 | -------------------------------------------------------------------------------- /templates/pandas_oracle_template.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | 3 | """ Template for connecting pandas to Oracle 12c""" 4 | import sys 5 | from sqlalchemy import create_engine 6 | import pandas as pd 7 | 8 | def main(username, password, hoststring, table): 9 | engine = create_engine("oracle://{0}:{1}@{2}".format(username, password, hoststring)) 10 | data = pd.read_sql_table(table, engine) 11 | print data.head() 12 | 13 | if __name__ == "__main__": 14 | if len(sys.argv) != 5: 15 | print "Usage: pandas_oracle_template.py username password hoststring table" 16 | else: 17 | main(*sys.argv[1:]) 18 | -------------------------------------------------------------------------------- /templates/raw_oracle_template.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | 3 | """ A template for python programs connecting to 4 | Oracle DB 12c using cx-Oracle""" 5 | 6 | import sys 7 | import cx_Oracle as cx 8 | 9 | def main(username, password, hoststring): 10 | # connect to the db 11 | db = cx.connect(username, password, hoststring) 12 | # get a cursor 13 | c = db.cursor() 14 | # do stuff! 15 | 16 | if __name__ == "__main__": 17 | if len(sys.argv) != 4: 18 | print "Usage: raw_oracle_template.py username password hoststring" 19 | else: 20 | main(*sys.argv[1:]) 21 | 22 | -------------------------------------------------------------------------------- /templates/sql_alchemy_oracle_template.py: -------------------------------------------------------------------------------- 1 | #!
/usr/bin/env python 2 | 3 | """ Rough template for using SQLAlchemy with Oracle DB 12c""" 4 | 5 | import sys 6 | from sqlalchemy import * 7 | from sqlalchemy.orm import * 8 | 9 | def main(username, password, hoststring): 10 | engine = create_engine("oracle://{0}:{1}@{2}".format(username, password, hoststring)) 11 | metadata = MetaData(engine) 12 | Session = sessionmaker(engine) 13 | session = Session() 14 | 15 | 16 | if __name__ == "__main__": 17 | if len(sys.argv) != 4: 18 | print "Usage: sql_alchemy_oracle_template.py username password hoststring" 19 | else: 20 | main(*sys.argv[1:]) 21 | --------------------------------------------------------------------------------
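The templates above deliberately stop short of running a query (raw_oracle_template.py ends at `# do stuff!`). The sketch below is illustrative only and is not a file in this repository: it shows one way the cx_Oracle pattern from raw_oracle_template.py might be completed. The table name flu_example and the script name example_query.py are placeholders (fludb.sql defines the actual schema), and the snippet keeps the same Python 2 print style as the templates.

#! /usr/bin/env python

"""Illustrative sketch: completing the '# do stuff!' step of
raw_oracle_template.py. The table name is a placeholder."""

import sys
import cx_Oracle as cx

def main(username, password, hoststring):
    # connect and get a cursor, exactly as in raw_oracle_template.py
    db = cx.connect(username, password, hoststring)
    c = db.cursor()
    # fetch a handful of rows from a placeholder table
    c.execute("SELECT * FROM flu_example WHERE ROWNUM <= 5")
    for row in c:
        print row
    c.close()
    db.close()

if __name__ == "__main__":
    if len(sys.argv) != 4:
        print "Usage: example_query.py username password hoststring"
    else:
        main(*sys.argv[1:])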