├── License.md ├── app ├── PaperToDigital.pdf └── PaperToDigital.tex ├── ds1 ├── .gitignore ├── ds1_present_web.pdf └── ds1_web.tex ├── ds2 ├── ds2_present_web.pdf ├── ds2_web.tex └── scraping_assignment_web.txt ├── ds3 ├── ds3_present_web.pdf └── ds3_web.tex ├── ds4 ├── ds4_present_web.pdf └── ds4_web.tex ├── ds6 ├── kmeans.pdf └── kmeans.tex ├── graphs └── ggplot2.md └── readme.md /License.md: -------------------------------------------------------------------------------- 1 | [![Creative Commons License](https://i.creativecommons.org/l/by-sa/4.0/88x31.png)](http://creativecommons.org/licenses/by-sa/4.0/) 2 | Data Science: Some Basics by [Gaurav Sood](https://github.com/soodoku/data-science) is licensed under a [Creative Commons Attribution-ShareAlike 4.0 International License](http://creativecommons.org/licenses/by-sa/4.0/). 3 | Based on a work at [https://github.com/soodoku/data-science](https://github.com/soodoku/data-science). 4 | Permissions beyond the scope of this license may be available at [http://gsood.com](http://gsood.com). 5 | -------------------------------------------------------------------------------- /app/PaperToDigital.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/soodoku/data-science/0af39968f028b6c507945ebdb38bea8771f49303/app/PaperToDigital.pdf -------------------------------------------------------------------------------- /app/PaperToDigital.tex: -------------------------------------------------------------------------------- 1 | \documentclass[compress, black]{beamer} 2 | \setbeamercolor{normal text}{fg=black} 3 | \beamertemplatesolidbackgroundcolor{white} 4 | \usecolortheme[named=black]{structure} 5 | \usepackage{caption} 6 | \captionsetup{labelformat=empty} 7 | \setbeamertemplate{navigation symbols}{} 8 | %\usefonttheme{structurebold} 9 | \usepackage[scaled]{helvet} 10 | \renewcommand*\familydefault{\sfdefault} %% Only if the base font of the document is to be sans serif 11 | \usepackage[T1]{fontenc} 12 | \usepackage{setspace} 13 | %\usepackage{beamerthemesplit} 14 | \usepackage{graphics} 15 | \usepackage{hyperref} 16 | \usepackage{graphicx} 17 | \usepackage{verbatim} 18 | \usepackage{amssymb} 19 | \usepackage{wrapfig} 20 | \usefonttheme[onlymath]{serif} 21 | \usepackage{cmbright} 22 | 23 | \def\labelitemi{\textemdash} 24 | \setbeamertemplate{frametitle}{ 25 | \begin{centering} 26 | \vskip15pt 27 | \insertframetitle 28 | \par 29 | \end{centering} 30 | } 31 | \title[DS]{From Paper to Digital} 32 | \author[Sood]{Gaurav~Sood} 33 | \large 34 | \date[2015]{Spring 2015} 35 | \subject{LearnDS} 36 | \begin{document} 37 | \newcommand{\multilineR}[1]{\begin{tabular}[b]{@{}r@{}}#1\end{tabular}} 38 | \newcommand{\multilineL}[1]{\begin{tabular}[b]{@{}l@{}}#1\end{tabular}} 39 | \newcommand{\multilineC}[1]{\begin{tabular}[b]{@{}c@{}}#1\end{tabular}} 40 | 41 | \newenvironment{large_enum}{ 42 | \Large 43 | \begin{itemize} 44 | \setlength{\itemsep}{7pt} 45 | \setlength{\parskip}{0pt} 46 | \setlength{\parsep}{0pt} 47 | }{\end{itemize}} 48 | 49 | \begin{comment} 50 | 51 | setwd(paste0(basedir, "github/data-science/app/")) 52 | tools::texi2dvi("PaperToDigital.tex", pdf=TRUE,clean=TRUE) 53 | setwd(basedir) 54 | 55 | \end{comment} 56 | 57 | \frame 58 | { 59 | \titlepage 60 | } 61 | 62 | \begin{frame} 63 | \frametitle{} 64 | \only<1>{\Large When we think about paper \ldots} 65 | \only<2>{\Large We think about \alert{government offices}} 66 | \only<3>{\centering \scalebox{0.46}{\includegraphics{img/files1.jpg}}} 
67 | \only<4>{\centering \scalebox{0.26}{\includegraphics{img/files2.jpg}}} 68 | \only<5>{\centering \scalebox{0.30}{\includegraphics{img/files3.jpg}}} 69 | \only<6>{\Large But paper-based storage of information is common} 70 | \only<7>{\Large Libraries and Archives} 71 | \only<8>{\Large Health records} 72 | \only<9>{\Large Receipts \ldots} 73 | \only<10>{\Large Small Businesses} 74 | \only<11>{\Large And it isn't going away (quickly).} 75 | \end{frame} 76 | 77 | \begin{frame} 78 | \frametitle{The Dead Tree Format} 79 | \begin{large_enum} 80 | \item[--]<2-> Accessible only on location 81 | \item[--]<3-> Typically needs the help of another human, who may in turn want \alert{money} 82 | \item[--]<4-> Hard to copy, distribute 83 | \item[--]<5-> Flammable 84 | \item[--]<6-> Time-consuming to find stuff \\ \normalsize \pause \pause \pause \pause \pause \pause 85 | Google returns results for a typical search query in under 0.2 seconds 86 | \item[--]<8-> Hard to analyze, summarize stored information 87 | \item[--]<9-> Hard to track performance, identify anomalous transactions, identify patterns ... 88 | \end{large_enum} 89 | \end{frame} 90 | 91 | \begin{frame} 92 | \frametitle{Solved Problem?} 93 | \begin{large_enum} 94 | \item[--]<2-> Lots of software: 95 | \begin{enumerate} 96 | \item[--]<3->Adobe Professional 97 | \item[--]<4->Abbyy FineReader 98 | \item[--]<5->Tesseract 99 | \end{enumerate} 100 | \item[--]<6->But ... 101 | \begin{enumerate} 102 | \item[--]<7->Still can't handle complex layouts, languages other than English, etc.\\ 103 | \only<8>{\normalsize 104 | \begin{quote}``I found that even native OCR software such as \ldots the Abbyy Fine Reader \alert{proved utterly incapable of extracting words from scanned images of the texts}, even when those scanned images were of high quality.''\end{quote}} 105 | \item[--]<9->No information on how well you do (\alert{Quality Metrics}). 106 | \item[--]<10->Not scalable 107 | \end{enumerate} 108 | \end{large_enum} 109 | \end{frame} 110 | 111 | \begin{frame} 112 | \frametitle{How to Convert Squiggles to Bits?} 113 | \begin{large_enum} 114 | \item[--]<2-> Take images of paper 115 | \item[--]<3-> Within images, find where \alert{relevant} text is located 116 | \item[--]<4-> Find out how the text is laid out 117 | \item[--]<5-> Recognize the characters 118 | \end{large_enum} 119 | \end{frame} 120 | 121 | \begin{frame} 122 | \frametitle{Thus Performance Depends on...} 123 | \begin{large_enum} 124 | \item[--]<1->Quality of the scan: spine, contrast, etc. 125 | \only<2>{\scalebox{0.6}{\includegraphics{ScannedBook3.png}}} 126 | \item[--]<3->Complexity of the layout 127 | \only<4>{\scalebox{0.35}{\includegraphics{ScannedBook2.png}}} 128 | \only<5>{\scalebox{0.6}{\includegraphics{ScannedBook4.png}}} 129 | \item[--]<6->Font 130 | \item[--]<7->Language 131 | \item[--]<8->Hardware and Software (duh!)
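% A minimal, illustrative sketch of the pipeline on the surrounding slides (scanned image in,
% recognized characters out), using the open-source Tesseract engine named on the "Solved
% Problem?" slide. It assumes the pytesseract and Pillow packages plus a local Tesseract
% install; the image path is a hypothetical placeholder.
\begin{comment}
from PIL import Image
import pytesseract

page = Image.open("img/scanned_page.png")   # hypothetical scan of one page
text = pytesseract.image_to_string(page)    # detect, segment, and classify characters
print(text[:500])                           # inspect the first few hundred recognized characters
\end{comment}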
132 | \end{large_enum} 133 | \end{frame} 134 | 135 | \begin{frame} 136 | \frametitle{OCR} 137 | \begin{large_enum} 138 | \item[--]<1->Make images 139 | \item[--]<2->Detect Text \\ \pause 140 | \only<3->{\scalebox{1}{\includegraphics{TextArea.png}}} 141 | \item[--]<4->Segment ``Characters''\\ \pause \pause 142 | \only<5->{\scalebox{1}{\includegraphics{CharacterBoxes.png}}} 143 | \item[--]<6->Classify ``Characters''\\ \pause 144 | \only<7->{\scalebox{1}{\includegraphics{recognize2.png}}} 145 | \end{large_enum} 146 | \end{frame} 147 | 148 | \begin{frame} 149 | \frametitle{Mechanics} 150 | \begin{large_enum} 151 | \item[--]<1->Detect Text 152 | \begin{enumerate} 153 | \item[--]<2-> Supervised Learning 154 | \item[--]<3-> Blobs with text, Blobs without 155 | \item[--]<4-> But size of a blob is an issue 156 | \end{enumerate} 157 | \item[--]<5->Character Segmentation 158 | \begin{enumerate} 159 | \item[--]<6-> Supervised Learning 160 | \item[--]<7-> Letters (and \alert{Ligatures}) versus Splits 161 | \end{enumerate} 162 | \item[--]<8->Classify Characters (and Ligatures) 163 | \begin{enumerate} 164 | \item[--]<1-> Supervised Learning 165 | \item[--]<2-> A versus B versus C... 166 | \end{enumerate} 167 | \end{large_enum} 168 | \end{frame} 169 | 170 | \begin{frame} 171 | \frametitle{Supervised Learning} 172 | \begin{large_enum} 173 | \item[--]<1->Classified (training) data 174 | \item[--]<2->Estimate a model\\ \pause \normalsize 175 | \only<3>{$logit [p(spam)] = \alpha + f'\beta$ where $f$ is frequencies.\\} 176 | \only<4>{Predict class (e.g. Blobs with or without text) using features (pixel by pixel rgb)\\ 177 | Use cross-validation to tune the parameters} 178 | \item[--]<5->Predict classes of unseen data (groups of pixels) 179 | \end{large_enum} 180 | \end{frame} 181 | 182 | \begin{frame} 183 | \frametitle{Paper to Digital Pipeline} 184 | \begin{large_enum} 185 | \item[--]<1-> Take images of paper 186 | \item[--]<1-> Within images, find where \alert{relevant} text is located 187 | \item[--]<1-> Find out how the text is laid out 188 | \item[--]<1-> Recognize the characters 189 | \item[--]<2-> \alert{Every step is error prone} 190 | \end{large_enum} 191 | \end{frame} 192 | 193 | \begin{frame} 194 | \only<1->{\Large Optimize all steps w.r.t final error rate.} 195 | \only<2->{\Large How to deal with errors that remain} 196 | \end{frame} 197 | \begin{frame} 198 | \frametitle{How to Fix Errors} 199 | \begin{large_enum} 200 | \item[--]<1->How confident are you that... 201 | \begin{enumerate} 202 | \item[--]<2-> An area has \alert{relevant} text 203 | \item[--]<3-> Split is correct 204 | \item[--]<4-> Right character (or ligature) is recognized 205 | \end{enumerate} 206 | \item[--]<5-> Flag low confidence areas, splits, characters... 207 | \item[--]<6-> Get humans to identify the correct classes 208 | \item[--]<7-> Use that knowledge to fix other errors 209 | \end{large_enum} 210 | \end{frame} 211 | 212 | \begin{frame} 213 | \frametitle{Fixing Character Recognition Errors} 214 | \begin{large_enum} 215 | \item[--]<1-> Search and Replace 216 | \item[--]<2-> OCR makes certain kinds of errors (| is mistaken for an I) 217 | \item[--]<3-> Compare against a corpora (dictionary) and replace 218 | \item[--]<4-> But replace with what? 219 | \item[--]<5-> standd -> strand, stand, stood, or sand? 220 | \end{large_enum} 221 | \end{frame} 222 | 223 | \begin{frame} 224 | \frametitle{Edit Distance} 225 | \begin{large_enum} 226 | \item[--]<1->How similar are two strings? 
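% A toy sketch of the "Classify Characters: A versus B versus C" step above as supervised
% learning: labeled glyph images in, a classifier out. scikit-learn and its bundled digit
% images are assumptions made only to keep the example runnable; the deck does not prescribe
% a library. In practice you would also cross-validate to tune the model, as the slide notes.
\begin{comment}
from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

digits = load_digits()                        # 8x8 pixel images standing in for character blobs
X_train, X_test, y_train, y_test = train_test_split(
    digits.data, digits.target, test_size=0.25, random_state=0)

clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)   # pixel features -> class label
print("held-out accuracy:", clf.score(X_test, y_test))
\end{comment}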
227 | \item[--]<2->Typically refers to minimum edit distance 228 | \item[--]<3->Minimum number of editing operations (Insertion, Deletion, Substitution) to convert one string to another. 229 | \item[--]<4->Levenshtein Distance, substitution cost = 2 230 | \item[--]<5->You can implement this at word level so Microsoft Corp. is 1 away from Microsoft. 231 | \end{large_enum} 232 | \end{frame} 233 | 234 | \begin{frame} 235 | \frametitle{Supervised Learning} 236 | \begin{large_enum} 237 | \item[--]<1->But edit distance isn't context aware. Use surrounding words. 238 | \item[--]<2->How likely is a certain word within a phrase? 239 | \item[--]<3->$\sim$ Contemporary spelling correction algorithms 240 | \item[--]<4->A bigram model of language: given previous word, probability of next word 241 | \item[--]<5->But good training data is paramount. 242 | \end{large_enum} 243 | \end{frame} 244 | 245 | \begin{frame} 246 | \frametitle{Supervised Learning} 247 | \begin{large_enum} 248 | \item[--]<1->Training data is `similar data' (topic model) and data from human computation 249 | \item[--]<2->Estimate a model based on similar data 250 | \item[--]<3->Use stochastic gradient descent to continue to tweak parameters based on human computation 251 | \item[--]<4->Human computation parallelized, data for costlier (most duplicated low confidence strings, errors in recognition correlated) errors prioritized 252 | \item[--]<5->Calculate error rate against trained random sample 253 | \end{large_enum} 254 | \end{frame} 255 | 256 | \end{document} 257 | -------------------------------------------------------------------------------- /ds1/.gitignore: -------------------------------------------------------------------------------- 1 | /img/* -------------------------------------------------------------------------------- /ds1/ds1_present_web.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/soodoku/data-science/0af39968f028b6c507945ebdb38bea8771f49303/ds1/ds1_present_web.pdf -------------------------------------------------------------------------------- /ds1/ds1_web.tex: -------------------------------------------------------------------------------- 1 | \documentclass[compress, black]{beamer} 2 | \setbeamercolor{normal text}{fg=black} 3 | \beamertemplatesolidbackgroundcolor{white} 4 | \usecolortheme[named=black]{structure} 5 | \usepackage{caption} 6 | \captionsetup{labelformat=empty} 7 | \setbeamertemplate{navigation symbols}{} 8 | %\usefonttheme{structurebold} 9 | \usepackage[scaled]{helvet} 10 | \renewcommand*\familydefault{\sfdefault} %% Only if the base font of the document is to be sans serif 11 | \usepackage[T1]{fontenc} 12 | \usepackage{setspace} 13 | %\usepackage{beamerthemesplit} 14 | \usepackage{graphics} 15 | \usepackage{hyperref} 16 | \usepackage{graphicx} 17 | \usepackage{verbatim} 18 | \usepackage{amssymb} 19 | \usepackage{wrapfig} 20 | \usefonttheme[onlymath]{serif} 21 | \usepackage{cmbright} 22 | \usepackage[normalem]{ulem} 23 | \def\labelitemi{\textemdash} 24 | \setbeamertemplate{frametitle}{ 25 | \begin{centering} 26 | \vskip15pt 27 | \insertframetitle 28 | \par 29 | \end{centering} 30 | } 31 | \title[DS]{Data Science} 32 | \author[Sood]{Gaurav~Sood} 33 | \large 34 | \date[2015]{Spring 2015} 35 | \subject{LearnDS} 36 | \begin{document} 37 | \newcommand{\multilineR}[1]{\begin{tabular}[b]{@{}r@{}}#1\end{tabular}} 38 | \newcommand{\multilineL}[1]{\begin{tabular}[b]{@{}l@{}}#1\end{tabular}} 39 | 
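% A small sketch of the minimum edit distance described on the Edit Distance slides in
% app/PaperToDigital.tex above: insertions and deletions cost 1, substitutions cost 2 (the
% Levenshtein variant the slide mentions). Written directly from that definition; the example
% strings are illustrative.
\begin{comment}
def edit_distance(a, b, sub_cost=2):
    m, n = len(a), len(b)
    d = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(m + 1):
        d[i][0] = i                                   # i deletions
    for j in range(n + 1):
        d[0][j] = j                                   # j insertions
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            same = a[i - 1] == b[j - 1]
            d[i][j] = min(d[i - 1][j] + 1,            # delete
                          d[i][j - 1] + 1,            # insert
                          d[i - 1][j - 1] + (0 if same else sub_cost))  # substitute
    return d[m][n]

print(edit_distance("standd", "stand"))                               # 1: drop the extra d
print(edit_distance("Microsoft Corp.".split(), "Microsoft".split()))  # 1 at the word level
\end{comment}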
\newcommand{\multilineC}[1]{\begin{tabular}[b]{@{}c@{}}#1\end{tabular}} 40 | 41 | \newenvironment{large_enum}{ 42 | \Large 43 | \begin{itemize} 44 | \setlength{\itemsep}{7pt} 45 | \setlength{\parskip}{0pt} 46 | \setlength{\parsep}{0pt} 47 | }{\end{itemize}} 48 | 49 | \begin{comment} 50 | 51 | setwd(paste0(basedir, "github/data-science/ds1/")) 52 | tools::texi2dvi("ds1_web.tex", pdf=TRUE,clean=TRUE) 53 | setwd(basedir) 54 | 55 | \end{comment} 56 | \frame 57 | { 58 | \titlepage 59 | } 60 | 61 | \frame{ 62 | \frametitle{Big Data} 63 | \Large 64 | \only<2>{\indent Lots of hype recently.\\\vspace{10mm}} \only<3>{\scalebox{0.7}{\includegraphics{img/bigdata.png}}} \only<4>{\indent But where's the cheese?\\\vspace{10mm}} 65 | } 66 | \frame{ 67 | \center 68 | \Large Some examples 69 | } 70 | \frame{ 71 | \frametitle{Fishing Out Fishy Figures} 72 | \only<1>{ 73 | \begin{figure}[p] 74 | \centering \includegraphics[scale=0.5]{img/economist.png} 75 | \end{figure} 76 | \small ``The CPINu's August inflation figure of \alert{1.3\%} is less than half the \alert{2.65\%} of the CPI Congreso, a compilation of private estimates 77 | gathered by opposition members of Congress.'' (\href{http://www.economist.com/blogs/americasview/2014/09/statistics-argentina}{Economist})} 78 | \only<2>{ 79 | \begin{figure}[p] 80 | \centering \includegraphics[scale=0.5]{img/ArgentinaPriceIndex.png} 81 | \end{figure} 82 | \small Source: \href{http://www.mit.edu/~afc/papers/Cavallo-Argentina-JME.pdf}{Online vs Official Price Indexes: Measuring Argentina's Inflation By Alberto Cavallo}} 83 | } 84 | 85 | \frame{ 86 | \frametitle{Suicide Prevention in the Army} 87 | \Large 88 | \only<2>{``In 2012, more soldiers committed suicide than died while fighting in Afghanistan: 349 suicides compared to 295 combat deaths.''} 89 | \only<3>{\begin{figure}[p] 90 | \centering 91 | \includegraphics[scale=0.5]{img/ArmySuicides.png}\\ 92 | \end{figure} 93 | } 94 | \only<4>{``Research has repeatedly shown that doctors are not accurate in predicting who is at risk of suicide.''} 95 | \only<5>{``The soldiers with the \alert{highest 5 percent of risk scores} committed over \alert{half of all suicides} in the period covered --- at an extraordinary rate of about 3,824 suicides per 100,000 person-years.''\\\vspace{5mm} 96 | \small \href{http://fivethirtyeight.com/features/the-army-is-building-an-algorithm-to-prevent-suicide/}{538 Article}\\ 97 | \href{http://www.ncbi.nlm.nih.gov/pubmed/25390793}{STARRS paper}} 98 | } 99 | \frame{ 100 | \frametitle{Reducing Crime} 101 | \Large 102 | \only<1>{Minority Report} 103 | \only<2>{Predictive, `CompStat', `HotSpot' Policing} 104 | \only<3>{\href{http://www.predpol.com/}{PredPol}: Predictive Policing\\\normalsize 105 | LAPD, Atlanta PD \\ 106 | Based off earthquake prediction algorithm} 107 | \only<4>{``During a four-month trial in Kent, \alert{8.5\%} of all street crime occurred within PredPol's pink boxes, with plenty more next door to them; predictions from police analysts scored only \alert{5\%}. 
An earlier trial in Los Angeles saw the machine score \alert{6\%} compared with human analysts' \alert{3\%}.''\\\vspace{5mm} 108 | \small \href{http://www.economist.com/news/briefing/21582042-it-getting-easier-foresee-wrongdoing-and-spot-likely-wrongdoers-dont-even-think-about-it}{Economist}} 109 | \only<5>{\includegraphics[scale=0.45]{img/PredictivePolicing.jpg}} 110 | } 111 | \frame{ 112 | \frametitle{Web Search} 113 | \only<2>{\begin{figure}[p] 114 | \centering\scalebox{0.4}{\includegraphics{img/yahoo.jpg}} 115 | \end{figure} 116 | } 117 | \only<3-7>{ 118 | \begin{large_enum} 119 | \item<3->[--]Human Curation, Ad-hoc automation 120 | \item<4->[--]Google crawls over 20 billion URLs a day (Sullivan 2012). 121 | \item<5->[--]Google answers 100 billion search queries a month (Sullivan 2012). 122 | \item<6->[--] ``\ldots a typical search returns results in less than 0.2 seconds'' (Google) 123 | \item<7->[--]Page Rank 124 | \end{large_enum} 125 | } 126 | } 127 | \frame{ 128 | \frametitle{Side-effects of Drugs} 129 | \Large 130 | \only<2>{``Adverse drug events cause substantial morbidity and mortality and are often discovered after a drug comes to market.''} 131 | \only<3>{FDA collects this information from ``physicians, pharmacists, patients, and drug companies'' but these reports ``are incomplete and biased''} 132 | \only<4>{``\alert{paroxetine} and \alert{pravastatin}, whose interaction was \alert{reported to cause hyperglycemia after the time period of the online logs} used in the analysis''} 133 | \only<5>{\begin{figure}[p] 134 | \centering 135 | \includegraphics[scale=0.60]{img/jama.png}\\ 136 | \end{figure} 137 | \small \href{http://www.ncbi.nlm.nih.gov/pubmed/23467469}{Web-scale Pharmacovigilance: Listening to Signals from the Crowd.} By White et al.} 138 | } 139 | \frame{ 140 | \frametitle{Flu Season} 141 | \Large 142 | \only<1>{How many got the sniffles?} 143 | \only<2>{How many got the sniffles in the past month?} 144 | \only<3>{ 145 | \begin{figure}[p] 146 | \centering 147 | \includegraphics[scale=0.60]{img/GoogleFlu09Current.png}\\ 148 | \end{figure} 149 | \small \href{http://www.google.org/flutrends/us/}{Google Flu Trends} 150 | } 151 | \only<4>{Google Flu is sick.} 152 | \only<5>{\begin{figure}[p] 153 | \centering 154 | \includegraphics[scale=0.45]{img/GoogleFlu.png}\\ 155 | \end{figure} 156 | \small \href{http://bit.ly/1KMwZ0Y}{The Parable of Google Flu: Traps in Big Data Analysis.} By Lazer et al. 157 | } 158 | } 159 | 160 | \frame{ 161 | \frametitle{Spam or Ham} 162 | \only<1>{ 163 | \begin{figure}[p] 164 | \centering 165 | \includegraphics[scale=0.75]{img/spam.jpg}\\ 166 | \end{figure} 167 | } 168 | \only<2-4>{ 169 | \begin{large_enum} 170 | \item[-]<2->``According to Commtouch's Internet Threats Trend Report for the first quarter of 2013, an average of \alert{97.4 billion} spam e-mails and \alert{973 million} malware e-mails were sent worldwide each day in Q1 2013 (h/t Softpedia).'' 171 | \item[-]<3->Spam Filter 172 | \item[-]<4->$logit [p(spam)] = \alpha + f'\beta$ where $f$ is frequencies. 
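% A tiny illustration of the spam filter on this slide, logit[p(spam)] = alpha + f'beta with f
% a vector of word frequencies. scikit-learn and the toy e-mails are assumptions made only so
% the sketch runs end to end; any logit fitter would do.
\begin{comment}
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

emails = ["win money now", "meeting at noon", "cheap money win", "lunch tomorrow?"]
labels = [1, 0, 1, 0]                            # 1 = spam, 0 = ham (toy data)

f = CountVectorizer().fit_transform(emails)      # word-frequency features
model = LogisticRegression().fit(f, labels)      # estimates alpha and the beta weights
print(model.predict(f))                          # predicted classes for the same messages
\end{comment}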
173 | \end{large_enum} 174 | } 175 | } 176 | \frame{ 177 | \frametitle{Vote} 178 | \only<1-2>{ 179 | \begin{figure}[!htb] 180 | \centering 181 | \includegraphics[scale=0.5]{img/61million.png} 182 | \end{figure} 183 | } 184 | \Large 185 | \only<2>{\alert{.39\% direct effect}, and $.01$ to .1\% indirect effect.} 186 | } 187 | \frame{ 188 | \frametitle{Vote for Obama} 189 | \begin{large_enum} 190 | \item[--]<1->{Obama 2012 Campaign} 191 | \item[--]<2->{Highly customized messaging: Soccer Moms, NASCAR dads \ldots} 192 | \item[--]<3->{\href{http://citoresearch.com/print/1813}{Used SQL database Vertica for `speed-of-thought' analyses.}} 193 | \end{large_enum} 194 | } 195 | \frame{ 196 | \center 197 | \Large What do we mean by big data? 198 | } 199 | \frame{ 200 | \frametitle{What do we mean by big data?} 201 | \begin{large_enum} 202 | \item[--]<1->Big in rows (size \textit{n})\\ 203 | Big in columns (dimensions \textit{p}) 204 | \item[--]<2->Hard to extract value from 205 | \item[--]<3-> `Big data' is high \textbf{volume}, high \textbf{velocity} and high \textbf{variety} information assets that demand cost-effective, innovative forms of information processing[.]\\\normalsize\indent 206 | Gartner, Inc.'s ``3Vs'' definition. 207 | \end{large_enum} 208 | } 209 | \frame{ 210 | \frametitle{Sources of (Big) Data} 211 | \begin{large_enum} 212 | \item[--]<1->Data as the by-product of other activities\\ 213 | \begin{enumerate} 214 | \item[--]<2-> Click trail, clicks before a purchase 215 | \item[--]<3-> Moving your body (\href{http://www.fitbit.com/}{Fitbit}) 216 | \item[--]<4-> Moving yourself without moving your body (\href{http://www.progressive.com/auto/snapshot/?version=default}{Snapshot}) 217 | \item[--]<5-> Data were always being generated. They just weren't being captured. 218 | \begin{enumerate} 219 | \item[--]<6->Cheaper, smaller sensors help. 220 | \item[--]<7->So does cheaper storage. (1950 $\sim$ \$10,000/MB. 2015 $\lll$ \$.0001/MB) 221 | \end{enumerate} 222 | \end{enumerate} 223 | \item[--]<8-> Data as the primary goal of activities\\\normalsize 224 | Telescopes, Genetic sequencers, 61 million person experiments \ldots 225 | \end{large_enum} 226 | } 227 | \frame{ 228 | \frametitle{How big are the data?} 229 | \begin{large_enum} 230 | \item[--]<2->Web\\\normalsize 231 | 20 billion webpages, each 20 KB = 400 TB \\ \pause 232 | Say all stored on a single disk.\\ \pause 233 | Read speed $\sim$ 50MB/sec.\\ \pause 234 | 92 days to read from disk to memory. 235 | \item[--]<6->Astronomy \\\normalsize 236 | Apache Point Telescope $\sim$ 200 GB/night.\\ 237 | Large Synoptic Survey Telescope: 3 billion pixel camera \\ 238 | $\sim$ 30TB/night. In 10 years $\sim$ 60 PB 239 | \item[--]<7->Life Sciences\\\normalsize 240 | High Throughput sequencer $\sim$ 1 TB/day 241 | \item[--]<8->CIA \\\normalsize \pause \pause 242 | REDACTED 243 | \end{large_enum} 244 | } 245 | 246 | \frame{ 247 | \center 248 | \Large Implications for Statistics, Computation. 249 | } 250 | \frame{ 251 | \frametitle{Implications for Statistics} 252 | \begin{large_enum} 253 | \item[--]<2->Little data, Big data\\ \pause \pause \normalsize 254 | Sampling still matters 255 | \item[--]<4->Everything is significant (The Starry Night)\\\normalsize \pause \pause 256 | \begin{equation}\text{False discovery Proportion} = \frac{\text{\# of FP}}{\text{\# of Sig. 
Results}}\end{equation}\\ 257 | Benjamini and Hochberg (1995) (FDR), cost of false discovery, Familywise error rate (Bonferroni) 258 | \item[--]<6->Inverting a matrix\\\normalsize \pause \pause 259 | (Stochastic) Gradient Descent (Ascent), BFGS, \ldots 260 | \item[--]<8->Causal inference\\\normalsize \pause \pause 261 | Large $p$ may help\\ \pause 262 | Passive observation as things change arbitrarily may help 263 | \end{large_enum} 264 | } 265 | 266 | \frame{ 267 | \frametitle{Implications for Computation} 268 | \begin{large_enum} 269 | \item[--]<1->Conventional understanding of what is computationally tractable: polynomial-time algorithms ($N^k$) 270 | \item[--]<2->Now it is $(N^k)/m$, where $m$ is the number of computers. 271 | \item[--]<3->For really big data: $N\log(N)$\\\normalsize 272 | Traversing a binary tree, sort and search are $N\log(N)$\\ 273 | Streaming applications 274 | \end{large_enum} 275 | } 276 | \frame{ 277 | \center 278 | \Large MapReduce and PageRank 279 | } 280 | \frame{ 281 | \Large 282 | 20 billion webpages, each 20 KB = 400 TB \\ 283 | Say all data stored on a single disk. Read speed $\sim$ 50MB/sec.\\ 284 | 92 days to read from disk to memory. 285 | } 286 | \frame{ 287 | \frametitle{Solution, Problem} 288 | \begin{large_enum} 289 | \item[--]<1->Parallelize, if 1000 computers, then just $\sim$ 1 hour 290 | \item[--]<2->But \ldots 291 | \begin{enumerate} 292 | \item[--]<3->Nodes can fail.\\ 293 | Say a single node fails once every 1000 days.\\ 294 | But that is 1000 failures per day with 1 million servers. 295 | \item[--]<4->Network bandwidth $\sim$ 1 Gbps.\\ 296 | If you have 10 TB of data -- moving it takes $\sim$ 1 day. 297 | \item[--]<5->Distributed programming can be very, very hard. 298 | \end{enumerate} 299 | \end{large_enum} 300 | } 301 | \frame{ 302 | \frametitle{Solution: MapReduce} 303 | \begin{large_enum} 304 | \item[--]<1->Store data redundantly 305 | \begin{enumerate} 306 | \item[--]<2->Distributed File Systems, e.g. GFS, HDFS 307 | \item[--]<3->Typical usage pattern: Data rarely updated. Read often. Updated through appends. 308 | \item[--]<4->Implementation:\\ 309 | Data kept in chunks, machines called `chunk servers'\\ 310 | Chunks replicated. Typically 3x. One in a completely separate rack.\\ 311 | Master node (GFS)/Name Node (HDFS) tracks metadata 312 | \end{enumerate} 313 | \item[--]<5->Minimize data movement 314 | \item[--]<6->Simple programming model 315 | \end{large_enum} 316 | } 317 | \frame{ 318 | \frametitle{More About MapReduce} 319 | \Large 320 | \begin{large_enum} 321 | \item[--]<1->Name comes from the 2004 paper \href{http://static.googleusercontent.com/media/research.google.com/en/us/archive/mapreduce-osdi04.pdf}{MapReduce: Simplified Data Processing on Large Clusters} by Dean and Ghemawat. 322 | \item[--]<2->Implementation - Hadoop (via Yahoo)/Apache 323 | \end{large_enum} 324 | } 325 | \frame{ 326 | \frametitle{MapReduce Example} 327 | \begin{large_enum} 328 | \item[--]<1->Count each distinct word in a huge document, e.g.
urls 329 | \begin{enumerate} 330 | \item[--]<2->Map function produces key, value pairs\\ 331 | Word frequency of every word in each document 332 | \item[--]<3->Send different words to different computers 333 | \item[--]<4->Combine 334 | \end{enumerate} 335 | \end{large_enum} 336 | } 337 | \begin{frame}[fragile] 338 | \begin{verbatim} 339 | map(String input_key, String input_value): 340 | // input_key: document name 341 | // input_value: document contents 342 | for each word w in input_value: 343 | EmitIntermediate(w, ``1''); 344 | 345 | 346 | reduce(String output_key, Iterator intermediate_values): 347 | // output_key: a word 348 | // output_values: a list of counts 349 | int result = 0; 350 | for each v in intermediate_values: 351 | result += ParseInt(v); 352 | Emit(AsString(result)); 353 | \end{verbatim} 354 | \end{frame} 355 | \frame{ 356 | \frametitle{PageRank} 357 | \begin{large_enum} 358 | \item[--]<1->\href{http://ilpubs.stanford.edu:8090/422/1/1999-66.pdf}{The PageRank Citation Ranking: Bringing Order to the Web} By Page et al. 359 | \item[--]<2->\href{http://www.cs.uvm.edu/~icdm/algorithms/10Algorithms-08.pdf}{Among the top 10 data mining algorithms} 360 | \item[--]<3->Search 361 | \begin{enumerate} 362 | \item[-]<4->Searching for similarity 363 | \item[-]<5->Works well when you look for a document in your computer.\\ Any small trusted corpora. 364 | \item[-]<6->Web - lots of matches. 365 | \item[-]<7->Web - lots of false positives. Some of it malware peddling sites. 366 | \end{enumerate} 367 | \end{large_enum} 368 | } 369 | \frame{ 370 | \frametitle{PageRank} 371 | \begin{large_enum} 372 | \item[--]<1->How to order conditional on similarity? 373 | \begin{enumerate} 374 | \item[-]<2->One can rank by popularity if you have complete web traffic information. 375 | \item[-]<3->But those data are hard to get. 376 | \item[-]<4->Or you can do ad hoc automation and human curation. 377 | \item[-]<5->But costly to implement. And don't scale. 378 | \end{enumerate} 379 | \item[--]<6->Innovation: see Internet as a graph\\ \normalsize 380 | Nodes = webpages, Edges = hyperlinks \\ 381 | Ranks based on linking patterns alone. 382 | \item[--]<7->Currency is in-links. 383 | \end{large_enum} 384 | } 385 | \frame{ 386 | \frametitle{PageRank} 387 | \Large 388 | \begin{large_enum} 389 | \item[--]<1->Each in-link is a vote. 390 | \item[--]<2->But each vote is not equal. 
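% A single-machine Python rendering of the word-count pseudocode on the MapReduce example
% slide above: map emits (word, 1) pairs, a dictionary stands in for the shuffle/group-by-key
% step, and reduce sums the counts. Names and documents are illustrative only.
\begin{comment}
from collections import defaultdict

def map_phase(doc_name, doc_contents):
    return [(w, 1) for w in doc_contents.split()]      # EmitIntermediate(w, 1)

def reduce_phase(word, counts):
    return word, sum(counts)                           # Emit the sum for each word

documents = {"d1": "to be or not to be", "d2": "to scrape or not to scrape"}

grouped = defaultdict(list)                            # simulated shuffle: group values by key
for name, contents in documents.items():
    for word, one in map_phase(name, contents):
        grouped[word].append(one)

counts = dict(reduce_phase(w, c) for w, c in grouped.items())
print(counts)                                          # e.g. {'to': 4, 'be': 2, ...}
\end{comment}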
391 | \item[--]<3->In-links from more important pages count for more 392 | \item[--]<4->Value of each vote: 393 | \begin{enumerate} 394 | \item[-]<5->Proportional to importance of source page 395 | \item[-]<6->Inversely proportional to number of outgoing links on the source page 396 | \item[-]<7->Say page $i$ has importance $r_i$, and $n$ outgoing links, each vote = $r_i/n$ 397 | \end{enumerate} 398 | \end{large_enum} 399 | } 400 | 401 | \frame{ 402 | \frametitle{Page Rank Example} 403 | \begin{figure}[p] 404 | \centering 405 | \includegraphics[scale=0.10]{img/PageRankExample.png} 406 | \end{figure} 407 | Source: Wikipedia 408 | } 409 | \frame{ 410 | \frametitle{PageRank} 411 | \Large 412 | \only<1->{Say page $i$ gets links from pages $j$ ($n=5$, $r_j$) and $k$ ($n=2$, $r_k$)}\\ 413 | \only<2->{\begin{align*} r_{i} = \frac{r_{j}}{5} + \frac{r_{k}}{2} \end{align*}} 414 | \only<3->{Page $j$ will have its own outlinks, and each will have a value $r_j$} 415 | \only<4->{\begin{align*} r_i = \sum \frac{r_j}{d_j}\end{align*} over $j$ where $j$ tracks all the pages pointing to $i$.} 416 | } 417 | \frame{ 418 | \center 419 | \Large Class 420 | } 421 | \frame{ 422 | \frametitle{Data Science} 423 | \begin{figure}[p] 424 | \centering 425 | \includegraphics[scale=0.50]{img/DataScience.png}\\ 426 | \end{figure} 427 | \small \href{http://drewconway.com/zia/2013/3/26/the-data-science-venn-diagram}{Data Science Venn Diagram.} By Drew Conoway 428 | } 429 | \frame{ 430 | \frametitle{Course Outline} 431 | \begin{large_enum} 432 | \item[--]<1->Get your own (big) data\\\normalsize 433 | Scrape it, clean it\\ 434 | Basics of webscraping\\ 435 | Basics of regular expressions 436 | \item[--]<2->Manage your (big) data\\\normalsize 437 | Store it, organize it, use it\\ 438 | Relational Databases, SQL (SQLite)\\ 439 | Other databases 440 | \item[--]<3->Analyze your (big) data\\\normalsize 441 | Cross-validation\\ 442 | (Not) Maximally Likely\\ 443 | Numerical Optimization 444 | \end{large_enum} 445 | } 446 | \frame{ 447 | \frametitle{Prerequisites} 448 | \begin{large_enum} 449 | \item[--]<1->Basic but important 450 | \item[--]<2->Statistics: 451 | \begin{enumerate} 452 | \item[--]<1->Probability theory, some combinatorics 453 | \item[--]<2->Linear Regression and other standard estimation techniques 454 | \end{enumerate} 455 | \item[--]<3->Computation: 456 | \begin{enumerate} 457 | \item[--]Have written a loop 458 | \item[--]Have written a function 459 | \end{enumerate} 460 | \end{large_enum} 461 | } 462 | \frame{ 463 | \frametitle{Software and Programming} 464 | \begin{large_enum} 465 | \item[--]<1->Open Source\\\normalsize 466 | License fees add up if you are running software on 1000's of machines 467 | \item[--]<2->R 468 | \begin{enumerate} 469 | \item[--]<3->{\tt RStudio} IDE for R, Makes it easier to code. 470 | \item[--]<4->{\tt R Markdown} For Documenting. 471 | \end{enumerate} 472 | \item[--]<6->Python \\\normalsize 473 | \begin{enumerate} 474 | \item[--]<7->Academic Version {\tt Enthought Python Distribution} 475 | \item[--]<8->Eclipse, PyCharm, PyDev, Aptana Studio etc. 
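% A toy power-iteration sketch of the PageRank update from the slides above, r_i = sum over
% in-links j of r_j / d_j. The four-page link graph is made up, and there is no damping factor,
% matching the simplified presentation on the slides rather than the production algorithm.
\begin{comment}
links = {"A": ["B", "C"], "B": ["C"], "C": ["A", "D"], "D": ["C"]}   # page -> pages it links to
pages = list(links)
rank = {p: 1.0 / len(pages) for p in pages}            # start every page with equal importance

for _ in range(50):                                    # iterate until the ranks settle
    new_rank = {p: 0.0 for p in pages}
    for src, outs in links.items():
        for dst in outs:
            new_rank[dst] += rank[src] / len(outs)     # each out-link passes r_src / d_src
    rank = new_rank

print(rank)   # C, which every other page links to, ends up with the highest rank (~0.44)
\end{comment}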
476 | \end{enumerate} 477 | \item[--]<9->SQLite 478 | \end{large_enum} 479 | } 480 | 481 | 482 | \end{document} -------------------------------------------------------------------------------- /ds2/ds2_present_web.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/soodoku/data-science/0af39968f028b6c507945ebdb38bea8771f49303/ds2/ds2_present_web.pdf -------------------------------------------------------------------------------- /ds2/ds2_web.tex: -------------------------------------------------------------------------------- 1 | \documentclass[compress, black]{beamer} 2 | \setbeamercolor{normal text}{fg=black} 3 | \beamertemplatesolidbackgroundcolor{white} 4 | \usecolortheme[named=black]{structure} 5 | \usepackage{caption} 6 | \captionsetup{labelformat=empty} 7 | \setbeamertemplate{navigation symbols}{} 8 | %\usefonttheme{structurebold} 9 | \usepackage[scaled]{helvet} 10 | \renewcommand*\familydefault{\sfdefault} %% Only if the base font of the document is to be sans serif 11 | \usepackage[T1]{fontenc} 12 | \usepackage{setspace} 13 | %\usepackage{beamerthemesplit} 14 | \usepackage{graphics} 15 | \usepackage{hyperref} 16 | \usepackage{graphicx} 17 | \usepackage{verbatim} 18 | \usepackage{amssymb} 19 | \usepackage{wrapfig} 20 | \usefonttheme[onlymath]{serif} 21 | \usepackage{cmbright} 22 | 23 | \def\labelitemi{\textemdash} 24 | \setbeamertemplate{frametitle}{ 25 | \begin{centering} 26 | \vskip15pt 27 | \insertframetitle 28 | \par 29 | \end{centering} 30 | } 31 | \title[DS]{Get, Clean Data} 32 | \author[Sood]{Gaurav~Sood} 33 | \large 34 | \date[2015]{Spring 2015} 35 | \subject{LearnDS} 36 | \begin{document} 37 | \newcommand{\multilineR}[1]{\begin{tabular}[b]{@{}r@{}}#1\end{tabular}} 38 | \newcommand{\multilineL}[1]{\begin{tabular}[b]{@{}l@{}}#1\end{tabular}} 39 | \newcommand{\multilineC}[1]{\begin{tabular}[b]{@{}c@{}}#1\end{tabular}} 40 | 41 | \newenvironment{large_enum}{ 42 | \Large 43 | \begin{itemize} 44 | \setlength{\itemsep}{7pt} 45 | \setlength{\parskip}{0pt} 46 | \setlength{\parsep}{0pt} 47 | }{\end{itemize}} 48 | 49 | \begin{comment} 50 | 51 | setwd(paste0(basedir, "github/data-science/ds2/")) 52 | tools::texi2dvi("ds2_web.tex", pdf=TRUE,clean=TRUE) 53 | setwd(basedir) 54 | 55 | \end{comment} 56 | 57 | \frame 58 | { 59 | \titlepage 60 | } 61 | 62 | \frame{ 63 | \frametitle{Data, Data, Everywhere} 64 | \begin{large_enum} 65 | \item[--]<2->The Famous Five:\\\normalsize 66 | Aural, Visual, Somatic, Gustatory, Olfactory 67 | \item[--]<3->The Social Famous Five:\\\normalsize 68 | What people (like to) hear, see, sense, smell, taste, \ldots 69 | \item[--]<4->Manifest Data:\\\normalsize 70 | Likes, Ratings, Reviews, Comments, Views, Searches \ldots 71 | \item[--]<5->Data about data:\\\normalsize 72 | Location of a tweet, photo, who called whom, \ldots 73 | \item[--]<6->Social data:\\\normalsize 74 | Friend graph, followers, who retweeted, liked,\ldots 75 | \item[--]<7->Data about structure:\\\normalsize\vspace{-.4\baselineskip} Layout of the site, In/out links, \ldots 76 | \end{large_enum} 77 | } 78 | 79 | \frame{ 80 | \frametitle{Collecting Digital Data} 81 | \begin{large_enum} 82 | \item[--]<1->Proprietary Data collections\\\normalsize 83 | Lexis-Nexis, comScore \ldots 84 | \item[--]<2->APIs \\\normalsize 85 | Facebook, \href{http://developer.nytimes.com/docs}{NY Times}, Twitter, Google, FourSquare, \href{dfr.jstor.org}{Jstor}, Zillow \ldots 86 | \item[--]<3->Bulk Downloads \\\normalsize 87 | Wikipedia, data.gov, IMDB, 
Million Song Database, Google n-grams \ldots 88 | \item[--]<4->Scraping 89 | \item[--]<5->Custom Apps\\\normalsize Build custom apps to observe behavior, get (pay) people to download these apps 90 | \end{large_enum} 91 | } 92 | 93 | \frame{ 94 | \frametitle{Scraping} 95 | \begin{large_enum} 96 | \item[--]<1->To analyze data, we typically need structure.\\\normalsize 97 | For instance, same number of rows for each column. 98 | \item[--]<2->But found data often with human readable structure. 99 | \item[--]<2->Copy and paste, type, to a dataset. 100 | \item[--]<4->But error prone, and not scalable. 101 | \item[--]<5->\alert{Idea:} Find the less accessible structure, automate based on it. 102 | \end{large_enum} 103 | } 104 | 105 | \frame{ 106 | \frametitle{Collecting Found Digital Data} 107 | \begin{large_enum} 108 | \item[-]<1->Software 109 | \begin{enumerate} 110 | \item[-]<2->R - Not the best but will do. 111 | \item[-]<3->Python, Ruby, Perl, Java, \ldots 112 | \item[-]<4->30 Digits, 80 Legs, Grepsr \ldots 113 | \end{enumerate} 114 | \item[-]<5->Some things to keep in mind 115 | \begin{enumerate} 116 | \item[-]<6->Check if there is an API, or if data are available for download 117 | \item[-]<7->Play Nice: \\\pause \pause \pause \pause \pause \pause \pause 118 | - Scraper may be disallowed in `robots.txt' \\ \pause 119 | - Build lag between requests. \alert{Make lags random.}\\\pause 120 | - Scrape during off-peak hours 121 | \end{enumerate} 122 | \end{large_enum} 123 | 124 | } 125 | 126 | \begin{frame} 127 | \frametitle{Paper} 128 | \only<1>{\scalebox{0.35}{\includegraphics{ScannedBook.png}}} 129 | \only<2->{ 130 | \begin{large_enum} 131 | \item[-]<2-> Create digital images of paper 132 | \item[-]<3-> Identify colored pixels as characters (OCR) 133 | \item[-]<4-> Software 134 | \begin{enumerate} 135 | \item[-]<5->Adobe Pro., etc. 136 | \item[-]<6->Best in class commercial: Abbyy FineReader \\ 137 | Now has an API 138 | \item[-]<7->Best in class open-source: Tesseract 139 | \end{enumerate} 140 | \item[-]<8->Scrape off recognized characters: pyPdf etc. 
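% A Python 3 sketch of the "Play Nice" items on the Collecting Found Digital Data slide above:
% check robots.txt before fetching and put a random lag between requests. The site, paths, and
% delays are placeholders, and the standard-library urllib is just one way to do it (the
% slides' own snippets are Python 2).
\begin{comment}
import random
import time
import urllib.robotparser
from urllib.request import urlopen

BASE = "http://www.example.com"                        # placeholder site

rp = urllib.robotparser.RobotFileParser(BASE + "/robots.txt")
rp.read()                                              # download and parse robots.txt

for path in ["/results?page=1", "/results?page=2"]:    # placeholder paths
    if not rp.can_fetch("*", BASE + path):             # skip anything the site disallows
        continue
    html = urlopen(BASE + path).read()
    # ... parse html here ...
    time.sleep(random.uniform(2, 6))                   # random lag between requests
\end{comment}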
141 | \item[-]<9->Post-processing 142 | \end{large_enum} 143 | } 144 | \end{frame} 145 | 146 | \begin{frame} 147 | \frametitle{Pictures, Audio, and Video} 148 | \begin{large_enum} 149 | \item[-]<1->Audio (or video with audio) to text: Dragon Dictate, Google transcription 150 | \item[-]<2->Pictures: recognize color, faces 151 | \item[-]<3->Objects in images: \href{http://clarifai.com}{Clarifai} 152 | \item[-]<4->Scrape closed-captions 153 | \end{large_enum} 154 | \end{frame} 155 | 156 | \begin{frame} 157 | \frametitle{Get Others to Work} 158 | \begin{large_enum} 159 | \item[-]<1->Human Computation 160 | \item[-]<2->Amazon.com's Mechanical Turk 161 | \begin{enumerate} 162 | \item[-]<3-> Create Human Intelligence Tasks (HITs) 163 | \item[-]<4-> \href{https://www.mturk.com/mturk/findhits?match=false}{Surveys, transcription, translation, \ldots} 164 | \item[-]<5-> You assess the work and pay out 165 | \end{enumerate} 166 | \item[-]<6->oDesk, Elance, impact sourcing, run your own ads \ldots 167 | \item[-]<7->\href{http://www.google.com/insights/consumersurveys/home}{Google} -- surveys as payment for content 168 | \end{large_enum} 169 | 170 | \end{frame} 171 | 172 | 173 | \begin{frame}[fragile] 174 | \frametitle{Scraping one HTML page in Python} 175 | 176 | Shakespeare's Twelfth Night\\ 177 | Using \href{http://www.crummy.com/software/BeautifulSoup/}{Beautiful Soup} 178 | \small 179 | \begin{enumerate} 180 | \item[]<2->\begin{verbatim} from BeautifulSoup import BeautifulSoup \end{verbatim} 181 | \item[]<3->\begin{verbatim} from urllib import urlopen \end{verbatim} 182 | \item[]<3-> 183 | \item[]<4->\begin{verbatim} url = urlopen('http://bit.ly/1D7wKcH').read()\end{verbatim} 184 | \item[]<5->\begin{verbatim} soup = BeautifulSoup(url)\end{verbatim} 185 | \item[]<6->\begin{verbatim} text = soup.p.contents\end{verbatim} 186 | \item[]<7->\begin{verbatim} print text\end{verbatim} 187 | \end{enumerate} 188 | \end{frame} 189 | 190 | \begin{frame}[fragile] 191 | \frametitle{Getting text from one pdf in Python} 192 | 193 | A Political Ad\\ 194 | Using \href{http://pybrary.net/pyPdf/}{PyPdf} 195 | \small 196 | \begin{enumerate} 197 | \item[]<1->\begin{verbatim} import pyPdf \end{verbatim} 198 | \item[]<2-> 199 | \item[]<2->\begin{verbatim} pdf = pyPdf.PdfFileReader(file('path to pdf', 'rb'))\end{verbatim} 200 | \item[]<3->\begin{verbatim} content = pdf.getPage(0).extractText()\end{verbatim} 201 | \item[]<4->\begin{verbatim} print content\end{verbatim} 202 | \end{enumerate} 203 | \end{frame} 204 | 205 | \begin{frame}[fragile] 206 | \frametitle{Scraping many urls/files to structured data} 207 | \begin{large_enum} 208 | \item[-]<1->Loop, exploiting structure of the urls/file paths\\\normalsize \pause 209 | e.g. \href{http://search.espncricinfo.com/ci/content/match/search.html?search=odi;all=1;page=1}{ESPN URL} 210 | \item[-]<3->Handle errors: if files or urls don't open, what do you do?
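% A Python 3 sketch of the items above: loop over a run of structurally similar URLs (only the
% page number changes, as in the ESPN link) and wrap each request in try/except so one bad page
% does not kill the whole crawl. The URL pattern is a placeholder.
\begin{comment}
from urllib.request import urlopen
from urllib.error import URLError

rows = []
for page in range(1, 6):                                    # pages 1..5
    url = "http://www.example.com/matches?page=%d" % page   # placeholder URL pattern
    try:
        html = urlopen(url, timeout=30).read().decode("utf-8", errors="ignore")
    except URLError as e:
        print("could not open %s: %s" % (url, e))           # log the failure and move on
        continue
    rows.append((page, len(html)))                          # stand-in for real parsing

print(rows)
\end{comment}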
211 | \item[-]<4->To harvest structured data, exploit structure within text 212 | \item[-]<5->Trigger words, HTML tags, \ldots 213 | \end{large_enum} 214 | \end{frame} 215 | 216 | \begin{frame}[fragile] 217 | \frametitle{Exception(al) Handling} 218 | \begin{enumerate} 219 | \item[]<1->\begin{verbatim}try: \end{verbatim} 220 | \item[]<1->\begin{verbatim} pdf = pyPdf.PdfFileReader(file(pdfFile, 'rb')) \end{verbatim} 221 | \item[]<2->\begin{verbatim}except Exception, e:\end{verbatim} 222 | \item[]<2->\begin{verbatim} return 'Cannot Open: %s with error: %s' % 223 | (pdfFile, str(e))\end{verbatim} 224 | \end{enumerate} 225 | \end{frame} 226 | 227 | \begin{frame}[fragile] 228 | \frametitle{Inside the page} 229 | \begin{enumerate} 230 | \item[-]<1->Chrome Developer Tools 231 | \item[-]<2->Quick Tour of HTML 232 | \begin{enumerate} 233 | \item[-]<3->Tags begin with < and end with > 234 | \item[-]<4->Tags usually occur in pairs. Some don't (e.g., img). And they can be nested. 235 | \item[-]<5->\href{https://developer.mozilla.org/en-US/docs/Web/HTML/Element}{Mozilla HTML elements} 236 | \item[-]<6->

<p> is for paragraph 237 | \item[-]<7-><a> is for a link 238 | \item[-]<8->

    ,
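% A short sketch of pulling specific elements out of a page using the tags discussed above:
% <p> for paragraphs, <a> for links. This uses the BeautifulSoup 4 package and Python 3,
% whereas the earlier slide imports the older BeautifulSoup 3 under Python 2; the URL is a
% placeholder.
\begin{comment}
from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen("http://www.example.com").read()            # placeholder URL
soup = BeautifulSoup(html, "html.parser")

paragraphs = [p.get_text() for p in soup.find_all("p")]    # <p> is for paragraphs
links = [a.get("href") for a in soup.find_all("a")]        # <a> is for links
print(paragraphs[:2], links[:5])
\end{comment}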