├── .gitignore ├── README.md ├── Decision Tree.R ├── Social Network Analysis.R ├── Principal Component Analysis.R ├── Support Vector Machine.R ├── Association Rules.R ├── k-Means Clustering.R ├── Random Forest.R ├── Regression Analysis.R ├── References.md └── k-Nearest Neighbors.R /.gitignore: -------------------------------------------------------------------------------- 1 | # History files 2 | .Rhistory 3 | .Rapp.history 4 | 5 | # Session Data files 6 | .RData 7 | 8 | # Example code in package build process 9 | *-Ex.R 10 | 11 | # Output files from R CMD build 12 | /*.tar.gz 13 | 14 | # Output files from R CMD check 15 | /*.Rcheck/ 16 | 17 | # RStudio files 18 | .Rproj.user/ 19 | 20 | # produced vignettes 21 | vignettes/*.html 22 | vignettes/*.pdf 23 | 24 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3 25 | .httr-oauth 26 | 27 | # knitr and R markdown default cache directories 28 | /*_cache/ 29 | /cache/ 30 | 31 | # Temporary files created by R markdown 32 | *.utf8.md 33 | *.knit.md 34 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Numsense! Data Science for the Layman 2 | 3 | Repository of R analysis scripts used in the tutorial book. 4 | 5 | Scripts include explanatory comments for a layman to understand. 6 | 7 | Data sources are cited in [References.md](https://github.com/algobeans/numsense/blob/master/References.md). 8 | 9 | Book download page: http://getbook.at/numsense 10 | 11 | 12 | 13 |

14 | 15 | Sign up below to get bite-sized tutorials delivered to your inbox: 16 | 17 | [![Free Data Science Tutorials](https://annalyzin.files.wordpress.com/2016/08/sign-up-button-transparent-bg-and-cropped.png?w=340&h=55)](http://eepurl.com/cbVFY1) 18 | 19 | *Copyright © 2015-Present Algobeans.com. All rights reserved. Be a cool bean* 20 | -------------------------------------------------------------------------------- /Decision Tree.R: -------------------------------------------------------------------------------- 1 | # load decision tree package 2 | library(rpart) 3 | library(tree) 4 | 5 | # read in data 6 | d <- read.csv("Titanic.csv") 7 | 8 | # sample data 9 | 10 | # Class Age Sex Survived 11 | # First Adult Male Yes 12 | # First Adult Male Yes 13 | # First Adult Male Yes 14 | 15 | ############################ 16 | 17 | # run decision tree 18 | minsplit = 20 19 | fit <- rpart(Survived ~., data = d, method = "class", 20 | minsplit = minsplit, minbucket = round(minsplit/3)) 21 | fit 22 | 23 | printcp(fit) # display the results 24 | plotcp(fit) # visualize cross-validation results 25 | summary(fit) # detailed summary of splits 26 | 27 | # plot tree 28 | plot(fit, uniform=TRUE, margin=0.2) 29 | text(fit, use.n=TRUE, all=TRUE, cex=.8) 30 | 31 | 32 | # nicer plot of tree 33 | library(rattle) 34 | library(rpart.plot) 35 | library(RColorBrewer) 36 | 37 | fancyRpartPlot(fit) 38 | 39 | -------------------------------------------------------------------------------- /Social Network Analysis.R: -------------------------------------------------------------------------------- 1 | d <- read.csv("armstrade.csv") 2 | 3 | # sample data 4 | 5 | # Name United.States Russia India China Germany 6 | # United States 0 0 3004 0 1037 7 | # Russia 16 0 22279 11132 0 8 | # India 0 0 0 0 0 9 | # China 0 0 0 0 0 10 | # Germany 1393 4 285 69 0 11 | # France 545 5 383 1954 128 12 | 13 | # remove name column 14 | d <- d[,-which(colnames(d)=="Name")] 15 | 16 | # reformat table for export to graphing software Gephi 17 | Source <- rep(0,5000) 18 | Target <- rep(0,5000) 19 | Weight <- rep(0,5000) 20 | Type <- rep("Undirected",5000) 21 | edgetable <- data.frame(Source,Target,Weight,Type) 22 | index <- 1; 23 | 24 | for (i in 1:105){ 25 | for (j in (i + 1):106){ 26 | trade <- d[i,j] + d[j,i] 27 | if(trade >= 100){ 28 | edgetable$Source[index] <- i; 29 | edgetable$Target[index] <- j; 30 | edgetable$Weight[index] <- trade; 31 | index <- index + 1; 32 | } 33 | } 34 | } 35 | 36 | edgetable <- edgetable[edgetable$Source>0,] 37 | write.csv(edgetable,"edgestable.csv",row.names=F) 38 | 39 | # import the resulting CSV file into Gephic to generate network 40 | -------------------------------------------------------------------------------- /Principal Component Analysis.R: -------------------------------------------------------------------------------- 1 | library(scales) # plot transparent data points 2 | library(Hmisc) # correlation significance 3 | 4 | 5 | d <- read.csv("foods.csv") 6 | 7 | # sample data 8 | 9 | # food Vitamin.C Energy Fiber Protein Fat Vitamin.A 10 | # Asparagus 5.6 20 2.1 2.20 0.12 38 11 | # Beef 0.0 198 0.0 19.42 12.73 0 12 | # Bluefish 0.0 124 0.0 20.04 4.24 120 13 | 14 | ############################## 15 | 16 | # remove food with missing data 17 | d <- na.omit(d[order(d$food),]) 18 | 19 | # run PCA 20 | dpca <- d[,-c(1,2,4,8)] 21 | dpca <- prcomp(dpca, scale = T) 22 | 23 | # screeplot 24 | plot(dpca$sdev*100/sum(dpca$sdev), type = "l", axes = F, 25 | xlab = "No. of Components", 26 | ylab = "% of Data Spread Accounted for") 27 | axis(side=1, at=c(1:4)) 28 | axis(side=2, at=seq(0, 50, by=10)) 29 | 30 | # plot food 31 | plot(predict(dpca)[,1:2], type='n') 32 | text(predict(dpca)[,1:2], labels=d$food, cex=0.6) 33 | 34 | ############################## 35 | 36 | # correlations 37 | dcor <- d[,-c(1,2,4,8)] 38 | rcorr(as.matrix(dcor), type="pearson") # p values 39 | 40 | ############################## 41 | 42 | # attach PC values 43 | d <- cbind.data.frame(d[,1:2], scale(d[,-c(1:2)])) 44 | d <- cbind.data.frame(d, predict(dpca)[,1:2]) 45 | d$vitCfiber <- d$Vitamin.C..total.ascorbic.acid.mg.Per.100.g + d$Fiber..total.dietary.g.Per.100.g 46 | -------------------------------------------------------------------------------- /Support Vector Machine.R: -------------------------------------------------------------------------------- 1 | library(e1071) # svm 2 | library(caret) # tuning via cross-validation 3 | 4 | # read in data 5 | d <- read.csv("cleveland.csv") 6 | 7 | # sample data 8 | 9 | # age trestbps chol thalach oldpeak num 10 | # 63 145 233 150 2.3 0 11 | # 67 160 286 108 1.5 2 12 | # 67 120 229 129 2.6 1 13 | # 37 130 250 187 3.5 0 14 | # 41 130 204 172 1.4 0 15 | # 56 120 236 178 0.8 0 16 | 17 | 18 | # code outcome as binary 19 | # 0 (healthy) vs. 1 (at risk) 20 | d$num[d$num > 0] = 1 21 | d$num <- factor(d$num) 22 | 23 | # select predictors: 24 | # maximum heart rate achieved (thalach) vs. 25 | # ST depression (ECG) induced by exercise wrt rest (oldpeak) 26 | 27 | 28 | # cross-validation to tune parameters 29 | svmModel <- train(d$num ~ d$age + d$thalach, 30 | data = d, method = "svmRadial", 31 | tuneGrid = expand.grid( 32 | sigma=seq(1,3,by=0.5), 33 | C=seq(0.1,2,by=0.1)), 34 | trControl = trainControl(method = "cv")) 35 | gamma <- svmModel$bestTune$sigma 36 | cost <- svmModel$bestTune$C 37 | 38 | 39 | # predictions 40 | model <- svm(num ~ age + thalach, data = d, 41 | kernel = "radial", 42 | gamma = gamma, 43 | cost = cost) 44 | prop.table(table(fitted(model, d), d$num),2) 45 | 46 | # confusion matrix (types of error) 47 | err <- prop.table(table(fitted(model, d), d$num)) 48 | err 49 | sum(err[1,1], err[2,2]) 50 | 51 | 52 | # plot 53 | plot(model, data = d, 54 | thalach ~ age, 55 | ylab = "Maximum Heart Rate during Exercise", 56 | xlab = "Age", 57 | grid = 50, 58 | symbolPalette = c("green", "black"), 59 | svSymbol = 16, 60 | dataSymbol = 16, 61 | color.palette = terrain.colors) 62 | -------------------------------------------------------------------------------- /Association Rules.R: -------------------------------------------------------------------------------- 1 | set.seed(88) 2 | 3 | # load packages 4 | require(arules) 5 | require(arulesViz) 6 | require(cluster) 7 | 8 | # load data 9 | data("Groceries") 10 | 11 | ################################################ 12 | # Find itemsets with minimum support 13 | ################################################ 14 | 15 | # get list of rules with 2 items 16 | rules <- apriori(Groceries, 17 | parameter = list(maxlen = 2, 18 | minlen = 2, 19 | supp = 0.0005, 20 | conf = 0.009)) 21 | 22 | # get lift values for chosen items 23 | chosenitems <- c("soda", "other vegetables", 24 | "tropical fruit", "whole milk", 25 | "male cosmetics", "sausage", 26 | "yogurt", "canned beer") 27 | 28 | subrules <- subset(rules, 29 | subset = lhs %in% chosenitems) 30 | 31 | # subset rules to those with high support/lift 32 | subrules <- c(head(sort(subrules, by=c("lift", "support")), 30)) 33 | 34 | inspect(subrules) 35 | 36 | # plot rules 37 | plot(subrules, method="graph") 38 | plot(subrules, method="grouped") 39 | 40 | 41 | 42 | ######################################## 43 | # Find clusters of transaction patterns 44 | ######################################## 45 | 46 | # load sample of dataset 47 | s <- sample(Groceries, 2000) 48 | 49 | # get dissimilarity matrix for clustering 50 | d <- dissimilarity(s, method = "Jaccard") 51 | 52 | # perform PAM clustering with 8 clusters 53 | clus <- pam(d, k = 8) 54 | plot(clus) 55 | 56 | # predict labels for the rest of the dataset 57 | clusPred <- predict(s[clus$medoids], Groceries, method = "Jaccard") 58 | clusters <- split(Groceries, clusPred) 59 | 60 | # get clusters of purchases, support threshold = 0.05 61 | itemFrequencyPlot(clusters[[8]], 62 | population = s, 63 | support = 0.05, 64 | ylim=c(0,0.7), 65 | ylab="Support") 66 | 67 | itemFrequencyPlot(clusters[[3]], 68 | population = s, 69 | support = 0.05, 70 | ylim=c(0,0.7), 71 | ylab="Support") 72 | 73 | -------------------------------------------------------------------------------- /k-Means Clustering.R: -------------------------------------------------------------------------------- 1 | # read in data 2 | d <- read.csv("movies.csv") 3 | 4 | # sample data 5 | 6 | # movie openness conscientiousness extraversion agreeableness neuroticism 7 | # Avatar -0.279828202 0.191167006 -0.718972805 0.335909756 -0.578141335 8 | # Big Fish 1.63379515 -0.805515676 -0.718972805 -0.301430683 -0.076482424 9 | # Salt -0.279828202 1.187849689 0.705544973 -0.938771121 -0.578141335 10 | 11 | ################################# 12 | # Reduce no. of dimensions for 2D visualization 13 | ################################# 14 | 15 | # examine correlations between personality traits 16 | cor(d[,-1]) 17 | 18 | # combine correlated traits 19 | d$conExt <- d$conscientiousness + d$extraversion 20 | d$neuOpe <- d$neuroticism + d$openness 21 | 22 | dclus <- d[,c("conExt", "neuOpe")] 23 | rownames(dclus) <- d$movie 24 | 25 | ################################# 26 | # k-means clustering 27 | ################################# 28 | 29 | # standardize variables 30 | dclus <- scale(dclus) 31 | 32 | # Determine number of clusters with a scree plot 33 | set.seed(95) 34 | wss <- (nrow(dclus) - 1) * sum(apply(dclus, 2, var)) 35 | for (i in 2:10) { 36 | clus <- kmeans(dclus, centers = i) 37 | wss[i] <- sum(clus$withinss) 38 | } 39 | plot(1:10, wss, type="b", xlab="Number of Clusters", 40 | ylab="Within Cluster Scatter") 41 | 42 | 43 | 44 | nc <- 3 # number of clusters 45 | 46 | # cluster solution 47 | set.seed(95) 48 | fit <- kmeans(dclus, nc) 49 | 50 | # get cluster means 51 | aggregate(dclus, by=list(fit$cluster), FUN = mean) 52 | 53 | # append cluster assignment 54 | dclusFit <- data.frame(fit$cluster, dclus) 55 | dclusFit[order(dclusFit$fit.cluster),] 56 | 57 | 58 | # color-code clusters 59 | dclusFit$col[dclusFit$fit.cluster == 1] = "#F44268" # red 60 | dclusFit$col[dclusFit$fit.cluster == 2] = "#4274F4" # blue 61 | dclusFit$col[dclusFit$fit.cluster == 3] = "#cc8814" # brown 62 | 63 | 64 | # visualize clusters 65 | plot(dclusFit$neuOpe, dclusFit$conExt, 66 | type = 'n', 67 | xlim = c(-2, 2.5), 68 | ylim = c(-2, 2)) 69 | text(dclusFit$neuOpe, dclusFit$conExt, 70 | rownames(dclusFit), 71 | cex = 0.8, 72 | col = dclusFit$col) 73 | 74 | 75 | 76 | -------------------------------------------------------------------------------- /Random Forest.R: -------------------------------------------------------------------------------- 1 | 2 | library(randomForest) 3 | 4 | # read in data 5 | df <- read.csv("crime.csv") 6 | 7 | # sample data 8 | # Year Month Day DayOfWeek Location is.Violent is.Property DayID 9 | # 2014 1 1 3 1 FALSE FALSE 1 10 | # 2014 1 2 4 1 FALSE FALSE 2 11 | # 2014 1 3 5 1 FALSE FALSE 3 12 | # 2014 1 4 6 1 FALSE FALSE 4 13 | # 2014 1 5 7 1 FALSE FALSE 5 14 | # 2014 1 6 1 1 FALSE FALSE 6 15 | # 2014 1 7 2 1 FALSE FALSE 7 16 | # 2014 1 8 3 1 FALSE FALSE 8 17 | # 2014 1 9 4 1 FALSE FALSE 9 18 | # 2014 1 10 5 1 FALSE FALSE 10 19 | # .. ... ... ... ... ... ... ... ... 20 | # Variables not shown: LastDay (int), LastWeek (int), LastMonth (int), 21 | # Tmax (int), Tmin (int), Tavg (int), Depart (int), Heat (int), Cool 22 | # (int), PrecipTotal (dbl) 23 | 24 | 25 | # getting grid coordinates for crime location 26 | df$X <- as.numeric(df$Location) %% 23 27 | df$Y <- 1 + (as.numeric(df$Location) %/% 23) 28 | 29 | # split into training and test datasets 30 | df.test <- df[df$Year==2016,] 31 | df.train <- df[df$Year<2016,] 32 | 33 | # drop irrelevant variables 34 | dropVar <- c("Year","is.Property","Location") 35 | df.test <- df.test[,-which(names(df.test) %in% dropVar)] 36 | df.train <- df.train[,-which(names(df.train) %in% dropVar)] 37 | actual <- df.test$is.Violent 38 | df.test <- df.test[,-which(names(df.test) %in% c("is.Violent"))] 39 | 40 | # shortlist variables 41 | pred <- c("Month","Tavg","Heat","Cool","PrecipTotal") 42 | df.test <- df.test[,-which(names(df.test) %in% pred)] 43 | df.train <- df.train[,-which(names(df.train) %in% pred)] 44 | 45 | # format variables 46 | df.test$DayOfWeek <- as.factor(df.test$DayOfWeek) 47 | df.train$DayOfWeek <- as.factor(df.train$DayOfWeek) 48 | 49 | 50 | # classification model 51 | rf <- randomForest(y = as.factor(df.train$is.Violent), 52 | x = df.train[,-which(names(df.train) %in% "is.Violent")], 53 | ntree = 10, 54 | nodesize = 500, 55 | importance = T) 56 | 57 | # get importance of each predictor 58 | importance(rf) 59 | varImpPlot(rf) 60 | 61 | # predict crime 62 | pred <- predict(rf,df.test) 63 | 64 | # classification accuracy 65 | table(pred,actual) 66 | 67 | 68 | -------------------------------------------------------------------------------- /Regression Analysis.R: -------------------------------------------------------------------------------- 1 | library(calibrate) 2 | 3 | 4 | ###################################### 5 | # Plotting Function 6 | ###################################### 7 | 8 | graph <- function(x, xname, y, trendline = "Y", 9 | trendEg ="N", trendx = NULL) { 10 | 11 | # plot points 12 | plot(x, y, 13 | 14 | # format plot points 15 | col = "black", pch = 16, cex = 0.7, 16 | 17 | # clean plot labels 18 | xlab = "", 19 | ylab = "") 20 | 21 | # axis labels 22 | title(xlab = xname, line=3, cex.lab=1.2) 23 | title(ylab = "Median House Prices in $1000's", 24 | line=3, cex.lab=1.2) 25 | 26 | 27 | # plot trendline 28 | if (trendline == "Y") { 29 | mod <- lm(y ~ x) 30 | abline(mod, col = "blue", lwd = 5) 31 | 32 | # plot example trend prediction 33 | if (trendEg == "Y") { 34 | 35 | # get default plot axis limits 36 | plotLim <- par('usr') 37 | 38 | # get predicted y 39 | predy <- round(coef(mod)[1] + coef(mod)[2]*(trendx),2) 40 | 41 | # vertical prediction line 42 | segments(trendx, 0, 43 | trendx, predy, 44 | lwd = 1, 45 | lty = "dotted") 46 | 47 | # horizontal prediction line 48 | segments(plotLim[1], predy, 49 | trendx, predy, 50 | lwd = 1, 51 | lty = "dotted") 52 | textxy(X = plotLim[1] + 0.05, 53 | Y = predy + 0.5, 54 | labs = predy, 55 | cex = 1) # label 56 | 57 | } 58 | } 59 | } 60 | 61 | 62 | ###################################### 63 | # Read in and examine the data 64 | ###################################### 65 | 66 | # read in data 67 | d <- read.csv("housePrice.csv") 68 | 69 | # sample data 70 | # crime zone indus river nox room age dist highway 71 | # 10.0623 0 18.1 0 0.584 6.833 94.3 2.0882 24 72 | # 10.6718 0 18.1 0 0.740 6.459 94.8 1.9879 24 73 | # 11.1604 0 18.1 0 0.740 6.629 94.6 2.1247 24 74 | # 12.0482 0 18.1 0 0.614 5.648 87.6 1.9512 24 75 | # 12.2472 0 18.1 0 0.584 5.837 59.7 1.9976 24 76 | # .. ... ... ... ... ... ... ... ... ... 77 | # Variables not shown: tax (int), ptratio (dbl), black (dbl), 78 | # lstat (dbl), medv (dbl) 79 | 80 | 81 | # summarize data 82 | str(d) 83 | summary(d) 84 | 85 | # sort predictors by correlation coeff 86 | sort(cor(d)[,which(names(d) == "medv")]) 87 | 88 | 89 | 90 | ###################################### 91 | # Deriving Coefficients 92 | ###################################### 93 | 94 | # regression on top 2 strongest predictors 95 | regModscale <- lm(scale(medv) ~ scale(room) + scale(log(lstat)), 96 | data = d) 97 | summary(regModscale) 98 | regMod <- lm(medv ~ room + log(lstat), 99 | data = d) 100 | summary(regMod) 101 | # calculate mean error 102 | mean(abs(predict(regMod) - d$medv)) 103 | 104 | 105 | # regression on room 106 | roomMod <- lm(medv ~ room, data = d) 107 | summary(roomMod) 108 | # calculate mean error 109 | mean(abs(predict(roomMod) - d$medv)) 110 | 111 | 112 | # regression on lsat 113 | lsatMod <- lm(medv ~ log(lstat), data = d) 114 | summary(lsatMod) 115 | # calculate mean error 116 | mean(abs(predict(lsatMod) - d$medv)) 117 | 118 | 119 | 120 | # correlation coefficients 121 | cor(d$medv, d$room) 122 | cor(d$medv, d$lstat) 123 | cor(d$medv, log(d$lstat)) 124 | cor(d$medv, predict(regMod)) 125 | 126 | 127 | 128 | ###################################### 129 | # Generate and save plots 130 | ###################################### 131 | 132 | # plot strongest predictors 133 | graph(x = d$room, 134 | xname = "Average No. of Rooms", 135 | y = d$medv, 136 | trendline = "Y", 137 | trendEg = "Y", 138 | trendx = 8) 139 | 140 | 141 | 142 | graph(x = d$lstat, 143 | xname = "% of Population with Low SES", 144 | y = d$medv, 145 | trendline = "Y") 146 | 147 | 148 | 149 | graph(x = log(d$lstat), 150 | xname = "% of Population with Low SES (Log)", 151 | y = d$medv, 152 | trendline = "Y") 153 | 154 | 155 | 156 | # plot predicted regression values against observed values 157 | graph(x = predict(regMod), 158 | xname = "Combined Predictors", 159 | y = d$medv, 160 | trendline = "Y") 161 | 162 | 163 | 164 | 165 | -------------------------------------------------------------------------------- /References.md: -------------------------------------------------------------------------------- 1 | # Data Sources and References 2 | 3 | **Personality of Facebook Users** (*k*-Means Clustering) 4 | 5 | * Stillwell, D., & Kosinski, M. (2012). *myPersonality Project* [Data files and description]. Retrieved from http://mypersonality.org/wiki/doku.php?id=download_databases 6 | * Kosinski, M., Matz, S., Gosling, S., Popov, V., & Stillwell, D. (2015). *Facebook as a Social Science Research Tool: Opportunities, Challenges, Ethical Considerations and Practical Guidelines*. American Psychologist. 7 | 8 | **Food Nutrients** (Principal Component Analysis) 9 | 10 | * Agricultural Research Service, United States Department of Agriculture (2015). *USDA Food Composition Databases* [Data]. Retrieved from https://ndb.nal.usda.gov/ndb/nutrients/index 11 | 12 | **Grocery Transactions** (Association Rules) 13 | 14 | * Dataset is included in the following R package: Hahsler, M., Buchta, C., Gruen, B., & Hornik, K. (2016). *arules: Mining Association Rules and Frequent Itemsets*. R package version 1.5-0. [https://CRAN.R-project.org/package=arules](https://CRAN.R-project.org/package=arules) 15 | * Hahsler, M., Hornik, K., & Reutterer, T. (2006). *Implications of Probabilistic Data Modeling for Mining Association Rules*. In Spiliopoulou, M., Kruse, R., Borgelt, C., Nürnberger, A.,& Gaul, W. Eds., *From Data and Information Analysis to Knowledge Engineering, Studies in Classification, Data Analysis, and Knowledge Organization*. pp. 598-605. Berlin, Germany: Springer-Verlag. 16 | * Hahsler, M., & Chelluboina, S. (2011). Visualizing Association Rules: Introduction to the R-extension Package arulesViz. *R Project Module*, 223-238. 17 | 18 | **Weapon Trade** (Network Graphs) 19 | 20 | * Stockholm International Peace Research Institute (2015). *Trade Registers* [Data]. Retrieved from http://armstrade.sipri.org/armstrade/page/trade_register.php 21 | 22 | **House Prices** (Regression Analysis) 23 | 24 | * Harrison, D., & Rubinfeld, D. (1993). *Boston Housing Data* [Data file and description]. Retrieved from https://archive.ics.uci.edu/ml/datasets/Housing 25 | * Harrison, D., & Rubinfeld, D. (1978). Hedonic Prices and the Demand for Clean Air. *Journal of Environmental Economics and Management*, *5*, 81-102. 26 | 27 | **Wine Composition** (*k*-Nearest Neighbors) 28 | 29 | * Forina, M., et al. (1998). *Wine Recognition Data* [Data file and description]. Retrieved from http://archive.ics.uci.edu/ml/datasets/Wine 30 | * Cortez, P., Cerdeira, A., Almeida, F., Matos, T., & Reis, J. (2009). Modeling Wine Preferences by Data Mining from Physicochemical Properties. *Decision Support Systems*, *47*(4), 547-553. 31 | 32 | **Heart Disease** (Support Vector Machine) 33 | 34 | * Robert Detrano (M.D., Ph.D), from Virginia Medical Center, Long Beach and Cleveland Clinic Foundation (1988). *Heart Disease Database (Cleveland)* [Data file and description]. Retrieved from https://archive.ics.uci.edu/ml/datasets/Heart+Disease 35 | 36 | 37 | * Detrano, R., et al. (1989). International Application of a New Probability Algorithm for the Diagnosis of Coronary Artery Disease. *The American journal of cardiology*, *64*(5), 304-310. 38 | 39 | **Titanic survivors** (Decision Tree) 40 | 41 | * British Board of Trade Inquiry (1990). *Titanic Data* [Data file and description]. Retrieved from [http://www.public.iastate.edu/˜hofmann/data/titanic.html](http://www.public.iastate.edu/˜hofmann/data/titanic.html) 42 | 43 | 44 | * Report on the Loss of the 'Titanic' (S.S.) (1990). British Board of Trade Inquiry Report (reprint), Gloucester, UK: Allan Sutton Publishing and are discussed in Dawson, R. J. M. (1995). The ‘Unusual Episode’ Data Revisited. *Journal of Statistics Education*, *3*(3). 45 | 46 | **Crime in San Francisco** (Random Forest) 47 | 48 | * SF OpenData, City and County of San Francisco (2016). *Crime Incidents* [Data]. Retrieved from https://data.sfgov.org/Public-Safety/Map-Crime-Incidents-from-1-Jan-2003/gxxq-x39z 49 | 50 | **Weather in San Francisco** (Random Forest) 51 | 52 | * National Oceanic and Atmospheric Administration, National Centers for Environmental Information (2016). *Quality Controlled Local Climatological Data (QCLCD)* [Data file and description]. Retrieved from https://www.ncdc.noaa.gov/qclcd/QCLCD?prior=N 53 | 54 | **Handwritten digits** (Neural Networks) 55 | 56 | * LeCun, Y., & Cortes, C. (1998). *The MNIST Database of Handwritten Digits* [Data file and description]. Retrieved from http://yann.lecun.com/exdb/mnist/ 57 | 58 | 59 | * LeCun, Y., Bottou, L., Bengio, Y., & Haffner, P. (1998). Gradient-based Learning Applied to Document Recognition. *Proceedings of the IEEE*, *86*(11), 2278-2324. 60 | 61 | For more open datasets, visit: 62 | 63 | Lichman, M. (2013). *UCI Machine Learning Repository*. Irvine, CA: University of California, School of Information and Computer Science. Retrieved from http://archive.ics.uci.edu/ml -------------------------------------------------------------------------------- /k-Nearest Neighbors.R: -------------------------------------------------------------------------------- 1 | library(class) # for knn 2 | library(scales) # transparent plot points 3 | library(ElemStatLearn) # visualize knn 4 | library(caret) # for cross-validation 5 | 6 | #################################### 7 | 8 | # read in and normalize wine data 9 | dred <- read.csv("winequality-red.csv") 10 | dred$c <- 2 11 | 12 | dwhite <- read.csv("winequality-white.csv") 13 | dwhite$c <- 1 14 | 15 | # sample data 16 | 17 | # fixed.acidity volatile.acidity citric.acid residual.sugar 18 | # 1 7.4 0.70 0.00 1.9 19 | # 2 7.8 0.88 0.00 2.6 20 | # 3 7.8 0.76 0.04 2.3 21 | # 4 11.2 0.28 0.56 1.9 22 | # 5 7.4 0.70 0.00 1.9 23 | # 6 7.4 0.66 0.00 1.8 24 | # Variables not shown: chlorides (dbl), free.sulfur.dioxide 25 | # (dbl), total.sulfur.dioxide (dbl), density (dbl), pH 26 | # (dbl), sulphates (dbl), alcohol (dbl), quality (int), c 27 | # (dbl) 28 | 29 | #################################### 30 | 31 | # combine red and white wine datasets 32 | d <- rbind.data.frame(dred,dwhite) 33 | d$c <- factor(d$c) 34 | d[,1:11] <- data.frame(scale(d[,1:11])) 35 | 36 | plot(d$chlorides, d$total.sulfur.dioxide, 37 | col=d$c) 38 | 39 | # subset data for prettier visualization 40 | d <- d[d$chlorides < 1.8 & d$total.sulfur.dioxide < 2.8,] 41 | 42 | #################################### 43 | 44 | # index variables most predictive of wine type 45 | selvar <- c("total.sulfur.dioxide", "chlorides") 46 | selvari <- which(names(dred) %in% selvar) 47 | 48 | # cross-validation to find k 49 | knnModel <- train(d$c ~ d$total.sulfur.dioxide + d$chlorides, 50 | data = d, method = "knn", 51 | tuneLength = 20, 52 | trControl = trainControl(method = "cv")) 53 | knnModel # cross-valiation suggests that optimal k = 9 54 | 55 | #################################### 56 | 57 | # visualize decision boundary 58 | kgrid <- c(3, 17, 50) # comparing 3 different values of k 59 | 60 | # compile misclassifcation rates 61 | miss <- NULL 62 | 63 | for (k in kgrid) { 64 | 65 | x <- d[,selvari] # training data 66 | g <- d$c # actual class 67 | 68 | # identify misclassifications 69 | knnr <- knn(d[,selvari], d[,selvari], d$c, k = k) 70 | d$misclass <- NA 71 | d$misclass[d$c != knnr] <- 1 72 | miss <- c(miss, sum(d$misclass, na.rm = T)) 73 | 74 | # decision boundary indicated by background color 75 | px1 <- seq(-1.5, 2.0, by = 0.02) 76 | px2 <- seq(-2.0, 3.0, by = 0.02) 77 | gd <- expand.grid(x=px1, y=px2) 78 | 79 | # run knn on all background points to classify them 80 | mod <- knn(x, gd, g, k=k, prob=TRUE) 81 | prob <- attr(mod, "prob") 82 | prob <- ifelse(mod=="1", prob, 1-prob) 83 | prob <- matrix(prob, length(px1), length(px2)) 84 | par(mar=rep(2,4)) 85 | 86 | # save plot 87 | png(filename = paste("k = ", k, " results.png", sep=""), 88 | width = 600, height = 600) 89 | 90 | # plot contours 91 | contour(px1, px2, prob, 92 | levels=0.5, labels="", 93 | xlab="Chlorides", ylab="Sulfur Dioxide", 94 | main=paste(k,"- nearest neighbors"), 95 | axes = F) 96 | 97 | # actual points that are correctly classified 98 | points(x[is.na(d$misclass),], 99 | pch = 16, 100 | cex = 1, 101 | col=alpha(ifelse(g==1, "black", "red"), 0.5)) 102 | 103 | # color background 104 | points(gd, pch=".", 105 | cex=1.2, 106 | col=ifelse(prob>0.5, "black", "red")) 107 | 108 | # color misclassifications 109 | points(d[d$misclass ==1,selvari], 110 | pch = 16, 111 | cex = 1, 112 | col=alpha(ifelse(g[d$misclass==1]==1, "black", "red"), 0.8)) 113 | 114 | box() 115 | 116 | dev.off() 117 | } 118 | 119 | # accuracy rate 120 | 100 - miss/nrow(d)*100 121 | 122 | 123 | 124 | #################################### 125 | 126 | # plot without predictions 127 | 128 | # save plot 129 | png(filename = paste("wine classification knn.png", sep=""), 130 | width = 600, height = 600) 131 | 132 | # actual points that are correctly classified 133 | # plot contours 134 | plot(gd, col=NA, 135 | xlab="Chlorides", ylab="Sulfur Dioxide", 136 | axes = F) 137 | 138 | points(x[is.na(d$misclass),], 139 | pch = 16, 140 | cex = 1, 141 | col=alpha(ifelse(g==1, "black", "red"), 0.5)) 142 | 143 | # color misclassifications 144 | points(d[d$misclass ==1,selvari], 145 | pch = 16, 146 | cex = 1, 147 | col=alpha(ifelse(g[d$misclass==1]==1, "black", "red"), 0.8)) 148 | 149 | 150 | box() 151 | dev.off() 152 | --------------------------------------------------------------------------------