├── .gitignore
├── README.md
├── Decision Tree.R
├── Social Network Analysis.R
├── Principal Component Analysis.R
├── Support Vector Machine.R
├── Association Rules.R
├── k-Means Clustering.R
├── Random Forest.R
├── Regression Analysis.R
├── References.md
└── k-Nearest Neighbors.R
/.gitignore:
--------------------------------------------------------------------------------
1 | # History files
2 | .Rhistory
3 | .Rapp.history
4 |
5 | # Session Data files
6 | .RData
7 |
8 | # Example code in package build process
9 | *-Ex.R
10 |
11 | # Output files from R CMD build
12 | /*.tar.gz
13 |
14 | # Output files from R CMD check
15 | /*.Rcheck/
16 |
17 | # RStudio files
18 | .Rproj.user/
19 |
20 | # produced vignettes
21 | vignettes/*.html
22 | vignettes/*.pdf
23 |
24 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
25 | .httr-oauth
26 |
27 | # knitr and R markdown default cache directories
28 | /*_cache/
29 | /cache/
30 |
31 | # Temporary files created by R markdown
32 | *.utf8.md
33 | *.knit.md
34 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Numsense! Data Science for the Layman
2 |
3 | Repository of R analysis scripts used in the tutorial book.
4 |
5 | Each script includes explanatory comments written for lay readers.
6 |
7 | Data sources are cited in [References.md](https://github.com/algobeans/numsense/blob/master/References.md).
8 |
9 | Book download page: http://getbook.at/numsense
10 |
11 |
12 |
13 |
14 |
15 | Sign up below to get bite-sized tutorials delivered to your inbox:
16 |
17 | [Sign up here](http://eepurl.com/cbVFY1)
18 |
19 | *Copyright © 2015-Present Algobeans.com. All rights reserved. Be a cool bean*
20 |
--------------------------------------------------------------------------------
/Decision Tree.R:
--------------------------------------------------------------------------------
1 | # load decision tree packages
2 | library(rpart)  # fit classification trees
3 | library(tree)   # alternative tree package (not used below)
4 |
5 | # read in data
6 | d <- read.csv("Titanic.csv")
7 |
8 | # sample data
9 |
10 | # Class Age Sex Survived
11 | # First Adult Male Yes
12 | # First Adult Male Yes
13 | # First Adult Male Yes
14 |
15 | ############################
16 |
17 | # run decision tree
18 | minsplit = 20
19 | fit <- rpart(Survived ~., data = d, method = "class",
20 | minsplit = minsplit, minbucket = round(minsplit/3))
21 | fit
22 |
23 | printcp(fit) # display the results
24 | plotcp(fit) # visualize cross-validation results
25 | summary(fit) # detailed summary of splits
26 |
27 | # plot tree
28 | plot(fit, uniform=TRUE, margin=0.2)
29 | text(fit, use.n=TRUE, all=TRUE, cex=.8)
30 |
31 |
32 | # nicer plot of tree
33 | library(rattle)
34 | library(rpart.plot)
35 | library(RColorBrewer)
36 |
37 | fancyRpartPlot(fit)
38 |
39 |
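40 | # Optional: prune the tree to guard against overfitting. A sketch
41 | # using rpart's standard approach -- pick the complexity parameter
42 | # (cp) with the lowest cross-validated error from the cp table.
43 | bestcp <- fit$cptable[which.min(fit$cptable[, "xerror"]), "CP"]
44 | pruned <- prune(fit, cp = bestcp)
45 | fancyRpartPlot(pruned)
46 |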
--------------------------------------------------------------------------------
/Social Network Analysis.R:
--------------------------------------------------------------------------------
1 | d <- read.csv("armstrade.csv")
2 |
3 | # sample data
4 |
5 | # Name United.States Russia India China Germany
6 | # United States 0 0 3004 0 1037
7 | # Russia 16 0 22279 11132 0
8 | # India 0 0 0 0 0
9 | # China 0 0 0 0 0
10 | # Germany 1393 4 285 69 0
11 | # France 545 5 383 1954 128
12 |
13 | # remove name column
14 | d <- d[,-which(colnames(d)=="Name")]
15 |
16 | # reformat table for export to graphing software Gephi
17 | Source <- rep(0,5000)
18 | Target <- rep(0,5000)
19 | Weight <- rep(0,5000)
20 | Type <- rep("Undirected",5000)
21 | edgetable <- data.frame(Source,Target,Weight,Type)
22 | index <- 1
23 | n <- ncol(d)  # trade matrix is square: one row/column per country
24 | for (i in 1:(n - 1)){
25 |   for (j in (i + 1):n){
26 | trade <- d[i,j] + d[j,i]
27 | if(trade >= 100){
28 | edgetable$Source[index] <- i;
29 | edgetable$Target[index] <- j;
30 | edgetable$Weight[index] <- trade;
31 | index <- index + 1;
32 | }
33 | }
34 | }
35 |
36 | edgetable <- edgetable[edgetable$Source>0,]
37 | write.csv(edgetable,"edgestable.csv",row.names=F)
38 |
39 | # import the resulting CSV file into Gephi to generate the network
40 |
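41 | # Optional: a quick preview of the network in R itself, assuming the
42 | # igraph package is installed. Gephi remains the tool for the final
43 | # figures; this sketch only checks that the edge table looks sane.
44 | library(igraph)
45 | g <- graph_from_data_frame(edgetable[, 1:3], directed = FALSE)
46 | plot(g, vertex.size = 5, vertex.label.cex = 0.6,
47 |      edge.width = log10(E(g)$Weight))
48 |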
--------------------------------------------------------------------------------
/Principal Component Analysis.R:
--------------------------------------------------------------------------------
1 | library(scales) # plot transparent data points
2 | library(Hmisc) # correlation significance
3 |
4 |
5 | d <- read.csv("foods.csv")
6 |
7 | # sample data
8 |
9 | # food Vitamin.C Energy Fiber Protein Fat Vitamin.A
10 | # Asparagus 5.6 20 2.1 2.20 0.12 38
11 | # Beef 0.0 198 0.0 19.42 12.73 0
12 | # Bluefish 0.0 124 0.0 20.04 4.24 120
13 |
14 | ##############################
15 |
16 | # remove food with missing data
17 | d <- na.omit(d[order(d$food),])
18 |
19 | # run PCA
20 | dpca <- d[,-c(1,2,4,8)]
21 | dpca <- prcomp(dpca, scale = T)
22 |
23 | # screeplot (each component's share of the total standard deviation)
24 | plot(dpca$sdev*100/sum(dpca$sdev), type = "l", axes = F,
25 | xlab = "No. of Components",
26 | ylab = "% of Data Spread Accounted for")
27 | axis(side=1, at=c(1:4))
28 | axis(side=2, at=seq(0, 50, by=10))
29 |
30 | # plot food
31 | plot(predict(dpca)[,1:2], type='n')
32 | text(predict(dpca)[,1:2], labels=d$food, cex=0.6)
33 |
34 | ##############################
35 |
36 | # correlations
37 | dcor <- d[,-c(1,2,4,8)]
38 | rcorr(as.matrix(dcor), type="pearson") # p values
39 |
40 | ##############################
41 |
42 | # attach PC values
43 | d <- cbind.data.frame(d[,1:2], scale(d[,-c(1:2)]))
44 | d <- cbind.data.frame(d, predict(dpca)[,1:2])
45 | d$vitCfiber <- d$Vitamin.C..total.ascorbic.acid.mg.Per.100.g + d$Fiber..total.dietary.g.Per.100.g
46 |
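47 | # Optional: inspect the loadings to see how much each nutrient
48 | # contributes to the first two components (prcomp stores these in
49 | # the rotation matrix); a biplot shows loadings and foods together.
50 | round(dpca$rotation[, 1:2], 2)
51 | biplot(dpca, cex = 0.6)
52 |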
--------------------------------------------------------------------------------
/Support Vector Machine.R:
--------------------------------------------------------------------------------
1 | library(e1071) # svm
2 | library(caret) # tuning via cross-validation
3 |
4 | # read in data
5 | d <- read.csv("cleveland.csv")
6 |
7 | # sample data
8 |
9 | # age trestbps chol thalach oldpeak num
10 | # 63 145 233 150 2.3 0
11 | # 67 160 286 108 1.5 2
12 | # 67 120 229 129 2.6 1
13 | # 37 130 250 187 3.5 0
14 | # 41 130 204 172 1.4 0
15 | # 56 120 236 178 0.8 0
16 |
17 |
18 | # code outcome as binary
19 | # 0 (healthy) vs. 1 (at risk)
20 | d$num[d$num > 0] = 1
21 | d$num <- factor(d$num)
22 |
23 | # select predictors:
24 | # age vs.
25 | # maximum heart rate achieved (thalach)
26 |
27 |
28 | # cross-validation to tune parameters
29 | svmModel <- train(num ~ age + thalach,
30 | data = d, method = "svmRadial",
31 | tuneGrid = expand.grid(
32 | sigma=seq(1,3,by=0.5),
33 | C=seq(0.1,2,by=0.1)),
34 | trControl = trainControl(method = "cv"))
35 | gamma <- svmModel$bestTune$sigma
36 | cost <- svmModel$bestTune$C
37 |
38 |
39 | # predictions
40 | model <- svm(num ~ age + thalach, data = d,
41 | kernel = "radial",
42 | gamma = gamma,
43 | cost = cost)
44 | prop.table(table(fitted(model), d$num), 2)
45 |
46 | # confusion matrix (proportions of each outcome)
47 | err <- prop.table(table(fitted(model), d$num))
48 | err
49 | sum(err[1,1], err[2,2])  # overall accuracy
50 |
51 |
52 | # plot
53 | plot(model, data = d,
54 | thalach ~ age,
55 | ylab = "Maximum Heart Rate during Exercise",
56 | xlab = "Age",
57 | grid = 50,
58 | symbolPalette = c("green", "black"),
59 | svSymbol = 16,
60 | dataSymbol = 16,
61 | color.palette = terrain.colors)
62 |
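63 | # The error rates above are computed on the same data the model was
64 | # trained on, so they are optimistic. A minimal holdout sketch,
65 | # assuming a 70/30 split is acceptable for this dataset:
66 | set.seed(1)
67 | trainIdx <- sample(nrow(d), round(0.7 * nrow(d)))
68 | holdModel <- svm(num ~ age + thalach, data = d[trainIdx, ],
69 |                  kernel = "radial", gamma = gamma, cost = cost)
70 | holdPred <- predict(holdModel, d[-trainIdx, ])
71 | prop.table(table(holdPred, d$num[-trainIdx]))
72 |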
--------------------------------------------------------------------------------
/Association Rules.R:
--------------------------------------------------------------------------------
1 | set.seed(88)
2 |
3 | # load packages
4 | require(arules)
5 | require(arulesViz)
6 | require(cluster)
7 |
8 | # load data
9 | data("Groceries")
10 |
11 | ################################################
12 | # Find itemsets with minimum support
13 | ################################################
14 |
15 | # get list of rules with 2 items
16 | rules <- apriori(Groceries,
17 | parameter = list(maxlen = 2,
18 | minlen = 2,
19 | supp = 0.0005,
20 | conf = 0.009))
21 |
22 | # get lift values for chosen items
23 | chosenitems <- c("soda", "other vegetables",
24 | "tropical fruit", "whole milk",
25 | "male cosmetics", "sausage",
26 | "yogurt", "canned beer")
27 |
28 | subrules <- subset(rules,
29 | subset = lhs %in% chosenitems)
30 |
31 | # subset rules to those with high support/lift
32 | subrules <- head(sort(subrules, by = c("lift", "support")), 30)
33 |
34 | inspect(subrules)
35 |
36 | # plot rules
37 | plot(subrules, method="graph")
38 | plot(subrules, method="grouped")
39 |
40 |
41 |
42 | ########################################
43 | # Find clusters of transaction patterns
44 | ########################################
45 |
46 | # load sample of dataset
47 | s <- sample(Groceries, 2000)
48 |
49 | # get dissimilarity matrix for clustering
50 | d <- dissimilarity(s, method = "Jaccard")
51 |
52 | # perform PAM clustering with 8 clusters
53 | clus <- pam(d, k = 8)
54 | plot(clus)
55 |
56 | # predict labels for the rest of the dataset
57 | clusPred <- predict(s[clus$medoids], Groceries, method = "Jaccard")
58 | clusters <- split(Groceries, clusPred)
59 |
60 | # get clusters of purchases, support threshold = 0.05
61 | itemFrequencyPlot(clusters[[8]],
62 | population = s,
63 | support = 0.05,
64 | ylim=c(0,0.7),
65 | ylab="Support")
66 |
67 | itemFrequencyPlot(clusters[[3]],
68 | population = s,
69 | support = 0.05,
70 | ylim=c(0,0.7),
71 | ylab="Support")
72 |
73 |
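74 | # Optional: rather than fixing k = 8 upfront, compare candidate
75 | # cluster counts by average silhouette width (higher is better).
76 | # A sketch, assuming k between 2 and 10 is a sensible range here:
77 | sil <- sapply(2:10, function(k) pam(d, k = k)$silinfo$avg.width)
78 | plot(2:10, sil, type = "b", xlab = "No. of Clusters",
79 |      ylab = "Avg. Silhouette Width")
80 |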
--------------------------------------------------------------------------------
/k-Means Clustering.R:
--------------------------------------------------------------------------------
1 | # read in data
2 | d <- read.csv("movies.csv")
3 |
4 | # sample data
5 |
6 | # movie openness conscientiousness extraversion agreeableness neuroticism
7 | # Avatar -0.279828202 0.191167006 -0.718972805 0.335909756 -0.578141335
8 | # Big Fish 1.63379515 -0.805515676 -0.718972805 -0.301430683 -0.076482424
9 | # Salt -0.279828202 1.187849689 0.705544973 -0.938771121 -0.578141335
10 |
11 | #################################
12 | # Reduce no. of dimensions for 2D visualization
13 | #################################
14 |
15 | # examine correlations between personality traits
16 | cor(d[,-1])
17 |
18 | # combine correlated traits
19 | d$conExt <- d$conscientiousness + d$extraversion
20 | d$neuOpe <- d$neuroticism + d$openness
21 |
22 | dclus <- d[,c("conExt", "neuOpe")]
23 | rownames(dclus) <- d$movie
24 |
25 | #################################
26 | # k-means clustering
27 | #################################
28 |
29 | # standardize variables
30 | dclus <- scale(dclus)
31 |
32 | # Determine number of clusters with a scree plot
33 | set.seed(95)
34 | wss <- (nrow(dclus) - 1) * sum(apply(dclus, 2, var))
35 | for (i in 2:10) {
36 | clus <- kmeans(dclus, centers = i)
37 | wss[i] <- sum(clus$withinss)
38 | }
39 | plot(1:10, wss, type="b", xlab="Number of Clusters",
40 | ylab="Within Cluster Scatter")
41 |
42 |
43 |
44 | nc <- 3 # number of clusters
45 |
46 | # cluster solution
47 | set.seed(95)
48 | fit <- kmeans(dclus, nc)
49 |
50 | # get cluster means
51 | aggregate(dclus, by=list(fit$cluster), FUN = mean)
52 |
53 | # append cluster assignment
54 | dclusFit <- data.frame(fit$cluster, dclus)
55 | dclusFit[order(dclusFit$fit.cluster),]
56 |
57 |
58 | # color-code clusters
59 | dclusFit$col[dclusFit$fit.cluster == 1] = "#F44268" # red
60 | dclusFit$col[dclusFit$fit.cluster == 2] = "#4274F4" # blue
61 | dclusFit$col[dclusFit$fit.cluster == 3] = "#cc8814" # brown
62 |
63 |
64 | # visualize clusters
65 | plot(dclusFit$neuOpe, dclusFit$conExt,
66 | type = 'n',
67 | xlim = c(-2, 2.5),
68 | ylim = c(-2, 2))
69 | text(dclusFit$neuOpe, dclusFit$conExt,
70 | rownames(dclusFit),
71 | cex = 0.8,
72 | col = dclusFit$col)
73 |
74 |
75 |
76 |
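77 | # Optional sanity check on the 3-cluster solution: silhouette widths
78 | # near 1 indicate well-separated clusters, near 0 indicate overlap.
79 | # A sketch using the cluster package:
80 | library(cluster)
81 | plot(silhouette(fit$cluster, dist(dclus)))
82 |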
--------------------------------------------------------------------------------
/Random Forest.R:
--------------------------------------------------------------------------------
1 |
2 | library(randomForest)
3 |
4 | # read in data
5 | df <- read.csv("crime.csv")
6 |
7 | # sample data
8 | # Year Month Day DayOfWeek Location is.Violent is.Property DayID
9 | # 2014 1 1 3 1 FALSE FALSE 1
10 | # 2014 1 2 4 1 FALSE FALSE 2
11 | # 2014 1 3 5 1 FALSE FALSE 3
12 | # 2014 1 4 6 1 FALSE FALSE 4
13 | # 2014 1 5 7 1 FALSE FALSE 5
14 | # 2014 1 6 1 1 FALSE FALSE 6
15 | # 2014 1 7 2 1 FALSE FALSE 7
16 | # 2014 1 8 3 1 FALSE FALSE 8
17 | # 2014 1 9 4 1 FALSE FALSE 9
18 | # 2014 1 10 5 1 FALSE FALSE 10
19 | # .. ... ... ... ... ... ... ... ...
20 | # Variables not shown: LastDay (int), LastWeek (int), LastMonth (int),
21 | # Tmax (int), Tmin (int), Tavg (int), Depart (int), Heat (int), Cool
22 | # (int), PrecipTotal (dbl)
23 |
24 |
25 | # getting grid coordinates for crime location
26 | df$X <- as.numeric(df$Location) %% 23
27 | df$Y <- 1 + (as.numeric(df$Location) %/% 23)
28 |
29 | # split into training and test datasets
30 | df.test <- df[df$Year==2016,]
31 | df.train <- df[df$Year<2016,]
32 |
33 | # drop irrelevant variables
34 | dropVar <- c("Year","is.Property","Location")
35 | df.test <- df.test[,-which(names(df.test) %in% dropVar)]
36 | df.train <- df.train[,-which(names(df.train) %in% dropVar)]
37 | actual <- df.test$is.Violent
38 | df.test <- df.test[,-which(names(df.test) %in% c("is.Violent"))]
39 |
40 | # drop less informative predictors
41 | dropPred <- c("Month","Tavg","Heat","Cool","PrecipTotal")
42 | df.test <- df.test[,-which(names(df.test) %in% dropPred)]
43 | df.train <- df.train[,-which(names(df.train) %in% dropPred)]
44 |
45 | # format variables
46 | df.test$DayOfWeek <- as.factor(df.test$DayOfWeek)
47 | df.train$DayOfWeek <- as.factor(df.train$DayOfWeek)
48 |
49 |
50 | # classification model
51 | rf <- randomForest(y = as.factor(df.train$is.Violent),
52 | x = df.train[,-which(names(df.train) %in% "is.Violent")],
53 | ntree = 10,
54 | nodesize = 500,
55 | importance = T)
56 |
57 | # get importance of each predictor
58 | importance(rf)
59 | varImpPlot(rf)
60 |
61 | # predict crime
62 | pred <- predict(rf,df.test)
63 |
64 | # classification accuracy
65 | table(pred,actual)
66 |
67 |
68 |
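69 | # Overall accuracy from the confusion matrix above -- a small
70 | # helper, not part of the original analysis:
71 | cm <- table(pred, actual)
72 | sum(diag(cm)) / sum(cm)
73 |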
--------------------------------------------------------------------------------
/Regression Analysis.R:
--------------------------------------------------------------------------------
1 | library(calibrate)
2 |
3 |
4 | ######################################
5 | # Plotting Function
6 | ######################################
7 |
8 | graph <- function(x, xname, y, trendline = "Y",
9 |                   trendEg = "N", trendx = NULL) {
10 |
11 | # plot points
12 | plot(x, y,
13 |
14 | # format plot points
15 | col = "black", pch = 16, cex = 0.7,
16 |
17 | # clean plot labels
18 | xlab = "",
19 | ylab = "")
20 |
21 | # axis labels
22 | title(xlab = xname, line=3, cex.lab=1.2)
23 | title(ylab = "Median House Prices in $1000's",
24 | line=3, cex.lab=1.2)
25 |
26 |
27 | # plot trendline
28 | if (trendline == "Y") {
29 | mod <- lm(y ~ x)
30 | abline(mod, col = "blue", lwd = 5)
31 |
32 | # plot example trend prediction
33 | if (trendEg == "Y") {
34 |
35 | # get default plot axis limits
36 | plotLim <- par('usr')
37 |
38 | # get predicted y
39 | predy <- round(coef(mod)[1] + coef(mod)[2]*(trendx),2)
40 |
41 | # vertical prediction line
42 | segments(trendx, 0,
43 | trendx, predy,
44 | lwd = 1,
45 | lty = "dotted")
46 |
47 | # horizontal prediction line
48 | segments(plotLim[1], predy,
49 | trendx, predy,
50 | lwd = 1,
51 | lty = "dotted")
52 | textxy(X = plotLim[1] + 0.05,
53 | Y = predy + 0.5,
54 | labs = predy,
55 | cex = 1) # label
56 |
57 | }
58 | }
59 | }
60 |
61 |
62 | ######################################
63 | # Read in and examine the data
64 | ######################################
65 |
66 | # read in data
67 | d <- read.csv("housePrice.csv")
68 |
69 | # sample data
70 | # crime zone indus river nox room age dist highway
71 | # 10.0623 0 18.1 0 0.584 6.833 94.3 2.0882 24
72 | # 10.6718 0 18.1 0 0.740 6.459 94.8 1.9879 24
73 | # 11.1604 0 18.1 0 0.740 6.629 94.6 2.1247 24
74 | # 12.0482 0 18.1 0 0.614 5.648 87.6 1.9512 24
75 | # 12.2472 0 18.1 0 0.584 5.837 59.7 1.9976 24
76 | # .. ... ... ... ... ... ... ... ... ...
77 | # Variables not shown: tax (int), ptratio (dbl), black (dbl),
78 | # lstat (dbl), medv (dbl)
79 |
80 |
81 | # summarize data
82 | str(d)
83 | summary(d)
84 |
85 | # sort predictors by correlation coeff
86 | sort(cor(d)[,which(names(d) == "medv")])
87 |
88 |
89 |
90 | ######################################
91 | # Deriving Coefficients
92 | ######################################
93 |
94 | # regression on top 2 strongest predictors
95 | regModscale <- lm(scale(medv) ~ scale(room) + scale(log(lstat)),
96 | data = d)
97 | summary(regModscale)
98 | regMod <- lm(medv ~ room + log(lstat),
99 | data = d)
100 | summary(regMod)
101 | # calculate mean error
102 | mean(abs(predict(regMod) - d$medv))
103 |
104 |
105 | # regression on room
106 | roomMod <- lm(medv ~ room, data = d)
107 | summary(roomMod)
108 | # calculate mean error
109 | mean(abs(predict(roomMod) - d$medv))
110 |
111 |
112 | # regression on lstat
113 | lstatMod <- lm(medv ~ log(lstat), data = d)
114 | summary(lstatMod)
115 | # calculate mean error
116 | mean(abs(predict(lstatMod) - d$medv))
117 |
118 |
119 |
120 | # correlation coefficients
121 | cor(d$medv, d$room)
122 | cor(d$medv, d$lstat)
123 | cor(d$medv, log(d$lstat))
124 | cor(d$medv, predict(regMod))
125 |
126 |
127 |
128 | ######################################
129 | # Generate and save plots
130 | ######################################
131 |
132 | # plot strongest predictors
133 | graph(x = d$room,
134 | xname = "Average No. of Rooms",
135 | y = d$medv,
136 | trendline = "Y",
137 | trendEg = "Y",
138 | trendx = 8)
139 |
140 |
141 |
142 | graph(x = d$lstat,
143 | xname = "% of Population with Low SES",
144 | y = d$medv,
145 | trendline = "Y")
146 |
147 |
148 |
149 | graph(x = log(d$lstat),
150 | xname = "% of Population with Low SES (Log)",
151 | y = d$medv,
152 | trendline = "Y")
153 |
154 |
155 |
156 | # plot predicted regression values against observed values
157 | graph(x = predict(regMod),
158 | xname = "Combined Predictors",
159 | y = d$medv,
160 | trendline = "Y")
161 |
162 |
163 |
164 |
165 |
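166 | # Optional: standard lm diagnostic plots (residuals vs. fitted, Q-Q,
167 | # scale-location, leverage) as a quick check of model assumptions.
168 | par(mfrow = c(2, 2))
169 | plot(regMod)
170 | par(mfrow = c(1, 1))
171 |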
--------------------------------------------------------------------------------
/References.md:
--------------------------------------------------------------------------------
1 | # Data Sources and References
2 |
3 | **Personality of Facebook Users** (*k*-Means Clustering)
4 |
5 | * Stillwell, D., & Kosinski, M. (2012). *myPersonality Project* [Data files and description]. Retrieved from http://mypersonality.org/wiki/doku.php?id=download_databases
6 | * Kosinski, M., Matz, S., Gosling, S., Popov, V., & Stillwell, D. (2015). *Facebook as a Social Science Research Tool: Opportunities, Challenges, Ethical Considerations and Practical Guidelines*. American Psychologist.
7 |
8 | **Food Nutrients** (Principal Component Analysis)
9 |
10 | * Agricultural Research Service, United States Department of Agriculture (2015). *USDA Food Composition Databases* [Data]. Retrieved from https://ndb.nal.usda.gov/ndb/nutrients/index
11 |
12 | **Grocery Transactions** (Association Rules)
13 |
14 | * Dataset is included in the following R package: Hahsler, M., Buchta, C., Gruen, B., & Hornik, K. (2016). *arules: Mining Association Rules and Frequent Itemsets*. R package version 1.5-0. [https://CRAN.R-project.org/package=arules](https://CRAN.R-project.org/package=arules)
15 | * Hahsler, M., Hornik, K., & Reutterer, T. (2006). Implications of Probabilistic Data Modeling for Mining Association Rules. In Spiliopoulou, M., Kruse, R., Borgelt, C., Nürnberger, A., & Gaul, W. (Eds.), *From Data and Information Analysis to Knowledge Engineering, Studies in Classification, Data Analysis, and Knowledge Organization* (pp. 598-605). Berlin, Germany: Springer-Verlag.
16 | * Hahsler, M., & Chelluboina, S. (2011). Visualizing Association Rules: Introduction to the R-extension Package arulesViz. *R Project Module*, 223-238.
17 |
18 | **Weapon Trade** (Network Graphs)
19 |
20 | * Stockholm International Peace Research Institute (2015). *Trade Registers* [Data]. Retrieved from http://armstrade.sipri.org/armstrade/page/trade_register.php
21 |
22 | **House Prices** (Regression Analysis)
23 |
24 | * Harrison, D., & Rubinfeld, D. (1993). *Boston Housing Data* [Data file and description]. Retrieved from https://archive.ics.uci.edu/ml/datasets/Housing
25 | * Harrison, D., & Rubinfeld, D. (1978). Hedonic Prices and the Demand for Clean Air. *Journal of Environmental Economics and Management*, *5*, 81-102.
26 |
27 | **Wine Composition** (*k*-Nearest Neighbors)
28 |
29 | * Forina, M., et al. (1998). *Wine Recognition Data* [Data file and description]. Retrieved from http://archive.ics.uci.edu/ml/datasets/Wine
30 | * Cortez, P., Cerdeira, A., Almeida, F., Matos, T., & Reis, J. (2009). Modeling Wine Preferences by Data Mining from Physicochemical Properties. *Decision Support Systems*, *47*(4), 547-553.
31 |
32 | **Heart Disease** (Support Vector Machine)
33 |
34 | * Detrano, R. (1988). *Heart Disease Database (Cleveland)* [Data file and description]. V.A. Medical Center, Long Beach and Cleveland Clinic Foundation. Retrieved from https://archive.ics.uci.edu/ml/datasets/Heart+Disease
35 |
36 |
37 | * Detrano, R., et al. (1989). International Application of a New Probability Algorithm for the Diagnosis of Coronary Artery Disease. *The American Journal of Cardiology*, *64*(5), 304-310.
38 |
39 | **Titanic survivors** (Decision Tree)
40 |
41 | * British Board of Trade Inquiry (1990). *Titanic Data* [Data file and description]. Retrieved from [http://www.public.iastate.edu/~hofmann/data/titanic.html](http://www.public.iastate.edu/~hofmann/data/titanic.html)
42 |
43 |
44 | * British Board of Trade (1990). *Report on the Loss of the 'Titanic' (S.S.)*. British Board of Trade Inquiry Report (reprint). Gloucester, UK: Allan Sutton Publishing. The data are discussed in Dawson, R. J. M. (1995). The 'Unusual Episode' Data Revisited. *Journal of Statistics Education*, *3*(3).
45 |
46 | **Crime in San Francisco** (Random Forest)
47 |
48 | * SF OpenData, City and County of San Francisco (2016). *Crime Incidents* [Data]. Retrieved from https://data.sfgov.org/Public-Safety/Map-Crime-Incidents-from-1-Jan-2003/gxxq-x39z
49 |
50 | **Weather in San Francisco** (Random Forest)
51 |
52 | * National Oceanic and Atmospheric Administration, National Centers for Environmental Information (2016). *Quality Controlled Local Climatological Data (QCLCD)* [Data file and description]. Retrieved from https://www.ncdc.noaa.gov/qclcd/QCLCD?prior=N
53 |
54 | **Handwritten digits** (Neural Networks)
55 |
56 | * LeCun, Y., & Cortes, C. (1998). *The MNIST Database of Handwritten Digits* [Data file and description]. Retrieved from http://yann.lecun.com/exdb/mnist/
57 |
58 |
59 | * LeCun, Y., Bottou, L., Bengio, Y., & Haffner, P. (1998). Gradient-based Learning Applied to Document Recognition. *Proceedings of the IEEE*, *86*(11), 2278-2324.
60 |
61 | For more open datasets, visit:
62 |
63 | Lichman, M. (2013). *UCI Machine Learning Repository*. Irvine, CA: University of California, School of Information and Computer Science. Retrieved from http://archive.ics.uci.edu/ml
--------------------------------------------------------------------------------
/k-Nearest Neighbors.R:
--------------------------------------------------------------------------------
1 | library(class) # for knn
2 | library(scales) # transparent plot points
3 | library(ElemStatLearn) # visualize knn
4 | library(caret) # for cross-validation
5 |
6 | ####################################
7 |
8 | # read in and normalize wine data
9 | dred <- read.csv("winequality-red.csv")
10 | dred$c <- 2
11 |
12 | dwhite <- read.csv("winequality-white.csv")
13 | dwhite$c <- 1
14 |
15 | # sample data
16 |
17 | # fixed.acidity volatile.acidity citric.acid residual.sugar
18 | # 1 7.4 0.70 0.00 1.9
19 | # 2 7.8 0.88 0.00 2.6
20 | # 3 7.8 0.76 0.04 2.3
21 | # 4 11.2 0.28 0.56 1.9
22 | # 5 7.4 0.70 0.00 1.9
23 | # 6 7.4 0.66 0.00 1.8
24 | # Variables not shown: chlorides (dbl), free.sulfur.dioxide
25 | # (dbl), total.sulfur.dioxide (dbl), density (dbl), pH
26 | # (dbl), sulphates (dbl), alcohol (dbl), quality (int), c
27 | # (dbl)
28 |
29 | ####################################
30 |
31 | # combine red and white wine datasets
32 | d <- rbind.data.frame(dred,dwhite)
33 | d$c <- factor(d$c)
34 | d[,1:11] <- data.frame(scale(d[,1:11]))
35 |
36 | plot(d$chlorides, d$total.sulfur.dioxide,
37 | col=d$c)
38 |
39 | # subset data for prettier visualization
40 | d <- d[d$chlorides < 1.8 & d$total.sulfur.dioxide < 2.8,]
41 |
42 | ####################################
43 |
44 | # index variables most predictive of wine type
45 | selvar <- c("total.sulfur.dioxide", "chlorides")
46 | selvari <- which(names(d) %in% selvar)
47 |
48 | # cross-validation to find k
49 | knnModel <- train(c ~ total.sulfur.dioxide + chlorides,
50 |                   data = d, method = "knn",
51 |                   tuneLength = 20,
52 |                   trControl = trainControl(method = "cv"))
53 | knnModel # cross-validation suggests that optimal k = 9
54 |
55 | ####################################
56 |
57 | # visualize decision boundary
58 | kgrid <- c(3, 17, 50) # comparing 3 different values of k
59 |
60 | # compile misclassification rates
61 | miss <- NULL
62 |
63 | for (k in kgrid) {
64 |
65 | x <- d[,selvari] # training data
66 | g <- d$c # actual class
67 |
68 | # identify misclassifications
69 | knnr <- knn(d[,selvari], d[,selvari], d$c, k = k)
70 | d$misclass <- NA
71 | d$misclass[d$c != knnr] <- 1
72 | miss <- c(miss, sum(d$misclass, na.rm = T))
73 |
74 | # decision boundary indicated by background color
75 | px1 <- seq(-1.5, 2.0, by = 0.02)
76 | px2 <- seq(-2.0, 3.0, by = 0.02)
77 | gd <- expand.grid(x=px1, y=px2)
78 |
79 | # run knn on all background points to classify them
80 | mod <- knn(x, gd, g, k=k, prob=TRUE)
81 | prob <- attr(mod, "prob")
82 | prob <- ifelse(mod=="1", prob, 1-prob)
83 | prob <- matrix(prob, length(px1), length(px2))
84 | par(mar=rep(2,4))
85 |
86 | # save plot
87 | png(filename = paste("k = ", k, " results.png", sep=""),
88 | width = 600, height = 600)
89 |
90 | # plot contours
91 | contour(px1, px2, prob,
92 | levels=0.5, labels="",
93 | xlab="Chlorides", ylab="Sulfur Dioxide",
94 | main=paste(k,"- nearest neighbors"),
95 | axes = F)
96 |
97 | # actual points that are correctly classified
98 | points(x[is.na(d$misclass),],
99 | pch = 16,
100 | cex = 1,
101 |        col=alpha(ifelse(g[is.na(d$misclass)]==1, "black", "red"), 0.5))
102 |
103 | # color background
104 | points(gd, pch=".",
105 | cex=1.2,
106 | col=ifelse(prob>0.5, "black", "red"))
107 |
108 | # color misclassifications
109 | points(d[which(d$misclass == 1), selvari],
110 |        pch = 16,
111 |        cex = 1,
112 |        col=alpha(ifelse(g[which(d$misclass == 1)]==1, "black", "red"), 0.8))
113 |
114 | box()
115 |
116 | dev.off()
117 | }
118 |
119 | # accuracy rate
120 | 100 - miss/nrow(d)*100
121 |
122 |
123 |
124 | ####################################
125 |
126 | # plot without predictions
127 |
128 | # save plot
129 | png(filename = paste("wine classification knn.png", sep=""),
130 | width = 600, height = 600)
131 |
132 | # actual points that are correctly classified
133 | # plot contours
134 | plot(gd, col=NA,
135 | xlab="Chlorides", ylab="Sulfur Dioxide",
136 | axes = F)
137 |
138 | points(x[is.na(d$misclass),],
139 | pch = 16,
140 | cex = 1,
141 |        col=alpha(ifelse(g[is.na(d$misclass)]==1, "black", "red"), 0.5))
142 |
143 | # color misclassifications
144 | points(d[which(d$misclass == 1), selvari],
145 |        pch = 16,
146 |        cex = 1,
147 |        col=alpha(ifelse(g[which(d$misclass == 1)]==1, "black", "red"), 0.8))
148 |
149 |
150 | box()
151 | dev.off()
152 |
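153 | # Optional: confusion matrix at the cross-validated k (k = 9 above).
154 | # A sketch; note this still classifies the training points themselves.
155 | knnBest <- knn(d[, selvari], d[, selvari], d$c, k = 9)
156 | table(predicted = knnBest, actual = d$c)
157 |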
--------------------------------------------------------------------------------