├── .gitignore ├── Data Resources.MD ├── More_about_colab.ipynb ├── Part_1_Intro ├── 2-1-DataExploration-airquality.Rmd ├── 2-1-DataExploration-airquality.pdf ├── 2-2-DataExploration-titanic.Rmd ├── 2-2-DataExploration-titanic.pdf ├── 2-3-DataExploration-Boston.Rmd ├── 2-3-DataExploration-Boston.pdf ├── 2-4-Control.Rmd ├── 2-4-Control.pdf ├── 3-1-DataVizX.Rmd ├── 3-1-DataVizX.pdf ├── 3-2-DataVizXY.Rmd ├── 3-2-DataVizXY.pdf ├── 4-1-homework_outline.Rmd ├── README.md └── titanic.csv ├── Part_2_Linear_Models ├── 5-1_LinReg_women.Rmd ├── 5-1_LinReg_women.pdf ├── 5-2_LinReg_Chick.Rmd ├── 5-2_LinReg_Chick.pdf ├── 5-3-LinReg-ridge-airq.Rmd ├── 5-3-LinReg-ridge-airq.pdf ├── 6_1_LogReg-plasma.Rmd ├── 6_1_LogReg-plasma.pdf ├── 6_2_LogReg-titanic.Rmd ├── 6_2_LogReg-titanic.pdf ├── 6_3_LogReg-3-iris.Rmd ├── 6_3_LogReg-3-iris.pdf ├── 6_4_logreg-scratch.Rmd ├── 6_4_logreg-scratch.pdf ├── 7_1_NBayes-titanic.Rmd ├── 7_1_NBayes-titanic.pdf ├── 7_2_NBayes-scratch.Rmd ├── 7_2_NBayes-scratch.pdf ├── 8-1-features.Rmd ├── 8-1-features.pdf ├── README.md └── titanic.csv ├── Part_3_Modern_R ├── 09_ggplot.Rmd ├── 09_ggplot.pdf ├── 09_tidy.Rmd ├── 09_tidy.pdf ├── Phishing.Rmd ├── Phishing.pdf ├── mcc.png └── readme.md ├── Part_4_Search_Similarity ├── 12_1_kNN_class.Rmd ├── 12_1_kNN_class.pdf ├── 12_2_kNN_reg.Rmd ├── 12_2_kNN_reg.pdf ├── 12_2_kNN_regression.Rmd ├── 12_2_kNN_regression.pdf ├── 12_3_kNN_reg_cv.Rmd ├── 12_3_kNN_reg_cv.pdf ├── 13-1-cluster_kmean_iris.Rmd ├── 13-1-cluster_kmean_iris.pdf ├── 13-2-cluster_kmean_wine.Rmd ├── 13-2-cluster_kmean_wine.pdf ├── 13-3-kmean-k_synthetic.Rmd ├── 13-3-kmean-k_synthetic.pdf ├── 13-4-cluster_hier.Rmd ├── 13-4-cluster_hier.pdf ├── 14_DT_boston.Rmd ├── 14_DT_boston.pdf ├── 14_DT_iris.Rmd ├── 14_DT_iris.pdf ├── 15-PCA-LDA.Rmd ├── 15-PCA-LDA.pdf ├── readme.md └── wine_all.csv ├── Part_5_Kernel_Ensemble ├── 16-SVM-1_iris.html ├── 16-SVM-1_iris.pdf ├── 16-SVM-2_housing.Rmd ├── 16-SVM-2_housing.pdf ├── 17_ensemble_phishing.Rmd ├── 17_ensemble_phishing.pdf ├── readme.md ├── xgboost1.Rmd └── xgboost1.html ├── Part_6_Python_ML ├── Chapter_19 │ ├── 0 - What is a jupyter notebook_.ipynb │ ├── 1 - Getting started with Python 3.ipynb │ ├── 2 - Control.ipynb │ ├── 3 - Files.ipynb │ ├── 4 - Strings.ipynb │ ├── 5 - Lists.ipynb │ ├── 6 - Tuples and Sets.ipynb │ ├── 7 - Dicts.ipynb │ ├── 8 - Classes.ipynb │ ├── 9 - Exceptions and Regex.ipynb │ ├── hello.py │ ├── sample1.txt │ └── temp.txt ├── Chapter_20 │ ├── Heart.csv │ ├── data cleaning.ipynb │ ├── diabetes.csv │ ├── numpy.ipynb │ ├── pandas.ipynb │ ├── readme.md │ ├── seaborn.ipynb │ └── sklearn.ipynb ├── Chapter_21 │ ├── DT_classification.ipynb │ ├── Linear_regression.ipynb │ ├── Logistic_regression.ipynb │ ├── Naive_bayes.ipynb │ ├── Running Multiple Models.ipynb │ ├── data │ │ ├── Boston.csv │ │ └── titanic3.csv │ ├── kNN_classification.ipynb │ ├── kNN_regression.ipynb │ └── readme.md └── readme.md ├── Part_7_Neural_Networks ├── 23_NN_classification.ipynb ├── 23_NN_regression.ipynb ├── 24_Keras_1_classification_wine.ipynb ├── 24_Keras_2_PRSA_regression.ipynb ├── Keras_CNN_Example_MNIST.ipynb ├── Keras_Sequential_Example_MNIST.ipynb ├── Keras_imdb_2_RNN.ipynb └── readme.md ├── Part_8_Modeling_the_World ├── MM1.Rmd ├── MM1.pdf ├── RL.Rmd ├── RL.pdf ├── RL2.Rmd ├── RL2.pdf ├── bayesnet.Rmd ├── bayesnet.pdf ├── bayesnet1.Rmd ├── bayesnet2.Rmd ├── bayesnet2.pdf ├── grid.png ├── hmm1.Rmd ├── hmm1.pdf ├── hmm2.Rmd ├── hmm2.pdf └── readme.md └── README.MD /.gitignore: 
-------------------------------------------------------------------------------- 1 | .DS_Store 2 | -------------------------------------------------------------------------------- /Data Resources.MD: -------------------------------------------------------------------------------- 1 | # List of Data Resources 2 | 3 | * R has many built-in data sets, and packages add even more. A list of data sets in csv and other formats is available from [this repo](https://vincentarelbundock.github.io/Rdatasets/datasets.html) 4 | 5 | ### Other resources: 6 | 7 | * [The UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/index.php) 8 | * [Kaggle](https://www.kaggle.com/) 9 | * [Open data on AWS](https://registry.opendata.aws/) 10 | * [Google data sets](https://toolbox.google.com/datasetsearch) 11 | * [Microsoft research open data sets](https://msropendata.com/) 12 | * [Sets by topic](https://github.com/awesomedata/awesome-public-datasets) 13 | * [USAFacts public data](https://usafacts.org/) 14 | 15 | 16 | ### National, State and Local Governments also provide data: 17 | 18 | * [Census Data](https://data.census.gov/cedsci/) 19 | * [US Govt data](https://www.data.gov/) 20 | * [Texas Data](https://www.tdlr.texas.gov/LicenseSearch/licfile.asp) 21 | * [Data from the City of Sacramento](https://data.cityofsacramento.org/) 22 | * [Data from the City of San Francisco](https://datasf.org/opendata/) 23 | 24 | -------------------------------------------------------------------------------- /Part_1_Intro/2-1-DataExploration-airquality.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Data Exploration with the Airquality Data" 3 | author: "Karen Mazidi" 4 | output: 5 | pdf_document: default 6 | --- 7 | 8 | 9 | ### Load the data 10 | 11 | The airquality data set is built-into R so we can just load it with the data() function. The data set has 154 observations of 6 variables: 12 | 13 | * Ozone (ppb) 14 | * Solar.R (langleys) 15 | * Wind (mph) 16 | * Temp (F) 17 | * Month (1-12) 18 | * Day (1-31) 19 | 20 | The data was collected from May to September, 1973, in New York. You can learn more about the data by typing **?airquality** at the console. 21 | 22 | ```{r} 23 | data(airquality) 24 | head(airquality, n=2) 25 | ``` 26 | ### NAs 27 | 28 | Missing data is encoded with NAs. We can count them with the sum() and is.na() functions, nested. The is.na() function returns a TRUE FALSE vector. Then the sum functions adds the TRUE values because TRUE=1 and FALSE=0. 29 | 30 | Then we try to take the mean of the Ozone column. The syntax data$col lets us access the column. The mean returned NA because it had NAs in that column. We try it again, and this time add a parameter telling it to ignore NAs. 31 | 32 | Try each command one at a time by using ctrl-enter on each line. 33 | 34 | ```{r} 35 | sum(is.na(airquality$Ozone)) 36 | mean(airquality$Ozone) 37 | mean(airquality$Ozone, na.rm=TRUE) 38 | ``` 39 | 40 | ### Dealing with NAs 41 | 42 | One option of dealing with NAs is to remove rows that have NAs. Another option is to replace NAs with mean values. We show that second option next. First we make a copy of the airquality data set. 43 | 44 | The following syntax selects all rows which have NAs in the Ozone column. 45 | 46 | df\$Ozone[is.na(df\$Ozone)] 47 | 48 | These selected elements, and only these, will be replaced by the mean of the column. 
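Before that, for completeness, here is a minimal sketch of the first option, dropping every row that contains an NA. It uses complete.cases(), which appears again in later notebooks; df_drop is just an illustrative name, not part of the original code.

```{r}
# option one: simply drop any row that contains an NA
df_drop <- airquality[complete.cases(airquality), ]
nrow(df_drop)  # noticeably fewer rows remain than in the full data set
```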
49 | 50 | ```{r} 51 | df <- airquality[] 52 | df$Ozone[is.na(df$Ozone)] <- mean(df$Ozone, na.rm=TRUE) 53 | mean(df$Ozone) 54 | ``` 55 | 56 | ### Plots 57 | 58 | R has a lot of built-in data visualization tools. The first graph in the code below is a histogram of the temperature field. The second graph plots temperature on the y axis and the index of the observation on the x axis. You can see the ups and downs of the temperature. The third plot puts temperature on the x axis and Ozone on the y axis. 59 | 60 | These plots are very simple but convey important information. If you want to make more visually appealling graphs there are many options to modify the point color, symbol, size, change the labels, and much more. 61 | 62 | [This link](https://www.statmethods.net/advgraphs/parameters.html) describes many graphical parameters. 63 | 64 | The final graph below uses some of these parameters. 65 | 66 | ```{r} 67 | hist(airquality$Temp) 68 | plot(airquality$Temp) 69 | plot(airquality$Temp, airquality$Ozone) 70 | 71 | plot(airquality$Ozone, airquality$Temp, pch=16, col="blue", cex=1.5, 72 | main="Airquality", xlab="Ozone", ylab="Temperature") 73 | ``` 74 | 75 | ### Correlation 76 | 77 | We can check for correlation by creating a table with the cor() function or visually look for correlations by plotting pairs(). The **use="complete"** option tells it to ignore NAs. 78 | 79 | ```{r} 80 | cor(airquality[1:4], use="complete") 81 | pairs(airquality[1:4]) 82 | ``` 83 | 84 | ### Adding columns to a data frame 85 | 86 | The next code chunk shows how to add a column to a data frame. We first make a copy of the data set. Then we create a new column in which every element is FALSE. Then we select elements where the corresponding element in Temp is over 89, and we classify those as TRUE. Then we display a few rows from the data and plot the new Hot column. 87 | 88 | Run these lines one at a time (ctrl-enter) to make sure you understand what each line does. In particular, look at df\$Hot after each step by typing **df\$Hot** in the console. 89 | 90 | ```{r} 91 | df <- airquality[] # copy the data set 92 | df$Hot <- FALSE 93 | df$Hot[df$Temp>89] <- TRUE 94 | df$Hot <- factor(df$Hot) 95 | df$Hot[40:46] 96 | plot(df$Hot) 97 | ``` 98 | 99 | ### Plotting factors 100 | 101 | We have plotted numeric data above but now we are going to create 3 plots for our new factor. First we use the par() function to set up a 1x3 grid to hold the pictures then we create a plot showing the distribution, a conditional density plot, and a box plot. 102 | 103 | * the first plot shows us that there were many more not hot days than hot days in the data 104 | * the conditional density plot shows the same idea, with the light grey region representing the hot days 105 | * the box plot shows us the average hot day is above 90, and the average not hot day is in the high 70s; The heavy vertical bar inside the box is the median, the box itself is the 1st through 3rd quartiles and the whiskers on the end of the dotted vertical lines show the range. 
106 | 107 | ```{r} 108 | par(mfrow=c(1,3)) 109 | plot(df$Hot) 110 | cdplot(df$Temp, df$Hot) 111 | plot(df$Hot, df$Temp) 112 | ``` 113 | 114 | -------------------------------------------------------------------------------- /Part_1_Intro/2-1-DataExploration-airquality.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjmazidi/Machine_Learning_2nd_edition/31d34a8d5154855eae0b840335ce6552711375df/Part_1_Intro/2-1-DataExploration-airquality.pdf -------------------------------------------------------------------------------- /Part_1_Intro/2-2-DataExploration-titanic.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Data Exploration with the Titanic Data" 3 | author: "Karen Mazidi" 4 | output: 5 | pdf_document: default 6 | --- 7 | 8 | 9 | ### Load the data 10 | 11 | Next we use the read.csv() function to read a csv in a subdirectory called data. Once you read in the data you will see that it has 1310 observations of 14 variables. We run the str() structure function to get a peek at the data. 12 | 13 | ```{r} 14 | df <- read.csv("data/titanic.csv", na.strings="NA", header=TRUE) 15 | str(df) 16 | ``` 17 | ### Data cleaning 18 | 19 | The read.csv() function is a bit aggressive about making things factors. Generally if the column contains character data, it tries to make it a factor. Sometimes this makes sense, sometimes it does not. 20 | 21 | We can change a column to a factor with as.factor() or change a column to integer with as.integer() as shown next. 22 | 23 | ```{r} 24 | df$survived <- as.factor(df$survived) 25 | df$pclass <- as.factor(df$pclass) 26 | df$sex <- factor(df$sex, levels=c("male", "female")) 27 | ``` 28 | 29 | ### Factors 30 | 31 | Factors are stored internally as integer vectors but also have a character representation for human readability. We can use contrasts() to find out more about a factor column. 32 | 33 | The contrasts for pclass shows that we need 2 variables to encode 3 classes. The base case will be class 1. R will create 2 dummy variables for classes 2 and 3. We will see the importance of these when we get to machine learning. 34 | 35 | 36 | ```{r} 37 | contrasts(df$pclass) 38 | contrasts(df$sex) 39 | ``` 40 | 41 | ### More exploration 42 | 43 | The head() and tail() functions let us look at the first or last few rows. 44 | 45 | ```{r} 46 | head(df) 47 | tail(df, n=10) 48 | ``` 49 | 50 | The summary() function can summarize an entire data set or individual columns. 51 | 52 | ```{r} 53 | summary(df) 54 | summary(df$pclass) 55 | ``` 56 | 57 | The names() function is helpful if you forget the column names. 58 | 59 | ```{r} 60 | names(df) 61 | summary(df$age) 62 | ``` 63 | 64 | 65 | That's all for now. We will revisit the Titanic data later when we explore classification algorithms: learning how to predict who survived and who didn't based on demographic data in the file. 
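As a small optional preview of that, two quick tables (using the data frame already loaded above) hint at how strongly sex and passenger class are related to survival:

```{r}
# survival counts broken down by sex and by passenger class
table(df$survived, df$sex)
table(df$survived, df$pclass)
```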
66 | -------------------------------------------------------------------------------- /Part_1_Intro/2-2-DataExploration-titanic.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjmazidi/Machine_Learning_2nd_edition/31d34a8d5154855eae0b840335ce6552711375df/Part_1_Intro/2-2-DataExploration-titanic.pdf -------------------------------------------------------------------------------- /Part_1_Intro/2-3-DataExploration-Boston.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Data Exploration with the Boston Housing Data" 3 | author: "Karen Mazidi" 4 | output: 5 | pdf_document: default 6 | html_document: 7 | df_print: paged 8 | editor_options: 9 | chunk_output_type: inline 10 | --- 11 | 12 | 13 | ### Load the package, installing if necessary 14 | 15 | ```{r} 16 | if (!require("MASS")){ 17 | install.packages("MASS") 18 | } 19 | #library(MASS) # not needed with above code 20 | ``` 21 | ### Load Boston 22 | 23 | The Boston housing data set is a collection of data from Boston neighborhoods in the late 1970s. There are 506 rows representing different neighborhoods, and 14 variables: 24 | 25 | * crim - per capita crime by town 26 | * zn - propostion of residential land zoned for lots over 25K sq ft 27 | * indus - proportion of non-retail business acres per town 28 | * chas - =1 if tract bounds the Charles river; 0 overwise 29 | * nox - nitrous oxide concentration in parts per 10 million 30 | * rm - average number of rooms per dwelling 31 | * age - proportion of owner-occupied units built prior to 1940 32 | * rad - index of accessibility to radial highways 33 | * tax - full-value property tax rate per $10K 34 | * ptratio - pupil-teacher ratio by town 35 | * black - proportion of blacks by town; seriously? 36 | * lstat - lower status of the populatio as a percent 37 | * medv - median value of owner-occupied homes in 1000s of dollars 38 | 39 | The str() function tells you about the structure of the data set. 40 | 41 | ```{r} 42 | data(Boston) 43 | str(Boston) 44 | ``` 45 | 46 | ### Built-in R Functions 47 | 48 | First we attach the data. The advantage of attaching the data is that we can type **mean(lstat)** instead of having to specify the data frame: **mean(Boston$lstat)**. The dataframe$column format is how columns are accessed. However, if we attach the data, R can find the columns without having to specify the data frame. The disadvantage to attaching data is that you may have column names from different data frames that are the same. This can be confusing. 49 | 50 | 51 | ```{r} 52 | attach(Boston) 53 | range(medv) 54 | median(tax) 55 | mean(lstat) 56 | summary(age) 57 | ``` 58 | 59 | ### Plot 60 | 61 | The plot function is powerful and versatile. Read more by typing ?plot at the console. Here we plot median home value as a function of the percent of lower economic status persons in the neighborhood. The color of the points will indicate whether or not the neighborhood is close to the Charles River. The unclass() functions converts the factors to integers so they can index the color choices blue or red. We also added a main heading. We did not specify x or y axis headings so R just used the variable names. 62 | 63 | ```{r} 64 | plot(medv~lstat, col=c("blue","red")[unclass(chas)+1], main="Boston Housing") 65 | ``` 66 | ### Correlation 67 | 68 | We can check for correlation by creating a table with the cor() function or visually look for correlations by plotting pairs(). 
69 | 70 | ```{r} 71 | cor(Boston) 72 | pairs(Boston) 73 | ``` 74 | 75 | ### Plotting a regression line 76 | 77 | Next we plot number of rooms on the x axis and median home value on the y axis. Then we use function abline() to plot a blue regression line on top of the points. The lm() function creates a linear regression model predicting median value as a function of rooms. We will learn more later, for now just realize that the regression line tries to plot through the middle of the trend. The trend is up. Not surprisingly, houses with more rooms tend to be more expensive. 78 | 79 | ```{r} 80 | plot(rm, medv) 81 | abline(lm(medv~rm), col="blue") 82 | ``` 83 | 84 | 85 | That's all for now. In future notebooks we will revisit the Boston housing data. 86 | -------------------------------------------------------------------------------- /Part_1_Intro/2-3-DataExploration-Boston.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjmazidi/Machine_Learning_2nd_edition/31d34a8d5154855eae0b840335ce6552711375df/Part_1_Intro/2-3-DataExploration-Boston.pdf -------------------------------------------------------------------------------- /Part_1_Intro/2-4-Control.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Using R Control Structures" 3 | author: "Karen Mazidi" 4 | output: 5 | pdf_document: default 6 | html_document: default 7 | editor_options: 8 | chunk_output_type: console 9 | --- 10 | 11 | # Exploring R Control Structures with the PimaIndiansDiabetes2 data 12 | 13 | First we load the mlbench package and then the diabetes data set. The data has 768 observations of 9 variables, which we can see with the structure str() function. The data was collected from Pima Indian women in the late 1990s. 14 | 15 | ```{r} 16 | library(mlbench) 17 | data(PimaIndiansDiabetes2) 18 | df <- PimaIndiansDiabetes2 # df points to the data frame 19 | str(df) 20 | ``` 21 | 22 | ### Look for NAs 23 | 24 | The sapply() function applies a function to elements of a list. In this case the elements of the list are columns in our data frame. The function is an anonymous function (we didn't name it), and it just sums, ignoring NAs. 25 | 26 | ```{r} 27 | sapply(PimaIndiansDiabetes2, function(x) sum(is.na(x))) 28 | ``` 29 | 30 | ### Write a function 31 | 32 | As an example of how to write a function, we write a function named fill_NA that takes two arguments and returns a vector. In R the return() statement is often not needed since R will return the last thing evaluated. In this function, the mean_med variable is a switch to choose whether to fill NAs with the mean or the median. 33 | 34 | After the function is defined, we can call it with different columns. Then we use the complete.cases() function to get rid of the remaining rows that have NAs. 35 | 36 | ```{r} 37 | fill_NA <- function(mean_med, v){ 38 | # fill missing values with either 1=mean or 2=median 39 | if (mean_med == 1){ 40 | m <- mean(v, na.rm=TRUE) 41 | } else { 42 | m <- median(v, na.rm=TRUE) 43 | } 44 | v[is.na(v)] <- m 45 | v 46 | } 47 | 48 | # make a new data set with NA's filled 49 | df <- PimaIndiansDiabetes2[] 50 | df$triceps <- fill_NA(1, df$triceps) 51 | df$insulin <- fill_NA(1, df$insulin) 52 | df <- df[complete.cases(df),] 53 | 54 | ``` 55 | 56 | 57 | 58 | 59 | 60 | ### Plots and for loops 61 | 62 | The following plots some data from the data set and 3 ablines which are 3 linear regression lines created by lm(). 
Each regression line is a different color and we added a legend. 63 | 64 | ```{r} 65 | cols <- c(6,8,1) 66 | plot(df$mass, df$glucose, main="PimaIndianDiabetes2") 67 | for (i in 1:3){ 68 | model <- lm(glucose~df[,cols[i]], data=df)[1] 69 | abline(model, col=i) 70 | } 71 | legend("topright", title="Predictors", c("Mass", "Age", "Pregnant"), fill=c(1,2,3)) 72 | ``` 73 | 74 | ### Using ifelse() 75 | 76 | We use ifelse() to set a variable=1 if insulin is over 155 and 0 otherwise. Then we convert it to a factor, and place it in a new variable. 77 | 78 | Then we plot. 79 | 80 | ```{r} 81 | df$large <- factor(ifelse(df$insulin>155,1,0)) 82 | plot(df$mass, df$glucose, pch=21, bg=c("blue","red")[unclass(df$large)]) 83 | ``` 84 | 85 | 86 | 87 | -------------------------------------------------------------------------------- /Part_1_Intro/2-4-Control.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjmazidi/Machine_Learning_2nd_edition/31d34a8d5154855eae0b840335ce6552711375df/Part_1_Intro/2-4-Control.pdf -------------------------------------------------------------------------------- /Part_1_Intro/3-1-DataVizX.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Data Visualization with the Titanic Data" 3 | author: "Karen Mazidi" 4 | output: 5 | pdf_document: default 6 | html_document: 7 | df_print: paged 8 | editor_options: 9 | chunk_output_type: console 10 | --- 11 | # Data exploration in R with the Titanic data set 12 | # for one Variable X, quantitative or qualitative 13 | 14 | Graphical Parameters: [read more here](https://www.statmethods.net/advgraphs/parameters.html) 15 | 16 | Colors in R Graphs: [read more here](http://www.stat.columbia.edu/~tzheng/files/Rcolor.pdf) 17 | 18 | ### Load the data 19 | 20 | Load the Titanic data, changing certain columns to factors. 21 | 22 | ```{r} 23 | df <- read.csv("data/titanic.csv", na.strings="NA", header=TRUE) 24 | str(df) 25 | 26 | df$survived <- as.factor(df$survived) 27 | df$pclass <- as.factor(df$pclass) 28 | df$sex <- factor(df$sex, levels=c("male", "female")) 29 | ``` 30 | 31 | # Plotting one dimension, X is quantitative 32 | 33 | We use the par() function to specify we want to display graphs in a 1x2 grid. To easily restore the parameters, we save them before changing them so that we can restore them with the last line in the following code block. 34 | 35 | The most common graph for one quantitative variable is the histogram. You can specify the bins but the bins that were created automatically seem fine. 36 | 37 | Another plot that can be used for a single quantitative variable is a simple scatterplot. In this case, the x axis will just be index numbers. In the graph below we color coded the dots to be white if the person did not survive. 38 | 39 | ```{r, warning=FALSE} 40 | opar <- par() # copy original settings 41 | par(mfrow=c(1,2)) 42 | hist(df$age, col="slategray", main="Age of Titanic Passengers", xlab="Age") 43 | plot(df$age, pch=21, cex=0.75, bg=c("snow", "slategray")[unclass(df$survived)], ylab="Age", main="Age (White Deceased)") 44 | par(opar) 45 | ``` 46 | 47 | Another option for quantitative data is the kernel density plot. First we compute the density, then plot it. This plot gives you similar information to the histogram, but smoothing is applied. 48 | 49 | In order to color the plot, we make a polygon in the last line below. 
50 | 51 | ```{r} 52 | d <- density(df$age, na.rm = TRUE) 53 | plot(d, main="Kernel Density Plot for Age", xlab="Age") 54 | polygon(d, col="wheat", border="slategrey") 55 | ``` 56 | 57 | We can overlay several kernel density plots using package sm. First we subset the data frame to just be the two columns of interest so that we can use complete.cases() to get rid of NAs. 58 | 59 | ```{r} 60 | library(sm) 61 | df_subset <- df[,c(1,5)] 62 | df_subset <- df_subset[complete.cases(df_subset),] 63 | sm.density.compare(df_subset$age, df_subset$pclass, col=c("seagreen", "wheat", "sienna3"), lwd=2) 64 | title(main="Age by Passenger Class") 65 | legend("topright", inset=0.05, legend=c(1:3), fill=c("seagreen", "wheat", "sienna3")) 66 | ``` 67 | 68 | We can create a boxplot for a single quantitative variable. Here we made it horizontal. The box shows the 2nd and 3rd quartiles of the data. The "whiskers" at either end of the dashed lines show the 1st and 4th quartiles. Dots beyond a whisker indicate suspected outliers. The bold line through the box indicates the median. 69 | 70 | ```{r} 71 | boxplot(df$age, col="slategray", horizontal=TRUE, xlab="Age", main="Age of Titanic Passengers") 72 | ``` 73 | 74 | 75 | # Plotting one dimension, X is qualitative 76 | 77 | Barplots can be used for qualitative data. The can be vertical or horizontal. 78 | 79 | ```{r} 80 | counts <- table(df$pclass) 81 | barplot(counts, xlab="Passenger Class", ylab="Frequency", col=c("seagreen","wheat","sienna3")) 82 | ``` 83 | 84 | Here is the same plot, but with horizontal bars. 85 | 86 | ```{r} 87 | counts <- table(df$pclass) 88 | barplot(counts, horiz=TRUE, names=c("Class 1", "Class 2", "Class 3"), col=c("seagreen","wheat","sienna3"), ylab="Passenger Class", xlab="Frequency") 89 | ``` 90 | 91 | A pie chart can be made with relative frequencies of a quantitative variable. First we specify frequencies for each of the 3 classes, then supply labels. With slices and labels defined, we can make a pie chart. 92 | 93 | ```{r} 94 | slices <- c(sum(df$pclass==1, na.rm = TRUE), sum(df$pclass==2, na.rm = TRUE), sum(df$pclass==3, na.rm = TRUE)) 95 | lbls <- c("Class 1", "Class 2", "Class 3") 96 | pie(slices, labels=lbls, main="Passenger Classes", col=c("seagreen","wheat","sienna3")) 97 | ``` 98 | 99 | 100 | 101 | ``` 102 | 103 | 104 | -------------------------------------------------------------------------------- /Part_1_Intro/3-1-DataVizX.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjmazidi/Machine_Learning_2nd_edition/31d34a8d5154855eae0b840335ce6552711375df/Part_1_Intro/3-1-DataVizX.pdf -------------------------------------------------------------------------------- /Part_1_Intro/3-2-DataVizXY.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Data Visualization with the Titanic Data" 3 | author: "Karen Mazidi" 4 | output: 5 | html_document: 6 | df_print: paged 7 | pdf_document: default 8 | editor_options: 9 | chunk_output_type: inline 10 | --- 11 | # Data exploration in R with the Titanic data set. 12 | 13 | Graphical Parameters: [read more here](https://www.statmethods.net/advgraphs/parameters.html) 14 | 15 | Colors in R Graphs: [read more here](http://www.stat.columbia.edu/~tzheng/files/Rcolor.pdf) 16 | 17 | ### Load the data 18 | 19 | Load the data, changing certain columns to factors. 
20 | 21 | ```{r} 22 | df <- read.csv("data/titanic.csv", na.strings="NA", header=TRUE) 23 | str(df) 24 | 25 | df$survived <- as.factor(df$survived) 26 | df$pclass <- as.factor(df$pclass) 27 | df$sex <- factor(df$sex, levels=c("male", "female")) 28 | ``` 29 | 30 | # Plotting X and Y 31 | 32 | In this notebook we look at plotting two variables, which gives us four cases: 33 | 34 | * both X and Y are qualitative 35 | * X is qualitative, Y is quantitative 36 | * X is quantitative, Y is qualitative 37 | * X and Y ar both quantitative 38 | 39 | # X,Y both Qualitative 40 | 41 | If X and Y are both qualitative data, mosaic and association plots are helpful. 42 | 43 | ### Mosaic and association plot examples 44 | 45 | Using the vcd (visualizing categorical data) package. 46 | 47 | First, a mosaic example. We want to plot survived and pclass. The mosaic() function wants the first argument to be a table or formula, so we surround the subsetted data frame with table(). SHADE=TRUE gives you a color graph, FALSE gives you a greyscale graph. 48 | 49 | The mosaic plot shows each group in tiles. The area of the tiles is proportional to its counts in the data. 50 | 51 | 52 | The legend indicates the Pearson residuals. The "null" model would consider an even distribution into the cells but clearly we don't have that case here. The blue indicates we have more observations than expected, the red indicates fewer than expected, and gray is about what is expected given a null hypothesis. We didn't have to specify legend=TRUE because that is the default. 53 | 54 | ```{r} 55 | library(vcd) 56 | mosaic(table(df[,c(2,1)]), shade=TRUE, legend=TRUE) 57 | ``` 58 | 59 | We get the same information if we reverse columns 1 and 2. The graph flips around. 60 | 61 | ```{r} 62 | mosaic(table(df[,c(1,2)]), shade=TRUE, legend=TRUE) 63 | 64 | ``` 65 | 66 | An association plot visualizes the residuals of an independence model. Each tile has an area that is proportional to the difference in observed and expected frequencies. The dotted line is the baseline. Tiles above the line have a frequency greater than what was expected, those below have a frequency below what was expected. 67 | 68 | In the plot below, pclass 1 survived more than expected, pclass 3 less than expected. 69 | 70 | ```{r} 71 | assoc(table(df[,c(1,2)]), shade=TRUE) 72 | ``` 73 | 74 | 75 | # X is Qualitative, Y is Quantitative 76 | 77 | When X is qualitative (a factor), and Y is quantitative, box plots are good choices. 78 | 79 | ```{r} 80 | plot(df$survived, df$age, varwidth=TRUE, main="Survival and Age", xlab="Survived", ylab="Age") 81 | # the following creates an identical plot 82 | boxplot(df$age~df$survived, varwidth=TRUE, main="Survival and Age", xlab="Survived", ylab="Age") 83 | ``` 84 | 85 | 86 | 87 | 88 | Notches at the median can be added with the notch=TRUE parameter. If the notches do not overlap, then it is likely that medians differ. 89 | 90 | ```{r} 91 | plot(df$age~df$pclass, varwidth=TRUE, notch=TRUE, xlab="Passenger Class", ylab="Age", col=c("seagreen","wheat","sienna3")) 92 | ``` 93 | 94 | Note: You can also create violin plots with package vioplot. Violin plots are a combination of a boxplot and a kernel density plot. This plot does not like NAs so we remove them. 
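If vioplot is not yet installed, the same install-if-missing pattern used earlier for MASS works here as well (an optional convenience):

```{r}
# install vioplot only if it is not already available
if (!require("vioplot")) {
  install.packages("vioplot")
}
```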
95 | 96 | 97 | ```{r} 98 | library(vioplot) 99 | df_subset <- df[,c(1,2,5)] 100 | df_subset <- df_subset[complete.cases(df_subset),] 101 | x1 <- df_subset$age[df_subset$pclass==1] 102 | x2 <- df_subset$age[df_subset$pclass==2] 103 | x3 <- df_subset$age[df_subset$pclass==3] 104 | vioplot(x1, x2, x3, col="wheat", names=c("Class 1", "Class 2", "Class")) 105 | ``` 106 | 107 | 108 | 109 | # X is Quantitative, Y is Qualitative 110 | 111 | When X is quantitative and Y is qualitative, a conditional density plot can be used. The following plot shows how survived changes over the various ages. 112 | 113 | ```{r} 114 | cdplot(df_subset$age, df_subset$survived, col=c("snow", "slategray")) 115 | ``` 116 | 117 | The following is not informative because X is quantitative and Y is qualitative. 118 | 119 | ```{r} 120 | plot(df_subset$age, df_subset$survived) 121 | ``` 122 | 123 | 124 | # X,Y both Quantitative 125 | 126 | If X and Y are both quantitative, scatter plots are recommended. Here we have crosses for the points in blue, 75% of the usual size. We would have to dig further into the Titanic data to understand this chart. Why do so many passengers seem to have a fare of 0? And why did a few passengers pay 500? Perhaps the 500 fares paid for several people and the 0 fares reflect passengers whose fares were paid by a spouse or parent or adult child? Further investigation is required to understand this. 127 | 128 | ```{r} 129 | plot(df$age, df$fare, pch='+', cex=0.75, col="blue", xlab="Age", ylab="Fare") 130 | ``` 131 | 132 | 133 | -------------------------------------------------------------------------------- /Part_1_Intro/3-2-DataVizXY.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjmazidi/Machine_Learning_2nd_edition/31d34a8d5154855eae0b840335ce6552711375df/Part_1_Intro/3-2-DataVizXY.pdf -------------------------------------------------------------------------------- /Part_1_Intro/4-1-homework_outline.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Homework n" 3 | author: Your Name Here 4 | output: html_notebook 5 | --- 6 | ```{r} 7 | 8 | ``` 9 | 10 | # Step 1: Identify objective(s). 11 | 12 | # Step 2: Data collection. 13 | 14 | Describe the collected data in terms of size and the meaning of columns. 15 | 16 | # Step 3: Data Exploration 17 | 18 | Use statistical functions and graphs to get to know the data. 19 | 20 | ```{r} 21 | # your code here 22 | ``` 23 | 24 | # Step 4: Formulate a learning objective 25 | 26 | What do you want to learn from the data? 
27 | 28 | # Step 5: Divide the data into train/test sets 29 | 30 | ```{r} 31 | # your code here 32 | ``` 33 | 34 | # Step 6: Train the algoirthm on the training data 35 | 36 | ```{r} 37 | # your code here 38 | ``` 39 | 40 | # Step 7: Make predictions on the test data 41 | 42 | ```{r} 43 | # your code here 44 | ``` 45 | 46 | # Step 8: Evaluate 47 | 48 | 49 | ```{r} 50 | # your code here 51 | ``` 52 | 53 | # Step 9: Identify approaches for improvement 54 | 55 | # Step 10: Repeat steps as required 56 | 57 | -------------------------------------------------------------------------------- /Part_1_Intro/README.md: -------------------------------------------------------------------------------- 1 | # Part One: An Introduction to Machine Learning 2 | 3 | These notebooks/pdfs accompany Chapter 2: Learning R 4 | 5 | * 2-1 data exploration with the airquality data set 6 | * 2-2 data exploration with the titanic data set 7 | * 2-3 data exploration with the Boston housing data set 8 | * 2-4 control structures in R 9 | 10 | These notebooks/pdfs accompany Chapter 3: Data Visualization in R 11 | 12 | * 3-1 Visualization of one Qualitative or Quantitative Vector 13 | * 3-2 Visualization of two Qualitative or Quantitative Vectors 14 | 15 | This notebook provides an outline for beginning projects. 16 | 17 | * 4-1 Project outline 18 | -------------------------------------------------------------------------------- /Part_2_Linear_Models/5-1_LinReg_women.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: 'Linear Regression on the Women Data Set' 3 | author: "Karen Mazidi" 4 | output: 5 | pdf_document: default 6 | html_document: 7 | df_print: paged 8 | editor_options: 9 | chunk_output_type: inline 10 | --- 11 | 12 | This example looks at the built-in data set **women** as an introductory example of linear regression in R. 13 | 14 | ### Data Exploration 15 | 16 | First, load the data, look at its structure and plot the points. 17 | 18 | 19 | ```{r} 20 | data(women) 21 | str(women) 22 | plot(women$weight~women$height, xlab="height", ylab="weight") 23 | abline(lm(women$weight~women$height), col="red") 24 | ``` 25 | 26 | ### Build a linear regression model 27 | 28 | The first argument to the lm() function is the formula, in this case weight as a function of height. The second argument specifies what data to use. 29 | 30 | When we type the model name, R returns basic information about the model. 31 | 32 | ```{r} 33 | lm1 <- lm(weight~height, data=women) 34 | lm1 35 | ``` 36 | 37 | ```{r} 38 | pred <- lm1$fitted.values 39 | cov(pred, women$weight) / (sd(pred) * sd(women$weight)) 40 | ``` 41 | 42 | ### The summary() function 43 | 44 | Output more about the model with summary(). This gives statistics on the residuals, the coefficients, and the model itself. 45 | 46 | ```{r} 47 | summary(lm1) 48 | ``` 49 | 50 | ### Test and train 51 | 52 | Normally we will divide a data set into at least two portions. The larger portion, usually more than 50% of the data, will be used for training the model. The smaller portion is a test set that will be used to evaluate how well the model does on data it has not seen before. In this case we are just getting to know linear regression on a tiny data set so we are just going to make up some test data. 53 | 54 | 55 | Let's hallucinate some test data. 
56 | 57 | ```{r} 58 | test <- women[c(5, 9, 11),] 59 | test[1, 2] <- 135 60 | test[2, 2] <- 118 61 | test[3, 2] <- 156 62 | test 63 | ``` 64 | 65 | ### The predict() function 66 | 67 | Once we have a model of our data, lm1, we can use this model to predict target values for new data with the predict() function. The first argument to predict() is our model, the second specifies the new data. The output is a vector of predicted values. It's always a good idea to look at your output at each stage so that simple errors don't propagate forward. You can look at pred by typing "pred" at the console, or by looking in the Environment pane at the upper right of the RStudio screen. You should see that it is a vector with 3 values. 68 | 69 | Now predict on our made-up test data. 70 | 71 | ```{r} 72 | pred <- predict(lm1, newdata=test) 73 | 74 | ``` 75 | 76 | 77 | ### Evaluate the results 78 | 79 | We expect to get poor results since our test data was purposely chosen to be far from the regression line for illustration purposes. And our expectations are met. 80 | 81 | ```{r} 82 | correlation <- cor(pred, test$weight) 83 | print(paste("correlation: ", correlation)) 84 | mse <- mean((pred - test$weight)^2) 85 | print(paste("mse: ", mse)) 86 | rmse <- sqrt(mse) 87 | print(paste("rmse: ", rmse)) 88 | ``` 89 | 90 | The correlation is not great, numbers closer to +/- 1 are better. The mse is hard to interpret in isolation, it is most helpful in comparing models. The rmse tells us that our test data was off by an average of almost 15 pounds. 91 | 92 | ### Residuals 93 | 94 | Residuals are errors in our predictions. They quantify how far off from the regression line (the predicted values) our actual values are. In the diagram the residuals are drawn with red lines. 95 | 96 | Let's plot. 97 | 98 | ```{r} 99 | plot(women$height, women$weight, main="Women's Height and Weight", 100 | xlab="height", ylab="weight") 101 | abline(lm1) 102 | points(test$height, test$weight, pch=0) 103 | segments(test$height, test$weight, test$height, pred, col="red") 104 | ``` 105 | 106 | ### Coefficient Estimates 107 | 108 | Proving to ourselves that the coefficients match the equations in the text. Notice that these equations provide the same coefficients as lm1. 109 | 110 | ```{r} 111 | x <- women$height 112 | y <- women$weight 113 | x_mean <- mean(women$height) 114 | y_mean <- mean(women$weight) 115 | 116 | w_hat <- sum((x-x_mean)*(y-y_mean)) / sum((x-x_mean)^2) 117 | b_hat <- y_mean - w_hat * x_mean 118 | print(paste("w and b estimates = ", w_hat, b_hat)) 119 | ``` 120 | 121 | -------------------------------------------------------------------------------- /Part_2_Linear_Models/5-1_LinReg_women.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjmazidi/Machine_Learning_2nd_edition/31d34a8d5154855eae0b840335ce6552711375df/Part_2_Linear_Models/5-1_LinReg_women.pdf -------------------------------------------------------------------------------- /Part_2_Linear_Models/5-2_LinReg_Chick.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Multiple Linear Regression" 3 | author: "Karen Mazidi" 4 | output: 5 | pdf_document: default 6 | html_document: 7 | df_print: paged 8 | editor_options: 9 | chunk_output_type: inline 10 | --- 11 | 12 | ChickWeight is a built-in R data set with 578 rows and 4 columns of data resulting from an experiment on the effect of different types of feed on chick weight. 
Each observation (row) in the data set represents the weight in grams of a given chick on a given day, recorded in column Time. 13 | 14 | ### Data exploration 15 | 16 | Let's explore the data with R functions and plots. 17 | 18 | ```{r} 19 | data(ChickWeight) 20 | dim(ChickWeight) 21 | head(ChickWeight) 22 | ``` 23 | 24 | 25 | 26 | ```{r} 27 | par(mfrow=c(1,2)) 28 | plot(ChickWeight$Time, ChickWeight$weight, 29 | xlab="Time", ylab="Weight") 30 | plot(ChickWeight$Diet, ChickWeight$weight, 31 | xlab="Diet", ylab="Weight") 32 | ``` 33 | 34 | ### Divide the data into train and test sets 35 | 36 | We randomly sample the rows to get a vector i with row indices. This is used to divide into train and test sets. 37 | 38 | ```{r} 39 | set.seed(1234) 40 | i <- sample(1:nrow(ChickWeight), nrow(ChickWeight)*0.75, replace=FALSE) 41 | train <- ChickWeight[i,] 42 | test <- ChickWeight[-i,] 43 | ``` 44 | 45 | ### Simple linear regression 46 | 47 | In simple linear regression we have a single predictor variable for our target variable. Here we wish to see the impact of Time on weight. 48 | 49 | ```{r} 50 | lm1 <- lm(weight~Time, data=train) 51 | summary(lm1) 52 | ``` 53 | 54 | ### Plotting the residuals 55 | 56 | The 4 residual plots are placed in a 2x2 grid. 57 | 58 | ```{r} 59 | par(mfrow=c(2,2)) 60 | plot(lm1) 61 | ``` 62 | ### Evaluate on the test set 63 | 64 | ```{r} 65 | pred1 <- predict(lm1, newdata=test) 66 | cor1 <- cor(pred1, test$weight) 67 | mse1 <- mean((pred1-test$weight)^2) 68 | rmse1 <- sqrt(mse1) 69 | 70 | print(paste('correlation:', cor1)) 71 | print(paste('mse:', mse1)) 72 | print(paste('rmse:', rmse1)) 73 | ``` 74 | 75 | 76 | 77 | ### Multiple Linear Regression 78 | 79 | If we have more than one predictor in linear regression we call it multiple linear regression. Here we want to see the effect of both Time and Diet on chick weight. 80 | 81 | ```{r} 82 | lm2 <- lm(weight~Time+Diet, data=train) 83 | summary(lm2) 84 | ``` 85 | 86 | The adjusted R-squared for lm2 is an improvement over lm1. 87 | 88 | ### The anova() function 89 | 90 | The analysis of variance function here is used to compare the two models. We see that lm2 lowered the errors, RSS, and had a low p-value. These are indications that lm2 is a better model than lm1. 91 | 92 | ```{r} 93 | anova(lm1, lm2) 94 | ``` 95 | 96 | ### Evaluate on the test set 97 | 98 | ```{r} 99 | pred2 <- predict(lm2, newdata=test) 100 | cor2 <- cor(pred2, test$weight) 101 | mse2 <- mean((pred2-test$weight)^2) 102 | rmse2 <- sqrt(mse2) 103 | 104 | print(paste('correlation:', cor2)) 105 | print(paste('mse:', mse2)) 106 | print(paste('rmse:', rmse2)) 107 | ``` 108 | 109 | ### Linear models are not always straight lines 110 | 111 | Next we try predicting the log of weight to illustrate that linear models are not always straight lines. This damped down some of the variation in the residuals. The lm3 model had a higher R-squared of 0.8474. We cannot do anova() comparing lm3 because it has a different target, the log(weight) instead of weight. 
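To see why the fitted model is no longer a straight line in the original units, note that it is linear in log(weight); undoing the log gives an exponential curve (the Diet factor enters through its dummy variables):

$$\log(\widehat{weight}) = b_0 + b_1\,Time + (\text{Diet terms}) \quad\Longrightarrow\quad \widehat{weight} = e^{b_0}\, e^{b_1\,Time}\, e^{(\text{Diet terms})}$$

This is also why the predictions are passed through exp() before computing the test metrics below.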
112 | 113 | ```{r} 114 | lm3 <- lm(log(weight)~Time+Diet, data=train) 115 | summary(lm3) 116 | par(mfrow=c(2,2)) 117 | plot(lm3) 118 | ``` 119 | 120 | ### Evaluate on the test set 121 | 122 | ```{r} 123 | pred3 <- predict(lm3, newdata=test) 124 | pred3 <- exp(pred3) 125 | cor3 <- cor(pred3, test$weight) 126 | mse3 <- mean((pred3-test$weight)^2) 127 | rmse3 <- sqrt(mse3) 128 | 129 | print(paste('correlation:', cor3)) 130 | print(paste('mse:', mse3)) 131 | print(paste('rmse:', rmse3)) 132 | ``` 133 | 134 | Note that we can't do an anova comparison with model 3 because it has a target of log(weight) and lm1 and lm2 have weight as a target. -------------------------------------------------------------------------------- /Part_2_Linear_Models/5-2_LinReg_Chick.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjmazidi/Machine_Learning_2nd_edition/31d34a8d5154855eae0b840335ce6552711375df/Part_2_Linear_Models/5-2_LinReg_Chick.pdf -------------------------------------------------------------------------------- /Part_2_Linear_Models/5-3-LinReg-ridge-airq.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Ridge Regression" 3 | author: "Karen Mazidi" 4 | output: 5 | pdf_document: default 6 | html_document: 7 | df_print: paged 8 | --- 9 | 10 | Compare linear regression and ridge regression on the airquality data set. 11 | 12 | ### Data cleaning 13 | 14 | First, remove rows with NAs using complete.cases(). Then remove the Day column. 15 | 16 | ```{r} 17 | df <- airquality[complete.cases(airquality[, 1:5]),] 18 | df <- df[,-6] 19 | ``` 20 | 21 | ### Train and test sets for linear regression 22 | 23 | Divide into train and test sets, then create a model predicting Ozone from the other columns. 24 | 25 | ```{r} 26 | set.seed(1234) 27 | i <- sample(1:nrow(df), .75*nrow(df), replace=FALSE) 28 | train <- df[i,] 29 | test <- df[-i,] 30 | lm1 <- lm(Ozone~., data=train) 31 | pred <- predict(lm1, newdata=test) 32 | mse1 <- mean((pred-test$Ozone)^2) 33 | print(paste("mse=", mse1)) 34 | ``` 35 | 36 | ### Ridge Regression 37 | 38 | Try ridge regession using glmnet. 39 | 40 | First use the model.matrix() function to create a matrix of the predictors. Then split into test and train. 41 | 42 | ```{r} 43 | library(glmnet) 44 | x <- model.matrix(Ozone~., df)[,-1] 45 | y <- df$Ozone 46 | train_x <- x[i,] 47 | train_y <- y[i] 48 | test_x <- x[-i,] 49 | test_y <- y[-i] 50 | 51 | # build a ridge regression model 52 | rm <- glmnet(train_x, train_y, alpha=0) 53 | 54 | # use cv to see which lambda is best 55 | set.seed(1) 56 | cv_results <- cv.glmnet(train_x, train_y, alpha=0) 57 | plot(cv_results) 58 | l <- cv_results$lambda.min 59 | 60 | # get data for best lambda, which is the 99th 61 | # as determined by looking at rm$lambda 62 | pred2 <- predict(rm, s=l, newx=test_x) 63 | mse2 <- mean((pred2-test_y)^2) 64 | coef2 <- coef(rm)[,99] 65 | 66 | ``` 67 | 68 | ### Compare mse and coefficients 69 | 70 | The ridge regression got about 10% lower mse. Notice that its coefficients are smaller in absolute value. 
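One optional way to quantify the shrinkage, using the objects created above, is to compare the summed absolute values of the slope coefficients (intercept excluded):

```{r}
# total magnitude of the slope coefficients, intercept excluded
sum(abs(coef(lm1)[-1]))  # ordinary linear regression
sum(abs(coef2[-1]))      # ridge regression at the lambda chosen above
```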
71 | 72 | ```{r} 73 | print(paste("mse for linear regression = ", mse1)) 74 | coef(lm1) 75 | 76 | ``` 77 | 78 | ```{r} 79 | print(paste("mse for ridge regression = ", mse2)) 80 | coef2 81 | 82 | ``` 83 | 84 | -------------------------------------------------------------------------------- /Part_2_Linear_Models/5-3-LinReg-ridge-airq.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjmazidi/Machine_Learning_2nd_edition/31d34a8d5154855eae0b840335ce6552711375df/Part_2_Linear_Models/5-3-LinReg-ridge-airq.pdf -------------------------------------------------------------------------------- /Part_2_Linear_Models/6_1_LogReg-plasma.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Logistic Regression on the Plasma Data" 3 | author: "Karen Mazidi" 4 | output: 5 | html_document: 6 | df_print: paged 7 | html_notebook: default 8 | pdf_document: default 9 | editor_options: 10 | chunk_output_type: inline 11 | --- 12 | 13 | Logistic regression example using the plasma data set in package HSAUR. 14 | 15 | ### Data exploration 16 | 17 | We can read more about the plasma data set by typing "?plasma" at the console, after package HSAUR is loaded. We want to learn to predict ESR>20 or not, based on the levels of the plasma proteins fibrinogen and globulin. ESR stands for erythrocyte sedimentation rate, the rate at which red blood cells settle in blood plasma. Values >20 indicate some possible associations with various health conditions. 18 | 19 | ```{r} 20 | library(HSAUR) 21 | attach(plasma) 22 | str(plasma) 23 | head(plasma) 24 | attach(plasma) 25 | ``` 26 | ### Plot the data 27 | 28 | The first pair of plots show us that observations where ESR>20 are rarer. This is indicated by the thinner boxes because we set varwidth=TRUE in the bloxplot call. More importantly, the boxplots show that ESR>20 observations are associated with slightly higher levels of globulin and significantly higher levels of fibronogen. 29 | 30 | The second set of pots are conditional density plots. We can make the same observations as the box plots. Here they are just visualized differently. The total probability space is the rectangle, with the lighter grey indicating ESR>20. 31 | 32 | ```{r} 33 | par(mfrow=c(1,2)) 34 | plot(ESR, fibrinogen, main="Fibrinogen", ylab="", varwidth=TRUE) 35 | plot(ESR, globulin, main="Globulin", ylab="", varwidth=TRUE) 36 | 37 | par(mfrow=c(1,2)) 38 | cdplot(ESR~fibrinogen) 39 | cdplot(ESR~globulin) 40 | ``` 41 | ### Train and test sets 42 | 43 | Even though our data is small, we will go ahead and divide it into train and test sets. 44 | 45 | ```{r} 46 | set.seed(3) 47 | i <- sample(1:nrow(plasma), 0.75*nrow(plasma), replace=FALSE) 48 | train <- plasma[i,] 49 | test <- plasma[-i,] 50 | ``` 51 | 52 | 53 | 54 | ### Build a logistic regression model 55 | 56 | Our first model uses only fibronogen as a predictor. The glm() function is used for logistic regression, with parameter family=binomial 57 | 58 | The summary is a little different for logistic regression compared to linear regression: 59 | * the residual are deviance residuals - measures of deviance contributed from each observation 60 | * the coefficients represent changes in the log odds of y 61 | * model metrics 62 | 63 | ```{r} 64 | glm1 <- glm(ESR~fibrinogen, data=train, family=binomial) 65 | summary(glm1) 66 | ``` 67 | ### Evaluate 68 | 69 | Our first model uses only fibronogen as a predictor. 
On our small test data we got about 88% accuracy. The table shows that all test observations were predicted as not ESR>20 and one of the 8 observations actually was ESR>20. Internally, the ESR>20 factor is coded as 1 for not >20 and 2 for ESR>20. This is why we compare them as.integer(). 70 | 71 | ```{r} 72 | probs <- predict(glm1, newdata=test, type="response") 73 | pred <- ifelse(probs>0.5, 2, 1) 74 | acc1 <- mean(pred==as.integer(test$ESR)) 75 | print(paste("glm1 accuracy = ", acc1)) 76 | table(pred, as.integer(test$ESR)) 77 | 78 | ``` 79 | 80 | ### What does it mean? 81 | 82 | Let's explore the meaning of the coefficient. 83 | 84 | ```{r} 85 | fibro <- glm1$coefficients[2] 86 | intercept <- glm1$coefficients[1] 87 | 88 | log_odds <- function(x, fibro, intercept){ 89 | intercept + fibro * x 90 | } 91 | 92 | x <- seq(from=2.25, to=5.0, by=0.25) 93 | y <- log_odds(x, fibro, intercept) 94 | par(mfrow=c(1,2)) 95 | plot(x,y, main="log odds", ylab="") 96 | 97 | prob <- exp(y) / (1 + exp(y)) 98 | plot(x, prob, main="probabilities", ylab="") 99 | ``` 100 | 101 | 102 | ### Build another model 103 | 104 | This model uses both predictors. On the test set we got the same accuracy. 105 | 106 | ```{r} 107 | glm2 <- glm(ESR~fibrinogen+globulin, data=train, family=binomial) 108 | summary(glm2) 109 | probs <- predict(glm1, newdata=test, type="response") 110 | pred <- ifelse(probs>0.5, 2, 1) 111 | acc2 <- mean(pred==as.integer(test$ESR)) 112 | print(paste("glm2 accuracy = ", acc2)) 113 | table(pred, as.integer(test$ESR)) 114 | ``` 115 | 116 | ### Compare the models with anova() 117 | 118 | The second model is only slightly better than the first. The residuals dropped by only 2 points, and the p-value is not small. 119 | 120 | ```{r} 121 | anova(glm1, glm2) 122 | ``` 123 | -------------------------------------------------------------------------------- /Part_2_Linear_Models/6_1_LogReg-plasma.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjmazidi/Machine_Learning_2nd_edition/31d34a8d5154855eae0b840335ce6552711375df/Part_2_Linear_Models/6_1_LogReg-plasma.pdf -------------------------------------------------------------------------------- /Part_2_Linear_Models/6_2_LogReg-titanic.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Logistic Regression with the Titanic Data" 3 | author: "Karen Mazidi" 4 | output: 5 | html_document: 6 | df_print: paged 7 | pdf_document: default 8 | editor_options: 9 | chunk_output_type: inline 10 | --- 11 | 12 | 13 | There are many versions of the Titanic data. The one used here was downloaded [from this link.](biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3.xls) and then converted to csv format. 14 | 15 | 16 | ### Load the data 17 | 18 | ```{r} 19 | df <- read.csv("data/titanic.csv", header=TRUE) 20 | str(df) 21 | ``` 22 | 23 | ### Data cleaning 24 | 25 | First we subset the data frame because we only care about columns pclass, survived, sex, and age. Then we make survived and pclass factors, sex is already a factor. 26 | 27 | ```{r} 28 | df <- df[,c(1,2,4,5)] 29 | df$pclass <- factor(df$pclass) 30 | df$survived <- factor(df$survived) 31 | df$sex <- factor(df$sex) 32 | head(df) 33 | ``` 34 | 35 | ### Handle missing values 36 | 37 | We first find out how many missing values we have for each of our 4 columns with the sapply() function. The first argument is the object we wish to apply the function to. 
In this case the function sums the number of NAs for each column of the data frame. 38 | 39 | 40 | 41 | 42 | ```{r} 43 | sapply(df, function(x) sum(is.na(x)==TRUE)) 44 | ``` 45 | 46 | 47 | We see that there are no NAs for pclass, survived, or sex. There are 263 observations out of the total 1309 where we have NA for the age. We could just delete those but that's a lot of data to lose. Instead we will replace the NAs with the median age. 48 | 49 | ```{r} 50 | df$age[is.na(df$age)] <- median(df$age,na.rm=T) 51 | ``` 52 | 53 | ### Divide into train and test 54 | 55 | ```{r} 56 | set.seed(1234) 57 | i <- sample(1:nrow(df), 0.75*nrow(df), replace=FALSE) 58 | train <- df[i,] 59 | test <- df[-i,] 60 | ``` 61 | 62 | ### Build a logistic regression model 63 | 64 | ```{r} 65 | glm1 <- glm(survived~., data=train, family="binomial") 66 | summary(glm1) 67 | ``` 68 | ### Evaluate on the test set 69 | 70 | ```{r} 71 | probs <- predict(glm1, newdata=test, type="response") 72 | pred <- ifelse(probs>0.5, 1, 0) 73 | acc <- mean(pred==test$survived) 74 | print(paste("accuracy = ", acc)) 75 | table(pred, test$survived) 76 | ``` 77 | 78 | ### Additional metrics 79 | 80 | The confusion matrix in the caret package gives us more information than our simple table above. One of the more useful statistics is the Kappa value which adjusts for the distribution of the data set. In this case the data set was only slightly unbalanced, with about 60% survived to 40$ not. 81 | 82 | Note that the vector 'pred' was an integer vector while survived is a factor. The pred vector needs to be converted to a factor for the confusion matrix code. 83 | 84 | ```{r} 85 | library(caret) 86 | confusionMatrix(as.factor(pred), reference=test$survived) 87 | ``` 88 | 89 | ###ROC 90 | 91 | The ROC is a curve that plots the true positive rate (TPR) against the false positive rate (FPR) at various threshold settings. The AUC is the area under the ROC curve. A good AUC is close to 1 than 0.5. Also we like to see the ROC shoot up rather quickly. 92 | 93 | ```{r} 94 | library(ROCR) 95 | p <- predict(glm1, newdata=test, type="response") 96 | pr <- prediction(p, test$survived) 97 | prf <- performance(pr, measure = "tpr", x.measure = "fpr") 98 | plot(prf) 99 | ``` 100 | 101 | ```{r} 102 | auc <- performance(pr, measure = "auc") 103 | auc <- auc@y.values[[1]] 104 | auc 105 | ``` 106 | 107 | -------------------------------------------------------------------------------- /Part_2_Linear_Models/6_2_LogReg-titanic.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjmazidi/Machine_Learning_2nd_edition/31d34a8d5154855eae0b840335ce6552711375df/Part_2_Linear_Models/6_2_LogReg-titanic.pdf -------------------------------------------------------------------------------- /Part_2_Linear_Models/6_3_LogReg-3-iris.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "MultiClass Classification" 3 | author: "Karen Mazidi" 4 | output: 5 | html_document: 6 | df_print: paged 7 | pdf_document: default 8 | editor_options: 9 | chunk_output_type: inline 10 | --- 11 | 12 | In this notebook we look at multi-class classification with the iris data set, built into R. This is a well-known data set that takes measurements from 150 irises, equally divided into 3 species virginica, setosa and versicolor. 13 | 14 | ### Data exploration 15 | 16 | Explore the data with functions and graphics. 
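A quick optional check before plotting confirms the equal split across the three species:

```{r}
# the three species should each have 50 observations
table(iris$Species)
```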
17 | 18 | ```{r} 19 | attach(iris) 20 | str(iris) 21 | pairs(iris[1:4], pch = 21, bg = c("red", "yellow", "blue")[unclass(Species)]) 22 | 23 | ``` 24 | 25 | Let's see how well Petal.Length and Petal.Width separate the classes. 26 | 27 | ```{r} 28 | plot(Petal.Length, Petal.Width, pch=21, bg=c("red","yellow","blue") 29 | [unclass(Species)]) 30 | 31 | ``` 32 | 33 | ### One versus all 34 | 35 | In one versus all classification we will build 3 classifiers on 3 data sets: 36 | 37 | * virginica versus not 38 | * setosa versus not 39 | * versicolor versus not 40 | 41 | ```{r} 42 | # reclassify as virginica or not 43 | iris_virginica <- iris 44 | iris_virginica$Species <- as.factor(ifelse (iris_virginica$Species=="virginica",1,0)) 45 | 46 | # reclassify as setosa or not 47 | iris_setosa <- iris 48 | iris_setosa$Species <- as.factor(ifelse (iris_setosa$Species=="setosa",1,0)) 49 | 50 | # reclassify as versicolor or not 51 | iris_versicolor <- iris 52 | iris_versicolor$Species <- as.factor(ifelse (iris_versicolor$Species=="versicolor",1,0)) 53 | ``` 54 | 55 | ### Function for logistic regression 56 | 57 | We will write a function to handle repeated calls. 58 | 59 | ```{r} 60 | fun <- function(df, i){ 61 | train <- df[i,] 62 | test <- df[-i,] 63 | glm1 <- glm(Species~., data=train, family="binomial") 64 | probs <- predict(glm1, newdata=test) 65 | pred <- ifelse(probs>0.5, 1, 0) 66 | acc <- mean(pred==test$Species) 67 | print(paste("accuracy = ", acc)) 68 | table(pred, test$Species) 69 | } 70 | ``` 71 | 72 | 73 | 74 | ### Virginica 75 | 76 | ```{r} 77 | set.seed(1234) 78 | i <- sample(1:150, 100, replace=FALSE) 79 | fun(iris_virginica, i) 80 | ``` 81 | ### Setosa 82 | 83 | ```{r} 84 | fun(iris_setosa, i) 85 | ``` 86 | 87 | ### Versicolor 88 | 89 | ```{r} 90 | fun(iris_versicolor, i) 91 | ``` 92 | 93 | 94 | 95 | 96 | -------------------------------------------------------------------------------- /Part_2_Linear_Models/6_3_LogReg-3-iris.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjmazidi/Machine_Learning_2nd_edition/31d34a8d5154855eae0b840335ce6552711375df/Part_2_Linear_Models/6_3_LogReg-3-iris.pdf -------------------------------------------------------------------------------- /Part_2_Linear_Models/6_4_logreg-scratch.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Logistic Regression from Scratch" 3 | author: "Karen Mazidi" 4 | output: 5 | html_document: 6 | df_print: paged 7 | pdf_document: default 8 | editor_options: 9 | chunk_output_type: inline 10 | --- 11 | 12 | # Notebook for the Logistic Regression Chapter 13 | 14 | First, load the package and plasma data set. 15 | 16 | ```{r} 17 | library(HSAUR) 18 | data(plasma) 19 | 20 | ``` 21 | 22 | ### Logistic Regression using R 23 | 24 | Use R's glm() function first as our ground truth. 25 | 26 | 27 | ```{r} 28 | set.seed(1234) 29 | i <- sample(1:nrow(plasma), 0.75*nrow(plasma), replace=FALSE) 30 | train <- plasma[i,] 31 | test <- plasma[-i,] 32 | glm1 <- glm(ESR~fibrinogen, data=train, family=binomial) 33 | probs <- predict(glm1, newdata=test, type="response") 34 | pred <- ifelse(probs> 0.5, 2, 1) 35 | acc <- mean(pred == as.integer(test$ESR)) 36 | summary(glm1) 37 | ``` 38 | 39 | ### Logistic Regression from Scratch 40 | 41 | First we need to define the sigmoid function. 
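For reference, the two pieces the code below implements are the sigmoid

$$\sigma(z) = \frac{1}{1 + e^{-z}}$$

and the batch gradient update applied on every iteration, where $X$ is the data matrix, $y$ the 0/1 label vector, and $\eta$ the learning rate:

$$w \leftarrow w + \eta\, X^{T}\!\left(y - \sigma(Xw)\right)$$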
42 | 43 | ```{r} 44 | # function to return a vector of sigmoid values from an input matrix 45 | sigmoid <- function(z){ 46 | 1.0 / (1+exp(-z)) 47 | } 48 | # set up weight vector, label vector, and data matrix 49 | weights <- c(1, 1) 50 | data_matrix <- cbind(rep(1, nrow(train)), train$fibrinogen) 51 | labels <- as.integer(train$ESR) - 1 52 | ``` 53 | 54 | Then we need code for gradient descent. The algorithm used here first starts with all weights = 1, then iterates. Notice we get the same weights (coefficients) as R gave us, but it took a lot longer. 55 | 56 | 57 | ```{r} 58 | weights <- c(1, 1) # repeat this for rerunning the block 59 | learning_rate <- 0.001 60 | for (i in 1:500000){ 61 | prob_vector <- sigmoid(data_matrix %*% weights) 62 | error <- labels - prob_vector 63 | weights <- weights + learning_rate * t(data_matrix) %*% error 64 | } 65 | weights 66 | ``` 67 | 68 | ### Predict with the weights we generated 69 | 70 | ```{r} 71 | # predict with our weights 72 | test_matrix <- cbind(rep(1, nrow(test)), test$fibrinogen) 73 | test_labels <- as.integer(test$ESR) - 1 74 | predicted <- test_matrix %*% weights 75 | probabilities <- exp(predicted) / (1 + exp(predicted)) 76 | predictions <- ifelse(probabilities > 0.5, 1, 0) 77 | mean(predictions == test_labels) 78 | 79 | ``` 80 | 81 | 82 | 83 | 84 | Visualization that the log odds is a line. 85 | 86 | ```{r} 87 | plasma_log_odds <- cbind(rep(1, 32), plasma$fibrinogen) %*% weights 88 | plot(plasma$fibrinogen, plasma_log_odds, col=plasma$ESR) 89 | abline(weights[1], weights[2]) 90 | ``` 91 | 92 | -------------------------------------------------------------------------------- /Part_2_Linear_Models/6_4_logreg-scratch.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjmazidi/Machine_Learning_2nd_edition/31d34a8d5154855eae0b840335ce6552711375df/Part_2_Linear_Models/6_4_logreg-scratch.pdf -------------------------------------------------------------------------------- /Part_2_Linear_Models/7_1_NBayes-titanic.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Naive Bayes on the Titanic Data" 3 | author: "Karen Mazidi" 4 | output: 5 | pdf_document: default 6 | html_notebook: default 7 | html_document: 8 | df_print: paged 9 | --- 10 | 11 | Performing Naive Bayes on the Titanic data set. 12 | 13 | ### Load and preprocess the data 14 | 15 | We will skip the description of these steps since they are the same as in the logistic regression example. 16 | 17 | ```{r} 18 | df <- read.csv("data/titanic.csv", header=TRUE) 19 | 20 | # data cleaning 21 | df <- df[,c(1,2,4,5)] 22 | df$pclass <- factor(df$pclass) 23 | df$survived <- factor(df$survived) 24 | 25 | # handle missing values 26 | df$age[is.na(df$age)] <- median(df$age,na.rm=T) 27 | ``` 28 | 29 | 30 | ### Divide into train and test sets 31 | 32 | This should be the same split as we had for logistic regression so we can compare the two algorithms. 33 | 34 | 35 | ```{r} 36 | set.seed(1234) 37 | i <- sample(1:nrow(df), 0.75*nrow(df), replace=FALSE) 38 | train <- df[i,] 39 | test <- df[-i,] 40 | 41 | ``` 42 | ### Build the model 43 | 44 | The naive Bayes algorithm is in package e1071. 
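If e1071 is not already installed, it can be added from CRAN first. This one-time step is the only assumption here (an internet connection and the default CRAN mirror); after that, library(e1071) in the next chunk will work.

```{r, eval=FALSE}
# install once; not evaluated when the notebook is knit
install.packages("e1071")
```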
45 | 46 | ```{r} 47 | library(e1071) 48 | nb1 <- naiveBayes(survived~., data=train) 49 | nb1 50 | ``` 51 | 52 | ### Evaluate on the test data 53 | 54 | ```{r} 55 | p1 <- predict(nb1, newdata=test, type="class") 56 | table(p1, test$survived) 57 | mean(p1==test$survived) 58 | ``` 59 | 60 | We got very slightly higher for naive Bayes than we did for logistic regression. 61 | 62 | ### Extracting probabilities 63 | 64 | One of the nice things about the algorithm is that you can extract the raw probabilities. 65 | 66 | ```{r} 67 | p1_raw <- predict(nb1, newdata=test, type="raw") 68 | head(p1_raw) 69 | ``` 70 | 71 | #### Remove Age 72 | 73 | When we look at the Naive Bayes algorithm, we see the mean for survived versus perished is different by only one year. This suggests that age has little predictive value. Let's check that by building another model, this time without age. 74 | 75 | ```{r} 76 | nb2 <- naiveBayes(survived~.-age, data=train) 77 | p2 <- predict(nb2, newdata=test[,-4], type="class") 78 | table(p2, test$survived) 79 | mean(p2==test$survived) 80 | ``` 81 | 82 | As it turns out, there is only a very slight difference in the accuracies. -------------------------------------------------------------------------------- /Part_2_Linear_Models/7_1_NBayes-titanic.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjmazidi/Machine_Learning_2nd_edition/31d34a8d5154855eae0b840335ce6552711375df/Part_2_Linear_Models/7_1_NBayes-titanic.pdf -------------------------------------------------------------------------------- /Part_2_Linear_Models/7_2_NBayes-scratch.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Naive Bayes from Scratch with the Titanic Data" 3 | author: "Karen Mazidi" 4 | output: 5 | pdf_document: default 6 | html_document: 7 | df_print: paged 8 | editor_options: 9 | chunk_output_type: console 10 | --- 11 | 12 | 13 | We will use the same data and data cleaning as in the first notebook in this chapter, so we repeate those steps first with no commentary. 14 | 15 | ### Load the data 16 | 17 | ```{r} 18 | df <- read.csv("data/titanic3.csv", header=TRUE, stringsAsFactors = FALSE) 19 | 20 | # subset to just columns survived, pclass, sex, and age 21 | df <- df[,c(1,2,4,5)] 22 | # pclass and survived and sex should be factors 23 | df$pclass <- factor(df$pclass) 24 | df$survived <- factor(df$survived) 25 | df$sex <- factor(df$sex, levels=c("female", "male")) 26 | 27 | # remove NAs 28 | df <- df[!is.na(df$pclass),] 29 | df <- df[!is.na(df$survived),] 30 | df$age[is.na(df$age)] <- median(df$age,na.rm=T) 31 | 32 | # divide into train and test 33 | set.seed(1234) 34 | i <- sample(1:nrow(df), 0.75*nrow(df), replace=FALSE) 35 | train <- df[i,] 36 | test <- df[-i,] 37 | 38 | # perform Naive Bayes 39 | library(e1071) 40 | nb1 <- naiveBayes(df[,-2], df[,2], data=train) 41 | pred <- predict(nb1, newdata=test[,-2], type="raw") 42 | 43 | # look at first 5 (actual: 0 1 1 1 0) 44 | pred[1:5,] 45 | ``` 46 | 47 | 48 | 49 | ### Calculate priors 50 | 51 | Using the training data we calculate prior probability of survived/perished as the percentage for each category. 
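As a reminder of what the scratch code is building toward, naive Bayes scores each class $c$ (survived or perished) for an observation with features $x_1, \dots, x_n$ as

$$P(c \mid x_1, \dots, x_n) \propto P(c) \prod_{i=1}^{n} P(x_i \mid c)$$

where $P(c)$ is the prior computed in the next chunk and the $P(x_i \mid c)$ terms are the likelihoods computed in the chunks after that.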
52 | 53 | ```{r} 54 | apriori <- c( 55 | nrow(df[df$survived=="0",])/nrow(df), 56 | nrow(df[df$survived=="1",])/nrow(df) 57 | ) 58 | print("Prior probability, survived=no, survived=yes:") 59 | apriori 60 | ``` 61 | 62 | ### Calculate likelihoods for qualitative data 63 | 64 | The likelihood for qualitative data is calculated as follows: 65 | 66 | * for each class 67 | * for each factor level i 68 | * likelihood (class=i|survived=yes) = count(factor = i and survived=yes) / count(survived=yes) 69 | * likelihood (class=i|survived=no) = count(factor = i and survived=no) / count(survived=np) 70 | 71 | we will use nrow() to get N 72 | 73 | ```{r} 74 | # get survived counts for no and yes 75 | count_survived <- c( 76 | length(df$survived[df$survived=="0"]), 77 | length(df$survived[df$survived=="1"]) 78 | ) 79 | # likelihood for pclass 80 | lh_pclass <- matrix(rep(0,6), ncol=3) 81 | for (sv in c("0", "1")){ 82 | for (pc in c("1","2","3")) { 83 | lh_pclass[as.integer(sv)+1, as.integer(pc)] <- 84 | nrow(df[df$pclass==pc & df$survived==sv,]) / count_survived[as.integer(sv)+1] 85 | } 86 | } 87 | 88 | # likelihood for sex 89 | lh_sex <- matrix(rep(0,4), ncol=2) 90 | for (sv in c("0", "1")){ 91 | for (sx in c(1, 2)) { 92 | lh_sex[as.integer(sv)+1, sx] <- 93 | nrow(df[as.integer(df$sex)==sx & df$survived==sv,]) / 94 | count_survived[as.integer(sv)+1] 95 | } 96 | } 97 | 98 | ``` 99 | 100 | ### likelihood p(survived|pclass) 101 | 102 | ```{r} 103 | print("Likelihood values for p(pclass|survived):") 104 | lh_pclass 105 | ``` 106 | 107 | ### likelihood p(survived|sex) 108 | 109 | ```{r} 110 | print("Likelihood values for p(sex|survived):") 111 | lh_sex 112 | ``` 113 | 114 | 115 | ### Calculate likelihoods for quantitative data 116 | 117 | Age is quantitative. We need to compute the mean and variance. 118 | 119 | ```{r} 120 | age_mean <- c(0, 0) 121 | age_var <- c(0, 0) 122 | for (sv in c("0", "1")){ 123 | age_mean[as.integer(sv)+1] <- 124 | mean(df$age[df$survived==sv]) 125 | age_var[as.integer(sv)+1] <- 126 | var(df$age[df$survived==sv]) 127 | } 128 | 129 | ``` 130 | 131 | ### Probability density for quantitative data 132 | 133 | For the qualitative variable we can calculate probailities by dividing but for the age variable we need a function that will calculate its probability. 134 | 135 | ```{r} 136 | calc_age_lh <- function(v, mean_v, var_v){ 137 | # run like this: calc_age_lh(6, 25.9, 138) 138 | 1 / sqrt(2 * pi * var_v) * exp(-((v-mean_v)^2)/(2 * var_v)) 139 | } 140 | 141 | ``` 142 | 143 | 144 | 145 | ### Function for scratch model 146 | 147 | Write a function to calculate raw probabilities given pclass, sex, and age. 148 | 149 | ```{r} 150 | calc_raw_prob <- function(pclass, sex, age) { 151 | # pclass=1,2,3 sex=1,2 age=numeric 152 | num_s <- lh_pclass[2, pclass] * lh_sex[2, sex] * apriori[2] * 153 | calc_age_lh(age, age_mean[2], age_var[2]) 154 | num_p <- lh_pclass[1, pclass] * lh_sex[1, sex] * apriori[1] * 155 | calc_age_lh(age, age_mean[1], age_var[1]) 156 | denominator <- lh_pclass[2, pclass] * lh_sex[2, sex] * calc_age_lh(age, age_mean[2], age_var[2]) * apriori[2] + 157 | lh_pclass[1, pclass] * lh_sex[1, sex] * calc_age_lh(age, age_mean[1], age_var[1]) * apriori[1] 158 | return (list(prob_survived <- num_s / denominator, prob_perished <- num_p / denominator)) 159 | } 160 | 161 | 162 | ``` 163 | 164 | ### Apply to the first 5 test observations 165 | 166 | Let's look at just the first 5 test observations. 
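(For reference, the calc_age_lh() helper defined above is just the Gaussian density

$$f(v \mid \mu, \sigma^2) = \frac{1}{\sqrt{2\pi\sigma^2}} \, e^{-\frac{(v-\mu)^2}{2\sigma^2}}$$

evaluated with the class-conditional mean and variance of age, which is how the quantitative predictor enters calc_raw_prob().)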
167 | 168 | 169 | ```{r} 170 | for (i in 1:5){ 171 | raw <- calc_raw_prob(test[i,1], as.integer(test[i,3]), test[i,4]) 172 | print(paste(raw[2], raw[1])) 173 | } 174 | 175 | pred[1:5,] 176 | ``` 177 | 178 | -------------------------------------------------------------------------------- /Part_2_Linear_Models/7_2_NBayes-scratch.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjmazidi/Machine_Learning_2nd_edition/31d34a8d5154855eae0b840335ce6552711375df/Part_2_Linear_Models/7_2_NBayes-scratch.pdf -------------------------------------------------------------------------------- /Part_2_Linear_Models/8-1-features.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Feature Selection" 3 | author: "Karen Mazidi" 4 | output: 5 | pdf_document: default 6 | html_notebook: default 7 | editor_options: 8 | chunk_output_type: inline 9 | --- 10 | 11 | ### Look for correlations in Pima data 12 | 13 | The findCorrelation() function suggests that we could remove column 6, mass, because it correlates with triceps. And that we could remove column 2, glucose, because it correlates with insulin. 14 | 15 | ```{r} 16 | library(caret) 17 | library(mlbench) 18 | data("PimaIndiansDiabetes2") 19 | df <- PimaIndiansDiabetes2[complete.cases(PimaIndiansDiabetes2[]),] 20 | corMatrix <- cor(df[,1:7]) 21 | findCorrelation(corMatrix, cutoff=0.5, verbose=TRUE) 22 | ``` 23 | ### Remove the highly correlated columns 24 | 25 | ```{r} 26 | df <- df[,-c(2,6)] 27 | ``` 28 | 29 | 30 | ### Rank features 31 | 32 | The varImp() function ranks variables by importance. It requires a model which we trained on method knn, using control parameters stored in variable ctrl. 33 | 34 | ```{r} 35 | ctrl <- trainControl(method="repeatedcv", repeats=5) 36 | model <- train(diabetes~., data=df, method="knn", preProcess="scale", trControl=ctrl) 37 | importance <- varImp(model, scale=FALSE) 38 | importance 39 | plot(importance) 40 | ``` 41 | ### Recursive feature selection 42 | 43 | We start with the data set including all columns. 
44 | 45 | ```{r} 46 | df <- PimaIndiansDiabetes2[complete.cases(PimaIndiansDiabetes2[]),] 47 | ctrl <- rfeControl(functions=rfFuncs, method="cv", number=10) 48 | rfe_out <- rfe(df[,1:7], df[,8], sizes=c(1:7), rfeControl=ctrl) 49 | rfe_out 50 | ``` 51 | 52 | ### FSelector 53 | 54 | 55 | 56 | ```{r} 57 | library(FSelector) 58 | var_scores <- random.forest.importance(diabetes~., df) 59 | var_scores 60 | ``` 61 | 62 | -------------------------------------------------------------------------------- /Part_2_Linear_Models/8-1-features.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjmazidi/Machine_Learning_2nd_edition/31d34a8d5154855eae0b840335ce6552711375df/Part_2_Linear_Models/8-1-features.pdf -------------------------------------------------------------------------------- /Part_2_Linear_Models/README.md: -------------------------------------------------------------------------------- 1 | # Part 2: Linear Models 2 | 3 | 4 | ## Chapter 5: Linear Regression 5 | 6 | * 5-1 Linear regression with the women data set 7 | * 5-2 Linear regression with the chick data set 8 | * 5-3 Ridge regression example 9 | 10 | ## Chapter 6: Logistic Regression 11 | 12 | * 6-1 Logistic regression on the plasma data 13 | * 6-2 Logistic regression on the titanic data 14 | * 6-3 Logistic regression on the iris data (multiclass classification) 15 | * 6-4 Logistic regression from scratch (gradient descent) 16 | 17 | ## Chapter 7: Naive Bayes 18 | 19 | * 7-1 Naive Bayes on the titanic data 20 | * 7-2 Naive Bayes from scratch 21 | 22 | ## Chapter 8: Predictive Modeling 23 | 24 | * 8-1 TBD 25 | * 8-2 TBD 26 | -------------------------------------------------------------------------------- /Part_3_Modern_R/09_ggplot.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Introduction to ggplot2" 3 | author: "Karen Mazidi" 4 | output: 5 | pdf_document: default 6 | html_notebook: default 7 | html_document: 8 | df_print: paged 9 | editor_options: 10 | chunk_output_type: console 11 | --- 12 | 13 | There are 7 grammatical elements in ggplot2, the first 3 of these are essential to getting something plotted: 14 | 15 | 16 | * data - the data being plotted should be the first argument, or specify data=... 17 | * aesthetics - the scales onto which we plot; use aes() to specify at least x= and y= if needed as well as other parameters for customization 18 | * geometries - visual elements such as points, lines, etc. 19 | * facets - for plotting multiples 20 | * statistics - representations to aid understanding 21 | * coordinates - space on which data will be plotted 22 | * themes - you can customize your own theme to use over and over 23 | 24 | ### load tidyverse and some data 25 | 26 | Loading the diabetes data set from package mlbench. 27 | 28 | ```{r, message=FALSE} 29 | library(tidyverse) 30 | library(mlbench) 31 | data("PimaIndiansDiabetes2") 32 | 33 | tb <- tbl_df(PimaIndiansDiabetes2) 34 | ``` 35 | 36 | 37 | ## Explore ggplot2 38 | 39 | Hadley Wickham developed ggplot2 in 2005, inspired by a grammar of graphics developed by Leland Wildinson in 1999. The ggplot2 functions are much more powerful than standard R graphs but also slower. 40 | 41 | We have a short example below showing important components of building a ggplot. First we specify the data, then the aesthetics which are how the data is represented, followed by the geometry and finally labels. 
42 | 43 | ```{r, message=FALSE} 44 | ggplot(tb, aes(x=mass, y=glucose)) + 45 | geom_point() + 46 | labs(title="Glucose and BMI", x="BMI", y="Glucose") 47 | ``` 48 | Next we add some color and a smoothing line which helps us see a trend in the data. By default the smoothing line to highlight the trend in the data 49 | 50 | ```{r} 51 | ggplot(tb, aes(x=mass, y=glucose)) + 52 | geom_point(pch=20, color='blue', size=1.5) + 53 | geom_smooth(method='lm', color='red', linetype=2) + 54 | labs(title="Glucose and BMI", x="BMI", y="Glucose") 55 | ``` 56 | 57 | ### informative graph 58 | 59 | 60 | ```{r} 61 | ggplot(tb, 62 | aes(x=tb$mass, y=tb$age, shape=diabetes, col=pregnant)) + 63 | geom_point(size=2) + 64 | labs(x="BMI", y="Age") 65 | ``` 66 | 67 | 68 | 69 | ### facet_grid 70 | 71 | * filter out rows with NAs in glucose or insulin 72 | * create 2 new factor columns, glucose_high and insulin_high 73 | * plot 74 | 75 | The facet grid for 2 binary variables has 4 windows for all combinations. 76 | 77 | ```{r} 78 | tb <- filter(tb, !is.na(glucose), !is.na(insulin)) 79 | 80 | tb <- mutate(tb, glucose_high = factor(ifelse(tb$glucose>mean(tb$glucose), 1, 0))) 81 | tb <- mutate(tb, insulin_high = factor(ifelse(tb$insulin>mean(tb$insulin), 1, 0))) 82 | 83 | ggplot(tb, 84 | aes(x=mass, y=age, shape=diabetes, col=pregnant)) + 85 | geom_point(size=2) + 86 | facet_grid(glucose_high~insulin_high) 87 | ``` 88 | 89 | 90 | ### histogram 91 | 92 | ```{r} 93 | ggplot(tb, aes(x=mass)) + 94 | geom_histogram(fill="cornsilk4") 95 | ``` 96 | 97 | ### boxplot and rug 98 | 99 | ```{r} 100 | ggplot(tb, aes(x=diabetes, y=mass)) + 101 | geom_boxplot(notch=TRUE) + 102 | geom_point(position="jitter", color="cornflowerblue", alpha=.5) + 103 | geom_rug(color="cornflowerblue") 104 | ``` 105 | 106 | ### density plot 107 | 108 | ```{r} 109 | ggplot(tb, aes(x=mass, fill=diabetes)) + 110 | geom_density(alpha=0.5) 111 | ``` 112 | ### bubble chart 113 | 114 | ```{r} 115 | ggplot(tb, 116 | aes(x=mass, y=glucose, size=pregnant)) + 117 | geom_point(shape=21, fill="cornflowerblue") 118 | ``` 119 | 120 | ### grid 121 | 122 | ```{r} 123 | library(gridExtra) 124 | p1 <- ggplot(tb, aes(x=insulin_high)) + geom_bar(fill="cornflowerblue") 125 | p2 <- ggplot(tb, aes(x=glucose_high)) + geom_bar(fill="cornflowerblue") 126 | grid.arrange(p1, p2, ncol=2) 127 | ``` 128 | 129 | -------------------------------------------------------------------------------- /Part_3_Modern_R/09_ggplot.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjmazidi/Machine_Learning_2nd_edition/31d34a8d5154855eae0b840335ce6552711375df/Part_3_Modern_R/09_ggplot.pdf -------------------------------------------------------------------------------- /Part_3_Modern_R/09_tidy.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Tidyverse Demo" 3 | output: 4 | html_document: 5 | df_print: paged 6 | html_notebook: default 7 | pdf_document: default 8 | --- 9 | 10 | Demonstrating tidyverse packages and functions. 
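One idiom used throughout this notebook is the pipe operator %>%, which passes the result on its left as the first argument of the function on its right. A minimal illustration with the built-in mtcars data, which is otherwise not used here:

```{r, eval=FALSE}
library(dplyr)

# take mtcars, keep rows with mpg > 25, then keep two columns
mtcars %>%
  filter(mpg > 25) %>%
  select(mpg, wt)
```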
11 | 12 | Install the tidyverse like this: 13 | ``` 14 | install.packages("tidyverse") 15 | ``` 16 | 17 | An associated book is [R for Data Science](https://r4ds.had.co.nz/) 18 | 19 | ### Create a tibble 20 | 21 | ```{r} 22 | # use a mlbench data frame 23 | library(mlbench) 24 | data("PimaIndiansDiabetes2") 25 | 26 | library(tidyverse) 27 | tb <- tbl_df(PimaIndiansDiabetes2) 28 | tb 29 | 30 | # remove the data frame to free up memory 31 | rm(PimaIndiansDiabetes2) 32 | ``` 33 | 34 | A glimpse is a view similar to str. 35 | 36 | ```{r} 37 | glimpse(tb) 38 | ``` 39 | 40 | ### The dplyr package 41 | 42 | Some dply functions work on columns. These are demonstrated below. 43 | 44 | #### select() 45 | 46 | Select a subset of columns. The select() function returns a tibble but it was not saved and will be discarded after the glimpse is output. 47 | 48 | ```{r} 49 | select(tb, diabetes, pregnant) %>% 50 | glimpse 51 | ``` 52 | 53 | or: 54 | 55 | ```{r} 56 | tb %>% 57 | select(diabetes, pregnant) %>% 58 | glimpse 59 | ``` 60 | 61 | #### mutate() 62 | 63 | The mutate() function can create new columns from old ones. 64 | 65 | ```{r} 66 | tb <- tb %>% 67 | mutate(glucose_high = factor( 68 | ifelse(glucose>mean(glucose, na.rm=TRUE), 1, 0))) 69 | 70 | tb[1:5, c(2, 10)] 71 | ``` 72 | 73 | We can also use mutate to delete a column by setting it to NULL. 74 | 75 | ```{r} 76 | tb <- tb %>% 77 | mutate(glucose_high = NULL) 78 | 79 | names(tb) 80 | ``` 81 | 82 | #### rename() 83 | 84 | Rename a column. 85 | 86 | ```{r} 87 | tb <- rename(tb, blood_pressure = pressure) 88 | ``` 89 | 90 | #### filter() 91 | 92 | The filter function can select rows. 93 | 94 | ```{r} 95 | tb <- filter(tb, !is.na(glucose), !is.na(mass)) 96 | glimpse(tb) 97 | ``` 98 | #### arrange() 99 | 100 | The following code arranges the rows by mass in descending order. 101 | 102 | ```{r} 103 | arrange(tb, desc(mass)) 104 | ``` 105 | 106 | ### summarize 107 | 108 | The summarize function computes statistical summaries of the data. 109 | 110 | ```{r} 111 | tb %>% 112 | summarize(min=min(mass), max=max(mass), sd=sd(mass)) 113 | ``` 114 | 115 | Another example: 116 | 117 | ```{r} 118 | tb %>% 119 | summarize(num_diabetic = sum(diabetes=="pos"), num_healthy = sum(diabetes=="neg")) 120 | ``` 121 | 122 | ### group_by 123 | 124 | ```{r} 125 | tb %>% 126 | group_by(diabetes) %>% 127 | summarize(median_BMI = median(mass, na.rm=TRUE)) 128 | ``` 129 | 130 | -------------------------------------------------------------------------------- /Part_3_Modern_R/09_tidy.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjmazidi/Machine_Learning_2nd_edition/31d34a8d5154855eae0b840335ce6552711375df/Part_3_Modern_R/09_tidy.pdf -------------------------------------------------------------------------------- /Part_3_Modern_R/Phishing.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Data Wrangling with the Tidyverse" 3 | author: "Karen Mazidi" 4 | output: 5 | pdf_document: default 6 | html_document: 7 | df_print: paged 8 | --- 9 | 10 | This notebook explores dplyr data wrangling with a phishing data set from the [UCI repository](https://archive.ics.uci.edu/ml/datasets/Phishing+Websites# 11 | ) 12 | 13 | The data was converted from Weka format to an R data frame using this code: 14 | 15 | ```{r} 16 | library(RWeka) 17 | df <- read.arff("Training Dataset.arff") 18 | ``` 19 | 20 | Now using tbl_df(), convert the dataframe to a tibble. 
Examine the tibble with glimpse. Notice that all columns are factors. Many have two levels, 1 and -1; some have a third level of 0. The documentation at the UCI ML link explains each column.

```{r, warning=FALSE}
library(tidyverse)
tb <- tbl_df(df)
glimpse(tb)
```

A ggplot bar plot shows that the target column Result is fairly evenly balanced.

```{r}
ggplot(tb, aes(x=Result)) + geom_bar(fill="cornflowerblue")
```

### Train Test Split

Here is a different way to split data. First create a row ID for each row, then sample 75% of the rows into train using sample_frac(), and put the rest into test using anti_join().

```{r}
tb <- tb %>% mutate(id=row_number())
set.seed(1234)
train <- tb %>% sample_frac(.75)
test <- anti_join(tb, train, by='id')
```

### Logistic regression on all predictors

The test accuracy is .93 using all 31 predictors. The MCC score is .87. MCC returns a value between -1 and +1. The formula is:

![MCC](mcc.png)

MCC, or Matthews correlation coefficient, is used for binary classification. Some people prefer MCC because it produces a high score only if all 4 categories TP, TN, FP, FN are good, and it takes the relative proportions of positive and negative observations into account.

```{r}
library(mltools)
glm1 <- glm(Result~.-id, data=train, family=binomial)
probs <- predict(glm1, newdata=test, type="response")
pred <- ifelse(probs>0.5, 2, 1)
acc1 <- mean(pred==as.integer(test$Result))
mcc1 <- mcc(pred, as.integer(test$Result))
```

## Finding a simpler model

The model seems to be good but is hard to interpret with so many predictors. Let's search for a simpler model by first eliminating the predictors whose coefficients have high p-values (low significance). Extracting the p-values from summary() gives a named vector, which is then subset to only those with p-values > .05.

These columns will be removed.

```{r}
temp <- summary(glm1)$coefficients[,4] # extract all 39 p-values
temp <- temp[temp>0.05]
temp
```

Next we use stringr's str_extract() and str_remove_all() functions to strip the dummy-variable endings from the coefficient names.

```{r}
delete_cols <- names(temp) %>% str_extract("\\w+") %>%
  str_remove_all("\\d$")
delete_cols
```

Remove the columns with high p-values.

```{r}
tb_reduced <- tb %>% select(-one_of(delete_cols))
glimpse(tb_reduced)
```

```{r}
train2 <- tb_reduced %>% sample_frac(.75)
test2 <- anti_join(tb_reduced, train2, by='id')
```

### Run logistic regression on the reduced data set

The logistic regression model on the reduced data set is slightly less accurate but more interpretable with only 16 predictors.
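(Referring back to the MCC image above: the standard form of the Matthews correlation coefficient, which is what mltools::mcc() computes from the confusion-matrix counts, is

$$MCC = \frac{TP \cdot TN - FP \cdot FN}{\sqrt{(TP+FP)(TP+FN)(TN+FP)(TN+FN)}}$$

which is why it only rewards a model that does reasonably well in all four cells of the confusion matrix.)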
96 | 97 | ```{r} 98 | glm2 <- glm(Result~.-id, data=train2, family=binomial) 99 | probs <- predict(glm2, newdata=test2, type="response") 100 | pred <- ifelse(probs>0.5, 2, 1) 101 | acc2 <- mean(pred==as.integer(test2$Result)) 102 | mcc2 <- mcc(pred, as.integer(test2$Result)) 103 | summary(glm2) 104 | ``` 105 | 106 | ### Group Features by Type 107 | 108 | The documentation at the link above grouped the features into 4 general categories: 109 | 110 | * address bar features 111 | * abnormal based features 112 | * html and JavaScript features 113 | * domain based features 114 | 115 | Looking at a few of the features in the domain based category does not show any strong relationship between the 2 or 3 levels of the feature and the target. Most of these got low p-values in the summary() done at the console, but their coefficients are small, indicating a minimal contribution to the log odds. 116 | 117 | ```{r} 118 | library(gridExtra) 119 | p1 <- ggplot(tb_reduced, aes(x=web_traffic, fill=Result)) + geom_bar() 120 | p2 <- ggplot(tb_reduced, aes(x=DNSRecord, fill=Result)) + geom_bar() 121 | p3 <- ggplot(tb_reduced, aes(x=Links_pointing_to_page, fill=Result)) + geom_bar() 122 | grid.arrange(p1, p2, p3, nrow=1) 123 | ``` 124 | 125 | 126 | Two predictors with larger coefficients for the dummy factors include SSLfinal_State (looks for suspicious htyps protocols) and URL_of_Anchor (looks for funny things in tags). We can see the same thing in the graph: 127 | 128 | * SSLfinal_State=1 has most of the observations as Result=1, most for levels 0 and -1 most of the observations have Result=-1 129 | * URL_of_Anchor levels 0 and 1 have most observations as Result=1 while level -1 has almost all observations a Result=-1 130 | 131 | This is an indication that there is predictive power in some of these levels. 132 | 133 | ```{r} 134 | p1 <- ggplot(tb_reduced, aes(x=SSLfinal_State, fill=Result)) + geom_bar() 135 | p2 <- ggplot(tb_reduced, aes(x=URL_of_Anchor, fill=Result)) + geom_bar() 136 | grid.arrange(p1, p2, nrow=1) 137 | 138 | ``` 139 | 140 | 141 | ### mutate and replace 142 | 143 | The next code uses mutate and replace to: 144 | 145 | * Make SSL final state binary by making 0 and -1 zero 146 | * Make URL_of_Anchor binary by making 0 and 1 one and -1 zero (0 becomes 1 and -1 becomes 0) 147 | 148 | 149 | ```{r} 150 | tb3 <- mutate(tb_reduced, SSLfinal_State = replace(SSLfinal_State, which(SSLfinal_State == -1), 0)) 151 | 152 | tb3 <- mutate(tb3, URL_of_Anchor = replace(URL_of_Anchor, which(URL_of_Anchor==0), 1)) 153 | tb3 <- mutate(tb3, URL_of_Anchor = replace(URL_of_Anchor, which(URL_of_Anchor==-1), 0)) 154 | 155 | ``` 156 | 157 | ### Another model 158 | 159 | A third logistic regression model is used, this time only use the two variable that were just mutated. The accuracy is lower than either model, but is much more interpretable as you can see in the summary. 
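One extra interpretability step, not run here: after glm3 is fit in the chunk below, exponentiating its coefficients converts log odds into odds ratios, which are often easier to explain than raw coefficients. This is only a sketch that relies on the glm3 object created in the next chunk.

```{r, eval=FALSE}
# run after the glm3 chunk below: odds ratios for the two predictors
exp(coef(glm3))
```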
160 | 161 | 162 | ```{r} 163 | train3 <- tb3 %>% sample_frac(.75) 164 | test3 <- anti_join(tb3, train3, by='id') 165 | 166 | glm3 <- glm(Result~SSLfinal_State+URL_of_Anchor, data=train3, family=binomial) 167 | probs <- predict(glm3, newdata=test3, type="response") 168 | pred <- ifelse(probs>0.5, 2, 1) 169 | acc3 <- mean(pred==as.integer(test3$Result)) 170 | mcc3 <- mcc(pred, as.integer(test3$Result)) 171 | summary(glm3) 172 | ``` 173 | 174 | -------------------------------------------------------------------------------- /Part_3_Modern_R/Phishing.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjmazidi/Machine_Learning_2nd_edition/31d34a8d5154855eae0b840335ce6552711375df/Part_3_Modern_R/Phishing.pdf -------------------------------------------------------------------------------- /Part_3_Modern_R/mcc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjmazidi/Machine_Learning_2nd_edition/31d34a8d5154855eae0b840335ce6552711375df/Part_3_Modern_R/mcc.png -------------------------------------------------------------------------------- /Part_3_Modern_R/readme.md: -------------------------------------------------------------------------------- 1 | # Part 3: Modern R 2 | 3 | These notebooks/pdfs accompany Chapter 9: Modern R and Chapter 10: ggplot2 4 | 5 | * tidy.Rmd and pdf explore the tidyvierse with the PimaIndiansDiabetes2 data set 6 | * ggplot.Rmd and pdf explore using ggplot2 with the PimaIndiansDiabetes2 data set 7 | * phishing.Rmd and pdf explore a phishing data set with Moder R 8 | -------------------------------------------------------------------------------- /Part_4_Search_Similarity/12_1_kNN_class.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: 3 | pdf_document: default 4 | html_document: default 5 | editor_options: 6 | chunk_output_type: inline 7 | --- 8 | # kNN - Classification 9 | ### Karen Mazidi 10 | 11 | This example shows how to do knn clustering for classification. 12 | 13 | The iris database comes with R. It has 150 instances and 5 columns: 14 | - Sepal.Length 15 | - Sepal.Width 16 | - Petal.Length 17 | - Petal.Width 18 | - Species: setosa, versicolor or virginica 19 | 20 | 21 | ### Load and look at the data 22 | ```{r} 23 | attach(iris) 24 | str(iris) # display the structure of the object 25 | summary(iris) 26 | ``` 27 | 28 | ### Plot the data 29 | 30 | We let the 3 classes show as 3 different colors with the bg parameter and the "unclass" values 1, 2, 3 representing the 3 types of irises. 31 | ```{r} 32 | plot(Petal.Length, Petal.Width, pch=21, bg=c("red","green3","blue") 33 | [unclass(Species)], main="Iris Data") 34 | ``` 35 | 36 | ### Pairs scatter plots 37 | 38 | ```{r} 39 | pairs(iris[1:4], main = "Iris Data", pch = 21, bg = c("red", "green3", "blue")[unclass(Species)]) 40 | ``` 41 | 42 | ### Divide into train/test sets 43 | 44 | We will randomly sample the data set to let 2/3 be training and 1/3 test, 45 | 46 | ```{r} 47 | set.seed(1958) # setting a seed gets the same results every time 48 | ind <- sample(2, nrow(iris), replace=TRUE, prob=c(0.67, 0.33)) 49 | iris.train <- iris[ind==1, 1:4] 50 | iris.test <- iris[ind==2, 1:4] 51 | iris.trainLabels <- iris[ind==1, 5] 52 | iris.testLabels <- iris[ind==2, 5] 53 | ``` 54 | 55 | ### Classify 56 | 57 | The knn() function uses Euclidean distance to find the k nearest neighbors. 
58 | 59 | Classificiation is decided by majority vote with ties broken at random. 60 | 61 | Using an odd k can avoid some ties. 62 | 63 | ```{r} 64 | library(class) 65 | iris_pred <- knn(train=iris.train, test=iris.test, cl=iris.trainLabels, k=3) 66 | ``` 67 | 68 | ### Compute accuracy 69 | 70 | We built a classifier with 98% accuracy. 71 | 72 | It's often a good idea to scale the variables for clustring to make the distance calculations better. However in this case, the 3 predictors are roughly in the same scale so it's probably not necessary. 73 | 74 | ```{r} 75 | results <- iris_pred == iris.testLabels 76 | acc <- length(which(results==TRUE)) / length(results) 77 | # or combine into one line: 78 | #acc <- length(which(iris_pred == iris.testLabels)) / length(iris_pred) 79 | table(results, iris_pred) 80 | acc 81 | ``` 82 | 83 | -------------------------------------------------------------------------------- /Part_4_Search_Similarity/12_1_kNN_class.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjmazidi/Machine_Learning_2nd_edition/31d34a8d5154855eae0b840335ce6552711375df/Part_4_Search_Similarity/12_1_kNN_class.pdf -------------------------------------------------------------------------------- /Part_4_Search_Similarity/12_2_kNN_reg.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | editor_options: 3 | chunk_output_type: inline 4 | output: 5 | pdf_document: default 6 | html_document: 7 | df_print: paged 8 | --- 9 | # kNN Clustering - Regression 10 | ### Karen Mazidi 11 | 12 | This example uses the Auto data set in package ISLR. First we try linear regression as a baseline and then see if knn can beat the linear model. 13 | 14 | ```{r} 15 | library(ISLR) 16 | attach(Auto) 17 | Auto$origin <- as.factor(origin) 18 | ``` 19 | 20 | Build a linear model with all predictors. 21 | 22 | ```{r} 23 | lm1 <- lm(mpg ~.-name, data=Auto) 24 | summary(lm1) 25 | ``` 26 | 27 | ## Train and test on a linear model 28 | 29 | The results from lm1 indicated that weight, year, and origin appear to be significant predictors so let's build a model just from those. 30 | 31 | First, randomly sample 80% from the data set, and let those indices be for training while the others are for the test set. 32 | 33 | ```{r} 34 | set.seed(1958) # for reproducible results 35 | i <- sample(1:nrow(Auto), round(nrow(Auto)*0.8), replace=FALSE) 36 | train <- Auto[i,] 37 | test <- Auto[-i,] 38 | lm2 <- lm(mpg~weight+year+origin, data=train) 39 | pred <- predict(lm2, newdata=test) 40 | cor_lm <- cor(pred, test$mpg) 41 | mse_lm <- mean((pred - test$mpg)^2) 42 | print(paste("cor=", cor_lm)) 43 | print(paste("mse=", mse_lm)) 44 | ``` 45 | 46 | 47 | The correlation for the linear model was 92% which is good. Can kNN do better? 48 | 49 | ### kNN for regression 50 | 51 | We will use the same train and test set as we used for the linear model. We will train on weight, year and origin as for the linear model. 52 | 53 | ```{r} 54 | library(caret) 55 | train$origin <- as.integer(train$origin) 56 | test$origin <- as.integer(test$origin) 57 | fit <- knnreg(train[,2:8],train[,1],k=3) 58 | predictions <- predict(fit, test[,2:8]) 59 | cor_knn1 <- cor(predictions, test$mpg) 60 | mse_knn1 <- mean((predictions - test$mpg)^2) 61 | print(paste("cor=", cor_knn1)) 62 | print(paste("mse=", mse_knn1)) 63 | ``` 64 | 65 | So the results were not as good for knn, the correlation lower and the mse was higher. 
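Recall what knnreg() is doing here: for a query point $x$, the prediction is simply the average mpg of its $k$ nearest training observations,

$$\hat{y}(x) = \frac{1}{k} \sum_{i \in N_k(x)} y_i$$

so the distance calculation, and therefore the scale of each predictor, drives everything. That motivates the scaling step that follows.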
66 | 67 | As discussed in class, we know that clustering algorithms work best if the data is scaled, so let's scale the data and try again. 68 | 69 | ```{r} 70 | train_scaled <- train[, 2:8] # omit name and don't scale mpg 71 | means <- sapply(train_scaled, mean) 72 | stdvs <- sapply(train_scaled, sd) 73 | train_scaled <- scale(train_scaled, center=means, scale=stdvs) 74 | test_scaled <- scale(test[, 2:8], center=means, scale=stdvs) 75 | 76 | fit <- knnreg(train_scaled, train$mpg, k=3) 77 | predictions <- predict(fit, test_scaled) 78 | cor_knn2 <- cor(predictions, test$mpg) 79 | mse_knn2 <- mean((predictions - test$mpg)^2) 80 | print(paste("cor=", cor_knn2)) 81 | print(paste("mse=", mse_knn2)) 82 | ``` 83 | 84 | Wow, scaling improved the results. Correlation is higher and mse is lower than the liner model. 85 | 86 | ### Finding the best k 87 | 88 | Try various values of k and plot the results. 89 | 90 | ```{r} 91 | cor_k <- rep(0, 20) 92 | mse_k <- rep(0, 20) 93 | i <- 1 94 | for (k in seq(1, 39, 2)){ 95 | fit_k <- knnreg(train_scaled,train$mpg, k=k) 96 | pred_k <- predict(fit_k, test_scaled) 97 | cor_k[i] <- cor(pred_k, test$mpg) 98 | mse_k[i] <- mean((pred_k - test$mpg)^2) 99 | print(paste("k=", k, cor_k[i], mse_k[i])) 100 | i <- i + 1 101 | } 102 | 103 | plot(1:20, cor_k, lwd=2, col='red', ylab="", yaxt='n') 104 | par(new=TRUE) 105 | plot(1:20, mse_k, lwd=2, col='blue', labels=FALSE, ylab="", yaxt='n') 106 | ``` 107 | Find the best k 108 | 109 | ```{r} 110 | which.min(mse_k) 111 | ``` 112 | 113 | ```{r} 114 | which.max(cor_k) 115 | ``` 116 | 117 | 118 | ### k=15 119 | 120 | Lets pick a different k and compare results. We run the model above. It is slightly worse than k=3. So the plot is very informative when picking k. 121 | 122 | ```{r} 123 | fit_15 <- knnreg(train_scaled,train$mpg,k=15) 124 | predictions_15 <- predict(fit_15, test_scaled) 125 | cor_knn15 <- cor(predictions_15, test$mpg) 126 | mse_knn15 <- mean((predictions_15 - test$mpg)^2) 127 | print(paste("cor=", cor_knn15)) 128 | print(paste("mse=", mse_knn15)) 129 | ``` 130 | 131 | 132 | 133 | -------------------------------------------------------------------------------- /Part_4_Search_Similarity/12_2_kNN_reg.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjmazidi/Machine_Learning_2nd_edition/31d34a8d5154855eae0b840335ce6552711375df/Part_4_Search_Similarity/12_2_kNN_reg.pdf -------------------------------------------------------------------------------- /Part_4_Search_Similarity/12_2_kNN_regression.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "kNN Regression" 3 | output: 4 | pdf_document: default 5 | html_notebook: default 6 | html_document: 7 | df_print: paged 8 | --- 9 | 10 | This notebook performs kNN regression on the Auto data in package ISLR. The [ISLR book](http://faculty.marshall.usc.edu/gareth-james/ISL/) is a really good and free resource for statistical learning in R. Two of the authors, Hastie and Tibshirani, also have a series of videos available [at this link](https://www.r-bloggers.com/in-depth-introduction-to-machine-learning-in-15-hours-of-expert-videos/) 11 | 12 | ### Linear Regression baseline 13 | 14 | This example uses the Auto data set in package ISLR. First we try linear regression as a baseline and then see if knn can beat the linear model. 
15 | 16 | 17 | ```{r} 18 | library(ISLR) 19 | attach(Auto) 20 | Auto$origin <- as.factor(origin) 21 | str(Auto) 22 | ``` 23 | 24 | #### Train test split 25 | 26 | Also remove the name column. 27 | 28 | ```{r} 29 | set.seed(1234) 30 | i <- sample(1:nrow(Auto), round(nrow(Auto)*0.8), replace=FALSE) 31 | train <- Auto[i, -9] 32 | test <- Auto[-i, -9] 33 | ``` 34 | 35 | 36 | Build a linear regression model with all predictors. 37 | 38 | ```{r} 39 | lm1 <- lm(mpg ~., data=train) 40 | summary(lm1) 41 | ``` 42 | 43 | #### Evaluate 44 | 45 | ```{r} 46 | pred1 <- predict(lm1, newdata=test) 47 | cor_lm1 <- cor(pred1, test$mpg) 48 | mse_lm1 <- mean((pred1 - test$mpg)^2) 49 | print(paste("cor=", cor_lm1)) 50 | print(paste("mse=", mse_lm1)) 51 | ``` 52 | 53 | These results aren't bad. Let's see what happens with kNN 54 | 55 | ### kNN for regression 56 | 57 | Notice that origin needs to be an integer. 58 | 59 | ```{r, warning=FALSE} 60 | library(caret) 61 | train$origin <- as.integer(train$origin) 62 | test$origin <- as.integer(test$origin) 63 | 64 | # fit the model 65 | fit <- knnreg(train[,2:8],train[,1],k=3) 66 | 67 | # evaluate 68 | pred2 <- predict(fit, test[,2:8]) 69 | cor_knn1 <- cor(pred2, test$mpg) 70 | mse_knn1 <- mean((pred2 - test$mpg)^2) 71 | print(paste("cor=", cor_knn1)) 72 | print(paste("mse=", mse_knn1)) 73 | ``` 74 | 75 | The results for kNN weren't quite as good as the linear regression model. 76 | 77 | One reason might be that we didn't scale the data. kNN will work better on scaled data. 78 | 79 | ### Scale the data 80 | 81 | Notice we are scaling both train and test on the means and standard deviations of the training set. This is considered a best practice so that information about the test data doesn't leak into the scaling. 82 | 83 | ```{r} 84 | train_scaled <- train[, 2:8] # omit name and don't scale mpg 85 | means <- sapply(train_scaled, mean) 86 | stdvs <- sapply(train_scaled, sd) 87 | train_scaled <- scale(train_scaled, center=means, scale=stdvs) 88 | test_scaled <- scale(test[, 2:8], center=means, scale=stdvs) 89 | ``` 90 | 91 | 92 | ### kNN on scaled data 93 | 94 | ```{r} 95 | fit <- knnreg(train_scaled, train$mpg, k=3) 96 | pred3 <- predict(fit, test_scaled) 97 | cor_knn2 <- cor(pred3, test$mpg) 98 | mse_knn2 <- mean((pred3 - test$mpg)^2) 99 | print(paste("cor=", cor_knn2)) 100 | print(paste("mse=", mse_knn2)) 101 | print(paste("rmse=", sqrt(mse_knn2))) 102 | ``` 103 | 104 | Wow. Now kNN has a higher correlation than linear regression and a lower mse. 105 | 106 | ### Find the best k 107 | 108 | Try various values of k and plot the results. 109 | 110 | ```{r} 111 | cor_k <- rep(0, 20) 112 | mse_k <- rep(0, 20) 113 | i <- 1 114 | for (k in seq(1, 39, 2)){ 115 | fit_k <- knnreg(train_scaled,train$mpg, k=k) 116 | pred_k <- predict(fit_k, test_scaled) 117 | cor_k[i] <- cor(pred_k, test$mpg) 118 | mse_k[i] <- mean((pred_k - test$mpg)^2) 119 | print(paste("k=", k, cor_k[i], mse_k[i])) 120 | i <- i + 1 121 | } 122 | plot(1:20, cor_k, lwd=2, col='red', ylab="", yaxt='n') 123 | par(new=TRUE) 124 | plot(1:20, mse_k, lwd=2, col='blue', labels=FALSE, ylab="", yaxt='n') 125 | ``` 126 | 127 | We can visually see that at k=1 the red correlation is highest and the blue mse is lowest. 128 | 129 | 130 | ```{r} 131 | which.min(mse_k) 132 | which.max(cor_k) 133 | ``` 134 | 135 | ### kNN with k=1 136 | 137 | The results were better with k=1. 
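A note of caution before refitting: k=1 memorizes the training set and can be sensitive to noise, so a single train/test split may flatter it. A more robust way to choose k is cross-validation, which the next notebook does by hand. For comparison, a hedged sketch of the same idea with caret's built-in tuning is shown below; it reuses the train_scaled matrix and train$mpg from above, and the argument names are the standard ones in current caret releases.

```{r, eval=FALSE}
library(caret)
set.seed(1234)
ctrl <- trainControl(method = "cv", number = 10)   # 10-fold CV on the training set
knn_cv <- train(x = train_scaled, y = train$mpg,
                method = "knn",
                tuneGrid = data.frame(k = seq(1, 39, 2)),
                trControl = ctrl)
knn_cv$bestTune   # k chosen by cross-validated RMSE
```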
138 | 139 | ```{r} 140 | fit <- knnreg(train_scaled, train$mpg, k=1) 141 | pred4 <- predict(fit, test_scaled) 142 | cor_knn3 <- cor(pred4, test$mpg) 143 | mse_knn3 <- mean((pred4 - test$mpg)^2) 144 | print(paste("cor=", cor_knn3)) 145 | print(paste("mse=", mse_knn3)) 146 | ``` 147 | -------------------------------------------------------------------------------- /Part_4_Search_Similarity/12_2_kNN_regression.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjmazidi/Machine_Learning_2nd_edition/31d34a8d5154855eae0b840335ce6552711375df/Part_4_Search_Similarity/12_2_kNN_regression.pdf -------------------------------------------------------------------------------- /Part_4_Search_Similarity/12_3_kNN_reg_cv.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | editor_options: 3 | chunk_output_type: inline 4 | output: 5 | pdf_document: default 6 | html_document: 7 | df_print: paged 8 | --- 9 | # kNN - Regression 10 | ## Using 10-fold cross validations 11 | ### Karen Mazidi 12 | 13 | Load the data 14 | 15 | ```{r} 16 | library(ISLR) 17 | df <- Auto[] 18 | df$origin <- as.integer(df$origin) 19 | # subset to columns mpg, weight, year, origin 20 | df <- data.frame(scale(df[, c(1, 5, 7, 8)] )) 21 | ``` 22 | 23 | ### Create the 10 folds 24 | 25 | We could do this manually but there is a function in caret that does this. Since the Auto data is a little less than 400 rows, we expect each of the 10 folds to be of legth 40 or less. We confirm that with sapply. 26 | 27 | 28 | ```{r} 29 | library(caret) 30 | set.seed(1234) 31 | folds <- createFolds(df$mpg, k=10) 32 | sapply(folds, length) 33 | ``` 34 | 35 | ### Look at the fold indices 36 | 37 | To get a better idea of the folds, let's just print the indices for each fold. 38 | 39 | ```{r} 40 | for (i in 1:10){ 41 | print(folds[[i]]) 42 | } 43 | ``` 44 | 45 | ### Perform 10-fold cv 46 | 47 | For now we will just let k=3 and perform 10-fold cv, then average the correlation and mse values. 48 | 49 | ```{r} 50 | test_mse <- rep(0, 10) 51 | test_cor <- rep(0, 10) 52 | for (i in 1:10){ 53 | fit <- knnreg(df[-folds[[i]], 2:4], df$mpg[-folds[[i]]], k=3) 54 | pred <- predict(fit, df[folds[[i]], 2:4]) 55 | test_cor[i] <- cor(pred, df$mpg[folds[[i]]]) 56 | test_mse[i] <- mean((pred - df$mpg[folds[[i]]])^2) 57 | } 58 | print(paste("Average correlation is ", round(mean(test_cor), 2))) 59 | print(paste("range is ", range(test_cor))) 60 | print(paste("Average mse is ", round(mean(test_mse), 2))) 61 | print(paste("range is ", range(test_mse))) 62 | ``` 63 | 64 | ### Try with various k 65 | 66 | We modify the code above to be an anonymous function called by sapply. 67 | 68 | 69 | ```{r} 70 | # try various values for k 71 | k_values <- seq(1, 39, 2) 72 | results <- sapply(k_values, function(k){ 73 | mse_k <- rep(0, 10) 74 | cor_k <- rep(0, 10) 75 | for (i in 1:10){ 76 | fit <- knnreg(df[-folds[[i]], 2:4], df$mpg[-folds[[i]]], k=k) 77 | pred <- predict(fit, df[folds[[i]], 2:4]) 78 | cor_k[i] <- cor(pred, df$mpg[folds[[i]]]) 79 | mse_k[i] <- mean((pred - df$mpg[folds[[i]]])^2) 80 | } 81 | #print(paste(mean(cor_k), mean(mse_k))) 82 | list(mean(cor_k), mean(mse_k)) 83 | }) 84 | # reshape results into matrix 85 | m <- matrix(results, nrow=20, ncol=2, byrow=TRUE) 86 | ``` 87 | 88 | 89 | ### Examine results 90 | 91 | Plot the correlation and mse for each value of k. 
92 | 93 | ```{r} 94 | par(mfrow=c(2, 1)) 95 | plot(1:20, unlist(m[,1]), lwd=2, type="o", col='red', ylab="Correlation") 96 | plot(1:20, unlist(m[,2]), lwd=2, type="o", col='blue', ylab="MSE") 97 | 98 | ``` 99 | -------------------------------------------------------------------------------- /Part_4_Search_Similarity/12_3_kNN_reg_cv.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjmazidi/Machine_Learning_2nd_edition/31d34a8d5154855eae0b840335ce6552711375df/Part_4_Search_Similarity/12_3_kNN_reg_cv.pdf -------------------------------------------------------------------------------- /Part_4_Search_Similarity/13-1-cluster_kmean_iris.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: 3 | html_document: default 4 | pdf_document: default 5 | editor_options: 6 | chunk_output_type: inline 7 | --- 8 | # K-means clustering 9 | ## Karen Mazidi 10 | 11 | 12 | 13 | ### Getting set up 14 | 15 | We are using the built-in iris data set. 16 | 17 | ```{r} 18 | library(datasets) 19 | head(iris) 20 | ``` 21 | 22 | Now we try clustering with kmeans(). 23 | 24 | We are using just Petal.Length and Petal.Width for clustering. The number of clusters is set to 3 and the number of starts is 20. 25 | 26 | 27 | ```{r} 28 | set.seed(1234) 29 | irisCluster <- kmeans(iris[, 3:4], 3, nstart=20) 30 | irisCluster 31 | ``` 32 | 33 | Compare the clusters with the species. This is not usally something we can do in clustering because we normally don't have labels. We are usually clustering blind, not knowing the true grouping in the data. 34 | 35 | ```{r} 36 | table(irisCluster$cluster, iris$Species) 37 | 38 | ``` 39 | 40 | Plot the clusters. 41 | 42 | ```{r} 43 | 44 | plot(iris$Petal.Length, iris$Petal.Width, pch=21, bg=c("red","green3","blue") 45 | [unclass(irisCluster$cluster)], main="Iris Data") 46 | ``` 47 | 48 | 49 | 50 | 51 | -------------------------------------------------------------------------------- /Part_4_Search_Similarity/13-1-cluster_kmean_iris.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjmazidi/Machine_Learning_2nd_edition/31d34a8d5154855eae0b840335ce6552711375df/Part_4_Search_Similarity/13-1-cluster_kmean_iris.pdf -------------------------------------------------------------------------------- /Part_4_Search_Similarity/13-2-cluster_kmean_wine.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: 3 | html_document: default 4 | pdf_document: default 5 | --- 6 | # Clustering 7 | ### Karen Mazidi 8 | 9 | Modified from Kabacoff, "R in Action", 2nd ed 10 | 11 | ## K-means 12 | 13 | Apply k-means to the wine data set, which contains 13 chemical measurements on 178 Italian wines. 14 | 15 | The first column, Type, indicates 1 or 3 wine varieties. We will drop this variable for the clustering. 16 | 17 | ```{r} 18 | data(wine, package="rattle") 19 | names(wine) 20 | head(wine) 21 | df <- scale(wine[-1]) 22 | head(df) 23 | ``` 24 | 25 | Write a function to plot the within-groups sums of squares vs. the number of clusters. 
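The quantity this function plots is the total within-cluster sum of squares,

$$WSS = \sum_{k=1}^{K} \sum_{x_i \in C_k} \lVert x_i - \mu_k \rVert^2$$

where $\mu_k$ is the centroid of cluster $C_k$. WSS always decreases as $K$ grows, so we look for an "elbow" where the improvement levels off rather than for a minimum.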
26 | 27 | 28 | ```{r} 29 | wsplot <- function(data, nc=15, seed=1234){ 30 | wss <- (nrow(data)-1)*sum(apply(data,2,var)) 31 | for (i in 2:nc){ 32 | set.seed(seed) 33 | wss[i] <- sum(kmeans(data,centers=i)$withinss) 34 | } 35 | plot(1:nc, wss, type="b", xlab="Number of Clusters", 36 | ylab="Within groups sum of squares") 37 | } 38 | wsplot(df) 39 | ``` 40 | 41 | 42 | Use the NbClust() function to help determine the best number of clusters. 43 | 44 | In the within-groups plot, we see an "elbow" around 3, suggesting that 3 clusters is a good choice. 45 | 46 | ```{r} 47 | library(NbClust) 48 | set.seed(1234) 49 | nc <- NbClust(df, min.nc=2, max.nc=15, method="kmeans") 50 | table(nc$Best.n[1,]) 51 | barplot(table(nc$Best.n[1,]), 52 | xlab="Number of Clusters", ylab="Number of Criteria", 53 | main="Number of Clusters Chosen by 26 Criteria") 54 | ``` 55 | 56 | ## KMeans 57 | 58 | Fit the model using the kmeans() function. We set a seed first so we get reproducible results. 59 | 60 | The centroids are found in fit.km$centers and we display those. 61 | 62 | ```{r} 63 | set.seed(1234) 64 | fit.km <- kmeans(df, 3, nstart=25) 65 | fit.km$size 66 | fit.km$centers 67 | 68 | ``` 69 | 70 | The centroids were calculated based on the scaled data. Next we use the aggregate() function along with the cluster membership to get variable means for each cluster in units of the original, unscaled, data. 71 | 72 | 73 | ```{r} 74 | aggregate(wine[-1], by=list(cluster=fit.km$cluster), mean) 75 | 76 | ``` 77 | 78 | ## Model Analysis 79 | 80 | If we cross-tabulate the Type in column 1 of the wine data with cluster membership, we see that the clusters are strongly correlated with the wine type. 81 | 82 | 83 | ```{r} 84 | ct.km <- table(wine$Type, fit.km$cluster) 85 | ct.km 86 | ``` 87 | 88 | We can quantify the agreement between the type and the cluster using an adjusted Rand index. The adjusted Rand index provides a measure of the agreement between two partitions, adjusted for chance. The range of the index is from -1 (no agreement) to +1 (perfect agreement). 89 | 90 | The results below show very good agreement! 91 | 92 | ```{r} 93 | library(flexclust) 94 | randIndex(ct.km) 95 | 96 | ``` 97 | 98 | -------------------------------------------------------------------------------- /Part_4_Search_Similarity/13-2-cluster_kmean_wine.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjmazidi/Machine_Learning_2nd_edition/31d34a8d5154855eae0b840335ce6552711375df/Part_4_Search_Similarity/13-2-cluster_kmean_wine.pdf -------------------------------------------------------------------------------- /Part_4_Search_Similarity/13-3-kmean-k_synthetic.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Exploring the Number of Clusters" 3 | author: "Karen Mazidi" 4 | output: 5 | html_document: 6 | df_print: paged 7 | pdf_document: default 8 | editor_options: 9 | chunk_output_type: inline 10 | --- 11 | 12 | ### Create synthetic data 13 | 14 | First we create some synthetic data using the rnorm() function. We create 3 distributions with centers (10, 3), (27, 2) and (41, 5). These are the "true" clusters but the regions overlap a little. We plot the unclustered data with different shapes for each distribution. 
15 | 16 | ```{r} 17 | set.seed(1234) 18 | x <- rep(0, 60) 19 | y <- rep(0, 60) 20 | x[1:20] <- rnorm(20, mean=10, sd=3) 21 | y[1:20] <- rnorm(20, mean=3, sd=1) 22 | x[21:40] <- rnorm(20, mean=27, sd=4) 23 | y[21:40] <- rnorm(20, mean=2, sd=1) 24 | x[41:60] <- rnorm(20, mean=41, sd=3) 25 | y[41:60] <- rnorm(20, mean=5, sd=1) 26 | # uncomment the next two lines to see what happens 27 | # with a more uniform distribution 28 | #x <- rnorm(60, mean=30, sd=10) 29 | #y <- rnorm(60, mean=3, sd=2) 30 | true <- c(rep(1,20), rep(2,20), rep(3,20)) 31 | plot(x, y, cex=1.5, pch=c(15, 16, 17)[true]) 32 | ``` 33 | 34 | ### k-means: one iteration 35 | 36 | Apply the k-means algoirthm with only one iteration and one start. 37 | 38 | ```{r} 39 | set.seed(1234) 40 | df <- data.frame(cbind(x, y)) 41 | res <- kmeans(df, 3, iter.max=1, nstart=1 ) 42 | plot(x, y, col=c("orange", "green", "purple")[res$cluster], cex=1.5, pch=c(15, 16, 17)[true]) 43 | ``` 44 | ### k-means: unlimited iterations 45 | 46 | Although when we ran one iteration we got a warning message that it did not converge, we se no change when we let it run as many iterations as needed. Typing res2$iter at the console shows that it only ran 2 iterations. 47 | 48 | ```{r} 49 | set.seed(1234) 50 | res3 <- kmeans(df, 3, nstart=1 ) 51 | plot(x, y, col=c("orange", "green", "purple")[res3$cluster], cex=1.5, pch=c(15, 16, 17)[true]) 52 | ``` 53 | ### Try k=2 54 | 55 | ```{r} 56 | set.seed(1234) 57 | res2 <- kmeans(df, 2, nstart= 5) 58 | plot(x, y, col=c("orange", "green", "purple", "blue")[res2$cluster], cex=1.5, pch=c(15, 16, 17)[true]) 59 | ``` 60 | 61 | ### Try k=4 62 | 63 | ```{r} 64 | set.seed(1234) 65 | res4 <- kmeans(df, 4, nstart= 5) 66 | plot(x, y, col=c("orange", "green", "purple", "blue")[res4$cluster], cex=1.5, pch=c(15, 16, 17)[true]) 67 | ``` 68 | ### Try 5 clusters 69 | 70 | ```{r} 71 | set.seed(1234) 72 | res5 <- kmeans(df, 5, nstart= 5) 73 | plot(x, y, col=c("orange", "green", "purple", "blue", "black")[res5$cluster], cex=1.5, pch=c(15, 16, 17)[true]) 74 | 75 | ``` 76 | ### withinss 77 | 78 | Our goal is to reduce within sum of squares, this means our clusters are more homogenous. Let's compare the withinss for k=2, k=4 and k=5. 79 | 80 | It seems there is a dramatic drop from k=2 to k=3 then it gradually decreases. It makes sense that the larger the number of clusters, the smaller the withinss. After all, if k=n then withnss would be 0. 81 | 82 | ```{r} 83 | print(paste("k=2: ", sum(res2$withinss))) 84 | print(paste("k=3: ", sum(res3$withinss))) 85 | print(paste("k=4: ", sum(res4$withinss))) 86 | print(paste("k=5: ", sum(res5$withinss))) 87 | 88 | ``` 89 | 90 | 91 | ### Finding k with a function 92 | 93 | We can write a function to randomly try different k values and plot the within sum of squares. 94 | 95 | ```{r} 96 | plot_withinss <- function(df, max_clusters){ 97 | withinss <- rep(0, max_clusters-1) 98 | for (i in 2:max_clusters){ 99 | set.seed(1234) 100 | withinss[i] <- sum(kmeans(df, i)$withinss) 101 | } 102 | plot(2:max_clusters, withinss[2:max_clusters], type="o", xlab="K", ylab="Within Sum Squares") 103 | } 104 | plot_withinss(df, 9) 105 | ``` 106 | 107 | 108 | ### NbClust() 109 | 110 | Next we try the NbClust() function to find the best number of clusters. 
111 | 112 | ```{r} 113 | library(NbClust) 114 | set.seed(1234) 115 | nc <- NbClust(df, min.nc=2, max.nc=9, method="kmeans") 116 | ``` 117 | 118 | ```{r} 119 | t <- table(nc$Best.n[1,]) 120 | t 121 | ``` 122 | 123 | ```{r} 124 | barplot(t, xlab="Number of Clusters", ylab = "Criteria") 125 | ``` 126 | 127 | ### Plot for the book 128 | 129 | ```{r} 130 | par(mfrow=c(2,1)) 131 | plot(x, y, col=c("orange", "green", "purple", "blue")[res3$cluster], cex=1.5, pch=c(15, 16, 17)[true]) 132 | plot(x, y, col=c("orange", "green", "purple", "blue")[res4$cluster], cex=1.5, pch=c(15, 16, 17)[true]) 133 | ``` 134 | 135 | -------------------------------------------------------------------------------- /Part_4_Search_Similarity/13-3-kmean-k_synthetic.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjmazidi/Machine_Learning_2nd_edition/31d34a8d5154855eae0b840335ce6552711375df/Part_4_Search_Similarity/13-3-kmean-k_synthetic.pdf -------------------------------------------------------------------------------- /Part_4_Search_Similarity/13-4-cluster_hier.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Hierarchical Clustering" 3 | author: "Karen Mazidi" 4 | output: 5 | html_document: 6 | df_print: paged 7 | pdf_document: default 8 | editor_options: 9 | chunk_output_type: inline 10 | --- 11 | 12 | ### Load the data 13 | 14 | This example uses the nutrient data set, which lists values for 5 nutrients (energy, protein, fat, calcium, iron) for 27 different meals. 15 | 16 | ```{r} 17 | library(flexclust) 18 | data(nutrient) 19 | str(nutrient) 20 | ``` 21 | 22 | ### Scale the data 23 | 24 | Taking a look at the data we see that each column is on its own scale. Clustering will perform better if the data is scaled. 25 | 26 | 27 | ```{r} 28 | head(nutrient) 29 | nutrient.scaled <- scale(nutrient) 30 | head(nutrient.scaled) 31 | ``` 32 | 33 | 34 | ### Distance 35 | 36 | Euclidean distances between each of the 27 food types are calculated, using average-linkage. 37 | 38 | The dendogram option hang=-1 causes the labels to be below 0 on the graph. 39 | 40 | The height indicates the criterion value at which clusters are joined. 41 | 42 | ```{r} 43 | d <- dist(nutrient.scaled) 44 | fit.average <- hclust(d, method="average") 45 | plot(fit.average, hang=-1, cex=.8, 46 | main="Hierarchical Clustering") 47 | 48 | ``` 49 | 50 | ### Cut the dendogram 51 | 52 | First, we are going to use our domain knowledge to add a column to nutrient indicating what type of food it is. Looking at the dendogram, this will not capture the hierarchy we see in the data but we will use it for illustration purposes. 53 | 54 | ```{r} 55 | library(NbClust) 56 | nutrient$Type <- "BEEF" 57 | nutrient$Type[6:7] <- "CHICKEN" 58 | nutrient$Type[9:10] <- "LAMB" 59 | nutrient$Type[16:27] <- "SEAFOOD" 60 | nutrient$Type[11:13] <- "PORK" 61 | nutrient$Type <- factor(nutrient$Type) 62 | ``` 63 | 64 | Try cuts from 3 to 11. 65 | 66 | ```{r} 67 | for (c in 3:11){ 68 | cluster_cut <- cutree(fit.average, c) 69 | table_cut <- table(cluster_cut, nutrient$Type) 70 | print(table_cut) 71 | ri <- randIndex(table_cut) 72 | print(paste("cut=", c, "Rand index = ", ri)) 73 | } 74 | ``` 75 | 76 | We don't get great results in terms of Type but cuts at 5, then 8-10 give the best correspondence with Type. 77 | 78 | Let's try calcium from 3 to 16. We chose 16 because there are 16 unique values of calcium. 
It seems that the cut at 16 had the highest Rand index. However, this is overfitting the data, so a more reasonable choice might be 9. 79 | 80 | ```{r} 81 | for (c in 3:16){ 82 | cluster_cut <- cutree(fit.average, c) 83 | table_cut <- table(cluster_cut, nutrient$calcium) 84 | print(table_cut) 85 | ri <- randIndex(table_cut) 86 | print(paste("cut=", c, "Rand index = ", ri)) 87 | } 88 | ``` -------------------------------------------------------------------------------- /Part_4_Search_Similarity/13-4-cluster_hier.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjmazidi/Machine_Learning_2nd_edition/31d34a8d5154855eae0b840335ce6552711375df/Part_4_Search_Similarity/13-4-cluster_hier.pdf -------------------------------------------------------------------------------- /Part_4_Search_Similarity/14_DT_boston.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Decision Trees for Regression" 3 | author: "Karen Mazidi" 4 | output: 5 | html_document: 6 | df_print: paged 7 | pdf_document: default 8 | --- 9 | 10 | ### Try linear regression on Boston 11 | 12 | We get a correlation of 0.9 and an rmse of 4.35. 13 | 14 | 15 | ```{r} 16 | library(tree) 17 | library(MASS) 18 | names(Boston) 19 | # divide into train and test 20 | set.seed(1234) 21 | i <- sample(nrow(Boston), 0.8*nrow(Boston), replace = FALSE) 22 | train <- Boston[i,] 23 | test <- Boston[-i,] 24 | lm1 <- lm(medv~., data=train) 25 | summary(lm1) 26 | pred <- predict(lm1, newdata=test) 27 | cor_lm <- cor(pred, test$medv) 28 | rmse_lm <- sqrt(mean((pred-test$medv)^2)) 29 | ``` 30 | 31 | ### Using tree 32 | 33 | Correlation was 0.8433 and rmse was 5.14. 34 | 35 | ```{r} 36 | tree1 <- tree(medv~., data=train) 37 | summary(tree1) 38 | pred <- predict(tree1, newdata=test) 39 | print(paste('correlation:', cor(pred, test$medv))) 40 | rmse_tree <- sqrt(mean((pred-test$medv)^2)) 41 | print(paste('rmse:', rmse_tree)) 42 | plot(tree1) 43 | text(tree1, cex=0.5, pretty=0) 44 | ``` 45 | 46 | 47 | ### cross validation 48 | 49 | ```{r} 50 | cv_tree <- cv.tree(tree1) 51 | plot(cv_tree$size, cv_tree$dev, type='b') 52 | ``` 53 | 54 | ### prune the tree 55 | 56 | 57 | ```{r} 58 | tree_pruned <- prune.tree(tree1, best=5) 59 | plot(tree_pruned) 60 | text(tree_pruned, pretty=0) 61 | ``` 62 | 63 | 64 | ### test on the pruned tree 65 | 66 | The cor is now 0.845, very slightly above the unpruned tree but still lower than linear regression. The rmse is 5.18, very similar to the unpruned tree but higher than linear regression. 67 | 68 | In this case pruning did not improve results on the test data, but the tree is simpler and easier to interpret. 69 | 70 | 71 | ```{r} 72 | pred_pruned <- predict(tree_pruned, newdata=test) 73 | cor_pruned <- cor(pred_pruned, test$medv) 74 | rmse_pruned <- sqrt(mean((pred_pruned-test$medv)^2)) 75 | ``` 76 | 77 | ### Random Forest 78 | 79 | The importance=TRUE argument tells randomForest to compute variable importance measures for the predictors. 80 | 81 | ```{r} 82 | library(randomForest) 83 | set.seed(1234) 84 | rf <- randomForest(medv~., data=train, importance=TRUE) 85 | rf 86 | ``` 87 | 88 | 89 | ### predict on the random forest 90 | 91 | Now the correlation is much higher than even linear regression and the rmse is almost half.
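One way to see why averaging many trees helps: if we think of each tree's prediction as a random quantity with variance $\sigma^2$, and any two trees have correlation $\rho$, then the variance of the average of $B$ trees is

$$\rho\,\sigma^2 + \frac{1-\rho}{B}\,\sigma^2,$$

so growing more trees drives the second term toward zero, while the random sampling of rows and predictors in the forest keeps $\rho$ small. (The symbols $\sigma^2$, $\rho$ and $B$ are introduced here just for this explanation.)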
92 | 93 | ```{r} 94 | pred_rf <- predict(rf, newdata=test) 95 | cor_rf <- cor(pred_rf, test$medv) 96 | print(paste('corr:', cor_rf)) 97 | rmse_rf <- sqrt(mean((pred_rf-test$medv)^2)) 98 | print(paste('rmse:', rmse_rf)) 99 | ``` 100 | 101 | 102 | ### bagging 103 | 104 | Setting mtry to the number of predictors, p, will result in bagging 105 | 106 | ```{r} 107 | bag <- randomForest(medv~., data=train, mtry=13) 108 | bag 109 | ``` 110 | 111 | ### predict 112 | 113 | Our results for bagging were slightly lower than for the random forest. 114 | 115 | ```{r} 116 | pred_bag <- predict(bag, newdata=test) 117 | cor_bag <- cor(pred_bag, test$medv) 118 | rmse_bag <- sqrt(mean((pred_bag-test$medv)^2)) 119 | ``` 120 | 121 | -------------------------------------------------------------------------------- /Part_4_Search_Similarity/14_DT_boston.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjmazidi/Machine_Learning_2nd_edition/31d34a8d5154855eae0b840335ce6552711375df/Part_4_Search_Similarity/14_DT_boston.pdf -------------------------------------------------------------------------------- /Part_4_Search_Similarity/14_DT_iris.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Decision Tree with the Iris Data" 3 | author: "Karen Mazidi" 4 | output: 5 | html_document: 6 | df_print: paged 7 | pdf_document: default 8 | 'pdf_document:': default 9 | editor_options: 10 | chunk_output_type: inline 11 | --- 12 | 13 | ### Using rpart 14 | 15 | 16 | ```{r} 17 | library(rpart) 18 | tree_iris <- rpart(Species~., data=iris, method="class") 19 | tree_iris 20 | summary(tree_iris) 21 | plot(tree_iris, uniform=TRUE) 22 | text(tree_iris, use.n=TRUE, all=TRUE, cex=.6) 23 | ``` 24 | 25 | ### Using tree 26 | 27 | ```{r} 28 | library(tree) 29 | tree_iris2 <- tree(Species~., data=iris) 30 | tree_iris2 31 | summary(tree_iris2) 32 | plot(tree_iris2) 33 | text(tree_iris2, cex=0.5, pretty=0) 34 | ``` 35 | 36 | ### train and test 37 | 38 | ```{r} 39 | 40 | set.seed(1958) 41 | i <- sample(150, 100, replace=FALSE) 42 | train <- iris[i,] 43 | test <- iris[-i,] 44 | tree_iris3 <- tree(Species~., data=train) 45 | pred <- predict(tree_iris3, newdata=test, type="class") 46 | table(pred, test$Species) 47 | mean(pred==test$Species) 48 | ``` 49 | 50 | 51 | -------------------------------------------------------------------------------- /Part_4_Search_Similarity/14_DT_iris.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjmazidi/Machine_Learning_2nd_edition/31d34a8d5154855eae0b840335ce6552711375df/Part_4_Search_Similarity/14_DT_iris.pdf -------------------------------------------------------------------------------- /Part_4_Search_Similarity/15-PCA-LDA.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "PCA and LDA" 3 | author: "Karen Mazidi" 4 | output: 5 | pdf_document: default 6 | html_notebook: default 7 | html_document: 8 | df_print: paged 9 | editor_options: 10 | chunk_output_type: inline 11 | --- 12 | 13 | ### Run PCA on the iris data 14 | 15 | ```{r} 16 | library(caret) 17 | data(iris) 18 | i <- sample(1:150, 100, replace=FALSE) 19 | train <- iris[i,] 20 | test <- iris[-i,] 21 | set.seed(1234) 22 | pca_out <- preProcess(train[,1:4], method=c("center", "scale", "pca")) 23 | pca_out 24 | 25 | ``` 26 | ### PCA plot 27 | 28 | ```{r} 29 | train_pc <- predict(pca_out, train[, 1:4]) 30 | test_pc 
<- predict(pca_out, test[,]) 31 | plot(test_pc$PC1, test_pc$PC2, pch=c(23,21,22)[unclass(test_pc$Species)], bg=c("red","green","blue")[unclass(test$Species)]) 32 | ``` 33 | ### PCA data in knn 34 | 35 | Now let's see if our two principal components can predict class. 36 | 37 | 38 | ```{r} 39 | train_df <- data.frame(train_pc$PC1, train_pc$PC2, train$Species) 40 | test_df <- data.frame(test_pc$PC1, test_pc$PC2, test$Species) 41 | library(class) 42 | set.seed(1234) 43 | pred <- knn(train=train_df[,1:2], test=test_df[,1:2], cl=train_df[,3], k=3) 44 | mean(pred==test$Species) 45 | ``` 46 | 47 | The accuracy is lower than if we used all 4 predictors. 48 | 49 | ```{r} 50 | library(tree) 51 | colnames(train_df) <- c("PC1", "PC2", "Species") 52 | colnames(test_df) <- c("PC1", "PC2", "Species") 53 | set.seed(1234) 54 | tree1 <- tree(Species~., data=train_df) 55 | plot(tree1) 56 | text(tree1, cex=0.5, pretty=0) 57 | 58 | pred <- predict(tree1, newdata=test_df, type="class") 59 | mean(pred==test$Species) 60 | ``` 61 | 62 | With the decision tree we got a little lower accuracy. 63 | 64 | ### LDA 65 | 66 | ```{r} 67 | library(MASS) 68 | lda1 <- lda(Species~., data=train) 69 | lda1$means 70 | ``` 71 | 72 | ### predict on test 73 | 74 | ```{r} 75 | lda_pred <- predict(lda1, newdata=test, type="class") 76 | lda_pred$class 77 | mean(lda_pred$class==test$Species) 78 | ``` 79 | 80 | ### plot 81 | 82 | ```{r} 83 | plot(lda_pred$x[,1], lda_pred$x[,2], pch=c(23,21,22)[unclass(lda_pred$class)], bg=c("red","green","blue")[unclass(test_pc$Species)]) 84 | ``` 85 | 86 | -------------------------------------------------------------------------------- /Part_4_Search_Similarity/15-PCA-LDA.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjmazidi/Machine_Learning_2nd_edition/31d34a8d5154855eae0b840335ce6552711375df/Part_4_Search_Similarity/15-PCA-LDA.pdf -------------------------------------------------------------------------------- /Part_4_Search_Similarity/readme.md: -------------------------------------------------------------------------------- 1 | # Part 4: Searching for Similarity 2 | 3 | These notebooks/pdfs accompany Chapter 12: Instance-Based Learning with kNN, Chapter 13: Clustering, and Chapter 14: Decision Trees 4 | 5 | * 12_1_kNN_class - kNN classification on the iris data set 6 | * 12_2_kNN_reg - kNN regression on the Auto data set 7 | * 12_3_kNN_reg_cv - cross validation techniques 8 | 9 | * 13_1_iris - k-means clustering the iris data set 10 | * 13_2_wine - k-means clustering the wine data set 11 | * 13_3_synthetic - k-means clustering on a synthetic data set 12 | * 13-4_hierarchical - hierarchical clustering example 13 | 14 | * 14_DT_iris - classification decision tree on the iris data 15 | * 14_DT_boston - regression decision tree on the Boston housing data, also demonstrating cross-validation pruning, random forests, and bagging 16 | 17 | -------------------------------------------------------------------------------- /Part_5_Kernel_Ensemble/16-SVM-1_iris.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjmazidi/Machine_Learning_2nd_edition/31d34a8d5154855eae0b840335ce6552711375df/Part_5_Kernel_Ensemble/16-SVM-1_iris.pdf -------------------------------------------------------------------------------- /Part_5_Kernel_Ensemble/16-SVM-2_housing.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "SVM Regression" 3
| output: 4 | pdf_document: default 5 | html_document: 6 | df_print: paged 7 | editor_options: 8 | chunk_output_type: console 9 | --- 10 | 11 | Load packages and data. 12 | 13 | ```{r} 14 | library(e1071) 15 | library(MASS) 16 | df <- Boston[] 17 | ``` 18 | 19 | ### Divide into train, test, validate 20 | 21 | ```{r} 22 | set.seed(1234) 23 | spec <- c(train=.6, test=.2, validate=.2) 24 | i <- sample(cut(1:nrow(df), 25 | nrow(df)*cumsum(c(0,spec)), labels=names(spec))) 26 | train <- df[i=="train",] 27 | test <- df[i=="test",] 28 | vald <- df[i=="validate",] 29 | ``` 30 | 31 | ### Try linear regression 32 | 33 | ```{r} 34 | lm1 <- lm(medv~., data=train) 35 | pred <- predict(lm1, newdata=test) 36 | cor_lm1 <- cor(pred, test$medv) 37 | mse_lm1 <- mean((pred-test$medv)^2) 38 | ``` 39 | 40 | 41 | 42 | ### Try a linear kernel 43 | 44 | ```{r} 45 | svm1 <- svm(medv~., data=train, kernel="linear", cost=10, scale=TRUE) 46 | summary(svm1) 47 | pred <- predict(svm1, newdata=test) 48 | cor_svm1 <- cor(pred, test$medv) 49 | mse_svm1 <- mean((pred - test$medv)^2) 50 | ``` 51 | 52 | ### Tune 53 | 54 | ```{r} 55 | tune_svm1 <- tune(svm, medv~., data=vald, kernel="linear", 56 | ranges=list(cost=c(0.001, 0.01, 0.1, 1, 5, 10, 100))) 57 | summary(tune_svm1) 58 | ``` 59 | 60 | ### Evaluate on best linear svm 61 | 62 | Since our validation set is small, only about 100 observations, we probably did not get hyperparameters that generalize to the full data set. 63 | 64 | ```{r} 65 | pred <- predict(tune_svm1$best.model, newdata=test) 66 | cor_svm1_tune <- cor(pred, test$medv) 67 | mse_svm1_tune <- mean((pred - test$medv)^2) 68 | ``` 69 | 70 | ### Try a polynomial kernel 71 | 72 | ```{r} 73 | svm2 <- svm(medv~., data=train, kernel="polynomial", cost=10, scale=TRUE) 74 | summary(svm2) 75 | pred <- predict(svm2, newdata=test) 76 | cor_svm2 <- cor(pred, test$medv) 77 | mse_svm2 <- mean((pred - test$medv)^2) 78 | ``` 79 | 80 | ### Try a radial kernel 81 | 82 | 83 | ```{r} 84 | svm3 <- svm(medv~., data=train, kernel="radial", cost=10, gamma=1, scale=TRUE) 85 | summary(svm3) 86 | pred <- predict(svm3, newdata=test) 87 | cor_svm3 <- cor(pred, test$medv) 88 | mse_svm3 <- mean((pred - test$medv)^2) 89 | ``` 90 | 91 | ### Tune hyperparameters 92 | 93 | ```{r} 94 | set.seed(1234) 95 | tune.out <- tune(svm, medv~., data=vald, kernel="radial", 96 | ranges=list(cost=c(0.1,1,10,100,1000), 97 | gamma=c(0.5,1,2,3,4))) 98 | summary(tune.out) 99 | svm4 <- svm(medv~., data=train, kernel="radial", cost=100, gamma=0.5, scale=TRUE) 100 | summary(svm4) 101 | pred <- predict(svm4, newdata=test) 102 | cor_svm4 <- cor(pred, test$medv) 103 | mse_svm4 <- mean((pred - test$medv)^2) 104 | ``` 105 | 106 | 107 | 108 | 109 | -------------------------------------------------------------------------------- /Part_5_Kernel_Ensemble/16-SVM-2_housing.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjmazidi/Machine_Learning_2nd_edition/31d34a8d5154855eae0b840335ce6552711375df/Part_5_Kernel_Ensemble/16-SVM-2_housing.pdf -------------------------------------------------------------------------------- /Part_5_Kernel_Ensemble/17_ensemble_phishing.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Ensemble Methods" 3 | author: "Karen Mazidi" 4 | output: 5 | pdf_document: default 6 | html_document: 7 | df_print: paged 8 | --- 9 | 10 | We use a phishing data set with a binary target coded +1 or -1, and convert it from Weka ARFF format to a data frame.
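Throughout this notebook we report the Matthews correlation coefficient (MCC) alongside accuracy, using mcc() from the mltools package. For a binary confusion matrix with counts TP, TN, FP and FN, the MCC is

$$MCC = \frac{TP \times TN - FP \times FN}{\sqrt{(TP+FP)(TP+FN)(TN+FP)(TN+FN)}},$$

which ranges from -1 to +1 and, unlike accuracy, remains informative when the two classes are unbalanced.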
11 | 12 | ```{r} 13 | library(RWeka) 14 | df <- read.arff("phishing/Training Dataset.arff") 15 | str(df) 16 | ``` 17 | 18 | 19 | ### Train Test Split 20 | 21 | 22 | ```{r} 23 | set.seed(1234) 24 | i <- sample(nrow(df), .75*nrow(df), replace=FALSE) 25 | train <- df[i,] 26 | test <- df[-i,] 27 | ``` 28 | 29 | 30 | ### Logistic regression on all predictors 31 | 32 | 33 | ```{r} 34 | library(mltools) 35 | glm1 <- glm(Result~., data=train, family=binomial) 36 | probs <- predict(glm1, newdata=test, type="response") 37 | pred <- ifelse(probs>0.5, 2, 1) 38 | acc_logreg <- mean(pred==as.integer(test$Result)) 39 | mcc_logreg <- mcc(pred, as.integer(test$Result)) 40 | print(paste("accuracy=", acc_logreg)) 41 | print(paste("mcc=", mcc_logreg)) 42 | ``` 43 | 44 | 45 | ### Random Forest 46 | 47 | ```{r} 48 | library(randomForest) 49 | set.seed(1234) 50 | rf <- randomForest(Result~., data=train, importance=TRUE) 51 | rf 52 | ``` 53 | 54 | 55 | ```{r} 56 | pred <- predict(rf, newdata=test, type="response") 57 | acc_rf <- mean(pred==test$Result) 58 | mcc_rf <- mcc(factor(pred), test$Result) 59 | print(paste("accuracy=", acc_rf)) 60 | print(paste("mcc=", mcc_rf)) 61 | ``` 62 | 63 | 64 | ### boosting from adabag library 65 | 66 | ```{r} 67 | library(adabag) 68 | adab1 <- boosting(Result~., data=train, boos=TRUE, mfinal=20, coeflearn='Breiman') 69 | summary(adab1) 70 | ``` 71 | 72 | 73 | ```{r} 74 | pred <- predict(adab1, newdata=test, type="response") 75 | acc_adabag <- mean(pred$class==test$Result) 76 | mcc_adabag <- mcc(factor(pred$class), test$Result) 77 | print(paste("accuracy=", acc_adabag)) 78 | print(paste("mcc=", mcc_adabag)) 79 | ``` 80 | 81 | 82 | ### fastAdaboost 83 | 84 | ```{r} 85 | library(fastAdaboost) 86 | set.seed(1234) 87 | fadab <- adaboost(Result~., train, 10) 88 | summary(fadab) 89 | ``` 90 | 91 | ```{r} 92 | pred <- predict(fadab, newdata=test, type="response") 93 | # pred$class holds the classification 94 | acc_fadab <- mean(pred$class==test$Result) 95 | mcc_fadab <- mcc(pred$class, test$Result) 96 | print(paste("accuracy=", acc_fadab)) 97 | print(paste("mcc=", mcc_fadab)) 98 | ``` 99 | 100 | ### XGBoost 101 | 102 | ```{r} 103 | library(xgboost) 104 | train_label <- ifelse(train$Result==1, 1, 0) 105 | train_matrix <- data.matrix(train[, -31]) 106 | model <- xgboost(data=train_matrix, label=train_label, 107 | nrounds=100, objective='binary:logistic') 108 | ``` 109 | 110 | ```{r} 111 | test_label <- ifelse(test$Result==1, 1, 0) 112 | test_matrix <- data.matrix(test[, -31]) 113 | 114 | probs <- predict(model, test_matrix) 115 | pred <- ifelse(probs>0.5, 1, 0) 116 | 117 | acc_xg <- mean(pred==test_label) 118 | mcc_xg <- mcc(pred, test_label) 119 | print(paste("accuracy=", acc_xg)) 120 | print(paste("mcc=", mcc_xg)) 121 | ``` 122 | 123 | ### SuperLearner 124 | 125 | Had to install packages: ranger kernlab 126 | 127 | Super is not super. Can get better results with a lot of parameter tuning, but why? There are better methods. 
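For context, SuperLearner is a stacking ensemble: it fits each base learner with cross-validation and then learns a weighted combination of their held-out predictions. The toy sketch below illustrates only that weighting idea, not the package's actual code; the vectors y, p1 and p2 and the grid search are made up purely for illustration.

```{r}
# Toy illustration of the stacking idea (not SuperLearner's real implementation):
# given held-out probability predictions p1 and p2 from two base learners and
# true 0/1 labels y, choose the convex weight w for w*p1 + (1-w)*p2 that
# minimizes squared error on the held-out data.
set.seed(1234)
y  <- rbinom(100, 1, 0.5)
p1 <- ifelse(y == 1, runif(100, 0.5, 1), runif(100, 0, 0.5))  # a decent learner
p2 <- runif(100)                                              # a learner that guesses
w_grid <- seq(0, 1, by = 0.01)
sse <- sapply(w_grid, function(w) sum((w * p1 + (1 - w) * p2 - y)^2))
w_grid[which.min(sse)]  # most of the weight goes to the better learner
```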
128 | 129 | ```{r} 130 | library(SuperLearner) 131 | 132 | set.seed(1234) 133 | model <- SuperLearner(train_label, 134 | train[, -31], 135 | family=binomial(), 136 | SL.library=list("SL.ranger", 137 | "SL.ksvm", 138 | "SL.ipredbagg")) 139 | 140 | model 141 | ``` 142 | 143 | 144 | ```{r} 145 | probs <- predict.SuperLearner(model, newdata=test[,-31]) 146 | pred <- ifelse(probs$pred>0.5, 1, 0) 147 | acc_sl <- mean(pred==test_label) 148 | mcc_sl <- mcc(as.integer(pred), as.integer(test_label)) 149 | print(paste("accuracy=", acc_sl)) 150 | print(paste("mcc=", mcc_sl)) 151 | ``` 152 | 153 | 154 | 155 | -------------------------------------------------------------------------------- /Part_5_Kernel_Ensemble/17_ensemble_phishing.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjmazidi/Machine_Learning_2nd_edition/31d34a8d5154855eae0b840335ce6552711375df/Part_5_Kernel_Ensemble/17_ensemble_phishing.pdf -------------------------------------------------------------------------------- /Part_5_Kernel_Ensemble/readme.md: -------------------------------------------------------------------------------- 1 | Part 5 of the book explores ensemble methods and kernel methods. 2 | 3 | These notebooks/pdfs accompany Chapter 16: Support Vector Machines 4 | 5 | * 16-1 svm classification on the iris data 6 | * 16-2 svm regression on the Boston housing data 7 | 8 | These notebooks/pdfs accompany Chapter 18: XGBoost 9 | 10 | * 18-1 XGBoost example on the built-in mushroom data set 11 | -------------------------------------------------------------------------------- /Part_5_Kernel_Ensemble/xgboost1.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: 'XGBoost Demo: Mushroom Data Set' 3 | output: 4 | html_document: 5 | df_print: paged 6 | html_notebook: default 7 | pdf_document: default 8 | editor_options: 9 | chunk_output_type: console 10 | --- 11 | 12 | This notebook is adapted from an [R-bloggers post](https://www.r-bloggers.com/an-introduction-to-xgboost-r-package/). 13 | 14 | The XGBoost library in R provides an optimized implementation of the gradient boosted trees algorithm. XGBoost has received a lot of attention due to being used in many winning solutions for machine learning challenges on Kaggle. The R package won the 2016 John M. Chambers Statistical Software Award. The award is well-deserved since this package runs faster than Python sklearn's version and another R implementation, gbm. The computational part of the package is written in C++ and can take advantage of multithreading on a single machine. 15 | 16 | This notebook demonstrates the use of the algorithm on the mushroom data set, built into the package. 17 | 18 | ### Load the data 19 | 20 | ```{r} 21 | require(xgboost) 22 | 23 | data(agaricus.train, package='xgboost') 24 | data(agaricus.test, package='xgboost') 25 | train <- agaricus.train 26 | test <- agaricus.test 27 | print(train$data[1, 1:5]) # look at the first 5 features 28 | ``` 29 | 30 | 31 | The train data has 6513 rows and 126 columns in a sparse matrix. The test data has 1611 observations. 32 | 33 | For both train and test, 'data' and 'label' are separated. 34 | 35 | ```{r} 36 | print(dim(test$data)) 37 | head(train$label) 38 | ``` 39 | 40 | ### Set training parameters 41 | 42 | The nrounds argument specifies the number of decision trees in the final model. The objective argument is the training objective.
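Conceptually, XGBoost adds trees one at a time to minimize a regularized objective. Using notation from the XGBoost documentation (introduced here only for explanation), with loss $l$, predictions $\hat{y}_i$, and trees $f_k$:

$$\mathcal{L} = \sum_i l(y_i, \hat{y}_i) + \sum_k \Omega(f_k), \qquad \Omega(f) = \gamma T + \tfrac{1}{2}\lambda \lVert w \rVert^2,$$

where $T$ is the number of leaves in a tree and $w$ its leaf weights. The binary:logistic objective used below plugs the logistic loss in for $l$ and returns predicted probabilities.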
43 | 44 | ```{r} 45 | model <- xgboost(data=train$data, label=train$label, 46 | nrounds=2, objective='binary:logistic') 47 | ``` 48 | 49 | ### Evaluate the model 50 | 51 | 100% accuracy on this built-in data set. 52 | 53 | ```{r} 54 | pred <- predict(model, test$data) 55 | pred <- ifelse(pred>0.5, 1, 0) 56 | table(pred, test$label) 57 | ``` 58 | 59 | ### Cross validation 60 | 61 | The package also supports cross validation with function xgb.cv(). The same arguments are used as in the xgboost() function, with an additional argument for the number of folds. 62 | 63 | ```{r} 64 | cv.res <- xgb.cv(data=train$data, label=train$label, 65 | nfold=5, nrounds=2, objective='binary:logistic') 66 | ``` 67 | 68 | ### Plot the tree 69 | 70 | The package also includes a function to plot the tree. The code below plots the model built above. A more readable plot could be obtained by training another model with the parameter 'max.depth=2' in the xgboost() function. 71 | 72 | Note that the DiagrammeR package is required to plot the tree. 73 | 74 | ```{r} 75 | xgb.plot.tree(feature_names = agaricus.train$data@Dimnames[[2]], model=model) 76 | ``` 77 | ### Ensembling trees 78 | 79 | The model above built only 2 trees. If there are many trees, the plot will be even harder to read. The package includes a function to ensemble several trees into one. 80 | 81 | ```{r} 82 | bst <- xgboost(data = train$data, label = train$label, max.depth = 15, 83 | eta = 1, nthread = 2, nround = 30, objective = "binary:logistic", 84 | min_child_weight = 50) 85 | xgb.plot.multi.trees(model = bst, feature_names = agaricus.train$data@Dimnames[[2]], features.keep = 3) 86 | ``` 87 | 88 | ### Examining feature importance 89 | 90 | The xgb.importance() function accumulates the gain of each feature's splits over all the trees to find the most important features. If the number of features is large, importance can be summed for clusters of features. 91 | 92 | ```{r} 93 | bst <- xgboost(data = train$data, label = train$label, max.depth = 2, 94 | eta = 1, nthread = 2, nround = 2,objective = "binary:logistic") 95 | importance_matrix <- xgb.importance(agaricus.train$data@Dimnames[[2]], model = bst) 96 | xgb.plot.importance(importance_matrix) 97 | ``` 98 | 99 | 100 | ### Deepness 101 | 102 | Limiting the depth of trees can help avoid overfitting the model. The package includes a deepness plot to determine a good choice for depth. The plots below indicate few leaves at levels 5 and 6. The bottom plot shows the normalized weighted sum of instances per leaf. 103 | 104 | ```{r} 105 | bst <- xgboost(data = train$data, label = train$label, max.depth = 15, 106 | eta = 1, nthread = 2, nround = 30, objective = "binary:logistic", 107 | min_child_weight = 50) 108 | xgb.plot.deepness(model = bst) 109 | 110 | ``` 111 | 112 | -------------------------------------------------------------------------------- /Part_6_Python_ML/Chapter_19/0 - What is a jupyter notebook_.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## What is a Jupyter notebook?\n", 8 | "\n", 9 | "According to [the official site](http://jupyter.org/), Project Jupyter is an open source project that allows you to share code in an interactive platform. JupyterLab is the next evolution from earlier Jupyter notebooks. Jupyter now supports many programming languages besides Python 2 and Python 3.
See a list [here](https://github.com/jupyter/jupyter/wiki/Jupyter-kernels).\n", 10 | "\n", 11 | "### installing\n", 12 | "\n", 13 | "JupyterLab is not required for the course assignments but you will need it if you want to run the notebooks on your own computer. If you want to install it, read the [documentation here](http://jupyterlab.readthedocs.io). The Jupyter docs recommend installing Anaconda; however, if you already have Python installed, just install jupyter with pip/pip3 commands. \n", 14 | "\n", 15 | "### running\n", 16 | "\n", 17 | "At terminal, in the folder of your choice, type **jupyter lab** and the server will start, and then open your default web browser to the local host URL. When you want to exit, save and close any notebooks you have open, close the browser tab, and hit ctrl-c a couple of times in the terminal window to shut down. \n", 18 | "\n", 19 | "When you first run you will see the folder contents. You can click on a notebook to open it or also do other things like rename or delete it. To create a new notebook, use the *File->New* menu.\n", 20 | "\n", 21 | "### things to notice\n", 22 | "\n", 23 | "* the cells can be of different types: **markdown** for text cells and **code** for code\n", 24 | "* the toolbar lets you do the usual things like cut, copy, paste, as well as the up/down arrows to move cells, and the play/stop icons to run code or stop it\n", 25 | "* there is also a menu at the top with self-explanatory labels\n", 26 | "\n", 27 | "## markdown\n", 28 | "\n", 29 | "The next cell shows the markdown that created the formatting for this paragraph. Markdown is a nice system because you can format while typing without taking your hands off the keyboard. A heading-2 was created by two hash tags at the beginning of the line. Bold text was created by surrounding text with two asterisks. Surrounding text by one asterisk creates *italics*. To learn more about Markdown, click **Help** above, then **Markdown**.\n", 30 | "\n", 31 | "\n", 32 | "\n" 33 | ] 34 | }, 35 | { 36 | "cell_type": "raw", 37 | "metadata": {}, 38 | "source": [ 39 | "## markdown\n", 40 | "\n", 41 | "The next cell shows the markdown that created the formatting for this paragraph. Markdown is a nice system because you can format while typing without taking your hands off the keyboard. A heading-2 was created by two hash tags at the beginning of the line. Bold text was created by surrounding text with two asterisks. Surrounding text by one asterisk creates *italics*. To learn more about Markdown, click **Help** above, then **Markdown**.\n" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "Let's write some simple Python code in the next cell. \n", 49 | "\n", 50 | "If you are viewing the notebook instead of the html version, you can hit shift-enter to run the cell." 
51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 1, 56 | "metadata": {}, 57 | "outputs": [ 58 | { 59 | "name": "stdin", 60 | "output_type": "stream", 61 | "text": [ 62 | "Hi, tell me your name: Karen\n" 63 | ] 64 | }, 65 | { 66 | "name": "stdout", 67 | "output_type": "stream", 68 | "text": [ 69 | "Hello Karen\n" 70 | ] 71 | } 72 | ], 73 | "source": [ 74 | "name = input(\"Hi, tell me your name: \")\n", 75 | "print(\"Hello \", name)" 76 | ] 77 | }, 78 | { 79 | "cell_type": "markdown", 80 | "metadata": {}, 81 | "source": [ 82 | "You can add more cells using the >| icon above.\n", 83 | "\n", 84 | "Jupyter allows you to intersperse your code with explanation which is a great way to share it with others.\n", 85 | "\n", 86 | "Jupyter will remember variables from cell to cell, as shown next.\n" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 3, 92 | "metadata": {}, 93 | "outputs": [ 94 | { 95 | "name": "stdout", 96 | "output_type": "stream", 97 | "text": [ 98 | "Goodbye Karen\n" 99 | ] 100 | } 101 | ], 102 | "source": [ 103 | "print(\"Goodbye \", name)" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [] 112 | } 113 | ], 114 | "metadata": { 115 | "kernelspec": { 116 | "display_name": "Python 3", 117 | "language": "python", 118 | "name": "python3" 119 | }, 120 | "language_info": { 121 | "codemirror_mode": { 122 | "name": "ipython", 123 | "version": 3 124 | }, 125 | "file_extension": ".py", 126 | "mimetype": "text/x-python", 127 | "name": "python", 128 | "nbconvert_exporter": "python", 129 | "pygments_lexer": "ipython3", 130 | "version": "3.7.2" 131 | } 132 | }, 133 | "nbformat": 4, 134 | "nbformat_minor": 4 135 | } 136 | -------------------------------------------------------------------------------- /Part_6_Python_ML/Chapter_19/1 - Getting started with Python 3.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Getting started with Python 3\n", 8 | "\n", 9 | "This overview of Python 3 assumes no prior knowledge of Python, just prior programming experience in some language. We are just scratching the surface of Python in this series of notebooks, covering just enough Python to enable you to do assignments and projects. There are so many free resources for learning more about Python, including https://docs.python.org/3/tutorial/ but of course the best way to learn is to dive in!" 
10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "## Things to know about Python\n", 17 | "\n", 18 | "* an interpreted language\n", 19 | "* source code is compiled into bytecode to be executed by the os\n", 20 | "* there are no type declarations like **int j;**\n", 21 | "* types are checked dynamically at run time not when code is compiled\n", 22 | "* uses indents not { } to identify code blocks\n", 23 | "* end of line is end of statement, no ; required\n", 24 | "* tab/space wars: 4 spaces (but PyCharm IDE fixes tabs for you)\n", 25 | "* underscore/camelCase wars: underscore preferred for variable names\n", 26 | "* case sensitive\n", 27 | "* comments begin with # \n", 28 | "* significant changes from Versions 2-> 3 broke backward compatability\n", 29 | "\n", 30 | "Regarding style, refer to [PEP8](https://www.python.org/dev/peps/pep-0008/) as the ultimate authority.\n", 31 | "\n", 32 | "One of the best ways to learn about Python is to play with it at the console.\n", 33 | "Type this at the console and follow along with this notebook:\n", 34 | "\n", 35 | "$python\n", 36 | "\n", 37 | "\n", 38 | "### Variables\n", 39 | "\n", 40 | "Think of Python variables as pointers to memory locations. The type of the variable is determined by its contents. Assign it to different contents, and it's a different type of variable, pointing to a different memory location. \n", 41 | "\n", 42 | "When you type a variable at the console, Python returns its value. We can also use the print() function to view the contents. \n" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 1, 48 | "metadata": {}, 49 | "outputs": [ 50 | { 51 | "name": "stdout", 52 | "output_type": "stream", 53 | "text": [ 54 | "5\n", 55 | "a\n" 56 | ] 57 | } 58 | ], 59 | "source": [ 60 | "v = 5 # v is an int\n", 61 | "print(v)\n", 62 | "v = 'a' # now v is a string of length 1 (there is no char)\n", 63 | "print(v)\n" 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "metadata": {}, 69 | "source": [ 70 | "What can you do with ints and strings? We'll talk a lot about strings later." 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 2, 76 | "metadata": {}, 77 | "outputs": [ 78 | { 79 | "data": { 80 | "text/plain": [ 81 | "6" 82 | ] 83 | }, 84 | "execution_count": 2, 85 | "metadata": {}, 86 | "output_type": "execute_result" 87 | } 88 | ], 89 | "source": [ 90 | "v = 5\n", 91 | "v += 1 # Python doesn't have the ++ or -- operators\n", 92 | "v" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 3, 98 | "metadata": {}, 99 | "outputs": [ 100 | { 101 | "data": { 102 | "text/plain": [ 103 | "'ab'" 104 | ] 105 | }, 106 | "execution_count": 3, 107 | "metadata": {}, 108 | "output_type": "execute_result" 109 | } 110 | ], 111 | "source": [ 112 | "v = 'a'\n", 113 | "v += 'b' # + used for concatenation\n", 114 | "v" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": {}, 120 | "source": [ 121 | "Don't be afraid to break stuff." 
122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": 4, 127 | "metadata": {}, 128 | "outputs": [ 129 | { 130 | "ename": "TypeError", 131 | "evalue": "can only concatenate str (not \"int\") to str", 132 | "output_type": "error", 133 | "traceback": [ 134 | "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", 135 | "\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)", 136 | "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mv\u001b[0m \u001b[1;33m+=\u001b[0m \u001b[1;36m1\u001b[0m \u001b[1;31m# oops\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", 137 | "\u001b[1;31mTypeError\u001b[0m: can only concatenate str (not \"int\") to str" 138 | ] 139 | } 140 | ], 141 | "source": [ 142 | "v += 1 # oops" 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "metadata": {}, 148 | "source": [ 149 | "To reinforce the idea that variables just point to locations, observe this:" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 5, 155 | "metadata": {}, 156 | "outputs": [ 157 | { 158 | "name": "stdout", 159 | "output_type": "stream", 160 | "text": [ 161 | "0x7388d8f0 0x7388d8f0\n", 162 | "0x7388d900 0x7388d8f0\n" 163 | ] 164 | } 165 | ], 166 | "source": [ 167 | "a = 5\n", 168 | "b = 5\n", 169 | "print(hex(id(a)), hex(id(b))) # a and b point to same location\n", 170 | "a += 1\n", 171 | "print(hex(id(a)), hex(id(b))) # now they don't" 172 | ] 173 | }, 174 | { 175 | "cell_type": "markdown", 176 | "metadata": {}, 177 | "source": [ 178 | "### Types\n", 179 | "\n", 180 | "Common data types include:\n", 181 | "* int (non-limited length)\n", 182 | "* float (same as a C double)\n", 183 | "* complex\n", 184 | "* boolean (built-in values are True, False)\n", 185 | "* string ('single' or \"double\" quotes)" 186 | ] 187 | }, 188 | { 189 | "cell_type": "markdown", 190 | "metadata": {}, 191 | "source": [ 192 | "## Console input and output\n", 193 | "\n", 194 | "* print() for console output\n", 195 | "* input(\"prompt\") for console input" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": 8, 201 | "metadata": {}, 202 | "outputs": [ 203 | { 204 | "name": "stdout", 205 | "output_type": "stream", 206 | "text": [ 207 | "What's your name? Karen\n", 208 | "Hello Karen !\n", 209 | "Hello Karen!\n" 210 | ] 211 | } 212 | ], 213 | "source": [ 214 | "name = input(\"What's your name? \")\n", 215 | "print('Hello ', name, '!') # notice that , list adds spaces\n", 216 | "print('Hello '+ name + '!')" 217 | ] 218 | }, 219 | { 220 | "cell_type": "markdown", 221 | "metadata": {}, 222 | "source": [ 223 | "#### Practice\n", 224 | "\n", 225 | "Write some code to input a circle radius from the user, then calculate and display the area. 
Hints:\n", 226 | "* the Python exponent operator is **\n", 227 | "* convert a string to a number with int() or float()" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": 7, 233 | "metadata": {}, 234 | "outputs": [ 235 | { 236 | "name": "stdin", 237 | "output_type": "stream", 238 | "text": [ 239 | "Enter radius: 12\n" 240 | ] 241 | }, 242 | { 243 | "name": "stdout", 244 | "output_type": "stream", 245 | "text": [ 246 | "area = 452.16\n" 247 | ] 248 | } 249 | ], 250 | "source": [ 251 | "# your code here\n", 252 | "radius = input(\"Enter radius: \")\n", 253 | "radius = float(radius)\n", 254 | "#area = radius * radius * 3.14\n", 255 | "area = radius**2 * 3.14\n", 256 | "print(\"area = \", area)" 257 | ] 258 | }, 259 | { 260 | "cell_type": "markdown", 261 | "metadata": {}, 262 | "source": [ 263 | "### Python programs\n", 264 | "\n", 265 | "ok, we need more than the console, we need to write programs, often called scripts.\n", 266 | "\n", 267 | "A python program is a text file that ends in .py\n", 268 | "\n", 269 | "You can run it at the console like this:\n", 270 | "\n", 271 | "$python myfile.py\n", 272 | "\n", 273 | "Of course, the command line is highly system-dependent. \n", 274 | "\n", 275 | "So technically, all you need is a simple text editor, but using an IDE like PyCharm will make coding easier.\n", 276 | "\n", 277 | "In *nix systems it is customary to start the script with a shebang if you plan to run it at terminal." 278 | ] 279 | }, 280 | { 281 | "cell_type": "raw", 282 | "metadata": {}, 283 | "source": [ 284 | "#!/usr/bin/env python\n", 285 | "\n", 286 | "import sys\n", 287 | "\n", 288 | "def main():\n", 289 | " print(\"Hello \" + sys.argv[1])\n", 290 | " \n", 291 | "if __name__ == '__main__':\n", 292 | " main()" 293 | ] 294 | }, 295 | { 296 | "cell_type": "markdown", 297 | "metadata": {}, 298 | "source": [ 299 | "The above shows the skeleton on a Python script in *nix but many features are common across platforms:\n", 300 | "\n", 301 | "* a main() function called at the bottom of the script\n", 302 | "* we will learn more about functions later\n", 303 | "* command line argument 1, example: $python hello.py anArg" 304 | ] 305 | } 306 | ], 307 | "metadata": { 308 | "kernelspec": { 309 | "display_name": "Python 3", 310 | "language": "python", 311 | "name": "python3" 312 | }, 313 | "language_info": { 314 | "codemirror_mode": { 315 | "name": "ipython", 316 | "version": 3 317 | }, 318 | "file_extension": ".py", 319 | "mimetype": "text/x-python", 320 | "name": "python", 321 | "nbconvert_exporter": "python", 322 | "pygments_lexer": "ipython3", 323 | "version": "3.7.2" 324 | } 325 | }, 326 | "nbformat": 4, 327 | "nbformat_minor": 4 328 | } 329 | -------------------------------------------------------------------------------- /Part_6_Python_ML/Chapter_19/3 - Files.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Read a file\n", 8 | "\n", 9 | "* open a file for reading, 'r', that is in the same directory\n", 10 | "* read with the read() function\n", 11 | "* close the file" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 1, 17 | "metadata": { 18 | "scrolled": true 19 | }, 20 | "outputs": [ 21 | { 22 | "name": "stdout", 23 | "output_type": "stream", 24 | "text": [ 25 | "You read:\n", 26 | " Natural language processing (NLP) is a field of computer science, artificial intelligence and computational linguistics 
concerned with the interactions between computers and human (natural) languages, and, in particular, concerned with programming computers to fruitfully process large natural language corpora. Challenges in natural language processing frequently involve natural language understanding, natural language generation (frequently from formal, machine-readable logical forms), connecting language and machine perception, managing human-computer dialog systems, or some combination thereof.\n", 27 | "Source: https://en.wikipedia.org/wiki/Natural_language_processing\n" 28 | ] 29 | } 30 | ], 31 | "source": [ 32 | "f = open('sample1.txt','r') \n", 33 | "text = f.read()\n", 34 | "print('You read:\\n', text)\n", 35 | "f.close()" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "### Read a line at a time\n", 43 | "\n", 44 | "The following code shows a *for* loop to process one line at a time." 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 2, 50 | "metadata": {}, 51 | "outputs": [ 52 | { 53 | "name": "stdout", 54 | "output_type": "stream", 55 | "text": [ 56 | "Natural language processing (NLP) is a field of computer science, artificial intelligence and computational linguistics concerned with the interactions between computers and human (natural) languages, and, in particular, concerned with programming computers to fruitfully process large natural language corpora. Challenges in natural language processing frequently involve natural language understanding, natural language generation (frequently from formal, machine-readable logical forms), connecting language and machine perception, managing human-computer dialog systems, or some combination thereof.\n", 57 | "\n", 58 | "Source: https://en.wikipedia.org/wiki/Natural_language_processing\n" 59 | ] 60 | } 61 | ], 62 | "source": [ 63 | "f = open('sample1.txt', 'r')\n", 64 | "for line in f:\n", 65 | " print(line)\n", 66 | "f.close()" 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "metadata": {}, 72 | "source": [ 73 | "### Using \"with\"\n", 74 | "\n", 75 | "The *with* statement starts a block of code. When we are through with the block of code, Python will close the file automatically." 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 6, 81 | "metadata": {}, 82 | "outputs": [ 83 | { 84 | "name": "stdout", 85 | "output_type": "stream", 86 | "text": [ 87 | "You read:\n", 88 | " Natural language processing (NLP) is a field of computer science, artificial intelligence and computational linguistics concerned with the interactions between computers and human (natural) languages, and, in particular, concerned with programming computers to fruitfully process large natural language corpora. Challenges in natural language processing frequently involve natural language understanding, natural language generation (frequently from formal, machine-readable logical forms), connecting language and machine perception, managing human-computer dialog systems, or some combination thereof.\n", 89 | "Source: https://en.wikipedia.org/wiki/Natural_language_processing\n" 90 | ] 91 | } 92 | ], 93 | "source": [ 94 | "with open('sample1.txt', 'r') as f:\n", 95 | " text = f.read()\n", 96 | "print(\"You read:\\n\", text)" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "metadata": {}, 102 | "source": [ 103 | "### Encoding\n", 104 | "\n", 105 | "Encoding used to be a pain in Python 2 but is less of a problem in Python 3, which uses utf-8 by default. 
However, you can specify the encoding if you need to. The strip() function removes newlines." 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 3, 111 | "metadata": {}, 112 | "outputs": [ 113 | { 114 | "name": "stdout", 115 | "output_type": "stream", 116 | "text": [ 117 | "Natural language processing (NLP) is a field of computer science, artificial intelligence and computational linguistics concerned with the interactions between computers and human (natural) languages, and, in particular, concerned with programming computers to fruitfully process large natural language corpora. Challenges in natural language processing frequently involve natural language understanding, natural language generation (frequently from formal, machine-readable logical forms), connecting language and machine perception, managing human-computer dialog systems, or some combination thereof.\n", 118 | "Source: https://en.wikipedia.org/wiki/Natural_language_processing\n" 119 | ] 120 | } 121 | ], 122 | "source": [ 123 | "with open('sample1.txt', 'r', encoding='utf-8') as f:\n", 124 | " for line in f:\n", 125 | " print(line.strip())" 126 | ] 127 | }, 128 | { 129 | "cell_type": "markdown", 130 | "metadata": {}, 131 | "source": [ 132 | "### Get text from the web\n", 133 | "\n", 134 | "The urllib library contains functions to handle urls. Below we read text from a web page." 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": 4, 140 | "metadata": {}, 141 | "outputs": [ 142 | { 143 | "data": { 144 | "text/plain": [ 145 | "'\\ufeffThe Project Gutenberg EBook of Crime and Punishment, by Fyodor Dostoevsky\\r\\n\\r\\nThis eBook is for the use of anyone anywhere at no cost and with\\r\\nalmost no restrictions whatsoever. You may copy it, give it away or\\r\\nre-use it under the terms of the Project Gutenberg License included\\r\\nwith this eBook or online at www.gutenberg.org\\r\\n\\r\\n\\r\\nTitle: Crime and Punishment\\r\\n\\r\\nAuthor: Fyodor Dostoevsky\\r\\n\\r\\nRelease Date: March 28, 2006 [EBook #2554]\\r\\nLast Updated: October 27, 2016\\r\\n\\r\\nLanguage: English\\r\\n\\r\\nCharacter set encoding: UTF-8\\r\\n\\r\\n*** START OF THIS PROJECT GUTENBERG EBOOK CRIME AND PUNISHMENT ***\\r\\n\\r\\n\\r\\n\\r\\n\\r\\nProduced by John Bickers; and Dagny\\r\\n\\r\\n\\r\\n\\r\\n\\r\\n\\r\\nCRIME AND PUNISHMENT\\r\\n\\r\\nBy Fyodor Dostoevsky\\r\\n\\r\\n\\r\\n\\r\\nTranslated By Constance Garnett\\r\\n\\r\\n\\r\\n\\r\\n\\r\\nTRANSLATOR’S PREFACE\\r\\n\\r\\nA few words about Dostoevsky himself may help the English reader to\\r\\nunderstand his work.\\r\\n\\r\\nDostoevsky was the son of a doctor. 
His parents were very hard-working\\r\\nand deeply religious people, but so poor that they lived with their '" 146 | ] 147 | }, 148 | "execution_count": 4, 149 | "metadata": {}, 150 | "output_type": "execute_result" 151 | } 152 | ], 153 | "source": [ 154 | "from urllib import request\n", 155 | "url = \"http://www.gutenberg.org/files/2554/2554-0.txt\"\n", 156 | "crime = request.urlopen(url).read().decode('utf8')\n", 157 | "crime[:1000]" 158 | ] 159 | } 160 | ], 161 | "metadata": { 162 | "kernelspec": { 163 | "display_name": "Python 3", 164 | "language": "python", 165 | "name": "python3" 166 | }, 167 | "language_info": { 168 | "codemirror_mode": { 169 | "name": "ipython", 170 | "version": 3 171 | }, 172 | "file_extension": ".py", 173 | "mimetype": "text/x-python", 174 | "name": "python", 175 | "nbconvert_exporter": "python", 176 | "pygments_lexer": "ipython3", 177 | "version": "3.7.2" 178 | } 179 | }, 180 | "nbformat": 4, 181 | "nbformat_minor": 4 182 | } 183 | -------------------------------------------------------------------------------- /Part_6_Python_ML/Chapter_19/6 - Tuples and Sets.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Tuples\n", 8 | "\n", 9 | "Python strings, lists and tuples are sequence types. Lists and tuples are both ordered collection of objects. The elements of a list or a tuple can by any type of object, including other lists or tuples.\n", 10 | "\n", 11 | "Lists use square brackets [] and are mutable.\n", 12 | "Tuples use parenthesis () and are immutable. \n", 13 | "\n", 14 | "If tuples are like lists, then why do we need them?\n", 15 | "\n", 16 | "There are situations when we need an immutable sequence of objects. For example, a tuple can be a key for a dict, but lists cannot because they are mutable.\n", 17 | "\n", 18 | "A tuple can simply be created by assigning a list of comma-separated objects on the right-hand-side of the assignment operator.\n" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 1, 24 | "metadata": {}, 25 | "outputs": [ 26 | { 27 | "data": { 28 | "text/plain": [ 29 | "('a', 3, 5.6)" 30 | ] 31 | }, 32 | "execution_count": 1, 33 | "metadata": {}, 34 | "output_type": "execute_result" 35 | } 36 | ], 37 | "source": [ 38 | "t = 'a', 3, 5.6 # create a tuple\n", 39 | "t\n" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "Tuple elements can be accessed by the same indexing notation that works on strings and lists: []" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 2, 52 | "metadata": {}, 53 | "outputs": [ 54 | { 55 | "data": { 56 | "text/plain": [ 57 | "'a'" 58 | ] 59 | }, 60 | "execution_count": 2, 61 | "metadata": {}, 62 | "output_type": "execute_result" 63 | } 64 | ], 65 | "source": [ 66 | "t[0]" 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "metadata": {}, 72 | "source": [ 73 | "### tuple unpacking\n", 74 | "\n", 75 | "When a tuple is on the right hand side of an assignment statement, it's individuals elements are placed in order in the variables on the left hand side.\n", 76 | "\n", 77 | "The second print statement below shows another use of tuple unpacking." 
78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 3, 83 | "metadata": {}, 84 | "outputs": [ 85 | { 86 | "name": "stdout", 87 | "output_type": "stream", 88 | "text": [ 89 | "3 a 5.600000\n" 90 | ] 91 | } 92 | ], 93 | "source": [ 94 | "\n", 95 | "astring, adigit, afloat = t\n", 96 | "print(\"%d %s %f\" % (adigit, astring, afloat))\n" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 4, 102 | "metadata": {}, 103 | "outputs": [ 104 | { 105 | "data": { 106 | "text/plain": [ 107 | "1" 108 | ] 109 | }, 110 | "execution_count": 4, 111 | "metadata": {}, 112 | "output_type": "execute_result" 113 | } 114 | ], 115 | "source": [ 116 | "s = \"123\"\n", 117 | "s.count('1')" 118 | ] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "metadata": {}, 123 | "source": [ 124 | "### tuple methods\n", 125 | "\n", 126 | "Notice that len() also works on tuples.\n", 127 | "\n", 128 | "Here are some commonly used tuple methods." 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": 5, 134 | "metadata": {}, 135 | "outputs": [ 136 | { 137 | "name": "stdout", 138 | "output_type": "stream", 139 | "text": [ 140 | "length = 3\n", 141 | "count 1 = 0\n", 142 | "count 3 = 1\n" 143 | ] 144 | } 145 | ], 146 | "source": [ 147 | "print('length = ', len(t))\n", 148 | "print('count 1 = ', t.count('1'))\n", 149 | "print('count 3 = ', t.count(3))\n", 150 | "if 'pasta' in t:\n", 151 | " print('index = ', t.index('pasta'))" 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "metadata": {}, 157 | "source": [ 158 | "### convert to/from list/tuple\n", 159 | "\n", 160 | "The list() and tuple() functions can be used for conversion." 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 6, 166 | "metadata": {}, 167 | "outputs": [ 168 | { 169 | "name": "stdout", 170 | "output_type": "stream", 171 | "text": [ 172 | "type of l is \n", 173 | "type of t is \n" 174 | ] 175 | } 176 | ], 177 | "source": [ 178 | "l = list(t)\n", 179 | "print('type of l is', type(l))\n", 180 | "t = tuple(t)\n", 181 | "print('type of t is', type(t))" 182 | ] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "metadata": {}, 187 | "source": [ 188 | "### Sets\n", 189 | "\n", 190 | "Whereas lists, tuples, and strings are sequential data types, sets are unordered collections of objects. Further, no duplicates are allowed in sets. \n", 191 | "\n", 192 | "We can create a set with comma-separated objects enlosed in {}, as shown below. We can use the **in** operation to test for membership." 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": 2, 198 | "metadata": {}, 199 | "outputs": [ 200 | { 201 | "data": { 202 | "text/plain": [ 203 | "True" 204 | ] 205 | }, 206 | "execution_count": 2, 207 | "metadata": {}, 208 | "output_type": "execute_result" 209 | } 210 | ], 211 | "source": [ 212 | "fruits = {'apple', 'banana', 'orange'}\n", 213 | "\n", 214 | "'apple' in fruits" 215 | ] 216 | }, 217 | { 218 | "cell_type": "markdown", 219 | "metadata": {}, 220 | "source": [ 221 | "### set()\n", 222 | "\n", 223 | "The set function can create a set from other objects such as a string a shown below by including only unique elements. 
" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": 8, 229 | "metadata": {}, 230 | "outputs": [ 231 | { 232 | "data": { 233 | "text/plain": [ 234 | "{'a', 'e', 'h', 'l', 'n', 'p', 't'}" 235 | ] 236 | }, 237 | "execution_count": 8, 238 | "metadata": {}, 239 | "output_type": "execute_result" 240 | } 241 | ], 242 | "source": [ 243 | "letter_set = set('elephant')\n", 244 | "letter_set" 245 | ] 246 | }, 247 | { 248 | "cell_type": "markdown", 249 | "metadata": {}, 250 | "source": [ 251 | "### set comprehensions\n", 252 | "\n", 253 | "There is a set comprehension available that is comparable to the list comprehensions." 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": 9, 259 | "metadata": {}, 260 | "outputs": [ 261 | { 262 | "data": { 263 | "text/plain": [ 264 | "{'a', 'e', 'h', 'l', 'n', 'p', 't'}" 265 | ] 266 | }, 267 | "execution_count": 9, 268 | "metadata": {}, 269 | "output_type": "execute_result" 270 | } 271 | ], 272 | "source": [ 273 | "letters = {x for x in 'elephant'}\n", 274 | "letters\n" 275 | ] 276 | }, 277 | { 278 | "cell_type": "markdown", 279 | "metadata": {}, 280 | "source": [ 281 | "### operations on sets\n", 282 | "\n", 283 | "Because sets are immutable we cannot change individual elements as follows:\n", 284 | "\n", 285 | "fruits[0] = 'pear' # not allowed\n", 286 | "\n", 287 | "However there are methods to update, add, and remove elements but we will not cover those here. \n", 288 | "\n", 289 | "Note that you can iterate through a set just like we did for lists." 290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": 10, 295 | "metadata": {}, 296 | "outputs": [ 297 | { 298 | "name": "stdout", 299 | "output_type": "stream", 300 | "text": [ 301 | "orange\n", 302 | "apple\n", 303 | "banana\n" 304 | ] 305 | } 306 | ], 307 | "source": [ 308 | "for fruit in fruits:\n", 309 | " print(fruit)" 310 | ] 311 | }, 312 | { 313 | "cell_type": "markdown", 314 | "metadata": {}, 315 | "source": [ 316 | "Python also supports mathematical set operations like union, intersection, difference, and so forth. Again, the point here is to make you aware of the set data structure and methods which you can explore on your own if you find a need. " 317 | ] 318 | } 319 | ], 320 | "metadata": { 321 | "kernelspec": { 322 | "display_name": "Python 3", 323 | "language": "python", 324 | "name": "python3" 325 | }, 326 | "language_info": { 327 | "codemirror_mode": { 328 | "name": "ipython", 329 | "version": 3 330 | }, 331 | "file_extension": ".py", 332 | "mimetype": "text/x-python", 333 | "name": "python", 334 | "nbconvert_exporter": "python", 335 | "pygments_lexer": "ipython3", 336 | "version": "3.7.2" 337 | } 338 | }, 339 | "nbformat": 4, 340 | "nbformat_minor": 4 341 | } 342 | -------------------------------------------------------------------------------- /Part_6_Python_ML/Chapter_19/7 - Dicts.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Dicts\n", 8 | "\n", 9 | "Python dictionaries are key/value hash tables. The following example shows how to manually create a dict, then perform simple dict operations." 
10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [ 17 | { 18 | "name": "stdout", 19 | "output_type": "stream", 20 | "text": [ 21 | "pear\n", 22 | "peach\n", 23 | "There is no fruit c\n", 24 | "fruit c = None\n", 25 | "{'a': 'apple', 'b': 'banana'}\n" 26 | ] 27 | } 28 | ], 29 | "source": [ 30 | "fruits = {} # create an empty dict\n", 31 | "fruits['a'] = 'apple'\n", 32 | "fruits['b'] = 'banana'\n", 33 | "fruits['p'] = 'pear'\n", 34 | "\n", 35 | "print(fruits['p'])\n", 36 | "fruits['p'] = 'peach' # change 'p' value\n", 37 | "print(fruits['p'])\n", 38 | "\n", 39 | "if 'c' in fruits: # check for 'c' to avoid key error\n", 40 | " print(fruits['c'])\n", 41 | "else:\n", 42 | " print(\"There is no fruit c\")\n", 43 | " \n", 44 | "# use .get() to avoid key error\n", 45 | "print(\"fruit c = \", fruits.get('c'))\n", 46 | "\n", 47 | "del fruits['p'] # delete an entry\n", 48 | "print(fruits) # print the dict" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "### create a vocabulary\n", 56 | "\n", 57 | "The following example shows how to create a vocabulary dictionary with counts from text." 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 2, 63 | "metadata": {}, 64 | "outputs": [ 65 | { 66 | "name": "stdout", 67 | "output_type": "stream", 68 | "text": [ 69 | "{'quick': 1, 'be': 2, 'nimble': 1, 'jack': 2}\n" 70 | ] 71 | } 72 | ], 73 | "source": [ 74 | "text = 'jack be nimble jack be quick'\n", 75 | "tokens = text.split()\n", 76 | "vocab = {}\n", 77 | "for token in tokens:\n", 78 | " if token in vocab:\n", 79 | " vocab[token] += 1\n", 80 | " else:\n", 81 | " vocab[token] = 1\n", 82 | "print(vocab)" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "metadata": {}, 88 | "source": [ 89 | "### dicts and loops\n", 90 | "\n", 91 | "A for loop iterates over a dictionary's keys by default. Items are in arbitrary order but can be sorted(). Note that the sorted() function has an optional parameter reverse=True if you want the dict sorted in reverse order.\n", 92 | "\n", 93 | "There are several examples below of accessing keys, values, or both." 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 3, 99 | "metadata": {}, 100 | "outputs": [ 101 | { 102 | "name": "stdout", 103 | "output_type": "stream", 104 | "text": [ 105 | "unsorted dict:\n", 106 | "quick 1\n", 107 | "be 2\n", 108 | "nimble 1\n", 109 | "jack 2\n", 110 | "\n", 111 | "sorted dict:\n", 112 | "be 2\n", 113 | "jack 2\n", 114 | "nimble 1\n", 115 | "quick 1\n", 116 | "\n", 117 | "keys and values\n", 118 | "dict_keys(['quick', 'be', 'nimble', 'jack'])\n", 119 | "dict_values([1, 2, 1, 2])\n" 120 | ] 121 | } 122 | ], 123 | "source": [ 124 | "# iterate over keys\n", 125 | "print(\"unsorted dict:\")\n", 126 | "for key in vocab:\n", 127 | " print(key, vocab[key])\n", 128 | " \n", 129 | "# iterate over keys, values\n", 130 | "print(\"\\nsorted dict:\")\n", 131 | "for k, v in sorted(vocab.items()):\n", 132 | " print(k, v)\n", 133 | " \n", 134 | "# print keys, print values\n", 135 | "print(\"\\nkeys and values\")\n", 136 | "print(vocab.keys())\n", 137 | "print(vocab.values())\n", 138 | "\n" 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "metadata": {}, 144 | "source": [ 145 | "### key, value data types\n", 146 | "\n", 147 | "The type for keys must be immutable. This means it could be a tuple but not a list, for example.\n", 148 | "\n", 149 | "The value can be any kind of type. 
Next we show lists as values." 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 4, 155 | "metadata": {}, 156 | "outputs": [ 157 | { 158 | "name": "stdout", 159 | "output_type": "stream", 160 | "text": [ 161 | "{'a': ['abby'], 'b': ['ben', 'bessy'], 'c': ['cathy', 'clay', 'chance']}\n" 162 | ] 163 | } 164 | ], 165 | "source": [ 166 | "friend_list = ['abby', 'ben', 'bessy', 'cathy', 'clay', 'chance']\n", 167 | "friends = {}\n", 168 | "for friend in friend_list:\n", 169 | " if friend[0] in friends:\n", 170 | " friends[friend[0]].append(friend)\n", 171 | " else:\n", 172 | " friends[friend[0]] = [friend]\n", 173 | " \n", 174 | "print(friends)" 175 | ] 176 | } 177 | ], 178 | "metadata": { 179 | "kernelspec": { 180 | "display_name": "Python 3", 181 | "language": "python", 182 | "name": "python3" 183 | }, 184 | "language_info": { 185 | "codemirror_mode": { 186 | "name": "ipython", 187 | "version": 3 188 | }, 189 | "file_extension": ".py", 190 | "mimetype": "text/x-python", 191 | "name": "python", 192 | "nbconvert_exporter": "python", 193 | "pygments_lexer": "ipython3", 194 | "version": "3.7.2" 195 | } 196 | }, 197 | "nbformat": 4, 198 | "nbformat_minor": 4 199 | } 200 | -------------------------------------------------------------------------------- /Part_6_Python_ML/Chapter_19/8 - Classes.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Choose a paradigm\n", 8 | "\n", 9 | "Python works well with procedural, object oriented, or functional programming paradigms. In this notebook we take a look at Python classes. This is a very brief overview. A good starting point to learn more is Jeff Knupp's blog post:\n", 10 | "\n", 11 | "[Python classes](https://jeffknupp.com/blog/2014/06/18/improve-your-python-python-classes-and-object-oriented-programming/)\n", 12 | "\n", 13 | "## Python classes\n", 14 | "\n", 15 | "A Python class is simply a logical grouping of data and functions (aka methods). As in other languages, a class is a blueprint for creating objects. \n", 16 | "\n", 17 | "The following code defines a class called Sentence which holds the sentence number, text, tokens, and parts of speech. Each class definition should include an __init__ method whose first argument will always be 'self' and the remaining arguments (if any) are items the class needs to define itself. The 'self' you see in the code below is just a reference to the particular instance of the class currently being processed.\n", 18 | "\n", 19 | "The following code also shows a simple display method to demonstrate further use of a class method.\n", 20 | "\n" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 1, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "import nltk\n", 30 | "from nltk import word_tokenize\n", 31 | "class Sentence:\n", 32 | " def __init__(self, sentence_number, text):\n", 33 | " self.sentence_number = sentence_number\n", 34 | " self.text = text\n", 35 | " self.tokens = word_tokenize(self.text)\n", 36 | " self.pos = nltk.pos_tag(self.tokens)\n", 37 | " \n", 38 | " def display(self):\n", 39 | " print('\\nSentence number: ', self.sentence_number)\n", 40 | " print('Text: ', self.text)\n", 41 | " print('POS: ', self.pos)" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "Now that we have defined a class, we can use it in our code. 
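For a single object (a minimal sketch, not one of the notebook's cells; it assumes the NLTK tokenizer and tagger resources, e.g. 'punkt' and 'averaged_perceptron_tagger', are already downloaded), the class can be instantiated and displayed directly before moving on to the loop shown next:

```python
# one-off use of the Sentence class defined above
s = Sentence(1, "The quick brown fox jumps over the lazy dog.")
s.display()   # prints the sentence number, the text, and the POS tags
```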
the following code iterates over the raw text in 'input_text'. For each text it creates a sentence object and stores the reference to that object in a list.\n", 49 | "\n", 50 | "Later we iterate over the sentence objects in that list just to show how that's done." 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 2, 56 | "metadata": {}, 57 | "outputs": [ 58 | { 59 | "name": "stdout", 60 | "output_type": "stream", 61 | "text": [ 62 | "\n", 63 | "Sentence number: 1\n", 64 | "Text: Yesterday I went to the movies.\n", 65 | "POS: [('Yesterday', 'NN'), ('I', 'PRP'), ('went', 'VBD'), ('to', 'TO'), ('the', 'DT'), ('movies', 'NNS'), ('.', '.')]\n", 66 | "\n", 67 | "Sentence number: 2\n", 68 | "Text: Today I am going for a hike.\n", 69 | "POS: [('Today', 'NN'), ('I', 'PRP'), ('am', 'VBP'), ('going', 'VBG'), ('for', 'IN'), ('a', 'DT'), ('hike', 'NN'), ('.', '.')]\n" 70 | ] 71 | } 72 | ], 73 | "source": [ 74 | "input_text = ['Yesterday I went to the movies.', 'Today I am going for a hike.']\n", 75 | "sentences = [] # a list of sentences\n", 76 | "sentence_number = 1\n", 77 | "for text in input_text:\n", 78 | " s = Sentence(sentence_number, text)\n", 79 | " sentences.append(s)\n", 80 | " sentence_number += 1\n", 81 | " \n", 82 | "for s in sentences:\n", 83 | " s.display()\n", 84 | "\n" 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "metadata": {}, 90 | "source": [ 91 | "Full object-oriented programming with Python is of course a vast subject that we will not cover in class since it's only one of many ways to write Python code for NLP. What is useful however in any programming paradigm are classes and objects because these help organize our code logically, and we can easily pass objects to functions. For example, if we had a function called process_sentence, we could pass a Sentence s to is as follows:\n", 92 | "\n", 93 | "process_sentence(s)\n", 94 | "\n", 95 | "So the function receives a pointer to the Sentence object 's' and will know that it is a Sentence object. " 96 | ] 97 | } 98 | ], 99 | "metadata": { 100 | "kernelspec": { 101 | "display_name": "Python 3", 102 | "language": "python", 103 | "name": "python3" 104 | }, 105 | "language_info": { 106 | "codemirror_mode": { 107 | "name": "ipython", 108 | "version": 3 109 | }, 110 | "file_extension": ".py", 111 | "mimetype": "text/x-python", 112 | "name": "python", 113 | "nbconvert_exporter": "python", 114 | "pygments_lexer": "ipython3", 115 | "version": "3.7.2" 116 | } 117 | }, 118 | "nbformat": 4, 119 | "nbformat_minor": 4 120 | } 121 | -------------------------------------------------------------------------------- /Part_6_Python_ML/Chapter_19/hello.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # run like this: $python hello.py karen 3 | 4 | import sys 5 | 6 | def main(): 7 | print("Hello " + sys.argv[1]) 8 | 9 | if __name__ == '__main__': 10 | main() 11 | -------------------------------------------------------------------------------- /Part_6_Python_ML/Chapter_19/sample1.txt: -------------------------------------------------------------------------------- 1 | Natural language processing (NLP) is a field of computer science, artificial intelligence and computational linguistics concerned with the interactions between computers and human (natural) languages, and, in particular, concerned with programming computers to fruitfully process large natural language corpora. 
Challenges in natural language processing frequently involve natural language understanding, natural language generation (frequently from formal, machine-readable logical forms), connecting language and machine perception, managing human-computer dialog systems, or some combination thereof. 2 | Source: https://en.wikipedia.org/wiki/Natural_language_processing 3 | -------------------------------------------------------------------------------- /Part_6_Python_ML/Chapter_19/temp.txt: -------------------------------------------------------------------------------- 1 | Name: Ralph Favorite number is 3 GPA is 3.70 2 | -------------------------------------------------------------------------------- /Part_6_Python_ML/Chapter_20/readme.md: -------------------------------------------------------------------------------- 1 | Notebooks demonstrating basic features of: 2 | 3 | * NumPy 4 | * pandas 5 | * seaborn 6 | * sklearn 7 | * along with some data cleaning examples 8 | -------------------------------------------------------------------------------- /Part_6_Python_ML/Chapter_21/Running Multiple Models.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Running Multiple Models\n", 8 | "\n", 9 | "The other notebooks focus on one algorithm at a time for reference purposes. Most of the code is redundant from notebook to notebook. This notebook shows code to run multiple models, and will be demonstrated on the Titanic data.\n", 10 | "\n", 11 | "Another difference in this notebook is that all the imports occur first. The imports in the other notebooks were placed near their first use so that it would be clear exactly what to import." 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 1, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "# notebook imports\n", 21 | "\n", 22 | "import pandas as pd\n", 23 | "import numpy as np\n", 24 | "from sklearn.model_selection import train_test_split\n", 25 | "from sklearn.linear_model import LogisticRegression\n", 26 | "from sklearn.naive_bayes import BernoulliNB\n", 27 | "from sklearn.neighbors import KNeighborsClassifier\n", 28 | "from sklearn.tree import DecisionTreeClassifier\n", 29 | "from sklearn.metrics import accuracy_score\n" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 2, 35 | "metadata": {}, 36 | "outputs": [ 37 | { 38 | "data": { 39 | "text/html": [ 40 | "
\n", 41 | "\n", 54 | "\n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | "
pclasssurvivedsexage
001029.0000
10110.9167
20002.0000
300130.0000
400025.0000
\n", 102 | "
" 103 | ], 104 | "text/plain": [ 105 | " pclass survived sex age\n", 106 | "0 0 1 0 29.0000\n", 107 | "1 0 1 1 0.9167\n", 108 | "2 0 0 0 2.0000\n", 109 | "3 0 0 1 30.0000\n", 110 | "4 0 0 0 25.0000" 111 | ] 112 | }, 113 | "execution_count": 2, 114 | "metadata": {}, 115 | "output_type": "execute_result" 116 | } 117 | ], 118 | "source": [ 119 | "### load the data\n", 120 | "df = pd.read_csv('data/titanic3.csv', usecols=['pclass', 'survived', 'sex', 'age'])\n", 121 | "\n", 122 | "# convert columns to factors\n", 123 | "df.survived = df.survived.astype('category').cat.codes\n", 124 | "df.pclass = df.pclass.astype('category').cat.codes\n", 125 | "df.sex = df.sex.astype('category').cat.codes\n", 126 | "df.head()" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 3, 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "# fill missing values\n", 136 | "\n", 137 | "age_mean = np.mean(df.age)\n", 138 | "df.age.fillna(age_mean, inplace=True)" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 4, 144 | "metadata": {}, 145 | "outputs": [], 146 | "source": [ 147 | "# train test split\n", 148 | "\n", 149 | "X = df.loc[:, ['pclass', 'age', 'sex']]\n", 150 | "y = df.survived\n", 151 | "\n", 152 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)" 153 | ] 154 | }, 155 | { 156 | "cell_type": "markdown", 157 | "metadata": {}, 158 | "source": [ 159 | "### Setting up the modesl\n", 160 | "\n", 161 | "Now that the data is ready, a list of models is created. Each model will have an associated acronym for later display purposes." 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": 5, 167 | "metadata": {}, 168 | "outputs": [], 169 | "source": [ 170 | "models = [\n", 171 | " ['LR', LogisticRegression()],\n", 172 | " ['NB', BernoulliNB()],\n", 173 | " ['kNN', KNeighborsClassifier(n_neighbors=5)],\n", 174 | " ['DT', DecisionTreeClassifier()]\n", 175 | "]" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": 6, 181 | "metadata": {}, 182 | "outputs": [ 183 | { 184 | "name": "stdout", 185 | "output_type": "stream", 186 | "text": [ 187 | "LR \t 0.7977099236641222\n", 188 | "NB \t 0.7786259541984732\n", 189 | "kNN \t 0.7786259541984732\n", 190 | "DT \t 0.7786259541984732\n" 191 | ] 192 | } 193 | ], 194 | "source": [ 195 | "clf_acc = {}\n", 196 | "\n", 197 | "for clf, model in models:\n", 198 | " model.fit(X_train, y_train)\n", 199 | " y_pred = model.predict(X_test)\n", 200 | " clf_acc[clf] = accuracy_score(y_test, y_pred)\n", 201 | " \n", 202 | "for clf in clf_acc:\n", 203 | " print(clf, '\\t', clf_acc[clf])\n", 204 | " " 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": null, 210 | "metadata": {}, 211 | "outputs": [], 212 | "source": [] 213 | } 214 | ], 215 | "metadata": { 216 | "kernelspec": { 217 | "display_name": "Python 3", 218 | "language": "python", 219 | "name": "python3" 220 | }, 221 | "language_info": { 222 | "codemirror_mode": { 223 | "name": "ipython", 224 | "version": 3 225 | }, 226 | "file_extension": ".py", 227 | "mimetype": "text/x-python", 228 | "name": "python", 229 | "nbconvert_exporter": "python", 230 | "pygments_lexer": "ipython3", 231 | "version": "3.8.2" 232 | } 233 | }, 234 | "nbformat": 4, 235 | "nbformat_minor": 4 236 | } 237 | -------------------------------------------------------------------------------- /Part_6_Python_ML/Chapter_21/kNN_classification.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# kNN\n", 8 | "## Titanic data\n", 9 | "\n", 10 | "This notebook runs the k Nearest Neighbors algorithm on the Titanic data. \n", 11 | "\n" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 1, 17 | "metadata": {}, 18 | "outputs": [ 19 | { 20 | "name": "stdout", 21 | "output_type": "stream", 22 | "text": [ 23 | " pclass survived sex age\n", 24 | "0 1 1 female 29.0000\n", 25 | "1 1 1 male 0.9167\n", 26 | "2 1 0 female 2.0000\n", 27 | "3 1 0 male 30.0000\n", 28 | "4 1 0 female 25.0000\n", 29 | "\n", 30 | "Dimensions of data frame: (1309, 4)\n" 31 | ] 32 | } 33 | ], 34 | "source": [ 35 | "### load the data\n", 36 | "import pandas as pd\n", 37 | "df = pd.read_csv('data/titanic3.csv', usecols=['pclass', 'survived', 'sex', 'age'])\n", 38 | "print(df.head())\n", 39 | "print('\\nDimensions of data frame:', df.shape)" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 2, 45 | "metadata": {}, 46 | "outputs": [ 47 | { 48 | "data": { 49 | "text/html": [ 50 | "
\n", 51 | "\n", 64 | "\n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | "
pclasssurvivedsexage
001029.0000
10110.9167
20002.0000
300130.0000
400025.0000
\n", 112 | "
" 113 | ], 114 | "text/plain": [ 115 | " pclass survived sex age\n", 116 | "0 0 1 0 29.0000\n", 117 | "1 0 1 1 0.9167\n", 118 | "2 0 0 0 2.0000\n", 119 | "3 0 0 1 30.0000\n", 120 | "4 0 0 0 25.0000" 121 | ] 122 | }, 123 | "execution_count": 2, 124 | "metadata": {}, 125 | "output_type": "execute_result" 126 | } 127 | ], 128 | "source": [ 129 | "# convert columns to factors\n", 130 | "df.survived = df.survived.astype('category').cat.codes\n", 131 | "df.pclass = df.pclass.astype('category').cat.codes\n", 132 | "df.sex = df.sex.astype('category').cat.codes\n", 133 | "df.head()" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": 3, 139 | "metadata": {}, 140 | "outputs": [ 141 | { 142 | "data": { 143 | "text/plain": [ 144 | "pclass 0\n", 145 | "survived 0\n", 146 | "sex 0\n", 147 | "age 263\n", 148 | "dtype: int64" 149 | ] 150 | }, 151 | "execution_count": 3, 152 | "metadata": {}, 153 | "output_type": "execute_result" 154 | } 155 | ], 156 | "source": [ 157 | "# count missing values\n", 158 | "\n", 159 | "df.isnull().sum()" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": 4, 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [ 168 | "# fill missing values\n", 169 | "import numpy as np\n", 170 | "\n", 171 | "age_mean = np.mean(df.age)\n", 172 | "df.age.fillna(age_mean, inplace=True)" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": 5, 178 | "metadata": {}, 179 | "outputs": [ 180 | { 181 | "name": "stdout", 182 | "output_type": "stream", 183 | "text": [ 184 | "train size: (1047, 3)\n", 185 | "test size: (262, 3)\n" 186 | ] 187 | } 188 | ], 189 | "source": [ 190 | "# train test split\n", 191 | "from sklearn.model_selection import train_test_split\n", 192 | "\n", 193 | "X = df.loc[:, ['pclass', 'age', 'sex']]\n", 194 | "y = df.survived\n", 195 | "\n", 196 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)\n", 197 | "\n", 198 | "print('train size:', X_train.shape)\n", 199 | "print('test size:', X_test.shape)" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": 6, 205 | "metadata": {}, 206 | "outputs": [ 207 | { 208 | "data": { 209 | "text/plain": [ 210 | "KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n", 211 | " metric_params=None, n_jobs=None, n_neighbors=5, p=2,\n", 212 | " weights='uniform')" 213 | ] 214 | }, 215 | "execution_count": 6, 216 | "metadata": {}, 217 | "output_type": "execute_result" 218 | } 219 | ], 220 | "source": [ 221 | "from sklearn.neighbors import KNeighborsClassifier\n", 222 | "\n", 223 | "clf = KNeighborsClassifier(n_neighbors = 5)\n", 224 | "clf.fit(X_train, y_train)" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": 7, 230 | "metadata": {}, 231 | "outputs": [], 232 | "source": [ 233 | "# make predictions\n", 234 | "\n", 235 | "pred = clf.predict(X_test)" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": 8, 241 | "metadata": {}, 242 | "outputs": [ 243 | { 244 | "name": "stdout", 245 | "output_type": "stream", 246 | "text": [ 247 | "accuracy score: 0.7786259541984732\n", 248 | "precision score: 0.7625\n", 249 | "recall score: 0.61\n", 250 | "f1 score: 0.6777777777777777\n" 251 | ] 252 | } 253 | ], 254 | "source": [ 255 | "# evaluate\n", 256 | "from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score\n", 257 | "\n", 258 | "print('accuracy score: ', accuracy_score(y_test, pred))\n", 259 | "print('precision score: ', 
precision_score(y_test, pred))\n", 260 | "print('recall score: ', recall_score(y_test, pred))\n", 261 | "print('f1 score: ', f1_score(y_test, pred))" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": 9, 267 | "metadata": {}, 268 | "outputs": [ 269 | { 270 | "data": { 271 | "text/plain": [ 272 | "array([[143, 19],\n", 273 | " [ 39, 61]])" 274 | ] 275 | }, 276 | "execution_count": 9, 277 | "metadata": {}, 278 | "output_type": "execute_result" 279 | } 280 | ], 281 | "source": [ 282 | "# confusion matrix\n", 283 | "from sklearn.metrics import confusion_matrix\n", 284 | "\n", 285 | "confusion_matrix(y_test, pred)" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": 10, 291 | "metadata": {}, 292 | "outputs": [ 293 | { 294 | "name": "stdout", 295 | "output_type": "stream", 296 | "text": [ 297 | " precision recall f1-score support\n", 298 | "\n", 299 | " 0 0.79 0.88 0.83 162\n", 300 | " 1 0.76 0.61 0.68 100\n", 301 | "\n", 302 | " accuracy 0.78 262\n", 303 | " macro avg 0.77 0.75 0.75 262\n", 304 | "weighted avg 0.78 0.78 0.77 262\n", 305 | "\n" 306 | ] 307 | } 308 | ], 309 | "source": [ 310 | "from sklearn.metrics import classification_report\n", 311 | "print(classification_report(y_test, pred))" 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": null, 317 | "metadata": {}, 318 | "outputs": [], 319 | "source": [] 320 | } 321 | ], 322 | "metadata": { 323 | "kernelspec": { 324 | "display_name": "Python 3", 325 | "language": "python", 326 | "name": "python3" 327 | }, 328 | "language_info": { 329 | "codemirror_mode": { 330 | "name": "ipython", 331 | "version": 3 332 | }, 333 | "file_extension": ".py", 334 | "mimetype": "text/x-python", 335 | "name": "python", 336 | "nbconvert_exporter": "python", 337 | "pygments_lexer": "ipython3", 338 | "version": "3.8.2" 339 | } 340 | }, 341 | "nbformat": 4, 342 | "nbformat_minor": 4 343 | } 344 | -------------------------------------------------------------------------------- /Part_6_Python_ML/Chapter_21/kNN_regression.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# kNN Regression with Python\n", 8 | "## Boston Housing data\n", 9 | "\n", 10 | "R data sets can be found at [this link](http://vincentarelbundock.github.io/Rdatasets/datasets.html).\n", 11 | "\n", 12 | "This notebook will use the Boston Housing data." 
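For intuition before running the library version (a hand-rolled sketch with made-up numbers, not part of the notebook): a kNN regressor predicts by averaging the target values of the k training points closest to the query point.

```python
import numpy as np

# toy training data: one feature, five observations (made-up values)
X_train_toy = np.array([[1.0], [2.0], [3.0], [4.0], [5.0]])
y_train_toy = np.array([10.0, 12.0, 15.0, 20.0, 30.0])

def knn_predict(x, k=3):
    # Euclidean distance from the query point to every training point
    dists = np.sqrt(((X_train_toy - x) ** 2).sum(axis=1))
    nearest = np.argsort(dists)[:k]        # indices of the k closest points
    return y_train_toy[nearest].mean()     # average their targets

print(knn_predict(np.array([2.5])))        # (12 + 15 + 10) / 3 = 12.33...
```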
13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 1, 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "### load the data\n", 22 | "import pandas as pd\n", 23 | "df = pd.read_csv('data/Boston.csv')" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 2, 29 | "metadata": {}, 30 | "outputs": [ 31 | { 32 | "name": "stdout", 33 | "output_type": "stream", 34 | "text": [ 35 | "train size: (404, 12)\n", 36 | "test size: (102, 12)\n" 37 | ] 38 | } 39 | ], 40 | "source": [ 41 | "# train test split\n", 42 | "from sklearn.model_selection import train_test_split\n", 43 | "\n", 44 | "X = df.iloc[:, 0:12]\n", 45 | "y = df.iloc[:, 13]\n", 46 | "\n", 47 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)\n", 48 | "\n", 49 | "print('train size:', X_train.shape)\n", 50 | "print('test size:', X_test.shape)" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 3, 56 | "metadata": {}, 57 | "outputs": [ 58 | { 59 | "data": { 60 | "text/plain": [ 61 | "KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',\n", 62 | " metric_params=None, n_jobs=None, n_neighbors=3, p=2,\n", 63 | " weights='uniform')" 64 | ] 65 | }, 66 | "execution_count": 3, 67 | "metadata": {}, 68 | "output_type": "execute_result" 69 | } 70 | ], 71 | "source": [ 72 | "# train the algorithm\n", 73 | "from sklearn.neighbors import KNeighborsRegressor\n", 74 | "regressor = KNeighborsRegressor(n_neighbors=3)\n", 75 | "regressor.fit(X_train, y_train)" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 4, 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "# make predictions\n", 85 | "\n", 86 | "y_pred = regressor.predict(X_test)" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 5, 92 | "metadata": {}, 93 | "outputs": [ 94 | { 95 | "name": "stdout", 96 | "output_type": "stream", 97 | "text": [ 98 | "mse= 66.1533442265795\n", 99 | "correlation= 0.35553728726247713\n" 100 | ] 101 | } 102 | ], 103 | "source": [ 104 | "# evaluation\n", 105 | "from sklearn.metrics import mean_squared_error, r2_score\n", 106 | "print('mse=', mean_squared_error(y_test, y_pred))\n", 107 | "print('correlation=', r2_score(y_test, y_pred))" 108 | ] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "metadata": {}, 113 | "source": [ 114 | "The mse was considerably higher at 35.7 than the mse for linear regression of 27.85. The correlation was much lower. Different values of k=5 and k=7 resulted in worse performance." 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": {}, 120 | "source": [ 121 | "### Scaling\n", 122 | "\n", 123 | "In R, much improved results were achieved by scaling data for kNN. The following code tries kNN again after scaling the data." 
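A more compact equivalent (a hedged sketch, not taken from the notebook) is to chain the scaler and the regressor in a scikit-learn Pipeline, which guarantees the test split is transformed only with statistics learned from the training split; the notebook's explicit two-step version follows.

```python
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

# assumes X_train, X_test, y_train, y_test from the train/test split earlier in this notebook
knn_pipe = make_pipeline(StandardScaler(), KNeighborsRegressor(n_neighbors=3))
knn_pipe.fit(X_train, y_train)   # the scaler is fit on the training data only
print('mse=', mean_squared_error(y_test, knn_pipe.predict(X_test)))
```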
124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 6, 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "from sklearn import preprocessing\n", 133 | "\n", 134 | "scaler = preprocessing.StandardScaler().fit(X_train)\n", 135 | "\n", 136 | "X_train_scaled = scaler.transform(X_train)\n", 137 | "X_test_scaled = scaler.transform(X_test)" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 7, 143 | "metadata": {}, 144 | "outputs": [ 145 | { 146 | "name": "stdout", 147 | "output_type": "stream", 148 | "text": [ 149 | "mse= 19.192755991285402\n", 150 | "correlation= 0.8130250898777173\n" 151 | ] 152 | } 153 | ], 154 | "source": [ 155 | "regressor2 = KNeighborsRegressor(n_neighbors=3)\n", 156 | "regressor2.fit(X_train_scaled, y_train)\n", 157 | "\n", 158 | "# make predictions\n", 159 | "y_pred2 = regressor2.predict(X_test_scaled)\n", 160 | "\n", 161 | "# evaluation\n", 162 | "print('mse=', mean_squared_error(y_test, y_pred2))\n", 163 | "print('correlation=', r2_score(y_test, y_pred2))" 164 | ] 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "metadata": {}, 169 | "source": [ 170 | "Wow. Much better results than the non-scaled version, and a significant improvement over linear regression as well." 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": null, 176 | "metadata": {}, 177 | "outputs": [], 178 | "source": [] 179 | } 180 | ], 181 | "metadata": { 182 | "kernelspec": { 183 | "display_name": "Python 3", 184 | "language": "python", 185 | "name": "python3" 186 | }, 187 | "language_info": { 188 | "codemirror_mode": { 189 | "name": "ipython", 190 | "version": 3 191 | }, 192 | "file_extension": ".py", 193 | "mimetype": "text/x-python", 194 | "name": "python", 195 | "nbconvert_exporter": "python", 196 | "pygments_lexer": "ipython3", 197 | "version": "3.8.2" 198 | } 199 | }, 200 | "nbformat": 4, 201 | "nbformat_minor": 4 202 | } 203 | -------------------------------------------------------------------------------- /Part_6_Python_ML/Chapter_21/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Part_6_Python_ML/readme.md: -------------------------------------------------------------------------------- 1 | Part 6 of the book explores machine learning with Python. 2 | 3 | Chapter 19: Python Basics 4 | * Notebooks 0 - 9 provide an introduction to Python 3 with examples 5 | 6 | Chapter 20: Python ML Libraries 7 | * NumPy 8 | * pandas 9 | * SciKit-Learn 10 | * seaborn 11 | * data cleaning 12 | 13 | Chapter 21: Python ML Examples 14 | * Linear regression 15 | * kNN regression 16 | * Logistic regression 17 | * kNN classification 18 | * Naive Bayes 19 | * Decision Trees 20 | -------------------------------------------------------------------------------- /Part_7_Neural_Networks/readme.md: -------------------------------------------------------------------------------- 1 | 2 | Chapter 23 explores Neural Networks in sklearn 3 | 4 | Chapter 24 explores the Keras package 5 | -------------------------------------------------------------------------------- /Part_8_Modeling_the_World/MM1.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Modeling with a Markov Model" 3 | output: 4 | pdf_document: default 5 | html_document: 6 | df_print: paged 7 | --- 8 | 9 | A Markov model of a person's tendency to exercise or not. 
A Markov process has: 10 | 11 | * a finite set of states 12 | * fixed transition probabilities between states 13 | 14 | Initial state: 30 days: 25 exercised (E), 5 did not (N) 15 | 16 | Transition probabilities: 17 | 18 | * E -> E .8 19 | * E -> N .2 20 | * N -> N .75 21 | * N -> E .25 22 | 23 | ```{r} 24 | # build the transition matrix for the model 25 | transMatrix <- matrix(c(.8, .2, .25, .75), nrow=2) 26 | transMatrix 27 | # represent the initial state in the exercise matrix 28 | exercise <- matrix(c(5/30, 25/30), nrow=2) 29 | exercise 30 | ``` 31 | 32 | Alter the number of iterations. 33 | 34 | After 6 iterations, the model stabilizes at around 54% to 46%. After 5000 iterations, about the same. 35 | 36 | ```{r} 37 | for (i in 1:10){ 38 | exercise <- transMatrix %*% exercise 39 | print(paste("exercise at i=", i, ":", format(round(exercise[1,], 2)))) 40 | } 41 | exercise 42 | ``` 43 | 44 | 45 | -------------------------------------------------------------------------------- /Part_8_Modeling_the_World/MM1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjmazidi/Machine_Learning_2nd_edition/31d34a8d5154855eae0b840335ce6552711375df/Part_8_Modeling_the_World/MM1.pdf -------------------------------------------------------------------------------- /Part_8_Modeling_the_World/RL.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: 3 | pdf_document: default 4 | html_document: default 5 | --- 6 | # Reinforcement Learning 7 | ## MDP (markov decision process) 8 | 9 | Code available as a github gist: https://gist.github.com/smc77/8277155#file-gistfile1-txt from Shane Conway. 10 | 11 | Reinforcement learning is a topic usually covered in AI classes because it is helpful for typical AI problems such as learning to play checkers, or teaching a robot to move through a room. 12 | 13 | Reinforcement models typically have: 14 | 15 | * E environment - the checker board, the room, etc. 16 | * S state - the current state (such as location) of the agent 17 | * A action - action available to the agent such as move north, stay put, etc. 18 | * R reward - the reward given to the agent for the last action A 19 | 20 | ![grid](grid.png) 21 | 22 | ```{r} 23 | actions <- c("N", "S", "E", "W") 24 | 25 | x <- 1:4 26 | y <- 1:3 27 | 28 | rewards <- matrix(rep(0, 12), nrow=3) 29 | rewards[2, 2] <- NA 30 | rewards[1, 4] <- 1 31 | rewards[2, 4] <- -1 32 | 33 | values <- rewards # initial values 34 | 35 | states <- expand.grid(x=x, y=y) 36 | 37 | # Transition probability 38 | transition <- list("N" = c("N" = 0.8, "S" = 0, "E" = 0.1, "W" = 0.1), 39 | "S"= c("S" = 0.8, "N" = 0, "E" = 0.1, "W" = 0.1), 40 | "E"= c("E" = 0.8, "W" = 0, "S" = 0.1, "N" = 0.1), 41 | "W"= c("W" = 0.8, "E" = 0, "S" = 0.1, "N" = 0.1)) 42 | 43 | # The value of an action (e.g. 
move north means y + 1) 44 | action.values <- list("N" = c("x" = 0, "y" = 1), 45 | "S" = c("x" = 0, "y" = -1), 46 | "E" = c("x" = -1, "y" = 0), 47 | "W" = c("x" = 1, "y" = 0)) 48 | 49 | # act() function serves to move the robot through states based on an action 50 | act <- function(action, state) { 51 | action.value <- action.values[[action]] 52 | new.state <- state 53 | # 54 | if(state["x"] == 4 && state["y"] == 1 || (state["x"] == 4 && state["y"] == 2)) 55 | return(state) 56 | # 57 | new.x = state["x"] + action.value["x"] 58 | new.y = state["y"] + action.value["y"] 59 | # Constrained by edge of grid 60 | new.state["x"] <- min(x[length(x)], max(x[1], new.x)) 61 | new.state["y"] <- min(y[length(y)], max(y[1], new.y)) 62 | # 63 | if(is.na(rewards[new.state["y"], new.state["x"]])) 64 | new.state <- state 65 | # 66 | return(new.state) 67 | } 68 | 69 | 70 | rewards 71 | 72 | 73 | 74 | 75 | bellman.update <- function(action, state, values, gamma=1) { 76 | state.transition.prob <- transition[[action]] 77 | q <- rep(0, length(state.transition.prob)) 78 | for(i in 1:length(state.transition.prob)) { 79 | new.state <- act(names(state.transition.prob)[i], state) 80 | q[i] <- (state.transition.prob[i] * (rewards[state["y"], state["x"]] + (gamma * values[new.state["y"], new.state["x"]]))) 81 | } 82 | sum(q) 83 | } 84 | 85 | value.iteration <- function(states, actions, rewards, values, gamma, niter) { 86 | for (j in 1:niter) { 87 | for (i in 1:nrow(states)) { 88 | state <- unlist(states[i,]) 89 | if(i %in% c(4, 8)) next # terminal states 90 | q.values <- as.numeric(lapply(actions, bellman.update, state=state, values=values, gamma=gamma)) 91 | values[state["y"], state["x"]] <- max(q.values) 92 | } 93 | } 94 | return(values) 95 | } 96 | 97 | final.values <- value.iteration(states=states, actions=actions, rewards=rewards, values=values, gamma=0.99, niter=100) 98 | 99 | final.values 100 | 101 | ``` 102 | 103 | 104 | 105 | 106 | ```{r} 107 | 108 | 109 | ``` 110 | 111 | 112 | 113 | 114 | ```{r} 115 | 116 | 117 | ``` 118 | 119 | 120 | 121 | 122 | ```{r} 123 | 124 | 125 | ``` 126 | 127 | 128 | 129 | 130 | ```{r} 131 | 132 | 133 | ``` 134 | 135 | 136 | 137 | 138 | ```{r} 139 | 140 | 141 | ``` 142 | 143 | 144 | 145 | 146 | ```{r} 147 | 148 | 149 | ``` 150 | 151 | 152 | 153 | 154 | ```{r} 155 | 156 | 157 | ``` 158 | 159 | 160 | 161 | 162 | ```{r} 163 | 164 | 165 | ``` 166 | 167 | 168 | 169 | 170 | ```{r} 171 | 172 | 173 | ``` 174 | 175 | 176 | 177 | 178 | ```{r} 179 | 180 | 181 | ``` 182 | 183 | 184 | 185 | 186 | ```{r} 187 | 188 | 189 | ``` 190 | 191 | 192 | 193 | -------------------------------------------------------------------------------- /Part_8_Modeling_the_World/RL.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjmazidi/Machine_Learning_2nd_edition/31d34a8d5154855eae0b840335ce6552711375df/Part_8_Modeling_the_World/RL.pdf -------------------------------------------------------------------------------- /Part_8_Modeling_the_World/RL2.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: 3 | pdf_document: default 4 | html_document: default 5 | --- 6 | # Reinforcement Learning 7 | ## MDP (markov decision process) 8 | 9 | 10 | Adapted from: http://www7.inra.fr/mia/T/MDPtoolbox/QuickStart.pdf 11 | 12 | 13 | ```{r} 14 | library(MDPtoolbox) 15 | 16 | ``` 17 | 18 | 19 | ```{r} 20 | x <- mdp_example_forest() 21 | mdp_check(x$P, x$R) 22 | discount <- 0.95 23 | ``` 24 | 25 | 26 | ```{r} 27 | y 
<- mdp_policy_iteration(x$P, x$R, discount) 28 | V <- y$V 29 | policy <- y$policy 30 | ``` 31 | 32 | 33 | ```{r} 34 | z <- mdp_LP(x$P, x$R, discount) 35 | V <- z$V 36 | policy <- z$policy 37 | ``` 38 | 39 | 40 | ```{r} 41 | Vpolicy <- mdp_eval_policy_matrix(x$P, x$R, discount, policy) 42 | Vpolicy 43 | Vpolicy <- mdp_eval_policy_iterative(x$P, x$R, discount, policy) 44 | Vpolicy 45 | Vpolicy <- mdp_eval_policy_TD_0(x$P, x$R, discount, policy) 46 | Vpolicy 47 | ``` 48 | 49 | Model parameter change 50 | 51 | change... 52 | 53 | didn't finish: http://www7.inra.fr/mia/T/MDPtoolbox/QuickStart.pdf 54 | 55 | 56 | 57 | ```{r} 58 | q <- mdp_example_forest(3, 0.4, 2, .2) 59 | # didn't finish w <- mdp_ 60 | ``` 61 | 62 | wildfire high 63 | 64 | ```{r} 65 | 66 | ``` 67 | 68 | 69 | -------------------------------------------------------------------------------- /Part_8_Modeling_the_World/RL2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjmazidi/Machine_Learning_2nd_edition/31d34a8d5154855eae0b840335ce6552711375df/Part_8_Modeling_the_World/RL2.pdf -------------------------------------------------------------------------------- /Part_8_Modeling_the_World/bayesnet.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: 3 | pdf_document: default 4 | html_document: default 5 | --- 6 | # Bayes Network 7 | ## Demo 1 8 | 9 | Adapted from: https://www.r-bloggers.com/bayesian-network-in-r-introduction/ 10 | 11 | Bayesian networks (BN) are graphical models that encode the conditional probability between predictors in a directed acyclic graph. 12 | 13 | BN advantages over other unsupervised algorithms: 14 | 15 | - easy to exploit expert knowledge in BN models 16 | 17 | - BN models are robust to noisy, missing, sparse data 18 | 19 | - parameters are understandable, interpretable 20 | 21 | ```{r} 22 | library(bnlearn) 23 | data(coronary) 24 | names(coronary) 25 | ``` 26 | 27 | The coronary data contains the following information: 28 | 29 | - smoking - yes/no 30 | 31 | - M. Work - strenuous mental work yes/no 32 | 33 | - P.Work - strenuous physical work yes/no 34 | 35 | - Pressure - yes/no <140 36 | 37 | - Proteins - 2-level factor with levels (ratio of beta and alpha lipoproteins) 38 | 39 | - Family - two-level factor neg/pos for family anamnesis of coronary heart disease 40 | 41 | ### Create the network 42 | 43 | In this example we use the "max-min" hill climbing algorithm. 44 | 45 | ```{r} 46 | bn_df <- data.frame(coronary) 47 | res <- hc(bn_df) 48 | plot(res) 49 | 50 | ``` 51 | 52 | The causality between some nodes is intuitive and some doesn't seem to make sense. Ex: M.Work and Family. Let's remove that link. 53 | 54 | 55 | ```{r} 56 | res$arcs <- res$arcs[-which((res$arcs[,'from'] == "M..Work" & res$arcs[,'to'] == "Family")),] 57 | ``` 58 | 59 | ### CPT 60 | 61 | We need to learn the conditional probability tables (CPT) at each node. The bn.fit function runs the EM algorithm to learn CPT for different nodes in the above graph. We can print them for different nodes. 62 | 63 | 64 | ```{r} 65 | fittedbn <- bn.fit(res, data = bn_df) 66 | print(fittedbn$Proteins) 67 | 68 | ``` 69 | 70 | ### Inference 71 | 72 | The BN is ready and we can start inferring from the network. 73 | 74 | Although Proteins variable is conditioned on 2 variables, we did the query based on the available evidence on only one variables. 75 | 76 | Next we query the network. 
What is the chance that a non-smoker has a Proteins level less than 3? 77 | 78 | The result is 0.61. 79 | 80 | 81 | ```{r} 82 | cpquery(fittedbn, event = (Proteins=="<3"), evidence = ( Smoking=="no") ) 83 | 84 | ``` 85 | 86 | What is the chance that a non-smoker with blood pressure greated than 140 has a Proteins level less than 3? 87 | 88 | The result is 0.62. 89 | 90 | 91 | ```{r} 92 | cpquery(fittedbn, event = (Proteins=="<3"), evidence = ( Smoking=="no" & Pressure==">140" ) ) 93 | 94 | ``` 95 | 96 | We can also move in the opposite direction of an arc between two nodes. Let's see if a person's Proteins level is greater than 3, then what is the chance that his or her Pressure level is greater than 140? 97 | 98 | The resutl is the Pressure is greater than 140 with probability 0.41. 99 | 100 | 101 | ```{r} 102 | 103 | cpquery(fittedbn, event = (Pressure==">140"), evidence = ( Proteins=="<3" ) ) 104 | ``` 105 | 106 | 107 | 108 | 109 | -------------------------------------------------------------------------------- /Part_8_Modeling_the_World/bayesnet.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjmazidi/Machine_Learning_2nd_edition/31d34a8d5154855eae0b840335ce6552711375df/Part_8_Modeling_the_World/bayesnet.pdf -------------------------------------------------------------------------------- /Part_8_Modeling_the_World/bayesnet1.Rmd: -------------------------------------------------------------------------------- 1 | # Bayesian Network Learning 2 | ## On the Default and Titanic data sets 3 | ### Karen Mazidi 4 | 5 | ### Default data set 6 | 7 | Let's try bnlearn on some familiar data sets. First, the Default data set from ISLR. 8 | 9 | ```{r} 10 | library(bnlearn) 11 | library(ISLR) 12 | data("Default") 13 | ``` 14 | 15 | ### Build the model 16 | 17 | Use the hill-climbing algorithm to create the model, then display it. 18 | 19 | The model seems to fit what we learned by doing ML on this data set before. Income is not a good predictor for default, notice there is no link between them. 20 | 21 | 22 | ```{r} 23 | bn_df <- data.frame(Default) 24 | str(bn_df) 25 | res <- hc(bn_df) 26 | plot(res) 27 | ``` 28 | 29 | 30 | ### Probabilities 31 | 32 | This is a really unbalanced data set! 33 | 34 | ```{r} 35 | fittedbn <- bn.fit(res, data=bn_df) 36 | print(fittedbn$default) 37 | ``` 38 | 39 | 40 | ### Try some predictions 41 | 42 | ```{r} 43 | cpquery(fittedbn, event=(default=="Yes"), evidence = (student=="Yes")) 44 | cpquery(fittedbn, event=(default=="Yes"), evidence = (balance>1200)) 45 | cpquery(fittedbn, event=(default=="Yes"), evidence = (student=="Yes" & balance>1200)) 46 | ``` 47 | 48 | ### Titanic 49 | 50 | Now let's try a subset of the Titanic data set. We had to a little data fiddling because the bnlearn didn't like NAs and ints. 51 | 52 | 53 | ```{r} 54 | df <- read.csv('train.csv', header=T, na.strings=c("")) 55 | df <- df[1:890, c(2,3,5,6)] # Survived, Pclass, Sex, Age 56 | bn_titan <- df[complete.cases(df), ] 57 | bn_titan$Survived <- as.factor(bn_titan$Survived) 58 | bn_titan$Pclass <- as.factor(bn_titan$Pclass) 59 | bn_titan$Sex <- as.factor(bn_titan$Sex) 60 | str(bn_titan) 61 | ``` 62 | 63 | 64 | ### Build the net 65 | 66 | This model is not surprising. Age and Sex and Pclass were found to be good predictors in previous experiments. 67 | 68 | ```{r} 69 | res_titan <- hc(bn_titan) 70 | plot(res_titan) 71 | ``` 72 | 73 | ### Print the probabilities 74 | 75 | This breaks down the probabilities nicely. 
76 | 77 | 78 | ```{r} 79 | fitted_bn_titan <- bn.fit(res_titan, data=bn_titan) 80 | print(fitted_bn_titan) 81 | ``` 82 | 83 | 84 | ### Predict 85 | 86 | Try some predictions based on the net 87 | 88 | ```{r} 89 | cpquery(fitted_bn_titan, event = (Survived==1), evidence = (Pclass==1)) 90 | cpquery(fitted_bn_titan, event = (Survived==1), evidence = (Age<9) ) 91 | cpquery(fitted_bn_titan, event = (Survived==1), evidence = (Pclass==1 & Age<=9) ) 92 | cpquery(fitted_bn_titan, event = (Survived==1), evidence = (Pclass==1 & Age>9) ) 93 | cpquery(fitted_bn_titan, event = (Survived==1), evidence = (Sex=="female") ) 94 | cpquery(fitted_bn_titan, event = (Survived==1), evidence = (Sex=="male" & Age>21) ) 95 | ``` 96 | 97 | 98 | -------------------------------------------------------------------------------- /Part_8_Modeling_the_World/bayesnet2.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: 3 | pdf_document: default 4 | html_document: default 5 | --- 6 | # Bayesian Network Learning 7 | ## On the Default and Titanic data sets 8 | ### Karen Mazidi 9 | 10 | ### Default data set 11 | 12 | Let's try bnlearn on some familiar data sets. First, the Default data set from ISLR. 13 | 14 | ```{r} 15 | library(bnlearn) 16 | library(ISLR) 17 | data("Default") 18 | ``` 19 | 20 | ### Build the model 21 | 22 | Use the hill-climbing algorithm to create the model, then display it. 23 | 24 | The model seems to fit what we learned by doing ML on this data set before. Income is not a good predictor for default, notice there is no link between them. 25 | 26 | 27 | ```{r} 28 | bn_df <- data.frame(Default) 29 | str(bn_df) 30 | res <- hc(bn_df) 31 | plot(res) 32 | ``` 33 | 34 | 35 | ### Probabilities 36 | 37 | This is a really unbalanced data set! 38 | 39 | ```{r} 40 | fittedbn <- bn.fit(res, data=bn_df) 41 | print(fittedbn$default) 42 | ``` 43 | 44 | 45 | ### Try some predictions 46 | 47 | ```{r} 48 | cpquery(fittedbn, event=(default=="Yes"), evidence = (student=="Yes")) 49 | cpquery(fittedbn, event=(default=="Yes"), evidence = (balance>1200)) 50 | cpquery(fittedbn, event=(default=="Yes"), evidence = (student=="Yes" & balance>1200)) 51 | ``` 52 | 53 | ### Titanic 54 | 55 | Now let's try a subset of the Titanic data set. We had to a little data fiddling because the bnlearn didn't like NAs and ints. 56 | 57 | 58 | ```{r} 59 | df <- read.csv('train.csv', header=T, na.strings=c("")) 60 | df <- df[1:750, c(2,3,5,6)] # Survived, Pclass, Sex, Age 61 | bn_titan <- df[complete.cases(df), ] 62 | bn_titan$Survived <- as.factor(bn_titan$Survived) 63 | bn_titan$Pclass <- as.factor(bn_titan$Pclass) 64 | bn_titan$Sex <- as.factor(bn_titan$Sex) 65 | str(bn_titan) 66 | ``` 67 | 68 | 69 | ### Build the net 70 | 71 | This model is not surprising. Age and Sex and Pclass were found to be good predictors in previous experiments. 72 | 73 | ```{r} 74 | res_titan <- hc(bn_titan) 75 | plot(res_titan) 76 | ``` 77 | 78 | ### Print the probabilities 79 | 80 | This breaks down the probabilities nicely. 
81 | 82 | 83 | ```{r} 84 | fitted_bn_titan <- bn.fit(res_titan, data=bn_titan) 85 | print(fitted_bn_titan) 86 | ``` 87 | 88 | 89 | ### Predict 90 | 91 | Try some predictions based on the net 92 | 93 | ```{r} 94 | cpquery(fitted_bn_titan, event = (Survived==1), evidence = (Pclass==1)) 95 | cpquery(fitted_bn_titan, event = (Survived==1), evidence = (Age<9) ) 96 | cpquery(fitted_bn_titan, event = (Survived==1), evidence = (Pclass==1 & Age<=9) ) 97 | cpquery(fitted_bn_titan, event = (Survived==1), evidence = (Pclass==1 & Age>9) ) 98 | cpquery(fitted_bn_titan, event = (Survived==1), evidence = (Sex=="female") ) 99 | cpquery(fitted_bn_titan, event = (Survived==1), evidence = (Sex=="male" & Age>21) ) 100 | ``` 101 | 102 | 103 | -------------------------------------------------------------------------------- /Part_8_Modeling_the_World/bayesnet2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjmazidi/Machine_Learning_2nd_edition/31d34a8d5154855eae0b840335ce6552711375df/Part_8_Modeling_the_World/bayesnet2.pdf -------------------------------------------------------------------------------- /Part_8_Modeling_the_World/grid.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjmazidi/Machine_Learning_2nd_edition/31d34a8d5154855eae0b840335ce6552711375df/Part_8_Modeling_the_World/grid.png -------------------------------------------------------------------------------- /Part_8_Modeling_the_World/hmm1.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Hidden Markov Model" 3 | output: 4 | pdf_document: default 5 | html_document: 6 | df_print: paged 7 | editor_options: 8 | chunk_output_type: console 9 | --- 10 | 11 | Modified from: https://www.r-bloggers.com/hmm-example-with-depmixs4/ 12 | 13 | 14 | ```{r} 15 | library(depmixS4) 16 | 17 | # generate our data 18 | n <- 140 # number of transitions (7 days, 10 weeks) 19 | obs <- rep(c(c(2, 2), c(2, 1), c(1, 1), c(1, 1), c(1, 1), c(1, 1), c(1, 2)), 10) 20 | ``` 21 | 22 | 23 | 24 | 25 | 26 | ```{r} 27 | set.seed(1234) 28 | # 1. create the model 29 | mod <- depmix(response = obs ~ 1, data=data.frame(obs), nstates=2) 30 | 31 | # 2. fit the model 32 | f <- fit(mod) 33 | summary(f) 34 | 35 | # get the estimated state for each timestep 36 | estimates <- posterior(f) 37 | 38 | par(mfrow=c(2,1)) 39 | plot(1:n, obs, type='l', main='Observations, X') 40 | plot(1:n, estimates[,2], type='l', main='Estimates') 41 | 42 | ``` 43 | 44 | What did it learn? 45 | 46 | ```{r} 47 | head(estimates) 48 | ``` 49 | 50 | 51 | 52 | -------------------------------------------------------------------------------- /Part_8_Modeling_the_World/hmm1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjmazidi/Machine_Learning_2nd_edition/31d34a8d5154855eae0b840335ce6552711375df/Part_8_Modeling_the_World/hmm1.pdf -------------------------------------------------------------------------------- /Part_8_Modeling_the_World/hmm2.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "HMM S&P 500" 3 | output: 4 | pdf_document: default 5 | html_document: 6 | df_print: paged 7 | editor_options: 8 | chunk_output_type: inline 9 | --- 10 | 11 | HMM example using S&P 500 data which is included in the depmix54 package. The sp500 data consists of monthly values of the S&P 500 index. 
The logret column, column 6, is the log of the ratio of the closing indices. 12 | 13 | 14 | ```{r} 15 | library(depmixS4) 16 | data(sp500) 17 | head(sp500) 18 | range(sp500[,6]) 19 | 20 | ``` 21 | 22 | Column 6 has a range of [-0.245, 0.15] 23 | 24 | 25 | ```{r} 26 | # create the model, then fit 27 | mod <- depmix(logret~1, nstates=2, data=sp500) 28 | set.seed(1) 29 | fmod <- fit(mod) 30 | 31 | # plot 32 | par(mfrow=c(3,1)) 33 | plot(posterior(fmod)[,1], type='l') 34 | plot(posterior(fmod)[,2], type='l') 35 | plot(sp500[,6], type='l') 36 | 37 | 38 | ``` 39 | 40 | 41 | -------------------------------------------------------------------------------- /Part_8_Modeling_the_World/hmm2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjmazidi/Machine_Learning_2nd_edition/31d34a8d5154855eae0b840335ce6552711375df/Part_8_Modeling_the_World/hmm2.pdf -------------------------------------------------------------------------------- /Part_8_Modeling_the_World/readme.md: -------------------------------------------------------------------------------- 1 | This section looks at AI algorithms included in the course 2 | 3 | * Bayesian Networks 4 | 5 | * Markov Models 6 | 7 | * Reinforcement Learning 8 | -------------------------------------------------------------------------------- /README.MD: -------------------------------------------------------------------------------- 1 | # Machine Learning Handbook Using R and Python 2 | ## author: Dr. Karen Mazidi 3 | 4 | 5 | Videos available on [my YouTube channel](https://www.youtube.com/playlist?list=PLfe6IcA_dEWkcHFfBA6XSXW31H8t4XSbB) 6 | 7 | Table of Contents 8 | 9 | Part One: Introduction to Machine Learning 10 | 11 | 1. The Craft of Machine Learning 12 | 2. Learning R 13 | 3. Data Visualization in R 14 | 4. The Craft 1: Planning to Learn 15 | 16 | Part Two: Linear Models 17 | 18 | 5. Linear Regression 19 | 6. Logistic Regression 20 | 7. Naive Bayes 21 | 8. The Craft 2: Predictive Analytics 22 | 23 | Part Three: Modern R 24 | 25 | 9. The Tidyverse 26 | 10. ggplot2 27 | 11. The Craft 3: Data Wrangling 28 | 29 | Part Four: Searching for Similarity 30 | 31 | 12. Instance-based learning with kNN 32 | 13. Clustering 33 | 14. Decision Trees and Random Forests 34 | 15. The Craft 4: Feature Engineering 35 | 36 | Part Five: Kernel Methods and Ensemble Methods 37 | 38 | 16. Support Vector Machines 39 | 17. Ensemble Methods 40 | 18. The Craft 5: Choosing Algorithms 41 | 42 | Part Six: Python for Machine Learning 43 | 44 | 19. Python Basics 45 | 20. Python Libraries for Machine Learning 46 | 21. Python Machine Learning Examples 47 | 48 | Part Seven: Neural Networks 49 | 50 | 22. Neural Networks 51 | 23. Deep Learning with Keras 52 | 22. The Craft 6: Machine Learning Trends 53 | 54 | Part Eight: Modeling the World 55 | 56 | 25. Bayes Nets 57 | 26. Markov Models 58 | 27. Reinforcement Learning 59 | 28. The Craft 8: Learning Theory 60 | 61 | --------------------------------------------------------------------------------