├── README.md ├── dhfr ├── dhfr-classification-deploy.R ├── dhfr-classification.R ├── dhfr-data-understanding.R ├── dhfr-handling-missing-data.R └── dhfr-parallel-speed-up.R ├── iris ├── iris-classification.R └── iris-data-understanding.R ├── linear-regression └── boston-housing-linear-regression.R ├── plot └── scatter-plot │ ├── aromatase.csv │ └── code-scatter-plot.R ├── python-in-r └── using-reticulate.R ├── python ├── CDD_ML_Part_1_Acetylcholinesterase_Bioactivity_Data_Concised.ipynb ├── CDD_ML_Part_1_Bioactivity_Data_Concised.ipynb ├── CDD_ML_Part_1_bioactivity_data.ipynb ├── CDD_ML_Part_2_Acetylcholinesterase_Exploratory_Data_Analysis.ipynb ├── CDD_ML_Part_2_Exploratory_Data_Analysis.ipynb ├── CDD_ML_Part_3_Acetylcholinesterase_Descriptor_Dataset_Preparation.ipynb ├── CDD_ML_Part_4_Acetylcholinesterase_Regression_Random_Forest.ipynb ├── CDD_ML_Part_5_Acetylcholinesterase_Compare_Regressors.ipynb ├── Colab_File_handling_on_Google_Colab.ipynb ├── How_to_build_a_simple_linear_regression_model_in_python.ipynb ├── Hummingbird_ML.ipynb ├── PCA_analysis.ipynb ├── ROC_curve.ipynb ├── ROC_curve_kNN.ipynb ├── Sweetviz.ipynb ├── cheminformatics_predicting_solubility.ipynb ├── cheminformatics_predicting_solubility_2_1_PyCaret.ipynb ├── cheminformatics_predicting_solubility_2_2_PyCaret.ipynb ├── comparing-classifiers.ipynb ├── google_colab_install_conda.ipynb ├── google_colab_r_magic_command.ipynb ├── google_colab_r_notebook.ipynb ├── hyperparameter_tuning.ipynb ├── iris │ └── iris-classification-random-forest.ipynb ├── klib.ipynb ├── linear_regression.ipynb ├── model_is_training_progress_bar.ipynb ├── pandas-create-and-combine-dataframes.ipynb ├── pandas_exploratory_data_analysis.ipynb ├── pandas_profiling_example.ipynb ├── pandas_read_html_for_webscraping.ipynb ├── pandas_select_columns.ipynb ├── pandas_styling_dataframe.ipynb └── r_magic_command.ipynb ├── shiny ├── 001-first-app │ └── app.R ├── 002-histogram │ └── app.R ├── 003-play-golf │ └── app.R ├── 004-iris-predictor │ ├── app-numeric.R │ ├── app-slider.R │ ├── model.R │ ├── model.rds │ ├── testing.csv │ └── training.csv └── 005-bmi │ ├── about.md │ └── app.R └── streamlit ├── part1 ├── myapp.py └── myapp2.py ├── part10 └── sp500-app.py ├── part12 ├── crypto-price-app.py └── logo.jpg ├── part2 └── iris-ml-app.py ├── part3 ├── penguins-app.py ├── penguins-model-building.py ├── penguins_cleaned.csv ├── penguins_clf.pkl └── penguins_example.csv ├── part5 └── basketball_app.py ├── part6 └── boston-house-ml-app.py ├── part7 ├── solubility-app.py ├── solubility-logo.jpg ├── solubility-web-app.ipynb └── solubility_model.pkl ├── part8 ├── dna-app.py └── dna-logo.jpg └── part9 └── football_app.py
/README.md: --------------------------------------------------------------------------------
1 | # code
2 | This is a compilation of the R and Python code used in the **Data Professor** YouTube channel tutorial videos.
3 | 
4 | Folder | Description
5 | ---|---
6 | [iris](https://github.com/dataprofessor/code/tree/master/iris) | Codes for performing *exploratory data analysis* (so as to gain *data understanding*) and for building *classification models* of the Iris dataset.
7 | [dhfr](https://github.com/dataprofessor/code/tree/master/dhfr) | Codes for performing *exploratory data analysis* (so as to gain *data understanding*) and for building *classification models* of the DHFR (dihydrofolate reductase) dataset.
8 | [python](https://github.com/dataprofessor/code/tree/master/python) | Codes for various Python data science project tutorials. 
9 | [shiny](https://github.com/dataprofessor/code/tree/master/shiny) | Codes for building *web applications* in R with *shiny* package. 10 | 11 | > Note: More to come. Please stay tuned! 12 | -------------------------------------------------------------------------------- /dhfr/dhfr-classification-deploy.R: -------------------------------------------------------------------------------- 1 | #################################### 2 | # Data Professor # 3 | # http://youtube.com/dataprofessor # 4 | # http://github.com/dataprofessor # 5 | #################################### 6 | 7 | # Importing libraries 8 | library(datasets) # Contains several data sets 9 | library(caret) # Package for machine learning algorithms / CARET stands for Classification And REgression Training 10 | 11 | # Importing the dhfr data set 12 | data(dhfr) 13 | 14 | # Check to see if there are missing data? 15 | sum(is.na(dhfr)) 16 | 17 | # To achieve reproducible model; set the random seed number 18 | set.seed(100) 19 | 20 | # Performs stratified random split of the data set 21 | TrainingIndex <- createDataPartition(dhfr$Y, p=0.8, list = FALSE) 22 | TrainingSet <- dhfr[TrainingIndex,] # Training Set 23 | TestingSet <- dhfr[-TrainingIndex,] # Test Set 24 | 25 | 26 | 27 | ############################### 28 | # SVM model (polynomial kernel) 29 | 30 | # Build Training model 31 | Model <- train(Y ~ ., data = TrainingSet, 32 | method = "svmPoly", 33 | na.action = na.omit, 34 | preProcess=c("scale","center"), 35 | trControl= trainControl(method="none"), 36 | tuneGrid = data.frame(degree=1,scale=1,C=1) 37 | ) 38 | 39 | 40 | # Save model to RDS file 41 | 42 | saveRDS(Model, "Model.rds") 43 | 44 | # Read the model from RDS file 45 | 46 | read.Model <- readRDS("Model.rds") 47 | 48 | 49 | # Apply model for prediction 50 | Model.training <-predict(read.Model, TrainingSet) # Apply model to make prediction on Training set 51 | Model.testing <-predict(read.Model, TestingSet) # Apply model to make prediction on Testing set 52 | 53 | # Model performance (Displays confusion matrix and statistics) 54 | Model.training.confusion <-confusionMatrix(Model.training, TrainingSet$Y) 55 | Model.testing.confusion <-confusionMatrix(Model.testing, TestingSet$Y) 56 | 57 | print(Model.training.confusion) 58 | print(Model.testing.confusion) 59 | 60 | # Feature importance 61 | Importance <- varImp(Model) 62 | plot(Importance, top = 25) 63 | plot(Importance, col = "red") 64 | -------------------------------------------------------------------------------- /dhfr/dhfr-classification.R: -------------------------------------------------------------------------------- 1 | #################################### 2 | # Data Professor # 3 | # http://youtube.com/dataprofessor # 4 | # http://github.com/dataprofessor # 5 | #################################### 6 | 7 | # Importing libraries 8 | library(datasets) # Contains several data sets 9 | library(caret) # Package for machine learning algorithms / CARET stands for Classification And REgression Training 10 | 11 | # Importing the dhfr data set 12 | data(dhfr) 13 | 14 | # Check to see if there are missing data? 
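# (sum() over is.na() counts every missing cell in the data frame; a result of 0 means the set is complete and nothing needs to be dropped or imputed.)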
15 | sum(is.na(dhfr)) 16 | 17 | # To achieve reproducible model; set the random seed number 18 | set.seed(100) 19 | 20 | # Performs stratified random split of the data set 21 | TrainingIndex <- createDataPartition(dhfr$Y, p=0.8, list = FALSE) 22 | TrainingSet <- dhfr[TrainingIndex,] # Training Set 23 | TestingSet <- dhfr[-TrainingIndex,] # Test Set 24 | 25 | 26 | 27 | ############################### 28 | # SVM model (polynomial kernel) 29 | 30 | # Build Training model 31 | Model <- train(Y ~ ., data = TrainingSet, 32 | method = "svmPoly", 33 | na.action = na.omit, 34 | preProcess=c("scale","center"), 35 | trControl= trainControl(method="none"), 36 | tuneGrid = data.frame(degree=1,scale=1,C=1) 37 | ) 38 | 39 | # Build CV model 40 | Model.cv <- train(Y ~ ., data = TrainingSet, 41 | method = "svmPoly", 42 | na.action = na.omit, 43 | preProcess=c("scale","center"), 44 | trControl= trainControl(method="cv", number=10), 45 | tuneGrid = data.frame(degree=1,scale=1,C=1) 46 | ) 47 | 48 | 49 | # Apply model for prediction 50 | Model.training <-predict(Model, TrainingSet) # Apply model to make prediction on Training set 51 | Model.testing <-predict(Model, TestingSet) # Apply model to make prediction on Testing set 52 | Model.cv <-predict(Model.cv, TrainingSet) # Perform cross-validation 53 | 54 | # Model performance (Displays confusion matrix and statistics) 55 | Model.training.confusion <-confusionMatrix(Model.training, TrainingSet$Y) 56 | Model.testing.confusion <-confusionMatrix(Model.testing, TestingSet$Y) 57 | Model.cv.confusion <-confusionMatrix(Model.cv, TrainingSet$Y) 58 | 59 | print(Model.training.confusion) 60 | print(Model.testing.confusion) 61 | print(Model.cv.confusion) 62 | 63 | # Feature importance 64 | Importance <- varImp(Model) 65 | plot(Importance, top = 25) 66 | plot(Importance, col = "red") 67 | -------------------------------------------------------------------------------- /dhfr/dhfr-data-understanding.R: -------------------------------------------------------------------------------- 1 | #################################### 2 | # Data Professor # 3 | # http://youtube.com/dataprofessor # 4 | # http://github.com/dataprofessor # 5 | #################################### 6 | 7 | ######################### 8 | # Loading DHFR data set 9 | ######################### 10 | 11 | # Method 1 12 | 13 | library(datasets) 14 | data(dhfr) 15 | 16 | # Method 2 17 | #dhfr2 <- datasets::dhfr 18 | 19 | # Method 3 20 | # install.packages("RCurl") 21 | 22 | #library(RCurl) 23 | dhfr <- read.csv(text = getURL("https://github.com/dataprofessor/data/raw/master/dhfr.csv") ) 24 | 25 | # View the data 26 | View(dhfr) 27 | 28 | ############################# 29 | # Display summary statistics 30 | ############################# 31 | 32 | # head() / tail() 33 | head(dhfr, 5) 34 | tail(dhfr, 5) 35 | 36 | 37 | # summary() 38 | summary(dhfr) 39 | summary(dhfr$Y) 40 | 41 | 42 | # Check to see if there are missing data? 
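# (Two dependency notes for this script: getURL() in Method 3 above comes from the RCurl package, so library(RCurl) must be loaded for that line to run, and featurePlot() at the end comes from the caret package, so run library(caret) before it.)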
43 | sum(is.na(dhfr)) 44 | 45 | 46 | # skimr() - expands on summary() by providing larger set of statistics 47 | # install.packages("skimr") 48 | # https://github.com/ropensci/skimr 49 | 50 | library(skimr) 51 | 52 | skim(dhfr) # Perform skim to display summary statistics 53 | 54 | # Group data by Y (biological activity) then perform skim 55 | dhfr %>% 56 | dplyr::group_by(Y) %>% 57 | skim() 58 | 59 | ############################# 60 | # Quick data visualization 61 | # 62 | # R base plot() 63 | ############################# 64 | 65 | 66 | # Panel plots 67 | #plot(dhfr) 68 | #plot(iris, col = "red") 69 | 70 | # Scatter plot 71 | plot(dhfr$moe2D_zagreb, dhfr$moe2D_weinerPol) 72 | 73 | plot(dhfr$moe2D_zagreb, dhfr$moe2D_weinerPol, col = "red") # Makes red circles 74 | 75 | plot(dhfr$moe2D_zagreb, dhfr$moe2D_weinerPol, col = dhfr$Y) # Color by Y 76 | 77 | plot(dhfr$moe2D_zagreb, dhfr$moe2D_weinerPol, col = "red", # Makes red circles + Adds x and y axis labels 78 | xlab = "moe2D_zagreb", ylab = "moe2D_weinerPol") 79 | 80 | # Histogram 81 | hist(dhfr$moe2D_zagreb) 82 | hist(dhfr$moe2D_zagreb, col = "red") # Makes red bars 83 | 84 | # Feature plots 85 | # https://www.machinelearningplus.com/machine-learning/caret-package/ 86 | featurePlot(x = dhfr[,2:21], 87 | y = dhfr$Y, 88 | plot = "box", 89 | strip=strip.custom(par.strip.text=list(cex=.7)), 90 | scales = list(x = list(relation="free"), 91 | y = list(relation="free"))) 92 | -------------------------------------------------------------------------------- /dhfr/dhfr-handling-missing-data.R: -------------------------------------------------------------------------------- 1 | #################################### 2 | # Data Professor # 3 | # http://youtube.com/dataprofessor # 4 | # http://github.com/dataprofessor # 5 | #################################### 6 | 7 | 8 | # 1. Loading the DHFR data 9 | library(RCurl) 10 | dhfr <- read.csv(text = getURL("https://raw.githubusercontent.com/dataprofessor/data/master/dhfr.csv") ) 11 | 12 | View(dhfr) 13 | 14 | 15 | # 2. Check for missing data 16 | 17 | sum(is.na(dhfr)) 18 | 19 | 20 | # 3. If data is clean, randomly introduce NA to the dataset 21 | 22 | na.gen <- function(data,n) { 23 | i <- 1 24 | while (i < n+1) { 25 | idx1 <- sample(1:nrow(data), 1) 26 | idx2 <- sample(1:ncol(data), 1) 27 | data[idx1,idx2] <- NA 28 | i = i+1 29 | } 30 | return(data) 31 | } 32 | 33 | 34 | # Before introducing NA to the dataset, leave the Y class label (output variable) out 35 | 36 | dhfr <- dhfr[,-1] 37 | 38 | 39 | # Choose 1 of the following to run (they'll produce the same result) 40 | 41 | dhfr <- na.gen(dhfr,100) 42 | 43 | dhfr <- na.gen(n=100,data=dhfr) 44 | 45 | dhfr <- na.gen(100,dhfr) # This produces an error, why? 46 | 47 | 48 | # 4. Check again for missing data 49 | 50 | sum(is.na(dhfr)) 51 | 52 | colSums(is.na(dhfr)) 53 | 54 | str(dhfr) 55 | 56 | 57 | # Lists rows with missing data 58 | 59 | missingdata <- dhfr[!complete.cases(dhfr), ] 60 | 61 | sum(is.na(missingdata)) 62 | 63 | 64 | # If above sum is 0, this means that there is no missing data and proceed to modeling. 65 | # If above sum is greater than 0, then proceed to # 5 66 | 67 | 68 | # 5. Handling the missing data. There are 2 options, decide and choose only 1 69 | 70 | # 5.1. Simply delete all entries with missing data 71 | 72 | clean.data <- na.omit(dhfr) 73 | 74 | sum(is.na(clean.data)) 75 | 76 | 77 | # 5.2. 
Imputation: Replace missing values with the column's 78 | 79 | # MEAN 80 | dhfr.impute <- dhfr 81 | 82 | for (i in which(sapply(dhfr.impute, is.numeric))) { 83 | dhfr.impute[is.na(dhfr.impute[, i]), i] <- mean(dhfr.impute[, i], na.rm = TRUE) 84 | } 85 | 86 | sum(is.na(dhfr.impute)) 87 | 88 | 89 | # MEDIAN 90 | dhfr.impute <- dhfr 91 | 92 | for (i in which(sapply(dhfr.impute, is.numeric))) { 93 | dhfr.impute[is.na(dhfr.impute[, i]), i] <- median(dhfr.impute[, i], na.rm = TRUE) 94 | } 95 | 96 | sum(is.na(dhfr.impute)) 97 | -------------------------------------------------------------------------------- /dhfr/dhfr-parallel-speed-up.R: -------------------------------------------------------------------------------- 1 | #################################### 2 | # Data Professor # 3 | # http://youtube.com/dataprofessor # 4 | # http://github.com/dataprofessor # 5 | #################################### 6 | 7 | # Importing libraries 8 | library(datasets) # Contains several data sets 9 | library(caret) # Package for machine learning algorithms / CARET stands for Classification And REgression Training 10 | 11 | # Importing the dhfr data set 12 | data(dhfr) 13 | 14 | # Check to see if there are missing data? 15 | sum(is.na(dhfr)) 16 | 17 | # To achieve reproducible model; set the random seed number 18 | set.seed(100) 19 | 20 | # Performs stratified random split of the data set 21 | TrainingIndex <- createDataPartition(dhfr$Y, p=0.8, list = FALSE) 22 | TrainingSet <- dhfr[TrainingIndex,] # Training Set 23 | TestingSet <- dhfr[-TrainingIndex,] # Test Set 24 | 25 | 26 | 27 | ############################### 28 | # Random forest 29 | 30 | 31 | # Run normally without parallel processing 32 | start.time <- proc.time() 33 | Model <- train(Y ~ ., 34 | data = TrainingSet, # Build model using training set 35 | method = "rf" # Learning algorithm 36 | ) 37 | stop.time <- proc.time() 38 | run.time <- stop.time - start.time 39 | print(run.time) 40 | 41 | 42 | 43 | # Use doParallel 44 | # https://topepo.github.io/caret/parallel-processing.html 45 | 46 | library(doParallel) 47 | 48 | cl <- makePSOCKcluster(5) 49 | registerDoParallel(cl) 50 | 51 | start.time <- proc.time() 52 | Model <- train(Y ~ ., 53 | data = TrainingSet, # Build model using training set 54 | method = "rf" # Learning algorithm 55 | ) 56 | stop.time <- proc.time() 57 | run.time <- stop.time - start.time 58 | print(run.time) 59 | 60 | stopCluster(cl) 61 | 62 | 63 | 64 | 65 | ########################## 66 | 67 | # Run without parallel processing 68 | 69 | start.time <- proc.time() 70 | Model <- train(Y ~ ., 71 | data = TrainingSet, # Build model using training set 72 | method = "rf", # Learning algorithm 73 | tuneGrid = data.frame(mtry = seq(5,15, by=5)) 74 | ) 75 | stop.time <- proc.time() 76 | run.time <- stop.time - start.time 77 | print(run.time) 78 | 79 | # Using doParallel 80 | 81 | library(doParallel) 82 | 83 | cl <- makePSOCKcluster(5) 84 | registerDoParallel(cl) 85 | 86 | start.time <- proc.time() 87 | Model <- train(Y ~ ., 88 | data = TrainingSet, # Build model using training set 89 | method = "rf", # Learning algorithm 90 | tuneGrid = data.frame(mtry = seq(5,15, by=5)) 91 | ) 92 | stop.time <- proc.time() 93 | run.time <- stop.time - start.time 94 | print(run.time) 95 | 96 | stopCluster(cl) 97 | 98 | 99 | ########################## 100 | # Apply model for prediction 101 | Model.training <-predict(Model, TrainingSet) # Apply model to make prediction on Training set 102 | 103 | # Model performance (Displays confusion matrix and statistics) 104 | 
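# confusionMatrix() cross-tabulates predicted against observed classes; besides the printed table, the object created below also stores the statistics numerically, e.g. Model.training.confusion$overall["Accuracy"].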
Model.training.confusion <-confusionMatrix(Model.training, TrainingSet$Y) 105 | 106 | print(Model.training.confusion) 107 | 108 | # Feature importance 109 | Importance <- varImp(Model) 110 | plot(Importance, top = 25) 111 | plot(Importance, col = "red") 112 | -------------------------------------------------------------------------------- /iris/iris-classification.R: -------------------------------------------------------------------------------- 1 | #################################### 2 | # Data Professor # 3 | # http://youtube.com/dataprofessor # 4 | # http://github.com/dataprofessor # 5 | #################################### 6 | 7 | # Importing libraries 8 | library(datasets) # Contains the Iris data set 9 | library(caret) # Package for machine learning algorithms / CARET stands for Classification And REgression Training 10 | 11 | # Importing the Iris data set 12 | data(iris) 13 | 14 | # Check to see if there are missing data? 15 | sum(is.na(iris)) 16 | 17 | # To achieve reproducible model; set the random seed number 18 | set.seed(100) 19 | 20 | # Performs stratified random split of the data set 21 | TrainingIndex <- createDataPartition(iris$Species, p=0.8, list = FALSE) 22 | TrainingSet <- iris[TrainingIndex,] # Training Set 23 | TestingSet <- iris[-TrainingIndex,] # Test Set 24 | 25 | # Compare scatter plot of the 80 and 20 data subsets 26 | 27 | 28 | 29 | 30 | ############################### 31 | # SVM model (polynomial kernel) 32 | 33 | # Build Training model 34 | Model <- train(Species ~ ., data = TrainingSet, 35 | method = "svmPoly", 36 | na.action = na.omit, 37 | preProcess=c("scale","center"), 38 | trControl= trainControl(method="none"), 39 | tuneGrid = data.frame(degree=1,scale=1,C=1) 40 | ) 41 | 42 | # Build CV model 43 | Model.cv <- train(Species ~ ., data = TrainingSet, 44 | method = "svmPoly", 45 | na.action = na.omit, 46 | preProcess=c("scale","center"), 47 | trControl= trainControl(method="cv", number=10), 48 | tuneGrid = data.frame(degree=1,scale=1,C=1) 49 | ) 50 | 51 | 52 | # Apply model for prediction 53 | Model.training <-predict(Model, TrainingSet) # Apply model to make prediction on Training set 54 | Model.testing <-predict(Model, TestingSet) # Apply model to make prediction on Testing set 55 | Model.cv <-predict(Model.cv, TrainingSet) # Perform cross-validation 56 | 57 | # Model performance (Displays confusion matrix and statistics) 58 | Model.training.confusion <-confusionMatrix(Model.training, TrainingSet$Species) 59 | Model.testing.confusion <-confusionMatrix(Model.testing, TestingSet$Species) 60 | Model.cv.confusion <-confusionMatrix(Model.cv, TrainingSet$Species) 61 | 62 | print(Model.training.confusion) 63 | print(Model.testing.confusion) 64 | print(Model.cv.confusion) 65 | 66 | # Feature importance 67 | Importance <- varImp(Model) 68 | plot(Importance) 69 | plot(Importance, col = "red") 70 | -------------------------------------------------------------------------------- /iris/iris-data-understanding.R: -------------------------------------------------------------------------------- 1 | #################################### 2 | # Data Professor # 3 | # http://youtube.com/dataprofessor # 4 | # http://github.com/dataprofessor # 5 | #################################### 6 | 7 | ######################### 8 | # Loading Iris data set 9 | ######################### 10 | 11 | # Method 1 12 | 13 | library(datasets) 14 | data(iris) 15 | 16 | iris2 <- datasets::iris 17 | 18 | # Method 2 19 | # install.packages("RCurl") 20 | 21 | library(RCurl) 22 | iris3 <- read.csv(text 
= getURL("https://raw.githubusercontent.com/dataprofessor/data/master/iris.csv") ) 23 | 24 | # View the data 25 | View(iris) 26 | 27 | ############################# 28 | # Display summary statistics 29 | ############################# 30 | 31 | # head() / tail() 32 | head(iris, 5) 33 | tail(iris, 5) 34 | 35 | 36 | # summary() 37 | summary(iris) 38 | summary(iris$Sepal.Length) 39 | 40 | 41 | # Check to see if there are missing data? 42 | sum(is.na(iris)) 43 | 44 | 45 | # skimr() - expands on summary() by providing larger set of statistics 46 | # install.packages("skimr") 47 | # https://github.com/ropensci/skimr 48 | 49 | library(skimr) 50 | 51 | skim(iris) # Perform skim to display summary statistics 52 | 53 | # Group data by Species then perform skim 54 | iris %>% 55 | dplyr::group_by(Species) %>% 56 | skim() 57 | 58 | ############################# 59 | # Quick data visualization 60 | # 61 | # R base plot() 62 | ############################# 63 | 64 | 65 | # Panel plots 66 | plot(iris) 67 | plot(iris, col = "red") 68 | 69 | # Scatter plot 70 | plot(iris$Sepal.Width, iris$Sepal.Length) 71 | 72 | plot(iris$Sepal.Width, iris$Sepal.Length, col = "red") # Makes red circles 73 | 74 | plot(iris$Sepal.Width, iris$Sepal.Length, col = "red", # Makes red circles + Adds x and y axis labels 75 | xlab = "Sepal width", ylab = "Sepal length") 76 | 77 | # Histogram 78 | hist(iris$Sepal.Width) 79 | hist(iris$Sepal.Width, col = "red") # Makes red bars 80 | 81 | # Feature plots 82 | # https://www.machinelearningplus.com/machine-learning/caret-package/ 83 | featurePlot(x = iris[,1:4], 84 | y = iris$Species, 85 | plot = "box", 86 | strip=strip.custom(par.strip.text=list(cex=.7)), 87 | scales = list(x = list(relation="free"), 88 | y = list(relation="free"))) 89 | 90 | -------------------------------------------------------------------------------- /linear-regression/boston-housing-linear-regression.R: -------------------------------------------------------------------------------- 1 | ############################################ 2 | # Data Professor # 3 | # http://youtube.com/dataprofessor # 4 | # http://github.com/dataprofessor # 5 | # http://facebook.com/dataprofessor # 6 | # https://www.instagram.com/data.professor # 7 | ############################################ 8 | 9 | # Importing libraries 10 | library(mlbench) # Contains several benchmark data sets (especially the Boston Housing dataset) 11 | library(caret) # Package for machine learning algorithms / CARET stands for Classification And REgression Training 12 | 13 | # Importing the Boston Housing data set 14 | data(BostonHousing) 15 | 16 | head(BostonHousing) 17 | 18 | # Check to see if there are missing data? 
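# (BostonHousing from mlbench has 506 rows and 14 columns — 13 predictors plus the numeric response medv, the median home value in $1000s — and contains no missing values, so the check below should return 0.)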
19 | sum(is.na(BostonHousing)) 20 | 21 | # To achieve reproducible model; set the random seed number 22 | set.seed(100) 23 | 24 | # Performs stratified random split of the data set 25 | TrainingIndex <- createDataPartition(BostonHousing$medv, p=0.8, list = FALSE) 26 | TrainingSet <- BostonHousing[TrainingIndex,] # Training Set 27 | TestingSet <- BostonHousing[-TrainingIndex,] # Test Set 28 | 29 | 30 | ############################### 31 | 32 | # Build Training model 33 | Model <- train(medv ~ ., data = TrainingSet, 34 | method = "lm", 35 | na.action = na.omit, 36 | preProcess=c("scale","center"), 37 | trControl= trainControl(method="none") 38 | ) 39 | 40 | # Apply model for prediction 41 | Model.training <-predict(Model, TrainingSet) # Apply model to make prediction on Training set 42 | Model.testing <-predict(Model, TestingSet) # Apply model to make prediction on Testing set 43 | 44 | # Model performance (Displays scatter plot and performance metrics) 45 | # Scatter plot of Training set 46 | plot(TrainingSet$medv,Model.training, col = "blue" ) 47 | plot(TestingSet$medv,Model.testing, col = "blue" ) 48 | -------------------------------------------------------------------------------- /plot/scatter-plot/code-scatter-plot.R: -------------------------------------------------------------------------------- 1 | ############################################ 2 | # Data Professor # 3 | # http://youtube.com/dataprofessor # 4 | # http://github.com/dataprofessor # 5 | # http://facebook.com/dataprofessor # 6 | # https://www.instagram.com/data.professor # 7 | ############################################ 8 | 9 | ######## READ DATA 10 | # https://link.springer.com/article/10.1007%2Fs11030-013-9462-x 11 | # 11030_2013_9462_MOESM2_ESM.xls (423 kb) 12 | # Supplementary material 2 (xls 423 KB) 13 | aromatase <- read.csv("aromatase.csv") 14 | 15 | ######## MISSING DATA 16 | sum(is.na(aromatase)) 17 | missingdata <- aromatase[!complete.cases(aromatase), ] # Identify which row contains missing data 18 | 19 | aromatase <- na.omit(aromatase) # Remove any missing data >> Complete case 20 | sum(is.na(aromatase)) # Check again for missing data 21 | 22 | class <- aromatase[ ,2] # Class label 23 | aromatase2 <- aromatase[,6:18] # Descriptors 24 | aromatase3 <- cbind(class, aromatase2) # Combine Class label + Descriptors into same dataframe 25 | 26 | df <- aromatase3 # Once we are satisfied with the dataset, let's call it "df" for conciseness 27 | 28 | 29 | ######## plot() 30 | 31 | # See at a glance all possible scatter plots 32 | plot(df) 33 | plot(df , col = "blue") 34 | 35 | # Select a pair of interest to visualize scatter plot 36 | 37 | # Figure 1, https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0066566 38 | 39 | 40 | plot(df$MW, df$ALogP) 41 | 42 | 43 | # We're going to make Steroids blue and Non-Steroids red 44 | library(colorspace) 45 | df$color <- factor(df$class, 46 | levels=c("Steroid", "Non-Steroid"), 47 | labels=c("blue", "red")) 48 | plot(df$MW, df$ALogP, pch = 16, col=as.character(df$color) ) 49 | 50 | 51 | 52 | 53 | # col argument for defining the color 54 | # R has 657 colors, colors() function lists these colors 55 | plot(df$MW, df$ALogP, col = "red") 56 | plot(df$MW, df$ALogP, col = "blue") 57 | plot(df$MW, df$ALogP, col = "green") 58 | plot(df$MW, df$ALogP, col = "purple") 59 | 60 | plot(df$MW, df$ALogP, col = "orangered3") 61 | 62 | plot(df$MW, df$ALogP, col = "#FF0000") # Hex color code for red 63 | 64 | # Color in RGB color code 65 | rgb(1,0,0) # red color 66 | rgb(255,0,0, 
max=255) # red color
67 | 
68 | plot(df$MW, df$ALogP, col = rgb(0,0,0, max=255) ) # rgb(0,0,0) is black
69 | 
70 | 
71 | 
72 | # symbols
73 | 
74 | plot(df$MW, df$ALogP, pch = 1) # pch = 1, open circles (the default value)
75 | # There are a total of 25 symbols to choose from
76 | plot(df$MW, df$ALogP, pch = 2) # pch = 2, open triangle symbols
77 | plot(df$MW, df$ALogP, pch = 3) # pch = 3, plus symbols
78 | plot(df$MW, df$ALogP, pch = 4) # pch = 4, x symbols
79 | plot(df$MW, df$ALogP, pch = 5) # pch = 5, diamond symbols
80 | plot(df$MW, df$ALogP, pch = 16) # pch = 16, filled circle symbols
81 | 
82 | 
83 | plot(df$MW, df$ALogP, pch = 16, col = "orangered3")
84 | col2rgb("orangered3") # This gives us rgb(205,55,0, max=255)
85 | plot(df$MW, df$ALogP, pch = 16, col = rgb(205,55,0, max=255))
86 | 
87 | 
88 | # Add transparency to color
89 | 
90 | library(scales)
91 | 
92 | plot(df$MW, df$ALogP, pch = 16, 
93 | col = alpha("orangered3", 0.3))
94 | 
95 | plot(df$MW, df$ALogP, pch = 16, 
96 | col = rgb(205,55,0, 75, max=255))
97 | 
98 | plot(df$MW, df$ALogP, pch = 16, col=alpha(as.character(df$color),0.3 ) )
99 | 
100 | 
101 | ##################################
102 | # Multi-plot
103 | 
104 | # Scatter plot of first pair
105 | plot(df$MW, df$ALogP, pch = 16, 
106 | col = alpha("red", 0.3), 
107 | xlab = "Molecular Weight (MW)", # X-axis label
108 | ylab = "Solubility (ALogP)", # Y-axis label
109 | font.lab = 2 # X and Y labels are now bold
110 | )
111 | abline(lm(df$ALogP ~ df$MW)) # Trend line
112 | 
113 | 
114 | # Scatter plot of second pair
115 | plot(df$MW, df$Qm, pch = 16, 
116 | col = alpha("blue", 0.3), 
117 | xlab = "MW", # X-axis label
118 | ylab = "Qm", # Y-axis label
119 | font.lab = 2 # X and Y labels are now bold
120 | )
121 | abline(lm(df$Qm ~ df$MW)) # Trend line
122 | 
123 | 
124 | # Scatter plot of third pair
125 | plot(df$HOMO, df$LUMO, pch = 16, 
126 | col = alpha("green", 0.3), 
127 | xlab = "HOMO", # X-axis label
128 | ylab = "LUMO", # Y-axis label
129 | font.lab = 2 # X and Y labels are now bold
130 | )
131 | abline(lm(df$LUMO ~ df$HOMO)) # Trend line
132 | 
133 | 
134 | # Scatter plot of fourth pair
135 | plot(df$MW, df$HOMO, pch = 16, 
136 | col = alpha("purple", 0.3), 
137 | xlab = "MW", # X-axis label
138 | ylab = "HOMO", # Y-axis label
139 | font.lab = 2 # X and Y labels are now bold
140 | )
141 | abline(lm(df$HOMO ~ df$MW)) # Trend line
142 | 
143 | 
144 | ######## Creating multi-plot figures
145 | 
146 | # 2 rows by 2 columns
147 | 
148 | par(mfrow=c(2,2))
149 | # Plot 1
150 | # Plot 2
151 | # Plot 3
152 | # Plot 4
153 | 
154 | par(mfrow=c(2,2), mai = c(0.7, 0.7, 0.3, 0.3))
155 | plot(df$MW, df$ALogP) # Plot 1
156 | plot(df$MW, df$Qm) # Plot 2
157 | plot(df$HOMO, df$LUMO) # Plot 3
158 | plot(df$MW, df$HOMO) # Plot 4
159 | 
160 | 
161 | # 3 rows by 1 column
162 | 
163 | par(mfrow=c(3,1))
164 | # Plot 1
165 | # Plot 2
166 | # Plot 4
167 | 
168 | par(mfrow=c(3,1), mai = c(0.3, 0.7, 0.1, 0.3))
169 | plot(df$MW, df$ALogP) # Plot 1
170 | plot(df$MW, df$Qm) # Plot 2
171 | plot(df$MW, df$HOMO) # Plot 4
172 | 
173 | 
174 | # 1 row by 3 columns
175 | 
176 | par(mfrow=c(1,3))
177 | # Plot 1
178 | # Plot 2
179 | # Plot 3
180 | # Plot 4
181 | 
182 | par(mfrow=c(1,3), mai = c(0.3, 0.3, 0.3, 0.3))
183 | plot(df$MW, df$ALogP) # Plot 1
184 | plot(df$MW, df$Qm) # Plot 2
185 | plot(df$MW, df$HOMO) # Plot 4
186 | 
187 | par(mfrow=c(1,3), mai = c(0.3, 0.3, 0.3, 0))
188 | plot(df$ALogP, df$MW) # Plot 1
189 | plot(df$Qm, df$MW) # Plot 2
190 | plot(df$HOMO, df$MW) # Plot 4
191 | 
192 | 
193 | ######## Saving plot to file
194 | 
195 | # Single plot
196 | 
197 | 
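# pdf() opens a graphics device: every plot drawn afterwards is captured until dev.off() closes the device and writes the file.
# The same pattern works for bitmap devices — a minimal sketch (the file name and size here are just examples):
# png("plot.png", width = 800, height = 600)
# plot(df$ALogP, df$MW)
# dev.off()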
pdf("plot.pdf") 198 | #...Insert plot function here... 199 | dev.off() 200 | 201 | pdf("plot.pdf") 202 | plot(df$ALogP, df$MW) 203 | dev.off() 204 | 205 | # Multi-plot 206 | 207 | pdf("plot2.pdf") 208 | par(mfrow=c(2,2)) 209 | # Plot 1 210 | # Plot 2 211 | # Plot 3 212 | # Plot 4 213 | dev.off() 214 | 215 | pdf("plot_multiplot.pdf") 216 | par(mfrow=c(1,3), mai = c(0.3, 0.3, 0.3, 0)) 217 | plot(df$ALogP, df$MW) # Plot 1 218 | plot(df$Qm, df$MW) # Plot 2 219 | plot(df$HOMO, df$MW) # Plot 4 220 | dev.off() 221 | 222 | pdf("plot2.pdf") 223 | par(mfrow=c(2,2), mai = c(0.7, 0.7, 0.3, 0.3)) 224 | # Plot 1 225 | # Plot 2 226 | # Plot 3 227 | # Plot 4 228 | dev.off() 229 | -------------------------------------------------------------------------------- /python-in-r/using-reticulate.R: -------------------------------------------------------------------------------- 1 | ############################################ 2 | # Data Professor # 3 | # http://youtube.com/dataprofessor # 4 | # http://github.com/dataprofessor # 5 | # http://facebook.com/dataprofessor # 6 | # https://www.instagram.com/data.professor # 7 | ############################################ 8 | 9 | # https://rstudio.github.io/reticulate/ 10 | # install.packages("reticulate") 11 | library(reticulate) 12 | 13 | # Loads Python Shell 14 | repl_python() 15 | 16 | # Check the current Python version 17 | 18 | reticulate::py_config() 19 | 20 | # Load a particular Python version on our system 21 | use_python("C:/Program Files/Python38", required = TRUE) 22 | 23 | 24 | 25 | ############################ 26 | # 27 | # matplotlib Example - Scatter plot 28 | # https://matplotlib.org/3.1.1/gallery/shapes_and_collections/scatter.html#sphx-glr-gallery-shapes-and-collections-scatter-py 29 | # 30 | ############################ 31 | 32 | ############################ 33 | # Import libraries 34 | ############################ 35 | 36 | # import matplotlib.pyplot as plt 37 | plt <- import('matplotlib.pyplot') 38 | 39 | # import numpy as np 40 | np <- import('numpy') 41 | 42 | ############################ 43 | # Load the Iris dataset 44 | ############################ 45 | data(iris) 46 | 47 | 48 | ############################ 49 | # Fixing random state for reproducibility 50 | ############################ 51 | 52 | # np.random.seed(19680801) # https://github.com/rstudio/reticulate/issues/226 53 | np$random$seed(19680801L) 54 | 55 | # N = 50 56 | N <- 50L 57 | # x = np.random.rand(N) 58 | x <- np$random$rand(N) 59 | 60 | # y = np.random.rand(N) 61 | y <- np$random$rand(N) 62 | 63 | # colors = np.random.rand(N) 64 | colors <- np$random$rand(N) 65 | 66 | # area = (30 * np.random.rand(N))**2 # 0 to 15 point radii 67 | area <- (30 * np$random$rand(N))**2 68 | 69 | # plt.scatter(x, y, s=area, c=colors, alpha=0.5) 70 | plt$scatter(x, y, s=area, c=colors, alpha=0.5) 71 | 72 | # plt.show() 73 | plt$show() 74 | -------------------------------------------------------------------------------- /python/Hummingbird_ML.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Hummingbird-ML.ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [] 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "accelerator": "GPU" 15 | }, 16 | "cells": [ 17 | { 18 | "cell_type": "markdown", 19 | "metadata": { 20 | "id": "IpOdlr3WAPHJ", 21 | "colab_type": "text" 22 | }, 23 | "source": [ 24 | "# **Hummingbird-ML**\n", 25 | 
"\n", 26 | "[How to Harness GPU to Speed Up Machine Learning with Hummingbird-ML](https://www.youtube.com/watch?v=qN8jcUmo8TI)\n", 27 | "\n", 28 | "Adapted from: https://github.com/microsoft/hummingbird" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": { 34 | "id": "ir3DZd5-_jiu", 35 | "colab_type": "text" 36 | }, 37 | "source": [ 38 | "# Install Hummingbird-ML" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "metadata": { 44 | "id": "ra3JEgWN_bfp", 45 | "colab_type": "code", 46 | "colab": { 47 | "base_uri": "https://localhost:8080/", 48 | "height": 408 49 | }, 50 | "outputId": "4fae39de-26f0-4939-846d-039fb876725a" 51 | }, 52 | "source": [ 53 | "! pip install hummingbird-ml[extra]" 54 | ], 55 | "execution_count": 1, 56 | "outputs": [ 57 | { 58 | "output_type": "stream", 59 | "text": [ 60 | "Collecting hummingbird-ml[extra]\n", 61 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/ed/3b/cf1b8c1e7531377adead8de29e29b00b5aed380544ad0def4c0188b50d80/hummingbird_ml-0.0.5-py2.py3-none-any.whl (60kB)\n", 62 | "\r\u001b[K |█████▌ | 10kB 16.6MB/s eta 0:00:01\r\u001b[K |███████████ | 20kB 1.8MB/s eta 0:00:01\r\u001b[K |████████████████▍ | 30kB 2.2MB/s eta 0:00:01\r\u001b[K |█████████████████████▉ | 40kB 2.5MB/s eta 0:00:01\r\u001b[K |███████████████████████████▎ | 51kB 2.0MB/s eta 0:00:01\r\u001b[K |████████████████████████████████| 61kB 1.8MB/s \n", 63 | "\u001b[?25hRequirement already satisfied: numpy>=1.15 in /usr/local/lib/python3.6/dist-packages (from hummingbird-ml[extra]) (1.18.5)\n", 64 | "Requirement already satisfied: torch>=1.4.* in /usr/local/lib/python3.6/dist-packages (from hummingbird-ml[extra]) (1.6.0+cu101)\n", 65 | "Collecting onnxconverter-common>=1.6.0\n", 66 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/fe/7a/7e30c643cd7d2ad87689188ef34ce93e657bd14da3605f87bcdbc19cd5b1/onnxconverter_common-1.7.0-py2.py3-none-any.whl (64kB)\n", 67 | "\u001b[K |████████████████████████████████| 71kB 3.7MB/s \n", 68 | "\u001b[?25hRequirement already satisfied: scikit-learn>=0.22.1 in /usr/local/lib/python3.6/dist-packages (from hummingbird-ml[extra]) (0.22.2.post1)\n", 69 | "Requirement already satisfied: xgboost==0.90; extra == \"extra\" in /usr/local/lib/python3.6/dist-packages (from hummingbird-ml[extra]) (0.90)\n", 70 | "Requirement already satisfied: lightgbm>=2.2; extra == \"extra\" in /usr/local/lib/python3.6/dist-packages (from hummingbird-ml[extra]) (2.2.3)\n", 71 | "Requirement already satisfied: future in /usr/local/lib/python3.6/dist-packages (from torch>=1.4.*->hummingbird-ml[extra]) (0.16.0)\n", 72 | "Collecting onnx\n", 73 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/36/ee/bc7bc88fc8449266add978627e90c363069211584b937fd867b0ccc59f09/onnx-1.7.0-cp36-cp36m-manylinux1_x86_64.whl (7.4MB)\n", 74 | "\u001b[K |████████████████████████████████| 7.4MB 16.0MB/s \n", 75 | "\u001b[?25hRequirement already satisfied: protobuf in /usr/local/lib/python3.6/dist-packages (from onnxconverter-common>=1.6.0->hummingbird-ml[extra]) (3.12.4)\n", 76 | "Requirement already satisfied: joblib>=0.11 in /usr/local/lib/python3.6/dist-packages (from scikit-learn>=0.22.1->hummingbird-ml[extra]) (0.16.0)\n", 77 | "Requirement already satisfied: scipy>=0.17.0 in /usr/local/lib/python3.6/dist-packages (from scikit-learn>=0.22.1->hummingbird-ml[extra]) (1.4.1)\n", 78 | "Requirement already satisfied: typing-extensions>=3.6.2.1 in /usr/local/lib/python3.6/dist-packages (from onnx->onnxconverter-common>=1.6.0->hummingbird-ml[extra]) 
(3.7.4.3)\n", 79 | "Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from onnx->onnxconverter-common>=1.6.0->hummingbird-ml[extra]) (1.15.0)\n", 80 | "Requirement already satisfied: setuptools in /usr/local/lib/python3.6/dist-packages (from protobuf->onnxconverter-common>=1.6.0->hummingbird-ml[extra]) (49.6.0)\n", 81 | "Installing collected packages: onnx, onnxconverter-common, hummingbird-ml\n", 82 | "Successfully installed hummingbird-ml-0.0.5 onnx-1.7.0 onnxconverter-common-1.7.0\n" 83 | ], 84 | "name": "stdout" 85 | } 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": { 91 | "id": "YnA-PmeA_q70", 92 | "colab_type": "text" 93 | }, 94 | "source": [ 95 | "# Import libraries" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "metadata": { 101 | "id": "lkIThThi_puf", 102 | "colab_type": "code", 103 | "colab": {} 104 | }, 105 | "source": [ 106 | "import numpy as np\n", 107 | "from sklearn.ensemble import RandomForestClassifier\n", 108 | "from hummingbird.ml import convert" 109 | ], 110 | "execution_count": 2, 111 | "outputs": [] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "metadata": { 116 | "id": "rFw_4cGa_-tF", 117 | "colab_type": "text" 118 | }, 119 | "source": [ 120 | "# Create some random data for binary classification" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "metadata": { 126 | "id": "hGGngPPp__mx", 127 | "colab_type": "code", 128 | "colab": {} 129 | }, 130 | "source": [ 131 | "num_classes = 2\n", 132 | "X = np.random.rand(100000, 28)\n", 133 | "y = np.random.randint(num_classes, size=100000)" 134 | ], 135 | "execution_count": 3, 136 | "outputs": [] 137 | }, 138 | { 139 | "cell_type": "markdown", 140 | "metadata": { 141 | "id": "WusxNKH4AHII", 142 | "colab_type": "text" 143 | }, 144 | "source": [ 145 | "# Create and train a model (scikit-learn RandomForestClassifier)" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "metadata": { 151 | "id": "GMRJRuBwAGeV", 152 | "colab_type": "code", 153 | "colab": {} 154 | }, 155 | "source": [ 156 | "skl_model = RandomForestClassifier(n_estimators=10, max_depth=10)" 157 | ], 158 | "execution_count": 4, 159 | "outputs": [] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "metadata": { 164 | "id": "M_kGo80yAYTn", 165 | "colab_type": "code", 166 | "colab": { 167 | "base_uri": "https://localhost:8080/", 168 | "height": 34 169 | }, 170 | "outputId": "aa863652-02f8-4578-8fb7-e3b028685cd7" 171 | }, 172 | "source": [ 173 | "%%timeit\n", 174 | "skl_model.fit(X, y)" 175 | ], 176 | "execution_count": 5, 177 | "outputs": [ 178 | { 179 | "output_type": "stream", 180 | "text": [ 181 | "1 loop, best of 3: 4.78 s per loop\n" 182 | ], 183 | "name": "stdout" 184 | } 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "metadata": { 190 | "id": "Hp4a8I0tAbBl", 191 | "colab_type": "code", 192 | "colab": { 193 | "base_uri": "https://localhost:8080/", 194 | "height": 34 195 | }, 196 | "outputId": "4e083fd5-981f-4238-9158-3f4500585560" 197 | }, 198 | "source": [ 199 | "%%timeit\n", 200 | "skl_model.predict(X)" 201 | ], 202 | "execution_count": 6, 203 | "outputs": [ 204 | { 205 | "output_type": "stream", 206 | "text": [ 207 | "10 loops, best of 3: 85.6 ms per loop\n" 208 | ], 209 | "name": "stdout" 210 | } 211 | ] 212 | }, 213 | { 214 | "cell_type": "markdown", 215 | "metadata": { 216 | "id": "mNiBvy9BA7wR", 217 | "colab_type": "text" 218 | }, 219 | "source": [ 220 | "# Use Hummingbird to convert the model to PyTorch" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | 
"metadata": { 226 | "id": "vcAOpuxxAzPc", 227 | "colab_type": "code", 228 | "colab": {} 229 | }, 230 | "source": [ 231 | "model = convert(skl_model, 'pytorch')" 232 | ], 233 | "execution_count": 7, 234 | "outputs": [] 235 | }, 236 | { 237 | "cell_type": "markdown", 238 | "metadata": { 239 | "id": "dpt6_4l8BF7e", 240 | "colab_type": "text" 241 | }, 242 | "source": [ 243 | "# Run predictions on CPU" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "metadata": { 249 | "id": "_BiU63hNBDu-", 250 | "colab_type": "code", 251 | "colab": { 252 | "base_uri": "https://localhost:8080/", 253 | "height": 34 254 | }, 255 | "outputId": "1bd8b158-a62b-4fe0-be09-ca382c817247" 256 | }, 257 | "source": [ 258 | "%%timeit\n", 259 | "model.predict(X)" 260 | ], 261 | "execution_count": 8, 262 | "outputs": [ 263 | { 264 | "output_type": "stream", 265 | "text": [ 266 | "1 loop, best of 3: 174 ms per loop\n" 267 | ], 268 | "name": "stdout" 269 | } 270 | ] 271 | }, 272 | { 273 | "cell_type": "markdown", 274 | "metadata": { 275 | "id": "F10tJEMKBPZG", 276 | "colab_type": "text" 277 | }, 278 | "source": [ 279 | "# Run predictions on GPU" 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "metadata": { 285 | "id": "l2PUbqoHBJBX", 286 | "colab_type": "code", 287 | "colab": {} 288 | }, 289 | "source": [ 290 | "model.to('cuda')" 291 | ], 292 | "execution_count": 9, 293 | "outputs": [] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "metadata": { 298 | "id": "-AB23_VTBRMP", 299 | "colab_type": "code", 300 | "colab": { 301 | "base_uri": "https://localhost:8080/", 302 | "height": 51 303 | }, 304 | "outputId": "b9efea7d-913c-4326-c14a-6b6ca0e9c063" 305 | }, 306 | "source": [ 307 | "%%timeit\n", 308 | "model.predict(X)" 309 | ], 310 | "execution_count": 10, 311 | "outputs": [ 312 | { 313 | "output_type": "stream", 314 | "text": [ 315 | "The slowest run took 5.22 times longer than the fastest. 
This could mean that an intermediate result is being cached.\n", 316 | "100 loops, best of 3: 14.8 ms per loop\n" 317 | ], 318 | "name": "stdout" 319 | } 320 | ] 321 | }, 322 | { 323 | "cell_type": "markdown", 324 | "metadata": { 325 | "id": "dbkQU69JDt7T", 326 | "colab_type": "text" 327 | }, 328 | "source": [ 329 | "# Calculation Time" 330 | ] 331 | }, 332 | { 333 | "cell_type": "markdown", 334 | "metadata": { 335 | "id": "Hr1R_9nwDwpc", 336 | "colab_type": "text" 337 | }, 338 | "source": [ 339 | "Methods | Timing | Performance\n", 340 | "--|--|--\n", 341 | "scikit-learn | 85.6 ms | -\n", 342 | "PyTorch (CPU) | 174 ms | 2 X slower than scikit-learn\n", 343 | "PyTorch (GPU) | 14.8 ms | Almost 6 X faster than scikit-learn; Almost 12 X faster than PyTorch (CPU)" 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "metadata": { 349 | "id": "9lmR3LHoEzhl", 350 | "colab_type": "code", 351 | "colab": {} 352 | }, 353 | "source": [ 354 | "" 355 | ], 356 | "execution_count": null, 357 | "outputs": [] 358 | } 359 | ] 360 | } -------------------------------------------------------------------------------- /python/google_colab_install_conda.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "conda-on-google-colab.ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [] 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | } 14 | }, 15 | "cells": [ 16 | { 17 | "cell_type": "code", 18 | "metadata": { 19 | "id": "uyfFc8VufUyl", 20 | "colab_type": "code", 21 | "outputId": "36b12856-fc68-40b8-c203-0fcf8ab7244e", 22 | "colab": { 23 | "base_uri": "https://localhost:8080/", 24 | "height": 1000 25 | } 26 | }, 27 | "source": [ 28 | "################################################################################\n", 29 | "# INSTALL CONDA ON GOOGLE COLAB\n", 30 | "################################################################################\n", 31 | "! wget https://repo.anaconda.com/miniconda/Miniconda3-py37_4.8.2-Linux-x86_64.sh\n", 32 | "! chmod +x Miniconda3-py37_4.8.2-Linux-x86_64.sh\n", 33 | "! bash ./Miniconda3-py37_4.8.2-Linux-x86_64.sh -b -f -p /usr/local\n", 34 | "import sys\n", 35 | "sys.path.append('/usr/local/lib/python3.7/site-packages/')" 36 | ], 37 | "execution_count": 0, 38 | "outputs": [ 39 | { 40 | "output_type": "stream", 41 | "text": [ 42 | "--2020-04-06 03:23:37-- https://repo.anaconda.com/miniconda/Miniconda3-py37_4.8.2-Linux-x86_64.sh\n", 43 | "Resolving repo.anaconda.com (repo.anaconda.com)... 104.16.130.3, 104.16.131.3, 2606:4700::6810:8303, ...\n", 44 | "Connecting to repo.anaconda.com (repo.anaconda.com)|104.16.130.3|:443... connected.\n", 45 | "HTTP request sent, awaiting response... 
200 OK\n", 46 | "Length: 85055499 (81M) [application/x-sh]\n", 47 | "Saving to: ‘Miniconda3-py37_4.8.2-Linux-x86_64.sh.2’\n", 48 | "\n", 49 | "\r Miniconda 0%[ ] 0 --.-KB/s \r Miniconda3 48%[========> ] 39.24M 196MB/s \r Miniconda3- 93%[=================> ] 76.01M 189MB/s \rMiniconda3-py37_4.8 100%[===================>] 81.12M 187MB/s in 0.4s \n", 50 | "\n", 51 | "2020-04-06 03:23:37 (187 MB/s) - ‘Miniconda3-py37_4.8.2-Linux-x86_64.sh.2’ saved [85055499/85055499]\n", 52 | "\n", 53 | "PREFIX=/usr/local\n", 54 | "Unpacking payload ...\n", 55 | "Collecting package metadata (current_repodata.json): - \b\b\\ \b\bdone\n", 56 | "Solving environment: / \b\b- \b\b\\ \n", 57 | "The environment is inconsistent, please check the package plan carefully\n", 58 | "The following packages are causing the inconsistency:\n", 59 | "\n", 60 | " - defaults/linux-64::urllib3==1.25.8=py37_0\n", 61 | " - defaults/linux-64::ruamel_yaml==0.15.87=py37h7b6447c_0\n", 62 | " - defaults/linux-64::pyopenssl==19.1.0=py37_0\n", 63 | " - defaults/linux-64::pysocks==1.7.1=py37_0\n", 64 | " - defaults/linux-64::six==1.14.0=py37_0\n", 65 | " - defaults/linux-64::setuptools==45.2.0=py37_0\n", 66 | " - defaults/linux-64::idna==2.8=py37_0\n", 67 | " - defaults/noarch::tqdm==4.42.1=py_0\n", 68 | " - defaults/linux-64::asn1crypto==1.3.0=py37_0\n", 69 | " - defaults/linux-64::cffi==1.14.0=py37h2e261b9_0\n", 70 | " - defaults/linux-64::wheel==0.34.2=py37_0\n", 71 | " - defaults/linux-64::conda-package-handling==1.6.0=py37h7b6447c_0\n", 72 | " - defaults/linux-64::pip==20.0.2=py37_1\n", 73 | " - defaults/linux-64::cryptography==2.8=py37h1ba5d50_0\n", 74 | " - defaults/linux-64::python==3.7.6=h0371630_2\n", 75 | " - defaults/linux-64::pycparser==2.19=py37_0\n", 76 | " - defaults/linux-64::pycosat==0.6.3=py37h7b6447c_0\n", 77 | " - defaults/linux-64::requests==2.22.0=py37_1\n", 78 | " - defaults/linux-64::chardet==3.0.4=py37_1003\n", 79 | "\b\b| \b\b/ \b\bdone\n", 80 | "\n", 81 | "## Package Plan ##\n", 82 | "\n", 83 | " environment location: /usr/local\n", 84 | "\n", 85 | " added / updated specs:\n", 86 | " - _libgcc_mutex==0.1=main\n", 87 | " - asn1crypto==1.3.0=py37_0\n", 88 | " - ca-certificates==2020.1.1=0\n", 89 | " - certifi==2019.11.28=py37_0\n", 90 | " - cffi==1.14.0=py37h2e261b9_0\n", 91 | " - chardet==3.0.4=py37_1003\n", 92 | " - conda-package-handling==1.6.0=py37h7b6447c_0\n", 93 | " - conda==4.8.2=py37_0\n", 94 | " - cryptography==2.8=py37h1ba5d50_0\n", 95 | " - idna==2.8=py37_0\n", 96 | " - ld_impl_linux-64==2.33.1=h53a641e_7\n", 97 | " - libedit==3.1.20181209=hc058e9b_0\n", 98 | " - libffi==3.2.1=hd88cf55_4\n", 99 | " - libgcc-ng==9.1.0=hdf63c60_0\n", 100 | " - libstdcxx-ng==9.1.0=hdf63c60_0\n", 101 | " - ncurses==6.2=he6710b0_0\n", 102 | " - openssl==1.1.1d=h7b6447c_4\n", 103 | " - pip==20.0.2=py37_1\n", 104 | " - pycosat==0.6.3=py37h7b6447c_0\n", 105 | " - pycparser==2.19=py37_0\n", 106 | " - pyopenssl==19.1.0=py37_0\n", 107 | " - pysocks==1.7.1=py37_0\n", 108 | " - python==3.7.6=h0371630_2\n", 109 | " - readline==7.0=h7b6447c_5\n", 110 | " - requests==2.22.0=py37_1\n", 111 | " - ruamel_yaml==0.15.87=py37h7b6447c_0\n", 112 | " - setuptools==45.2.0=py37_0\n", 113 | " - six==1.14.0=py37_0\n", 114 | " - sqlite==3.31.1=h7b6447c_0\n", 115 | " - tk==8.6.8=hbc83047_0\n", 116 | " - tqdm==4.42.1=py_0\n", 117 | " - urllib3==1.25.8=py37_0\n", 118 | " - wheel==0.34.2=py37_0\n", 119 | " - xz==5.2.4=h14c3975_4\n", 120 | " - yaml==0.1.7=had09818_2\n", 121 | " - zlib==1.2.11=h7b6447c_3\n", 122 | "\n", 123 | "\n", 124 | "The following 
NEW packages will be INSTALLED:\n", 125 | "\n", 126 | " certifi pkgs/main/linux-64::certifi-2019.11.28-py37_0\n", 127 | " conda pkgs/main/linux-64::conda-4.8.2-py37_0\n", 128 | " openssl pkgs/main/linux-64::openssl-1.1.1d-h7b6447c_4\n", 129 | "\n", 130 | "\n", 131 | "Preparing transaction: \\ \b\bdone\n", 132 | "Executing transaction: / \b\b- \b\bdone\n", 133 | "installation finished.\n", 134 | "WARNING:\n", 135 | " You currently have a PYTHONPATH environment variable set. This may cause\n", 136 | " unexpected behavior when running the Python interpreter in Miniconda3.\n", 137 | " For best results, please verify that your PYTHONPATH only points to\n", 138 | " directories of packages that are compatible with the Python interpreter\n", 139 | " in Miniconda3: /usr/local\n" 140 | ], 141 | "name": "stdout" 142 | } 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "metadata": { 148 | "id": "QD319lDvf6Xp", 149 | "colab_type": "code", 150 | "outputId": "71cc1953-4fd9-4f85-fc0a-5a3f7e7688bd", 151 | "colab": { 152 | "base_uri": "https://localhost:8080/", 153 | "height": 996 154 | } 155 | }, 156 | "source": [ 157 | "! conda install -c rdkit rdkit -y" 158 | ], 159 | "execution_count": 0, 160 | "outputs": [ 161 | { 162 | "output_type": "stream", 163 | "text": [ 164 | "Collecting package metadata (current_repodata.json): - \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\bdone\n", 165 | "Solving environment: - \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\bfailed with initial frozen solve. Retrying with flexible solve.\n", 166 | "Solving environment: / \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\bfailed with repodata from current_repodata.json, will retry with next repodata source.\n", 167 | "Collecting package metadata (repodata.json): - \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\bdone\n", 168 | "Solving environment: | \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\bdone\n", 169 | "\n", 170 | "## Package Plan ##\n", 171 | "\n", 172 | " environment location: /usr/local\n", 173 | "\n", 174 | " added / updated specs:\n", 175 | " - rdkit\n", 176 | "\n", 177 | "\n", 178 | "The following NEW packages will be INSTALLED:\n", 179 | "\n", 180 | " blas pkgs/main/linux-64::blas-1.0-mkl\n", 181 | " bzip2 pkgs/main/linux-64::bzip2-1.0.8-h7b6447c_0\n", 182 | " cairo pkgs/main/linux-64::cairo-1.14.12-h8948797_3\n", 183 | " fontconfig pkgs/main/linux-64::fontconfig-2.13.0-h9420a91_0\n", 184 | " freetype pkgs/main/linux-64::freetype-2.9.1-h8a8886c_1\n", 185 | " glib pkgs/main/linux-64::glib-2.63.1-h5a9c865_0\n", 186 | " icu pkgs/main/linux-64::icu-58.2-h9c2bf20_1\n", 187 | " intel-openmp pkgs/main/linux-64::intel-openmp-2020.0-166\n", 188 | " jpeg pkgs/main/linux-64::jpeg-9b-h024ee3a_2\n", 189 | " libboost pkgs/main/linux-64::libboost-1.67.0-h46d08c1_4\n", 190 | " libgfortran-ng pkgs/main/linux-64::libgfortran-ng-7.3.0-hdf63c60_0\n", 191 | " libpng pkgs/main/linux-64::libpng-1.6.37-hbc83047_0\n", 192 | " libtiff pkgs/main/linux-64::libtiff-4.1.0-h2733197_0\n", 193 | " libuuid 
pkgs/main/linux-64::libuuid-1.0.3-h1bed415_2\n", 194 | " libxcb pkgs/main/linux-64::libxcb-1.13-h1bed415_1\n", 195 | " libxml2 pkgs/main/linux-64::libxml2-2.9.9-hea5a465_1\n", 196 | " mkl pkgs/main/linux-64::mkl-2020.0-166\n", 197 | " mkl-service pkgs/main/linux-64::mkl-service-2.3.0-py37he904b0f_0\n", 198 | " mkl_fft pkgs/main/linux-64::mkl_fft-1.0.15-py37ha843d7b_0\n", 199 | " mkl_random pkgs/main/linux-64::mkl_random-1.1.0-py37hd6b4f25_0\n", 200 | " numpy pkgs/main/linux-64::numpy-1.18.1-py37h4f9e942_0\n", 201 | " numpy-base pkgs/main/linux-64::numpy-base-1.18.1-py37hde5b4d6_1\n", 202 | " olefile pkgs/main/linux-64::olefile-0.46-py37_0\n", 203 | " pandas pkgs/main/linux-64::pandas-1.0.3-py37h0573a6f_0\n", 204 | " pcre pkgs/main/linux-64::pcre-8.43-he6710b0_0\n", 205 | " pillow pkgs/main/linux-64::pillow-7.0.0-py37hb39fc2d_0\n", 206 | " pixman pkgs/main/linux-64::pixman-0.38.0-h7b6447c_0\n", 207 | " py-boost pkgs/main/linux-64::py-boost-1.67.0-py37h04863e7_4\n", 208 | " python-dateutil pkgs/main/noarch::python-dateutil-2.8.1-py_0\n", 209 | " pytz pkgs/main/noarch::pytz-2019.3-py_0\n", 210 | " rdkit rdkit/linux-64::rdkit-2020.03.1.0-py37hc20afe1_1\n", 211 | " zstd pkgs/main/linux-64::zstd-1.3.7-h0b5b093_0\n", 212 | "\n", 213 | "The following packages will be UPDATED:\n", 214 | "\n", 215 | " certifi 2019.11.28-py37_0 --> 2019.11.28-py37_1\n", 216 | " conda 4.8.2-py37_0 --> 4.8.3-py37_0\n", 217 | " openssl 1.1.1d-h7b6447c_4 --> 1.1.1f-h7b6447c_0\n", 218 | "\n", 219 | "\n", 220 | "Proceed ([y]/n)? " 221 | ], 222 | "name": "stdout" 223 | } 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "metadata": { 229 | "id": "M8MeOz0miqJ1", 230 | "colab_type": "code", 231 | "colab": {} 232 | }, 233 | "source": [ 234 | "" 235 | ], 236 | "execution_count": 0, 237 | "outputs": [] 238 | } 239 | ] 240 | } -------------------------------------------------------------------------------- /python/iris/iris-classification-random-forest.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Building a Classification Model for the Iris data set\n", 8 | "\n", 9 | "Chanin Nantasenamat\n", 10 | "\n", 11 | "Data Professor YouTube channel, http://youtube.com/dataprofessor \n", 12 | "\n", 13 | "In this Jupyter notebook, we will be building a classification model for the Iris data set using the random forest algorithm." 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "## 1. Import libraries" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 1, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "from sklearn import datasets\n", 30 | "from sklearn.model_selection import train_test_split\n", 31 | "from sklearn.ensemble import RandomForestClassifier\n", 32 | "from sklearn.datasets import make_classification" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": {}, 38 | "source": [ 39 | "## 2. Load the *iris* data set" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 2, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "iris = datasets.load_iris()" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "## 3. Input features\n", 56 | "The ***iris*** data set contains 4 input features and 1 output variable (the class label)." 
57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "### 3.1. Input features" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 3, 69 | "metadata": { 70 | "scrolled": true 71 | }, 72 | "outputs": [ 73 | { 74 | "name": "stdout", 75 | "output_type": "stream", 76 | "text": [ 77 | "['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']\n" 78 | ] 79 | } 80 | ], 81 | "source": [ 82 | "print(iris.feature_names)" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "metadata": {}, 88 | "source": [ 89 | "### 3.2. Output features" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 4, 95 | "metadata": {}, 96 | "outputs": [ 97 | { 98 | "name": "stdout", 99 | "output_type": "stream", 100 | "text": [ 101 | "['setosa' 'versicolor' 'virginica']\n" 102 | ] 103 | } 104 | ], 105 | "source": [ 106 | "print(iris.target_names)" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": {}, 112 | "source": [ 113 | "## 4. Glimpse of the data" 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "metadata": {}, 119 | "source": [ 120 | "### 4.1. Input features" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 5, 126 | "metadata": {}, 127 | "outputs": [ 128 | { 129 | "data": { 130 | "text/plain": [ 131 | "array([[5.1, 3.5, 1.4, 0.2],\n", 132 | " [4.9, 3. , 1.4, 0.2],\n", 133 | " [4.7, 3.2, 1.3, 0.2],\n", 134 | " [4.6, 3.1, 1.5, 0.2],\n", 135 | " [5. , 3.6, 1.4, 0.2],\n", 136 | " [5.4, 3.9, 1.7, 0.4],\n", 137 | " [4.6, 3.4, 1.4, 0.3],\n", 138 | " [5. , 3.4, 1.5, 0.2],\n", 139 | " [4.4, 2.9, 1.4, 0.2],\n", 140 | " [4.9, 3.1, 1.5, 0.1],\n", 141 | " [5.4, 3.7, 1.5, 0.2],\n", 142 | " [4.8, 3.4, 1.6, 0.2],\n", 143 | " [4.8, 3. , 1.4, 0.1],\n", 144 | " [4.3, 3. , 1.1, 0.1],\n", 145 | " [5.8, 4. , 1.2, 0.2],\n", 146 | " [5.7, 4.4, 1.5, 0.4],\n", 147 | " [5.4, 3.9, 1.3, 0.4],\n", 148 | " [5.1, 3.5, 1.4, 0.3],\n", 149 | " [5.7, 3.8, 1.7, 0.3],\n", 150 | " [5.1, 3.8, 1.5, 0.3],\n", 151 | " [5.4, 3.4, 1.7, 0.2],\n", 152 | " [5.1, 3.7, 1.5, 0.4],\n", 153 | " [4.6, 3.6, 1. , 0.2],\n", 154 | " [5.1, 3.3, 1.7, 0.5],\n", 155 | " [4.8, 3.4, 1.9, 0.2],\n", 156 | " [5. , 3. , 1.6, 0.2],\n", 157 | " [5. , 3.4, 1.6, 0.4],\n", 158 | " [5.2, 3.5, 1.5, 0.2],\n", 159 | " [5.2, 3.4, 1.4, 0.2],\n", 160 | " [4.7, 3.2, 1.6, 0.2],\n", 161 | " [4.8, 3.1, 1.6, 0.2],\n", 162 | " [5.4, 3.4, 1.5, 0.4],\n", 163 | " [5.2, 4.1, 1.5, 0.1],\n", 164 | " [5.5, 4.2, 1.4, 0.2],\n", 165 | " [4.9, 3.1, 1.5, 0.2],\n", 166 | " [5. , 3.2, 1.2, 0.2],\n", 167 | " [5.5, 3.5, 1.3, 0.2],\n", 168 | " [4.9, 3.6, 1.4, 0.1],\n", 169 | " [4.4, 3. , 1.3, 0.2],\n", 170 | " [5.1, 3.4, 1.5, 0.2],\n", 171 | " [5. , 3.5, 1.3, 0.3],\n", 172 | " [4.5, 2.3, 1.3, 0.3],\n", 173 | " [4.4, 3.2, 1.3, 0.2],\n", 174 | " [5. , 3.5, 1.6, 0.6],\n", 175 | " [5.1, 3.8, 1.9, 0.4],\n", 176 | " [4.8, 3. , 1.4, 0.3],\n", 177 | " [5.1, 3.8, 1.6, 0.2],\n", 178 | " [4.6, 3.2, 1.4, 0.2],\n", 179 | " [5.3, 3.7, 1.5, 0.2],\n", 180 | " [5. , 3.3, 1.4, 0.2],\n", 181 | " [7. , 3.2, 4.7, 1.4],\n", 182 | " [6.4, 3.2, 4.5, 1.5],\n", 183 | " [6.9, 3.1, 4.9, 1.5],\n", 184 | " [5.5, 2.3, 4. , 1.3],\n", 185 | " [6.5, 2.8, 4.6, 1.5],\n", 186 | " [5.7, 2.8, 4.5, 1.3],\n", 187 | " [6.3, 3.3, 4.7, 1.6],\n", 188 | " [4.9, 2.4, 3.3, 1. ],\n", 189 | " [6.6, 2.9, 4.6, 1.3],\n", 190 | " [5.2, 2.7, 3.9, 1.4],\n", 191 | " [5. , 2. , 3.5, 1. ],\n", 192 | " [5.9, 3. , 4.2, 1.5],\n", 193 | " [6. , 2.2, 4. , 1. 
],\n", 194 | " [6.1, 2.9, 4.7, 1.4],\n", 195 | " [5.6, 2.9, 3.6, 1.3],\n", 196 | " [6.7, 3.1, 4.4, 1.4],\n", 197 | " [5.6, 3. , 4.5, 1.5],\n", 198 | " [5.8, 2.7, 4.1, 1. ],\n", 199 | " [6.2, 2.2, 4.5, 1.5],\n", 200 | " [5.6, 2.5, 3.9, 1.1],\n", 201 | " [5.9, 3.2, 4.8, 1.8],\n", 202 | " [6.1, 2.8, 4. , 1.3],\n", 203 | " [6.3, 2.5, 4.9, 1.5],\n", 204 | " [6.1, 2.8, 4.7, 1.2],\n", 205 | " [6.4, 2.9, 4.3, 1.3],\n", 206 | " [6.6, 3. , 4.4, 1.4],\n", 207 | " [6.8, 2.8, 4.8, 1.4],\n", 208 | " [6.7, 3. , 5. , 1.7],\n", 209 | " [6. , 2.9, 4.5, 1.5],\n", 210 | " [5.7, 2.6, 3.5, 1. ],\n", 211 | " [5.5, 2.4, 3.8, 1.1],\n", 212 | " [5.5, 2.4, 3.7, 1. ],\n", 213 | " [5.8, 2.7, 3.9, 1.2],\n", 214 | " [6. , 2.7, 5.1, 1.6],\n", 215 | " [5.4, 3. , 4.5, 1.5],\n", 216 | " [6. , 3.4, 4.5, 1.6],\n", 217 | " [6.7, 3.1, 4.7, 1.5],\n", 218 | " [6.3, 2.3, 4.4, 1.3],\n", 219 | " [5.6, 3. , 4.1, 1.3],\n", 220 | " [5.5, 2.5, 4. , 1.3],\n", 221 | " [5.5, 2.6, 4.4, 1.2],\n", 222 | " [6.1, 3. , 4.6, 1.4],\n", 223 | " [5.8, 2.6, 4. , 1.2],\n", 224 | " [5. , 2.3, 3.3, 1. ],\n", 225 | " [5.6, 2.7, 4.2, 1.3],\n", 226 | " [5.7, 3. , 4.2, 1.2],\n", 227 | " [5.7, 2.9, 4.2, 1.3],\n", 228 | " [6.2, 2.9, 4.3, 1.3],\n", 229 | " [5.1, 2.5, 3. , 1.1],\n", 230 | " [5.7, 2.8, 4.1, 1.3],\n", 231 | " [6.3, 3.3, 6. , 2.5],\n", 232 | " [5.8, 2.7, 5.1, 1.9],\n", 233 | " [7.1, 3. , 5.9, 2.1],\n", 234 | " [6.3, 2.9, 5.6, 1.8],\n", 235 | " [6.5, 3. , 5.8, 2.2],\n", 236 | " [7.6, 3. , 6.6, 2.1],\n", 237 | " [4.9, 2.5, 4.5, 1.7],\n", 238 | " [7.3, 2.9, 6.3, 1.8],\n", 239 | " [6.7, 2.5, 5.8, 1.8],\n", 240 | " [7.2, 3.6, 6.1, 2.5],\n", 241 | " [6.5, 3.2, 5.1, 2. ],\n", 242 | " [6.4, 2.7, 5.3, 1.9],\n", 243 | " [6.8, 3. , 5.5, 2.1],\n", 244 | " [5.7, 2.5, 5. , 2. ],\n", 245 | " [5.8, 2.8, 5.1, 2.4],\n", 246 | " [6.4, 3.2, 5.3, 2.3],\n", 247 | " [6.5, 3. , 5.5, 1.8],\n", 248 | " [7.7, 3.8, 6.7, 2.2],\n", 249 | " [7.7, 2.6, 6.9, 2.3],\n", 250 | " [6. , 2.2, 5. , 1.5],\n", 251 | " [6.9, 3.2, 5.7, 2.3],\n", 252 | " [5.6, 2.8, 4.9, 2. ],\n", 253 | " [7.7, 2.8, 6.7, 2. ],\n", 254 | " [6.3, 2.7, 4.9, 1.8],\n", 255 | " [6.7, 3.3, 5.7, 2.1],\n", 256 | " [7.2, 3.2, 6. , 1.8],\n", 257 | " [6.2, 2.8, 4.8, 1.8],\n", 258 | " [6.1, 3. , 4.9, 1.8],\n", 259 | " [6.4, 2.8, 5.6, 2.1],\n", 260 | " [7.2, 3. , 5.8, 1.6],\n", 261 | " [7.4, 2.8, 6.1, 1.9],\n", 262 | " [7.9, 3.8, 6.4, 2. ],\n", 263 | " [6.4, 2.8, 5.6, 2.2],\n", 264 | " [6.3, 2.8, 5.1, 1.5],\n", 265 | " [6.1, 2.6, 5.6, 1.4],\n", 266 | " [7.7, 3. , 6.1, 2.3],\n", 267 | " [6.3, 3.4, 5.6, 2.4],\n", 268 | " [6.4, 3.1, 5.5, 1.8],\n", 269 | " [6. , 3. , 4.8, 1.8],\n", 270 | " [6.9, 3.1, 5.4, 2.1],\n", 271 | " [6.7, 3.1, 5.6, 2.4],\n", 272 | " [6.9, 3.1, 5.1, 2.3],\n", 273 | " [5.8, 2.7, 5.1, 1.9],\n", 274 | " [6.8, 3.2, 5.9, 2.3],\n", 275 | " [6.7, 3.3, 5.7, 2.5],\n", 276 | " [6.7, 3. , 5.2, 2.3],\n", 277 | " [6.3, 2.5, 5. , 1.9],\n", 278 | " [6.5, 3. , 5.2, 2. ],\n", 279 | " [6.2, 3.4, 5.4, 2.3],\n", 280 | " [5.9, 3. , 5.1, 1.8]])" 281 | ] 282 | }, 283 | "execution_count": 5, 284 | "metadata": {}, 285 | "output_type": "execute_result" 286 | } 287 | ], 288 | "source": [ 289 | "iris.data" 290 | ] 291 | }, 292 | { 293 | "cell_type": "markdown", 294 | "metadata": {}, 295 | "source": [ 296 | "### 4.2. 
Output variable (the Class label)" 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": 30, 302 | "metadata": {}, 303 | "outputs": [ 304 | { 305 | "data": { 306 | "text/plain": [ 307 | "array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 308 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 309 | " 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", 310 | " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", 311 | " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\n", 312 | " 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\n", 313 | " 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])" 314 | ] 315 | }, 316 | "execution_count": 30, 317 | "metadata": {}, 318 | "output_type": "execute_result" 319 | } 320 | ], 321 | "source": [ 322 | "iris.target" 323 | ] 324 | }, 325 | { 326 | "cell_type": "markdown", 327 | "metadata": {}, 328 | "source": [ 329 | "### 4.3. Assigning *input* and *output* variables\n", 330 | "Let's assign the 4 input variables to X and the output variable (class label) to Y" 331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "execution_count": 9, 336 | "metadata": {}, 337 | "outputs": [], 338 | "source": [ 339 | "X = iris.data\n", 340 | "Y = iris.target" 341 | ] 342 | }, 343 | { 344 | "cell_type": "markdown", 345 | "metadata": {}, 346 | "source": [ 347 | "### 4.3. Let's examine the data dimension" 348 | ] 349 | }, 350 | { 351 | "cell_type": "code", 352 | "execution_count": 10, 353 | "metadata": {}, 354 | "outputs": [ 355 | { 356 | "data": { 357 | "text/plain": [ 358 | "(150, 4)" 359 | ] 360 | }, 361 | "execution_count": 10, 362 | "metadata": {}, 363 | "output_type": "execute_result" 364 | } 365 | ], 366 | "source": [ 367 | "X.shape" 368 | ] 369 | }, 370 | { 371 | "cell_type": "code", 372 | "execution_count": 11, 373 | "metadata": {}, 374 | "outputs": [ 375 | { 376 | "data": { 377 | "text/plain": [ 378 | "(150,)" 379 | ] 380 | }, 381 | "execution_count": 11, 382 | "metadata": {}, 383 | "output_type": "execute_result" 384 | } 385 | ], 386 | "source": [ 387 | "Y.shape" 388 | ] 389 | }, 390 | { 391 | "cell_type": "markdown", 392 | "metadata": {}, 393 | "source": [ 394 | "## 5. Build Classification Model using Random Forest" 395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "execution_count": 9, 400 | "metadata": {}, 401 | "outputs": [], 402 | "source": [ 403 | "clf = RandomForestClassifier()" 404 | ] 405 | }, 406 | { 407 | "cell_type": "code", 408 | "execution_count": 10, 409 | "metadata": {}, 410 | "outputs": [ 411 | { 412 | "data": { 413 | "text/plain": [ 414 | "RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,\n", 415 | " criterion='gini', max_depth=None, max_features='auto',\n", 416 | " max_leaf_nodes=None, max_samples=None,\n", 417 | " min_impurity_decrease=0.0, min_impurity_split=None,\n", 418 | " min_samples_leaf=1, min_samples_split=2,\n", 419 | " min_weight_fraction_leaf=0.0, n_estimators=100,\n", 420 | " n_jobs=None, oob_score=False, random_state=None,\n", 421 | " verbose=0, warm_start=False)" 422 | ] 423 | }, 424 | "execution_count": 10, 425 | "metadata": {}, 426 | "output_type": "execute_result" 427 | } 428 | ], 429 | "source": [ 430 | "clf.fit(X, Y)" 431 | ] 432 | }, 433 | { 434 | "cell_type": "markdown", 435 | "metadata": {}, 436 | "source": [ 437 | "## 6. 
Feature Importance" 438 | ] 439 | }, 440 | { 441 | "cell_type": "code", 442 | "execution_count": 11, 443 | "metadata": {}, 444 | "outputs": [ 445 | { 446 | "name": "stdout", 447 | "output_type": "stream", 448 | "text": [ 449 | "[0.07344346 0.01623453 0.42869861 0.4816234 ]\n" 450 | ] 451 | } 452 | ], 453 | "source": [ 454 | "print(clf.feature_importances_)" 455 | ] 456 | }, 457 | { 458 | "cell_type": "markdown", 459 | "metadata": {}, 460 | "source": [ 461 | "## 7. Make Prediction" 462 | ] 463 | }, 464 | { 465 | "cell_type": "code", 466 | "execution_count": 12, 467 | "metadata": {}, 468 | "outputs": [ 469 | { 470 | "data": { 471 | "text/plain": [ 472 | "array([5.1, 3.5, 1.4, 0.2])" 473 | ] 474 | }, 475 | "execution_count": 12, 476 | "metadata": {}, 477 | "output_type": "execute_result" 478 | } 479 | ], 480 | "source": [ 481 | "X[0]" 482 | ] 483 | }, 484 | { 485 | "cell_type": "code", 486 | "execution_count": 13, 487 | "metadata": {}, 488 | "outputs": [ 489 | { 490 | "name": "stdout", 491 | "output_type": "stream", 492 | "text": [ 493 | "[0]\n" 494 | ] 495 | } 496 | ], 497 | "source": [ 498 | "print(clf.predict([[5.1, 3.5, 1.4, 0.2]]))" 499 | ] 500 | }, 501 | { 502 | "cell_type": "code", 503 | "execution_count": 14, 504 | "metadata": {}, 505 | "outputs": [ 506 | { 507 | "name": "stdout", 508 | "output_type": "stream", 509 | "text": [ 510 | "[0]\n" 511 | ] 512 | } 513 | ], 514 | "source": [ 515 | "print(clf.predict(X[[0]]))" 516 | ] 517 | }, 518 | { 519 | "cell_type": "code", 520 | "execution_count": 15, 521 | "metadata": {}, 522 | "outputs": [ 523 | { 524 | "name": "stdout", 525 | "output_type": "stream", 526 | "text": [ 527 | "[[1. 0. 0.]]\n" 528 | ] 529 | } 530 | ], 531 | "source": [ 532 | "print(clf.predict_proba(X[[0]]))" 533 | ] 534 | }, 535 | { 536 | "cell_type": "code", 537 | "execution_count": 16, 538 | "metadata": {}, 539 | "outputs": [ 540 | { 541 | "data": { 542 | "text/plain": [ 543 | "RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,\n", 544 | " criterion='gini', max_depth=None, max_features='auto',\n", 545 | " max_leaf_nodes=None, max_samples=None,\n", 546 | " min_impurity_decrease=0.0, min_impurity_split=None,\n", 547 | " min_samples_leaf=1, min_samples_split=2,\n", 548 | " min_weight_fraction_leaf=0.0, n_estimators=100,\n", 549 | " n_jobs=None, oob_score=False, random_state=None,\n", 550 | " verbose=0, warm_start=False)" 551 | ] 552 | }, 553 | "execution_count": 16, 554 | "metadata": {}, 555 | "output_type": "execute_result" 556 | } 557 | ], 558 | "source": [ 559 | "clf.fit(iris.data, iris.target_names[iris.target])" 560 | ] 561 | }, 562 | { 563 | "cell_type": "markdown", 564 | "metadata": {}, 565 | "source": [ 566 | "## 8. 
Data split (80/20 ratio)" 567 | ] 568 | }, 569 | { 570 | "cell_type": "code", 571 | "execution_count": 17, 572 | "metadata": {}, 573 | "outputs": [], 574 | "source": [ 575 | "X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)" 576 | ] 577 | }, 578 | { 579 | "cell_type": "code", 580 | "execution_count": 18, 581 | "metadata": {}, 582 | "outputs": [ 583 | { 584 | "data": { 585 | "text/plain": [ 586 | "((120, 4), (120,))" 587 | ] 588 | }, 589 | "execution_count": 18, 590 | "metadata": {}, 591 | "output_type": "execute_result" 592 | } 593 | ], 594 | "source": [ 595 | "X_train.shape, Y_train.shape" 596 | ] 597 | }, 598 | { 599 | "cell_type": "code", 600 | "execution_count": 19, 601 | "metadata": {}, 602 | "outputs": [ 603 | { 604 | "data": { 605 | "text/plain": [ 606 | "((30, 4), (30,))" 607 | ] 608 | }, 609 | "execution_count": 19, 610 | "metadata": {}, 611 | "output_type": "execute_result" 612 | } 613 | ], 614 | "source": [ 615 | "X_test.shape, Y_test.shape" 616 | ] 617 | }, 618 | { 619 | "cell_type": "markdown", 620 | "metadata": {}, 621 | "source": [ 622 | "## 9. Rebuild the Random Forest Model" 623 | ] 624 | }, 625 | { 626 | "cell_type": "code", 627 | "execution_count": 20, 628 | "metadata": {}, 629 | "outputs": [ 630 | { 631 | "data": { 632 | "text/plain": [ 633 | "RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,\n", 634 | " criterion='gini', max_depth=None, max_features='auto',\n", 635 | " max_leaf_nodes=None, max_samples=None,\n", 636 | " min_impurity_decrease=0.0, min_impurity_split=None,\n", 637 | " min_samples_leaf=1, min_samples_split=2,\n", 638 | " min_weight_fraction_leaf=0.0, n_estimators=100,\n", 639 | " n_jobs=None, oob_score=False, random_state=None,\n", 640 | " verbose=0, warm_start=False)" 641 | ] 642 | }, 643 | "execution_count": 20, 644 | "metadata": {}, 645 | "output_type": "execute_result" 646 | } 647 | ], 648 | "source": [ 649 | "clf.fit(X_train, Y_train)" 650 | ] 651 | }, 652 | { 653 | "cell_type": "markdown", 654 | "metadata": {}, 655 | "source": [ 656 | "### 9.1. Performs prediction on single sample from the data set" 657 | ] 658 | }, 659 | { 660 | "cell_type": "code", 661 | "execution_count": 21, 662 | "metadata": {}, 663 | "outputs": [ 664 | { 665 | "name": "stdout", 666 | "output_type": "stream", 667 | "text": [ 668 | "[0]\n" 669 | ] 670 | } 671 | ], 672 | "source": [ 673 | "print(clf.predict([[5.1, 3.5, 1.4, 0.2]]))" 674 | ] 675 | }, 676 | { 677 | "cell_type": "code", 678 | "execution_count": 22, 679 | "metadata": {}, 680 | "outputs": [ 681 | { 682 | "name": "stdout", 683 | "output_type": "stream", 684 | "text": [ 685 | "[[1. 0. 0.]]\n" 686 | ] 687 | } 688 | ], 689 | "source": [ 690 | "print(clf.predict_proba([[5.1, 3.5, 1.4, 0.2]]))" 691 | ] 692 | }, 693 | { 694 | "cell_type": "markdown", 695 | "metadata": {}, 696 | "source": [ 697 | "### 9.2. 
Performs prediction on the test set" 698 | ] 699 | }, 700 | { 701 | "cell_type": "markdown", 702 | "metadata": {}, 703 | "source": [ 704 | "#### *Predicted class labels*" 705 | ] 706 | }, 707 | { 708 | "cell_type": "code", 709 | "execution_count": 23, 710 | "metadata": {}, 711 | "outputs": [ 712 | { 713 | "name": "stdout", 714 | "output_type": "stream", 715 | "text": [ 716 | "[2 1 0 1 1 2 1 0 1 0 2 1 1 1 1 1 1 2 2 0 0 2 0 0 0 1 1 1 1 0]\n" 717 | ] 718 | } 719 | ], 720 | "source": [ 721 | "print(clf.predict(X_test))" 722 | ] 723 | }, 724 | { 725 | "cell_type": "markdown", 726 | "metadata": {}, 727 | "source": [ 728 | "#### *Actual class labels*" 729 | ] 730 | }, 731 | { 732 | "cell_type": "code", 733 | "execution_count": 24, 734 | "metadata": {}, 735 | "outputs": [ 736 | { 737 | "name": "stdout", 738 | "output_type": "stream", 739 | "text": [ 740 | "[2 1 0 1 1 2 1 0 1 0 2 1 2 1 1 2 2 2 2 0 0 2 0 0 0 1 1 1 1 0]\n" 741 | ] 742 | } 743 | ], 744 | "source": [ 745 | "print(Y_test)" 746 | ] 747 | }, 748 | { 749 | "cell_type": "markdown", 750 | "metadata": {}, 751 | "source": [ 752 | "## 10. Model Performance" 753 | ] 754 | }, 755 | { 756 | "cell_type": "code", 757 | "execution_count": 25, 758 | "metadata": {}, 759 | "outputs": [ 760 | { 761 | "name": "stdout", 762 | "output_type": "stream", 763 | "text": [ 764 | "0.9\n" 765 | ] 766 | } 767 | ], 768 | "source": [ 769 | "print(clf.score(X_test, Y_test))" 770 | ] 771 | } 772 | ], 773 | "metadata": { 774 | "kernelspec": { 775 | "display_name": "Python 3", 776 | "language": "python", 777 | "name": "python3" 778 | }, 779 | "language_info": { 780 | "codemirror_mode": { 781 | "name": "ipython", 782 | "version": 3 783 | }, 784 | "file_extension": ".py", 785 | "mimetype": "text/x-python", 786 | "name": "python", 787 | "nbconvert_exporter": "python", 788 | "pygments_lexer": "ipython3", 789 | "version": "3.7.6" 790 | } 791 | }, 792 | "nbformat": 4, 793 | "nbformat_minor": 4 794 | } 795 | -------------------------------------------------------------------------------- /python/model_is_training_progress_bar.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "zzD4-HxqXBmt" 7 | }, 8 | "source": [ 9 | "# **Progress Bar in Jupyter Notebook**\n", 10 | "\n", 11 | "Chanin Nantasenamat\n", 12 | "\n", 13 | "**Data Professor YouTube channel**, http://youtube.com/dataprofessor" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": { 19 | "id": "An7XU557Y5ci" 20 | }, 21 | "source": [ 22 | "# **Progress Bar with the tqdm library**" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "metadata": { 29 | "id": "3yc04janmetd" 30 | }, 31 | "outputs": [], 32 | "source": [ 33 | "# ! 
pip install tqdm" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 1, 39 | "metadata": { 40 | "id": "gxa8jup1DNjt" 41 | }, 42 | "outputs": [], 43 | "source": [ 44 | "from tqdm.notebook import tqdm\n", 45 | "from time import sleep" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 2, 51 | "metadata": { 52 | "id": "009bdoXCE74q" 53 | }, 54 | "outputs": [ 55 | { 56 | "data": { 57 | "application/vnd.jupyter.widget-view+json": { 58 | "model_id": "93cc2d7933af4faf96fda14e55f24e23", 59 | "version_major": 2, 60 | "version_minor": 0 61 | }, 62 | "text/plain": [ 63 | " 0%| | 0/100 [00:00] 56.03K --.-KB/s in 0.01s \n", 71 | "\n", 72 | "2020-05-03 17:41:00 (3.82 MB/s) - ‘delaney_solubility_with_descriptors.csv’ saved [57370/57370]\n", 73 | "\n" 74 | ], 75 | "name": "stdout" 76 | } 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "metadata": { 82 | "id": "qJ9j-V_zHwm8", 83 | "colab_type": "code", 84 | "colab": {} 85 | }, 86 | "source": [ 87 | "import pandas as pd" 88 | ], 89 | "execution_count": 0, 90 | "outputs": [] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "metadata": { 95 | "id": "D6twMV5THz2r", 96 | "colab_type": "code", 97 | "colab": { 98 | "base_uri": "https://localhost:8080/", 99 | "height": 415 100 | }, 101 | "outputId": "3230bda2-d61e-48f6-f232-0dabecc08908" 102 | }, 103 | "source": [ 104 | "df = pd.read_csv('delaney_solubility_with_descriptors.csv')\n", 105 | "df" 106 | ], 107 | "execution_count": 12, 108 | "outputs": [ 109 | { 110 | "output_type": "execute_result", 111 | "data": { 112 | "text/html": [ 113 | "
" 231 | ], 232 | "text/plain": [ 233 | " MolLogP MolWt NumRotatableBonds AromaticProportion logS\n", 234 | "0 2.59540 167.850 0.0 0.000000 -2.180\n", 235 | "1 2.37650 133.405 0.0 0.000000 -2.000\n", 236 | "2 2.59380 167.850 1.0 0.000000 -1.740\n", 237 | "3 2.02890 133.405 1.0 0.000000 -1.480\n", 238 | "4 2.91890 187.375 1.0 0.000000 -3.040\n", 239 | "... ... ... ... ... ...\n", 240 | "1139 1.98820 287.343 8.0 0.000000 1.144\n", 241 | "1140 3.42130 286.114 2.0 0.333333 -4.925\n", 242 | "1141 3.60960 308.333 4.0 0.695652 -3.893\n", 243 | "1142 2.56214 354.815 3.0 0.521739 -3.790\n", 244 | "1143 2.02164 179.219 1.0 0.461538 -2.581\n", 245 | "\n", 246 | "[1144 rows x 5 columns]" 247 | ] 248 | }, 249 | "metadata": { 250 | "tags": [] 251 | }, 252 | "execution_count": 12 253 | } 254 | ] 255 | }, 256 | { 257 | "cell_type": "markdown", 258 | "metadata": { 259 | "id": "uc36WETtHp77", 260 | "colab_type": "text" 261 | }, 262 | "source": [ 263 | "## **Selecting specific column(s)**" 264 | ] 265 | }, 266 | { 267 | "cell_type": "markdown", 268 | "metadata": { 269 | "id": "Ha6n6gH3IsPs", 270 | "colab_type": "text" 271 | }, 272 | "source": [ 273 | "### Selecting a single column" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "metadata": { 279 | "id": "94xKLaCoHpeY", 280 | "colab_type": "code", 281 | "colab": { 282 | "base_uri": "https://localhost:8080/", 283 | "height": 225 284 | }, 285 | "outputId": "7bc417a4-ff9e-43c4-f05a-7dc4eb5aa387" 286 | }, 287 | "source": [ 288 | "df.MolLogP" 289 | ], 290 | "execution_count": 13, 291 | "outputs": [ 292 | { 293 | "output_type": "execute_result", 294 | "data": { 295 | "text/plain": [ 296 | "0 2.59540\n", 297 | "1 2.37650\n", 298 | "2 2.59380\n", 299 | "3 2.02890\n", 300 | "4 2.91890\n", 301 | " ... \n", 302 | "1139 1.98820\n", 303 | "1140 3.42130\n", 304 | "1141 3.60960\n", 305 | "1142 2.56214\n", 306 | "1143 2.02164\n", 307 | "Name: MolLogP, Length: 1144, dtype: float64" 308 | ] 309 | }, 310 | "metadata": { 311 | "tags": [] 312 | }, 313 | "execution_count": 13 314 | } 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "metadata": { 320 | "id": "ua7cZKWzIP-J", 321 | "colab_type": "code", 322 | "colab": { 323 | "base_uri": "https://localhost:8080/", 324 | "height": 225 325 | }, 326 | "outputId": "3cff5ff9-f26a-43ea-9c53-7772d68c48e2" 327 | }, 328 | "source": [ 329 | "df['MolLogP']" 330 | ], 331 | "execution_count": 14, 332 | "outputs": [ 333 | { 334 | "output_type": "execute_result", 335 | "data": { 336 | "text/plain": [ 337 | "0 2.59540\n", 338 | "1 2.37650\n", 339 | "2 2.59380\n", 340 | "3 2.02890\n", 341 | "4 2.91890\n", 342 | " ... 
\n", 343 | "1139 1.98820\n", 344 | "1140 3.42130\n", 345 | "1141 3.60960\n", 346 | "1142 2.56214\n", 347 | "1143 2.02164\n", 348 | "Name: MolLogP, Length: 1144, dtype: float64" 349 | ] 350 | }, 351 | "metadata": { 352 | "tags": [] 353 | }, 354 | "execution_count": 14 355 | } 356 | ] 357 | }, 358 | { 359 | "cell_type": "markdown", 360 | "metadata": { 361 | "id": "Mb_loyXfIyhw", 362 | "colab_type": "text" 363 | }, 364 | "source": [ 365 | "### Selecting two or more columns" 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "metadata": { 371 | "id": "ZbLCqhFbIRfS", 372 | "colab_type": "code", 373 | "colab": { 374 | "base_uri": "https://localhost:8080/", 375 | "height": 415 376 | }, 377 | "outputId": "d1d61288-290a-4fce-db4b-e3fbf635ba4d" 378 | }, 379 | "source": [ 380 | "df[['MolLogP','NumRotatableBonds']]" 381 | ], 382 | "execution_count": 23, 383 | "outputs": [ 384 | { 385 | "output_type": "execute_result", 386 | "data": { 387 | "text/html": [ 388 | "
" 470 | ], 471 | "text/plain": [ 472 | " MolLogP NumRotatableBonds\n", 473 | "0 2.59540 0.0\n", 474 | "1 2.37650 0.0\n", 475 | "2 2.59380 1.0\n", 476 | "3 2.02890 1.0\n", 477 | "4 2.91890 1.0\n", 478 | "... ... ...\n", 479 | "1139 1.98820 8.0\n", 480 | "1140 3.42130 2.0\n", 481 | "1141 3.60960 4.0\n", 482 | "1142 2.56214 3.0\n", 483 | "1143 2.02164 1.0\n", 484 | "\n", 485 | "[1144 rows x 2 columns]" 486 | ] 487 | }, 488 | "metadata": { 489 | "tags": [] 490 | }, 491 | "execution_count": 23 492 | } 493 | ] 494 | }, 495 | { 496 | "cell_type": "code", 497 | "metadata": { 498 | "id": "qxMA09nEIV7e", 499 | "colab_type": "code", 500 | "colab": { 501 | "base_uri": "https://localhost:8080/", 502 | "height": 415 503 | }, 504 | "outputId": "6f1131df-d3ff-4735-96d5-166a11c03f56" 505 | }, 506 | "source": [ 507 | "df.iloc[:,[0,2]]" 508 | ], 509 | "execution_count": 22, 510 | "outputs": [ 511 | { 512 | "output_type": "execute_result", 513 | "data": { 514 | "text/html": [ 515 | "
" 597 | ], 598 | "text/plain": [ 599 | " MolLogP NumRotatableBonds\n", 600 | "0 2.59540 0.0\n", 601 | "1 2.37650 0.0\n", 602 | "2 2.59380 1.0\n", 603 | "3 2.02890 1.0\n", 604 | "4 2.91890 1.0\n", 605 | "... ... ...\n", 606 | "1139 1.98820 8.0\n", 607 | "1140 3.42130 2.0\n", 608 | "1141 3.60960 4.0\n", 609 | "1142 2.56214 3.0\n", 610 | "1143 2.02164 1.0\n", 611 | "\n", 612 | "[1144 rows x 2 columns]" 613 | ] 614 | }, 615 | "metadata": { 616 | "tags": [] 617 | }, 618 | "execution_count": 22 619 | } 620 | ] 621 | }, 622 | { 623 | "cell_type": "code", 624 | "metadata": { 625 | "id": "V53KSsZjId1X", 626 | "colab_type": "code", 627 | "colab": { 628 | "base_uri": "https://localhost:8080/", 629 | "height": 415 630 | }, 631 | "outputId": "81be4e24-2bac-4dd5-97d7-32ece84768c6" 632 | }, 633 | "source": [ 634 | "selection = ['MolLogP','NumRotatableBonds', 'logS']\n", 635 | "df[selection]" 636 | ], 637 | "execution_count": 25, 638 | "outputs": [ 639 | { 640 | "output_type": "execute_result", 641 | "data": { 642 | "text/html": [ 643 | "
" 737 | ], 738 | "text/plain": [ 739 | " MolLogP NumRotatableBonds logS\n", 740 | "0 2.59540 0.0 -2.180\n", 741 | "1 2.37650 0.0 -2.000\n", 742 | "2 2.59380 1.0 -1.740\n", 743 | "3 2.02890 1.0 -1.480\n", 744 | "4 2.91890 1.0 -3.040\n", 745 | "... ... ... ...\n", 746 | "1139 1.98820 8.0 1.144\n", 747 | "1140 3.42130 2.0 -4.925\n", 748 | "1141 3.60960 4.0 -3.893\n", 749 | "1142 2.56214 3.0 -3.790\n", 750 | "1143 2.02164 1.0 -2.581\n", 751 | "\n", 752 | "[1144 rows x 3 columns]" 753 | ] 754 | }, 755 | "metadata": { 756 | "tags": [] 757 | }, 758 | "execution_count": 25 759 | } 760 | ] 761 | }, 762 | { 763 | "cell_type": "code", 764 | "metadata": { 765 | "id": "rF85rPt1I9sY", 766 | "colab_type": "code", 767 | "colab": {} 768 | }, 769 | "source": [ 770 | "" 771 | ], 772 | "execution_count": 0, 773 | "outputs": [] 774 | } 775 | ] 776 | } -------------------------------------------------------------------------------- /python/r_magic_command.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "r-magic-command.ipynb", 7 | "provenance": [] 8 | }, 9 | "kernelspec": { 10 | "name": "python3", 11 | "display_name": "Python 3" 12 | } 13 | }, 14 | "cells": [ 15 | { 16 | "cell_type": "markdown", 17 | "metadata": { 18 | "id": "EnyONbNhCqSK", 19 | "colab_type": "text" 20 | }, 21 | "source": [ 22 | "# **Using R and Python in the Same Notebook**\n", 23 | "\n", 24 | "Chanin Nantasenamat\n", 25 | "\n", 26 | "[*'Data Professor' YouTube channel*](http://youtube.com/dataprofessor)\n", 27 | "\n", 28 | "In this Jupyter notebook, I will show you how to use R and Python in the same notebook.\n", 29 | "\n", 30 | "---" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "metadata": { 36 | "id": "2h-2I4CviFCR", 37 | "colab_type": "code", 38 | "colab": {} 39 | }, 40 | "source": [ 41 | "# activate R magic\n", 42 | "%load_ext rpy2.ipython" 43 | ], 44 | "execution_count": 0, 45 | "outputs": [] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": { 50 | "id": "FftFvPLNiZME", 51 | "colab_type": "text" 52 | }, 53 | "source": [ 54 | "## Python" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "metadata": { 60 | "id": "3hPnRI2piJM3", 61 | "colab_type": "code", 62 | "colab": {} 63 | }, 64 | "source": [ 65 | "import pandas as pd" 66 | ], 67 | "execution_count": 0, 68 | "outputs": [] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "metadata": { 73 | "id": "yNKM70-ZiPcg", 74 | "colab_type": "code", 75 | "colab": {} 76 | }, 77 | "source": [ 78 | "x <- 42\n", 79 | "print(x)" 80 | ], 81 | "execution_count": 0, 82 | "outputs": [] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "metadata": { 87 | "id": "dtkChhxpiWEd", 88 | "colab_type": "text" 89 | }, 90 | "source": [ 91 | "## R" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "metadata": { 97 | "id": "ozqbZ3lviTPj", 98 | "colab_type": "code", 99 | "colab": {} 100 | }, 101 | "source": [ 102 | "%%R\n", 103 | "x <- 42\n", 104 | "print(x)" 105 | ], 106 | "execution_count": 0, 107 | "outputs": [] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "metadata": { 112 | "id": "napTAYyXiU8r", 113 | "colab_type": "code", 114 | "colab": {} 115 | }, 116 | "source": [ 117 | "%%R\n", 118 | "install.packages('caret')\n", 119 | "install.packages('mlbench')" 120 | ], 121 | "execution_count": 0, 122 | "outputs": [] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "metadata": { 127 | "id": "4eB_IbK4kztb", 128 | "colab_type": "code", 129 | "colab": {} 130 | }, 
131 | "source": [ 132 | "%%R\n", 133 | "install.packages('mlbench')" 134 | ], 135 | "execution_count": 0, 136 | "outputs": [] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "metadata": { 141 | "id": "Bl0feNEUi-Jk", 142 | "colab_type": "code", 143 | "colab": {} 144 | }, 145 | "source": [ 146 | "%%R\n", 147 | "library(caret)" 148 | ], 149 | "execution_count": 0, 150 | "outputs": [] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "metadata": { 155 | "id": "zY7WFnrSj4Mr", 156 | "colab_type": "code", 157 | "colab": {} 158 | }, 159 | "source": [ 160 | "%%R\n", 161 | "############################################\n", 162 | "# Data Professor #\n", 163 | "# http://youtube.com/dataprofessor #\n", 164 | "# http://github.com/dataprofessor #\n", 165 | "# http://facebook.com/dataprofessor #\n", 166 | "# https://www.instagram.com/data.professor #\n", 167 | "############################################\n", 168 | "\n", 169 | "# Importing libraries\n", 170 | "library(mlbench) # Contains several benchmark data sets (especially the Boston Housing dataset)\n", 171 | "library(caret) # Package for machine learning algorithms / CARET stands for Classification And REgression Training\n", 172 | "\n", 173 | "# Importing the Boston Housing data set\n", 174 | "data(BostonHousing)\n", 175 | "\n", 176 | "head(BostonHousing)\n", 177 | "\n", 178 | "# Check to see if there are missing data?\n", 179 | "sum(is.na(BostonHousing))\n", 180 | "\n", 181 | "# To achieve reproducible model; set the random seed number\n", 182 | "set.seed(100)\n", 183 | "\n", 184 | "# Performs stratified random split of the data set\n", 185 | "TrainingIndex <- createDataPartition(BostonHousing$medv, p=0.8, list = FALSE)\n", 186 | "TrainingSet <- BostonHousing[TrainingIndex,] # Training Set\n", 187 | "TestingSet <- BostonHousing[-TrainingIndex,] # Test Set\n", 188 | "\n", 189 | "\n", 190 | "###############################\n", 191 | "\n", 192 | "# Build Training model\n", 193 | "Model <- train(medv ~ ., data = TrainingSet,\n", 194 | " method = \"lm\",\n", 195 | " na.action = na.omit,\n", 196 | " preProcess=c(\"scale\",\"center\"),\n", 197 | " trControl= trainControl(method=\"none\")\n", 198 | ")\n", 199 | "\n", 200 | "# Apply model for prediction\n", 201 | "Model.training <-predict(Model, TrainingSet) # Apply model to make prediction on Training set\n", 202 | "Model.testing <-predict(Model, TestingSet) # Apply model to make prediction on Testing set\n", 203 | "\n", 204 | "# Model performance (Displays scatter plot and performance metrics)\n", 205 | " # Scatter plot of Training set\n", 206 | "plot(TrainingSet$medv,Model.training, col = \"blue\" )\n", 207 | "plot(TestingSet$medv,Model.testing, col = \"blue\" )" 208 | ], 209 | "execution_count": 0, 210 | "outputs": [] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "metadata": { 215 | "id": "Q6A7bOvbll8D", 216 | "colab_type": "code", 217 | "colab": {} 218 | }, 219 | "source": [ 220 | "" 221 | ], 222 | "execution_count": 0, 223 | "outputs": [] 224 | } 225 | ] 226 | } -------------------------------------------------------------------------------- /shiny/001-first-app/app.R: -------------------------------------------------------------------------------- 1 | #################################### 2 | # Data Professor # 3 | # http://youtube.com/dataprofessor # 4 | # http://github.com/dataprofessor # 5 | #################################### 6 | 7 | # Modified from Winston Chang, 8 | # https://shiny.rstudio.com/gallery/shiny-theme-selector.html 9 | 10 | # Concepts about Reactive programming used by Shiny, 
11 | # https://shiny.rstudio.com/articles/reactivity-overview.html 12 | 13 | # Load R packages 14 | library(shiny) 15 | library(shinythemes) 16 | 17 | 18 | # Define UI 19 | ui <- fluidPage(theme = shinytheme("cerulean"), 20 | navbarPage( 21 | # theme = "cerulean", # <--- To use a theme, uncomment this 22 | "My first app", 23 | tabPanel("Navbar 1", 24 | sidebarPanel( 25 | tags$h3("Input:"), 26 | textInput("txt1", "Given Name:", ""), 27 | textInput("txt2", "Surname:", ""), 28 | 29 | ), # sidebarPanel 30 | mainPanel( 31 | h1("Header 1"), 32 | 33 | h4("Output 1"), 34 | verbatimTextOutput("txtout"), 35 | 36 | ) # mainPanel 37 | 38 | ), # Navbar 1, tabPanel 39 | tabPanel("Navbar 2", "This panel is intentionally left blank"), 40 | tabPanel("Navbar 3", "This panel is intentionally left blank") 41 | 42 | ) # navbarPage 43 | ) # fluidPage 44 | 45 | 46 | # Define server function 47 | server <- function(input, output) { 48 | 49 | output$txtout <- renderText({ 50 | paste( input$txt1, input$txt2, sep = " " ) 51 | }) 52 | } # server 53 | 54 | 55 | # Create Shiny object 56 | shinyApp(ui = ui, server = server) 57 | -------------------------------------------------------------------------------- /shiny/002-histogram/app.R: -------------------------------------------------------------------------------- 1 | #################################### 2 | # Data Professor # 3 | # http://youtube.com/dataprofessor # 4 | # http://github.com/dataprofessor # 5 | #################################### 6 | 7 | # Modified from https://shiny.rstudio.com/tutorial/written-tutorial/lesson1/ 8 | 9 | library(shiny) 10 | data(airquality) 11 | 12 | # Define UI for app that draws a histogram ---- 13 | ui <- fluidPage( 14 | 15 | # App title ---- 16 | titlePanel("Ozone level!"), 17 | 18 | # Sidebar layout with input and output definitions ---- 19 | sidebarLayout( 20 | 21 | # Sidebar panel for inputs ---- 22 | sidebarPanel( 23 | 24 | # Input: Slider for the number of bins ---- 25 | sliderInput(inputId = "bins", 26 | label = "Number of bins:", 27 | min = 1, 28 | max = 50, 29 | value = 30) 30 | 31 | ), 32 | 33 | # Main panel for displaying outputs ---- 34 | mainPanel( 35 | 36 | # Output: Histogram ---- 37 | plotOutput(outputId = "distPlot") 38 | 39 | ) 40 | ) 41 | ) 42 | 43 | # Define server logic required to draw a histogram ---- 44 | server <- function(input, output) { 45 | 46 | 47 | output$distPlot <- renderPlot({ 48 | 49 | x <- airquality$Ozone 50 | x <- na.omit(x) 51 | bins <- seq(min(x), max(x), length.out = input$bins + 1) 52 | 53 | hist(x, breaks = bins, col = "#75AADB", border = "black", 54 | xlab = "Ozone level", 55 | main = "Histogram of Ozone level") 56 | 57 | }) 58 | 59 | } 60 | 61 | # Create Shiny app ---- 62 | shinyApp(ui = ui, server = server) 63 | -------------------------------------------------------------------------------- /shiny/003-play-golf/app.R: -------------------------------------------------------------------------------- 1 | #################################### 2 | # Data Professor # 3 | # http://youtube.com/dataprofessor # 4 | # http://github.com/dataprofessor # 5 | #################################### 6 | 7 | 8 | # Import libraries 9 | library(shiny) 10 | library(shinythemes) 11 | library(data.table) 12 | library(RCurl) 13 | library(randomForest) 14 | 15 | # Read data 16 | weather <- read.csv(text = getURL("https://raw.githubusercontent.com/dataprofessor/data/master/weather-weka.csv") ) 17 | 18 | # Build model 19 | model <- randomForest(play ~ ., data = weather, ntree = 500, mtry = 4, importance = TRUE) 20 | 
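# A quick check of the fitted forest (a minimal sketch; these two calls are
# not part of the original script): printing the model reports the
# out-of-bag (OOB) error estimate, and importance() lists the per-variable
# scores enabled by importance = TRUE above.
print(model)
importance(model)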
21 | # Save model to RDS file 22 | # saveRDS(model, "model.rds") 23 | 24 | # Read in the RF model 25 | #model <- readRDS("model.rds") 26 | 27 | #################################### 28 | # User interface # 29 | #################################### 30 | 31 | ui <- fluidPage(theme = shinytheme("united"), 32 | 33 | # Page header 34 | headerPanel('Play Golf?'), 35 | 36 | # Input values 37 | sidebarPanel( 38 | HTML("
<h3>Input parameters</h3>
"), 39 | 40 | selectInput("outlook", label = "Outlook:", 41 | choices = list("Sunny" = "sunny", "Overcast" = "overcast", "Rainy" = "rainy"), 42 | selected = "Rainy"), 43 | sliderInput("temperature", "Temperature:", 44 | min = 64, max = 86, 45 | value = 70), 46 | sliderInput("humidity", "Humidity:", 47 | min = 65, max = 96, 48 | value = 90), 49 | selectInput("windy", label = "Windy:", 50 | choices = list("Yes" = "TRUE", "No" = "FALSE"), 51 | selected = "TRUE"), 52 | 53 | actionButton("submitbutton", "Submit", class = "btn btn-primary") 54 | ), 55 | 56 | mainPanel( 57 | tags$label(h3('Status/Output')), # Status/Output Text Box 58 | verbatimTextOutput('contents'), 59 | tableOutput('tabledata') # Prediction results table 60 | 61 | ) 62 | ) 63 | 64 | #################################### 65 | # Server # 66 | #################################### 67 | 68 | server <- function(input, output, session) { 69 | 70 | # Input Data 71 | datasetInput <- reactive({ 72 | 73 | # outlook,temperature,humidity,windy,play 74 | df <- data.frame( 75 | Name = c("outlook", 76 | "temperature", 77 | "humidity", 78 | "windy"), 79 | Value = as.character(c(input$outlook, 80 | input$temperature, 81 | input$humidity, 82 | input$windy)), 83 | stringsAsFactors = FALSE) 84 | 85 | play <- "play" 86 | df <- rbind(df, play) 87 | input <- transpose(df) 88 | write.table(input,"input.csv", sep=",", quote = FALSE, row.names = FALSE, col.names = FALSE) 89 | 90 | test <- read.csv(paste("input", ".csv", sep=""), header = TRUE) 91 | 92 | test$outlook <- factor(test$outlook, levels = c("overcast", "rainy", "sunny")) 93 | 94 | 95 | Output <- data.frame(Prediction=predict(model,test), round(predict(model,test,type="prob"), 3)) 96 | print(Output) 97 | 98 | }) 99 | 100 | # Status/Output Text Box 101 | output$contents <- renderPrint({ 102 | if (input$submitbutton>0) { 103 | isolate("Calculation complete.") 104 | } else { 105 | return("Server is ready for calculation.") 106 | } 107 | }) 108 | 109 | # Prediction results table 110 | output$tabledata <- renderTable({ 111 | if (input$submitbutton>0) { 112 | isolate(datasetInput()) 113 | } 114 | }) 115 | 116 | } 117 | 118 | #################################### 119 | # Create the shiny app # 120 | #################################### 121 | shinyApp(ui = ui, server = server) 122 | -------------------------------------------------------------------------------- /shiny/004-iris-predictor/app-numeric.R: -------------------------------------------------------------------------------- 1 | ############################################ 2 | # Data Professor # 3 | # http://youtube.com/dataprofessor # 4 | # http://github.com/dataprofessor # 5 | # http://facebook.com/dataprofessor # 6 | # https://www.instagram.com/data.professor # 7 | ############################################ 8 | 9 | # Import libraries 10 | library(shiny) 11 | library(data.table) 12 | library(randomForest) 13 | 14 | # Read in the RF model 15 | model <- readRDS("model.rds") 16 | 17 | 18 | #################################### 19 | # User interface # 20 | #################################### 21 | 22 | ui <- pageWithSidebar( 23 | 24 | # Page header 25 | headerPanel('Iris Predictor'), 26 | 27 | # Input values 28 | sidebarPanel( 29 | #HTML("
<h3>Input parameters</h3>
"), 30 | tags$label(h3('Input parameters')), 31 | numericInput("Sepal.Length", 32 | label = "Sepal Length", 33 | value = 5.1), 34 | numericInput("Sepal.Width", 35 | label = "Sepal Width", 36 | value = 3.6), 37 | numericInput("Petal.Length", 38 | label = "Petal Length", 39 | value = 1.4), 40 | numericInput("Petal.Width", 41 | label = "Petal Width", 42 | value = 0.2), 43 | 44 | actionButton("submitbutton", "Submit", 45 | class = "btn btn-primary") 46 | ), 47 | 48 | mainPanel( 49 | tags$label(h3('Status/Output')), # Status/Output Text Box 50 | verbatimTextOutput('contents'), 51 | tableOutput('tabledata') # Prediction results table 52 | 53 | ) 54 | ) 55 | 56 | #################################### 57 | # Server # 58 | #################################### 59 | 60 | server<- function(input, output, session) { 61 | 62 | # Input Data 63 | datasetInput <- reactive({ 64 | 65 | df <- data.frame( 66 | Name = c("Sepal Length", 67 | "Sepal Width", 68 | "Petal Length", 69 | "Petal Width"), 70 | Value = as.character(c(input$Sepal.Length, 71 | input$Sepal.Width, 72 | input$Petal.Length, 73 | input$Petal.Width)), 74 | stringsAsFactors = FALSE) 75 | 76 | Species <- 0 77 | df <- rbind(df, Species) 78 | input <- transpose(df) 79 | write.table(input,"input.csv", sep=",", quote = FALSE, row.names = FALSE, col.names = FALSE) 80 | 81 | test <- read.csv(paste("input", ".csv", sep=""), header = TRUE) 82 | 83 | Output <- data.frame(Prediction=predict(model,test), round(predict(model,test,type="prob"), 3)) 84 | print(Output) 85 | 86 | }) 87 | 88 | # Status/Output Text Box 89 | output$contents <- renderPrint({ 90 | if (input$submitbutton>0) { 91 | isolate("Calculation complete.") 92 | } else { 93 | return("Server is ready for calculation.") 94 | } 95 | }) 96 | 97 | # Prediction results table 98 | output$tabledata <- renderTable({ 99 | if (input$submitbutton>0) { 100 | isolate(datasetInput()) 101 | } 102 | }) 103 | 104 | } 105 | 106 | #################################### 107 | # Create the shiny app # 108 | #################################### 109 | shinyApp(ui = ui, server = server) 110 | -------------------------------------------------------------------------------- /shiny/004-iris-predictor/app-slider.R: -------------------------------------------------------------------------------- 1 | ############################################ 2 | # Data Professor # 3 | # http://youtube.com/dataprofessor # 4 | # http://github.com/dataprofessor # 5 | # http://facebook.com/dataprofessor # 6 | # https://www.instagram.com/data.professor # 7 | ############################################ 8 | 9 | # Import libraries 10 | library(shiny) 11 | library(data.table) 12 | library(randomForest) 13 | 14 | # Read in the RF model 15 | model <- readRDS("model.rds") 16 | 17 | # Training set 18 | TrainSet <- read.csv("training.csv", header = TRUE) 19 | TrainSet <- TrainSet[,-1] 20 | 21 | 22 | #################################### 23 | # User interface # 24 | #################################### 25 | 26 | ui <- pageWithSidebar( 27 | 28 | # Page header 29 | headerPanel('Iris Predictor'), 30 | 31 | # Input values 32 | sidebarPanel( 33 | HTML("
<h3>Input parameters</h3>
"), 34 | sliderInput("Sepal.Length", label = "Sepal Length", value = 5.0, 35 | min = min(TrainSet$Sepal.Length), 36 | max = max(TrainSet$Sepal.Length) 37 | ), 38 | sliderInput("Sepal.Width", label = "Sepal Width", value = 3.6, 39 | min = min(TrainSet$Sepal.Width), 40 | max = max(TrainSet$Sepal.Width)), 41 | sliderInput("Petal.Length", label = "Petal Length", value = 1.4, 42 | min = min(TrainSet$Petal.Length), 43 | max = max(TrainSet$Petal.Length)), 44 | sliderInput("Petal.Width", label = "Petal Width", value = 0.2, 45 | min = min(TrainSet$Petal.Width), 46 | max = max(TrainSet$Petal.Width)), 47 | 48 | actionButton("submitbutton", "Submit", class = "btn btn-primary") 49 | ), 50 | 51 | mainPanel( 52 | tags$label(h3('Status/Output')), # Status/Output Text Box 53 | verbatimTextOutput('contents'), 54 | tableOutput('tabledata') # Prediction results table 55 | 56 | ) 57 | ) 58 | 59 | #################################### 60 | # Server # 61 | #################################### 62 | 63 | server<- function(input, output, session) { 64 | 65 | # Input Data 66 | datasetInput <- reactive({ 67 | 68 | df <- data.frame( 69 | Name = c("Sepal Length", 70 | "Sepal Width", 71 | "Petal Length", 72 | "Petal Width"), 73 | Value = as.character(c(input$Sepal.Length, 74 | input$Sepal.Width, 75 | input$Petal.Length, 76 | input$Petal.Width)), 77 | stringsAsFactors = FALSE) 78 | 79 | Species <- 0 80 | df <- rbind(df, Species) 81 | input <- transpose(df) 82 | write.table(input,"input.csv", sep=",", quote = FALSE, row.names = FALSE, col.names = FALSE) 83 | 84 | test <- read.csv(paste("input", ".csv", sep=""), header = TRUE) 85 | 86 | Output <- data.frame(Prediction=predict(model,test), round(predict(model,test,type="prob"), 3)) 87 | print(Output) 88 | 89 | }) 90 | 91 | # Status/Output Text Box 92 | output$contents <- renderPrint({ 93 | if (input$submitbutton>0) { 94 | isolate("Calculation complete.") 95 | } else { 96 | return("Server is ready for calculation.") 97 | } 98 | }) 99 | 100 | # Prediction results table 101 | output$tabledata <- renderTable({ 102 | if (input$submitbutton>0) { 103 | isolate(datasetInput()) 104 | } 105 | }) 106 | 107 | } 108 | 109 | #################################### 110 | # Create the shiny app # 111 | #################################### 112 | shinyApp(ui = ui, server = server) -------------------------------------------------------------------------------- /shiny/004-iris-predictor/model.R: -------------------------------------------------------------------------------- 1 | #################################### 2 | # Data Professor # 3 | # http://youtube.com/dataprofessor # 4 | # http://github.com/dataprofessor # 5 | #################################### 6 | 7 | # Importing libraries 8 | library(RCurl) # for downloading the iris CSV file 9 | library(randomForest) 10 | library(caret) 11 | 12 | # Importing the Iris data set 13 | iris <- read.csv(text = getURL("https://raw.githubusercontent.com/dataprofessor/data/master/iris.csv") ) 14 | 15 | # Performs stratified random split of the data set 16 | TrainingIndex <- createDataPartition(iris$Species, p=0.8, list = FALSE) 17 | TrainingSet <- iris[TrainingIndex,] # Training Set 18 | TestingSet <- iris[-TrainingIndex,] # Test Set 19 | 20 | write.csv(TrainingSet, "training.csv") 21 | write.csv(TestingSet, "testing.csv") 22 | 23 | TrainSet <- read.csv("training.csv", header = TRUE) 24 | TrainSet <- TrainSet[,-1] 25 | 26 | # Building Random forest model 27 | 28 | model <- randomForest(Species ~ ., data = TrainSet, ntree = 500, mtry = 4, importance = TRUE) 
29 | 30 | # Save model to RDS file 31 | saveRDS(model, "model.rds") 32 | -------------------------------------------------------------------------------- /shiny/004-iris-predictor/model.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataprofessor/code/d494f8093073990fcf77061bd740a0e4c2d40020/shiny/004-iris-predictor/model.rds -------------------------------------------------------------------------------- /shiny/004-iris-predictor/testing.csv: -------------------------------------------------------------------------------- 1 | "","Sepal.Length","Sepal.Width","Petal.Length","Petal.Width","Species" 2 | "5",5,3.6,1.4,0.2,"setosa" 3 | "9",4.4,2.9,1.4,0.2,"setosa" 4 | "14",4.3,3,1.1,0.1,"setosa" 5 | "19",5.7,3.8,1.7,0.3,"setosa" 6 | "22",5.1,3.7,1.5,0.4,"setosa" 7 | "26",5,3,1.6,0.2,"setosa" 8 | "29",5.2,3.4,1.4,0.2,"setosa" 9 | "37",5.5,3.5,1.3,0.2,"setosa" 10 | "41",5,3.5,1.3,0.3,"setosa" 11 | "42",4.5,2.3,1.3,0.3,"setosa" 12 | "55",6.5,2.8,4.6,1.5,"versicolor" 13 | "56",5.7,2.8,4.5,1.3,"versicolor" 14 | "61",5,2,3.5,1,"versicolor" 15 | "65",5.6,2.9,3.6,1.3,"versicolor" 16 | "66",6.7,3.1,4.4,1.4,"versicolor" 17 | "68",5.8,2.7,4.1,1,"versicolor" 18 | "73",6.3,2.5,4.9,1.5,"versicolor" 19 | "90",5.5,2.5,4,1.3,"versicolor" 20 | "92",6.1,3,4.6,1.4,"versicolor" 21 | "99",5.1,2.5,3,1.1,"versicolor" 22 | "103",7.1,3,5.9,2.1,"virginica" 23 | "111",6.5,3.2,5.1,2,"virginica" 24 | "112",6.4,2.7,5.3,1.9,"virginica" 25 | "113",6.8,3,5.5,2.1,"virginica" 26 | "120",6,2.2,5,1.5,"virginica" 27 | "133",6.4,2.8,5.6,2.2,"virginica" 28 | "134",6.3,2.8,5.1,1.5,"virginica" 29 | "136",7.7,3,6.1,2.3,"virginica" 30 | "146",6.7,3,5.2,2.3,"virginica" 31 | "147",6.3,2.5,5,1.9,"virginica" 32 | -------------------------------------------------------------------------------- /shiny/004-iris-predictor/training.csv: -------------------------------------------------------------------------------- 1 | "","Sepal.Length","Sepal.Width","Petal.Length","Petal.Width","Species" 2 | "1",5.1,3.5,1.4,0.2,"setosa" 3 | "2",4.9,3,1.4,0.2,"setosa" 4 | "3",4.7,3.2,1.3,0.2,"setosa" 5 | "4",4.6,3.1,1.5,0.2,"setosa" 6 | "6",5.4,3.9,1.7,0.4,"setosa" 7 | "7",4.6,3.4,1.4,0.3,"setosa" 8 | "8",5,3.4,1.5,0.2,"setosa" 9 | "10",4.9,3.1,1.5,0.1,"setosa" 10 | "11",5.4,3.7,1.5,0.2,"setosa" 11 | "12",4.8,3.4,1.6,0.2,"setosa" 12 | "13",4.8,3,1.4,0.1,"setosa" 13 | "15",5.8,4,1.2,0.2,"setosa" 14 | "16",5.7,4.4,1.5,0.4,"setosa" 15 | "17",5.4,3.9,1.3,0.4,"setosa" 16 | "18",5.1,3.5,1.4,0.3,"setosa" 17 | "20",5.1,3.8,1.5,0.3,"setosa" 18 | "21",5.4,3.4,1.7,0.2,"setosa" 19 | "23",4.6,3.6,1,0.2,"setosa" 20 | "24",5.1,3.3,1.7,0.5,"setosa" 21 | "25",4.8,3.4,1.9,0.2,"setosa" 22 | "27",5,3.4,1.6,0.4,"setosa" 23 | "28",5.2,3.5,1.5,0.2,"setosa" 24 | "30",4.7,3.2,1.6,0.2,"setosa" 25 | "31",4.8,3.1,1.6,0.2,"setosa" 26 | "32",5.4,3.4,1.5,0.4,"setosa" 27 | "33",5.2,4.1,1.5,0.1,"setosa" 28 | "34",5.5,4.2,1.4,0.2,"setosa" 29 | "35",4.9,3.1,1.5,0.1,"setosa" 30 | "36",5,3.2,1.2,0.2,"setosa" 31 | "38",4.9,3.1,1.5,0.1,"setosa" 32 | "39",4.4,3,1.3,0.2,"setosa" 33 | "40",5.1,3.4,1.5,0.2,"setosa" 34 | "43",4.4,3.2,1.3,0.2,"setosa" 35 | "44",5,3.5,1.6,0.6,"setosa" 36 | "45",5.1,3.8,1.9,0.4,"setosa" 37 | "46",4.8,3,1.4,0.3,"setosa" 38 | "47",5.1,3.8,1.6,0.2,"setosa" 39 | "48",4.6,3.2,1.4,0.2,"setosa" 40 | "49",5.3,3.7,1.5,0.2,"setosa" 41 | "50",5,3.3,1.4,0.2,"setosa" 42 | "51",7,3.2,4.7,1.4,"versicolor" 43 | "52",6.4,3.2,4.5,1.5,"versicolor" 44 | "53",6.9,3.1,4.9,1.5,"versicolor" 45 | "54",5.5,2.3,4,1.3,"versicolor" 
46 | "57",6.3,3.3,4.7,1.6,"versicolor" 47 | "58",4.9,2.4,3.3,1,"versicolor" 48 | "59",6.6,2.9,4.6,1.3,"versicolor" 49 | "60",5.2,2.7,3.9,1.4,"versicolor" 50 | "62",5.9,3,4.2,1.5,"versicolor" 51 | "63",6,2.2,4,1,"versicolor" 52 | "64",6.1,2.9,4.7,1.4,"versicolor" 53 | "67",5.6,3,4.5,1.5,"versicolor" 54 | "69",6.2,2.2,4.5,1.5,"versicolor" 55 | "70",5.6,2.5,3.9,1.1,"versicolor" 56 | "71",5.9,3.2,4.8,1.8,"versicolor" 57 | "72",6.1,2.8,4,1.3,"versicolor" 58 | "74",6.1,2.8,4.7,1.2,"versicolor" 59 | "75",6.4,2.9,4.3,1.3,"versicolor" 60 | "76",6.6,3,4.4,1.4,"versicolor" 61 | "77",6.8,2.8,4.8,1.4,"versicolor" 62 | "78",6.7,3,5,1.7,"versicolor" 63 | "79",6,2.9,4.5,1.5,"versicolor" 64 | "80",5.7,2.6,3.5,1,"versicolor" 65 | "81",5.5,2.4,3.8,1.1,"versicolor" 66 | "82",5.5,2.4,3.7,1,"versicolor" 67 | "83",5.8,2.7,3.9,1.2,"versicolor" 68 | "84",6,2.7,5.1,1.6,"versicolor" 69 | "85",5.4,3,4.5,1.5,"versicolor" 70 | "86",6,3.4,4.5,1.6,"versicolor" 71 | "87",6.7,3.1,4.7,1.5,"versicolor" 72 | "88",6.3,2.3,4.4,1.3,"versicolor" 73 | "89",5.6,3,4.1,1.3,"versicolor" 74 | "91",5.5,2.6,4.4,1.2,"versicolor" 75 | "93",5.8,2.6,4,1.2,"versicolor" 76 | "94",5,2.3,3.3,1,"versicolor" 77 | "95",5.6,2.7,4.2,1.3,"versicolor" 78 | "96",5.7,3,4.2,1.2,"versicolor" 79 | "97",5.7,2.9,4.2,1.3,"versicolor" 80 | "98",6.2,2.9,4.3,1.3,"versicolor" 81 | "100",5.7,2.8,4.1,1.3,"versicolor" 82 | "101",6.3,3.3,6,2.5,"virginica" 83 | "102",5.8,2.7,5.1,1.9,"virginica" 84 | "104",6.3,2.9,5.6,1.8,"virginica" 85 | "105",6.5,3,5.8,2.2,"virginica" 86 | "106",7.6,3,6.6,2.1,"virginica" 87 | "107",4.9,2.5,4.5,1.7,"virginica" 88 | "108",7.3,2.9,6.3,1.8,"virginica" 89 | "109",6.7,2.5,5.8,1.8,"virginica" 90 | "110",7.2,3.6,6.1,2.5,"virginica" 91 | "114",5.7,2.5,5,2,"virginica" 92 | "115",5.8,2.8,5.1,2.4,"virginica" 93 | "116",6.4,3.2,5.3,2.3,"virginica" 94 | "117",6.5,3,5.5,1.8,"virginica" 95 | "118",7.7,3.8,6.7,2.2,"virginica" 96 | "119",7.7,2.6,6.9,2.3,"virginica" 97 | "121",6.9,3.2,5.7,2.3,"virginica" 98 | "122",5.6,2.8,4.9,2,"virginica" 99 | "123",7.7,2.8,6.7,2,"virginica" 100 | "124",6.3,2.7,4.9,1.8,"virginica" 101 | "125",6.7,3.3,5.7,2.1,"virginica" 102 | "126",7.2,3.2,6,1.8,"virginica" 103 | "127",6.2,2.8,4.8,1.8,"virginica" 104 | "128",6.1,3,4.9,1.8,"virginica" 105 | "129",6.4,2.8,5.6,2.1,"virginica" 106 | "130",7.2,3,5.8,1.6,"virginica" 107 | "131",7.4,2.8,6.1,1.9,"virginica" 108 | "132",7.9,3.8,6.4,2,"virginica" 109 | "135",6.1,2.6,5.6,1.4,"virginica" 110 | "137",6.3,3.4,5.6,2.4,"virginica" 111 | "138",6.4,3.1,5.5,1.8,"virginica" 112 | "139",6,3,4.8,1.8,"virginica" 113 | "140",6.9,3.1,5.4,2.1,"virginica" 114 | "141",6.7,3.1,5.6,2.4,"virginica" 115 | "142",6.9,3.1,5.1,2.3,"virginica" 116 | "143",5.8,2.7,5.1,1.9,"virginica" 117 | "144",6.8,3.2,5.9,2.3,"virginica" 118 | "145",6.7,3.3,5.7,2.5,"virginica" 119 | "148",6.5,3,5.2,2,"virginica" 120 | "149",6.2,3.4,5.4,2.3,"virginica" 121 | "150",5.9,3,5.1,1.8,"virginica" 122 | -------------------------------------------------------------------------------- /shiny/005-bmi/about.md: -------------------------------------------------------------------------------- 1 | #### What is BMI? 2 | 3 | **Body Mass Index (BMI)** is essentially a value obtained from the weight and height of a person [1]. 4 | 5 | #### Calculating the BMI 6 | BMI can be computed by dividing the person's weight (kg) by their squared height (m) as follows: 7 | 8 | > BMI = kg/m^2 9 | 10 | where *kg* represents the person's weight and *m^2* the person's squared height. 
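For example, a person weighing 70 kg with a height of 1.75 m would have:

> BMI = 70/(1.75)^2 = 70/3.0625 ≈ 22.9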
11 | 12 | #### About this BMI Calculator 13 | 14 | This *BMI Calculator* is for adults 20 years and older. Further information on calculating BMI for children and teenagers is available from the CDC [2]. 15 | 16 | #### References 17 | 1. Centers for Disease Control. [Body Mass Index (BMI)](https://www.cdc.gov/healthyweight/assessing/bmi/index.html), Accessed January 26, 2020. 18 | 2. Centers for Disease Control. [BMI Percentile Calculator for Child and Teen](https://www.cdc.gov/healthyweight/bmi/calculator.html), Accessed January 26, 2020. 19 | -------------------------------------------------------------------------------- /shiny/005-bmi/app.R: -------------------------------------------------------------------------------- 1 | ############################################ 2 | # Data Professor # 3 | # http://youtube.com/dataprofessor # 4 | # http://github.com/dataprofessor # 5 | # http://facebook.com/dataprofessor # 6 | # https://www.instagram.com/data.professor # 7 | ############################################ 8 | 9 | library(shiny) 10 | library(shinythemes) 11 | 12 | 13 | #################################### 14 | # User Interface # 15 | #################################### 16 | ui <- fluidPage(theme = shinytheme("united"), 17 | navbarPage("BMI Calculator:", 18 | 19 | tabPanel("Home", 20 | # Input values 21 | sidebarPanel( 22 | HTML("

<h3>Input parameters</h3>

"), 23 | sliderInput("height", 24 | label = "Height", 25 | value = 175, 26 | min = 40, 27 | max = 250), 28 | sliderInput("weight", 29 | label = "Weight", 30 | value = 70, 31 | min = 20, 32 | max = 100), 33 | 34 | actionButton("submitbutton", 35 | "Submit", 36 | class = "btn btn-primary") 37 | ), 38 | 39 | mainPanel( 40 | tags$label(h3('Status/Output')), # Status/Output Text Box 41 | verbatimTextOutput('contents'), 42 | tableOutput('tabledata') # Results table 43 | ) # mainPanel() 44 | 45 | ), #tabPanel(), Home 46 | 47 | tabPanel("About", 48 | titlePanel("About"), 49 | div(includeMarkdown("about.md"), 50 | align="justify") 51 | ) #tabPanel(), About 52 | 53 | ) # navbarPage() 54 | ) # fluidPage() 55 | 56 | 57 | #################################### 58 | # Server # 59 | #################################### 60 | server <- function(input, output, session) { 61 | 62 | # Input Data 63 | datasetInput <- reactive({ 64 | 65 | bmi <- input$weight/( (input$height/100) * (input$height/100) ) 66 | bmi <- data.frame(bmi) 67 | names(bmi) <- "BMI" 68 | print(bmi) 69 | 70 | }) 71 | 72 | # Status/Output Text Box 73 | output$contents <- renderPrint({ 74 | if (input$submitbutton>0) { 75 | isolate("Calculation complete.") 76 | } else { 77 | return("Server is ready for calculation.") 78 | } 79 | }) 80 | 81 | # Prediction results table 82 | output$tabledata <- renderTable({ 83 | if (input$submitbutton>0) { 84 | isolate(datasetInput()) 85 | } 86 | }) 87 | 88 | } 89 | 90 | 91 | #################################### 92 | # Create Shiny App # 93 | #################################### 94 | shinyApp(ui = ui, server = server) 95 | -------------------------------------------------------------------------------- /streamlit/part1/myapp.py: -------------------------------------------------------------------------------- 1 | import yfinance as yf 2 | import streamlit as st 3 | 4 | st.write(""" 5 | # Simple Stock Price App 6 | 7 | Shown are the stock closing price and volume of Google! 8 | 9 | """) 10 | 11 | # https://towardsdatascience.com/how-to-get-stock-data-using-python-c0de1df17e75 12 | #define the ticker symbol 13 | tickerSymbol = 'GOOGL' 14 | #get data on this ticker 15 | tickerData = yf.Ticker(tickerSymbol) 16 | #get the historical prices for this ticker 17 | tickerDf = tickerData.history(period='1d', start='2010-5-31', end='2020-5-31') 18 | # Open High Low Close Volume Dividends Stock Splits 19 | 20 | st.line_chart(tickerDf.Close) 21 | st.line_chart(tickerDf.Volume) 22 | -------------------------------------------------------------------------------- /streamlit/part1/myapp2.py: -------------------------------------------------------------------------------- 1 | import yfinance as yf 2 | import streamlit as st 3 | 4 | st.write(""" 5 | # Simple Stock Price App 6 | 7 | Shown are the stock **closing price** and ***volume*** of Google! 
8 | 9 | """) 10 | 11 | # https://towardsdatascience.com/how-to-get-stock-data-using-python-c0de1df17e75 12 | #define the ticker symbol 13 | tickerSymbol = 'GOOGL' 14 | #get data on this ticker 15 | tickerData = yf.Ticker(tickerSymbol) 16 | #get the historical prices for this ticker 17 | tickerDf = tickerData.history(period='1d', start='2010-5-31', end='2020-5-31') 18 | # Open High Low Close Volume Dividends Stock Splits 19 | 20 | st.write(""" 21 | ## Closing Price 22 | """) 23 | st.line_chart(tickerDf.Close) 24 | st.write(""" 25 | ## Volume Price 26 | """) 27 | st.line_chart(tickerDf.Volume) 28 | -------------------------------------------------------------------------------- /streamlit/part10/sp500-app.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import pandas as pd 3 | import base64 4 | import matplotlib.pyplot as plt 5 | import seaborn as sns 6 | import numpy as np 7 | import yfinance as yf 8 | 9 | st.title('S&P 500 App') 10 | 11 | st.markdown(""" 12 | This app retrieves the list of the **S&P 500** (from Wikipedia) and its corresponding **stock closing price** (year-to-date)! 13 | * **Python libraries:** base64, pandas, streamlit, numpy, matplotlib, seaborn 14 | * **Data source:** [Wikipedia](https://en.wikipedia.org/wiki/List_of_S%26P_500_companies). 15 | """) 16 | 17 | st.sidebar.header('User Input Features') 18 | 19 | # Web scraping of S&P 500 data 20 | # 21 | @st.cache 22 | def load_data(): 23 | url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies' 24 | html = pd.read_html(url, header = 0) 25 | df = html[0] 26 | return df 27 | 28 | df = load_data() 29 | sector = df.groupby('GICS Sector') 30 | 31 | # Sidebar - Sector selection 32 | sorted_sector_unique = sorted( df['GICS Sector'].unique() ) 33 | selected_sector = st.sidebar.multiselect('Sector', sorted_sector_unique, sorted_sector_unique) 34 | 35 | # Filtering data 36 | df_selected_sector = df[ (df['GICS Sector'].isin(selected_sector)) ] 37 | 38 | st.header('Display Companies in Selected Sector') 39 | st.write('Data Dimension: ' + str(df_selected_sector.shape[0]) + ' rows and ' + str(df_selected_sector.shape[1]) + ' columns.') 40 | st.dataframe(df_selected_sector) 41 | 42 | # Download S&P500 data 43 | # https://discuss.streamlit.io/t/how-to-download-file-in-streamlit/1806 44 | def filedownload(df): 45 | csv = df.to_csv(index=False) 46 | b64 = base64.b64encode(csv.encode()).decode() # strings <-> bytes conversions 47 | href = f'Download CSV File' 48 | return href 49 | 50 | st.markdown(filedownload(df_selected_sector), unsafe_allow_html=True) 51 | 52 | # https://pypi.org/project/yfinance/ 53 | 54 | data = yf.download( 55 | tickers = list(df_selected_sector[:10].Symbol), 56 | period = "ytd", 57 | interval = "1d", 58 | group_by = 'ticker', 59 | auto_adjust = True, 60 | prepost = True, 61 | threads = True, 62 | proxy = None 63 | ) 64 | 65 | # Plot Closing Price of Query Symbol 66 | def price_plot(symbol): 67 | df = pd.DataFrame(data[symbol].Close) 68 | df['Date'] = df.index 69 | plt.fill_between(df.Date, df.Close, color='skyblue', alpha=0.3) 70 | plt.plot(df.Date, df.Close, color='skyblue', alpha=0.8) 71 | plt.xticks(rotation=90) 72 | plt.title(symbol, fontweight='bold') 73 | plt.xlabel('Date', fontweight='bold') 74 | plt.ylabel('Closing Price', fontweight='bold') 75 | return st.pyplot() 76 | 77 | num_company = st.sidebar.slider('Number of Companies', 1, 5) 78 | 79 | if st.button('Show Plots'): 80 | st.header('Stock Closing Price') 81 | for i in 
82 |         price_plot(i)
83 | 
--------------------------------------------------------------------------------
/streamlit/part12/crypto-price-app.py:
--------------------------------------------------------------------------------
1 | # This app is for educational purposes only. Insights gained are not financial advice. Use at your own risk!
2 | import streamlit as st
3 | from PIL import Image
4 | import pandas as pd
5 | import base64
6 | import matplotlib.pyplot as plt
7 | from bs4 import BeautifulSoup
8 | import requests
9 | import json
10 | import time
11 | #---------------------------------#
12 | # New feature (make sure to upgrade your streamlit library)
13 | # pip install --upgrade streamlit
14 | 
15 | #---------------------------------#
16 | # Page layout
17 | ## Page expands to full width
18 | st.set_page_config(layout="wide")
19 | #---------------------------------#
20 | # Title
21 | 
22 | image = Image.open('logo.jpg')
23 | 
24 | st.image(image, width = 500)
25 | 
26 | st.title('Crypto Price App')
27 | st.markdown("""
28 | This app retrieves cryptocurrency prices for the top 100 cryptocurrencies from **CoinMarketCap**!
29 | 
30 | """)
31 | #---------------------------------#
32 | # About
33 | expander_bar = st.beta_expander("About")
34 | expander_bar.markdown("""
35 | * **Python libraries:** base64, pandas, streamlit, numpy, matplotlib, seaborn, BeautifulSoup, requests, json, time
36 | * **Data source:** [CoinMarketCap](http://coinmarketcap.com).
37 | * **Credit:** Web scraper adapted from the Medium article *[Web Scraping Crypto Prices With Python](https://towardsdatascience.com/web-scraping-crypto-prices-with-python-41072ea5b5bf)* written by [Bryan Feng](https://medium.com/@bryanf).
38 | """)
39 | 
40 | 
41 | #---------------------------------#
42 | # Page layout (continued)
43 | ## Divide page to 3 columns (col1 = sidebar, col2 and col3 = page contents)
44 | col1 = st.sidebar
45 | col2, col3 = st.beta_columns((2,1))
46 | 
47 | #---------------------------------#
48 | # Sidebar + Main panel
49 | col1.header('Input Options')
50 | 
51 | ## Sidebar - Currency price unit
52 | currency_price_unit = col1.selectbox('Select currency for price', ('USD', 'BTC', 'ETH'))
53 | 
54 | # Web scraping of CoinMarketCap data
55 | @st.cache
56 | def load_data():
57 |     cmc = requests.get('https://coinmarketcap.com')
58 |     soup = BeautifulSoup(cmc.content, 'html.parser')
59 | 
60 |     data = soup.find('script', id='__NEXT_DATA__', type='application/json')
61 |     coins = {}
62 |     coin_data = json.loads(data.contents[0])
63 |     listings = coin_data['props']['initialState']['cryptocurrency']['listingLatest']['data']
64 |     for i in listings:
65 |         coins[str(i['id'])] = i['slug']
66 | 
67 |     coin_name = []
68 |     coin_symbol = []
69 |     market_cap = []
70 |     percent_change_1h = []
71 |     percent_change_24h = []
72 |     percent_change_7d = []
73 |     price = []
74 |     volume_24h = []
75 | 
76 |     for i in listings:
77 |         coin_name.append(i['slug'])
78 |         coin_symbol.append(i['symbol'])
79 |         price.append(i['quote'][currency_price_unit]['price'])
80 |         percent_change_1h.append(i['quote'][currency_price_unit]['percent_change_1h'])
81 |         percent_change_24h.append(i['quote'][currency_price_unit]['percent_change_24h'])
82 |         percent_change_7d.append(i['quote'][currency_price_unit]['percent_change_7d'])
83 |         market_cap.append(i['quote'][currency_price_unit]['market_cap'])
84 |         volume_24h.append(i['quote'][currency_price_unit]['volume_24h'])
85 | 
86 |     df = pd.DataFrame(columns=['coin_name', 'coin_symbol', 'market_cap', 'percent_change_1h', 'percent_change_24h', 'percent_change_7d', 'price', 'volume_24h'])
87 |     df['coin_name'] = coin_name
88 |     df['coin_symbol'] = coin_symbol
89 |     df['price'] = price
90 |     df['percent_change_1h'] = percent_change_1h
91 |     df['percent_change_24h'] = percent_change_24h
92 |     df['percent_change_7d'] = percent_change_7d
93 |     df['market_cap'] = market_cap
94 |     df['volume_24h'] = volume_24h
95 |     return df
96 | 
97 | df = load_data()
98 | 
99 | ## Sidebar - Cryptocurrency selections
100 | sorted_coin = sorted( df['coin_symbol'] )
101 | selected_coin = col1.multiselect('Cryptocurrency', sorted_coin, sorted_coin)
102 | 
103 | df_selected_coin = df[ (df['coin_symbol'].isin(selected_coin)) ] # Filtering data
104 | 
105 | ## Sidebar - Number of coins to display
106 | num_coin = col1.slider('Display Top N Coins', 1, 100, 100)
107 | df_coins = df_selected_coin[:num_coin]
108 | 
109 | ## Sidebar - Percent change timeframe
110 | percent_timeframe = col1.selectbox('Percent change time frame',
111 |                                    ['7d','24h', '1h'])
112 | percent_dict = {"7d":'percent_change_7d',"24h":'percent_change_24h',"1h":'percent_change_1h'}  # maps the selected time frame to its dataframe column
113 | selected_percent_timeframe = percent_dict[percent_timeframe]
114 | 
115 | ## Sidebar - Sorting values
116 | sort_values = col1.selectbox('Sort values?', ['Yes', 'No'])
117 | 
118 | col2.subheader('Price Data of Selected Cryptocurrency')
119 | col2.write('Data Dimension: ' + str(df_selected_coin.shape[0]) + ' rows and ' + str(df_selected_coin.shape[1]) + ' columns.')
120 | 
121 | col2.dataframe(df_coins)
122 | 
123 | # Download CSV data
124 | # https://discuss.streamlit.io/t/how-to-download-file-in-streamlit/1806
125 | def filedownload(df):
126 |     csv = df.to_csv(index=False)
127 |     b64 = base64.b64encode(csv.encode()).decode()  # strings <-> bytes conversions
128 |     href = f'<a href="data:file/csv;base64,{b64}" download="crypto.csv">Download CSV File</a>'  # download file name is arbitrary
129 |     return href
130 | 
131 | col2.markdown(filedownload(df_selected_coin), unsafe_allow_html=True)
132 | 
133 | #---------------------------------#
134 | # Preparing data for Bar plot of % Price change
135 | col2.subheader('Table of % Price Change')
136 | df_change = pd.concat([df_coins.coin_symbol, df_coins.percent_change_1h, df_coins.percent_change_24h, df_coins.percent_change_7d], axis=1)
137 | df_change = df_change.set_index('coin_symbol')
138 | df_change['positive_percent_change_1h'] = df_change['percent_change_1h'] > 0
139 | df_change['positive_percent_change_24h'] = df_change['percent_change_24h'] > 0
140 | df_change['positive_percent_change_7d'] = df_change['percent_change_7d'] > 0
141 | col2.dataframe(df_change)
142 | 
143 | # Conditional creation of Bar plot (time frame)
144 | col3.subheader('Bar plot of % Price Change')
145 | 
146 | if percent_timeframe == '7d':
147 |     if sort_values == 'Yes':
148 |         df_change = df_change.sort_values(by=['percent_change_7d'])
149 |     col3.write('*7 days period*')
150 |     plt.figure(figsize=(5,25))
151 |     plt.subplots_adjust(top = 1, bottom = 0)
152 |     df_change['percent_change_7d'].plot(kind='barh', color=df_change.positive_percent_change_7d.map({True: 'g', False: 'r'}))
153 |     col3.pyplot(plt)
154 | elif percent_timeframe == '24h':
155 |     if sort_values == 'Yes':
156 |         df_change = df_change.sort_values(by=['percent_change_24h'])
157 |     col3.write('*24 hour period*')
158 |     plt.figure(figsize=(5,25))
159 |     plt.subplots_adjust(top = 1, bottom = 0)
160 |     df_change['percent_change_24h'].plot(kind='barh', color=df_change.positive_percent_change_24h.map({True: 'g', False: 'r'}))
161 |     col3.pyplot(plt)
162 | else:
163 |     if sort_values == 'Yes':
164 |         df_change = 
df_change.sort_values(by=['percent_change_1h']) 165 | col3.write('*1 hour period*') 166 | plt.figure(figsize=(5,25)) 167 | plt.subplots_adjust(top = 1, bottom = 0) 168 | df_change['percent_change_1h'].plot(kind='barh', color=df_change.positive_percent_change_1h.map({True: 'g', False: 'r'})) 169 | col3.pyplot(plt) 170 | -------------------------------------------------------------------------------- /streamlit/part12/logo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataprofessor/code/d494f8093073990fcf77061bd740a0e4c2d40020/streamlit/part12/logo.jpg -------------------------------------------------------------------------------- /streamlit/part2/iris-ml-app.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import pandas as pd 3 | from sklearn import datasets 4 | from sklearn.ensemble import RandomForestClassifier 5 | 6 | st.write(""" 7 | # Simple Iris Flower Prediction App 8 | 9 | This app predicts the **Iris flower** type! 10 | """) 11 | 12 | st.sidebar.header('User Input Parameters') 13 | 14 | def user_input_features(): 15 | sepal_length = st.sidebar.slider('Sepal length', 4.3, 7.9, 5.4) 16 | sepal_width = st.sidebar.slider('Sepal width', 2.0, 4.4, 3.4) 17 | petal_length = st.sidebar.slider('Petal length', 1.0, 6.9, 1.3) 18 | petal_width = st.sidebar.slider('Petal width', 0.1, 2.5, 0.2) 19 | data = {'sepal_length': sepal_length, 20 | 'sepal_width': sepal_width, 21 | 'petal_length': petal_length, 22 | 'petal_width': petal_width} 23 | features = pd.DataFrame(data, index=[0]) 24 | return features 25 | 26 | df = user_input_features() 27 | 28 | st.subheader('User Input parameters') 29 | st.write(df) 30 | 31 | iris = datasets.load_iris() 32 | X = iris.data 33 | Y = iris.target 34 | 35 | clf = RandomForestClassifier() 36 | clf.fit(X, Y) 37 | 38 | prediction = clf.predict(df) 39 | prediction_proba = clf.predict_proba(df) 40 | 41 | st.subheader('Class labels and their corresponding index number') 42 | st.write(iris.target_names) 43 | 44 | st.subheader('Prediction') 45 | st.write(iris.target_names[prediction]) 46 | #st.write(prediction) 47 | 48 | st.subheader('Prediction Probability') 49 | st.write(prediction_proba) 50 | -------------------------------------------------------------------------------- /streamlit/part3/penguins-app.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import pandas as pd 3 | import numpy as np 4 | import pickle 5 | from sklearn.ensemble import RandomForestClassifier 6 | 7 | st.write(""" 8 | # Penguin Prediction App 9 | 10 | This app predicts the **Palmer Penguin** species! 11 | 12 | Data obtained from the [palmerpenguins library](https://github.com/allisonhorst/palmerpenguins) in R by Allison Horst. 
13 | """) 14 | 15 | st.sidebar.header('User Input Features') 16 | 17 | st.sidebar.markdown(""" 18 | [Example CSV input file](https://raw.githubusercontent.com/dataprofessor/data/master/penguins_example.csv) 19 | """) 20 | 21 | # Collects user input features into dataframe 22 | uploaded_file = st.sidebar.file_uploader("Upload your input CSV file", type=["csv"]) 23 | if uploaded_file is not None: 24 | input_df = pd.read_csv(uploaded_file) 25 | else: 26 | def user_input_features(): 27 | island = st.sidebar.selectbox('Island',('Biscoe','Dream','Torgersen')) 28 | sex = st.sidebar.selectbox('Sex',('male','female')) 29 | bill_length_mm = st.sidebar.slider('Bill length (mm)', 32.1,59.6,43.9) 30 | bill_depth_mm = st.sidebar.slider('Bill depth (mm)', 13.1,21.5,17.2) 31 | flipper_length_mm = st.sidebar.slider('Flipper length (mm)', 172.0,231.0,201.0) 32 | body_mass_g = st.sidebar.slider('Body mass (g)', 2700.0,6300.0,4207.0) 33 | data = {'island': island, 34 | 'bill_length_mm': bill_length_mm, 35 | 'bill_depth_mm': bill_depth_mm, 36 | 'flipper_length_mm': flipper_length_mm, 37 | 'body_mass_g': body_mass_g, 38 | 'sex': sex} 39 | features = pd.DataFrame(data, index=[0]) 40 | return features 41 | input_df = user_input_features() 42 | 43 | # Combines user input features with entire penguins dataset 44 | # This will be useful for the encoding phase 45 | penguins_raw = pd.read_csv('penguins_cleaned.csv') 46 | penguins = penguins_raw.drop(columns=['species']) 47 | df = pd.concat([input_df,penguins],axis=0) 48 | 49 | # Encoding of ordinal features 50 | # https://www.kaggle.com/pratik1120/penguin-dataset-eda-classification-and-clustering 51 | encode = ['sex','island'] 52 | for col in encode: 53 | dummy = pd.get_dummies(df[col], prefix=col) 54 | df = pd.concat([df,dummy], axis=1) 55 | del df[col] 56 | df = df[:1] # Selects only the first row (the user input data) 57 | 58 | # Displays the user input features 59 | st.subheader('User Input features') 60 | 61 | if uploaded_file is not None: 62 | st.write(df) 63 | else: 64 | st.write('Awaiting CSV file to be uploaded. 
Currently using example input parameters (shown below).') 65 | st.write(df) 66 | 67 | # Reads in saved classification model 68 | load_clf = pickle.load(open('penguins_clf.pkl', 'rb')) 69 | 70 | # Apply model to make predictions 71 | prediction = load_clf.predict(df) 72 | prediction_proba = load_clf.predict_proba(df) 73 | 74 | 75 | st.subheader('Prediction') 76 | penguins_species = np.array(['Adelie','Chinstrap','Gentoo']) 77 | st.write(penguins_species[prediction]) 78 | 79 | st.subheader('Prediction Probability') 80 | st.write(prediction_proba) 81 | -------------------------------------------------------------------------------- /streamlit/part3/penguins-model-building.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | penguins = pd.read_csv('penguins_cleaned.csv') 3 | 4 | # Ordinal feature encoding 5 | # https://www.kaggle.com/pratik1120/penguin-dataset-eda-classification-and-clustering 6 | df = penguins.copy() 7 | target = 'species' 8 | encode = ['sex','island'] 9 | 10 | for col in encode: 11 | dummy = pd.get_dummies(df[col], prefix=col) 12 | df = pd.concat([df,dummy], axis=1) 13 | del df[col] 14 | 15 | target_mapper = {'Adelie':0, 'Chinstrap':1, 'Gentoo':2} 16 | def target_encode(val): 17 | return target_mapper[val] 18 | 19 | df['species'] = df['species'].apply(target_encode) 20 | 21 | # Separating X and y 22 | X = df.drop('species', axis=1) 23 | Y = df['species'] 24 | 25 | # Build random forest model 26 | from sklearn.ensemble import RandomForestClassifier 27 | clf = RandomForestClassifier() 28 | clf.fit(X, Y) 29 | 30 | # Saving the model 31 | import pickle 32 | pickle.dump(clf, open('penguins_clf.pkl', 'wb')) 33 | -------------------------------------------------------------------------------- /streamlit/part3/penguins_cleaned.csv: -------------------------------------------------------------------------------- 1 | "species","island","bill_length_mm","bill_depth_mm","flipper_length_mm","body_mass_g","sex" 2 | "Adelie","Torgersen",39.1,18.7,181,3750,"male" 3 | "Adelie","Torgersen",39.5,17.4,186,3800,"female" 4 | "Adelie","Torgersen",40.3,18,195,3250,"female" 5 | "Adelie","Torgersen",36.7,19.3,193,3450,"female" 6 | "Adelie","Torgersen",39.3,20.6,190,3650,"male" 7 | "Adelie","Torgersen",38.9,17.8,181,3625,"female" 8 | "Adelie","Torgersen",39.2,19.6,195,4675,"male" 9 | "Adelie","Torgersen",41.1,17.6,182,3200,"female" 10 | "Adelie","Torgersen",38.6,21.2,191,3800,"male" 11 | "Adelie","Torgersen",34.6,21.1,198,4400,"male" 12 | "Adelie","Torgersen",36.6,17.8,185,3700,"female" 13 | "Adelie","Torgersen",38.7,19,195,3450,"female" 14 | "Adelie","Torgersen",42.5,20.7,197,4500,"male" 15 | "Adelie","Torgersen",34.4,18.4,184,3325,"female" 16 | "Adelie","Torgersen",46,21.5,194,4200,"male" 17 | "Adelie","Biscoe",37.8,18.3,174,3400,"female" 18 | "Adelie","Biscoe",37.7,18.7,180,3600,"male" 19 | "Adelie","Biscoe",35.9,19.2,189,3800,"female" 20 | "Adelie","Biscoe",38.2,18.1,185,3950,"male" 21 | "Adelie","Biscoe",38.8,17.2,180,3800,"male" 22 | "Adelie","Biscoe",35.3,18.9,187,3800,"female" 23 | "Adelie","Biscoe",40.6,18.6,183,3550,"male" 24 | "Adelie","Biscoe",40.5,17.9,187,3200,"female" 25 | "Adelie","Biscoe",37.9,18.6,172,3150,"female" 26 | "Adelie","Biscoe",40.5,18.9,180,3950,"male" 27 | "Adelie","Dream",39.5,16.7,178,3250,"female" 28 | "Adelie","Dream",37.2,18.1,178,3900,"male" 29 | "Adelie","Dream",39.5,17.8,188,3300,"female" 30 | "Adelie","Dream",40.9,18.9,184,3900,"male" 31 | "Adelie","Dream",36.4,17,195,3325,"female" 32 | 
"Adelie","Dream",39.2,21.1,196,4150,"male" 33 | "Adelie","Dream",38.8,20,190,3950,"male" 34 | "Adelie","Dream",42.2,18.5,180,3550,"female" 35 | "Adelie","Dream",37.6,19.3,181,3300,"female" 36 | "Adelie","Dream",39.8,19.1,184,4650,"male" 37 | "Adelie","Dream",36.5,18,182,3150,"female" 38 | "Adelie","Dream",40.8,18.4,195,3900,"male" 39 | "Adelie","Dream",36,18.5,186,3100,"female" 40 | "Adelie","Dream",44.1,19.7,196,4400,"male" 41 | "Adelie","Dream",37,16.9,185,3000,"female" 42 | "Adelie","Dream",39.6,18.8,190,4600,"male" 43 | "Adelie","Dream",41.1,19,182,3425,"male" 44 | "Adelie","Dream",36,17.9,190,3450,"female" 45 | "Adelie","Dream",42.3,21.2,191,4150,"male" 46 | "Adelie","Biscoe",39.6,17.7,186,3500,"female" 47 | "Adelie","Biscoe",40.1,18.9,188,4300,"male" 48 | "Adelie","Biscoe",35,17.9,190,3450,"female" 49 | "Adelie","Biscoe",42,19.5,200,4050,"male" 50 | "Adelie","Biscoe",34.5,18.1,187,2900,"female" 51 | "Adelie","Biscoe",41.4,18.6,191,3700,"male" 52 | "Adelie","Biscoe",39,17.5,186,3550,"female" 53 | "Adelie","Biscoe",40.6,18.8,193,3800,"male" 54 | "Adelie","Biscoe",36.5,16.6,181,2850,"female" 55 | "Adelie","Biscoe",37.6,19.1,194,3750,"male" 56 | "Adelie","Biscoe",35.7,16.9,185,3150,"female" 57 | "Adelie","Biscoe",41.3,21.1,195,4400,"male" 58 | "Adelie","Biscoe",37.6,17,185,3600,"female" 59 | "Adelie","Biscoe",41.1,18.2,192,4050,"male" 60 | "Adelie","Biscoe",36.4,17.1,184,2850,"female" 61 | "Adelie","Biscoe",41.6,18,192,3950,"male" 62 | "Adelie","Biscoe",35.5,16.2,195,3350,"female" 63 | "Adelie","Biscoe",41.1,19.1,188,4100,"male" 64 | "Adelie","Torgersen",35.9,16.6,190,3050,"female" 65 | "Adelie","Torgersen",41.8,19.4,198,4450,"male" 66 | "Adelie","Torgersen",33.5,19,190,3600,"female" 67 | "Adelie","Torgersen",39.7,18.4,190,3900,"male" 68 | "Adelie","Torgersen",39.6,17.2,196,3550,"female" 69 | "Adelie","Torgersen",45.8,18.9,197,4150,"male" 70 | "Adelie","Torgersen",35.5,17.5,190,3700,"female" 71 | "Adelie","Torgersen",42.8,18.5,195,4250,"male" 72 | "Adelie","Torgersen",40.9,16.8,191,3700,"female" 73 | "Adelie","Torgersen",37.2,19.4,184,3900,"male" 74 | "Adelie","Torgersen",36.2,16.1,187,3550,"female" 75 | "Adelie","Torgersen",42.1,19.1,195,4000,"male" 76 | "Adelie","Torgersen",34.6,17.2,189,3200,"female" 77 | "Adelie","Torgersen",42.9,17.6,196,4700,"male" 78 | "Adelie","Torgersen",36.7,18.8,187,3800,"female" 79 | "Adelie","Torgersen",35.1,19.4,193,4200,"male" 80 | "Adelie","Dream",37.3,17.8,191,3350,"female" 81 | "Adelie","Dream",41.3,20.3,194,3550,"male" 82 | "Adelie","Dream",36.3,19.5,190,3800,"male" 83 | "Adelie","Dream",36.9,18.6,189,3500,"female" 84 | "Adelie","Dream",38.3,19.2,189,3950,"male" 85 | "Adelie","Dream",38.9,18.8,190,3600,"female" 86 | "Adelie","Dream",35.7,18,202,3550,"female" 87 | "Adelie","Dream",41.1,18.1,205,4300,"male" 88 | "Adelie","Dream",34,17.1,185,3400,"female" 89 | "Adelie","Dream",39.6,18.1,186,4450,"male" 90 | "Adelie","Dream",36.2,17.3,187,3300,"female" 91 | "Adelie","Dream",40.8,18.9,208,4300,"male" 92 | "Adelie","Dream",38.1,18.6,190,3700,"female" 93 | "Adelie","Dream",40.3,18.5,196,4350,"male" 94 | "Adelie","Dream",33.1,16.1,178,2900,"female" 95 | "Adelie","Dream",43.2,18.5,192,4100,"male" 96 | "Adelie","Biscoe",35,17.9,192,3725,"female" 97 | "Adelie","Biscoe",41,20,203,4725,"male" 98 | "Adelie","Biscoe",37.7,16,183,3075,"female" 99 | "Adelie","Biscoe",37.8,20,190,4250,"male" 100 | "Adelie","Biscoe",37.9,18.6,193,2925,"female" 101 | "Adelie","Biscoe",39.7,18.9,184,3550,"male" 102 | "Adelie","Biscoe",38.6,17.2,199,3750,"female" 103 | 
"Adelie","Biscoe",38.2,20,190,3900,"male" 104 | "Adelie","Biscoe",38.1,17,181,3175,"female" 105 | "Adelie","Biscoe",43.2,19,197,4775,"male" 106 | "Adelie","Biscoe",38.1,16.5,198,3825,"female" 107 | "Adelie","Biscoe",45.6,20.3,191,4600,"male" 108 | "Adelie","Biscoe",39.7,17.7,193,3200,"female" 109 | "Adelie","Biscoe",42.2,19.5,197,4275,"male" 110 | "Adelie","Biscoe",39.6,20.7,191,3900,"female" 111 | "Adelie","Biscoe",42.7,18.3,196,4075,"male" 112 | "Adelie","Torgersen",38.6,17,188,2900,"female" 113 | "Adelie","Torgersen",37.3,20.5,199,3775,"male" 114 | "Adelie","Torgersen",35.7,17,189,3350,"female" 115 | "Adelie","Torgersen",41.1,18.6,189,3325,"male" 116 | "Adelie","Torgersen",36.2,17.2,187,3150,"female" 117 | "Adelie","Torgersen",37.7,19.8,198,3500,"male" 118 | "Adelie","Torgersen",40.2,17,176,3450,"female" 119 | "Adelie","Torgersen",41.4,18.5,202,3875,"male" 120 | "Adelie","Torgersen",35.2,15.9,186,3050,"female" 121 | "Adelie","Torgersen",40.6,19,199,4000,"male" 122 | "Adelie","Torgersen",38.8,17.6,191,3275,"female" 123 | "Adelie","Torgersen",41.5,18.3,195,4300,"male" 124 | "Adelie","Torgersen",39,17.1,191,3050,"female" 125 | "Adelie","Torgersen",44.1,18,210,4000,"male" 126 | "Adelie","Torgersen",38.5,17.9,190,3325,"female" 127 | "Adelie","Torgersen",43.1,19.2,197,3500,"male" 128 | "Adelie","Dream",36.8,18.5,193,3500,"female" 129 | "Adelie","Dream",37.5,18.5,199,4475,"male" 130 | "Adelie","Dream",38.1,17.6,187,3425,"female" 131 | "Adelie","Dream",41.1,17.5,190,3900,"male" 132 | "Adelie","Dream",35.6,17.5,191,3175,"female" 133 | "Adelie","Dream",40.2,20.1,200,3975,"male" 134 | "Adelie","Dream",37,16.5,185,3400,"female" 135 | "Adelie","Dream",39.7,17.9,193,4250,"male" 136 | "Adelie","Dream",40.2,17.1,193,3400,"female" 137 | "Adelie","Dream",40.6,17.2,187,3475,"male" 138 | "Adelie","Dream",32.1,15.5,188,3050,"female" 139 | "Adelie","Dream",40.7,17,190,3725,"male" 140 | "Adelie","Dream",37.3,16.8,192,3000,"female" 141 | "Adelie","Dream",39,18.7,185,3650,"male" 142 | "Adelie","Dream",39.2,18.6,190,4250,"male" 143 | "Adelie","Dream",36.6,18.4,184,3475,"female" 144 | "Adelie","Dream",36,17.8,195,3450,"female" 145 | "Adelie","Dream",37.8,18.1,193,3750,"male" 146 | "Adelie","Dream",36,17.1,187,3700,"female" 147 | "Adelie","Dream",41.5,18.5,201,4000,"male" 148 | "Gentoo","Biscoe",46.1,13.2,211,4500,"female" 149 | "Gentoo","Biscoe",50,16.3,230,5700,"male" 150 | "Gentoo","Biscoe",48.7,14.1,210,4450,"female" 151 | "Gentoo","Biscoe",50,15.2,218,5700,"male" 152 | "Gentoo","Biscoe",47.6,14.5,215,5400,"male" 153 | "Gentoo","Biscoe",46.5,13.5,210,4550,"female" 154 | "Gentoo","Biscoe",45.4,14.6,211,4800,"female" 155 | "Gentoo","Biscoe",46.7,15.3,219,5200,"male" 156 | "Gentoo","Biscoe",43.3,13.4,209,4400,"female" 157 | "Gentoo","Biscoe",46.8,15.4,215,5150,"male" 158 | "Gentoo","Biscoe",40.9,13.7,214,4650,"female" 159 | "Gentoo","Biscoe",49,16.1,216,5550,"male" 160 | "Gentoo","Biscoe",45.5,13.7,214,4650,"female" 161 | "Gentoo","Biscoe",48.4,14.6,213,5850,"male" 162 | "Gentoo","Biscoe",45.8,14.6,210,4200,"female" 163 | "Gentoo","Biscoe",49.3,15.7,217,5850,"male" 164 | "Gentoo","Biscoe",42,13.5,210,4150,"female" 165 | "Gentoo","Biscoe",49.2,15.2,221,6300,"male" 166 | "Gentoo","Biscoe",46.2,14.5,209,4800,"female" 167 | "Gentoo","Biscoe",48.7,15.1,222,5350,"male" 168 | "Gentoo","Biscoe",50.2,14.3,218,5700,"male" 169 | "Gentoo","Biscoe",45.1,14.5,215,5000,"female" 170 | "Gentoo","Biscoe",46.5,14.5,213,4400,"female" 171 | "Gentoo","Biscoe",46.3,15.8,215,5050,"male" 172 | 
"Gentoo","Biscoe",42.9,13.1,215,5000,"female" 173 | "Gentoo","Biscoe",46.1,15.1,215,5100,"male" 174 | "Gentoo","Biscoe",47.8,15,215,5650,"male" 175 | "Gentoo","Biscoe",48.2,14.3,210,4600,"female" 176 | "Gentoo","Biscoe",50,15.3,220,5550,"male" 177 | "Gentoo","Biscoe",47.3,15.3,222,5250,"male" 178 | "Gentoo","Biscoe",42.8,14.2,209,4700,"female" 179 | "Gentoo","Biscoe",45.1,14.5,207,5050,"female" 180 | "Gentoo","Biscoe",59.6,17,230,6050,"male" 181 | "Gentoo","Biscoe",49.1,14.8,220,5150,"female" 182 | "Gentoo","Biscoe",48.4,16.3,220,5400,"male" 183 | "Gentoo","Biscoe",42.6,13.7,213,4950,"female" 184 | "Gentoo","Biscoe",44.4,17.3,219,5250,"male" 185 | "Gentoo","Biscoe",44,13.6,208,4350,"female" 186 | "Gentoo","Biscoe",48.7,15.7,208,5350,"male" 187 | "Gentoo","Biscoe",42.7,13.7,208,3950,"female" 188 | "Gentoo","Biscoe",49.6,16,225,5700,"male" 189 | "Gentoo","Biscoe",45.3,13.7,210,4300,"female" 190 | "Gentoo","Biscoe",49.6,15,216,4750,"male" 191 | "Gentoo","Biscoe",50.5,15.9,222,5550,"male" 192 | "Gentoo","Biscoe",43.6,13.9,217,4900,"female" 193 | "Gentoo","Biscoe",45.5,13.9,210,4200,"female" 194 | "Gentoo","Biscoe",50.5,15.9,225,5400,"male" 195 | "Gentoo","Biscoe",44.9,13.3,213,5100,"female" 196 | "Gentoo","Biscoe",45.2,15.8,215,5300,"male" 197 | "Gentoo","Biscoe",46.6,14.2,210,4850,"female" 198 | "Gentoo","Biscoe",48.5,14.1,220,5300,"male" 199 | "Gentoo","Biscoe",45.1,14.4,210,4400,"female" 200 | "Gentoo","Biscoe",50.1,15,225,5000,"male" 201 | "Gentoo","Biscoe",46.5,14.4,217,4900,"female" 202 | "Gentoo","Biscoe",45,15.4,220,5050,"male" 203 | "Gentoo","Biscoe",43.8,13.9,208,4300,"female" 204 | "Gentoo","Biscoe",45.5,15,220,5000,"male" 205 | "Gentoo","Biscoe",43.2,14.5,208,4450,"female" 206 | "Gentoo","Biscoe",50.4,15.3,224,5550,"male" 207 | "Gentoo","Biscoe",45.3,13.8,208,4200,"female" 208 | "Gentoo","Biscoe",46.2,14.9,221,5300,"male" 209 | "Gentoo","Biscoe",45.7,13.9,214,4400,"female" 210 | "Gentoo","Biscoe",54.3,15.7,231,5650,"male" 211 | "Gentoo","Biscoe",45.8,14.2,219,4700,"female" 212 | "Gentoo","Biscoe",49.8,16.8,230,5700,"male" 213 | "Gentoo","Biscoe",49.5,16.2,229,5800,"male" 214 | "Gentoo","Biscoe",43.5,14.2,220,4700,"female" 215 | "Gentoo","Biscoe",50.7,15,223,5550,"male" 216 | "Gentoo","Biscoe",47.7,15,216,4750,"female" 217 | "Gentoo","Biscoe",46.4,15.6,221,5000,"male" 218 | "Gentoo","Biscoe",48.2,15.6,221,5100,"male" 219 | "Gentoo","Biscoe",46.5,14.8,217,5200,"female" 220 | "Gentoo","Biscoe",46.4,15,216,4700,"female" 221 | "Gentoo","Biscoe",48.6,16,230,5800,"male" 222 | "Gentoo","Biscoe",47.5,14.2,209,4600,"female" 223 | "Gentoo","Biscoe",51.1,16.3,220,6000,"male" 224 | "Gentoo","Biscoe",45.2,13.8,215,4750,"female" 225 | "Gentoo","Biscoe",45.2,16.4,223,5950,"male" 226 | "Gentoo","Biscoe",49.1,14.5,212,4625,"female" 227 | "Gentoo","Biscoe",52.5,15.6,221,5450,"male" 228 | "Gentoo","Biscoe",47.4,14.6,212,4725,"female" 229 | "Gentoo","Biscoe",50,15.9,224,5350,"male" 230 | "Gentoo","Biscoe",44.9,13.8,212,4750,"female" 231 | "Gentoo","Biscoe",50.8,17.3,228,5600,"male" 232 | "Gentoo","Biscoe",43.4,14.4,218,4600,"female" 233 | "Gentoo","Biscoe",51.3,14.2,218,5300,"male" 234 | "Gentoo","Biscoe",47.5,14,212,4875,"female" 235 | "Gentoo","Biscoe",52.1,17,230,5550,"male" 236 | "Gentoo","Biscoe",47.5,15,218,4950,"female" 237 | "Gentoo","Biscoe",52.2,17.1,228,5400,"male" 238 | "Gentoo","Biscoe",45.5,14.5,212,4750,"female" 239 | "Gentoo","Biscoe",49.5,16.1,224,5650,"male" 240 | "Gentoo","Biscoe",44.5,14.7,214,4850,"female" 241 | "Gentoo","Biscoe",50.8,15.7,226,5200,"male" 242 | 
"Gentoo","Biscoe",49.4,15.8,216,4925,"male" 243 | "Gentoo","Biscoe",46.9,14.6,222,4875,"female" 244 | "Gentoo","Biscoe",48.4,14.4,203,4625,"female" 245 | "Gentoo","Biscoe",51.1,16.5,225,5250,"male" 246 | "Gentoo","Biscoe",48.5,15,219,4850,"female" 247 | "Gentoo","Biscoe",55.9,17,228,5600,"male" 248 | "Gentoo","Biscoe",47.2,15.5,215,4975,"female" 249 | "Gentoo","Biscoe",49.1,15,228,5500,"male" 250 | "Gentoo","Biscoe",46.8,16.1,215,5500,"male" 251 | "Gentoo","Biscoe",41.7,14.7,210,4700,"female" 252 | "Gentoo","Biscoe",53.4,15.8,219,5500,"male" 253 | "Gentoo","Biscoe",43.3,14,208,4575,"female" 254 | "Gentoo","Biscoe",48.1,15.1,209,5500,"male" 255 | "Gentoo","Biscoe",50.5,15.2,216,5000,"female" 256 | "Gentoo","Biscoe",49.8,15.9,229,5950,"male" 257 | "Gentoo","Biscoe",43.5,15.2,213,4650,"female" 258 | "Gentoo","Biscoe",51.5,16.3,230,5500,"male" 259 | "Gentoo","Biscoe",46.2,14.1,217,4375,"female" 260 | "Gentoo","Biscoe",55.1,16,230,5850,"male" 261 | "Gentoo","Biscoe",48.8,16.2,222,6000,"male" 262 | "Gentoo","Biscoe",47.2,13.7,214,4925,"female" 263 | "Gentoo","Biscoe",46.8,14.3,215,4850,"female" 264 | "Gentoo","Biscoe",50.4,15.7,222,5750,"male" 265 | "Gentoo","Biscoe",45.2,14.8,212,5200,"female" 266 | "Gentoo","Biscoe",49.9,16.1,213,5400,"male" 267 | "Chinstrap","Dream",46.5,17.9,192,3500,"female" 268 | "Chinstrap","Dream",50,19.5,196,3900,"male" 269 | "Chinstrap","Dream",51.3,19.2,193,3650,"male" 270 | "Chinstrap","Dream",45.4,18.7,188,3525,"female" 271 | "Chinstrap","Dream",52.7,19.8,197,3725,"male" 272 | "Chinstrap","Dream",45.2,17.8,198,3950,"female" 273 | "Chinstrap","Dream",46.1,18.2,178,3250,"female" 274 | "Chinstrap","Dream",51.3,18.2,197,3750,"male" 275 | "Chinstrap","Dream",46,18.9,195,4150,"female" 276 | "Chinstrap","Dream",51.3,19.9,198,3700,"male" 277 | "Chinstrap","Dream",46.6,17.8,193,3800,"female" 278 | "Chinstrap","Dream",51.7,20.3,194,3775,"male" 279 | "Chinstrap","Dream",47,17.3,185,3700,"female" 280 | "Chinstrap","Dream",52,18.1,201,4050,"male" 281 | "Chinstrap","Dream",45.9,17.1,190,3575,"female" 282 | "Chinstrap","Dream",50.5,19.6,201,4050,"male" 283 | "Chinstrap","Dream",50.3,20,197,3300,"male" 284 | "Chinstrap","Dream",58,17.8,181,3700,"female" 285 | "Chinstrap","Dream",46.4,18.6,190,3450,"female" 286 | "Chinstrap","Dream",49.2,18.2,195,4400,"male" 287 | "Chinstrap","Dream",42.4,17.3,181,3600,"female" 288 | "Chinstrap","Dream",48.5,17.5,191,3400,"male" 289 | "Chinstrap","Dream",43.2,16.6,187,2900,"female" 290 | "Chinstrap","Dream",50.6,19.4,193,3800,"male" 291 | "Chinstrap","Dream",46.7,17.9,195,3300,"female" 292 | "Chinstrap","Dream",52,19,197,4150,"male" 293 | "Chinstrap","Dream",50.5,18.4,200,3400,"female" 294 | "Chinstrap","Dream",49.5,19,200,3800,"male" 295 | "Chinstrap","Dream",46.4,17.8,191,3700,"female" 296 | "Chinstrap","Dream",52.8,20,205,4550,"male" 297 | "Chinstrap","Dream",40.9,16.6,187,3200,"female" 298 | "Chinstrap","Dream",54.2,20.8,201,4300,"male" 299 | "Chinstrap","Dream",42.5,16.7,187,3350,"female" 300 | "Chinstrap","Dream",51,18.8,203,4100,"male" 301 | "Chinstrap","Dream",49.7,18.6,195,3600,"male" 302 | "Chinstrap","Dream",47.5,16.8,199,3900,"female" 303 | "Chinstrap","Dream",47.6,18.3,195,3850,"female" 304 | "Chinstrap","Dream",52,20.7,210,4800,"male" 305 | "Chinstrap","Dream",46.9,16.6,192,2700,"female" 306 | "Chinstrap","Dream",53.5,19.9,205,4500,"male" 307 | "Chinstrap","Dream",49,19.5,210,3950,"male" 308 | "Chinstrap","Dream",46.2,17.5,187,3650,"female" 309 | "Chinstrap","Dream",50.9,19.1,196,3550,"male" 310 | 
"Chinstrap","Dream",45.5,17,196,3500,"female" 311 | "Chinstrap","Dream",50.9,17.9,196,3675,"female" 312 | "Chinstrap","Dream",50.8,18.5,201,4450,"male" 313 | "Chinstrap","Dream",50.1,17.9,190,3400,"female" 314 | "Chinstrap","Dream",49,19.6,212,4300,"male" 315 | "Chinstrap","Dream",51.5,18.7,187,3250,"male" 316 | "Chinstrap","Dream",49.8,17.3,198,3675,"female" 317 | "Chinstrap","Dream",48.1,16.4,199,3325,"female" 318 | "Chinstrap","Dream",51.4,19,201,3950,"male" 319 | "Chinstrap","Dream",45.7,17.3,193,3600,"female" 320 | "Chinstrap","Dream",50.7,19.7,203,4050,"male" 321 | "Chinstrap","Dream",42.5,17.3,187,3350,"female" 322 | "Chinstrap","Dream",52.2,18.8,197,3450,"male" 323 | "Chinstrap","Dream",45.2,16.6,191,3250,"female" 324 | "Chinstrap","Dream",49.3,19.9,203,4050,"male" 325 | "Chinstrap","Dream",50.2,18.8,202,3800,"male" 326 | "Chinstrap","Dream",45.6,19.4,194,3525,"female" 327 | "Chinstrap","Dream",51.9,19.5,206,3950,"male" 328 | "Chinstrap","Dream",46.8,16.5,189,3650,"female" 329 | "Chinstrap","Dream",45.7,17,195,3650,"female" 330 | "Chinstrap","Dream",55.8,19.8,207,4000,"male" 331 | "Chinstrap","Dream",43.5,18.1,202,3400,"female" 332 | "Chinstrap","Dream",49.6,18.2,193,3775,"male" 333 | "Chinstrap","Dream",50.8,19,210,4100,"male" 334 | "Chinstrap","Dream",50.2,18.7,198,3775,"female" 335 | -------------------------------------------------------------------------------- /streamlit/part3/penguins_clf.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataprofessor/code/d494f8093073990fcf77061bd740a0e4c2d40020/streamlit/part3/penguins_clf.pkl -------------------------------------------------------------------------------- /streamlit/part3/penguins_example.csv: -------------------------------------------------------------------------------- 1 | island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex 2 | Biscoe,43.9,17.2,201.0,4207.0,male 3 | -------------------------------------------------------------------------------- /streamlit/part5/basketball_app.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import pandas as pd 3 | import base64 4 | import matplotlib.pyplot as plt 5 | import seaborn as sns 6 | import numpy as np 7 | 8 | st.title('NBA Player Stats Explorer') 9 | 10 | st.markdown(""" 11 | This app performs simple webscraping of NBA player stats data! 12 | * **Python libraries:** base64, pandas, streamlit 13 | * **Data source:** [Basketball-reference.com](https://www.basketball-reference.com/). 
14 | """) 15 | 16 | st.sidebar.header('User Input Features') 17 | selected_year = st.sidebar.selectbox('Year', list(reversed(range(1950,2020)))) 18 | 19 | # Web scraping of NBA player stats 20 | @st.cache 21 | def load_data(year): 22 | url = "https://www.basketball-reference.com/leagues/NBA_" + str(year) + "_per_game.html" 23 | html = pd.read_html(url, header = 0) 24 | df = html[0] 25 | raw = df.drop(df[df.Age == 'Age'].index) # Deletes repeating headers in content 26 | raw = raw.fillna(0) 27 | playerstats = raw.drop(['Rk'], axis=1) 28 | return playerstats 29 | playerstats = load_data(selected_year) 30 | 31 | # Sidebar - Team selection 32 | sorted_unique_team = sorted(playerstats.Tm.unique()) 33 | selected_team = st.sidebar.multiselect('Team', sorted_unique_team, sorted_unique_team) 34 | 35 | # Sidebar - Position selection 36 | unique_pos = ['C','PF','SF','PG','SG'] 37 | selected_pos = st.sidebar.multiselect('Position', unique_pos, unique_pos) 38 | 39 | # Filtering data 40 | df_selected_team = playerstats[(playerstats.Tm.isin(selected_team)) & (playerstats.Pos.isin(selected_pos))] 41 | 42 | st.header('Display Player Stats of Selected Team(s)') 43 | st.write('Data Dimension: ' + str(df_selected_team.shape[0]) + ' rows and ' + str(df_selected_team.shape[1]) + ' columns.') 44 | st.dataframe(df_selected_team) 45 | 46 | # Download NBA player stats data 47 | # https://discuss.streamlit.io/t/how-to-download-file-in-streamlit/1806 48 | def filedownload(df): 49 | csv = df.to_csv(index=False) 50 | b64 = base64.b64encode(csv.encode()).decode() # strings <-> bytes conversions 51 | href = f'Download CSV File' 52 | return href 53 | 54 | st.markdown(filedownload(df_selected_team), unsafe_allow_html=True) 55 | 56 | # Heatmap 57 | if st.button('Intercorrelation Heatmap'): 58 | st.header('Intercorrelation Matrix Heatmap') 59 | df_selected_team.to_csv('output.csv',index=False) 60 | df = pd.read_csv('output.csv') 61 | 62 | corr = df.corr() 63 | mask = np.zeros_like(corr) 64 | mask[np.triu_indices_from(mask)] = True 65 | with sns.axes_style("white"): 66 | f, ax = plt.subplots(figsize=(7, 5)) 67 | ax = sns.heatmap(corr, mask=mask, vmax=1, square=True) 68 | st.pyplot() 69 | -------------------------------------------------------------------------------- /streamlit/part6/boston-house-ml-app.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import pandas as pd 3 | import shap 4 | import matplotlib.pyplot as plt 5 | from sklearn import datasets 6 | from sklearn.ensemble import RandomForestRegressor 7 | 8 | st.write(""" 9 | # Boston House Price Prediction App 10 | 11 | This app predicts the **Boston House Price**! 
12 | """) 13 | st.write('---') 14 | 15 | # Loads the Boston House Price Dataset 16 | boston = datasets.load_boston() 17 | X = pd.DataFrame(boston.data, columns=boston.feature_names) 18 | Y = pd.DataFrame(boston.target, columns=["MEDV"]) 19 | 20 | # Sidebar 21 | # Header of Specify Input Parameters 22 | st.sidebar.header('Specify Input Parameters') 23 | 24 | def user_input_features(): 25 | CRIM = st.sidebar.slider('CRIM', X.CRIM.min(), X.CRIM.max(), X.CRIM.mean()) 26 | ZN = st.sidebar.slider('ZN', X.ZN.min(), X.ZN.max(), X.ZN.mean()) 27 | INDUS = st.sidebar.slider('INDUS', X.INDUS.min(), X.INDUS.max(), X.INDUS.mean()) 28 | CHAS = st.sidebar.slider('CHAS', X.CHAS.min(), X.CHAS.max(), X.CHAS.mean()) 29 | NOX = st.sidebar.slider('NOX', X.NOX.min(), X.NOX.max(), X.NOX.mean()) 30 | RM = st.sidebar.slider('RM', X.RM.min(), X.RM.max(), X.RM.mean()) 31 | AGE = st.sidebar.slider('AGE', X.AGE.min(), X.AGE.max(), X.AGE.mean()) 32 | DIS = st.sidebar.slider('DIS', X.DIS.min(), X.DIS.max(), X.DIS.mean()) 33 | RAD = st.sidebar.slider('RAD', X.RAD.min(), X.RAD.max(), X.RAD.mean()) 34 | TAX = st.sidebar.slider('TAX', X.TAX.min(), X.TAX.max(), X.TAX.mean()) 35 | PTRATIO = st.sidebar.slider('PTRATIO', X.PTRATIO.min(), X.PTRATIO.max(), X.PTRATIO.mean()) 36 | B = st.sidebar.slider('B', X.B.min(), X.B.max(), X.B.mean()) 37 | LSTAT = st.sidebar.slider('LSTAT', X.LSTAT.min(), X.LSTAT.max(), X.LSTAT.mean()) 38 | data = {'CRIM': CRIM, 39 | 'ZN': ZN, 40 | 'INDUS': INDUS, 41 | 'CHAS': CHAS, 42 | 'NOX': NOX, 43 | 'RM': RM, 44 | 'AGE': AGE, 45 | 'DIS': DIS, 46 | 'RAD': RAD, 47 | 'TAX': TAX, 48 | 'PTRATIO': PTRATIO, 49 | 'B': B, 50 | 'LSTAT': LSTAT} 51 | features = pd.DataFrame(data, index=[0]) 52 | return features 53 | 54 | df = user_input_features() 55 | 56 | # Main Panel 57 | 58 | # Print specified input parameters 59 | st.header('Specified Input parameters') 60 | st.write(df) 61 | st.write('---') 62 | 63 | # Build Regression Model 64 | model = RandomForestRegressor() 65 | model.fit(X, Y) 66 | # Apply Model to Make Prediction 67 | prediction = model.predict(df) 68 | 69 | st.header('Prediction of MEDV') 70 | st.write(prediction) 71 | st.write('---') 72 | 73 | # Explaining the model's predictions using SHAP values 74 | # https://github.com/slundberg/shap 75 | explainer = shap.TreeExplainer(model) 76 | shap_values = explainer.shap_values(X) 77 | 78 | st.header('Feature Importance') 79 | plt.title('Feature importance based on SHAP values') 80 | shap.summary_plot(shap_values, X) 81 | st.pyplot(bbox_inches='tight') 82 | st.write('---') 83 | 84 | plt.title('Feature importance based on SHAP values (Bar)') 85 | shap.summary_plot(shap_values, X, plot_type="bar") 86 | st.pyplot(bbox_inches='tight') 87 | -------------------------------------------------------------------------------- /streamlit/part7/solubility-app.py: -------------------------------------------------------------------------------- 1 | ###################### 2 | # Import libraries 3 | ###################### 4 | import numpy as np 5 | import pandas as pd 6 | import streamlit as st 7 | import pickle 8 | from PIL import Image 9 | from rdkit import Chem 10 | from rdkit.Chem import Descriptors 11 | 12 | ###################### 13 | # Custom function 14 | ###################### 15 | ## Calculate molecular descriptors 16 | def AromaticProportion(m): 17 | aromatic_atoms = [m.GetAtomWithIdx(i).GetIsAromatic() for i in range(m.GetNumAtoms())] 18 | aa_count = [] 19 | for i in aromatic_atoms: 20 | if i==True: 21 | aa_count.append(1) 22 | AromaticAtom = sum(aa_count) 23 | 
    HeavyAtom = Descriptors.HeavyAtomCount(m)
24 |     AR = AromaticAtom/HeavyAtom  # aromatic proportion = aromatic atom count / heavy atom count
25 |     return AR
26 | 
27 | def generate(smiles, verbose=False):
28 | 
29 |     moldata= []
30 |     for elem in smiles:
31 |         mol=Chem.MolFromSmiles(elem)
32 |         moldata.append(mol)
33 | 
34 |     baseData= np.arange(1,1)  # empty placeholder; replaced by the first descriptor row below
35 |     i=0
36 |     for mol in moldata:
37 | 
38 |         desc_MolLogP = Descriptors.MolLogP(mol)
39 |         desc_MolWt = Descriptors.MolWt(mol)
40 |         desc_NumRotatableBonds = Descriptors.NumRotatableBonds(mol)
41 |         desc_AromaticProportion = AromaticProportion(mol)
42 | 
43 |         row = np.array([desc_MolLogP,
44 |                         desc_MolWt,
45 |                         desc_NumRotatableBonds,
46 |                         desc_AromaticProportion])
47 | 
48 |         if(i==0):
49 |             baseData=row
50 |         else:
51 |             baseData=np.vstack([baseData, row])
52 |         i=i+1
53 | 
54 |     columnNames=["MolLogP","MolWt","NumRotatableBonds","AromaticProportion"]
55 |     descriptors = pd.DataFrame(data=baseData,columns=columnNames)
56 | 
57 |     return descriptors
58 | 
59 | ######################
60 | # Page Title
61 | ######################
62 | 
63 | image = Image.open('solubility-logo.jpg')
64 | 
65 | st.image(image, use_column_width=True)
66 | 
67 | st.write("""
68 | # Molecular Solubility Prediction Web App
69 | 
70 | This app predicts the **Solubility (LogS)** values of molecules!
71 | 
72 | Data obtained from John S. Delaney. [ESOL: Estimating Aqueous Solubility Directly from Molecular Structure](https://pubs.acs.org/doi/10.1021/ci034243x). ***J. Chem. Inf. Comput. Sci.*** 2004, 44, 3, 1000-1005.
73 | ***
74 | """)
75 | 
76 | 
77 | ######################
78 | # Input molecules (Side Panel)
79 | ######################
80 | 
81 | st.sidebar.header('User Input Features')
82 | 
83 | ## Read SMILES input
84 | SMILES_input = "NCCCC\nCCC\nCN"
85 | 
86 | SMILES = st.sidebar.text_area("SMILES input", SMILES_input)
87 | SMILES = "C\n" + SMILES #Adds C as a dummy, first item
88 | SMILES = SMILES.split('\n')
89 | 
90 | st.header('Input SMILES')
91 | SMILES[1:] # Skips the dummy first item
92 | 
93 | ## Calculate molecular descriptors
94 | st.header('Computed molecular descriptors')
95 | X = generate(SMILES)
96 | X[1:] # Skips the dummy first item
97 | 
98 | ######################
99 | # Pre-built model
100 | ######################
101 | 
102 | # Reads in saved model
103 | load_model = pickle.load(open('solubility_model.pkl', 'rb'))
104 | 
105 | # Apply model to make predictions
106 | prediction = load_model.predict(X)
107 | #prediction_proba = load_model.predict_proba(X)
108 | 
109 | st.header('Predicted LogS values')
110 | prediction[1:] # Skips the dummy first item
--------------------------------------------------------------------------------
/streamlit/part7/solubility-logo.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dataprofessor/code/d494f8093073990fcf77061bd740a0e4c2d40020/streamlit/part7/solubility-logo.jpg
--------------------------------------------------------------------------------
/streamlit/part7/solubility_model.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dataprofessor/code/d494f8093073990fcf77061bd740a0e4c2d40020/streamlit/part7/solubility_model.pkl
--------------------------------------------------------------------------------
/streamlit/part8/dna-app.py:
--------------------------------------------------------------------------------
1 | ######################
2 | # Import libraries
3 | ######################
4 | 
5 | import pandas as pd
6 | import streamlit as st
7 | import altair as alt
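# altair (imported above) is used in section 4 of this script to draw the nucleotide-count bar chart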
8 | from PIL import Image 9 | 10 | ###################### 11 | # Page Title 12 | ###################### 13 | 14 | image = Image.open('dna-logo.jpg') 15 | 16 | st.image(image, use_column_width=True) 17 | 18 | st.write(""" 19 | # DNA Nucleotide Count Web App 20 | 21 | This app counts the nucleotide composition of query DNA! 22 | 23 | *** 24 | """) 25 | 26 | 27 | ###################### 28 | # Input Text Box 29 | ###################### 30 | 31 | #st.sidebar.header('Enter DNA sequence') 32 | st.header('Enter DNA sequence') 33 | 34 | sequence_input = ">DNA Query 2\nGAACACGTGGAGGCAAACAGGAAGGTGAAGAAGAACTTATCCTATCAGGACGGAAGGTCCTGTGCTCGGG\nATCTTCCAGACGTCGCGACTCTAAATTGCCCCCTCTGAGGTCAAGGAACACAAGATGGTTTTGGAAATGC\nTGAACCCGATACATTATAACATCACCAGCATCGTGCCTGAAGCCATGCCTGCTGCCACCATGCCAGTCCT" 35 | 36 | #sequence = st.sidebar.text_area("Sequence input", sequence_input, height=250) 37 | sequence = st.text_area("Sequence input", sequence_input, height=250) 38 | sequence = sequence.splitlines() 39 | sequence = sequence[1:] # Skips the sequence name (first line) 40 | sequence = ''.join(sequence) # Concatenates list to string 41 | 42 | st.write(""" 43 | *** 44 | """) 45 | 46 | ## Prints the input DNA sequence 47 | st.header('INPUT (DNA Query)') 48 | sequence 49 | 50 | ## DNA nucleotide count 51 | st.header('OUTPUT (DNA Nucleotide Count)') 52 | 53 | ### 1. Print dictionary 54 | st.subheader('1. Print dictionary') 55 | def DNA_nucleotide_count(seq): 56 | d = dict([ 57 | ('A',seq.count('A')), 58 | ('T',seq.count('T')), 59 | ('G',seq.count('G')), 60 | ('C',seq.count('C')) 61 | ]) 62 | return d 63 | 64 | X = DNA_nucleotide_count(sequence) 65 | 66 | #X_label = list(X) 67 | #X_values = list(X.values()) 68 | 69 | X 70 | 71 | ### 2. Print text 72 | st.subheader('2. Print text') 73 | st.write('There are ' + str(X['A']) + ' adenine (A)') 74 | st.write('There are ' + str(X['T']) + ' thymine (T)') 75 | st.write('There are ' + str(X['G']) + ' guanine (G)') 76 | st.write('There are ' + str(X['C']) + ' cytosine (C)') 77 | 78 | ### 3. Display DataFrame 79 | st.subheader('3. Display DataFrame') 80 | df = pd.DataFrame.from_dict(X, orient='index') 81 | df = df.rename({0: 'count'}, axis='columns') 82 | df.reset_index(inplace=True) 83 | df = df.rename(columns = {'index':'nucleotide'}) 84 | st.write(df) 85 | 86 | ### 4. Display Bar Chart using Altair 87 | st.subheader('4. Display Bar chart') 88 | p = alt.Chart(df).mark_bar().encode( 89 | x='nucleotide', 90 | y='count' 91 | ) 92 | p = p.properties( 93 | width=alt.Step(80) # controls width of bar. 94 | ) 95 | st.write(p) 96 | -------------------------------------------------------------------------------- /streamlit/part8/dna-logo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataprofessor/code/d494f8093073990fcf77061bd740a0e4c2d40020/streamlit/part8/dna-logo.jpg -------------------------------------------------------------------------------- /streamlit/part9/football_app.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import pandas as pd 3 | import base64 4 | import matplotlib.pyplot as plt 5 | import seaborn as sns 6 | import numpy as np 7 | 8 | st.title('NFL Football Stats (Rushing) Explorer') 9 | 10 | st.markdown(""" 11 | This app performs simple webscraping of NFL Football player stats data (focusing on Rushing)! 
12 | * **Python libraries:** base64, pandas, streamlit, numpy, matplotlib, seaborn
13 | * **Data source:** [pro-football-reference.com](https://www.pro-football-reference.com/).
14 | """)
15 | 
16 | st.sidebar.header('User Input Features')
17 | selected_year = st.sidebar.selectbox('Year', list(reversed(range(1990,2020))))
18 | 
19 | # Web scraping of NFL player stats
20 | # https://www.pro-football-reference.com/years/2019/rushing.htm
21 | @st.cache
22 | def load_data(year):
23 |     url = "https://www.pro-football-reference.com/years/" + str(year) + "/rushing.htm"
24 |     html = pd.read_html(url, header = 1)
25 |     df = html[0]
26 |     raw = df.drop(df[df.Age == 'Age'].index) # Deletes repeating headers in content
27 |     raw = raw.fillna(0)
28 |     playerstats = raw.drop(['Rk'], axis=1)
29 |     return playerstats
30 | playerstats = load_data(selected_year)
31 | 
32 | # Sidebar - Team selection
33 | sorted_unique_team = sorted(playerstats.Tm.unique())
34 | selected_team = st.sidebar.multiselect('Team', sorted_unique_team, sorted_unique_team)
35 | 
36 | # Sidebar - Position selection
37 | unique_pos = ['RB','QB','WR','FB','TE']
38 | selected_pos = st.sidebar.multiselect('Position', unique_pos, unique_pos)
39 | 
40 | # Filtering data
41 | df_selected_team = playerstats[(playerstats.Tm.isin(selected_team)) & (playerstats.Pos.isin(selected_pos))]
42 | 
43 | st.header('Display Player Stats of Selected Team(s)')
44 | st.write('Data Dimension: ' + str(df_selected_team.shape[0]) + ' rows and ' + str(df_selected_team.shape[1]) + ' columns.')
45 | st.dataframe(df_selected_team)
46 | 
47 | # Download NFL player stats data
48 | # https://discuss.streamlit.io/t/how-to-download-file-in-streamlit/1806
49 | def filedownload(df):
50 |     csv = df.to_csv(index=False)
51 |     b64 = base64.b64encode(csv.encode()).decode()  # strings <-> bytes conversions
52 |     href = f'<a href="data:file/csv;base64,{b64}" download="playerstats.csv">Download CSV File</a>'  # download file name is arbitrary
53 |     return href
54 | 
55 | st.markdown(filedownload(df_selected_team), unsafe_allow_html=True)
56 | 
57 | # Heatmap
58 | if st.button('Intercorrelation Heatmap'):
59 |     st.header('Intercorrelation Matrix Heatmap')
60 |     df_selected_team.to_csv('output.csv',index=False)
61 |     df = pd.read_csv('output.csv')
62 | 
63 |     corr = df.corr()
64 |     mask = np.zeros_like(corr)
65 |     mask[np.triu_indices_from(mask)] = True
66 |     with sns.axes_style("white"):
67 |         f, ax = plt.subplots(figsize=(7, 5))
68 |         ax = sns.heatmap(corr, mask=mask, vmax=1, square=True)
69 |     st.pyplot()
70 | 
--------------------------------------------------------------------------------
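Quick start — a minimal sketch for trying the apps above, assuming Python with each app's listed libraries and R with the *shiny* package installed (the pip line below is illustrative, not a full dependency list):

    pip install streamlit yfinance
    streamlit run streamlit/part1/myapp.py    # any Streamlit app above runs the same way
    R -e 'shiny::runApp("shiny/005-bmi")'     # any Shiny app folder above runs the same way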