├── README.md ├── dhfr ├── dhfr-classification-deploy.R ├── dhfr-classification.R ├── dhfr-data-understanding.R ├── dhfr-handling-missing-data.R └── dhfr-parallel-speed-up.R ├── iris ├── iris-classification.R └── iris-data-understanding.R ├── linear-regression └── boston-housing-linear-regression.R ├── plot └── scatter-plot │ ├── aromatase.csv │ └── code-scatter-plot.R ├── python-in-r └── using-reticulate.R ├── python ├── CDD_ML_Part_1_Acetylcholinesterase_Bioactivity_Data_Concised.ipynb ├── CDD_ML_Part_1_Bioactivity_Data_Concised.ipynb ├── CDD_ML_Part_1_bioactivity_data.ipynb ├── CDD_ML_Part_2_Acetylcholinesterase_Exploratory_Data_Analysis.ipynb ├── CDD_ML_Part_2_Exploratory_Data_Analysis.ipynb ├── CDD_ML_Part_3_Acetylcholinesterase_Descriptor_Dataset_Preparation.ipynb ├── CDD_ML_Part_4_Acetylcholinesterase_Regression_Random_Forest.ipynb ├── CDD_ML_Part_5_Acetylcholinesterase_Compare_Regressors.ipynb ├── Colab_File_handling_on_Google_Colab.ipynb ├── How_to_build_a_simple_linear_regression_model_in_python.ipynb ├── Hummingbird_ML.ipynb ├── PCA_analysis.ipynb ├── ROC_curve.ipynb ├── ROC_curve_kNN.ipynb ├── Sweetviz.ipynb ├── cheminformatics_predicting_solubility.ipynb ├── cheminformatics_predicting_solubility_2_1_PyCaret.ipynb ├── cheminformatics_predicting_solubility_2_2_PyCaret.ipynb ├── comparing-classifiers.ipynb ├── google_colab_install_conda.ipynb ├── google_colab_r_magic_command.ipynb ├── google_colab_r_notebook.ipynb ├── hyperparameter_tuning.ipynb ├── iris │ └── iris-classification-random-forest.ipynb ├── klib.ipynb ├── linear_regression.ipynb ├── model_is_training_progress_bar.ipynb ├── pandas-create-and-combine-dataframes.ipynb ├── pandas_exploratory_data_analysis.ipynb ├── pandas_profiling_example.ipynb ├── pandas_read_html_for_webscraping.ipynb ├── pandas_select_columns.ipynb ├── pandas_styling_dataframe.ipynb └── r_magic_command.ipynb ├── shiny ├── 001-first-app │ └── app.R ├── 002-histogram │ └── app.R ├── 003-play-golf │ └── app.R ├── 004-iris-predictor │ ├── app-numeric.R │ ├── app-slider.R │ ├── model.R │ ├── model.rds │ ├── testing.csv │ └── training.csv └── 005-bmi │ ├── about.md │ └── app.R └── streamlit ├── part1 ├── myapp.py └── myapp2.py ├── part10 └── sp500-app.py ├── part12 ├── crypto-price-app.py └── logo.jpg ├── part2 └── iris-ml-app.py ├── part3 ├── penguins-app.py ├── penguins-model-building.py ├── penguins_cleaned.csv ├── penguins_clf.pkl └── penguins_example.csv ├── part5 └── basketball_app.py ├── part6 └── boston-house-ml-app.py ├── part7 ├── solubility-app.py ├── solubility-logo.jpg ├── solubility-web-app.ipynb └── solubility_model.pkl ├── part8 ├── dna-app.py └── dna-logo.jpg └── part9 └── football_app.py
/README.md: --------------------------------------------------------------------------------
1 | # code
2 | This is a compilation of the R and Python code used in the **Data Professor** YouTube channel tutorial videos.
3 | 
4 | Folder | Description
5 | ---|---
6 | [iris](https://github.com/dataprofessor/code/tree/master/iris) | Codes for performing *exploratory data analysis* (so as to gain *data understanding*) and for building *classification models* of the Iris dataset.
7 | [dhfr](https://github.com/dataprofessor/code/tree/master/dhfr) | Codes for performing *exploratory data analysis* (so as to gain *data understanding*) and for building *classification models* of the DHFR (dihydrofolate reductase) dataset.
8 | [python](https://github.com/dataprofessor/code/tree/master/python) | Codes for various Python data science project tutorials. 
9 | [shiny](https://github.com/dataprofessor/code/tree/master/shiny) | Codes for building *web applications* in R with *shiny* package. 10 | 11 | > Note: More to come. Please stay tuned! 12 | -------------------------------------------------------------------------------- /dhfr/dhfr-classification-deploy.R: -------------------------------------------------------------------------------- 1 | #################################### 2 | # Data Professor # 3 | # http://youtube.com/dataprofessor # 4 | # http://github.com/dataprofessor # 5 | #################################### 6 | 7 | # Importing libraries 8 | library(datasets) # Contains several data sets 9 | library(caret) # Package for machine learning algorithms / CARET stands for Classification And REgression Training 10 | 11 | # Importing the dhfr data set 12 | data(dhfr) 13 | 14 | # Check to see if there are missing data? 15 | sum(is.na(dhfr)) 16 | 17 | # To achieve reproducible model; set the random seed number 18 | set.seed(100) 19 | 20 | # Performs stratified random split of the data set 21 | TrainingIndex <- createDataPartition(dhfr$Y, p=0.8, list = FALSE) 22 | TrainingSet <- dhfr[TrainingIndex,] # Training Set 23 | TestingSet <- dhfr[-TrainingIndex,] # Test Set 24 | 25 | 26 | 27 | ############################### 28 | # SVM model (polynomial kernel) 29 | 30 | # Build Training model 31 | Model <- train(Y ~ ., data = TrainingSet, 32 | method = "svmPoly", 33 | na.action = na.omit, 34 | preProcess=c("scale","center"), 35 | trControl= trainControl(method="none"), 36 | tuneGrid = data.frame(degree=1,scale=1,C=1) 37 | ) 38 | 39 | 40 | # Save model to RDS file 41 | 42 | saveRDS(Model, "Model.rds") 43 | 44 | # Read the model from RDS file 45 | 46 | read.Model <- readRDS("Model.rds") 47 | 48 | 49 | # Apply model for prediction 50 | Model.training <-predict(read.Model, TrainingSet) # Apply model to make prediction on Training set 51 | Model.testing <-predict(read.Model, TestingSet) # Apply model to make prediction on Testing set 52 | 53 | # Model performance (Displays confusion matrix and statistics) 54 | Model.training.confusion <-confusionMatrix(Model.training, TrainingSet$Y) 55 | Model.testing.confusion <-confusionMatrix(Model.testing, TestingSet$Y) 56 | 57 | print(Model.training.confusion) 58 | print(Model.testing.confusion) 59 | 60 | # Feature importance 61 | Importance <- varImp(Model) 62 | plot(Importance, top = 25) 63 | plot(Importance, col = "red") 64 | -------------------------------------------------------------------------------- /dhfr/dhfr-classification.R: -------------------------------------------------------------------------------- 1 | #################################### 2 | # Data Professor # 3 | # http://youtube.com/dataprofessor # 4 | # http://github.com/dataprofessor # 5 | #################################### 6 | 7 | # Importing libraries 8 | library(datasets) # Contains several data sets 9 | library(caret) # Package for machine learning algorithms / CARET stands for Classification And REgression Training 10 | 11 | # Importing the dhfr data set 12 | data(dhfr) 13 | 14 | # Check to see if there are missing data? 
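# (sum() over is.na() counts every missing cell in the data frame; a result of 0 means the set is complete and nothing needs to be dropped or imputed.)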
15 | sum(is.na(dhfr)) 16 | 17 | # To achieve reproducible model; set the random seed number 18 | set.seed(100) 19 | 20 | # Performs stratified random split of the data set 21 | TrainingIndex <- createDataPartition(dhfr$Y, p=0.8, list = FALSE) 22 | TrainingSet <- dhfr[TrainingIndex,] # Training Set 23 | TestingSet <- dhfr[-TrainingIndex,] # Test Set 24 | 25 | 26 | 27 | ############################### 28 | # SVM model (polynomial kernel) 29 | 30 | # Build Training model 31 | Model <- train(Y ~ ., data = TrainingSet, 32 | method = "svmPoly", 33 | na.action = na.omit, 34 | preProcess=c("scale","center"), 35 | trControl= trainControl(method="none"), 36 | tuneGrid = data.frame(degree=1,scale=1,C=1) 37 | ) 38 | 39 | # Build CV model 40 | Model.cv <- train(Y ~ ., data = TrainingSet, 41 | method = "svmPoly", 42 | na.action = na.omit, 43 | preProcess=c("scale","center"), 44 | trControl= trainControl(method="cv", number=10), 45 | tuneGrid = data.frame(degree=1,scale=1,C=1) 46 | ) 47 | 48 | 49 | # Apply model for prediction 50 | Model.training <-predict(Model, TrainingSet) # Apply model to make prediction on Training set 51 | Model.testing <-predict(Model, TestingSet) # Apply model to make prediction on Testing set 52 | Model.cv <-predict(Model.cv, TrainingSet) # Perform cross-validation 53 | 54 | # Model performance (Displays confusion matrix and statistics) 55 | Model.training.confusion <-confusionMatrix(Model.training, TrainingSet$Y) 56 | Model.testing.confusion <-confusionMatrix(Model.testing, TestingSet$Y) 57 | Model.cv.confusion <-confusionMatrix(Model.cv, TrainingSet$Y) 58 | 59 | print(Model.training.confusion) 60 | print(Model.testing.confusion) 61 | print(Model.cv.confusion) 62 | 63 | # Feature importance 64 | Importance <- varImp(Model) 65 | plot(Importance, top = 25) 66 | plot(Importance, col = "red") 67 | -------------------------------------------------------------------------------- /dhfr/dhfr-data-understanding.R: -------------------------------------------------------------------------------- 1 | #################################### 2 | # Data Professor # 3 | # http://youtube.com/dataprofessor # 4 | # http://github.com/dataprofessor # 5 | #################################### 6 | 7 | ######################### 8 | # Loading DHFR data set 9 | ######################### 10 | 11 | # Method 1 12 | 13 | library(datasets) 14 | data(dhfr) 15 | 16 | # Method 2 17 | #dhfr2 <- datasets::dhfr 18 | 19 | # Method 3 20 | # install.packages("RCurl") 21 | 22 | #library(RCurl) 23 | dhfr <- read.csv(text = getURL("https://github.com/dataprofessor/data/raw/master/dhfr.csv") ) 24 | 25 | # View the data 26 | View(dhfr) 27 | 28 | ############################# 29 | # Display summary statistics 30 | ############################# 31 | 32 | # head() / tail() 33 | head(dhfr, 5) 34 | tail(dhfr, 5) 35 | 36 | 37 | # summary() 38 | summary(dhfr) 39 | summary(dhfr$Y) 40 | 41 | 42 | # Check to see if there are missing data? 
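# (Two dependency notes for this script: getURL() in Method 3 above comes from the RCurl package, so library(RCurl) must be loaded for that line to run, and featurePlot() at the end comes from the caret package, so run library(caret) before it.)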
43 | sum(is.na(dhfr)) 44 | 45 | 46 | # skimr() - expands on summary() by providing larger set of statistics 47 | # install.packages("skimr") 48 | # https://github.com/ropensci/skimr 49 | 50 | library(skimr) 51 | 52 | skim(dhfr) # Perform skim to display summary statistics 53 | 54 | # Group data by Y (biological activity) then perform skim 55 | dhfr %>% 56 | dplyr::group_by(Y) %>% 57 | skim() 58 | 59 | ############################# 60 | # Quick data visualization 61 | # 62 | # R base plot() 63 | ############################# 64 | 65 | 66 | # Panel plots 67 | #plot(dhfr) 68 | #plot(iris, col = "red") 69 | 70 | # Scatter plot 71 | plot(dhfr$moe2D_zagreb, dhfr$moe2D_weinerPol) 72 | 73 | plot(dhfr$moe2D_zagreb, dhfr$moe2D_weinerPol, col = "red") # Makes red circles 74 | 75 | plot(dhfr$moe2D_zagreb, dhfr$moe2D_weinerPol, col = dhfr$Y) # Color by Y 76 | 77 | plot(dhfr$moe2D_zagreb, dhfr$moe2D_weinerPol, col = "red", # Makes red circles + Adds x and y axis labels 78 | xlab = "moe2D_zagreb", ylab = "moe2D_weinerPol") 79 | 80 | # Histogram 81 | hist(dhfr$moe2D_zagreb) 82 | hist(dhfr$moe2D_zagreb, col = "red") # Makes red bars 83 | 84 | # Feature plots 85 | # https://www.machinelearningplus.com/machine-learning/caret-package/ 86 | featurePlot(x = dhfr[,2:21], 87 | y = dhfr$Y, 88 | plot = "box", 89 | strip=strip.custom(par.strip.text=list(cex=.7)), 90 | scales = list(x = list(relation="free"), 91 | y = list(relation="free"))) 92 | -------------------------------------------------------------------------------- /dhfr/dhfr-handling-missing-data.R: -------------------------------------------------------------------------------- 1 | #################################### 2 | # Data Professor # 3 | # http://youtube.com/dataprofessor # 4 | # http://github.com/dataprofessor # 5 | #################################### 6 | 7 | 8 | # 1. Loading the DHFR data 9 | library(RCurl) 10 | dhfr <- read.csv(text = getURL("https://raw.githubusercontent.com/dataprofessor/data/master/dhfr.csv") ) 11 | 12 | View(dhfr) 13 | 14 | 15 | # 2. Check for missing data 16 | 17 | sum(is.na(dhfr)) 18 | 19 | 20 | # 3. If data is clean, randomly introduce NA to the dataset 21 | 22 | na.gen <- function(data,n) { 23 | i <- 1 24 | while (i < n+1) { 25 | idx1 <- sample(1:nrow(data), 1) 26 | idx2 <- sample(1:ncol(data), 1) 27 | data[idx1,idx2] <- NA 28 | i = i+1 29 | } 30 | return(data) 31 | } 32 | 33 | 34 | # Before introducing NA to the dataset, leave the Y class label (output variable) out 35 | 36 | dhfr <- dhfr[,-1] 37 | 38 | 39 | # Choose 1 of the following to run (they'll produce the same result) 40 | 41 | dhfr <- na.gen(dhfr,100) 42 | 43 | dhfr <- na.gen(n=100,data=dhfr) 44 | 45 | dhfr <- na.gen(100,dhfr) # This produces an error, why? 46 | 47 | 48 | # 4. Check again for missing data 49 | 50 | sum(is.na(dhfr)) 51 | 52 | colSums(is.na(dhfr)) 53 | 54 | str(dhfr) 55 | 56 | 57 | # Lists rows with missing data 58 | 59 | missingdata <- dhfr[!complete.cases(dhfr), ] 60 | 61 | sum(is.na(missingdata)) 62 | 63 | 64 | # If above sum is 0, this means that there is no missing data and proceed to modeling. 65 | # If above sum is greater than 0, then proceed to # 5 66 | 67 | 68 | # 5. Handling the missing data. There are 2 options, decide and choose only 1 69 | 70 | # 5.1. Simply delete all entries with missing data 71 | 72 | clean.data <- na.omit(dhfr) 73 | 74 | sum(is.na(clean.data)) 75 | 76 | 77 | # 5.2. 
Imputation: Replace missing values with the column's 78 | 79 | # MEAN 80 | dhfr.impute <- dhfr 81 | 82 | for (i in which(sapply(dhfr.impute, is.numeric))) { 83 | dhfr.impute[is.na(dhfr.impute[, i]), i] <- mean(dhfr.impute[, i], na.rm = TRUE) 84 | } 85 | 86 | sum(is.na(dhfr.impute)) 87 | 88 | 89 | # MEDIAN 90 | dhfr.impute <- dhfr 91 | 92 | for (i in which(sapply(dhfr.impute, is.numeric))) { 93 | dhfr.impute[is.na(dhfr.impute[, i]), i] <- median(dhfr.impute[, i], na.rm = TRUE) 94 | } 95 | 96 | sum(is.na(dhfr.impute)) 97 | -------------------------------------------------------------------------------- /dhfr/dhfr-parallel-speed-up.R: -------------------------------------------------------------------------------- 1 | #################################### 2 | # Data Professor # 3 | # http://youtube.com/dataprofessor # 4 | # http://github.com/dataprofessor # 5 | #################################### 6 | 7 | # Importing libraries 8 | library(datasets) # Contains several data sets 9 | library(caret) # Package for machine learning algorithms / CARET stands for Classification And REgression Training 10 | 11 | # Importing the dhfr data set 12 | data(dhfr) 13 | 14 | # Check to see if there are missing data? 15 | sum(is.na(dhfr)) 16 | 17 | # To achieve reproducible model; set the random seed number 18 | set.seed(100) 19 | 20 | # Performs stratified random split of the data set 21 | TrainingIndex <- createDataPartition(dhfr$Y, p=0.8, list = FALSE) 22 | TrainingSet <- dhfr[TrainingIndex,] # Training Set 23 | TestingSet <- dhfr[-TrainingIndex,] # Test Set 24 | 25 | 26 | 27 | ############################### 28 | # Random forest 29 | 30 | 31 | # Run normally without parallel processing 32 | start.time <- proc.time() 33 | Model <- train(Y ~ ., 34 | data = TrainingSet, # Build model using training set 35 | method = "rf" # Learning algorithm 36 | ) 37 | stop.time <- proc.time() 38 | run.time <- stop.time - start.time 39 | print(run.time) 40 | 41 | 42 | 43 | # Use doParallel 44 | # https://topepo.github.io/caret/parallel-processing.html 45 | 46 | library(doParallel) 47 | 48 | cl <- makePSOCKcluster(5) 49 | registerDoParallel(cl) 50 | 51 | start.time <- proc.time() 52 | Model <- train(Y ~ ., 53 | data = TrainingSet, # Build model using training set 54 | method = "rf" # Learning algorithm 55 | ) 56 | stop.time <- proc.time() 57 | run.time <- stop.time - start.time 58 | print(run.time) 59 | 60 | stopCluster(cl) 61 | 62 | 63 | 64 | 65 | ########################## 66 | 67 | # Run without parallel processing 68 | 69 | start.time <- proc.time() 70 | Model <- train(Y ~ ., 71 | data = TrainingSet, # Build model using training set 72 | method = "rf", # Learning algorithm 73 | tuneGrid = data.frame(mtry = seq(5,15, by=5)) 74 | ) 75 | stop.time <- proc.time() 76 | run.time <- stop.time - start.time 77 | print(run.time) 78 | 79 | # Using doParallel 80 | 81 | library(doParallel) 82 | 83 | cl <- makePSOCKcluster(5) 84 | registerDoParallel(cl) 85 | 86 | start.time <- proc.time() 87 | Model <- train(Y ~ ., 88 | data = TrainingSet, # Build model using training set 89 | method = "rf", # Learning algorithm 90 | tuneGrid = data.frame(mtry = seq(5,15, by=5)) 91 | ) 92 | stop.time <- proc.time() 93 | run.time <- stop.time - start.time 94 | print(run.time) 95 | 96 | stopCluster(cl) 97 | 98 | 99 | ########################## 100 | # Apply model for prediction 101 | Model.training <-predict(Model, TrainingSet) # Apply model to make prediction on Training set 102 | 103 | # Model performance (Displays confusion matrix and statistics) 104 | 
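# confusionMatrix() cross-tabulates predicted against observed classes; besides the printed table, the object created below also stores the statistics numerically, e.g. Model.training.confusion$overall["Accuracy"].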
Model.training.confusion <-confusionMatrix(Model.training, TrainingSet$Y) 105 | 106 | print(Model.training.confusion) 107 | 108 | # Feature importance 109 | Importance <- varImp(Model) 110 | plot(Importance, top = 25) 111 | plot(Importance, col = "red") 112 | -------------------------------------------------------------------------------- /iris/iris-classification.R: -------------------------------------------------------------------------------- 1 | #################################### 2 | # Data Professor # 3 | # http://youtube.com/dataprofessor # 4 | # http://github.com/dataprofessor # 5 | #################################### 6 | 7 | # Importing libraries 8 | library(datasets) # Contains the Iris data set 9 | library(caret) # Package for machine learning algorithms / CARET stands for Classification And REgression Training 10 | 11 | # Importing the Iris data set 12 | data(iris) 13 | 14 | # Check to see if there are missing data? 15 | sum(is.na(iris)) 16 | 17 | # To achieve reproducible model; set the random seed number 18 | set.seed(100) 19 | 20 | # Performs stratified random split of the data set 21 | TrainingIndex <- createDataPartition(iris$Species, p=0.8, list = FALSE) 22 | TrainingSet <- iris[TrainingIndex,] # Training Set 23 | TestingSet <- iris[-TrainingIndex,] # Test Set 24 | 25 | # Compare scatter plot of the 80 and 20 data subsets 26 | 27 | 28 | 29 | 30 | ############################### 31 | # SVM model (polynomial kernel) 32 | 33 | # Build Training model 34 | Model <- train(Species ~ ., data = TrainingSet, 35 | method = "svmPoly", 36 | na.action = na.omit, 37 | preProcess=c("scale","center"), 38 | trControl= trainControl(method="none"), 39 | tuneGrid = data.frame(degree=1,scale=1,C=1) 40 | ) 41 | 42 | # Build CV model 43 | Model.cv <- train(Species ~ ., data = TrainingSet, 44 | method = "svmPoly", 45 | na.action = na.omit, 46 | preProcess=c("scale","center"), 47 | trControl= trainControl(method="cv", number=10), 48 | tuneGrid = data.frame(degree=1,scale=1,C=1) 49 | ) 50 | 51 | 52 | # Apply model for prediction 53 | Model.training <-predict(Model, TrainingSet) # Apply model to make prediction on Training set 54 | Model.testing <-predict(Model, TestingSet) # Apply model to make prediction on Testing set 55 | Model.cv <-predict(Model.cv, TrainingSet) # Perform cross-validation 56 | 57 | # Model performance (Displays confusion matrix and statistics) 58 | Model.training.confusion <-confusionMatrix(Model.training, TrainingSet$Species) 59 | Model.testing.confusion <-confusionMatrix(Model.testing, TestingSet$Species) 60 | Model.cv.confusion <-confusionMatrix(Model.cv, TrainingSet$Species) 61 | 62 | print(Model.training.confusion) 63 | print(Model.testing.confusion) 64 | print(Model.cv.confusion) 65 | 66 | # Feature importance 67 | Importance <- varImp(Model) 68 | plot(Importance) 69 | plot(Importance, col = "red") 70 | -------------------------------------------------------------------------------- /iris/iris-data-understanding.R: -------------------------------------------------------------------------------- 1 | #################################### 2 | # Data Professor # 3 | # http://youtube.com/dataprofessor # 4 | # http://github.com/dataprofessor # 5 | #################################### 6 | 7 | ######################### 8 | # Loading Iris data set 9 | ######################### 10 | 11 | # Method 1 12 | 13 | library(datasets) 14 | data(iris) 15 | 16 | iris2 <- datasets::iris 17 | 18 | # Method 2 19 | # install.packages("RCurl") 20 | 21 | library(RCurl) 22 | iris3 <- read.csv(text 
= getURL("https://raw.githubusercontent.com/dataprofessor/data/master/iris.csv") ) 23 | 24 | # View the data 25 | View(iris) 26 | 27 | ############################# 28 | # Display summary statistics 29 | ############################# 30 | 31 | # head() / tail() 32 | head(iris, 5) 33 | tail(iris, 5) 34 | 35 | 36 | # summary() 37 | summary(iris) 38 | summary(iris$Sepal.Length) 39 | 40 | 41 | # Check to see if there are missing data? 42 | sum(is.na(iris)) 43 | 44 | 45 | # skimr() - expands on summary() by providing larger set of statistics 46 | # install.packages("skimr") 47 | # https://github.com/ropensci/skimr 48 | 49 | library(skimr) 50 | 51 | skim(iris) # Perform skim to display summary statistics 52 | 53 | # Group data by Species then perform skim 54 | iris %>% 55 | dplyr::group_by(Species) %>% 56 | skim() 57 | 58 | ############################# 59 | # Quick data visualization 60 | # 61 | # R base plot() 62 | ############################# 63 | 64 | 65 | # Panel plots 66 | plot(iris) 67 | plot(iris, col = "red") 68 | 69 | # Scatter plot 70 | plot(iris$Sepal.Width, iris$Sepal.Length) 71 | 72 | plot(iris$Sepal.Width, iris$Sepal.Length, col = "red") # Makes red circles 73 | 74 | plot(iris$Sepal.Width, iris$Sepal.Length, col = "red", # Makes red circles + Adds x and y axis labels 75 | xlab = "Sepal width", ylab = "Sepal length") 76 | 77 | # Histogram 78 | hist(iris$Sepal.Width) 79 | hist(iris$Sepal.Width, col = "red") # Makes red bars 80 | 81 | # Feature plots 82 | # https://www.machinelearningplus.com/machine-learning/caret-package/ 83 | featurePlot(x = iris[,1:4], 84 | y = iris$Species, 85 | plot = "box", 86 | strip=strip.custom(par.strip.text=list(cex=.7)), 87 | scales = list(x = list(relation="free"), 88 | y = list(relation="free"))) 89 | 90 | -------------------------------------------------------------------------------- /linear-regression/boston-housing-linear-regression.R: -------------------------------------------------------------------------------- 1 | ############################################ 2 | # Data Professor # 3 | # http://youtube.com/dataprofessor # 4 | # http://github.com/dataprofessor # 5 | # http://facebook.com/dataprofessor # 6 | # https://www.instagram.com/data.professor # 7 | ############################################ 8 | 9 | # Importing libraries 10 | library(mlbench) # Contains several benchmark data sets (especially the Boston Housing dataset) 11 | library(caret) # Package for machine learning algorithms / CARET stands for Classification And REgression Training 12 | 13 | # Importing the Boston Housing data set 14 | data(BostonHousing) 15 | 16 | head(BostonHousing) 17 | 18 | # Check to see if there are missing data? 
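# (BostonHousing from mlbench has 506 rows and 14 columns — 13 predictors plus the numeric response medv, the median home value in $1000s — and contains no missing values, so the check below should return 0.)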
19 | sum(is.na(BostonHousing)) 20 | 21 | # To achieve reproducible model; set the random seed number 22 | set.seed(100) 23 | 24 | # Performs stratified random split of the data set 25 | TrainingIndex <- createDataPartition(BostonHousing$medv, p=0.8, list = FALSE) 26 | TrainingSet <- BostonHousing[TrainingIndex,] # Training Set 27 | TestingSet <- BostonHousing[-TrainingIndex,] # Test Set 28 | 29 | 30 | ############################### 31 | 32 | # Build Training model 33 | Model <- train(medv ~ ., data = TrainingSet, 34 | method = "lm", 35 | na.action = na.omit, 36 | preProcess=c("scale","center"), 37 | trControl= trainControl(method="none") 38 | ) 39 | 40 | # Apply model for prediction 41 | Model.training <-predict(Model, TrainingSet) # Apply model to make prediction on Training set 42 | Model.testing <-predict(Model, TestingSet) # Apply model to make prediction on Testing set 43 | 44 | # Model performance (Displays scatter plot and performance metrics) 45 | # Scatter plot of Training set 46 | plot(TrainingSet$medv,Model.training, col = "blue" ) 47 | plot(TestingSet$medv,Model.testing, col = "blue" ) 48 | -------------------------------------------------------------------------------- /plot/scatter-plot/code-scatter-plot.R: -------------------------------------------------------------------------------- 1 | ############################################ 2 | # Data Professor # 3 | # http://youtube.com/dataprofessor # 4 | # http://github.com/dataprofessor # 5 | # http://facebook.com/dataprofessor # 6 | # https://www.instagram.com/data.professor # 7 | ############################################ 8 | 9 | ######## READ DATA 10 | # https://link.springer.com/article/10.1007%2Fs11030-013-9462-x 11 | # 11030_2013_9462_MOESM2_ESM.xls (423 kb) 12 | # Supplementary material 2 (xls 423 KB) 13 | aromatase <- read.csv("aromatase.csv") 14 | 15 | ######## MISSING DATA 16 | sum(is.na(aromatase)) 17 | missingdata <- aromatase[!complete.cases(aromatase), ] # Identify which row contains missing data 18 | 19 | aromatase <- na.omit(aromatase) # Remove any missing data >> Complete case 20 | sum(is.na(aromatase)) # Check again for missing data 21 | 22 | class <- aromatase[ ,2] # Class label 23 | aromatase2 <- aromatase[,6:18] # Descriptors 24 | aromatase3 <- cbind(class, aromatase2) # Combine Class label + Descriptors into same dataframe 25 | 26 | df <- aromatase3 # Once we are satisfied with the dataset, let's call it "df" for conciseness 27 | 28 | 29 | ######## plot() 30 | 31 | # See at a glance all possible scatter plots 32 | plot(df) 33 | plot(df , col = "blue") 34 | 35 | # Select a pair of interest to visualize scatter plot 36 | 37 | # Figure 1, https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0066566 38 | 39 | 40 | plot(df$MW, df$ALogP) 41 | 42 | 43 | # We're going to make Steroids blue and Non-Steroids red 44 | library(colorspace) 45 | df$color <- factor(df$class, 46 | levels=c("Steroid", "Non-Steroid"), 47 | labels=c("blue", "red")) 48 | plot(df$MW, df$ALogP, pch = 16, col=as.character(df$color) ) 49 | 50 | 51 | 52 | 53 | # col argument for defining the color 54 | # R has 657 colors, colors() function lists these colors 55 | plot(df$MW, df$ALogP, col = "red") 56 | plot(df$MW, df$ALogP, col = "blue") 57 | plot(df$MW, df$ALogP, col = "green") 58 | plot(df$MW, df$ALogP, col = "purple") 59 | 60 | plot(df$MW, df$ALogP, col = "orangered3") 61 | 62 | plot(df$MW, df$ALogP, col = "#FF0000") # Hex color code for red 63 | 64 | # Color in RGB color code 65 | rgb(1,0,0) # red color 66 | rgb(255,0,0, 
max=255) # red color
67 | 
68 | plot(df$MW, df$ALogP, col = rgb(0,0,0, max=255) ) # rgb(0,0,0) is black
69 | 
70 | 
71 | 
72 | # symbols
73 | 
74 | plot(df$MW, df$ALogP, pch = 1) # pch = 1, open circles (the default value)
75 | # There are a total of 25 symbols to choose from
76 | plot(df$MW, df$ALogP, pch = 2) # pch = 2, open triangle symbols
77 | plot(df$MW, df$ALogP, pch = 3) # pch = 3, plus symbols
78 | plot(df$MW, df$ALogP, pch = 4) # pch = 4, x symbols
79 | plot(df$MW, df$ALogP, pch = 5) # pch = 5, diamond symbols
80 | plot(df$MW, df$ALogP, pch = 16) # pch = 16, filled circle symbols
81 | 
82 | 
83 | plot(df$MW, df$ALogP, pch = 16, col = "orangered3")
84 | col2rgb("orangered3") # This gives us rgb(205,55,0, max=255)
85 | plot(df$MW, df$ALogP, pch = 16, col = rgb(205,55,0, max=255))
86 | 
87 | 
88 | # Add transparency to color
89 | 
90 | library(scales)
91 | 
92 | plot(df$MW, df$ALogP, pch = 16, 
93 | col = alpha("orangered3", 0.3))
94 | 
95 | plot(df$MW, df$ALogP, pch = 16, 
96 | col = rgb(205,55,0, 75, max=255))
97 | 
98 | plot(df$MW, df$ALogP, pch = 16, col=alpha(as.character(df$color),0.3 ) )
99 | 
100 | 
101 | ##################################
102 | # Multi-plot
103 | 
104 | # Scatter plot of first pair
105 | plot(df$MW, df$ALogP, pch = 16, 
106 | col = alpha("red", 0.3), 
107 | xlab = "Molecular Weight (MW)", # X-axis label
108 | ylab = "Solubility (ALogP)", # Y-axis label
109 | font.lab = 2 # X and Y labels are now bold
110 | )
111 | abline(lm(df$ALogP ~ df$MW)) # Trend line
112 | 
113 | 
114 | # Scatter plot of second pair
115 | plot(df$MW, df$Qm, pch = 16, 
116 | col = alpha("blue", 0.3), 
117 | xlab = "MW", # X-axis label
118 | ylab = "Qm", # Y-axis label
119 | font.lab = 2 # X and Y labels are now bold
120 | )
121 | abline(lm(df$Qm ~ df$MW)) # Trend line
122 | 
123 | 
124 | # Scatter plot of third pair
125 | plot(df$HOMO, df$LUMO, pch = 16, 
126 | col = alpha("green", 0.3), 
127 | xlab = "HOMO", # X-axis label
128 | ylab = "LUMO", # Y-axis label
129 | font.lab = 2 # X and Y labels are now bold
130 | )
131 | abline(lm(df$LUMO ~ df$HOMO)) # Trend line
132 | 
133 | 
134 | # Scatter plot of fourth pair
135 | plot(df$MW, df$HOMO, pch = 16, 
136 | col = alpha("purple", 0.3), 
137 | xlab = "MW", # X-axis label
138 | ylab = "HOMO", # Y-axis label
139 | font.lab = 2 # X and Y labels are now bold
140 | )
141 | abline(lm(df$HOMO ~ df$MW)) # Trend line
142 | 
143 | 
144 | ######## Creating multi-plot figures
145 | 
146 | # 2 rows by 2 columns
147 | 
148 | par(mfrow=c(2,2))
149 | # Plot 1
150 | # Plot 2
151 | # Plot 3
152 | # Plot 4
153 | 
154 | par(mfrow=c(2,2), mai = c(0.7, 0.7, 0.3, 0.3))
155 | plot(df$MW, df$ALogP) # Plot 1
156 | plot(df$MW, df$Qm) # Plot 2
157 | plot(df$HOMO, df$LUMO) # Plot 3
158 | plot(df$MW, df$HOMO) # Plot 4
159 | 
160 | 
161 | # 3 rows by 1 column
162 | 
163 | par(mfrow=c(3,1))
164 | # Plot 1
165 | # Plot 2
166 | # Plot 4
167 | 
168 | par(mfrow=c(3,1), mai = c(0.3, 0.7, 0.1, 0.3))
169 | plot(df$MW, df$ALogP) # Plot 1
170 | plot(df$MW, df$Qm) # Plot 2
171 | plot(df$MW, df$HOMO) # Plot 4
172 | 
173 | 
174 | # 1 row by 3 columns
175 | 
176 | par(mfrow=c(1,3))
177 | # Plot 1
178 | # Plot 2
179 | # Plot 3
180 | # Plot 4
181 | 
182 | par(mfrow=c(1,3), mai = c(0.3, 0.3, 0.3, 0.3))
183 | plot(df$MW, df$ALogP) # Plot 1
184 | plot(df$MW, df$Qm) # Plot 2
185 | plot(df$MW, df$HOMO) # Plot 4
186 | 
187 | par(mfrow=c(1,3), mai = c(0.3, 0.3, 0.3, 0))
188 | plot(df$ALogP, df$MW) # Plot 1
189 | plot(df$Qm, df$MW) # Plot 2
190 | plot(df$HOMO, df$MW) # Plot 4
191 | 
192 | 
193 | ######## Saving plot to file
194 | 
195 | # Single plot
196 | 
197 | 
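# pdf() opens a graphics device: every plot drawn afterwards is captured until dev.off() closes the device and writes the file.
# The same pattern works for bitmap devices — a minimal sketch (the file name and size here are just examples):
# png("plot.png", width = 800, height = 600)
# plot(df$ALogP, df$MW)
# dev.off()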
pdf("plot.pdf") 198 | #...Insert plot function here... 199 | dev.off() 200 | 201 | pdf("plot.pdf") 202 | plot(df$ALogP, df$MW) 203 | dev.off() 204 | 205 | # Multi-plot 206 | 207 | pdf("plot2.pdf") 208 | par(mfrow=c(2,2)) 209 | # Plot 1 210 | # Plot 2 211 | # Plot 3 212 | # Plot 4 213 | dev.off() 214 | 215 | pdf("plot_multiplot.pdf") 216 | par(mfrow=c(1,3), mai = c(0.3, 0.3, 0.3, 0)) 217 | plot(df$ALogP, df$MW) # Plot 1 218 | plot(df$Qm, df$MW) # Plot 2 219 | plot(df$HOMO, df$MW) # Plot 4 220 | dev.off() 221 | 222 | pdf("plot2.pdf") 223 | par(mfrow=c(2,2), mai = c(0.7, 0.7, 0.3, 0.3)) 224 | # Plot 1 225 | # Plot 2 226 | # Plot 3 227 | # Plot 4 228 | dev.off() 229 | -------------------------------------------------------------------------------- /python-in-r/using-reticulate.R: -------------------------------------------------------------------------------- 1 | ############################################ 2 | # Data Professor # 3 | # http://youtube.com/dataprofessor # 4 | # http://github.com/dataprofessor # 5 | # http://facebook.com/dataprofessor # 6 | # https://www.instagram.com/data.professor # 7 | ############################################ 8 | 9 | # https://rstudio.github.io/reticulate/ 10 | # install.packages("reticulate") 11 | library(reticulate) 12 | 13 | # Loads Python Shell 14 | repl_python() 15 | 16 | # Check the current Python version 17 | 18 | reticulate::py_config() 19 | 20 | # Load a particular Python version on our system 21 | use_python("C:/Program Files/Python38", required = TRUE) 22 | 23 | 24 | 25 | ############################ 26 | # 27 | # matplotlib Example - Scatter plot 28 | # https://matplotlib.org/3.1.1/gallery/shapes_and_collections/scatter.html#sphx-glr-gallery-shapes-and-collections-scatter-py 29 | # 30 | ############################ 31 | 32 | ############################ 33 | # Import libraries 34 | ############################ 35 | 36 | # import matplotlib.pyplot as plt 37 | plt <- import('matplotlib.pyplot') 38 | 39 | # import numpy as np 40 | np <- import('numpy') 41 | 42 | ############################ 43 | # Load the Iris dataset 44 | ############################ 45 | data(iris) 46 | 47 | 48 | ############################ 49 | # Fixing random state for reproducibility 50 | ############################ 51 | 52 | # np.random.seed(19680801) # https://github.com/rstudio/reticulate/issues/226 53 | np$random$seed(19680801L) 54 | 55 | # N = 50 56 | N <- 50L 57 | # x = np.random.rand(N) 58 | x <- np$random$rand(N) 59 | 60 | # y = np.random.rand(N) 61 | y <- np$random$rand(N) 62 | 63 | # colors = np.random.rand(N) 64 | colors <- np$random$rand(N) 65 | 66 | # area = (30 * np.random.rand(N))**2 # 0 to 15 point radii 67 | area <- (30 * np$random$rand(N))**2 68 | 69 | # plt.scatter(x, y, s=area, c=colors, alpha=0.5) 70 | plt$scatter(x, y, s=area, c=colors, alpha=0.5) 71 | 72 | # plt.show() 73 | plt$show() 74 | -------------------------------------------------------------------------------- /python/Hummingbird_ML.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Hummingbird-ML.ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [] 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "accelerator": "GPU" 15 | }, 16 | "cells": [ 17 | { 18 | "cell_type": "markdown", 19 | "metadata": { 20 | "id": "IpOdlr3WAPHJ", 21 | "colab_type": "text" 22 | }, 23 | "source": [ 24 | "# **Hummingbird-ML**\n", 25 | 
"\n", 26 | "[How to Harness GPU to Speed Up Machine Learning with Hummingbird-ML](https://www.youtube.com/watch?v=qN8jcUmo8TI)\n", 27 | "\n", 28 | "Adapted from: https://github.com/microsoft/hummingbird" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": { 34 | "id": "ir3DZd5-_jiu", 35 | "colab_type": "text" 36 | }, 37 | "source": [ 38 | "# Install Hummingbird-ML" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "metadata": { 44 | "id": "ra3JEgWN_bfp", 45 | "colab_type": "code", 46 | "colab": { 47 | "base_uri": "https://localhost:8080/", 48 | "height": 408 49 | }, 50 | "outputId": "4fae39de-26f0-4939-846d-039fb876725a" 51 | }, 52 | "source": [ 53 | "! pip install hummingbird-ml[extra]" 54 | ], 55 | "execution_count": 1, 56 | "outputs": [ 57 | { 58 | "output_type": "stream", 59 | "text": [ 60 | "Collecting hummingbird-ml[extra]\n", 61 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/ed/3b/cf1b8c1e7531377adead8de29e29b00b5aed380544ad0def4c0188b50d80/hummingbird_ml-0.0.5-py2.py3-none-any.whl (60kB)\n", 62 | "\r\u001b[K |█████▌ | 10kB 16.6MB/s eta 0:00:01\r\u001b[K |███████████ | 20kB 1.8MB/s eta 0:00:01\r\u001b[K |████████████████▍ | 30kB 2.2MB/s eta 0:00:01\r\u001b[K |█████████████████████▉ | 40kB 2.5MB/s eta 0:00:01\r\u001b[K |███████████████████████████▎ | 51kB 2.0MB/s eta 0:00:01\r\u001b[K |████████████████████████████████| 61kB 1.8MB/s \n", 63 | "\u001b[?25hRequirement already satisfied: numpy>=1.15 in /usr/local/lib/python3.6/dist-packages (from hummingbird-ml[extra]) (1.18.5)\n", 64 | "Requirement already satisfied: torch>=1.4.* in /usr/local/lib/python3.6/dist-packages (from hummingbird-ml[extra]) (1.6.0+cu101)\n", 65 | "Collecting onnxconverter-common>=1.6.0\n", 66 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/fe/7a/7e30c643cd7d2ad87689188ef34ce93e657bd14da3605f87bcdbc19cd5b1/onnxconverter_common-1.7.0-py2.py3-none-any.whl (64kB)\n", 67 | "\u001b[K |████████████████████████████████| 71kB 3.7MB/s \n", 68 | "\u001b[?25hRequirement already satisfied: scikit-learn>=0.22.1 in /usr/local/lib/python3.6/dist-packages (from hummingbird-ml[extra]) (0.22.2.post1)\n", 69 | "Requirement already satisfied: xgboost==0.90; extra == \"extra\" in /usr/local/lib/python3.6/dist-packages (from hummingbird-ml[extra]) (0.90)\n", 70 | "Requirement already satisfied: lightgbm>=2.2; extra == \"extra\" in /usr/local/lib/python3.6/dist-packages (from hummingbird-ml[extra]) (2.2.3)\n", 71 | "Requirement already satisfied: future in /usr/local/lib/python3.6/dist-packages (from torch>=1.4.*->hummingbird-ml[extra]) (0.16.0)\n", 72 | "Collecting onnx\n", 73 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/36/ee/bc7bc88fc8449266add978627e90c363069211584b937fd867b0ccc59f09/onnx-1.7.0-cp36-cp36m-manylinux1_x86_64.whl (7.4MB)\n", 74 | "\u001b[K |████████████████████████████████| 7.4MB 16.0MB/s \n", 75 | "\u001b[?25hRequirement already satisfied: protobuf in /usr/local/lib/python3.6/dist-packages (from onnxconverter-common>=1.6.0->hummingbird-ml[extra]) (3.12.4)\n", 76 | "Requirement already satisfied: joblib>=0.11 in /usr/local/lib/python3.6/dist-packages (from scikit-learn>=0.22.1->hummingbird-ml[extra]) (0.16.0)\n", 77 | "Requirement already satisfied: scipy>=0.17.0 in /usr/local/lib/python3.6/dist-packages (from scikit-learn>=0.22.1->hummingbird-ml[extra]) (1.4.1)\n", 78 | "Requirement already satisfied: typing-extensions>=3.6.2.1 in /usr/local/lib/python3.6/dist-packages (from onnx->onnxconverter-common>=1.6.0->hummingbird-ml[extra]) 
(3.7.4.3)\n", 79 | "Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from onnx->onnxconverter-common>=1.6.0->hummingbird-ml[extra]) (1.15.0)\n", 80 | "Requirement already satisfied: setuptools in /usr/local/lib/python3.6/dist-packages (from protobuf->onnxconverter-common>=1.6.0->hummingbird-ml[extra]) (49.6.0)\n", 81 | "Installing collected packages: onnx, onnxconverter-common, hummingbird-ml\n", 82 | "Successfully installed hummingbird-ml-0.0.5 onnx-1.7.0 onnxconverter-common-1.7.0\n" 83 | ], 84 | "name": "stdout" 85 | } 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": { 91 | "id": "YnA-PmeA_q70", 92 | "colab_type": "text" 93 | }, 94 | "source": [ 95 | "# Import libraries" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "metadata": { 101 | "id": "lkIThThi_puf", 102 | "colab_type": "code", 103 | "colab": {} 104 | }, 105 | "source": [ 106 | "import numpy as np\n", 107 | "from sklearn.ensemble import RandomForestClassifier\n", 108 | "from hummingbird.ml import convert" 109 | ], 110 | "execution_count": 2, 111 | "outputs": [] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "metadata": { 116 | "id": "rFw_4cGa_-tF", 117 | "colab_type": "text" 118 | }, 119 | "source": [ 120 | "# Create some random data for binary classification" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "metadata": { 126 | "id": "hGGngPPp__mx", 127 | "colab_type": "code", 128 | "colab": {} 129 | }, 130 | "source": [ 131 | "num_classes = 2\n", 132 | "X = np.random.rand(100000, 28)\n", 133 | "y = np.random.randint(num_classes, size=100000)" 134 | ], 135 | "execution_count": 3, 136 | "outputs": [] 137 | }, 138 | { 139 | "cell_type": "markdown", 140 | "metadata": { 141 | "id": "WusxNKH4AHII", 142 | "colab_type": "text" 143 | }, 144 | "source": [ 145 | "# Create and train a model (scikit-learn RandomForestClassifier)" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "metadata": { 151 | "id": "GMRJRuBwAGeV", 152 | "colab_type": "code", 153 | "colab": {} 154 | }, 155 | "source": [ 156 | "skl_model = RandomForestClassifier(n_estimators=10, max_depth=10)" 157 | ], 158 | "execution_count": 4, 159 | "outputs": [] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "metadata": { 164 | "id": "M_kGo80yAYTn", 165 | "colab_type": "code", 166 | "colab": { 167 | "base_uri": "https://localhost:8080/", 168 | "height": 34 169 | }, 170 | "outputId": "aa863652-02f8-4578-8fb7-e3b028685cd7" 171 | }, 172 | "source": [ 173 | "%%timeit\n", 174 | "skl_model.fit(X, y)" 175 | ], 176 | "execution_count": 5, 177 | "outputs": [ 178 | { 179 | "output_type": "stream", 180 | "text": [ 181 | "1 loop, best of 3: 4.78 s per loop\n" 182 | ], 183 | "name": "stdout" 184 | } 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "metadata": { 190 | "id": "Hp4a8I0tAbBl", 191 | "colab_type": "code", 192 | "colab": { 193 | "base_uri": "https://localhost:8080/", 194 | "height": 34 195 | }, 196 | "outputId": "4e083fd5-981f-4238-9158-3f4500585560" 197 | }, 198 | "source": [ 199 | "%%timeit\n", 200 | "skl_model.predict(X)" 201 | ], 202 | "execution_count": 6, 203 | "outputs": [ 204 | { 205 | "output_type": "stream", 206 | "text": [ 207 | "10 loops, best of 3: 85.6 ms per loop\n" 208 | ], 209 | "name": "stdout" 210 | } 211 | ] 212 | }, 213 | { 214 | "cell_type": "markdown", 215 | "metadata": { 216 | "id": "mNiBvy9BA7wR", 217 | "colab_type": "text" 218 | }, 219 | "source": [ 220 | "# Use Hummingbird to convert the model to PyTorch" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | 
"metadata": { 226 | "id": "vcAOpuxxAzPc", 227 | "colab_type": "code", 228 | "colab": {} 229 | }, 230 | "source": [ 231 | "model = convert(skl_model, 'pytorch')" 232 | ], 233 | "execution_count": 7, 234 | "outputs": [] 235 | }, 236 | { 237 | "cell_type": "markdown", 238 | "metadata": { 239 | "id": "dpt6_4l8BF7e", 240 | "colab_type": "text" 241 | }, 242 | "source": [ 243 | "# Run predictions on CPU" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "metadata": { 249 | "id": "_BiU63hNBDu-", 250 | "colab_type": "code", 251 | "colab": { 252 | "base_uri": "https://localhost:8080/", 253 | "height": 34 254 | }, 255 | "outputId": "1bd8b158-a62b-4fe0-be09-ca382c817247" 256 | }, 257 | "source": [ 258 | "%%timeit\n", 259 | "model.predict(X)" 260 | ], 261 | "execution_count": 8, 262 | "outputs": [ 263 | { 264 | "output_type": "stream", 265 | "text": [ 266 | "1 loop, best of 3: 174 ms per loop\n" 267 | ], 268 | "name": "stdout" 269 | } 270 | ] 271 | }, 272 | { 273 | "cell_type": "markdown", 274 | "metadata": { 275 | "id": "F10tJEMKBPZG", 276 | "colab_type": "text" 277 | }, 278 | "source": [ 279 | "# Run predictions on GPU" 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "metadata": { 285 | "id": "l2PUbqoHBJBX", 286 | "colab_type": "code", 287 | "colab": {} 288 | }, 289 | "source": [ 290 | "model.to('cuda')" 291 | ], 292 | "execution_count": 9, 293 | "outputs": [] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "metadata": { 298 | "id": "-AB23_VTBRMP", 299 | "colab_type": "code", 300 | "colab": { 301 | "base_uri": "https://localhost:8080/", 302 | "height": 51 303 | }, 304 | "outputId": "b9efea7d-913c-4326-c14a-6b6ca0e9c063" 305 | }, 306 | "source": [ 307 | "%%timeit\n", 308 | "model.predict(X)" 309 | ], 310 | "execution_count": 10, 311 | "outputs": [ 312 | { 313 | "output_type": "stream", 314 | "text": [ 315 | "The slowest run took 5.22 times longer than the fastest. 
This could mean that an intermediate result is being cached.\n", 316 | "100 loops, best of 3: 14.8 ms per loop\n" 317 | ], 318 | "name": "stdout" 319 | } 320 | ] 321 | }, 322 | { 323 | "cell_type": "markdown", 324 | "metadata": { 325 | "id": "dbkQU69JDt7T", 326 | "colab_type": "text" 327 | }, 328 | "source": [ 329 | "# Calculation Time" 330 | ] 331 | }, 332 | { 333 | "cell_type": "markdown", 334 | "metadata": { 335 | "id": "Hr1R_9nwDwpc", 336 | "colab_type": "text" 337 | }, 338 | "source": [ 339 | "Methods | Timing | Performance\n", 340 | "--|--|--\n", 341 | "scikit-learn | 85.6 ms | -\n", 342 | "PyTorch (CPU) | 174 ms | 2 X slower than scikit-learn\n", 343 | "PyTorch (GPU) | 14.8 ms | Almost 6 X faster than scikit-learn; Almost 12 X faster than PyTorch (CPU)" 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "metadata": { 349 | "id": "9lmR3LHoEzhl", 350 | "colab_type": "code", 351 | "colab": {} 352 | }, 353 | "source": [ 354 | "" 355 | ], 356 | "execution_count": null, 357 | "outputs": [] 358 | } 359 | ] 360 | } -------------------------------------------------------------------------------- /python/google_colab_install_conda.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "conda-on-google-colab.ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [] 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | } 14 | }, 15 | "cells": [ 16 | { 17 | "cell_type": "code", 18 | "metadata": { 19 | "id": "uyfFc8VufUyl", 20 | "colab_type": "code", 21 | "outputId": "36b12856-fc68-40b8-c203-0fcf8ab7244e", 22 | "colab": { 23 | "base_uri": "https://localhost:8080/", 24 | "height": 1000 25 | } 26 | }, 27 | "source": [ 28 | "################################################################################\n", 29 | "# INSTALL CONDA ON GOOGLE COLAB\n", 30 | "################################################################################\n", 31 | "! wget https://repo.anaconda.com/miniconda/Miniconda3-py37_4.8.2-Linux-x86_64.sh\n", 32 | "! chmod +x Miniconda3-py37_4.8.2-Linux-x86_64.sh\n", 33 | "! bash ./Miniconda3-py37_4.8.2-Linux-x86_64.sh -b -f -p /usr/local\n", 34 | "import sys\n", 35 | "sys.path.append('/usr/local/lib/python3.7/site-packages/')" 36 | ], 37 | "execution_count": 0, 38 | "outputs": [ 39 | { 40 | "output_type": "stream", 41 | "text": [ 42 | "--2020-04-06 03:23:37-- https://repo.anaconda.com/miniconda/Miniconda3-py37_4.8.2-Linux-x86_64.sh\n", 43 | "Resolving repo.anaconda.com (repo.anaconda.com)... 104.16.130.3, 104.16.131.3, 2606:4700::6810:8303, ...\n", 44 | "Connecting to repo.anaconda.com (repo.anaconda.com)|104.16.130.3|:443... connected.\n", 45 | "HTTP request sent, awaiting response... 
200 OK\n", 46 | "Length: 85055499 (81M) [application/x-sh]\n", 47 | "Saving to: ‘Miniconda3-py37_4.8.2-Linux-x86_64.sh.2’\n", 48 | "\n", 49 | "\r Miniconda 0%[ ] 0 --.-KB/s \r Miniconda3 48%[========> ] 39.24M 196MB/s \r Miniconda3- 93%[=================> ] 76.01M 189MB/s \rMiniconda3-py37_4.8 100%[===================>] 81.12M 187MB/s in 0.4s \n", 50 | "\n", 51 | "2020-04-06 03:23:37 (187 MB/s) - ‘Miniconda3-py37_4.8.2-Linux-x86_64.sh.2’ saved [85055499/85055499]\n", 52 | "\n", 53 | "PREFIX=/usr/local\n", 54 | "Unpacking payload ...\n", 55 | "Collecting package metadata (current_repodata.json): - \b\b\\ \b\bdone\n", 56 | "Solving environment: / \b\b- \b\b\\ \n", 57 | "The environment is inconsistent, please check the package plan carefully\n", 58 | "The following packages are causing the inconsistency:\n", 59 | "\n", 60 | " - defaults/linux-64::urllib3==1.25.8=py37_0\n", 61 | " - defaults/linux-64::ruamel_yaml==0.15.87=py37h7b6447c_0\n", 62 | " - defaults/linux-64::pyopenssl==19.1.0=py37_0\n", 63 | " - defaults/linux-64::pysocks==1.7.1=py37_0\n", 64 | " - defaults/linux-64::six==1.14.0=py37_0\n", 65 | " - defaults/linux-64::setuptools==45.2.0=py37_0\n", 66 | " - defaults/linux-64::idna==2.8=py37_0\n", 67 | " - defaults/noarch::tqdm==4.42.1=py_0\n", 68 | " - defaults/linux-64::asn1crypto==1.3.0=py37_0\n", 69 | " - defaults/linux-64::cffi==1.14.0=py37h2e261b9_0\n", 70 | " - defaults/linux-64::wheel==0.34.2=py37_0\n", 71 | " - defaults/linux-64::conda-package-handling==1.6.0=py37h7b6447c_0\n", 72 | " - defaults/linux-64::pip==20.0.2=py37_1\n", 73 | " - defaults/linux-64::cryptography==2.8=py37h1ba5d50_0\n", 74 | " - defaults/linux-64::python==3.7.6=h0371630_2\n", 75 | " - defaults/linux-64::pycparser==2.19=py37_0\n", 76 | " - defaults/linux-64::pycosat==0.6.3=py37h7b6447c_0\n", 77 | " - defaults/linux-64::requests==2.22.0=py37_1\n", 78 | " - defaults/linux-64::chardet==3.0.4=py37_1003\n", 79 | "\b\b| \b\b/ \b\bdone\n", 80 | "\n", 81 | "## Package Plan ##\n", 82 | "\n", 83 | " environment location: /usr/local\n", 84 | "\n", 85 | " added / updated specs:\n", 86 | " - _libgcc_mutex==0.1=main\n", 87 | " - asn1crypto==1.3.0=py37_0\n", 88 | " - ca-certificates==2020.1.1=0\n", 89 | " - certifi==2019.11.28=py37_0\n", 90 | " - cffi==1.14.0=py37h2e261b9_0\n", 91 | " - chardet==3.0.4=py37_1003\n", 92 | " - conda-package-handling==1.6.0=py37h7b6447c_0\n", 93 | " - conda==4.8.2=py37_0\n", 94 | " - cryptography==2.8=py37h1ba5d50_0\n", 95 | " - idna==2.8=py37_0\n", 96 | " - ld_impl_linux-64==2.33.1=h53a641e_7\n", 97 | " - libedit==3.1.20181209=hc058e9b_0\n", 98 | " - libffi==3.2.1=hd88cf55_4\n", 99 | " - libgcc-ng==9.1.0=hdf63c60_0\n", 100 | " - libstdcxx-ng==9.1.0=hdf63c60_0\n", 101 | " - ncurses==6.2=he6710b0_0\n", 102 | " - openssl==1.1.1d=h7b6447c_4\n", 103 | " - pip==20.0.2=py37_1\n", 104 | " - pycosat==0.6.3=py37h7b6447c_0\n", 105 | " - pycparser==2.19=py37_0\n", 106 | " - pyopenssl==19.1.0=py37_0\n", 107 | " - pysocks==1.7.1=py37_0\n", 108 | " - python==3.7.6=h0371630_2\n", 109 | " - readline==7.0=h7b6447c_5\n", 110 | " - requests==2.22.0=py37_1\n", 111 | " - ruamel_yaml==0.15.87=py37h7b6447c_0\n", 112 | " - setuptools==45.2.0=py37_0\n", 113 | " - six==1.14.0=py37_0\n", 114 | " - sqlite==3.31.1=h7b6447c_0\n", 115 | " - tk==8.6.8=hbc83047_0\n", 116 | " - tqdm==4.42.1=py_0\n", 117 | " - urllib3==1.25.8=py37_0\n", 118 | " - wheel==0.34.2=py37_0\n", 119 | " - xz==5.2.4=h14c3975_4\n", 120 | " - yaml==0.1.7=had09818_2\n", 121 | " - zlib==1.2.11=h7b6447c_3\n", 122 | "\n", 123 | "\n", 124 | "The following 
NEW packages will be INSTALLED:\n", 125 | "\n", 126 | " certifi pkgs/main/linux-64::certifi-2019.11.28-py37_0\n", 127 | " conda pkgs/main/linux-64::conda-4.8.2-py37_0\n", 128 | " openssl pkgs/main/linux-64::openssl-1.1.1d-h7b6447c_4\n", 129 | "\n", 130 | "\n", 131 | "Preparing transaction: \\ \b\bdone\n", 132 | "Executing transaction: / \b\b- \b\bdone\n", 133 | "installation finished.\n", 134 | "WARNING:\n", 135 | " You currently have a PYTHONPATH environment variable set. This may cause\n", 136 | " unexpected behavior when running the Python interpreter in Miniconda3.\n", 137 | " For best results, please verify that your PYTHONPATH only points to\n", 138 | " directories of packages that are compatible with the Python interpreter\n", 139 | " in Miniconda3: /usr/local\n" 140 | ], 141 | "name": "stdout" 142 | } 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "metadata": { 148 | "id": "QD319lDvf6Xp", 149 | "colab_type": "code", 150 | "outputId": "71cc1953-4fd9-4f85-fc0a-5a3f7e7688bd", 151 | "colab": { 152 | "base_uri": "https://localhost:8080/", 153 | "height": 996 154 | } 155 | }, 156 | "source": [ 157 | "! conda install -c rdkit rdkit -y" 158 | ], 159 | "execution_count": 0, 160 | "outputs": [ 161 | { 162 | "output_type": "stream", 163 | "text": [ 164 | "Collecting package metadata (current_repodata.json): - \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\bdone\n", 165 | "Solving environment: - \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\bfailed with initial frozen solve. Retrying with flexible solve.\n", 166 | "Solving environment: / \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\bfailed with repodata from current_repodata.json, will retry with next repodata source.\n", 167 | "Collecting package metadata (repodata.json): - \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\bdone\n", 168 | "Solving environment: | \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\bdone\n", 169 | "\n", 170 | "## Package Plan ##\n", 171 | "\n", 172 | " environment location: /usr/local\n", 173 | "\n", 174 | " added / updated specs:\n", 175 | " - rdkit\n", 176 | "\n", 177 | "\n", 178 | "The following NEW packages will be INSTALLED:\n", 179 | "\n", 180 | " blas pkgs/main/linux-64::blas-1.0-mkl\n", 181 | " bzip2 pkgs/main/linux-64::bzip2-1.0.8-h7b6447c_0\n", 182 | " cairo pkgs/main/linux-64::cairo-1.14.12-h8948797_3\n", 183 | " fontconfig pkgs/main/linux-64::fontconfig-2.13.0-h9420a91_0\n", 184 | " freetype pkgs/main/linux-64::freetype-2.9.1-h8a8886c_1\n", 185 | " glib pkgs/main/linux-64::glib-2.63.1-h5a9c865_0\n", 186 | " icu pkgs/main/linux-64::icu-58.2-h9c2bf20_1\n", 187 | " intel-openmp pkgs/main/linux-64::intel-openmp-2020.0-166\n", 188 | " jpeg pkgs/main/linux-64::jpeg-9b-h024ee3a_2\n", 189 | " libboost pkgs/main/linux-64::libboost-1.67.0-h46d08c1_4\n", 190 | " libgfortran-ng pkgs/main/linux-64::libgfortran-ng-7.3.0-hdf63c60_0\n", 191 | " libpng pkgs/main/linux-64::libpng-1.6.37-hbc83047_0\n", 192 | " libtiff pkgs/main/linux-64::libtiff-4.1.0-h2733197_0\n", 193 | " libuuid 
pkgs/main/linux-64::libuuid-1.0.3-h1bed415_2\n", 194 | " libxcb pkgs/main/linux-64::libxcb-1.13-h1bed415_1\n", 195 | " libxml2 pkgs/main/linux-64::libxml2-2.9.9-hea5a465_1\n", 196 | " mkl pkgs/main/linux-64::mkl-2020.0-166\n", 197 | " mkl-service pkgs/main/linux-64::mkl-service-2.3.0-py37he904b0f_0\n", 198 | " mkl_fft pkgs/main/linux-64::mkl_fft-1.0.15-py37ha843d7b_0\n", 199 | " mkl_random pkgs/main/linux-64::mkl_random-1.1.0-py37hd6b4f25_0\n", 200 | " numpy pkgs/main/linux-64::numpy-1.18.1-py37h4f9e942_0\n", 201 | " numpy-base pkgs/main/linux-64::numpy-base-1.18.1-py37hde5b4d6_1\n", 202 | " olefile pkgs/main/linux-64::olefile-0.46-py37_0\n", 203 | " pandas pkgs/main/linux-64::pandas-1.0.3-py37h0573a6f_0\n", 204 | " pcre pkgs/main/linux-64::pcre-8.43-he6710b0_0\n", 205 | " pillow pkgs/main/linux-64::pillow-7.0.0-py37hb39fc2d_0\n", 206 | " pixman pkgs/main/linux-64::pixman-0.38.0-h7b6447c_0\n", 207 | " py-boost pkgs/main/linux-64::py-boost-1.67.0-py37h04863e7_4\n", 208 | " python-dateutil pkgs/main/noarch::python-dateutil-2.8.1-py_0\n", 209 | " pytz pkgs/main/noarch::pytz-2019.3-py_0\n", 210 | " rdkit rdkit/linux-64::rdkit-2020.03.1.0-py37hc20afe1_1\n", 211 | " zstd pkgs/main/linux-64::zstd-1.3.7-h0b5b093_0\n", 212 | "\n", 213 | "The following packages will be UPDATED:\n", 214 | "\n", 215 | " certifi 2019.11.28-py37_0 --> 2019.11.28-py37_1\n", 216 | " conda 4.8.2-py37_0 --> 4.8.3-py37_0\n", 217 | " openssl 1.1.1d-h7b6447c_4 --> 1.1.1f-h7b6447c_0\n", 218 | "\n", 219 | "\n", 220 | "Proceed ([y]/n)? " 221 | ], 222 | "name": "stdout" 223 | } 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "metadata": { 229 | "id": "M8MeOz0miqJ1", 230 | "colab_type": "code", 231 | "colab": {} 232 | }, 233 | "source": [ 234 | "" 235 | ], 236 | "execution_count": 0, 237 | "outputs": [] 238 | } 239 | ] 240 | } -------------------------------------------------------------------------------- /python/iris/iris-classification-random-forest.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Building a Classification Model for the Iris data set\n", 8 | "\n", 9 | "Chanin Nantasenamat\n", 10 | "\n", 11 | "Data Professor YouTube channel, http://youtube.com/dataprofessor \n", 12 | "\n", 13 | "In this Jupyter notebook, we will be building a classification model for the Iris data set using the random forest algorithm." 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "## 1. Import libraries" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 1, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "from sklearn import datasets\n", 30 | "from sklearn.model_selection import train_test_split\n", 31 | "from sklearn.ensemble import RandomForestClassifier\n", 32 | "from sklearn.datasets import make_classification" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": {}, 38 | "source": [ 39 | "## 2. Load the *iris* data set" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 2, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "iris = datasets.load_iris()" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "## 3. Input features\n", 56 | "The ***iris*** data set contains 4 input features and 1 output variable (the class label)." 
57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "### 3.1. Input features" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 3, 69 | "metadata": { 70 | "scrolled": true 71 | }, 72 | "outputs": [ 73 | { 74 | "name": "stdout", 75 | "output_type": "stream", 76 | "text": [ 77 | "['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']\n" 78 | ] 79 | } 80 | ], 81 | "source": [ 82 | "print(iris.feature_names)" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "metadata": {}, 88 | "source": [ 89 | "### 3.2. Output features" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 4, 95 | "metadata": {}, 96 | "outputs": [ 97 | { 98 | "name": "stdout", 99 | "output_type": "stream", 100 | "text": [ 101 | "['setosa' 'versicolor' 'virginica']\n" 102 | ] 103 | } 104 | ], 105 | "source": [ 106 | "print(iris.target_names)" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": {}, 112 | "source": [ 113 | "## 4. Glimpse of the data" 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "metadata": {}, 119 | "source": [ 120 | "### 4.1. Input features" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 5, 126 | "metadata": {}, 127 | "outputs": [ 128 | { 129 | "data": { 130 | "text/plain": [ 131 | "array([[5.1, 3.5, 1.4, 0.2],\n", 132 | " [4.9, 3. , 1.4, 0.2],\n", 133 | " [4.7, 3.2, 1.3, 0.2],\n", 134 | " [4.6, 3.1, 1.5, 0.2],\n", 135 | " [5. , 3.6, 1.4, 0.2],\n", 136 | " [5.4, 3.9, 1.7, 0.4],\n", 137 | " [4.6, 3.4, 1.4, 0.3],\n", 138 | " [5. , 3.4, 1.5, 0.2],\n", 139 | " [4.4, 2.9, 1.4, 0.2],\n", 140 | " [4.9, 3.1, 1.5, 0.1],\n", 141 | " [5.4, 3.7, 1.5, 0.2],\n", 142 | " [4.8, 3.4, 1.6, 0.2],\n", 143 | " [4.8, 3. , 1.4, 0.1],\n", 144 | " [4.3, 3. , 1.1, 0.1],\n", 145 | " [5.8, 4. , 1.2, 0.2],\n", 146 | " [5.7, 4.4, 1.5, 0.4],\n", 147 | " [5.4, 3.9, 1.3, 0.4],\n", 148 | " [5.1, 3.5, 1.4, 0.3],\n", 149 | " [5.7, 3.8, 1.7, 0.3],\n", 150 | " [5.1, 3.8, 1.5, 0.3],\n", 151 | " [5.4, 3.4, 1.7, 0.2],\n", 152 | " [5.1, 3.7, 1.5, 0.4],\n", 153 | " [4.6, 3.6, 1. , 0.2],\n", 154 | " [5.1, 3.3, 1.7, 0.5],\n", 155 | " [4.8, 3.4, 1.9, 0.2],\n", 156 | " [5. , 3. , 1.6, 0.2],\n", 157 | " [5. , 3.4, 1.6, 0.4],\n", 158 | " [5.2, 3.5, 1.5, 0.2],\n", 159 | " [5.2, 3.4, 1.4, 0.2],\n", 160 | " [4.7, 3.2, 1.6, 0.2],\n", 161 | " [4.8, 3.1, 1.6, 0.2],\n", 162 | " [5.4, 3.4, 1.5, 0.4],\n", 163 | " [5.2, 4.1, 1.5, 0.1],\n", 164 | " [5.5, 4.2, 1.4, 0.2],\n", 165 | " [4.9, 3.1, 1.5, 0.2],\n", 166 | " [5. , 3.2, 1.2, 0.2],\n", 167 | " [5.5, 3.5, 1.3, 0.2],\n", 168 | " [4.9, 3.6, 1.4, 0.1],\n", 169 | " [4.4, 3. , 1.3, 0.2],\n", 170 | " [5.1, 3.4, 1.5, 0.2],\n", 171 | " [5. , 3.5, 1.3, 0.3],\n", 172 | " [4.5, 2.3, 1.3, 0.3],\n", 173 | " [4.4, 3.2, 1.3, 0.2],\n", 174 | " [5. , 3.5, 1.6, 0.6],\n", 175 | " [5.1, 3.8, 1.9, 0.4],\n", 176 | " [4.8, 3. , 1.4, 0.3],\n", 177 | " [5.1, 3.8, 1.6, 0.2],\n", 178 | " [4.6, 3.2, 1.4, 0.2],\n", 179 | " [5.3, 3.7, 1.5, 0.2],\n", 180 | " [5. , 3.3, 1.4, 0.2],\n", 181 | " [7. , 3.2, 4.7, 1.4],\n", 182 | " [6.4, 3.2, 4.5, 1.5],\n", 183 | " [6.9, 3.1, 4.9, 1.5],\n", 184 | " [5.5, 2.3, 4. , 1.3],\n", 185 | " [6.5, 2.8, 4.6, 1.5],\n", 186 | " [5.7, 2.8, 4.5, 1.3],\n", 187 | " [6.3, 3.3, 4.7, 1.6],\n", 188 | " [4.9, 2.4, 3.3, 1. ],\n", 189 | " [6.6, 2.9, 4.6, 1.3],\n", 190 | " [5.2, 2.7, 3.9, 1.4],\n", 191 | " [5. , 2. , 3.5, 1. ],\n", 192 | " [5.9, 3. , 4.2, 1.5],\n", 193 | " [6. , 2.2, 4. , 1. 
],\n", 194 | " [6.1, 2.9, 4.7, 1.4],\n", 195 | " [5.6, 2.9, 3.6, 1.3],\n", 196 | " [6.7, 3.1, 4.4, 1.4],\n", 197 | " [5.6, 3. , 4.5, 1.5],\n", 198 | " [5.8, 2.7, 4.1, 1. ],\n", 199 | " [6.2, 2.2, 4.5, 1.5],\n", 200 | " [5.6, 2.5, 3.9, 1.1],\n", 201 | " [5.9, 3.2, 4.8, 1.8],\n", 202 | " [6.1, 2.8, 4. , 1.3],\n", 203 | " [6.3, 2.5, 4.9, 1.5],\n", 204 | " [6.1, 2.8, 4.7, 1.2],\n", 205 | " [6.4, 2.9, 4.3, 1.3],\n", 206 | " [6.6, 3. , 4.4, 1.4],\n", 207 | " [6.8, 2.8, 4.8, 1.4],\n", 208 | " [6.7, 3. , 5. , 1.7],\n", 209 | " [6. , 2.9, 4.5, 1.5],\n", 210 | " [5.7, 2.6, 3.5, 1. ],\n", 211 | " [5.5, 2.4, 3.8, 1.1],\n", 212 | " [5.5, 2.4, 3.7, 1. ],\n", 213 | " [5.8, 2.7, 3.9, 1.2],\n", 214 | " [6. , 2.7, 5.1, 1.6],\n", 215 | " [5.4, 3. , 4.5, 1.5],\n", 216 | " [6. , 3.4, 4.5, 1.6],\n", 217 | " [6.7, 3.1, 4.7, 1.5],\n", 218 | " [6.3, 2.3, 4.4, 1.3],\n", 219 | " [5.6, 3. , 4.1, 1.3],\n", 220 | " [5.5, 2.5, 4. , 1.3],\n", 221 | " [5.5, 2.6, 4.4, 1.2],\n", 222 | " [6.1, 3. , 4.6, 1.4],\n", 223 | " [5.8, 2.6, 4. , 1.2],\n", 224 | " [5. , 2.3, 3.3, 1. ],\n", 225 | " [5.6, 2.7, 4.2, 1.3],\n", 226 | " [5.7, 3. , 4.2, 1.2],\n", 227 | " [5.7, 2.9, 4.2, 1.3],\n", 228 | " [6.2, 2.9, 4.3, 1.3],\n", 229 | " [5.1, 2.5, 3. , 1.1],\n", 230 | " [5.7, 2.8, 4.1, 1.3],\n", 231 | " [6.3, 3.3, 6. , 2.5],\n", 232 | " [5.8, 2.7, 5.1, 1.9],\n", 233 | " [7.1, 3. , 5.9, 2.1],\n", 234 | " [6.3, 2.9, 5.6, 1.8],\n", 235 | " [6.5, 3. , 5.8, 2.2],\n", 236 | " [7.6, 3. , 6.6, 2.1],\n", 237 | " [4.9, 2.5, 4.5, 1.7],\n", 238 | " [7.3, 2.9, 6.3, 1.8],\n", 239 | " [6.7, 2.5, 5.8, 1.8],\n", 240 | " [7.2, 3.6, 6.1, 2.5],\n", 241 | " [6.5, 3.2, 5.1, 2. ],\n", 242 | " [6.4, 2.7, 5.3, 1.9],\n", 243 | " [6.8, 3. , 5.5, 2.1],\n", 244 | " [5.7, 2.5, 5. , 2. ],\n", 245 | " [5.8, 2.8, 5.1, 2.4],\n", 246 | " [6.4, 3.2, 5.3, 2.3],\n", 247 | " [6.5, 3. , 5.5, 1.8],\n", 248 | " [7.7, 3.8, 6.7, 2.2],\n", 249 | " [7.7, 2.6, 6.9, 2.3],\n", 250 | " [6. , 2.2, 5. , 1.5],\n", 251 | " [6.9, 3.2, 5.7, 2.3],\n", 252 | " [5.6, 2.8, 4.9, 2. ],\n", 253 | " [7.7, 2.8, 6.7, 2. ],\n", 254 | " [6.3, 2.7, 4.9, 1.8],\n", 255 | " [6.7, 3.3, 5.7, 2.1],\n", 256 | " [7.2, 3.2, 6. , 1.8],\n", 257 | " [6.2, 2.8, 4.8, 1.8],\n", 258 | " [6.1, 3. , 4.9, 1.8],\n", 259 | " [6.4, 2.8, 5.6, 2.1],\n", 260 | " [7.2, 3. , 5.8, 1.6],\n", 261 | " [7.4, 2.8, 6.1, 1.9],\n", 262 | " [7.9, 3.8, 6.4, 2. ],\n", 263 | " [6.4, 2.8, 5.6, 2.2],\n", 264 | " [6.3, 2.8, 5.1, 1.5],\n", 265 | " [6.1, 2.6, 5.6, 1.4],\n", 266 | " [7.7, 3. , 6.1, 2.3],\n", 267 | " [6.3, 3.4, 5.6, 2.4],\n", 268 | " [6.4, 3.1, 5.5, 1.8],\n", 269 | " [6. , 3. , 4.8, 1.8],\n", 270 | " [6.9, 3.1, 5.4, 2.1],\n", 271 | " [6.7, 3.1, 5.6, 2.4],\n", 272 | " [6.9, 3.1, 5.1, 2.3],\n", 273 | " [5.8, 2.7, 5.1, 1.9],\n", 274 | " [6.8, 3.2, 5.9, 2.3],\n", 275 | " [6.7, 3.3, 5.7, 2.5],\n", 276 | " [6.7, 3. , 5.2, 2.3],\n", 277 | " [6.3, 2.5, 5. , 1.9],\n", 278 | " [6.5, 3. , 5.2, 2. ],\n", 279 | " [6.2, 3.4, 5.4, 2.3],\n", 280 | " [5.9, 3. , 5.1, 1.8]])" 281 | ] 282 | }, 283 | "execution_count": 5, 284 | "metadata": {}, 285 | "output_type": "execute_result" 286 | } 287 | ], 288 | "source": [ 289 | "iris.data" 290 | ] 291 | }, 292 | { 293 | "cell_type": "markdown", 294 | "metadata": {}, 295 | "source": [ 296 | "### 4.2. 
Output variable (the Class label)" 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": 30, 302 | "metadata": {}, 303 | "outputs": [ 304 | { 305 | "data": { 306 | "text/plain": [ 307 | "array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 308 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 309 | " 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", 310 | " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", 311 | " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\n", 312 | " 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\n", 313 | " 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])" 314 | ] 315 | }, 316 | "execution_count": 30, 317 | "metadata": {}, 318 | "output_type": "execute_result" 319 | } 320 | ], 321 | "source": [ 322 | "iris.target" 323 | ] 324 | }, 325 | { 326 | "cell_type": "markdown", 327 | "metadata": {}, 328 | "source": [ 329 | "### 4.3. Assigning *input* and *output* variables\n", 330 | "Let's assign the 4 input variables to X and the output variable (class label) to Y" 331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "execution_count": 9, 336 | "metadata": {}, 337 | "outputs": [], 338 | "source": [ 339 | "X = iris.data\n", 340 | "Y = iris.target" 341 | ] 342 | }, 343 | { 344 | "cell_type": "markdown", 345 | "metadata": {}, 346 | "source": [ 347 | "### 4.3. Let's examine the data dimension" 348 | ] 349 | }, 350 | { 351 | "cell_type": "code", 352 | "execution_count": 10, 353 | "metadata": {}, 354 | "outputs": [ 355 | { 356 | "data": { 357 | "text/plain": [ 358 | "(150, 4)" 359 | ] 360 | }, 361 | "execution_count": 10, 362 | "metadata": {}, 363 | "output_type": "execute_result" 364 | } 365 | ], 366 | "source": [ 367 | "X.shape" 368 | ] 369 | }, 370 | { 371 | "cell_type": "code", 372 | "execution_count": 11, 373 | "metadata": {}, 374 | "outputs": [ 375 | { 376 | "data": { 377 | "text/plain": [ 378 | "(150,)" 379 | ] 380 | }, 381 | "execution_count": 11, 382 | "metadata": {}, 383 | "output_type": "execute_result" 384 | } 385 | ], 386 | "source": [ 387 | "Y.shape" 388 | ] 389 | }, 390 | { 391 | "cell_type": "markdown", 392 | "metadata": {}, 393 | "source": [ 394 | "## 5. Build Classification Model using Random Forest" 395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "execution_count": 9, 400 | "metadata": {}, 401 | "outputs": [], 402 | "source": [ 403 | "clf = RandomForestClassifier()" 404 | ] 405 | }, 406 | { 407 | "cell_type": "code", 408 | "execution_count": 10, 409 | "metadata": {}, 410 | "outputs": [ 411 | { 412 | "data": { 413 | "text/plain": [ 414 | "RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,\n", 415 | " criterion='gini', max_depth=None, max_features='auto',\n", 416 | " max_leaf_nodes=None, max_samples=None,\n", 417 | " min_impurity_decrease=0.0, min_impurity_split=None,\n", 418 | " min_samples_leaf=1, min_samples_split=2,\n", 419 | " min_weight_fraction_leaf=0.0, n_estimators=100,\n", 420 | " n_jobs=None, oob_score=False, random_state=None,\n", 421 | " verbose=0, warm_start=False)" 422 | ] 423 | }, 424 | "execution_count": 10, 425 | "metadata": {}, 426 | "output_type": "execute_result" 427 | } 428 | ], 429 | "source": [ 430 | "clf.fit(X, Y)" 431 | ] 432 | }, 433 | { 434 | "cell_type": "markdown", 435 | "metadata": {}, 436 | "source": [ 437 | "## 6. 
Feature Importance" 438 | ] 439 | }, 440 | { 441 | "cell_type": "code", 442 | "execution_count": 11, 443 | "metadata": {}, 444 | "outputs": [ 445 | { 446 | "name": "stdout", 447 | "output_type": "stream", 448 | "text": [ 449 | "[0.07344346 0.01623453 0.42869861 0.4816234 ]\n" 450 | ] 451 | } 452 | ], 453 | "source": [ 454 | "print(clf.feature_importances_)" 455 | ] 456 | }, 457 | { 458 | "cell_type": "markdown", 459 | "metadata": {}, 460 | "source": [ 461 | "## 7. Make Prediction" 462 | ] 463 | }, 464 | { 465 | "cell_type": "code", 466 | "execution_count": 12, 467 | "metadata": {}, 468 | "outputs": [ 469 | { 470 | "data": { 471 | "text/plain": [ 472 | "array([5.1, 3.5, 1.4, 0.2])" 473 | ] 474 | }, 475 | "execution_count": 12, 476 | "metadata": {}, 477 | "output_type": "execute_result" 478 | } 479 | ], 480 | "source": [ 481 | "X[0]" 482 | ] 483 | }, 484 | { 485 | "cell_type": "code", 486 | "execution_count": 13, 487 | "metadata": {}, 488 | "outputs": [ 489 | { 490 | "name": "stdout", 491 | "output_type": "stream", 492 | "text": [ 493 | "[0]\n" 494 | ] 495 | } 496 | ], 497 | "source": [ 498 | "print(clf.predict([[5.1, 3.5, 1.4, 0.2]]))" 499 | ] 500 | }, 501 | { 502 | "cell_type": "code", 503 | "execution_count": 14, 504 | "metadata": {}, 505 | "outputs": [ 506 | { 507 | "name": "stdout", 508 | "output_type": "stream", 509 | "text": [ 510 | "[0]\n" 511 | ] 512 | } 513 | ], 514 | "source": [ 515 | "print(clf.predict(X[[0]]))" 516 | ] 517 | }, 518 | { 519 | "cell_type": "code", 520 | "execution_count": 15, 521 | "metadata": {}, 522 | "outputs": [ 523 | { 524 | "name": "stdout", 525 | "output_type": "stream", 526 | "text": [ 527 | "[[1. 0. 0.]]\n" 528 | ] 529 | } 530 | ], 531 | "source": [ 532 | "print(clf.predict_proba(X[[0]]))" 533 | ] 534 | }, 535 | { 536 | "cell_type": "code", 537 | "execution_count": 16, 538 | "metadata": {}, 539 | "outputs": [ 540 | { 541 | "data": { 542 | "text/plain": [ 543 | "RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,\n", 544 | " criterion='gini', max_depth=None, max_features='auto',\n", 545 | " max_leaf_nodes=None, max_samples=None,\n", 546 | " min_impurity_decrease=0.0, min_impurity_split=None,\n", 547 | " min_samples_leaf=1, min_samples_split=2,\n", 548 | " min_weight_fraction_leaf=0.0, n_estimators=100,\n", 549 | " n_jobs=None, oob_score=False, random_state=None,\n", 550 | " verbose=0, warm_start=False)" 551 | ] 552 | }, 553 | "execution_count": 16, 554 | "metadata": {}, 555 | "output_type": "execute_result" 556 | } 557 | ], 558 | "source": [ 559 | "clf.fit(iris.data, iris.target_names[iris.target])" 560 | ] 561 | }, 562 | { 563 | "cell_type": "markdown", 564 | "metadata": {}, 565 | "source": [ 566 | "## 8. 
Data split (80/20 ratio)" 567 | ] 568 | }, 569 | { 570 | "cell_type": "code", 571 | "execution_count": 17, 572 | "metadata": {}, 573 | "outputs": [], 574 | "source": [ 575 | "X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)" 576 | ] 577 | }, 578 | { 579 | "cell_type": "code", 580 | "execution_count": 18, 581 | "metadata": {}, 582 | "outputs": [ 583 | { 584 | "data": { 585 | "text/plain": [ 586 | "((120, 4), (120,))" 587 | ] 588 | }, 589 | "execution_count": 18, 590 | "metadata": {}, 591 | "output_type": "execute_result" 592 | } 593 | ], 594 | "source": [ 595 | "X_train.shape, Y_train.shape" 596 | ] 597 | }, 598 | { 599 | "cell_type": "code", 600 | "execution_count": 19, 601 | "metadata": {}, 602 | "outputs": [ 603 | { 604 | "data": { 605 | "text/plain": [ 606 | "((30, 4), (30,))" 607 | ] 608 | }, 609 | "execution_count": 19, 610 | "metadata": {}, 611 | "output_type": "execute_result" 612 | } 613 | ], 614 | "source": [ 615 | "X_test.shape, Y_test.shape" 616 | ] 617 | }, 618 | { 619 | "cell_type": "markdown", 620 | "metadata": {}, 621 | "source": [ 622 | "## 9. Rebuild the Random Forest Model" 623 | ] 624 | }, 625 | { 626 | "cell_type": "code", 627 | "execution_count": 20, 628 | "metadata": {}, 629 | "outputs": [ 630 | { 631 | "data": { 632 | "text/plain": [ 633 | "RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,\n", 634 | " criterion='gini', max_depth=None, max_features='auto',\n", 635 | " max_leaf_nodes=None, max_samples=None,\n", 636 | " min_impurity_decrease=0.0, min_impurity_split=None,\n", 637 | " min_samples_leaf=1, min_samples_split=2,\n", 638 | " min_weight_fraction_leaf=0.0, n_estimators=100,\n", 639 | " n_jobs=None, oob_score=False, random_state=None,\n", 640 | " verbose=0, warm_start=False)" 641 | ] 642 | }, 643 | "execution_count": 20, 644 | "metadata": {}, 645 | "output_type": "execute_result" 646 | } 647 | ], 648 | "source": [ 649 | "clf.fit(X_train, Y_train)" 650 | ] 651 | }, 652 | { 653 | "cell_type": "markdown", 654 | "metadata": {}, 655 | "source": [ 656 | "### 9.1. Performs prediction on single sample from the data set" 657 | ] 658 | }, 659 | { 660 | "cell_type": "code", 661 | "execution_count": 21, 662 | "metadata": {}, 663 | "outputs": [ 664 | { 665 | "name": "stdout", 666 | "output_type": "stream", 667 | "text": [ 668 | "[0]\n" 669 | ] 670 | } 671 | ], 672 | "source": [ 673 | "print(clf.predict([[5.1, 3.5, 1.4, 0.2]]))" 674 | ] 675 | }, 676 | { 677 | "cell_type": "code", 678 | "execution_count": 22, 679 | "metadata": {}, 680 | "outputs": [ 681 | { 682 | "name": "stdout", 683 | "output_type": "stream", 684 | "text": [ 685 | "[[1. 0. 0.]]\n" 686 | ] 687 | } 688 | ], 689 | "source": [ 690 | "print(clf.predict_proba([[5.1, 3.5, 1.4, 0.2]]))" 691 | ] 692 | }, 693 | { 694 | "cell_type": "markdown", 695 | "metadata": {}, 696 | "source": [ 697 | "### 9.2. 
Performs prediction on the test set" 698 | ] 699 | }, 700 | { 701 | "cell_type": "markdown", 702 | "metadata": {}, 703 | "source": [ 704 | "#### *Predicted class labels*" 705 | ] 706 | }, 707 | { 708 | "cell_type": "code", 709 | "execution_count": 23, 710 | "metadata": {}, 711 | "outputs": [ 712 | { 713 | "name": "stdout", 714 | "output_type": "stream", 715 | "text": [ 716 | "[2 1 0 1 1 2 1 0 1 0 2 1 1 1 1 1 1 2 2 0 0 2 0 0 0 1 1 1 1 0]\n" 717 | ] 718 | } 719 | ], 720 | "source": [ 721 | "print(clf.predict(X_test))" 722 | ] 723 | }, 724 | { 725 | "cell_type": "markdown", 726 | "metadata": {}, 727 | "source": [ 728 | "#### *Actual class labels*" 729 | ] 730 | }, 731 | { 732 | "cell_type": "code", 733 | "execution_count": 24, 734 | "metadata": {}, 735 | "outputs": [ 736 | { 737 | "name": "stdout", 738 | "output_type": "stream", 739 | "text": [ 740 | "[2 1 0 1 1 2 1 0 1 0 2 1 2 1 1 2 2 2 2 0 0 2 0 0 0 1 1 1 1 0]\n" 741 | ] 742 | } 743 | ], 744 | "source": [ 745 | "print(Y_test)" 746 | ] 747 | }, 748 | { 749 | "cell_type": "markdown", 750 | "metadata": {}, 751 | "source": [ 752 | "## 10. Model Performance" 753 | ] 754 | }, 755 | { 756 | "cell_type": "code", 757 | "execution_count": 25, 758 | "metadata": {}, 759 | "outputs": [ 760 | { 761 | "name": "stdout", 762 | "output_type": "stream", 763 | "text": [ 764 | "0.9\n" 765 | ] 766 | } 767 | ], 768 | "source": [ 769 | "print(clf.score(X_test, Y_test))" 770 | ] 771 | } 772 | ], 773 | "metadata": { 774 | "kernelspec": { 775 | "display_name": "Python 3", 776 | "language": "python", 777 | "name": "python3" 778 | }, 779 | "language_info": { 780 | "codemirror_mode": { 781 | "name": "ipython", 782 | "version": 3 783 | }, 784 | "file_extension": ".py", 785 | "mimetype": "text/x-python", 786 | "name": "python", 787 | "nbconvert_exporter": "python", 788 | "pygments_lexer": "ipython3", 789 | "version": "3.7.6" 790 | } 791 | }, 792 | "nbformat": 4, 793 | "nbformat_minor": 4 794 | } 795 | -------------------------------------------------------------------------------- /python/model_is_training_progress_bar.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "zzD4-HxqXBmt" 7 | }, 8 | "source": [ 9 | "# **Progress Bar in Jupyter Notebook**\n", 10 | "\n", 11 | "Chanin Nantasenamat\n", 12 | "\n", 13 | "**Data Professor YouTube channel**, http://youtube.com/dataprofessor" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": { 19 | "id": "An7XU557Y5ci" 20 | }, 21 | "source": [ 22 | "# **Progress Bar with the tqdm library**" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "metadata": { 29 | "id": "3yc04janmetd" 30 | }, 31 | "outputs": [], 32 | "source": [ 33 | "# ! 
pip install tqdm" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 1, 39 | "metadata": { 40 | "id": "gxa8jup1DNjt" 41 | }, 42 | "outputs": [], 43 | "source": [ 44 | "from tqdm.notebook import tqdm\n", 45 | "from time import sleep" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 2, 51 | "metadata": { 52 | "id": "009bdoXCE74q" 53 | }, 54 | "outputs": [ 55 | { 56 | "data": { 57 | "application/vnd.jupyter.widget-view+json": { 58 | "model_id": "93cc2d7933af4faf96fda14e55f24e23", 59 | "version_major": 2, 60 | "version_minor": 0 61 | }, 62 | "text/plain": [ 63 | " 0%| | 0/100 [00:00] 56.03K --.-KB/s in 0.01s \n", 71 | "\n", 72 | "2020-05-03 17:41:00 (3.82 MB/s) - ‘delaney_solubility_with_descriptors.csv’ saved [57370/57370]\n", 73 | "\n" 74 | ], 75 | "name": "stdout" 76 | } 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "metadata": { 82 | "id": "qJ9j-V_zHwm8", 83 | "colab_type": "code", 84 | "colab": {} 85 | }, 86 | "source": [ 87 | "import pandas as pd" 88 | ], 89 | "execution_count": 0, 90 | "outputs": [] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "metadata": { 95 | "id": "D6twMV5THz2r", 96 | "colab_type": "code", 97 | "colab": { 98 | "base_uri": "https://localhost:8080/", 99 | "height": 415 100 | }, 101 | "outputId": "3230bda2-d61e-48f6-f232-0dabecc08908" 102 | }, 103 | "source": [ 104 | "df = pd.read_csv('delaney_solubility_with_descriptors.csv')\n", 105 | "df" 106 | ], 107 | "execution_count": 12, 108 | "outputs": [ 109 | { 110 | "output_type": "execute_result", 111 | "data": { 112 | "text/html": [ 113 | "
" 231 | ], 232 | "text/plain": [ 233 | " MolLogP MolWt NumRotatableBonds AromaticProportion logS\n", 234 | "0 2.59540 167.850 0.0 0.000000 -2.180\n", 235 | "1 2.37650 133.405 0.0 0.000000 -2.000\n", 236 | "2 2.59380 167.850 1.0 0.000000 -1.740\n", 237 | "3 2.02890 133.405 1.0 0.000000 -1.480\n", 238 | "4 2.91890 187.375 1.0 0.000000 -3.040\n", 239 | "... ... ... ... ... ...\n", 240 | "1139 1.98820 287.343 8.0 0.000000 1.144\n", 241 | "1140 3.42130 286.114 2.0 0.333333 -4.925\n", 242 | "1141 3.60960 308.333 4.0 0.695652 -3.893\n", 243 | "1142 2.56214 354.815 3.0 0.521739 -3.790\n", 244 | "1143 2.02164 179.219 1.0 0.461538 -2.581\n", 245 | "\n", 246 | "[1144 rows x 5 columns]" 247 | ] 248 | }, 249 | "metadata": { 250 | "tags": [] 251 | }, 252 | "execution_count": 12 253 | } 254 | ] 255 | }, 256 | { 257 | "cell_type": "markdown", 258 | "metadata": { 259 | "id": "uc36WETtHp77", 260 | "colab_type": "text" 261 | }, 262 | "source": [ 263 | "## **Selecting specific column(s)**" 264 | ] 265 | }, 266 | { 267 | "cell_type": "markdown", 268 | "metadata": { 269 | "id": "Ha6n6gH3IsPs", 270 | "colab_type": "text" 271 | }, 272 | "source": [ 273 | "### Selecting a single column" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "metadata": { 279 | "id": "94xKLaCoHpeY", 280 | "colab_type": "code", 281 | "colab": { 282 | "base_uri": "https://localhost:8080/", 283 | "height": 225 284 | }, 285 | "outputId": "7bc417a4-ff9e-43c4-f05a-7dc4eb5aa387" 286 | }, 287 | "source": [ 288 | "df.MolLogP" 289 | ], 290 | "execution_count": 13, 291 | "outputs": [ 292 | { 293 | "output_type": "execute_result", 294 | "data": { 295 | "text/plain": [ 296 | "0 2.59540\n", 297 | "1 2.37650\n", 298 | "2 2.59380\n", 299 | "3 2.02890\n", 300 | "4 2.91890\n", 301 | " ... \n", 302 | "1139 1.98820\n", 303 | "1140 3.42130\n", 304 | "1141 3.60960\n", 305 | "1142 2.56214\n", 306 | "1143 2.02164\n", 307 | "Name: MolLogP, Length: 1144, dtype: float64" 308 | ] 309 | }, 310 | "metadata": { 311 | "tags": [] 312 | }, 313 | "execution_count": 13 314 | } 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "metadata": { 320 | "id": "ua7cZKWzIP-J", 321 | "colab_type": "code", 322 | "colab": { 323 | "base_uri": "https://localhost:8080/", 324 | "height": 225 325 | }, 326 | "outputId": "3cff5ff9-f26a-43ea-9c53-7772d68c48e2" 327 | }, 328 | "source": [ 329 | "df['MolLogP']" 330 | ], 331 | "execution_count": 14, 332 | "outputs": [ 333 | { 334 | "output_type": "execute_result", 335 | "data": { 336 | "text/plain": [ 337 | "0 2.59540\n", 338 | "1 2.37650\n", 339 | "2 2.59380\n", 340 | "3 2.02890\n", 341 | "4 2.91890\n", 342 | " ... 
\n", 343 | "1139 1.98820\n", 344 | "1140 3.42130\n", 345 | "1141 3.60960\n", 346 | "1142 2.56214\n", 347 | "1143 2.02164\n", 348 | "Name: MolLogP, Length: 1144, dtype: float64" 349 | ] 350 | }, 351 | "metadata": { 352 | "tags": [] 353 | }, 354 | "execution_count": 14 355 | } 356 | ] 357 | }, 358 | { 359 | "cell_type": "markdown", 360 | "metadata": { 361 | "id": "Mb_loyXfIyhw", 362 | "colab_type": "text" 363 | }, 364 | "source": [ 365 | "### Selecting two or more columns" 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "metadata": { 371 | "id": "ZbLCqhFbIRfS", 372 | "colab_type": "code", 373 | "colab": { 374 | "base_uri": "https://localhost:8080/", 375 | "height": 415 376 | }, 377 | "outputId": "d1d61288-290a-4fce-db4b-e3fbf635ba4d" 378 | }, 379 | "source": [ 380 | "df[['MolLogP','NumRotatableBonds']]" 381 | ], 382 | "execution_count": 23, 383 | "outputs": [ 384 | { 385 | "output_type": "execute_result", 386 | "data": { 387 | "text/html": [ 388 | "
" 470 | ], 471 | "text/plain": [ 472 | " MolLogP NumRotatableBonds\n", 473 | "0 2.59540 0.0\n", 474 | "1 2.37650 0.0\n", 475 | "2 2.59380 1.0\n", 476 | "3 2.02890 1.0\n", 477 | "4 2.91890 1.0\n", 478 | "... ... ...\n", 479 | "1139 1.98820 8.0\n", 480 | "1140 3.42130 2.0\n", 481 | "1141 3.60960 4.0\n", 482 | "1142 2.56214 3.0\n", 483 | "1143 2.02164 1.0\n", 484 | "\n", 485 | "[1144 rows x 2 columns]" 486 | ] 487 | }, 488 | "metadata": { 489 | "tags": [] 490 | }, 491 | "execution_count": 23 492 | } 493 | ] 494 | }, 495 | { 496 | "cell_type": "code", 497 | "metadata": { 498 | "id": "qxMA09nEIV7e", 499 | "colab_type": "code", 500 | "colab": { 501 | "base_uri": "https://localhost:8080/", 502 | "height": 415 503 | }, 504 | "outputId": "6f1131df-d3ff-4735-96d5-166a11c03f56" 505 | }, 506 | "source": [ 507 | "df.iloc[:,[0,2]]" 508 | ], 509 | "execution_count": 22, 510 | "outputs": [ 511 | { 512 | "output_type": "execute_result", 513 | "data": { 514 | "text/html": [ 515 | "
" 597 | ], 598 | "text/plain": [ 599 | " MolLogP NumRotatableBonds\n", 600 | "0 2.59540 0.0\n", 601 | "1 2.37650 0.0\n", 602 | "2 2.59380 1.0\n", 603 | "3 2.02890 1.0\n", 604 | "4 2.91890 1.0\n", 605 | "... ... ...\n", 606 | "1139 1.98820 8.0\n", 607 | "1140 3.42130 2.0\n", 608 | "1141 3.60960 4.0\n", 609 | "1142 2.56214 3.0\n", 610 | "1143 2.02164 1.0\n", 611 | "\n", 612 | "[1144 rows x 2 columns]" 613 | ] 614 | }, 615 | "metadata": { 616 | "tags": [] 617 | }, 618 | "execution_count": 22 619 | } 620 | ] 621 | }, 622 | { 623 | "cell_type": "code", 624 | "metadata": { 625 | "id": "V53KSsZjId1X", 626 | "colab_type": "code", 627 | "colab": { 628 | "base_uri": "https://localhost:8080/", 629 | "height": 415 630 | }, 631 | "outputId": "81be4e24-2bac-4dd5-97d7-32ece84768c6" 632 | }, 633 | "source": [ 634 | "selection = ['MolLogP','NumRotatableBonds', 'logS']\n", 635 | "df[selection]" 636 | ], 637 | "execution_count": 25, 638 | "outputs": [ 639 | { 640 | "output_type": "execute_result", 641 | "data": { 642 | "text/html": [ 643 | "
" 737 | ], 738 | "text/plain": [ 739 | " MolLogP NumRotatableBonds logS\n", 740 | "0 2.59540 0.0 -2.180\n", 741 | "1 2.37650 0.0 -2.000\n", 742 | "2 2.59380 1.0 -1.740\n", 743 | "3 2.02890 1.0 -1.480\n", 744 | "4 2.91890 1.0 -3.040\n", 745 | "... ... ... ...\n", 746 | "1139 1.98820 8.0 1.144\n", 747 | "1140 3.42130 2.0 -4.925\n", 748 | "1141 3.60960 4.0 -3.893\n", 749 | "1142 2.56214 3.0 -3.790\n", 750 | "1143 2.02164 1.0 -2.581\n", 751 | "\n", 752 | "[1144 rows x 3 columns]" 753 | ] 754 | }, 755 | "metadata": { 756 | "tags": [] 757 | }, 758 | "execution_count": 25 759 | } 760 | ] 761 | }, 762 | { 763 | "cell_type": "code", 764 | "metadata": { 765 | "id": "rF85rPt1I9sY", 766 | "colab_type": "code", 767 | "colab": {} 768 | }, 769 | "source": [ 770 | "" 771 | ], 772 | "execution_count": 0, 773 | "outputs": [] 774 | } 775 | ] 776 | } -------------------------------------------------------------------------------- /python/r_magic_command.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "r-magic-command.ipynb", 7 | "provenance": [] 8 | }, 9 | "kernelspec": { 10 | "name": "python3", 11 | "display_name": "Python 3" 12 | } 13 | }, 14 | "cells": [ 15 | { 16 | "cell_type": "markdown", 17 | "metadata": { 18 | "id": "EnyONbNhCqSK", 19 | "colab_type": "text" 20 | }, 21 | "source": [ 22 | "# **Using R and Python in the Same Notebook**\n", 23 | "\n", 24 | "Chanin Nantasenamat\n", 25 | "\n", 26 | "[*'Data Professor' YouTube channel*](http://youtube.com/dataprofessor)\n", 27 | "\n", 28 | "In this Jupyter notebook, I will show you how to use R and Python in the same notebook.\n", 29 | "\n", 30 | "---" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "metadata": { 36 | "id": "2h-2I4CviFCR", 37 | "colab_type": "code", 38 | "colab": {} 39 | }, 40 | "source": [ 41 | "# activate R magic\n", 42 | "%load_ext rpy2.ipython" 43 | ], 44 | "execution_count": 0, 45 | "outputs": [] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": { 50 | "id": "FftFvPLNiZME", 51 | "colab_type": "text" 52 | }, 53 | "source": [ 54 | "## Python" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "metadata": { 60 | "id": "3hPnRI2piJM3", 61 | "colab_type": "code", 62 | "colab": {} 63 | }, 64 | "source": [ 65 | "import pandas as pd" 66 | ], 67 | "execution_count": 0, 68 | "outputs": [] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "metadata": { 73 | "id": "yNKM70-ZiPcg", 74 | "colab_type": "code", 75 | "colab": {} 76 | }, 77 | "source": [ 78 | "x <- 42\n", 79 | "print(x)" 80 | ], 81 | "execution_count": 0, 82 | "outputs": [] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "metadata": { 87 | "id": "dtkChhxpiWEd", 88 | "colab_type": "text" 89 | }, 90 | "source": [ 91 | "## R" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "metadata": { 97 | "id": "ozqbZ3lviTPj", 98 | "colab_type": "code", 99 | "colab": {} 100 | }, 101 | "source": [ 102 | "%%R\n", 103 | "x <- 42\n", 104 | "print(x)" 105 | ], 106 | "execution_count": 0, 107 | "outputs": [] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "metadata": { 112 | "id": "napTAYyXiU8r", 113 | "colab_type": "code", 114 | "colab": {} 115 | }, 116 | "source": [ 117 | "%%R\n", 118 | "install.packages('caret')\n", 119 | "install.packages('mlbench')" 120 | ], 121 | "execution_count": 0, 122 | "outputs": [] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "metadata": { 127 | "id": "4eB_IbK4kztb", 128 | "colab_type": "code", 129 | "colab": {} 130 | }, 
131 | "source": [ 132 | "%%R\n", 133 | "install.packages('mlbench')" 134 | ], 135 | "execution_count": 0, 136 | "outputs": [] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "metadata": { 141 | "id": "Bl0feNEUi-Jk", 142 | "colab_type": "code", 143 | "colab": {} 144 | }, 145 | "source": [ 146 | "%%R\n", 147 | "library(caret)" 148 | ], 149 | "execution_count": 0, 150 | "outputs": [] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "metadata": { 155 | "id": "zY7WFnrSj4Mr", 156 | "colab_type": "code", 157 | "colab": {} 158 | }, 159 | "source": [ 160 | "%%R\n", 161 | "############################################\n", 162 | "# Data Professor #\n", 163 | "# http://youtube.com/dataprofessor #\n", 164 | "# http://github.com/dataprofessor #\n", 165 | "# http://facebook.com/dataprofessor #\n", 166 | "# https://www.instagram.com/data.professor #\n", 167 | "############################################\n", 168 | "\n", 169 | "# Importing libraries\n", 170 | "library(mlbench) # Contains several benchmark data sets (especially the Boston Housing dataset)\n", 171 | "library(caret) # Package for machine learning algorithms / CARET stands for Classification And REgression Training\n", 172 | "\n", 173 | "# Importing the Boston Housing data set\n", 174 | "data(BostonHousing)\n", 175 | "\n", 176 | "head(BostonHousing)\n", 177 | "\n", 178 | "# Check to see if there are missing data?\n", 179 | "sum(is.na(BostonHousing))\n", 180 | "\n", 181 | "# To achieve reproducible model; set the random seed number\n", 182 | "set.seed(100)\n", 183 | "\n", 184 | "# Performs stratified random split of the data set\n", 185 | "TrainingIndex <- createDataPartition(BostonHousing$medv, p=0.8, list = FALSE)\n", 186 | "TrainingSet <- BostonHousing[TrainingIndex,] # Training Set\n", 187 | "TestingSet <- BostonHousing[-TrainingIndex,] # Test Set\n", 188 | "\n", 189 | "\n", 190 | "###############################\n", 191 | "\n", 192 | "# Build Training model\n", 193 | "Model <- train(medv ~ ., data = TrainingSet,\n", 194 | " method = \"lm\",\n", 195 | " na.action = na.omit,\n", 196 | " preProcess=c(\"scale\",\"center\"),\n", 197 | " trControl= trainControl(method=\"none\")\n", 198 | ")\n", 199 | "\n", 200 | "# Apply model for prediction\n", 201 | "Model.training <-predict(Model, TrainingSet) # Apply model to make prediction on Training set\n", 202 | "Model.testing <-predict(Model, TestingSet) # Apply model to make prediction on Testing set\n", 203 | "\n", 204 | "# Model performance (Displays scatter plot and performance metrics)\n", 205 | " # Scatter plot of Training set\n", 206 | "plot(TrainingSet$medv,Model.training, col = \"blue\" )\n", 207 | "plot(TestingSet$medv,Model.testing, col = \"blue\" )" 208 | ], 209 | "execution_count": 0, 210 | "outputs": [] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "metadata": { 215 | "id": "Q6A7bOvbll8D", 216 | "colab_type": "code", 217 | "colab": {} 218 | }, 219 | "source": [ 220 | "" 221 | ], 222 | "execution_count": 0, 223 | "outputs": [] 224 | } 225 | ] 226 | } -------------------------------------------------------------------------------- /shiny/001-first-app/app.R: -------------------------------------------------------------------------------- 1 | #################################### 2 | # Data Professor # 3 | # http://youtube.com/dataprofessor # 4 | # http://github.com/dataprofessor # 5 | #################################### 6 | 7 | # Modified from Winston Chang, 8 | # https://shiny.rstudio.com/gallery/shiny-theme-selector.html 9 | 10 | # Concepts about Reactive programming used by Shiny, 
11 | # https://shiny.rstudio.com/articles/reactivity-overview.html 12 | 13 | # Load R packages 14 | library(shiny) 15 | library(shinythemes) 16 | 17 | 18 | # Define UI 19 | ui <- fluidPage(theme = shinytheme("cerulean"), 20 | navbarPage( 21 | # theme = "cerulean", # <--- To use a theme, uncomment this 22 | "My first app", 23 | tabPanel("Navbar 1", 24 | sidebarPanel( 25 | tags$h3("Input:"), 26 | textInput("txt1", "Given Name:", ""), 27 | textInput("txt2", "Surname:", ""), 28 | 29 | ), # sidebarPanel 30 | mainPanel( 31 | h1("Header 1"), 32 | 33 | h4("Output 1"), 34 | verbatimTextOutput("txtout"), 35 | 36 | ) # mainPanel 37 | 38 | ), # Navbar 1, tabPanel 39 | tabPanel("Navbar 2", "This panel is intentionally left blank"), 40 | tabPanel("Navbar 3", "This panel is intentionally left blank") 41 | 42 | ) # navbarPage 43 | ) # fluidPage 44 | 45 | 46 | # Define server function 47 | server <- function(input, output) { 48 | 49 | output$txtout <- renderText({ 50 | paste( input$txt1, input$txt2, sep = " " ) 51 | }) 52 | } # server 53 | 54 | 55 | # Create Shiny object 56 | shinyApp(ui = ui, server = server) 57 | -------------------------------------------------------------------------------- /shiny/002-histogram/app.R: -------------------------------------------------------------------------------- 1 | #################################### 2 | # Data Professor # 3 | # http://youtube.com/dataprofessor # 4 | # http://github.com/dataprofessor # 5 | #################################### 6 | 7 | # Modified from https://shiny.rstudio.com/tutorial/written-tutorial/lesson1/ 8 | 9 | library(shiny) 10 | data(airquality) 11 | 12 | # Define UI for app that draws a histogram ---- 13 | ui <- fluidPage( 14 | 15 | # App title ---- 16 | titlePanel("Ozone level!"), 17 | 18 | # Sidebar layout with input and output definitions ---- 19 | sidebarLayout( 20 | 21 | # Sidebar panel for inputs ---- 22 | sidebarPanel( 23 | 24 | # Input: Slider for the number of bins ---- 25 | sliderInput(inputId = "bins", 26 | label = "Number of bins:", 27 | min = 1, 28 | max = 50, 29 | value = 30) 30 | 31 | ), 32 | 33 | # Main panel for displaying outputs ---- 34 | mainPanel( 35 | 36 | # Output: Histogram ---- 37 | plotOutput(outputId = "distPlot") 38 | 39 | ) 40 | ) 41 | ) 42 | 43 | # Define server logic required to draw a histogram ---- 44 | server <- function(input, output) { 45 | 46 | 47 | output$distPlot <- renderPlot({ 48 | 49 | x <- airquality$Ozone 50 | x <- na.omit(x) 51 | bins <- seq(min(x), max(x), length.out = input$bins + 1) 52 | 53 | hist(x, breaks = bins, col = "#75AADB", border = "black", 54 | xlab = "Ozone level", 55 | main = "Histogram of Ozone level") 56 | 57 | }) 58 | 59 | } 60 | 61 | # Create Shiny app ---- 62 | shinyApp(ui = ui, server = server) 63 | -------------------------------------------------------------------------------- /shiny/003-play-golf/app.R: -------------------------------------------------------------------------------- 1 | #################################### 2 | # Data Professor # 3 | # http://youtube.com/dataprofessor # 4 | # http://github.com/dataprofessor # 5 | #################################### 6 | 7 | 8 | # Import libraries 9 | library(shiny) 10 | library(shinythemes) 11 | library(data.table) 12 | library(RCurl) 13 | library(randomForest) 14 | 15 | # Read data 16 | weather <- read.csv(text = getURL("https://raw.githubusercontent.com/dataprofessor/data/master/weather-weka.csv") ) 17 | 18 | # Build model 19 | model <- randomForest(play ~ ., data = weather, ntree = 500, mtry = 4, importance = TRUE) 20 | 
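# A quick check of the fitted forest (a minimal sketch; these two calls are
# not part of the original script): printing the model reports the
# out-of-bag (OOB) error estimate, and importance() lists the per-variable
# scores enabled by importance = TRUE above.
print(model)
importance(model)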
21 | # Save model to RDS file 22 | # saveRDS(model, "model.rds") 23 | 24 | # Read in the RF model 25 | #model <- readRDS("model.rds") 26 | 27 | #################################### 28 | # User interface # 29 | #################################### 30 | 31 | ui <- fluidPage(theme = shinytheme("united"), 32 | 33 | # Page header 34 | headerPanel('Play Golf?'), 35 | 36 | # Input values 37 | sidebarPanel( 38 | HTML("
<h3>Input parameters</h3>
"), 39 | 40 | selectInput("outlook", label = "Outlook:", 41 | choices = list("Sunny" = "sunny", "Overcast" = "overcast", "Rainy" = "rainy"), 42 | selected = "Rainy"), 43 | sliderInput("temperature", "Temperature:", 44 | min = 64, max = 86, 45 | value = 70), 46 | sliderInput("humidity", "Humidity:", 47 | min = 65, max = 96, 48 | value = 90), 49 | selectInput("windy", label = "Windy:", 50 | choices = list("Yes" = "TRUE", "No" = "FALSE"), 51 | selected = "TRUE"), 52 | 53 | actionButton("submitbutton", "Submit", class = "btn btn-primary") 54 | ), 55 | 56 | mainPanel( 57 | tags$label(h3('Status/Output')), # Status/Output Text Box 58 | verbatimTextOutput('contents'), 59 | tableOutput('tabledata') # Prediction results table 60 | 61 | ) 62 | ) 63 | 64 | #################################### 65 | # Server # 66 | #################################### 67 | 68 | server <- function(input, output, session) { 69 | 70 | # Input Data 71 | datasetInput <- reactive({ 72 | 73 | # outlook,temperature,humidity,windy,play 74 | df <- data.frame( 75 | Name = c("outlook", 76 | "temperature", 77 | "humidity", 78 | "windy"), 79 | Value = as.character(c(input$outlook, 80 | input$temperature, 81 | input$humidity, 82 | input$windy)), 83 | stringsAsFactors = FALSE) 84 | 85 | play <- "play" 86 | df <- rbind(df, play) 87 | input <- transpose(df) 88 | write.table(input,"input.csv", sep=",", quote = FALSE, row.names = FALSE, col.names = FALSE) 89 | 90 | test <- read.csv(paste("input", ".csv", sep=""), header = TRUE) 91 | 92 | test$outlook <- factor(test$outlook, levels = c("overcast", "rainy", "sunny")) 93 | 94 | 95 | Output <- data.frame(Prediction=predict(model,test), round(predict(model,test,type="prob"), 3)) 96 | print(Output) 97 | 98 | }) 99 | 100 | # Status/Output Text Box 101 | output$contents <- renderPrint({ 102 | if (input$submitbutton>0) { 103 | isolate("Calculation complete.") 104 | } else { 105 | return("Server is ready for calculation.") 106 | } 107 | }) 108 | 109 | # Prediction results table 110 | output$tabledata <- renderTable({ 111 | if (input$submitbutton>0) { 112 | isolate(datasetInput()) 113 | } 114 | }) 115 | 116 | } 117 | 118 | #################################### 119 | # Create the shiny app # 120 | #################################### 121 | shinyApp(ui = ui, server = server) 122 | -------------------------------------------------------------------------------- /shiny/004-iris-predictor/app-numeric.R: -------------------------------------------------------------------------------- 1 | ############################################ 2 | # Data Professor # 3 | # http://youtube.com/dataprofessor # 4 | # http://github.com/dataprofessor # 5 | # http://facebook.com/dataprofessor # 6 | # https://www.instagram.com/data.professor # 7 | ############################################ 8 | 9 | # Import libraries 10 | library(shiny) 11 | library(data.table) 12 | library(randomForest) 13 | 14 | # Read in the RF model 15 | model <- readRDS("model.rds") 16 | 17 | 18 | #################################### 19 | # User interface # 20 | #################################### 21 | 22 | ui <- pageWithSidebar( 23 | 24 | # Page header 25 | headerPanel('Iris Predictor'), 26 | 27 | # Input values 28 | sidebarPanel( 29 | #HTML("
<h3>Input parameters</h3>
"), 30 | tags$label(h3('Input parameters')), 31 | numericInput("Sepal.Length", 32 | label = "Sepal Length", 33 | value = 5.1), 34 | numericInput("Sepal.Width", 35 | label = "Sepal Width", 36 | value = 3.6), 37 | numericInput("Petal.Length", 38 | label = "Petal Length", 39 | value = 1.4), 40 | numericInput("Petal.Width", 41 | label = "Petal Width", 42 | value = 0.2), 43 | 44 | actionButton("submitbutton", "Submit", 45 | class = "btn btn-primary") 46 | ), 47 | 48 | mainPanel( 49 | tags$label(h3('Status/Output')), # Status/Output Text Box 50 | verbatimTextOutput('contents'), 51 | tableOutput('tabledata') # Prediction results table 52 | 53 | ) 54 | ) 55 | 56 | #################################### 57 | # Server # 58 | #################################### 59 | 60 | server<- function(input, output, session) { 61 | 62 | # Input Data 63 | datasetInput <- reactive({ 64 | 65 | df <- data.frame( 66 | Name = c("Sepal Length", 67 | "Sepal Width", 68 | "Petal Length", 69 | "Petal Width"), 70 | Value = as.character(c(input$Sepal.Length, 71 | input$Sepal.Width, 72 | input$Petal.Length, 73 | input$Petal.Width)), 74 | stringsAsFactors = FALSE) 75 | 76 | Species <- 0 77 | df <- rbind(df, Species) 78 | input <- transpose(df) 79 | write.table(input,"input.csv", sep=",", quote = FALSE, row.names = FALSE, col.names = FALSE) 80 | 81 | test <- read.csv(paste("input", ".csv", sep=""), header = TRUE) 82 | 83 | Output <- data.frame(Prediction=predict(model,test), round(predict(model,test,type="prob"), 3)) 84 | print(Output) 85 | 86 | }) 87 | 88 | # Status/Output Text Box 89 | output$contents <- renderPrint({ 90 | if (input$submitbutton>0) { 91 | isolate("Calculation complete.") 92 | } else { 93 | return("Server is ready for calculation.") 94 | } 95 | }) 96 | 97 | # Prediction results table 98 | output$tabledata <- renderTable({ 99 | if (input$submitbutton>0) { 100 | isolate(datasetInput()) 101 | } 102 | }) 103 | 104 | } 105 | 106 | #################################### 107 | # Create the shiny app # 108 | #################################### 109 | shinyApp(ui = ui, server = server) 110 | -------------------------------------------------------------------------------- /shiny/004-iris-predictor/app-slider.R: -------------------------------------------------------------------------------- 1 | ############################################ 2 | # Data Professor # 3 | # http://youtube.com/dataprofessor # 4 | # http://github.com/dataprofessor # 5 | # http://facebook.com/dataprofessor # 6 | # https://www.instagram.com/data.professor # 7 | ############################################ 8 | 9 | # Import libraries 10 | library(shiny) 11 | library(data.table) 12 | library(randomForest) 13 | 14 | # Read in the RF model 15 | model <- readRDS("model.rds") 16 | 17 | # Training set 18 | TrainSet <- read.csv("training.csv", header = TRUE) 19 | TrainSet <- TrainSet[,-1] 20 | 21 | 22 | #################################### 23 | # User interface # 24 | #################################### 25 | 26 | ui <- pageWithSidebar( 27 | 28 | # Page header 29 | headerPanel('Iris Predictor'), 30 | 31 | # Input values 32 | sidebarPanel( 33 | HTML("
<h3>Input parameters</h3>
"), 34 | sliderInput("Sepal.Length", label = "Sepal Length", value = 5.0, 35 | min = min(TrainSet$Sepal.Length), 36 | max = max(TrainSet$Sepal.Length) 37 | ), 38 | sliderInput("Sepal.Width", label = "Sepal Width", value = 3.6, 39 | min = min(TrainSet$Sepal.Width), 40 | max = max(TrainSet$Sepal.Width)), 41 | sliderInput("Petal.Length", label = "Petal Length", value = 1.4, 42 | min = min(TrainSet$Petal.Length), 43 | max = max(TrainSet$Petal.Length)), 44 | sliderInput("Petal.Width", label = "Petal Width", value = 0.2, 45 | min = min(TrainSet$Petal.Width), 46 | max = max(TrainSet$Petal.Width)), 47 | 48 | actionButton("submitbutton", "Submit", class = "btn btn-primary") 49 | ), 50 | 51 | mainPanel( 52 | tags$label(h3('Status/Output')), # Status/Output Text Box 53 | verbatimTextOutput('contents'), 54 | tableOutput('tabledata') # Prediction results table 55 | 56 | ) 57 | ) 58 | 59 | #################################### 60 | # Server # 61 | #################################### 62 | 63 | server<- function(input, output, session) { 64 | 65 | # Input Data 66 | datasetInput <- reactive({ 67 | 68 | df <- data.frame( 69 | Name = c("Sepal Length", 70 | "Sepal Width", 71 | "Petal Length", 72 | "Petal Width"), 73 | Value = as.character(c(input$Sepal.Length, 74 | input$Sepal.Width, 75 | input$Petal.Length, 76 | input$Petal.Width)), 77 | stringsAsFactors = FALSE) 78 | 79 | Species <- 0 80 | df <- rbind(df, Species) 81 | input <- transpose(df) 82 | write.table(input,"input.csv", sep=",", quote = FALSE, row.names = FALSE, col.names = FALSE) 83 | 84 | test <- read.csv(paste("input", ".csv", sep=""), header = TRUE) 85 | 86 | Output <- data.frame(Prediction=predict(model,test), round(predict(model,test,type="prob"), 3)) 87 | print(Output) 88 | 89 | }) 90 | 91 | # Status/Output Text Box 92 | output$contents <- renderPrint({ 93 | if (input$submitbutton>0) { 94 | isolate("Calculation complete.") 95 | } else { 96 | return("Server is ready for calculation.") 97 | } 98 | }) 99 | 100 | # Prediction results table 101 | output$tabledata <- renderTable({ 102 | if (input$submitbutton>0) { 103 | isolate(datasetInput()) 104 | } 105 | }) 106 | 107 | } 108 | 109 | #################################### 110 | # Create the shiny app # 111 | #################################### 112 | shinyApp(ui = ui, server = server) -------------------------------------------------------------------------------- /shiny/004-iris-predictor/model.R: -------------------------------------------------------------------------------- 1 | #################################### 2 | # Data Professor # 3 | # http://youtube.com/dataprofessor # 4 | # http://github.com/dataprofessor # 5 | #################################### 6 | 7 | # Importing libraries 8 | library(RCurl) # for downloading the iris CSV file 9 | library(randomForest) 10 | library(caret) 11 | 12 | # Importing the Iris data set 13 | iris <- read.csv(text = getURL("https://raw.githubusercontent.com/dataprofessor/data/master/iris.csv") ) 14 | 15 | # Performs stratified random split of the data set 16 | TrainingIndex <- createDataPartition(iris$Species, p=0.8, list = FALSE) 17 | TrainingSet <- iris[TrainingIndex,] # Training Set 18 | TestingSet <- iris[-TrainingIndex,] # Test Set 19 | 20 | write.csv(TrainingSet, "training.csv") 21 | write.csv(TestingSet, "testing.csv") 22 | 23 | TrainSet <- read.csv("training.csv", header = TRUE) 24 | TrainSet <- TrainSet[,-1] 25 | 26 | # Building Random forest model 27 | 28 | model <- randomForest(Species ~ ., data = TrainSet, ntree = 500, mtry = 4, importance = TRUE) 
29 | 30 | # Save model to RDS file 31 | saveRDS(model, "model.rds") 32 | -------------------------------------------------------------------------------- /shiny/004-iris-predictor/model.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataprofessor/code/d494f8093073990fcf77061bd740a0e4c2d40020/shiny/004-iris-predictor/model.rds -------------------------------------------------------------------------------- /shiny/004-iris-predictor/testing.csv: -------------------------------------------------------------------------------- 1 | "","Sepal.Length","Sepal.Width","Petal.Length","Petal.Width","Species" 2 | "5",5,3.6,1.4,0.2,"setosa" 3 | "9",4.4,2.9,1.4,0.2,"setosa" 4 | "14",4.3,3,1.1,0.1,"setosa" 5 | "19",5.7,3.8,1.7,0.3,"setosa" 6 | "22",5.1,3.7,1.5,0.4,"setosa" 7 | "26",5,3,1.6,0.2,"setosa" 8 | "29",5.2,3.4,1.4,0.2,"setosa" 9 | "37",5.5,3.5,1.3,0.2,"setosa" 10 | "41",5,3.5,1.3,0.3,"setosa" 11 | "42",4.5,2.3,1.3,0.3,"setosa" 12 | "55",6.5,2.8,4.6,1.5,"versicolor" 13 | "56",5.7,2.8,4.5,1.3,"versicolor" 14 | "61",5,2,3.5,1,"versicolor" 15 | "65",5.6,2.9,3.6,1.3,"versicolor" 16 | "66",6.7,3.1,4.4,1.4,"versicolor" 17 | "68",5.8,2.7,4.1,1,"versicolor" 18 | "73",6.3,2.5,4.9,1.5,"versicolor" 19 | "90",5.5,2.5,4,1.3,"versicolor" 20 | "92",6.1,3,4.6,1.4,"versicolor" 21 | "99",5.1,2.5,3,1.1,"versicolor" 22 | "103",7.1,3,5.9,2.1,"virginica" 23 | "111",6.5,3.2,5.1,2,"virginica" 24 | "112",6.4,2.7,5.3,1.9,"virginica" 25 | "113",6.8,3,5.5,2.1,"virginica" 26 | "120",6,2.2,5,1.5,"virginica" 27 | "133",6.4,2.8,5.6,2.2,"virginica" 28 | "134",6.3,2.8,5.1,1.5,"virginica" 29 | "136",7.7,3,6.1,2.3,"virginica" 30 | "146",6.7,3,5.2,2.3,"virginica" 31 | "147",6.3,2.5,5,1.9,"virginica" 32 | -------------------------------------------------------------------------------- /shiny/004-iris-predictor/training.csv: -------------------------------------------------------------------------------- 1 | "","Sepal.Length","Sepal.Width","Petal.Length","Petal.Width","Species" 2 | "1",5.1,3.5,1.4,0.2,"setosa" 3 | "2",4.9,3,1.4,0.2,"setosa" 4 | "3",4.7,3.2,1.3,0.2,"setosa" 5 | "4",4.6,3.1,1.5,0.2,"setosa" 6 | "6",5.4,3.9,1.7,0.4,"setosa" 7 | "7",4.6,3.4,1.4,0.3,"setosa" 8 | "8",5,3.4,1.5,0.2,"setosa" 9 | "10",4.9,3.1,1.5,0.1,"setosa" 10 | "11",5.4,3.7,1.5,0.2,"setosa" 11 | "12",4.8,3.4,1.6,0.2,"setosa" 12 | "13",4.8,3,1.4,0.1,"setosa" 13 | "15",5.8,4,1.2,0.2,"setosa" 14 | "16",5.7,4.4,1.5,0.4,"setosa" 15 | "17",5.4,3.9,1.3,0.4,"setosa" 16 | "18",5.1,3.5,1.4,0.3,"setosa" 17 | "20",5.1,3.8,1.5,0.3,"setosa" 18 | "21",5.4,3.4,1.7,0.2,"setosa" 19 | "23",4.6,3.6,1,0.2,"setosa" 20 | "24",5.1,3.3,1.7,0.5,"setosa" 21 | "25",4.8,3.4,1.9,0.2,"setosa" 22 | "27",5,3.4,1.6,0.4,"setosa" 23 | "28",5.2,3.5,1.5,0.2,"setosa" 24 | "30",4.7,3.2,1.6,0.2,"setosa" 25 | "31",4.8,3.1,1.6,0.2,"setosa" 26 | "32",5.4,3.4,1.5,0.4,"setosa" 27 | "33",5.2,4.1,1.5,0.1,"setosa" 28 | "34",5.5,4.2,1.4,0.2,"setosa" 29 | "35",4.9,3.1,1.5,0.1,"setosa" 30 | "36",5,3.2,1.2,0.2,"setosa" 31 | "38",4.9,3.1,1.5,0.1,"setosa" 32 | "39",4.4,3,1.3,0.2,"setosa" 33 | "40",5.1,3.4,1.5,0.2,"setosa" 34 | "43",4.4,3.2,1.3,0.2,"setosa" 35 | "44",5,3.5,1.6,0.6,"setosa" 36 | "45",5.1,3.8,1.9,0.4,"setosa" 37 | "46",4.8,3,1.4,0.3,"setosa" 38 | "47",5.1,3.8,1.6,0.2,"setosa" 39 | "48",4.6,3.2,1.4,0.2,"setosa" 40 | "49",5.3,3.7,1.5,0.2,"setosa" 41 | "50",5,3.3,1.4,0.2,"setosa" 42 | "51",7,3.2,4.7,1.4,"versicolor" 43 | "52",6.4,3.2,4.5,1.5,"versicolor" 44 | "53",6.9,3.1,4.9,1.5,"versicolor" 45 | "54",5.5,2.3,4,1.3,"versicolor" 
46 | "57",6.3,3.3,4.7,1.6,"versicolor" 47 | "58",4.9,2.4,3.3,1,"versicolor" 48 | "59",6.6,2.9,4.6,1.3,"versicolor" 49 | "60",5.2,2.7,3.9,1.4,"versicolor" 50 | "62",5.9,3,4.2,1.5,"versicolor" 51 | "63",6,2.2,4,1,"versicolor" 52 | "64",6.1,2.9,4.7,1.4,"versicolor" 53 | "67",5.6,3,4.5,1.5,"versicolor" 54 | "69",6.2,2.2,4.5,1.5,"versicolor" 55 | "70",5.6,2.5,3.9,1.1,"versicolor" 56 | "71",5.9,3.2,4.8,1.8,"versicolor" 57 | "72",6.1,2.8,4,1.3,"versicolor" 58 | "74",6.1,2.8,4.7,1.2,"versicolor" 59 | "75",6.4,2.9,4.3,1.3,"versicolor" 60 | "76",6.6,3,4.4,1.4,"versicolor" 61 | "77",6.8,2.8,4.8,1.4,"versicolor" 62 | "78",6.7,3,5,1.7,"versicolor" 63 | "79",6,2.9,4.5,1.5,"versicolor" 64 | "80",5.7,2.6,3.5,1,"versicolor" 65 | "81",5.5,2.4,3.8,1.1,"versicolor" 66 | "82",5.5,2.4,3.7,1,"versicolor" 67 | "83",5.8,2.7,3.9,1.2,"versicolor" 68 | "84",6,2.7,5.1,1.6,"versicolor" 69 | "85",5.4,3,4.5,1.5,"versicolor" 70 | "86",6,3.4,4.5,1.6,"versicolor" 71 | "87",6.7,3.1,4.7,1.5,"versicolor" 72 | "88",6.3,2.3,4.4,1.3,"versicolor" 73 | "89",5.6,3,4.1,1.3,"versicolor" 74 | "91",5.5,2.6,4.4,1.2,"versicolor" 75 | "93",5.8,2.6,4,1.2,"versicolor" 76 | "94",5,2.3,3.3,1,"versicolor" 77 | "95",5.6,2.7,4.2,1.3,"versicolor" 78 | "96",5.7,3,4.2,1.2,"versicolor" 79 | "97",5.7,2.9,4.2,1.3,"versicolor" 80 | "98",6.2,2.9,4.3,1.3,"versicolor" 81 | "100",5.7,2.8,4.1,1.3,"versicolor" 82 | "101",6.3,3.3,6,2.5,"virginica" 83 | "102",5.8,2.7,5.1,1.9,"virginica" 84 | "104",6.3,2.9,5.6,1.8,"virginica" 85 | "105",6.5,3,5.8,2.2,"virginica" 86 | "106",7.6,3,6.6,2.1,"virginica" 87 | "107",4.9,2.5,4.5,1.7,"virginica" 88 | "108",7.3,2.9,6.3,1.8,"virginica" 89 | "109",6.7,2.5,5.8,1.8,"virginica" 90 | "110",7.2,3.6,6.1,2.5,"virginica" 91 | "114",5.7,2.5,5,2,"virginica" 92 | "115",5.8,2.8,5.1,2.4,"virginica" 93 | "116",6.4,3.2,5.3,2.3,"virginica" 94 | "117",6.5,3,5.5,1.8,"virginica" 95 | "118",7.7,3.8,6.7,2.2,"virginica" 96 | "119",7.7,2.6,6.9,2.3,"virginica" 97 | "121",6.9,3.2,5.7,2.3,"virginica" 98 | "122",5.6,2.8,4.9,2,"virginica" 99 | "123",7.7,2.8,6.7,2,"virginica" 100 | "124",6.3,2.7,4.9,1.8,"virginica" 101 | "125",6.7,3.3,5.7,2.1,"virginica" 102 | "126",7.2,3.2,6,1.8,"virginica" 103 | "127",6.2,2.8,4.8,1.8,"virginica" 104 | "128",6.1,3,4.9,1.8,"virginica" 105 | "129",6.4,2.8,5.6,2.1,"virginica" 106 | "130",7.2,3,5.8,1.6,"virginica" 107 | "131",7.4,2.8,6.1,1.9,"virginica" 108 | "132",7.9,3.8,6.4,2,"virginica" 109 | "135",6.1,2.6,5.6,1.4,"virginica" 110 | "137",6.3,3.4,5.6,2.4,"virginica" 111 | "138",6.4,3.1,5.5,1.8,"virginica" 112 | "139",6,3,4.8,1.8,"virginica" 113 | "140",6.9,3.1,5.4,2.1,"virginica" 114 | "141",6.7,3.1,5.6,2.4,"virginica" 115 | "142",6.9,3.1,5.1,2.3,"virginica" 116 | "143",5.8,2.7,5.1,1.9,"virginica" 117 | "144",6.8,3.2,5.9,2.3,"virginica" 118 | "145",6.7,3.3,5.7,2.5,"virginica" 119 | "148",6.5,3,5.2,2,"virginica" 120 | "149",6.2,3.4,5.4,2.3,"virginica" 121 | "150",5.9,3,5.1,1.8,"virginica" 122 | -------------------------------------------------------------------------------- /shiny/005-bmi/about.md: -------------------------------------------------------------------------------- 1 | #### What is BMI? 2 | 3 | **Body Mass Index (BMI)** is essentially a value obtained from the weight and height of a person [1]. 4 | 5 | #### Calculating the BMI 6 | BMI can be computed by dividing the person's weight (kg) by their squared height (m) as follows: 7 | 8 | > BMI = kg/m^2 9 | 10 | where *kg* represents the person's weight and *m^2* the person's squared height. 
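For example, a person weighing 70 kg with a height of 1.75 m would have:

> BMI = 70/(1.75)^2 = 70/3.0625 ≈ 22.9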
11 | 12 | #### About this BMI Calculator 13 | 14 | This *BMI Calculator* is for adults 20 years and older. Further information on calculating BMI for children and teenagers is available from the CDC [2]. 15 | 16 | #### References 17 | 1. Centers for Disease Control. [Body Mass Index (BMI)](https://www.cdc.gov/healthyweight/assessing/bmi/index.html), Accessed January 26, 2020. 18 | 2. Centers for Disease Control. [BMI Percentile Calculator for Child and Teen](https://www.cdc.gov/healthyweight/bmi/calculator.html), Accessed January 26, 2020. 19 | -------------------------------------------------------------------------------- /shiny/005-bmi/app.R: -------------------------------------------------------------------------------- 1 | ############################################ 2 | # Data Professor # 3 | # http://youtube.com/dataprofessor # 4 | # http://github.com/dataprofessor # 5 | # http://facebook.com/dataprofessor # 6 | # https://www.instagram.com/data.professor # 7 | ############################################ 8 | 9 | library(shiny) 10 | library(shinythemes) 11 | 12 | 13 | #################################### 14 | # User Interface # 15 | #################################### 16 | ui <- fluidPage(theme = shinytheme("united"), 17 | navbarPage("BMI Calculator:", 18 | 19 | tabPanel("Home", 20 | # Input values 21 | sidebarPanel( 22 | HTML("

<h3>Input parameters</h3>

"), 23 | sliderInput("height", 24 | label = "Height", 25 | value = 175, 26 | min = 40, 27 | max = 250), 28 | sliderInput("weight", 29 | label = "Weight", 30 | value = 70, 31 | min = 20, 32 | max = 100), 33 | 34 | actionButton("submitbutton", 35 | "Submit", 36 | class = "btn btn-primary") 37 | ), 38 | 39 | mainPanel( 40 | tags$label(h3('Status/Output')), # Status/Output Text Box 41 | verbatimTextOutput('contents'), 42 | tableOutput('tabledata') # Results table 43 | ) # mainPanel() 44 | 45 | ), #tabPanel(), Home 46 | 47 | tabPanel("About", 48 | titlePanel("About"), 49 | div(includeMarkdown("about.md"), 50 | align="justify") 51 | ) #tabPanel(), About 52 | 53 | ) # navbarPage() 54 | ) # fluidPage() 55 | 56 | 57 | #################################### 58 | # Server # 59 | #################################### 60 | server <- function(input, output, session) { 61 | 62 | # Input Data 63 | datasetInput <- reactive({ 64 | 65 | bmi <- input$weight/( (input$height/100) * (input$height/100) ) 66 | bmi <- data.frame(bmi) 67 | names(bmi) <- "BMI" 68 | print(bmi) 69 | 70 | }) 71 | 72 | # Status/Output Text Box 73 | output$contents <- renderPrint({ 74 | if (input$submitbutton>0) { 75 | isolate("Calculation complete.") 76 | } else { 77 | return("Server is ready for calculation.") 78 | } 79 | }) 80 | 81 | # Prediction results table 82 | output$tabledata <- renderTable({ 83 | if (input$submitbutton>0) { 84 | isolate(datasetInput()) 85 | } 86 | }) 87 | 88 | } 89 | 90 | 91 | #################################### 92 | # Create Shiny App # 93 | #################################### 94 | shinyApp(ui = ui, server = server) 95 | -------------------------------------------------------------------------------- /streamlit/part1/myapp.py: -------------------------------------------------------------------------------- 1 | import yfinance as yf 2 | import streamlit as st 3 | 4 | st.write(""" 5 | # Simple Stock Price App 6 | 7 | Shown are the stock closing price and volume of Google! 8 | 9 | """) 10 | 11 | # https://towardsdatascience.com/how-to-get-stock-data-using-python-c0de1df17e75 12 | #define the ticker symbol 13 | tickerSymbol = 'GOOGL' 14 | #get data on this ticker 15 | tickerData = yf.Ticker(tickerSymbol) 16 | #get the historical prices for this ticker 17 | tickerDf = tickerData.history(period='1d', start='2010-5-31', end='2020-5-31') 18 | # Open High Low Close Volume Dividends Stock Splits 19 | 20 | st.line_chart(tickerDf.Close) 21 | st.line_chart(tickerDf.Volume) 22 | -------------------------------------------------------------------------------- /streamlit/part1/myapp2.py: -------------------------------------------------------------------------------- 1 | import yfinance as yf 2 | import streamlit as st 3 | 4 | st.write(""" 5 | # Simple Stock Price App 6 | 7 | Shown are the stock **closing price** and ***volume*** of Google! 
8 | 9 | """) 10 | 11 | # https://towardsdatascience.com/how-to-get-stock-data-using-python-c0de1df17e75 12 | #define the ticker symbol 13 | tickerSymbol = 'GOOGL' 14 | #get data on this ticker 15 | tickerData = yf.Ticker(tickerSymbol) 16 | #get the historical prices for this ticker 17 | tickerDf = tickerData.history(period='1d', start='2010-5-31', end='2020-5-31') 18 | # Open High Low Close Volume Dividends Stock Splits 19 | 20 | st.write(""" 21 | ## Closing Price 22 | """) 23 | st.line_chart(tickerDf.Close) 24 | st.write(""" 25 | ## Volume Price 26 | """) 27 | st.line_chart(tickerDf.Volume) 28 | -------------------------------------------------------------------------------- /streamlit/part10/sp500-app.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import pandas as pd 3 | import base64 4 | import matplotlib.pyplot as plt 5 | import seaborn as sns 6 | import numpy as np 7 | import yfinance as yf 8 | 9 | st.title('S&P 500 App') 10 | 11 | st.markdown(""" 12 | This app retrieves the list of the **S&P 500** (from Wikipedia) and its corresponding **stock closing price** (year-to-date)! 13 | * **Python libraries:** base64, pandas, streamlit, numpy, matplotlib, seaborn 14 | * **Data source:** [Wikipedia](https://en.wikipedia.org/wiki/List_of_S%26P_500_companies). 15 | """) 16 | 17 | st.sidebar.header('User Input Features') 18 | 19 | # Web scraping of S&P 500 data 20 | # 21 | @st.cache 22 | def load_data(): 23 | url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies' 24 | html = pd.read_html(url, header = 0) 25 | df = html[0] 26 | return df 27 | 28 | df = load_data() 29 | sector = df.groupby('GICS Sector') 30 | 31 | # Sidebar - Sector selection 32 | sorted_sector_unique = sorted( df['GICS Sector'].unique() ) 33 | selected_sector = st.sidebar.multiselect('Sector', sorted_sector_unique, sorted_sector_unique) 34 | 35 | # Filtering data 36 | df_selected_sector = df[ (df['GICS Sector'].isin(selected_sector)) ] 37 | 38 | st.header('Display Companies in Selected Sector') 39 | st.write('Data Dimension: ' + str(df_selected_sector.shape[0]) + ' rows and ' + str(df_selected_sector.shape[1]) + ' columns.') 40 | st.dataframe(df_selected_sector) 41 | 42 | # Download S&P500 data 43 | # https://discuss.streamlit.io/t/how-to-download-file-in-streamlit/1806 44 | def filedownload(df): 45 | csv = df.to_csv(index=False) 46 | b64 = base64.b64encode(csv.encode()).decode() # strings <-> bytes conversions 47 | href = f'Download CSV File' 48 | return href 49 | 50 | st.markdown(filedownload(df_selected_sector), unsafe_allow_html=True) 51 | 52 | # https://pypi.org/project/yfinance/ 53 | 54 | data = yf.download( 55 | tickers = list(df_selected_sector[:10].Symbol), 56 | period = "ytd", 57 | interval = "1d", 58 | group_by = 'ticker', 59 | auto_adjust = True, 60 | prepost = True, 61 | threads = True, 62 | proxy = None 63 | ) 64 | 65 | # Plot Closing Price of Query Symbol 66 | def price_plot(symbol): 67 | df = pd.DataFrame(data[symbol].Close) 68 | df['Date'] = df.index 69 | plt.fill_between(df.Date, df.Close, color='skyblue', alpha=0.3) 70 | plt.plot(df.Date, df.Close, color='skyblue', alpha=0.8) 71 | plt.xticks(rotation=90) 72 | plt.title(symbol, fontweight='bold') 73 | plt.xlabel('Date', fontweight='bold') 74 | plt.ylabel('Closing Price', fontweight='bold') 75 | return st.pyplot() 76 | 77 | num_company = st.sidebar.slider('Number of Companies', 1, 5) 78 | 79 | if st.button('Show Plots'): 80 | st.header('Stock Closing Price') 81 | for i in 
82 |         price_plot(i)
83 | 
--------------------------------------------------------------------------------
/streamlit/part12/crypto-price-app.py:
--------------------------------------------------------------------------------
1 | # This app is for educational purposes only. Insights gained are not financial advice. Use at your own risk!
2 | import streamlit as st
3 | from PIL import Image
4 | import pandas as pd
5 | import base64
6 | import matplotlib.pyplot as plt
7 | from bs4 import BeautifulSoup
8 | import requests
9 | import json
10 | import time
11 | #---------------------------------#
12 | # New feature (make sure to upgrade your streamlit library)
13 | # pip install --upgrade streamlit
14 | 
15 | #---------------------------------#
16 | # Page layout
17 | ## Page expands to full width
18 | st.set_page_config(layout="wide")
19 | #---------------------------------#
20 | # Title
21 | 
22 | image = Image.open('logo.jpg')
23 | 
24 | st.image(image, width = 500)
25 | 
26 | st.title('Crypto Price App')
27 | st.markdown("""
28 | This app retrieves cryptocurrency prices for the top 100 cryptocurrencies from **CoinMarketCap**!
29 | 
30 | """)
31 | #---------------------------------#
32 | # About
33 | expander_bar = st.beta_expander("About")
34 | expander_bar.markdown("""
35 | * **Python libraries:** base64, pandas, streamlit, numpy, matplotlib, seaborn, BeautifulSoup, requests, json, time
36 | * **Data source:** [CoinMarketCap](http://coinmarketcap.com).
37 | * **Credit:** Web scraper adapted from the Medium article *[Web Scraping Crypto Prices With Python](https://towardsdatascience.com/web-scraping-crypto-prices-with-python-41072ea5b5bf)* written by [Bryan Feng](https://medium.com/@bryanf).
38 | """)
39 | 
40 | 
41 | #---------------------------------#
42 | # Page layout (continued)
43 | ## Divide page to 3 columns (col1 = sidebar, col2 and col3 = page contents)
44 | col1 = st.sidebar
45 | col2, col3 = st.beta_columns((2,1))
46 | 
47 | #---------------------------------#
48 | # Sidebar + Main panel
49 | col1.header('Input Options')
50 | 
51 | ## Sidebar - Currency price unit
52 | currency_price_unit = col1.selectbox('Select currency for price', ('USD', 'BTC', 'ETH'))
53 | 
54 | # Web scraping of CoinMarketCap data
55 | @st.cache
56 | def load_data():
57 |     cmc = requests.get('https://coinmarketcap.com')
58 |     soup = BeautifulSoup(cmc.content, 'html.parser')
59 | 
60 |     data = soup.find('script', id='__NEXT_DATA__', type='application/json')
61 |     coins = {}
62 |     coin_data = json.loads(data.contents[0])
63 |     listings = coin_data['props']['initialState']['cryptocurrency']['listingLatest']['data']
64 |     for i in listings:
65 |         coins[str(i['id'])] = i['slug']
66 | 
67 |     coin_name = []
68 |     coin_symbol = []
69 |     market_cap = []
70 |     percent_change_1h = []
71 |     percent_change_24h = []
72 |     percent_change_7d = []
73 |     price = []
74 |     volume_24h = []
75 | 
76 |     for i in listings:
77 |         coin_name.append(i['slug'])
78 |         coin_symbol.append(i['symbol'])
79 |         price.append(i['quote'][currency_price_unit]['price'])
80 |         percent_change_1h.append(i['quote'][currency_price_unit]['percent_change_1h'])
81 |         percent_change_24h.append(i['quote'][currency_price_unit]['percent_change_24h'])
82 |         percent_change_7d.append(i['quote'][currency_price_unit]['percent_change_7d'])
83 |         market_cap.append(i['quote'][currency_price_unit]['market_cap'])
84 |         volume_24h.append(i['quote'][currency_price_unit]['volume_24h'])
85 | 
86 |     df = pd.DataFrame(columns=['coin_name', 'coin_symbol', 'market_cap', 'percent_change_1h', 'percent_change_24h', 'percent_change_7d', 'price', 'volume_24h'])
87 |     df['coin_name'] = coin_name
88 |     df['coin_symbol'] = coin_symbol
89 |     df['price'] = price
90 |     df['percent_change_1h'] = percent_change_1h
91 |     df['percent_change_24h'] = percent_change_24h
92 |     df['percent_change_7d'] = percent_change_7d
93 |     df['market_cap'] = market_cap
94 |     df['volume_24h'] = volume_24h
95 |     return df
96 | 
97 | df = load_data()
98 | 
99 | ## Sidebar - Cryptocurrency selections
100 | sorted_coin = sorted( df['coin_symbol'] )
101 | selected_coin = col1.multiselect('Cryptocurrency', sorted_coin, sorted_coin)
102 | 
103 | df_selected_coin = df[ (df['coin_symbol'].isin(selected_coin)) ] # Filtering data
104 | 
105 | ## Sidebar - Number of coins to display
106 | num_coin = col1.slider('Display Top N Coins', 1, 100, 100)
107 | df_coins = df_selected_coin[:num_coin]
108 | 
109 | ## Sidebar - Percent change timeframe
110 | percent_timeframe = col1.selectbox('Percent change time frame',
111 |                                    ['7d','24h', '1h'])
112 | percent_dict = {"7d":'percent_change_7d',"24h":'percent_change_24h',"1h":'percent_change_1h'}  # maps the selected time frame to its dataframe column
113 | selected_percent_timeframe = percent_dict[percent_timeframe]
114 | 
115 | ## Sidebar - Sorting values
116 | sort_values = col1.selectbox('Sort values?', ['Yes', 'No'])
117 | 
118 | col2.subheader('Price Data of Selected Cryptocurrency')
119 | col2.write('Data Dimension: ' + str(df_selected_coin.shape[0]) + ' rows and ' + str(df_selected_coin.shape[1]) + ' columns.')
120 | 
121 | col2.dataframe(df_coins)
122 | 
123 | # Download CSV data
124 | # https://discuss.streamlit.io/t/how-to-download-file-in-streamlit/1806
125 | def filedownload(df):
126 |     csv = df.to_csv(index=False)
127 |     b64 = base64.b64encode(csv.encode()).decode()  # strings <-> bytes conversions
128 |     href = f'<a href="data:file/csv;base64,{b64}" download="crypto.csv">Download CSV File</a>'  # download file name is arbitrary
129 |     return href
130 | 
131 | col2.markdown(filedownload(df_selected_coin), unsafe_allow_html=True)
132 | 
133 | #---------------------------------#
134 | # Preparing data for Bar plot of % Price change
135 | col2.subheader('Table of % Price Change')
136 | df_change = pd.concat([df_coins.coin_symbol, df_coins.percent_change_1h, df_coins.percent_change_24h, df_coins.percent_change_7d], axis=1)
137 | df_change = df_change.set_index('coin_symbol')
138 | df_change['positive_percent_change_1h'] = df_change['percent_change_1h'] > 0
139 | df_change['positive_percent_change_24h'] = df_change['percent_change_24h'] > 0
140 | df_change['positive_percent_change_7d'] = df_change['percent_change_7d'] > 0
141 | col2.dataframe(df_change)
142 | 
143 | # Conditional creation of Bar plot (time frame)
144 | col3.subheader('Bar plot of % Price Change')
145 | 
146 | if percent_timeframe == '7d':
147 |     if sort_values == 'Yes':
148 |         df_change = df_change.sort_values(by=['percent_change_7d'])
149 |     col3.write('*7 days period*')
150 |     plt.figure(figsize=(5,25))
151 |     plt.subplots_adjust(top = 1, bottom = 0)
152 |     df_change['percent_change_7d'].plot(kind='barh', color=df_change.positive_percent_change_7d.map({True: 'g', False: 'r'}))
153 |     col3.pyplot(plt)
154 | elif percent_timeframe == '24h':
155 |     if sort_values == 'Yes':
156 |         df_change = df_change.sort_values(by=['percent_change_24h'])
157 |     col3.write('*24 hour period*')
158 |     plt.figure(figsize=(5,25))
159 |     plt.subplots_adjust(top = 1, bottom = 0)
160 |     df_change['percent_change_24h'].plot(kind='barh', color=df_change.positive_percent_change_24h.map({True: 'g', False: 'r'}))
161 |     col3.pyplot(plt)
162 | else:
163 |     if sort_values == 'Yes':
164 |         df_change = 
df_change.sort_values(by=['percent_change_1h']) 165 | col3.write('*1 hour period*') 166 | plt.figure(figsize=(5,25)) 167 | plt.subplots_adjust(top = 1, bottom = 0) 168 | df_change['percent_change_1h'].plot(kind='barh', color=df_change.positive_percent_change_1h.map({True: 'g', False: 'r'})) 169 | col3.pyplot(plt) 170 | -------------------------------------------------------------------------------- /streamlit/part12/logo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataprofessor/code/d494f8093073990fcf77061bd740a0e4c2d40020/streamlit/part12/logo.jpg -------------------------------------------------------------------------------- /streamlit/part2/iris-ml-app.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import pandas as pd 3 | from sklearn import datasets 4 | from sklearn.ensemble import RandomForestClassifier 5 | 6 | st.write(""" 7 | # Simple Iris Flower Prediction App 8 | 9 | This app predicts the **Iris flower** type! 10 | """) 11 | 12 | st.sidebar.header('User Input Parameters') 13 | 14 | def user_input_features(): 15 | sepal_length = st.sidebar.slider('Sepal length', 4.3, 7.9, 5.4) 16 | sepal_width = st.sidebar.slider('Sepal width', 2.0, 4.4, 3.4) 17 | petal_length = st.sidebar.slider('Petal length', 1.0, 6.9, 1.3) 18 | petal_width = st.sidebar.slider('Petal width', 0.1, 2.5, 0.2) 19 | data = {'sepal_length': sepal_length, 20 | 'sepal_width': sepal_width, 21 | 'petal_length': petal_length, 22 | 'petal_width': petal_width} 23 | features = pd.DataFrame(data, index=[0]) 24 | return features 25 | 26 | df = user_input_features() 27 | 28 | st.subheader('User Input parameters') 29 | st.write(df) 30 | 31 | iris = datasets.load_iris() 32 | X = iris.data 33 | Y = iris.target 34 | 35 | clf = RandomForestClassifier() 36 | clf.fit(X, Y) 37 | 38 | prediction = clf.predict(df) 39 | prediction_proba = clf.predict_proba(df) 40 | 41 | st.subheader('Class labels and their corresponding index number') 42 | st.write(iris.target_names) 43 | 44 | st.subheader('Prediction') 45 | st.write(iris.target_names[prediction]) 46 | #st.write(prediction) 47 | 48 | st.subheader('Prediction Probability') 49 | st.write(prediction_proba) 50 | -------------------------------------------------------------------------------- /streamlit/part3/penguins-app.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import pandas as pd 3 | import numpy as np 4 | import pickle 5 | from sklearn.ensemble import RandomForestClassifier 6 | 7 | st.write(""" 8 | # Penguin Prediction App 9 | 10 | This app predicts the **Palmer Penguin** species! 11 | 12 | Data obtained from the [palmerpenguins library](https://github.com/allisonhorst/palmerpenguins) in R by Allison Horst. 
13 | """) 14 | 15 | st.sidebar.header('User Input Features') 16 | 17 | st.sidebar.markdown(""" 18 | [Example CSV input file](https://raw.githubusercontent.com/dataprofessor/data/master/penguins_example.csv) 19 | """) 20 | 21 | # Collects user input features into dataframe 22 | uploaded_file = st.sidebar.file_uploader("Upload your input CSV file", type=["csv"]) 23 | if uploaded_file is not None: 24 | input_df = pd.read_csv(uploaded_file) 25 | else: 26 | def user_input_features(): 27 | island = st.sidebar.selectbox('Island',('Biscoe','Dream','Torgersen')) 28 | sex = st.sidebar.selectbox('Sex',('male','female')) 29 | bill_length_mm = st.sidebar.slider('Bill length (mm)', 32.1,59.6,43.9) 30 | bill_depth_mm = st.sidebar.slider('Bill depth (mm)', 13.1,21.5,17.2) 31 | flipper_length_mm = st.sidebar.slider('Flipper length (mm)', 172.0,231.0,201.0) 32 | body_mass_g = st.sidebar.slider('Body mass (g)', 2700.0,6300.0,4207.0) 33 | data = {'island': island, 34 | 'bill_length_mm': bill_length_mm, 35 | 'bill_depth_mm': bill_depth_mm, 36 | 'flipper_length_mm': flipper_length_mm, 37 | 'body_mass_g': body_mass_g, 38 | 'sex': sex} 39 | features = pd.DataFrame(data, index=[0]) 40 | return features 41 | input_df = user_input_features() 42 | 43 | # Combines user input features with entire penguins dataset 44 | # This will be useful for the encoding phase 45 | penguins_raw = pd.read_csv('penguins_cleaned.csv') 46 | penguins = penguins_raw.drop(columns=['species']) 47 | df = pd.concat([input_df,penguins],axis=0) 48 | 49 | # Encoding of ordinal features 50 | # https://www.kaggle.com/pratik1120/penguin-dataset-eda-classification-and-clustering 51 | encode = ['sex','island'] 52 | for col in encode: 53 | dummy = pd.get_dummies(df[col], prefix=col) 54 | df = pd.concat([df,dummy], axis=1) 55 | del df[col] 56 | df = df[:1] # Selects only the first row (the user input data) 57 | 58 | # Displays the user input features 59 | st.subheader('User Input features') 60 | 61 | if uploaded_file is not None: 62 | st.write(df) 63 | else: 64 | st.write('Awaiting CSV file to be uploaded. 
Currently using example input parameters (shown below).') 65 | st.write(df) 66 | 67 | # Reads in saved classification model 68 | load_clf = pickle.load(open('penguins_clf.pkl', 'rb')) 69 | 70 | # Apply model to make predictions 71 | prediction = load_clf.predict(df) 72 | prediction_proba = load_clf.predict_proba(df) 73 | 74 | 75 | st.subheader('Prediction') 76 | penguins_species = np.array(['Adelie','Chinstrap','Gentoo']) 77 | st.write(penguins_species[prediction]) 78 | 79 | st.subheader('Prediction Probability') 80 | st.write(prediction_proba) 81 | -------------------------------------------------------------------------------- /streamlit/part3/penguins-model-building.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | penguins = pd.read_csv('penguins_cleaned.csv') 3 | 4 | # Ordinal feature encoding 5 | # https://www.kaggle.com/pratik1120/penguin-dataset-eda-classification-and-clustering 6 | df = penguins.copy() 7 | target = 'species' 8 | encode = ['sex','island'] 9 | 10 | for col in encode: 11 | dummy = pd.get_dummies(df[col], prefix=col) 12 | df = pd.concat([df,dummy], axis=1) 13 | del df[col] 14 | 15 | target_mapper = {'Adelie':0, 'Chinstrap':1, 'Gentoo':2} 16 | def target_encode(val): 17 | return target_mapper[val] 18 | 19 | df['species'] = df['species'].apply(target_encode) 20 | 21 | # Separating X and y 22 | X = df.drop('species', axis=1) 23 | Y = df['species'] 24 | 25 | # Build random forest model 26 | from sklearn.ensemble import RandomForestClassifier 27 | clf = RandomForestClassifier() 28 | clf.fit(X, Y) 29 | 30 | # Saving the model 31 | import pickle 32 | pickle.dump(clf, open('penguins_clf.pkl', 'wb')) 33 | -------------------------------------------------------------------------------- /streamlit/part3/penguins_cleaned.csv: -------------------------------------------------------------------------------- 1 | "species","island","bill_length_mm","bill_depth_mm","flipper_length_mm","body_mass_g","sex" 2 | "Adelie","Torgersen",39.1,18.7,181,3750,"male" 3 | "Adelie","Torgersen",39.5,17.4,186,3800,"female" 4 | "Adelie","Torgersen",40.3,18,195,3250,"female" 5 | "Adelie","Torgersen",36.7,19.3,193,3450,"female" 6 | "Adelie","Torgersen",39.3,20.6,190,3650,"male" 7 | "Adelie","Torgersen",38.9,17.8,181,3625,"female" 8 | "Adelie","Torgersen",39.2,19.6,195,4675,"male" 9 | "Adelie","Torgersen",41.1,17.6,182,3200,"female" 10 | "Adelie","Torgersen",38.6,21.2,191,3800,"male" 11 | "Adelie","Torgersen",34.6,21.1,198,4400,"male" 12 | "Adelie","Torgersen",36.6,17.8,185,3700,"female" 13 | "Adelie","Torgersen",38.7,19,195,3450,"female" 14 | "Adelie","Torgersen",42.5,20.7,197,4500,"male" 15 | "Adelie","Torgersen",34.4,18.4,184,3325,"female" 16 | "Adelie","Torgersen",46,21.5,194,4200,"male" 17 | "Adelie","Biscoe",37.8,18.3,174,3400,"female" 18 | "Adelie","Biscoe",37.7,18.7,180,3600,"male" 19 | "Adelie","Biscoe",35.9,19.2,189,3800,"female" 20 | "Adelie","Biscoe",38.2,18.1,185,3950,"male" 21 | "Adelie","Biscoe",38.8,17.2,180,3800,"male" 22 | "Adelie","Biscoe",35.3,18.9,187,3800,"female" 23 | "Adelie","Biscoe",40.6,18.6,183,3550,"male" 24 | "Adelie","Biscoe",40.5,17.9,187,3200,"female" 25 | "Adelie","Biscoe",37.9,18.6,172,3150,"female" 26 | "Adelie","Biscoe",40.5,18.9,180,3950,"male" 27 | "Adelie","Dream",39.5,16.7,178,3250,"female" 28 | "Adelie","Dream",37.2,18.1,178,3900,"male" 29 | "Adelie","Dream",39.5,17.8,188,3300,"female" 30 | "Adelie","Dream",40.9,18.9,184,3900,"male" 31 | "Adelie","Dream",36.4,17,195,3325,"female" 32 | 
"Adelie","Dream",39.2,21.1,196,4150,"male" 33 | "Adelie","Dream",38.8,20,190,3950,"male" 34 | "Adelie","Dream",42.2,18.5,180,3550,"female" 35 | "Adelie","Dream",37.6,19.3,181,3300,"female" 36 | "Adelie","Dream",39.8,19.1,184,4650,"male" 37 | "Adelie","Dream",36.5,18,182,3150,"female" 38 | "Adelie","Dream",40.8,18.4,195,3900,"male" 39 | "Adelie","Dream",36,18.5,186,3100,"female" 40 | "Adelie","Dream",44.1,19.7,196,4400,"male" 41 | "Adelie","Dream",37,16.9,185,3000,"female" 42 | "Adelie","Dream",39.6,18.8,190,4600,"male" 43 | "Adelie","Dream",41.1,19,182,3425,"male" 44 | "Adelie","Dream",36,17.9,190,3450,"female" 45 | "Adelie","Dream",42.3,21.2,191,4150,"male" 46 | "Adelie","Biscoe",39.6,17.7,186,3500,"female" 47 | "Adelie","Biscoe",40.1,18.9,188,4300,"male" 48 | "Adelie","Biscoe",35,17.9,190,3450,"female" 49 | "Adelie","Biscoe",42,19.5,200,4050,"male" 50 | "Adelie","Biscoe",34.5,18.1,187,2900,"female" 51 | "Adelie","Biscoe",41.4,18.6,191,3700,"male" 52 | "Adelie","Biscoe",39,17.5,186,3550,"female" 53 | "Adelie","Biscoe",40.6,18.8,193,3800,"male" 54 | "Adelie","Biscoe",36.5,16.6,181,2850,"female" 55 | "Adelie","Biscoe",37.6,19.1,194,3750,"male" 56 | "Adelie","Biscoe",35.7,16.9,185,3150,"female" 57 | "Adelie","Biscoe",41.3,21.1,195,4400,"male" 58 | "Adelie","Biscoe",37.6,17,185,3600,"female" 59 | "Adelie","Biscoe",41.1,18.2,192,4050,"male" 60 | "Adelie","Biscoe",36.4,17.1,184,2850,"female" 61 | "Adelie","Biscoe",41.6,18,192,3950,"male" 62 | "Adelie","Biscoe",35.5,16.2,195,3350,"female" 63 | "Adelie","Biscoe",41.1,19.1,188,4100,"male" 64 | "Adelie","Torgersen",35.9,16.6,190,3050,"female" 65 | "Adelie","Torgersen",41.8,19.4,198,4450,"male" 66 | "Adelie","Torgersen",33.5,19,190,3600,"female" 67 | "Adelie","Torgersen",39.7,18.4,190,3900,"male" 68 | "Adelie","Torgersen",39.6,17.2,196,3550,"female" 69 | "Adelie","Torgersen",45.8,18.9,197,4150,"male" 70 | "Adelie","Torgersen",35.5,17.5,190,3700,"female" 71 | "Adelie","Torgersen",42.8,18.5,195,4250,"male" 72 | "Adelie","Torgersen",40.9,16.8,191,3700,"female" 73 | "Adelie","Torgersen",37.2,19.4,184,3900,"male" 74 | "Adelie","Torgersen",36.2,16.1,187,3550,"female" 75 | "Adelie","Torgersen",42.1,19.1,195,4000,"male" 76 | "Adelie","Torgersen",34.6,17.2,189,3200,"female" 77 | "Adelie","Torgersen",42.9,17.6,196,4700,"male" 78 | "Adelie","Torgersen",36.7,18.8,187,3800,"female" 79 | "Adelie","Torgersen",35.1,19.4,193,4200,"male" 80 | "Adelie","Dream",37.3,17.8,191,3350,"female" 81 | "Adelie","Dream",41.3,20.3,194,3550,"male" 82 | "Adelie","Dream",36.3,19.5,190,3800,"male" 83 | "Adelie","Dream",36.9,18.6,189,3500,"female" 84 | "Adelie","Dream",38.3,19.2,189,3950,"male" 85 | "Adelie","Dream",38.9,18.8,190,3600,"female" 86 | "Adelie","Dream",35.7,18,202,3550,"female" 87 | "Adelie","Dream",41.1,18.1,205,4300,"male" 88 | "Adelie","Dream",34,17.1,185,3400,"female" 89 | "Adelie","Dream",39.6,18.1,186,4450,"male" 90 | "Adelie","Dream",36.2,17.3,187,3300,"female" 91 | "Adelie","Dream",40.8,18.9,208,4300,"male" 92 | "Adelie","Dream",38.1,18.6,190,3700,"female" 93 | "Adelie","Dream",40.3,18.5,196,4350,"male" 94 | "Adelie","Dream",33.1,16.1,178,2900,"female" 95 | "Adelie","Dream",43.2,18.5,192,4100,"male" 96 | "Adelie","Biscoe",35,17.9,192,3725,"female" 97 | "Adelie","Biscoe",41,20,203,4725,"male" 98 | "Adelie","Biscoe",37.7,16,183,3075,"female" 99 | "Adelie","Biscoe",37.8,20,190,4250,"male" 100 | "Adelie","Biscoe",37.9,18.6,193,2925,"female" 101 | "Adelie","Biscoe",39.7,18.9,184,3550,"male" 102 | "Adelie","Biscoe",38.6,17.2,199,3750,"female" 103 | 
"Adelie","Biscoe",38.2,20,190,3900,"male" 104 | "Adelie","Biscoe",38.1,17,181,3175,"female" 105 | "Adelie","Biscoe",43.2,19,197,4775,"male" 106 | "Adelie","Biscoe",38.1,16.5,198,3825,"female" 107 | "Adelie","Biscoe",45.6,20.3,191,4600,"male" 108 | "Adelie","Biscoe",39.7,17.7,193,3200,"female" 109 | "Adelie","Biscoe",42.2,19.5,197,4275,"male" 110 | "Adelie","Biscoe",39.6,20.7,191,3900,"female" 111 | "Adelie","Biscoe",42.7,18.3,196,4075,"male" 112 | "Adelie","Torgersen",38.6,17,188,2900,"female" 113 | "Adelie","Torgersen",37.3,20.5,199,3775,"male" 114 | "Adelie","Torgersen",35.7,17,189,3350,"female" 115 | "Adelie","Torgersen",41.1,18.6,189,3325,"male" 116 | "Adelie","Torgersen",36.2,17.2,187,3150,"female" 117 | "Adelie","Torgersen",37.7,19.8,198,3500,"male" 118 | "Adelie","Torgersen",40.2,17,176,3450,"female" 119 | "Adelie","Torgersen",41.4,18.5,202,3875,"male" 120 | "Adelie","Torgersen",35.2,15.9,186,3050,"female" 121 | "Adelie","Torgersen",40.6,19,199,4000,"male" 122 | "Adelie","Torgersen",38.8,17.6,191,3275,"female" 123 | "Adelie","Torgersen",41.5,18.3,195,4300,"male" 124 | "Adelie","Torgersen",39,17.1,191,3050,"female" 125 | "Adelie","Torgersen",44.1,18,210,4000,"male" 126 | "Adelie","Torgersen",38.5,17.9,190,3325,"female" 127 | "Adelie","Torgersen",43.1,19.2,197,3500,"male" 128 | "Adelie","Dream",36.8,18.5,193,3500,"female" 129 | "Adelie","Dream",37.5,18.5,199,4475,"male" 130 | "Adelie","Dream",38.1,17.6,187,3425,"female" 131 | "Adelie","Dream",41.1,17.5,190,3900,"male" 132 | "Adelie","Dream",35.6,17.5,191,3175,"female" 133 | "Adelie","Dream",40.2,20.1,200,3975,"male" 134 | "Adelie","Dream",37,16.5,185,3400,"female" 135 | "Adelie","Dream",39.7,17.9,193,4250,"male" 136 | "Adelie","Dream",40.2,17.1,193,3400,"female" 137 | "Adelie","Dream",40.6,17.2,187,3475,"male" 138 | "Adelie","Dream",32.1,15.5,188,3050,"female" 139 | "Adelie","Dream",40.7,17,190,3725,"male" 140 | "Adelie","Dream",37.3,16.8,192,3000,"female" 141 | "Adelie","Dream",39,18.7,185,3650,"male" 142 | "Adelie","Dream",39.2,18.6,190,4250,"male" 143 | "Adelie","Dream",36.6,18.4,184,3475,"female" 144 | "Adelie","Dream",36,17.8,195,3450,"female" 145 | "Adelie","Dream",37.8,18.1,193,3750,"male" 146 | "Adelie","Dream",36,17.1,187,3700,"female" 147 | "Adelie","Dream",41.5,18.5,201,4000,"male" 148 | "Gentoo","Biscoe",46.1,13.2,211,4500,"female" 149 | "Gentoo","Biscoe",50,16.3,230,5700,"male" 150 | "Gentoo","Biscoe",48.7,14.1,210,4450,"female" 151 | "Gentoo","Biscoe",50,15.2,218,5700,"male" 152 | "Gentoo","Biscoe",47.6,14.5,215,5400,"male" 153 | "Gentoo","Biscoe",46.5,13.5,210,4550,"female" 154 | "Gentoo","Biscoe",45.4,14.6,211,4800,"female" 155 | "Gentoo","Biscoe",46.7,15.3,219,5200,"male" 156 | "Gentoo","Biscoe",43.3,13.4,209,4400,"female" 157 | "Gentoo","Biscoe",46.8,15.4,215,5150,"male" 158 | "Gentoo","Biscoe",40.9,13.7,214,4650,"female" 159 | "Gentoo","Biscoe",49,16.1,216,5550,"male" 160 | "Gentoo","Biscoe",45.5,13.7,214,4650,"female" 161 | "Gentoo","Biscoe",48.4,14.6,213,5850,"male" 162 | "Gentoo","Biscoe",45.8,14.6,210,4200,"female" 163 | "Gentoo","Biscoe",49.3,15.7,217,5850,"male" 164 | "Gentoo","Biscoe",42,13.5,210,4150,"female" 165 | "Gentoo","Biscoe",49.2,15.2,221,6300,"male" 166 | "Gentoo","Biscoe",46.2,14.5,209,4800,"female" 167 | "Gentoo","Biscoe",48.7,15.1,222,5350,"male" 168 | "Gentoo","Biscoe",50.2,14.3,218,5700,"male" 169 | "Gentoo","Biscoe",45.1,14.5,215,5000,"female" 170 | "Gentoo","Biscoe",46.5,14.5,213,4400,"female" 171 | "Gentoo","Biscoe",46.3,15.8,215,5050,"male" 172 | 
"Gentoo","Biscoe",42.9,13.1,215,5000,"female" 173 | "Gentoo","Biscoe",46.1,15.1,215,5100,"male" 174 | "Gentoo","Biscoe",47.8,15,215,5650,"male" 175 | "Gentoo","Biscoe",48.2,14.3,210,4600,"female" 176 | "Gentoo","Biscoe",50,15.3,220,5550,"male" 177 | "Gentoo","Biscoe",47.3,15.3,222,5250,"male" 178 | "Gentoo","Biscoe",42.8,14.2,209,4700,"female" 179 | "Gentoo","Biscoe",45.1,14.5,207,5050,"female" 180 | "Gentoo","Biscoe",59.6,17,230,6050,"male" 181 | "Gentoo","Biscoe",49.1,14.8,220,5150,"female" 182 | "Gentoo","Biscoe",48.4,16.3,220,5400,"male" 183 | "Gentoo","Biscoe",42.6,13.7,213,4950,"female" 184 | "Gentoo","Biscoe",44.4,17.3,219,5250,"male" 185 | "Gentoo","Biscoe",44,13.6,208,4350,"female" 186 | "Gentoo","Biscoe",48.7,15.7,208,5350,"male" 187 | "Gentoo","Biscoe",42.7,13.7,208,3950,"female" 188 | "Gentoo","Biscoe",49.6,16,225,5700,"male" 189 | "Gentoo","Biscoe",45.3,13.7,210,4300,"female" 190 | "Gentoo","Biscoe",49.6,15,216,4750,"male" 191 | "Gentoo","Biscoe",50.5,15.9,222,5550,"male" 192 | "Gentoo","Biscoe",43.6,13.9,217,4900,"female" 193 | "Gentoo","Biscoe",45.5,13.9,210,4200,"female" 194 | "Gentoo","Biscoe",50.5,15.9,225,5400,"male" 195 | "Gentoo","Biscoe",44.9,13.3,213,5100,"female" 196 | "Gentoo","Biscoe",45.2,15.8,215,5300,"male" 197 | "Gentoo","Biscoe",46.6,14.2,210,4850,"female" 198 | "Gentoo","Biscoe",48.5,14.1,220,5300,"male" 199 | "Gentoo","Biscoe",45.1,14.4,210,4400,"female" 200 | "Gentoo","Biscoe",50.1,15,225,5000,"male" 201 | "Gentoo","Biscoe",46.5,14.4,217,4900,"female" 202 | "Gentoo","Biscoe",45,15.4,220,5050,"male" 203 | "Gentoo","Biscoe",43.8,13.9,208,4300,"female" 204 | "Gentoo","Biscoe",45.5,15,220,5000,"male" 205 | "Gentoo","Biscoe",43.2,14.5,208,4450,"female" 206 | "Gentoo","Biscoe",50.4,15.3,224,5550,"male" 207 | "Gentoo","Biscoe",45.3,13.8,208,4200,"female" 208 | "Gentoo","Biscoe",46.2,14.9,221,5300,"male" 209 | "Gentoo","Biscoe",45.7,13.9,214,4400,"female" 210 | "Gentoo","Biscoe",54.3,15.7,231,5650,"male" 211 | "Gentoo","Biscoe",45.8,14.2,219,4700,"female" 212 | "Gentoo","Biscoe",49.8,16.8,230,5700,"male" 213 | "Gentoo","Biscoe",49.5,16.2,229,5800,"male" 214 | "Gentoo","Biscoe",43.5,14.2,220,4700,"female" 215 | "Gentoo","Biscoe",50.7,15,223,5550,"male" 216 | "Gentoo","Biscoe",47.7,15,216,4750,"female" 217 | "Gentoo","Biscoe",46.4,15.6,221,5000,"male" 218 | "Gentoo","Biscoe",48.2,15.6,221,5100,"male" 219 | "Gentoo","Biscoe",46.5,14.8,217,5200,"female" 220 | "Gentoo","Biscoe",46.4,15,216,4700,"female" 221 | "Gentoo","Biscoe",48.6,16,230,5800,"male" 222 | "Gentoo","Biscoe",47.5,14.2,209,4600,"female" 223 | "Gentoo","Biscoe",51.1,16.3,220,6000,"male" 224 | "Gentoo","Biscoe",45.2,13.8,215,4750,"female" 225 | "Gentoo","Biscoe",45.2,16.4,223,5950,"male" 226 | "Gentoo","Biscoe",49.1,14.5,212,4625,"female" 227 | "Gentoo","Biscoe",52.5,15.6,221,5450,"male" 228 | "Gentoo","Biscoe",47.4,14.6,212,4725,"female" 229 | "Gentoo","Biscoe",50,15.9,224,5350,"male" 230 | "Gentoo","Biscoe",44.9,13.8,212,4750,"female" 231 | "Gentoo","Biscoe",50.8,17.3,228,5600,"male" 232 | "Gentoo","Biscoe",43.4,14.4,218,4600,"female" 233 | "Gentoo","Biscoe",51.3,14.2,218,5300,"male" 234 | "Gentoo","Biscoe",47.5,14,212,4875,"female" 235 | "Gentoo","Biscoe",52.1,17,230,5550,"male" 236 | "Gentoo","Biscoe",47.5,15,218,4950,"female" 237 | "Gentoo","Biscoe",52.2,17.1,228,5400,"male" 238 | "Gentoo","Biscoe",45.5,14.5,212,4750,"female" 239 | "Gentoo","Biscoe",49.5,16.1,224,5650,"male" 240 | "Gentoo","Biscoe",44.5,14.7,214,4850,"female" 241 | "Gentoo","Biscoe",50.8,15.7,226,5200,"male" 242 | 
"Gentoo","Biscoe",49.4,15.8,216,4925,"male" 243 | "Gentoo","Biscoe",46.9,14.6,222,4875,"female" 244 | "Gentoo","Biscoe",48.4,14.4,203,4625,"female" 245 | "Gentoo","Biscoe",51.1,16.5,225,5250,"male" 246 | "Gentoo","Biscoe",48.5,15,219,4850,"female" 247 | "Gentoo","Biscoe",55.9,17,228,5600,"male" 248 | "Gentoo","Biscoe",47.2,15.5,215,4975,"female" 249 | "Gentoo","Biscoe",49.1,15,228,5500,"male" 250 | "Gentoo","Biscoe",46.8,16.1,215,5500,"male" 251 | "Gentoo","Biscoe",41.7,14.7,210,4700,"female" 252 | "Gentoo","Biscoe",53.4,15.8,219,5500,"male" 253 | "Gentoo","Biscoe",43.3,14,208,4575,"female" 254 | "Gentoo","Biscoe",48.1,15.1,209,5500,"male" 255 | "Gentoo","Biscoe",50.5,15.2,216,5000,"female" 256 | "Gentoo","Biscoe",49.8,15.9,229,5950,"male" 257 | "Gentoo","Biscoe",43.5,15.2,213,4650,"female" 258 | "Gentoo","Biscoe",51.5,16.3,230,5500,"male" 259 | "Gentoo","Biscoe",46.2,14.1,217,4375,"female" 260 | "Gentoo","Biscoe",55.1,16,230,5850,"male" 261 | "Gentoo","Biscoe",48.8,16.2,222,6000,"male" 262 | "Gentoo","Biscoe",47.2,13.7,214,4925,"female" 263 | "Gentoo","Biscoe",46.8,14.3,215,4850,"female" 264 | "Gentoo","Biscoe",50.4,15.7,222,5750,"male" 265 | "Gentoo","Biscoe",45.2,14.8,212,5200,"female" 266 | "Gentoo","Biscoe",49.9,16.1,213,5400,"male" 267 | "Chinstrap","Dream",46.5,17.9,192,3500,"female" 268 | "Chinstrap","Dream",50,19.5,196,3900,"male" 269 | "Chinstrap","Dream",51.3,19.2,193,3650,"male" 270 | "Chinstrap","Dream",45.4,18.7,188,3525,"female" 271 | "Chinstrap","Dream",52.7,19.8,197,3725,"male" 272 | "Chinstrap","Dream",45.2,17.8,198,3950,"female" 273 | "Chinstrap","Dream",46.1,18.2,178,3250,"female" 274 | "Chinstrap","Dream",51.3,18.2,197,3750,"male" 275 | "Chinstrap","Dream",46,18.9,195,4150,"female" 276 | "Chinstrap","Dream",51.3,19.9,198,3700,"male" 277 | "Chinstrap","Dream",46.6,17.8,193,3800,"female" 278 | "Chinstrap","Dream",51.7,20.3,194,3775,"male" 279 | "Chinstrap","Dream",47,17.3,185,3700,"female" 280 | "Chinstrap","Dream",52,18.1,201,4050,"male" 281 | "Chinstrap","Dream",45.9,17.1,190,3575,"female" 282 | "Chinstrap","Dream",50.5,19.6,201,4050,"male" 283 | "Chinstrap","Dream",50.3,20,197,3300,"male" 284 | "Chinstrap","Dream",58,17.8,181,3700,"female" 285 | "Chinstrap","Dream",46.4,18.6,190,3450,"female" 286 | "Chinstrap","Dream",49.2,18.2,195,4400,"male" 287 | "Chinstrap","Dream",42.4,17.3,181,3600,"female" 288 | "Chinstrap","Dream",48.5,17.5,191,3400,"male" 289 | "Chinstrap","Dream",43.2,16.6,187,2900,"female" 290 | "Chinstrap","Dream",50.6,19.4,193,3800,"male" 291 | "Chinstrap","Dream",46.7,17.9,195,3300,"female" 292 | "Chinstrap","Dream",52,19,197,4150,"male" 293 | "Chinstrap","Dream",50.5,18.4,200,3400,"female" 294 | "Chinstrap","Dream",49.5,19,200,3800,"male" 295 | "Chinstrap","Dream",46.4,17.8,191,3700,"female" 296 | "Chinstrap","Dream",52.8,20,205,4550,"male" 297 | "Chinstrap","Dream",40.9,16.6,187,3200,"female" 298 | "Chinstrap","Dream",54.2,20.8,201,4300,"male" 299 | "Chinstrap","Dream",42.5,16.7,187,3350,"female" 300 | "Chinstrap","Dream",51,18.8,203,4100,"male" 301 | "Chinstrap","Dream",49.7,18.6,195,3600,"male" 302 | "Chinstrap","Dream",47.5,16.8,199,3900,"female" 303 | "Chinstrap","Dream",47.6,18.3,195,3850,"female" 304 | "Chinstrap","Dream",52,20.7,210,4800,"male" 305 | "Chinstrap","Dream",46.9,16.6,192,2700,"female" 306 | "Chinstrap","Dream",53.5,19.9,205,4500,"male" 307 | "Chinstrap","Dream",49,19.5,210,3950,"male" 308 | "Chinstrap","Dream",46.2,17.5,187,3650,"female" 309 | "Chinstrap","Dream",50.9,19.1,196,3550,"male" 310 | 
"Chinstrap","Dream",45.5,17,196,3500,"female" 311 | "Chinstrap","Dream",50.9,17.9,196,3675,"female" 312 | "Chinstrap","Dream",50.8,18.5,201,4450,"male" 313 | "Chinstrap","Dream",50.1,17.9,190,3400,"female" 314 | "Chinstrap","Dream",49,19.6,212,4300,"male" 315 | "Chinstrap","Dream",51.5,18.7,187,3250,"male" 316 | "Chinstrap","Dream",49.8,17.3,198,3675,"female" 317 | "Chinstrap","Dream",48.1,16.4,199,3325,"female" 318 | "Chinstrap","Dream",51.4,19,201,3950,"male" 319 | "Chinstrap","Dream",45.7,17.3,193,3600,"female" 320 | "Chinstrap","Dream",50.7,19.7,203,4050,"male" 321 | "Chinstrap","Dream",42.5,17.3,187,3350,"female" 322 | "Chinstrap","Dream",52.2,18.8,197,3450,"male" 323 | "Chinstrap","Dream",45.2,16.6,191,3250,"female" 324 | "Chinstrap","Dream",49.3,19.9,203,4050,"male" 325 | "Chinstrap","Dream",50.2,18.8,202,3800,"male" 326 | "Chinstrap","Dream",45.6,19.4,194,3525,"female" 327 | "Chinstrap","Dream",51.9,19.5,206,3950,"male" 328 | "Chinstrap","Dream",46.8,16.5,189,3650,"female" 329 | "Chinstrap","Dream",45.7,17,195,3650,"female" 330 | "Chinstrap","Dream",55.8,19.8,207,4000,"male" 331 | "Chinstrap","Dream",43.5,18.1,202,3400,"female" 332 | "Chinstrap","Dream",49.6,18.2,193,3775,"male" 333 | "Chinstrap","Dream",50.8,19,210,4100,"male" 334 | "Chinstrap","Dream",50.2,18.7,198,3775,"female" 335 | -------------------------------------------------------------------------------- /streamlit/part3/penguins_clf.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataprofessor/code/d494f8093073990fcf77061bd740a0e4c2d40020/streamlit/part3/penguins_clf.pkl -------------------------------------------------------------------------------- /streamlit/part3/penguins_example.csv: -------------------------------------------------------------------------------- 1 | island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex 2 | Biscoe,43.9,17.2,201.0,4207.0,male 3 | -------------------------------------------------------------------------------- /streamlit/part5/basketball_app.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import pandas as pd 3 | import base64 4 | import matplotlib.pyplot as plt 5 | import seaborn as sns 6 | import numpy as np 7 | 8 | st.title('NBA Player Stats Explorer') 9 | 10 | st.markdown(""" 11 | This app performs simple webscraping of NBA player stats data! 12 | * **Python libraries:** base64, pandas, streamlit 13 | * **Data source:** [Basketball-reference.com](https://www.basketball-reference.com/). 
14 | """) 15 | 16 | st.sidebar.header('User Input Features') 17 | selected_year = st.sidebar.selectbox('Year', list(reversed(range(1950,2020)))) 18 | 19 | # Web scraping of NBA player stats 20 | @st.cache 21 | def load_data(year): 22 | url = "https://www.basketball-reference.com/leagues/NBA_" + str(year) + "_per_game.html" 23 | html = pd.read_html(url, header = 0) 24 | df = html[0] 25 | raw = df.drop(df[df.Age == 'Age'].index) # Deletes repeating headers in content 26 | raw = raw.fillna(0) 27 | playerstats = raw.drop(['Rk'], axis=1) 28 | return playerstats 29 | playerstats = load_data(selected_year) 30 | 31 | # Sidebar - Team selection 32 | sorted_unique_team = sorted(playerstats.Tm.unique()) 33 | selected_team = st.sidebar.multiselect('Team', sorted_unique_team, sorted_unique_team) 34 | 35 | # Sidebar - Position selection 36 | unique_pos = ['C','PF','SF','PG','SG'] 37 | selected_pos = st.sidebar.multiselect('Position', unique_pos, unique_pos) 38 | 39 | # Filtering data 40 | df_selected_team = playerstats[(playerstats.Tm.isin(selected_team)) & (playerstats.Pos.isin(selected_pos))] 41 | 42 | st.header('Display Player Stats of Selected Team(s)') 43 | st.write('Data Dimension: ' + str(df_selected_team.shape[0]) + ' rows and ' + str(df_selected_team.shape[1]) + ' columns.') 44 | st.dataframe(df_selected_team) 45 | 46 | # Download NBA player stats data 47 | # https://discuss.streamlit.io/t/how-to-download-file-in-streamlit/1806 48 | def filedownload(df): 49 | csv = df.to_csv(index=False) 50 | b64 = base64.b64encode(csv.encode()).decode() # strings <-> bytes conversions 51 | href = f'Download CSV File' 52 | return href 53 | 54 | st.markdown(filedownload(df_selected_team), unsafe_allow_html=True) 55 | 56 | # Heatmap 57 | if st.button('Intercorrelation Heatmap'): 58 | st.header('Intercorrelation Matrix Heatmap') 59 | df_selected_team.to_csv('output.csv',index=False) 60 | df = pd.read_csv('output.csv') 61 | 62 | corr = df.corr() 63 | mask = np.zeros_like(corr) 64 | mask[np.triu_indices_from(mask)] = True 65 | with sns.axes_style("white"): 66 | f, ax = plt.subplots(figsize=(7, 5)) 67 | ax = sns.heatmap(corr, mask=mask, vmax=1, square=True) 68 | st.pyplot() 69 | -------------------------------------------------------------------------------- /streamlit/part6/boston-house-ml-app.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import pandas as pd 3 | import shap 4 | import matplotlib.pyplot as plt 5 | from sklearn import datasets 6 | from sklearn.ensemble import RandomForestRegressor 7 | 8 | st.write(""" 9 | # Boston House Price Prediction App 10 | 11 | This app predicts the **Boston House Price**! 
12 | """) 13 | st.write('---') 14 | 15 | # Loads the Boston House Price Dataset 16 | boston = datasets.load_boston() 17 | X = pd.DataFrame(boston.data, columns=boston.feature_names) 18 | Y = pd.DataFrame(boston.target, columns=["MEDV"]) 19 | 20 | # Sidebar 21 | # Header of Specify Input Parameters 22 | st.sidebar.header('Specify Input Parameters') 23 | 24 | def user_input_features(): 25 | CRIM = st.sidebar.slider('CRIM', X.CRIM.min(), X.CRIM.max(), X.CRIM.mean()) 26 | ZN = st.sidebar.slider('ZN', X.ZN.min(), X.ZN.max(), X.ZN.mean()) 27 | INDUS = st.sidebar.slider('INDUS', X.INDUS.min(), X.INDUS.max(), X.INDUS.mean()) 28 | CHAS = st.sidebar.slider('CHAS', X.CHAS.min(), X.CHAS.max(), X.CHAS.mean()) 29 | NOX = st.sidebar.slider('NOX', X.NOX.min(), X.NOX.max(), X.NOX.mean()) 30 | RM = st.sidebar.slider('RM', X.RM.min(), X.RM.max(), X.RM.mean()) 31 | AGE = st.sidebar.slider('AGE', X.AGE.min(), X.AGE.max(), X.AGE.mean()) 32 | DIS = st.sidebar.slider('DIS', X.DIS.min(), X.DIS.max(), X.DIS.mean()) 33 | RAD = st.sidebar.slider('RAD', X.RAD.min(), X.RAD.max(), X.RAD.mean()) 34 | TAX = st.sidebar.slider('TAX', X.TAX.min(), X.TAX.max(), X.TAX.mean()) 35 | PTRATIO = st.sidebar.slider('PTRATIO', X.PTRATIO.min(), X.PTRATIO.max(), X.PTRATIO.mean()) 36 | B = st.sidebar.slider('B', X.B.min(), X.B.max(), X.B.mean()) 37 | LSTAT = st.sidebar.slider('LSTAT', X.LSTAT.min(), X.LSTAT.max(), X.LSTAT.mean()) 38 | data = {'CRIM': CRIM, 39 | 'ZN': ZN, 40 | 'INDUS': INDUS, 41 | 'CHAS': CHAS, 42 | 'NOX': NOX, 43 | 'RM': RM, 44 | 'AGE': AGE, 45 | 'DIS': DIS, 46 | 'RAD': RAD, 47 | 'TAX': TAX, 48 | 'PTRATIO': PTRATIO, 49 | 'B': B, 50 | 'LSTAT': LSTAT} 51 | features = pd.DataFrame(data, index=[0]) 52 | return features 53 | 54 | df = user_input_features() 55 | 56 | # Main Panel 57 | 58 | # Print specified input parameters 59 | st.header('Specified Input parameters') 60 | st.write(df) 61 | st.write('---') 62 | 63 | # Build Regression Model 64 | model = RandomForestRegressor() 65 | model.fit(X, Y) 66 | # Apply Model to Make Prediction 67 | prediction = model.predict(df) 68 | 69 | st.header('Prediction of MEDV') 70 | st.write(prediction) 71 | st.write('---') 72 | 73 | # Explaining the model's predictions using SHAP values 74 | # https://github.com/slundberg/shap 75 | explainer = shap.TreeExplainer(model) 76 | shap_values = explainer.shap_values(X) 77 | 78 | st.header('Feature Importance') 79 | plt.title('Feature importance based on SHAP values') 80 | shap.summary_plot(shap_values, X) 81 | st.pyplot(bbox_inches='tight') 82 | st.write('---') 83 | 84 | plt.title('Feature importance based on SHAP values (Bar)') 85 | shap.summary_plot(shap_values, X, plot_type="bar") 86 | st.pyplot(bbox_inches='tight') 87 | -------------------------------------------------------------------------------- /streamlit/part7/solubility-app.py: -------------------------------------------------------------------------------- 1 | ###################### 2 | # Import libraries 3 | ###################### 4 | import numpy as np 5 | import pandas as pd 6 | import streamlit as st 7 | import pickle 8 | from PIL import Image 9 | from rdkit import Chem 10 | from rdkit.Chem import Descriptors 11 | 12 | ###################### 13 | # Custom function 14 | ###################### 15 | ## Calculate molecular descriptors 16 | def AromaticProportion(m): 17 | aromatic_atoms = [m.GetAtomWithIdx(i).GetIsAromatic() for i in range(m.GetNumAtoms())] 18 | aa_count = [] 19 | for i in aromatic_atoms: 20 | if i==True: 21 | aa_count.append(1) 22 | AromaticAtom = sum(aa_count) 23 | 
    HeavyAtom = Descriptors.HeavyAtomCount(m)
24 |     AR = AromaticAtom/HeavyAtom  # aromatic proportion = aromatic atom count / heavy atom count
25 |     return AR
26 | 
27 | def generate(smiles, verbose=False):
28 | 
29 |     moldata= []
30 |     for elem in smiles:
31 |         mol=Chem.MolFromSmiles(elem)
32 |         moldata.append(mol)
33 | 
34 |     baseData= np.arange(1,1)  # empty placeholder; replaced by the first descriptor row below
35 |     i=0
36 |     for mol in moldata:
37 | 
38 |         desc_MolLogP = Descriptors.MolLogP(mol)
39 |         desc_MolWt = Descriptors.MolWt(mol)
40 |         desc_NumRotatableBonds = Descriptors.NumRotatableBonds(mol)
41 |         desc_AromaticProportion = AromaticProportion(mol)
42 | 
43 |         row = np.array([desc_MolLogP,
44 |                         desc_MolWt,
45 |                         desc_NumRotatableBonds,
46 |                         desc_AromaticProportion])
47 | 
48 |         if(i==0):
49 |             baseData=row
50 |         else:
51 |             baseData=np.vstack([baseData, row])
52 |         i=i+1
53 | 
54 |     columnNames=["MolLogP","MolWt","NumRotatableBonds","AromaticProportion"]
55 |     descriptors = pd.DataFrame(data=baseData,columns=columnNames)
56 | 
57 |     return descriptors
58 | 
59 | ######################
60 | # Page Title
61 | ######################
62 | 
63 | image = Image.open('solubility-logo.jpg')
64 | 
65 | st.image(image, use_column_width=True)
66 | 
67 | st.write("""
68 | # Molecular Solubility Prediction Web App
69 | 
70 | This app predicts the **Solubility (LogS)** values of molecules!
71 | 
72 | Data obtained from John S. Delaney. [ESOL: Estimating Aqueous Solubility Directly from Molecular Structure](https://pubs.acs.org/doi/10.1021/ci034243x). ***J. Chem. Inf. Comput. Sci.*** 2004, 44, 3, 1000-1005.
73 | ***
74 | """)
75 | 
76 | 
77 | ######################
78 | # Input molecules (Side Panel)
79 | ######################
80 | 
81 | st.sidebar.header('User Input Features')
82 | 
83 | ## Read SMILES input
84 | SMILES_input = "NCCCC\nCCC\nCN"
85 | 
86 | SMILES = st.sidebar.text_area("SMILES input", SMILES_input)
87 | SMILES = "C\n" + SMILES #Adds C as a dummy, first item
88 | SMILES = SMILES.split('\n')
89 | 
90 | st.header('Input SMILES')
91 | SMILES[1:] # Skips the dummy first item
92 | 
93 | ## Calculate molecular descriptors
94 | st.header('Computed molecular descriptors')
95 | X = generate(SMILES)
96 | X[1:] # Skips the dummy first item
97 | 
98 | ######################
99 | # Pre-built model
100 | ######################
101 | 
102 | # Reads in saved model
103 | load_model = pickle.load(open('solubility_model.pkl', 'rb'))
104 | 
105 | # Apply model to make predictions
106 | prediction = load_model.predict(X)
107 | #prediction_proba = load_model.predict_proba(X)
108 | 
109 | st.header('Predicted LogS values')
110 | prediction[1:] # Skips the dummy first item
--------------------------------------------------------------------------------
/streamlit/part7/solubility-logo.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dataprofessor/code/d494f8093073990fcf77061bd740a0e4c2d40020/streamlit/part7/solubility-logo.jpg
--------------------------------------------------------------------------------
/streamlit/part7/solubility_model.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dataprofessor/code/d494f8093073990fcf77061bd740a0e4c2d40020/streamlit/part7/solubility_model.pkl
--------------------------------------------------------------------------------
/streamlit/part8/dna-app.py:
--------------------------------------------------------------------------------
1 | ######################
2 | # Import libraries
3 | ######################
4 | 
5 | import pandas as pd
6 | import streamlit as st
7 | import altair as alt
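# altair (imported above) is used in section 4 of this script to draw the nucleotide-count bar chart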
8 | from PIL import Image 9 | 10 | ###################### 11 | # Page Title 12 | ###################### 13 | 14 | image = Image.open('dna-logo.jpg') 15 | 16 | st.image(image, use_column_width=True) 17 | 18 | st.write(""" 19 | # DNA Nucleotide Count Web App 20 | 21 | This app counts the nucleotide composition of query DNA! 22 | 23 | *** 24 | """) 25 | 26 | 27 | ###################### 28 | # Input Text Box 29 | ###################### 30 | 31 | #st.sidebar.header('Enter DNA sequence') 32 | st.header('Enter DNA sequence') 33 | 34 | sequence_input = ">DNA Query 2\nGAACACGTGGAGGCAAACAGGAAGGTGAAGAAGAACTTATCCTATCAGGACGGAAGGTCCTGTGCTCGGG\nATCTTCCAGACGTCGCGACTCTAAATTGCCCCCTCTGAGGTCAAGGAACACAAGATGGTTTTGGAAATGC\nTGAACCCGATACATTATAACATCACCAGCATCGTGCCTGAAGCCATGCCTGCTGCCACCATGCCAGTCCT" 35 | 36 | #sequence = st.sidebar.text_area("Sequence input", sequence_input, height=250) 37 | sequence = st.text_area("Sequence input", sequence_input, height=250) 38 | sequence = sequence.splitlines() 39 | sequence = sequence[1:] # Skips the sequence name (first line) 40 | sequence = ''.join(sequence) # Concatenates list to string 41 | 42 | st.write(""" 43 | *** 44 | """) 45 | 46 | ## Prints the input DNA sequence 47 | st.header('INPUT (DNA Query)') 48 | sequence 49 | 50 | ## DNA nucleotide count 51 | st.header('OUTPUT (DNA Nucleotide Count)') 52 | 53 | ### 1. Print dictionary 54 | st.subheader('1. Print dictionary') 55 | def DNA_nucleotide_count(seq): 56 | d = dict([ 57 | ('A',seq.count('A')), 58 | ('T',seq.count('T')), 59 | ('G',seq.count('G')), 60 | ('C',seq.count('C')) 61 | ]) 62 | return d 63 | 64 | X = DNA_nucleotide_count(sequence) 65 | 66 | #X_label = list(X) 67 | #X_values = list(X.values()) 68 | 69 | X 70 | 71 | ### 2. Print text 72 | st.subheader('2. Print text') 73 | st.write('There are ' + str(X['A']) + ' adenine (A)') 74 | st.write('There are ' + str(X['T']) + ' thymine (T)') 75 | st.write('There are ' + str(X['G']) + ' guanine (G)') 76 | st.write('There are ' + str(X['C']) + ' cytosine (C)') 77 | 78 | ### 3. Display DataFrame 79 | st.subheader('3. Display DataFrame') 80 | df = pd.DataFrame.from_dict(X, orient='index') 81 | df = df.rename({0: 'count'}, axis='columns') 82 | df.reset_index(inplace=True) 83 | df = df.rename(columns = {'index':'nucleotide'}) 84 | st.write(df) 85 | 86 | ### 4. Display Bar Chart using Altair 87 | st.subheader('4. Display Bar chart') 88 | p = alt.Chart(df).mark_bar().encode( 89 | x='nucleotide', 90 | y='count' 91 | ) 92 | p = p.properties( 93 | width=alt.Step(80) # controls width of bar. 94 | ) 95 | st.write(p) 96 | -------------------------------------------------------------------------------- /streamlit/part8/dna-logo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataprofessor/code/d494f8093073990fcf77061bd740a0e4c2d40020/streamlit/part8/dna-logo.jpg -------------------------------------------------------------------------------- /streamlit/part9/football_app.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import pandas as pd 3 | import base64 4 | import matplotlib.pyplot as plt 5 | import seaborn as sns 6 | import numpy as np 7 | 8 | st.title('NFL Football Stats (Rushing) Explorer') 9 | 10 | st.markdown(""" 11 | This app performs simple webscraping of NFL Football player stats data (focusing on Rushing)! 
12 | * **Python libraries:** base64, pandas, streamlit, numpy, matplotlib, seaborn
13 | * **Data source:** [pro-football-reference.com](https://www.pro-football-reference.com/).
14 | """)
15 | 
16 | st.sidebar.header('User Input Features')
17 | selected_year = st.sidebar.selectbox('Year', list(reversed(range(1990,2020))))
18 | 
19 | # Web scraping of NFL player stats
20 | # https://www.pro-football-reference.com/years/2019/rushing.htm
21 | @st.cache
22 | def load_data(year):
23 |     url = "https://www.pro-football-reference.com/years/" + str(year) + "/rushing.htm"
24 |     html = pd.read_html(url, header = 1)
25 |     df = html[0]
26 |     raw = df.drop(df[df.Age == 'Age'].index) # Deletes repeating headers in content
27 |     raw = raw.fillna(0)
28 |     playerstats = raw.drop(['Rk'], axis=1)
29 |     return playerstats
30 | playerstats = load_data(selected_year)
31 | 
32 | # Sidebar - Team selection
33 | sorted_unique_team = sorted(playerstats.Tm.unique())
34 | selected_team = st.sidebar.multiselect('Team', sorted_unique_team, sorted_unique_team)
35 | 
36 | # Sidebar - Position selection
37 | unique_pos = ['RB','QB','WR','FB','TE']
38 | selected_pos = st.sidebar.multiselect('Position', unique_pos, unique_pos)
39 | 
40 | # Filtering data
41 | df_selected_team = playerstats[(playerstats.Tm.isin(selected_team)) & (playerstats.Pos.isin(selected_pos))]
42 | 
43 | st.header('Display Player Stats of Selected Team(s)')
44 | st.write('Data Dimension: ' + str(df_selected_team.shape[0]) + ' rows and ' + str(df_selected_team.shape[1]) + ' columns.')
45 | st.dataframe(df_selected_team)
46 | 
47 | # Download NFL player stats data
48 | # https://discuss.streamlit.io/t/how-to-download-file-in-streamlit/1806
49 | def filedownload(df):
50 |     csv = df.to_csv(index=False)
51 |     b64 = base64.b64encode(csv.encode()).decode()  # strings <-> bytes conversions
52 |     href = f'<a href="data:file/csv;base64,{b64}" download="playerstats.csv">Download CSV File</a>'  # download file name is arbitrary
53 |     return href
54 | 
55 | st.markdown(filedownload(df_selected_team), unsafe_allow_html=True)
56 | 
57 | # Heatmap
58 | if st.button('Intercorrelation Heatmap'):
59 |     st.header('Intercorrelation Matrix Heatmap')
60 |     df_selected_team.to_csv('output.csv',index=False)
61 |     df = pd.read_csv('output.csv')
62 | 
63 |     corr = df.corr()
64 |     mask = np.zeros_like(corr)
65 |     mask[np.triu_indices_from(mask)] = True
66 |     with sns.axes_style("white"):
67 |         f, ax = plt.subplots(figsize=(7, 5))
68 |         ax = sns.heatmap(corr, mask=mask, vmax=1, square=True)
69 |     st.pyplot()
70 | 
--------------------------------------------------------------------------------
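Quick start — a minimal sketch for trying the apps above, assuming Python with each app's listed libraries and R with the *shiny* package installed (the pip line below is illustrative, not a full dependency list):

    pip install streamlit yfinance
    streamlit run streamlit/part1/myapp.py    # any Streamlit app above runs the same way
    R -e 'shiny::runApp("shiny/005-bmi")'     # any Shiny app folder above runs the same way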