├── README.md
├── dhfr
│   ├── dhfr-classification-deploy.R
│   ├── dhfr-classification.R
│   ├── dhfr-data-understanding.R
│   ├── dhfr-handling-missing-data.R
│   └── dhfr-parallel-speed-up.R
├── iris
│   ├── iris-classification.R
│   └── iris-data-understanding.R
├── linear-regression
│   └── boston-housing-linear-regression.R
├── plot
│   └── scatter-plot
│       ├── aromatase.csv
│       └── code-scatter-plot.R
├── python-in-r
│   └── using-reticulate.R
├── python
│   ├── CDD_ML_Part_1_Acetylcholinesterase_Bioactivity_Data_Concised.ipynb
│   ├── CDD_ML_Part_1_Bioactivity_Data_Concised.ipynb
│   ├── CDD_ML_Part_1_bioactivity_data.ipynb
│   ├── CDD_ML_Part_2_Acetylcholinesterase_Exploratory_Data_Analysis.ipynb
│   ├── CDD_ML_Part_2_Exploratory_Data_Analysis.ipynb
│   ├── CDD_ML_Part_3_Acetylcholinesterase_Descriptor_Dataset_Preparation.ipynb
│   ├── CDD_ML_Part_4_Acetylcholinesterase_Regression_Random_Forest.ipynb
│   ├── CDD_ML_Part_5_Acetylcholinesterase_Compare_Regressors.ipynb
│   ├── Colab_File_handling_on_Google_Colab.ipynb
│   ├── How_to_build_a_simple_linear_regression_model_in_python.ipynb
│   ├── Hummingbird_ML.ipynb
│   ├── PCA_analysis.ipynb
│   ├── ROC_curve.ipynb
│   ├── ROC_curve_kNN.ipynb
│   ├── Sweetviz.ipynb
│   ├── cheminformatics_predicting_solubility.ipynb
│   ├── cheminformatics_predicting_solubility_2_1_PyCaret.ipynb
│   ├── cheminformatics_predicting_solubility_2_2_PyCaret.ipynb
│   ├── comparing-classifiers.ipynb
│   ├── google_colab_install_conda.ipynb
│   ├── google_colab_r_magic_command.ipynb
│   ├── google_colab_r_notebook.ipynb
│   ├── hyperparameter_tuning.ipynb
│   ├── iris
│   │   └── iris-classification-random-forest.ipynb
│   ├── klib.ipynb
│   ├── linear_regression.ipynb
│   ├── model_is_training_progress_bar.ipynb
│   ├── pandas-create-and-combine-dataframes.ipynb
│   ├── pandas_exploratory_data_analysis.ipynb
│   ├── pandas_profiling_example.ipynb
│   ├── pandas_read_html_for_webscraping.ipynb
│   ├── pandas_select_columns.ipynb
│   ├── pandas_styling_dataframe.ipynb
│   └── r_magic_command.ipynb
├── shiny
│   ├── 001-first-app
│   │   └── app.R
│   ├── 002-histogram
│   │   └── app.R
│   ├── 003-play-golf
│   │   └── app.R
│   ├── 004-iris-predictor
│   │   ├── app-numeric.R
│   │   ├── app-slider.R
│   │   ├── model.R
│   │   ├── model.rds
│   │   ├── testing.csv
│   │   └── training.csv
│   └── 005-bmi
│       ├── about.md
│       └── app.R
└── streamlit
    ├── part1
    │   ├── myapp.py
    │   └── myapp2.py
    ├── part10
    │   └── sp500-app.py
    ├── part12
    │   ├── crypto-price-app.py
    │   └── logo.jpg
    ├── part2
    │   └── iris-ml-app.py
    ├── part3
    │   ├── penguins-app.py
    │   ├── penguins-model-building.py
    │   ├── penguins_cleaned.csv
    │   ├── penguins_clf.pkl
    │   └── penguins_example.csv
    ├── part5
    │   └── basketball_app.py
    ├── part6
    │   └── boston-house-ml-app.py
    ├── part7
    │   ├── solubility-app.py
    │   ├── solubility-logo.jpg
    │   ├── solubility-web-app.ipynb
    │   └── solubility_model.pkl
    ├── part8
    │   ├── dna-app.py
    │   └── dna-logo.jpg
    └── part9
        └── football_app.py
/README.md:
--------------------------------------------------------------------------------
1 | # code
2 | This is a compilation of the R and Python code used in the tutorial videos on the **Data Professor** YouTube channel.
3 |
4 | Folder | Description
5 | ---|---
6 | [iris](https://github.com/dataprofessor/code/tree/master/iris) | Code for performing *exploratory data analysis* (so as to gain *data understanding*) and for building *classification models* of the Iris data set.
7 | [dhfr](https://github.com/dataprofessor/code/tree/master/dhfr) | Code for performing *exploratory data analysis* (so as to gain *data understanding*) and for building *classification models* of the DHFR data set.
8 | [python](https://github.com/dataprofessor/code/tree/master/python) | Code for various Python data science project tutorials.
9 | [shiny](https://github.com/dataprofessor/code/tree/master/shiny) | Code for building *web applications* in R with the *shiny* package.
10 |
11 | > Note: More to come. Please stay tuned!
12 |
--------------------------------------------------------------------------------
/dhfr/dhfr-classification-deploy.R:
--------------------------------------------------------------------------------
1 | ####################################
2 | # Data Professor #
3 | # http://youtube.com/dataprofessor #
4 | # http://github.com/dataprofessor #
5 | ####################################
6 |
7 | # Importing libraries
8 | library(datasets) # Contains several data sets
9 | library(caret) # Package for machine learning algorithms / CARET stands for Classification And REgression Training
10 |
11 | # Importing the dhfr data set
12 | data(dhfr)
13 |
14 | # Check for missing data
15 | sum(is.na(dhfr))
16 |
17 | # Set the random seed to make the model reproducible
18 | set.seed(100)
19 |
20 | # Performs stratified random split of the data set
21 | TrainingIndex <- createDataPartition(dhfr$Y, p=0.8, list = FALSE)
22 | TrainingSet <- dhfr[TrainingIndex,] # Training Set
23 | TestingSet <- dhfr[-TrainingIndex,] # Test Set
24 |
25 |
26 |
27 | ###############################
28 | # SVM model (polynomial kernel)
29 |
30 | # Build Training model
31 | Model <- train(Y ~ ., data = TrainingSet,
32 | method = "svmPoly",
33 | na.action = na.omit,
34 | preProcess=c("scale","center"),
35 | trControl= trainControl(method="none"),
36 | tuneGrid = data.frame(degree=1,scale=1,C=1)
37 | )
38 |
39 |
40 | # Save model to RDS file
41 |
42 | saveRDS(Model, "Model.rds")
43 |
44 | # Read the model from RDS file
45 |
46 | read.Model <- readRDS("Model.rds")
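# The model read back from the RDS file behaves like the original Model object,
# so a deployed app can load it and make predictions without retraining.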
47 |
48 |
49 | # Apply model for prediction
50 | Model.training <-predict(read.Model, TrainingSet) # Apply model to make prediction on Training set
51 | Model.testing <-predict(read.Model, TestingSet) # Apply model to make prediction on Testing set
52 |
53 | # Model performance (Displays confusion matrix and statistics)
54 | Model.training.confusion <-confusionMatrix(Model.training, TrainingSet$Y)
55 | Model.testing.confusion <-confusionMatrix(Model.testing, TestingSet$Y)
56 |
57 | print(Model.training.confusion)
58 | print(Model.testing.confusion)
59 |
60 | # Feature importance
61 | Importance <- varImp(Model)
62 | plot(Importance, top = 25)
63 | plot(Importance, col = "red")
64 |
--------------------------------------------------------------------------------
/dhfr/dhfr-classification.R:
--------------------------------------------------------------------------------
1 | ####################################
2 | # Data Professor #
3 | # http://youtube.com/dataprofessor #
4 | # http://github.com/dataprofessor #
5 | ####################################
6 |
7 | # Importing libraries
8 | library(datasets) # Contains several data sets
9 | library(caret) # Package for machine learning algorithms / CARET stands for Classification And REgression Training
10 |
11 | # Importing the dhfr data set
12 | data(dhfr)
13 |
14 | # Check for missing data
15 | sum(is.na(dhfr))
16 |
17 | # Set the random seed to make the model reproducible
18 | set.seed(100)
19 |
20 | # Performs stratified random split of the data set
21 | TrainingIndex <- createDataPartition(dhfr$Y, p=0.8, list = FALSE)
22 | TrainingSet <- dhfr[TrainingIndex,] # Training Set
23 | TestingSet <- dhfr[-TrainingIndex,] # Test Set
24 |
25 |
26 |
27 | ###############################
28 | # SVM model (polynomial kernel)
29 |
30 | # Build Training model
31 | Model <- train(Y ~ ., data = TrainingSet,
32 | method = "svmPoly",
33 | na.action = na.omit,
34 | preProcess=c("scale","center"),
35 | trControl= trainControl(method="none"),
36 | tuneGrid = data.frame(degree=1,scale=1,C=1)
37 | )
38 |
39 | # Build CV model
40 | Model.cv <- train(Y ~ ., data = TrainingSet,
41 | method = "svmPoly",
42 | na.action = na.omit,
43 | preProcess=c("scale","center"),
44 | trControl= trainControl(method="cv", number=10),
45 | tuneGrid = data.frame(degree=1,scale=1,C=1)
46 | )
47 |
48 |
49 | # Apply model for prediction
50 | Model.training <-predict(Model, TrainingSet) # Apply model to make prediction on Training set
51 | Model.testing <-predict(Model, TestingSet) # Apply model to make prediction on Testing set
52 | Model.cv <- predict(Model.cv, TrainingSet) # Apply the CV-trained model to the Training set (the cross-validation itself ran inside train())
53 |
54 | # Model performance (Displays confusion matrix and statistics)
55 | Model.training.confusion <-confusionMatrix(Model.training, TrainingSet$Y)
56 | Model.testing.confusion <-confusionMatrix(Model.testing, TestingSet$Y)
57 | Model.cv.confusion <-confusionMatrix(Model.cv, TrainingSet$Y)
58 |
59 | print(Model.training.confusion)
60 | print(Model.testing.confusion)
61 | print(Model.cv.confusion)
62 |
63 | # Feature importance
64 | Importance <- varImp(Model)
65 | plot(Importance, top = 25)
66 | plot(Importance, col = "red")
67 |
--------------------------------------------------------------------------------
/dhfr/dhfr-data-understanding.R:
--------------------------------------------------------------------------------
1 | ####################################
2 | # Data Professor #
3 | # http://youtube.com/dataprofessor #
4 | # http://github.com/dataprofessor #
5 | ####################################
6 |
7 | #########################
8 | # Loading DHFR data set
9 | #########################
10 |
11 | # Method 1
12 |
13 | library(caret) # the dhfr data set ships with the caret package, not datasets
14 | data(dhfr)
15 |
16 | # Method 2
17 | #data(dhfr, package = "caret")
18 |
19 | # Method 3
20 | # install.packages("RCurl")
21 |
22 | library(RCurl)
23 | dhfr <- read.csv(text = getURL("https://github.com/dataprofessor/data/raw/master/dhfr.csv") )
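# Note: read.csv() loads Y as a character column; if you load the data this way,
# convert it first, e.g. dhfr$Y <- as.factor(dhfr$Y), so that the grouped skim
# and the col = dhfr$Y plot below behave the same as with data(dhfr).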
24 |
25 | # View the data
26 | View(dhfr)
27 |
28 | #############################
29 | # Display summary statistics
30 | #############################
31 |
32 | # head() / tail()
33 | head(dhfr, 5)
34 | tail(dhfr, 5)
35 |
36 |
37 | # summary()
38 | summary(dhfr)
39 | summary(dhfr$Y)
40 |
41 |
42 | # Check for missing data
43 | sum(is.na(dhfr))
44 |
45 |
46 | # skimr's skim() - expands on summary() by providing a larger set of statistics
47 | # install.packages("skimr")
48 | # https://github.com/ropensci/skimr
49 |
50 | library(skimr)
51 | library(dplyr) # provides the %>% pipe and group_by() used below
52 | skim(dhfr) # Perform skim to display summary statistics
53 |
54 | # Group data by Y (biological activity) then perform skim
55 | dhfr %>%
56 | dplyr::group_by(Y) %>%
57 | skim()
58 |
59 | #############################
60 | # Quick data visualization
61 | #
62 | # R base plot()
63 | #############################
64 |
65 |
66 | # Panel plots (left commented out: dhfr has over 200 columns, far too many for a pairs plot)
67 | #plot(dhfr)
68 | #plot(dhfr, col = "red")
69 |
70 | # Scatter plot
71 | plot(dhfr$moe2D_zagreb, dhfr$moe2D_weinerPol)
72 |
73 | plot(dhfr$moe2D_zagreb, dhfr$moe2D_weinerPol, col = "red") # Makes red circles
74 |
75 | plot(dhfr$moe2D_zagreb, dhfr$moe2D_weinerPol, col = dhfr$Y) # Color by Y
76 |
77 | plot(dhfr$moe2D_zagreb, dhfr$moe2D_weinerPol, col = "red", # Makes red circles + Adds x and y axis labels
78 | xlab = "moe2D_zagreb", ylab = "moe2D_weinerPol")
79 |
80 | # Histogram
81 | hist(dhfr$moe2D_zagreb)
82 | hist(dhfr$moe2D_zagreb, col = "red") # Makes red bars
83 |
84 | # Feature plots
85 | # https://www.machinelearningplus.com/machine-learning/caret-package/
86 | featurePlot(x = dhfr[,2:21],
87 | y = dhfr$Y,
88 | plot = "box",
89 | strip=strip.custom(par.strip.text=list(cex=.7)),
90 | scales = list(x = list(relation="free"),
91 | y = list(relation="free")))
92 |
--------------------------------------------------------------------------------
/dhfr/dhfr-handling-missing-data.R:
--------------------------------------------------------------------------------
1 | ####################################
2 | # Data Professor #
3 | # http://youtube.com/dataprofessor #
4 | # http://github.com/dataprofessor #
5 | ####################################
6 |
7 |
8 | # 1. Loading the DHFR data
9 | library(RCurl)
10 | dhfr <- read.csv(text = getURL("https://raw.githubusercontent.com/dataprofessor/data/master/dhfr.csv") )
11 |
12 | View(dhfr)
13 |
14 |
15 | # 2. Check for missing data
16 |
17 | sum(is.na(dhfr))
18 |
19 |
20 | # 3. If the data is clean, randomly introduce NAs into the dataset
21 |
22 | na.gen <- function(data,n) {
23 | i <- 1
24 | while (i < n+1) {
25 |     idx1 <- sample(1:nrow(data), 1) # pick a random row
26 |     idx2 <- sample(1:ncol(data), 1) # pick a random column
27 | data[idx1,idx2] <- NA
28 | i = i+1
29 | }
30 | return(data)
31 | }
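# Note: cells are sampled with replacement across iterations, so the same cell
# can be hit more than once and the data may end up with slightly fewer than n NAs.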
32 |
33 |
34 | # Before introducing NA to the dataset, leave the Y class label (output variable) out
35 |
36 | dhfr <- dhfr[,-1]
37 |
38 |
39 | # Choose 1 of the first two calls to run (they produce the same result)
40 |
41 | dhfr <- na.gen(dhfr,100)
42 |
43 | dhfr <- na.gen(n=100,data=dhfr)
44 |
45 | dhfr <- na.gen(100,dhfr) # This produces an error, why?
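# Why: with positional matching, 100 is taken as `data` and the dhfr data frame
# as `n`, so the function body no longer makes sense and the call fails.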
46 |
47 |
48 | # 4. Check again for missing data
49 |
50 | sum(is.na(dhfr))
51 |
52 | colSums(is.na(dhfr))
53 |
54 | str(dhfr)
55 |
56 |
57 | # Lists rows with missing data
58 |
59 | missingdata <- dhfr[!complete.cases(dhfr), ]
60 |
61 | sum(is.na(missingdata))
62 |
63 |
64 | # If the above sum is 0, there is no missing data and you can proceed to modeling.
65 | # If the above sum is greater than 0, then proceed to # 5
66 |
67 |
68 | # 5. Handling the missing data. There are 2 options; choose only 1
69 |
70 | # 5.1. Simply delete all rows with missing data
71 |
72 | clean.data <- na.omit(dhfr)
73 |
74 | sum(is.na(clean.data))
75 |
76 |
77 | # 5.2. Imputation: Replace missing values with the column's MEAN or MEDIAN
78 |
79 | # MEAN
80 | dhfr.impute <- dhfr
81 |
82 | for (i in which(sapply(dhfr.impute, is.numeric))) { # for each numeric column
83 |   dhfr.impute[is.na(dhfr.impute[, i]), i] <- mean(dhfr.impute[, i], na.rm = TRUE) # fill NAs with the column mean
84 | }
85 |
86 | sum(is.na(dhfr.impute))
87 |
88 |
89 | # MEDIAN
90 | dhfr.impute <- dhfr
91 |
92 | for (i in which(sapply(dhfr.impute, is.numeric))) { # for each numeric column
93 |   dhfr.impute[is.na(dhfr.impute[, i]), i] <- median(dhfr.impute[, i], na.rm = TRUE) # fill NAs with the column median
94 | }
95 |
96 | sum(is.na(dhfr.impute))
97 |
--------------------------------------------------------------------------------
/dhfr/dhfr-parallel-speed-up.R:
--------------------------------------------------------------------------------
1 | ####################################
2 | # Data Professor #
3 | # http://youtube.com/dataprofessor #
4 | # http://github.com/dataprofessor #
5 | ####################################
6 |
7 | # Importing libraries
8 | library(datasets) # Contains several data sets
9 | library(caret) # Package for machine learning algorithms / CARET stands for Classification And REgression Training
10 |
11 | # Importing the dhfr data set
12 | data(dhfr)
13 |
14 | # Check for missing data
15 | sum(is.na(dhfr))
16 |
17 | # Set the random seed to make the model reproducible
18 | set.seed(100)
19 |
20 | # Performs stratified random split of the data set
21 | TrainingIndex <- createDataPartition(dhfr$Y, p=0.8, list = FALSE)
22 | TrainingSet <- dhfr[TrainingIndex,] # Training Set
23 | TestingSet <- dhfr[-TrainingIndex,] # Test Set
24 |
25 |
26 |
27 | ###############################
28 | # Random forest
29 |
30 |
31 | # Run normally without parallel processing
32 | start.time <- proc.time()
33 | Model <- train(Y ~ .,
34 | data = TrainingSet, # Build model using training set
35 | method = "rf" # Learning algorithm
36 | )
37 | stop.time <- proc.time()
38 | run.time <- stop.time - start.time
39 | print(run.time)
40 |
41 |
42 |
43 | # Use doParallel
44 | # https://topepo.github.io/caret/parallel-processing.html
45 |
46 | library(doParallel)
47 |
48 | cl <- makePSOCKcluster(5)
49 | registerDoParallel(cl)
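# Assumption: 5 workers suit the demo machine; in general, size the cluster to
# your hardware, e.g. cl <- makePSOCKcluster(parallel::detectCores() - 1)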
50 |
51 | start.time <- proc.time()
52 | Model <- train(Y ~ .,
53 | data = TrainingSet, # Build model using training set
54 | method = "rf" # Learning algorithm
55 | )
56 | stop.time <- proc.time()
57 | run.time <- stop.time - start.time
58 | print(run.time)
59 |
60 | stopCluster(cl)
61 |
62 |
63 |
64 |
65 | ##########################
66 |
67 | # Run without parallel processing
68 |
69 | start.time <- proc.time()
70 | Model <- train(Y ~ .,
71 | data = TrainingSet, # Build model using training set
72 | method = "rf", # Learning algorithm
73 | tuneGrid = data.frame(mtry = seq(5,15, by=5))
74 | )
75 | stop.time <- proc.time()
76 | run.time <- stop.time - start.time
77 | print(run.time)
78 |
79 | # Using doParallel
80 |
81 | library(doParallel)
82 |
83 | cl <- makePSOCKcluster(5)
84 | registerDoParallel(cl)
85 |
86 | start.time <- proc.time()
87 | Model <- train(Y ~ .,
88 | data = TrainingSet, # Build model using training set
89 | method = "rf", # Learning algorithm
90 | tuneGrid = data.frame(mtry = seq(5,15, by=5))
91 | )
92 | stop.time <- proc.time()
93 | run.time <- stop.time - start.time
94 | print(run.time)
95 |
96 | stopCluster(cl)
97 |
98 |
99 | ##########################
100 | # Apply model for prediction
101 | Model.training <-predict(Model, TrainingSet) # Apply model to make prediction on Training set
102 |
103 | # Model performance (Displays confusion matrix and statistics)
104 | Model.training.confusion <-confusionMatrix(Model.training, TrainingSet$Y)
105 |
106 | print(Model.training.confusion)
107 |
108 | # Feature importance
109 | Importance <- varImp(Model)
110 | plot(Importance, top = 25)
111 | plot(Importance, col = "red")
112 |
--------------------------------------------------------------------------------
/iris/iris-classification.R:
--------------------------------------------------------------------------------
1 | ####################################
2 | # Data Professor #
3 | # http://youtube.com/dataprofessor #
4 | # http://github.com/dataprofessor #
5 | ####################################
6 |
7 | # Importing libraries
8 | library(datasets) # Contains the Iris data set
9 | library(caret) # Package for machine learning algorithms / CARET stands for Classification And REgression Training
10 |
11 | # Importing the Iris data set
12 | data(iris)
13 |
14 | # Check for missing data
15 | sum(is.na(iris))
16 |
17 | # Set the random seed to make the model reproducible
18 | set.seed(100)
19 |
20 | # Performs stratified random split of the data set
21 | TrainingIndex <- createDataPartition(iris$Species, p=0.8, list = FALSE)
22 | TrainingSet <- iris[TrainingIndex,] # Training Set
23 | TestingSet <- iris[-TrainingIndex,] # Test Set
24 |
25 | # Compare scatter plot of the 80 and 20 data subsets
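# One possible sketch (assumption: comparing the sepal dimensions of the two subsets):
plot(TrainingSet$Sepal.Width, TrainingSet$Sepal.Length, col = "blue", pch = 16) # 80% Training subset
points(TestingSet$Sepal.Width, TestingSet$Sepal.Length, col = "red", pch = 16)  # 20% Testing subset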
26 |
27 |
28 |
29 |
30 | ###############################
31 | # SVM model (polynomial kernel)
32 |
33 | # Build Training model
34 | Model <- train(Species ~ ., data = TrainingSet,
35 | method = "svmPoly",
36 | na.action = na.omit,
37 | preProcess=c("scale","center"),
38 | trControl= trainControl(method="none"),
39 | tuneGrid = data.frame(degree=1,scale=1,C=1)
40 | )
41 |
42 | # Build CV model
43 | Model.cv <- train(Species ~ ., data = TrainingSet,
44 | method = "svmPoly",
45 | na.action = na.omit,
46 | preProcess=c("scale","center"),
47 | trControl= trainControl(method="cv", number=10),
48 | tuneGrid = data.frame(degree=1,scale=1,C=1)
49 | )
50 |
51 |
52 | # Apply model for prediction
53 | Model.training <-predict(Model, TrainingSet) # Apply model to make prediction on Training set
54 | Model.testing <-predict(Model, TestingSet) # Apply model to make prediction on Testing set
55 | Model.cv <- predict(Model.cv, TrainingSet) # Apply the CV-trained model to the Training set (the cross-validation itself ran inside train())
56 |
57 | # Model performance (Displays confusion matrix and statistics)
58 | Model.training.confusion <-confusionMatrix(Model.training, TrainingSet$Species)
59 | Model.testing.confusion <-confusionMatrix(Model.testing, TestingSet$Species)
60 | Model.cv.confusion <-confusionMatrix(Model.cv, TrainingSet$Species)
61 |
62 | print(Model.training.confusion)
63 | print(Model.testing.confusion)
64 | print(Model.cv.confusion)
65 |
66 | # Feature importance
67 | Importance <- varImp(Model)
68 | plot(Importance)
69 | plot(Importance, col = "red")
70 |
--------------------------------------------------------------------------------
/iris/iris-data-understanding.R:
--------------------------------------------------------------------------------
1 | ####################################
2 | # Data Professor #
3 | # http://youtube.com/dataprofessor #
4 | # http://github.com/dataprofessor #
5 | ####################################
6 |
7 | #########################
8 | # Loading Iris data set
9 | #########################
10 |
11 | # Method 1
12 |
13 | library(datasets)
14 | data(iris)
15 |
16 | iris2 <- datasets::iris
17 |
18 | # Method 2
19 | # install.packages("RCurl")
20 |
21 | library(RCurl)
22 | iris3 <- read.csv(text = getURL("https://raw.githubusercontent.com/dataprofessor/data/master/iris.csv") )
23 |
24 | # View the data
25 | View(iris)
26 |
27 | #############################
28 | # Display summary statistics
29 | #############################
30 |
31 | # head() / tail()
32 | head(iris, 5)
33 | tail(iris, 5)
34 |
35 |
36 | # summary()
37 | summary(iris)
38 | summary(iris$Sepal.Length)
39 |
40 |
41 | # Check for missing data
42 | sum(is.na(iris))
43 |
44 |
45 | # skimr's skim() - expands on summary() by providing a larger set of statistics
46 | # install.packages("skimr")
47 | # https://github.com/ropensci/skimr
48 |
49 | library(skimr)
50 | library(dplyr) # provides the %>% pipe and group_by() used below
51 | skim(iris) # Perform skim to display summary statistics
52 |
53 | # Group data by Species then perform skim
54 | iris %>%
55 | dplyr::group_by(Species) %>%
56 | skim()
57 |
58 | #############################
59 | # Quick data visualization
60 | #
61 | # R base plot()
62 | #############################
63 |
64 |
65 | # Panel plots
66 | plot(iris)
67 | plot(iris, col = "red")
68 |
69 | # Scatter plot
70 | plot(iris$Sepal.Width, iris$Sepal.Length)
71 |
72 | plot(iris$Sepal.Width, iris$Sepal.Length, col = "red") # Makes red circles
73 |
74 | plot(iris$Sepal.Width, iris$Sepal.Length, col = "red", # Makes red circles + Adds x and y axis labels
75 | xlab = "Sepal width", ylab = "Sepal length")
76 |
77 | # Histogram
78 | hist(iris$Sepal.Width)
79 | hist(iris$Sepal.Width, col = "red") # Makes red bars
80 |
81 | # Feature plots
82 | # https://www.machinelearningplus.com/machine-learning/caret-package/
83 | featurePlot(x = iris[,1:4],
84 | y = iris$Species,
85 | plot = "box",
86 | strip=strip.custom(par.strip.text=list(cex=.7)),
87 | scales = list(x = list(relation="free"),
88 | y = list(relation="free")))
89 |
90 |
--------------------------------------------------------------------------------
/linear-regression/boston-housing-linear-regression.R:
--------------------------------------------------------------------------------
1 | ############################################
2 | # Data Professor #
3 | # http://youtube.com/dataprofessor #
4 | # http://github.com/dataprofessor #
5 | # http://facebook.com/dataprofessor #
6 | # https://www.instagram.com/data.professor #
7 | ############################################
8 |
9 | # Importing libraries
10 | library(mlbench) # Contains several benchmark data sets (especially the Boston Housing dataset)
11 | library(caret) # Package for machine learning algorithms / CARET stands for Classification And REgression Training
12 |
13 | # Importing the Boston Housing data set
14 | data(BostonHousing)
15 |
16 | head(BostonHousing)
17 |
18 | # Check for missing data
19 | sum(is.na(BostonHousing))
20 |
22 | # Set the random seed to make the model reproducible
22 | set.seed(100)
23 |
24 | # Performs stratified random split of the data set
25 | TrainingIndex <- createDataPartition(BostonHousing$medv, p=0.8, list = FALSE)
26 | TrainingSet <- BostonHousing[TrainingIndex,] # Training Set
27 | TestingSet <- BostonHousing[-TrainingIndex,] # Test Set
28 |
29 |
30 | ###############################
31 |
32 | # Build Training model
33 | Model <- train(medv ~ ., data = TrainingSet,
34 | method = "lm",
35 | na.action = na.omit,
36 | preProcess=c("scale","center"),
37 | trControl= trainControl(method="none")
38 | )
39 |
40 | # Apply model for prediction
41 | Model.training <-predict(Model, TrainingSet) # Apply model to make prediction on Training set
42 | Model.testing <-predict(Model, TestingSet) # Apply model to make prediction on Testing set
43 |
44 | # Model performance (displays scatter plots and performance metrics)
45 | # Scatter plots of observed vs. predicted medv
46 | plot(TrainingSet$medv,Model.training, col = "blue" ) # Training set
47 | plot(TestingSet$medv,Model.testing, col = "blue" )  # Testing set
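# A minimal sketch of the promised performance metrics (assumption: squared
# Pearson correlation between observed and predicted medv as R-squared):
cor(TrainingSet$medv, Model.training)^2 # R-squared on the Training set
cor(TestingSet$medv, Model.testing)^2   # R-squared on the Testing set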
48 |
--------------------------------------------------------------------------------
/plot/scatter-plot/code-scatter-plot.R:
--------------------------------------------------------------------------------
1 | ############################################
2 | # Data Professor #
3 | # http://youtube.com/dataprofessor #
4 | # http://github.com/dataprofessor #
5 | # http://facebook.com/dataprofessor #
6 | # https://www.instagram.com/data.professor #
7 | ############################################
8 |
9 | ######## READ DATA
10 | # https://link.springer.com/article/10.1007%2Fs11030-013-9462-x
11 | # 11030_2013_9462_MOESM2_ESM.xls (423 kb)
12 | # Supplementary material 2 (xls 423 KB)
13 | aromatase <- read.csv("aromatase.csv")
14 |
15 | ######## MISSING DATA
16 | sum(is.na(aromatase))
17 | missingdata <- aromatase[!complete.cases(aromatase), ] # Identify which rows contain missing data
18 |
19 | aromatase <- na.omit(aromatase) # Remove any missing data >> Complete case
20 | sum(is.na(aromatase)) # Check again for missing data
21 |
22 | class <- aromatase[ ,2] # Class label
23 | aromatase2 <- aromatase[,6:18] # Descriptors
24 | aromatase3 <- cbind(class, aromatase2) # Combine Class label + Descriptors into same dataframe
25 |
26 | df <- aromatase3 # Once we are satisfied with the dataset, let's call it "df" for conciseness
27 |
28 |
29 | ######## plot()
30 |
31 | # See at a glance all possible scatter plots
32 | plot(df)
33 | plot(df , col = "blue")
34 |
35 | # Select a pair of interest to visualize scatter plot
36 |
37 | # Figure 1, https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0066566
38 |
39 |
40 | plot(df$MW, df$ALogP)
41 |
42 |
43 | # We're going to make Steroids blue and Non-Steroids red
44 | library(colorspace)
45 | df$color <- factor(df$class,
46 | levels=c("Steroid", "Non-Steroid"),
47 | labels=c("blue", "red"))
48 | plot(df$MW, df$ALogP, pch = 16, col=as.character(df$color) )
49 |
50 |
51 |
52 |
53 | # col argument for defining the color
54 | # R has 657 colors, colors() function lists these colors
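head(colors(), 10) # preview the first few of the built-in color names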
55 | plot(df$MW, df$ALogP, col = "red")
56 | plot(df$MW, df$ALogP, col = "blue")
57 | plot(df$MW, df$ALogP, col = "green")
58 | plot(df$MW, df$ALogP, col = "purple")
59 |
60 | plot(df$MW, df$ALogP, col = "orangered3")
61 |
62 | plot(df$MW, df$ALogP, col = "#FF0000") # Hex color code for red
63 |
64 | # Color in RGB color code
65 | rgb(1,0,0) # red color, returns "#FF0000"
66 | rgb(255,0,0, max=255) # red color, returns "#FF0000"
67 |
68 | plot(df$MW, df$ALogP, col = rgb(0,0,0, max=255) ) # rgb(0,0,0) is black
69 |
70 |
71 |
72 | # symbols
73 |
74 | plot(df$MW, df$ALogP, pch = 1) # pch = 1, open circles (the default value)
75 | # There are 26 standard symbols to choose from (pch = 0 to 25)
76 | plot(df$MW, df$ALogP, pch = 2) # pch = 2, open triangle symbols
77 | plot(df$MW, df$ALogP, pch = 3) # pch = 3, plus symbols
78 | plot(df$MW, df$ALogP, pch = 4) # pch = 4, x symbols
79 | plot(df$MW, df$ALogP, pch = 5) # pch = 5, diamond symbols
80 | plot(df$MW, df$ALogP, pch = 16) # pch = 16, filled circle symbols
81 |
82 |
83 | plot(df$MW, df$ALogP, pch = 16, col = "orangered3")
84 | col2rgb("orangered3") # This gives us rgb(205,55,0, max=255)
85 | plot(df$MW, df$ALogP, pch = 16, col = rgb(205,55,0, max=255))
86 |
87 |
88 | # Add transparency to color
89 |
90 | library(scales)
91 |
92 | plot(df$MW, df$ALogP, pch = 16,
93 | col = alpha("orangered3", 0.3))
94 |
95 | plot(df$MW, df$ALogP, pch = 16,
96 | col = rgb(205,55,0, 75, max=255))
97 |
98 | plot(df$MW, df$ALogP, pch = 16, col=alpha(as.character(df$color),0.3 ) )
99 |
100 |
101 | ##################################
102 | # Multi-plot
103 |
104 | # Scatter plot of first pair
105 | plot(df$MW, df$ALogP, pch = 16,
106 | col = alpha("red", 0.3),
107 | xlab = "Molecular Weight (MW)", # X-axis label
108 |      ylab = "Lipophilicity (ALogP)", # Y-axis label
109 | font.lab = 2 # X and Y labels are now bold
110 | )
111 | abline(lm(df$ALogP ~ df$MW)) # Trend line
112 |
113 |
114 | # Scatter plot of second pair
115 | plot(df$MW, df$Qm, pch = 16,
116 | col = alpha("blue", 0.3),
117 | xlab = "MW", # X-axis label
118 | ylab = "Qm", # Y-axis label
119 | font.lab = 2 # X and Y labels are now bold
120 | )
121 | abline(lm(df$Qm ~ df$MW)) # Trend line
122 |
123 |
124 | # Scatter plot of third pair
125 | plot(df$HOMO, df$LUMO, pch = 16,
126 | col = alpha("green", 0.3),
127 | xlab = "HOMO", # X-axis label
128 | ylab = "LUMO", # Y-axis label
129 | font.lab = 2 # X and Y labels are now bold
130 | )
131 | abline(lm(df$LUMO ~ df$HOMO)) # Trend line
132 |
133 |
134 | # Scatter plot of fourth pair
135 | plot(df$MW, df$HOMO, pch = 16,
136 | col = alpha("purple", 0.3),
137 | xlab = "MW", # X-axis label
138 | ylab = "HOMO", # Y-axis label
139 | font.lab = 2 # X and Y labels are now bold
140 | )
141 | abline(lm(df$HOMO ~ df$MW)) # Trend line
142 |
143 |
144 | ######## Creating multi-plot figures
145 |
146 | # 2 rows by 2 columns
147 |
148 | par(mfrow=c(2,2))
149 | # Plot 1
150 | # Plot 2
151 | # Plot 3
152 | # Plot 4
153 |
154 | par(mfrow=c(2,2), mai = c(0.7, 0.7, 0.3, 0.3))
155 | plot(df$MW, df$ALogP) # Plot 1
156 | plot(df$MW, df$Qm) # Plot 2
157 | plot(df$HOMO, df$LUMO) # Plot 3
158 | plot(df$MW, df$HOMO) # Plot 4
159 |
160 |
161 | # 3 rows by 1 column
162 |
163 | par(mfrow=c(3,1))
164 | # Plot 1
165 | # Plot 2
166 | # Plot 4
167 |
168 | par(mfrow=c(3,1), mai = c(0.3, 0.7, 0.1, 0.3))
169 | plot(df$MW, df$ALogP) # Plot 1
170 | plot(df$MW, df$Qm) # Plot 2
171 | plot(df$MW, df$HOMO) # Plot 4
172 |
173 |
174 | # 1 row by 3 columns (only 3 of the 4 plots fit, so Plot 3 is skipped below)
175 |
176 | par(mfrow=c(1,3))
177 | # Plot 1
178 | # Plot 2
179 | # Plot 4
180 |
181 |
182 | par(mfrow=c(1,3), mai = c(0.3, 0.3, 0.3, 0.3))
183 | plot(df$MW, df$ALogP) # Plot 1
184 | plot(df$MW, df$Qm) # Plot 2
185 | plot(df$MW, df$HOMO) # Plot 4
186 |
187 | par(mfrow=c(1,3), mai = c(0.3, 0.3, 0.3, 0))
188 | plot(df$ALogP, df$MW) # Plot 1
189 | plot(df$Qm, df$MW) # Plot 2
190 | plot(df$HOMO, df$MW) # Plot 4
191 |
192 |
193 | ######## Saving plot to file
194 |
195 | # Single plot
196 |
197 | pdf("plot.pdf")
198 | #...Insert plot function here...
199 | dev.off()
200 |
201 | pdf("plot.pdf")
202 | plot(df$ALogP, df$MW)
203 | dev.off()
204 |
205 | # Multi-plot
206 |
207 | pdf("plot2.pdf")
208 | par(mfrow=c(2,2))
209 | # Plot 1
210 | # Plot 2
211 | # Plot 3
212 | # Plot 4
213 | dev.off()
214 |
215 | pdf("plot_multiplot.pdf")
216 | par(mfrow=c(1,3), mai = c(0.3, 0.3, 0.3, 0))
217 | plot(df$ALogP, df$MW) # Plot 1
218 | plot(df$Qm, df$MW) # Plot 2
219 | plot(df$HOMO, df$MW) # Plot 4
220 | dev.off()
221 |
222 | pdf("plot2.pdf")
223 | par(mfrow=c(2,2), mai = c(0.7, 0.7, 0.3, 0.3))
224 | # Plot 1
225 | # Plot 2
226 | # Plot 3
227 | # Plot 4
228 | dev.off()
229 |
--------------------------------------------------------------------------------
/python-in-r/using-reticulate.R:
--------------------------------------------------------------------------------
1 | ############################################
2 | # Data Professor #
3 | # http://youtube.com/dataprofessor #
4 | # http://github.com/dataprofessor #
5 | # http://facebook.com/dataprofessor #
6 | # https://www.instagram.com/data.professor #
7 | ############################################
8 |
9 | # https://rstudio.github.io/reticulate/
10 | # install.packages("reticulate")
11 | library(reticulate)
12 |
13 | # Loads the Python shell (type exit to return to R)
14 | repl_python()
15 |
16 | # Check the current Python version
17 |
18 | reticulate::py_config()
19 |
20 | # Load a particular Python version on our system
21 | use_python("C:/Program Files/Python38", required = TRUE)
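# Note: use_python() only takes effect if it is called before Python has been
# initialized (i.e. before repl_python() / py_config() above). The path is
# machine-specific, so adjust it to wherever Python lives on your system.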
22 |
23 |
24 |
25 | ############################
26 | #
27 | # matplotlib Example - Scatter plot
28 | # https://matplotlib.org/3.1.1/gallery/shapes_and_collections/scatter.html#sphx-glr-gallery-shapes-and-collections-scatter-py
29 | #
30 | ############################
31 |
32 | ############################
33 | # Import libraries
34 | ############################
35 |
36 | # import matplotlib.pyplot as plt
37 | plt <- import('matplotlib.pyplot')
38 |
39 | # import numpy as np
40 | np <- import('numpy')
41 |
42 | ############################
43 | # Load the Iris dataset
44 | ############################
45 | data(iris)
46 |
47 |
48 | ############################
49 | # Fixing random state for reproducibility
50 | ############################
51 |
52 | # np.random.seed(19680801) # https://github.com/rstudio/reticulate/issues/226
53 | np$random$seed(19680801L)
54 |
55 | # N = 50
56 | N <- 50L
57 | # x = np.random.rand(N)
58 | x <- np$random$rand(N)
59 |
60 | # y = np.random.rand(N)
61 | y <- np$random$rand(N)
62 |
63 | # colors = np.random.rand(N)
64 | colors <- np$random$rand(N)
65 |
66 | # area = (30 * np.random.rand(N))**2 # 0 to 15 point radii
67 | area <- (30 * np$random$rand(N))**2
68 |
69 | # plt.scatter(x, y, s=area, c=colors, alpha=0.5)
70 | plt$scatter(x, y, s=area, c=colors, alpha=0.5)
71 |
72 | # plt.show()
73 | plt$show()
74 |
--------------------------------------------------------------------------------
/python/Hummingbird_ML.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "name": "Hummingbird-ML.ipynb",
7 | "provenance": [],
8 | "collapsed_sections": []
9 | },
10 | "kernelspec": {
11 | "name": "python3",
12 | "display_name": "Python 3"
13 | },
14 | "accelerator": "GPU"
15 | },
16 | "cells": [
17 | {
18 | "cell_type": "markdown",
19 | "metadata": {
20 | "id": "IpOdlr3WAPHJ",
21 | "colab_type": "text"
22 | },
23 | "source": [
24 | "# **Hummingbird-ML**\n",
25 | "\n",
26 | "[How to Harness GPU to Speed Up Machine Learning with Hummingbird-ML](https://www.youtube.com/watch?v=qN8jcUmo8TI)\n",
27 | "\n",
28 | "Adapted from: https://github.com/microsoft/hummingbird"
29 | ]
30 | },
31 | {
32 | "cell_type": "markdown",
33 | "metadata": {
34 | "id": "ir3DZd5-_jiu",
35 | "colab_type": "text"
36 | },
37 | "source": [
38 | "# Install Hummingbird-ML"
39 | ]
40 | },
41 | {
42 | "cell_type": "code",
43 | "metadata": {
44 | "id": "ra3JEgWN_bfp",
45 | "colab_type": "code",
46 | "colab": {
47 | "base_uri": "https://localhost:8080/",
48 | "height": 408
49 | },
50 | "outputId": "4fae39de-26f0-4939-846d-039fb876725a"
51 | },
52 | "source": [
53 | "! pip install hummingbird-ml[extra]"
54 | ],
55 | "execution_count": 1,
56 | "outputs": [
57 | {
58 | "output_type": "stream",
59 | "text": [
60 | "Collecting hummingbird-ml[extra]\n",
61 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/ed/3b/cf1b8c1e7531377adead8de29e29b00b5aed380544ad0def4c0188b50d80/hummingbird_ml-0.0.5-py2.py3-none-any.whl (60kB)\n",
62 | "\r\u001b[K |█████▌ | 10kB 16.6MB/s eta 0:00:01\r\u001b[K |███████████ | 20kB 1.8MB/s eta 0:00:01\r\u001b[K |████████████████▍ | 30kB 2.2MB/s eta 0:00:01\r\u001b[K |█████████████████████▉ | 40kB 2.5MB/s eta 0:00:01\r\u001b[K |███████████████████████████▎ | 51kB 2.0MB/s eta 0:00:01\r\u001b[K |████████████████████████████████| 61kB 1.8MB/s \n",
63 | "\u001b[?25hRequirement already satisfied: numpy>=1.15 in /usr/local/lib/python3.6/dist-packages (from hummingbird-ml[extra]) (1.18.5)\n",
64 | "Requirement already satisfied: torch>=1.4.* in /usr/local/lib/python3.6/dist-packages (from hummingbird-ml[extra]) (1.6.0+cu101)\n",
65 | "Collecting onnxconverter-common>=1.6.0\n",
66 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/fe/7a/7e30c643cd7d2ad87689188ef34ce93e657bd14da3605f87bcdbc19cd5b1/onnxconverter_common-1.7.0-py2.py3-none-any.whl (64kB)\n",
67 | "\u001b[K |████████████████████████████████| 71kB 3.7MB/s \n",
68 | "\u001b[?25hRequirement already satisfied: scikit-learn>=0.22.1 in /usr/local/lib/python3.6/dist-packages (from hummingbird-ml[extra]) (0.22.2.post1)\n",
69 | "Requirement already satisfied: xgboost==0.90; extra == \"extra\" in /usr/local/lib/python3.6/dist-packages (from hummingbird-ml[extra]) (0.90)\n",
70 | "Requirement already satisfied: lightgbm>=2.2; extra == \"extra\" in /usr/local/lib/python3.6/dist-packages (from hummingbird-ml[extra]) (2.2.3)\n",
71 | "Requirement already satisfied: future in /usr/local/lib/python3.6/dist-packages (from torch>=1.4.*->hummingbird-ml[extra]) (0.16.0)\n",
72 | "Collecting onnx\n",
73 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/36/ee/bc7bc88fc8449266add978627e90c363069211584b937fd867b0ccc59f09/onnx-1.7.0-cp36-cp36m-manylinux1_x86_64.whl (7.4MB)\n",
74 | "\u001b[K |████████████████████████████████| 7.4MB 16.0MB/s \n",
75 | "\u001b[?25hRequirement already satisfied: protobuf in /usr/local/lib/python3.6/dist-packages (from onnxconverter-common>=1.6.0->hummingbird-ml[extra]) (3.12.4)\n",
76 | "Requirement already satisfied: joblib>=0.11 in /usr/local/lib/python3.6/dist-packages (from scikit-learn>=0.22.1->hummingbird-ml[extra]) (0.16.0)\n",
77 | "Requirement already satisfied: scipy>=0.17.0 in /usr/local/lib/python3.6/dist-packages (from scikit-learn>=0.22.1->hummingbird-ml[extra]) (1.4.1)\n",
78 | "Requirement already satisfied: typing-extensions>=3.6.2.1 in /usr/local/lib/python3.6/dist-packages (from onnx->onnxconverter-common>=1.6.0->hummingbird-ml[extra]) (3.7.4.3)\n",
79 | "Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from onnx->onnxconverter-common>=1.6.0->hummingbird-ml[extra]) (1.15.0)\n",
80 | "Requirement already satisfied: setuptools in /usr/local/lib/python3.6/dist-packages (from protobuf->onnxconverter-common>=1.6.0->hummingbird-ml[extra]) (49.6.0)\n",
81 | "Installing collected packages: onnx, onnxconverter-common, hummingbird-ml\n",
82 | "Successfully installed hummingbird-ml-0.0.5 onnx-1.7.0 onnxconverter-common-1.7.0\n"
83 | ],
84 | "name": "stdout"
85 | }
86 | ]
87 | },
88 | {
89 | "cell_type": "markdown",
90 | "metadata": {
91 | "id": "YnA-PmeA_q70",
92 | "colab_type": "text"
93 | },
94 | "source": [
95 | "# Import libraries"
96 | ]
97 | },
98 | {
99 | "cell_type": "code",
100 | "metadata": {
101 | "id": "lkIThThi_puf",
102 | "colab_type": "code",
103 | "colab": {}
104 | },
105 | "source": [
106 | "import numpy as np\n",
107 | "from sklearn.ensemble import RandomForestClassifier\n",
108 | "from hummingbird.ml import convert"
109 | ],
110 | "execution_count": 2,
111 | "outputs": []
112 | },
113 | {
114 | "cell_type": "markdown",
115 | "metadata": {
116 | "id": "rFw_4cGa_-tF",
117 | "colab_type": "text"
118 | },
119 | "source": [
120 | "# Create some random data for binary classification"
121 | ]
122 | },
123 | {
124 | "cell_type": "code",
125 | "metadata": {
126 | "id": "hGGngPPp__mx",
127 | "colab_type": "code",
128 | "colab": {}
129 | },
130 | "source": [
131 | "num_classes = 2\n",
132 | "X = np.random.rand(100000, 28)\n",
133 | "y = np.random.randint(num_classes, size=100000)"
134 | ],
135 | "execution_count": 3,
136 | "outputs": []
137 | },
138 | {
139 | "cell_type": "markdown",
140 | "metadata": {
141 | "id": "WusxNKH4AHII",
142 | "colab_type": "text"
143 | },
144 | "source": [
145 | "# Create and train a model (scikit-learn RandomForestClassifier)"
146 | ]
147 | },
148 | {
149 | "cell_type": "code",
150 | "metadata": {
151 | "id": "GMRJRuBwAGeV",
152 | "colab_type": "code",
153 | "colab": {}
154 | },
155 | "source": [
156 | "skl_model = RandomForestClassifier(n_estimators=10, max_depth=10)"
157 | ],
158 | "execution_count": 4,
159 | "outputs": []
160 | },
161 | {
162 | "cell_type": "code",
163 | "metadata": {
164 | "id": "M_kGo80yAYTn",
165 | "colab_type": "code",
166 | "colab": {
167 | "base_uri": "https://localhost:8080/",
168 | "height": 34
169 | },
170 | "outputId": "aa863652-02f8-4578-8fb7-e3b028685cd7"
171 | },
172 | "source": [
173 | "%%timeit\n",
174 | "skl_model.fit(X, y)"
175 | ],
176 | "execution_count": 5,
177 | "outputs": [
178 | {
179 | "output_type": "stream",
180 | "text": [
181 | "1 loop, best of 3: 4.78 s per loop\n"
182 | ],
183 | "name": "stdout"
184 | }
185 | ]
186 | },
187 | {
188 | "cell_type": "code",
189 | "metadata": {
190 | "id": "Hp4a8I0tAbBl",
191 | "colab_type": "code",
192 | "colab": {
193 | "base_uri": "https://localhost:8080/",
194 | "height": 34
195 | },
196 | "outputId": "4e083fd5-981f-4238-9158-3f4500585560"
197 | },
198 | "source": [
199 | "%%timeit\n",
200 | "skl_model.predict(X)"
201 | ],
202 | "execution_count": 6,
203 | "outputs": [
204 | {
205 | "output_type": "stream",
206 | "text": [
207 | "10 loops, best of 3: 85.6 ms per loop\n"
208 | ],
209 | "name": "stdout"
210 | }
211 | ]
212 | },
213 | {
214 | "cell_type": "markdown",
215 | "metadata": {
216 | "id": "mNiBvy9BA7wR",
217 | "colab_type": "text"
218 | },
219 | "source": [
220 | "# Use Hummingbird to convert the model to PyTorch"
221 | ]
222 | },
223 | {
224 | "cell_type": "code",
225 | "metadata": {
226 | "id": "vcAOpuxxAzPc",
227 | "colab_type": "code",
228 | "colab": {}
229 | },
230 | "source": [
231 | "model = convert(skl_model, 'pytorch')"
232 | ],
233 | "execution_count": 7,
234 | "outputs": []
235 | },
236 | {
237 | "cell_type": "markdown",
238 | "metadata": {
239 | "id": "dpt6_4l8BF7e",
240 | "colab_type": "text"
241 | },
242 | "source": [
243 | "# Run predictions on CPU"
244 | ]
245 | },
246 | {
247 | "cell_type": "code",
248 | "metadata": {
249 | "id": "_BiU63hNBDu-",
250 | "colab_type": "code",
251 | "colab": {
252 | "base_uri": "https://localhost:8080/",
253 | "height": 34
254 | },
255 | "outputId": "1bd8b158-a62b-4fe0-be09-ca382c817247"
256 | },
257 | "source": [
258 | "%%timeit\n",
259 | "model.predict(X)"
260 | ],
261 | "execution_count": 8,
262 | "outputs": [
263 | {
264 | "output_type": "stream",
265 | "text": [
266 | "1 loop, best of 3: 174 ms per loop\n"
267 | ],
268 | "name": "stdout"
269 | }
270 | ]
271 | },
272 | {
273 | "cell_type": "markdown",
274 | "metadata": {
275 | "id": "F10tJEMKBPZG",
276 | "colab_type": "text"
277 | },
278 | "source": [
279 | "# Run predictions on GPU"
280 | ]
281 | },
282 | {
283 | "cell_type": "code",
284 | "metadata": {
285 | "id": "l2PUbqoHBJBX",
286 | "colab_type": "code",
287 | "colab": {}
288 | },
289 | "source": [
290 | "model.to('cuda')"
291 | ],
292 | "execution_count": 9,
293 | "outputs": []
294 | },
295 | {
296 | "cell_type": "code",
297 | "metadata": {
298 | "id": "-AB23_VTBRMP",
299 | "colab_type": "code",
300 | "colab": {
301 | "base_uri": "https://localhost:8080/",
302 | "height": 51
303 | },
304 | "outputId": "b9efea7d-913c-4326-c14a-6b6ca0e9c063"
305 | },
306 | "source": [
307 | "%%timeit\n",
308 | "model.predict(X)"
309 | ],
310 | "execution_count": 10,
311 | "outputs": [
312 | {
313 | "output_type": "stream",
314 | "text": [
315 | "The slowest run took 5.22 times longer than the fastest. This could mean that an intermediate result is being cached.\n",
316 | "100 loops, best of 3: 14.8 ms per loop\n"
317 | ],
318 | "name": "stdout"
319 | }
320 | ]
321 | },
322 | {
323 | "cell_type": "markdown",
324 | "metadata": {
325 | "id": "dbkQU69JDt7T",
326 | "colab_type": "text"
327 | },
328 | "source": [
329 | "# Calculation Time"
330 | ]
331 | },
332 | {
333 | "cell_type": "markdown",
334 | "metadata": {
335 | "id": "Hr1R_9nwDwpc",
336 | "colab_type": "text"
337 | },
338 | "source": [
339 |         "Method | Prediction time | Performance\n",
340 | "--|--|--\n",
341 | "scikit-learn | 85.6 ms | -\n",
342 | "PyTorch (CPU) | 174 ms | 2 X slower than scikit-learn\n",
343 | "PyTorch (GPU) | 14.8 ms | Almost 6 X faster than scikit-learn; Almost 12 X faster than PyTorch (CPU)"
344 | ]
345 | },
346 | {
347 | "cell_type": "code",
348 | "metadata": {
349 | "id": "9lmR3LHoEzhl",
350 | "colab_type": "code",
351 | "colab": {}
352 | },
353 | "source": [
354 | ""
355 | ],
356 | "execution_count": null,
357 | "outputs": []
358 | }
359 | ]
360 | }
--------------------------------------------------------------------------------
/python/google_colab_install_conda.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "name": "conda-on-google-colab.ipynb",
7 | "provenance": [],
8 | "collapsed_sections": []
9 | },
10 | "kernelspec": {
11 | "name": "python3",
12 | "display_name": "Python 3"
13 | }
14 | },
15 | "cells": [
16 | {
17 | "cell_type": "code",
18 | "metadata": {
19 | "id": "uyfFc8VufUyl",
20 | "colab_type": "code",
21 | "outputId": "36b12856-fc68-40b8-c203-0fcf8ab7244e",
22 | "colab": {
23 | "base_uri": "https://localhost:8080/",
24 | "height": 1000
25 | }
26 | },
27 | "source": [
28 | "################################################################################\n",
29 | "# INSTALL CONDA ON GOOGLE COLAB\n",
30 | "################################################################################\n",
31 | "! wget https://repo.anaconda.com/miniconda/Miniconda3-py37_4.8.2-Linux-x86_64.sh\n",
32 | "! chmod +x Miniconda3-py37_4.8.2-Linux-x86_64.sh\n",
33 | "! bash ./Miniconda3-py37_4.8.2-Linux-x86_64.sh -b -f -p /usr/local\n",
34 | "import sys\n",
35 | "sys.path.append('/usr/local/lib/python3.7/site-packages/')"
36 | ],
37 | "execution_count": 0,
38 | "outputs": [
39 | {
40 | "output_type": "stream",
41 | "text": [
42 | "--2020-04-06 03:23:37-- https://repo.anaconda.com/miniconda/Miniconda3-py37_4.8.2-Linux-x86_64.sh\n",
43 | "Resolving repo.anaconda.com (repo.anaconda.com)... 104.16.130.3, 104.16.131.3, 2606:4700::6810:8303, ...\n",
44 | "Connecting to repo.anaconda.com (repo.anaconda.com)|104.16.130.3|:443... connected.\n",
45 | "HTTP request sent, awaiting response... 200 OK\n",
46 | "Length: 85055499 (81M) [application/x-sh]\n",
47 | "Saving to: ‘Miniconda3-py37_4.8.2-Linux-x86_64.sh.2’\n",
48 | "\n",
49 | "\r Miniconda 0%[ ] 0 --.-KB/s \r Miniconda3 48%[========> ] 39.24M 196MB/s \r Miniconda3- 93%[=================> ] 76.01M 189MB/s \rMiniconda3-py37_4.8 100%[===================>] 81.12M 187MB/s in 0.4s \n",
50 | "\n",
51 | "2020-04-06 03:23:37 (187 MB/s) - ‘Miniconda3-py37_4.8.2-Linux-x86_64.sh.2’ saved [85055499/85055499]\n",
52 | "\n",
53 | "PREFIX=/usr/local\n",
54 | "Unpacking payload ...\n",
55 | "Collecting package metadata (current_repodata.json): - \b\b\\ \b\bdone\n",
56 | "Solving environment: / \b\b- \b\b\\ \n",
57 | "The environment is inconsistent, please check the package plan carefully\n",
58 | "The following packages are causing the inconsistency:\n",
59 | "\n",
60 | " - defaults/linux-64::urllib3==1.25.8=py37_0\n",
61 | " - defaults/linux-64::ruamel_yaml==0.15.87=py37h7b6447c_0\n",
62 | " - defaults/linux-64::pyopenssl==19.1.0=py37_0\n",
63 | " - defaults/linux-64::pysocks==1.7.1=py37_0\n",
64 | " - defaults/linux-64::six==1.14.0=py37_0\n",
65 | " - defaults/linux-64::setuptools==45.2.0=py37_0\n",
66 | " - defaults/linux-64::idna==2.8=py37_0\n",
67 | " - defaults/noarch::tqdm==4.42.1=py_0\n",
68 | " - defaults/linux-64::asn1crypto==1.3.0=py37_0\n",
69 | " - defaults/linux-64::cffi==1.14.0=py37h2e261b9_0\n",
70 | " - defaults/linux-64::wheel==0.34.2=py37_0\n",
71 | " - defaults/linux-64::conda-package-handling==1.6.0=py37h7b6447c_0\n",
72 | " - defaults/linux-64::pip==20.0.2=py37_1\n",
73 | " - defaults/linux-64::cryptography==2.8=py37h1ba5d50_0\n",
74 | " - defaults/linux-64::python==3.7.6=h0371630_2\n",
75 | " - defaults/linux-64::pycparser==2.19=py37_0\n",
76 | " - defaults/linux-64::pycosat==0.6.3=py37h7b6447c_0\n",
77 | " - defaults/linux-64::requests==2.22.0=py37_1\n",
78 | " - defaults/linux-64::chardet==3.0.4=py37_1003\n",
79 | "\b\b| \b\b/ \b\bdone\n",
80 | "\n",
81 | "## Package Plan ##\n",
82 | "\n",
83 | " environment location: /usr/local\n",
84 | "\n",
85 | " added / updated specs:\n",
86 | " - _libgcc_mutex==0.1=main\n",
87 | " - asn1crypto==1.3.0=py37_0\n",
88 | " - ca-certificates==2020.1.1=0\n",
89 | " - certifi==2019.11.28=py37_0\n",
90 | " - cffi==1.14.0=py37h2e261b9_0\n",
91 | " - chardet==3.0.4=py37_1003\n",
92 | " - conda-package-handling==1.6.0=py37h7b6447c_0\n",
93 | " - conda==4.8.2=py37_0\n",
94 | " - cryptography==2.8=py37h1ba5d50_0\n",
95 | " - idna==2.8=py37_0\n",
96 | " - ld_impl_linux-64==2.33.1=h53a641e_7\n",
97 | " - libedit==3.1.20181209=hc058e9b_0\n",
98 | " - libffi==3.2.1=hd88cf55_4\n",
99 | " - libgcc-ng==9.1.0=hdf63c60_0\n",
100 | " - libstdcxx-ng==9.1.0=hdf63c60_0\n",
101 | " - ncurses==6.2=he6710b0_0\n",
102 | " - openssl==1.1.1d=h7b6447c_4\n",
103 | " - pip==20.0.2=py37_1\n",
104 | " - pycosat==0.6.3=py37h7b6447c_0\n",
105 | " - pycparser==2.19=py37_0\n",
106 | " - pyopenssl==19.1.0=py37_0\n",
107 | " - pysocks==1.7.1=py37_0\n",
108 | " - python==3.7.6=h0371630_2\n",
109 | " - readline==7.0=h7b6447c_5\n",
110 | " - requests==2.22.0=py37_1\n",
111 | " - ruamel_yaml==0.15.87=py37h7b6447c_0\n",
112 | " - setuptools==45.2.0=py37_0\n",
113 | " - six==1.14.0=py37_0\n",
114 | " - sqlite==3.31.1=h7b6447c_0\n",
115 | " - tk==8.6.8=hbc83047_0\n",
116 | " - tqdm==4.42.1=py_0\n",
117 | " - urllib3==1.25.8=py37_0\n",
118 | " - wheel==0.34.2=py37_0\n",
119 | " - xz==5.2.4=h14c3975_4\n",
120 | " - yaml==0.1.7=had09818_2\n",
121 | " - zlib==1.2.11=h7b6447c_3\n",
122 | "\n",
123 | "\n",
124 | "The following NEW packages will be INSTALLED:\n",
125 | "\n",
126 | " certifi pkgs/main/linux-64::certifi-2019.11.28-py37_0\n",
127 | " conda pkgs/main/linux-64::conda-4.8.2-py37_0\n",
128 | " openssl pkgs/main/linux-64::openssl-1.1.1d-h7b6447c_4\n",
129 | "\n",
130 | "\n",
131 | "Preparing transaction: \\ \b\bdone\n",
132 | "Executing transaction: / \b\b- \b\bdone\n",
133 | "installation finished.\n",
134 | "WARNING:\n",
135 | " You currently have a PYTHONPATH environment variable set. This may cause\n",
136 | " unexpected behavior when running the Python interpreter in Miniconda3.\n",
137 | " For best results, please verify that your PYTHONPATH only points to\n",
138 | " directories of packages that are compatible with the Python interpreter\n",
139 | " in Miniconda3: /usr/local\n"
140 | ],
141 | "name": "stdout"
142 | }
143 | ]
144 | },
145 | {
146 | "cell_type": "code",
147 | "metadata": {
148 | "id": "QD319lDvf6Xp",
149 | "colab_type": "code",
150 | "outputId": "71cc1953-4fd9-4f85-fc0a-5a3f7e7688bd",
151 | "colab": {
152 | "base_uri": "https://localhost:8080/",
153 | "height": 996
154 | }
155 | },
156 | "source": [
157 | "! conda install -c rdkit rdkit -y"
158 | ],
159 | "execution_count": 0,
160 | "outputs": [
161 | {
162 | "output_type": "stream",
163 | "text": [
164 | "Collecting package metadata (current_repodata.json): - \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\bdone\n",
165 | "Solving environment: - \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\bfailed with initial frozen solve. Retrying with flexible solve.\n",
166 | "Solving environment: / \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\bfailed with repodata from current_repodata.json, will retry with next repodata source.\n",
167 | "Collecting package metadata (repodata.json): - \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\bdone\n",
168 | "Solving environment: | \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\bdone\n",
169 | "\n",
170 | "## Package Plan ##\n",
171 | "\n",
172 | " environment location: /usr/local\n",
173 | "\n",
174 | " added / updated specs:\n",
175 | " - rdkit\n",
176 | "\n",
177 | "\n",
178 | "The following NEW packages will be INSTALLED:\n",
179 | "\n",
180 | " blas pkgs/main/linux-64::blas-1.0-mkl\n",
181 | " bzip2 pkgs/main/linux-64::bzip2-1.0.8-h7b6447c_0\n",
182 | " cairo pkgs/main/linux-64::cairo-1.14.12-h8948797_3\n",
183 | " fontconfig pkgs/main/linux-64::fontconfig-2.13.0-h9420a91_0\n",
184 | " freetype pkgs/main/linux-64::freetype-2.9.1-h8a8886c_1\n",
185 | " glib pkgs/main/linux-64::glib-2.63.1-h5a9c865_0\n",
186 | " icu pkgs/main/linux-64::icu-58.2-h9c2bf20_1\n",
187 | " intel-openmp pkgs/main/linux-64::intel-openmp-2020.0-166\n",
188 | " jpeg pkgs/main/linux-64::jpeg-9b-h024ee3a_2\n",
189 | " libboost pkgs/main/linux-64::libboost-1.67.0-h46d08c1_4\n",
190 | " libgfortran-ng pkgs/main/linux-64::libgfortran-ng-7.3.0-hdf63c60_0\n",
191 | " libpng pkgs/main/linux-64::libpng-1.6.37-hbc83047_0\n",
192 | " libtiff pkgs/main/linux-64::libtiff-4.1.0-h2733197_0\n",
193 | " libuuid pkgs/main/linux-64::libuuid-1.0.3-h1bed415_2\n",
194 | " libxcb pkgs/main/linux-64::libxcb-1.13-h1bed415_1\n",
195 | " libxml2 pkgs/main/linux-64::libxml2-2.9.9-hea5a465_1\n",
196 | " mkl pkgs/main/linux-64::mkl-2020.0-166\n",
197 | " mkl-service pkgs/main/linux-64::mkl-service-2.3.0-py37he904b0f_0\n",
198 | " mkl_fft pkgs/main/linux-64::mkl_fft-1.0.15-py37ha843d7b_0\n",
199 | " mkl_random pkgs/main/linux-64::mkl_random-1.1.0-py37hd6b4f25_0\n",
200 | " numpy pkgs/main/linux-64::numpy-1.18.1-py37h4f9e942_0\n",
201 | " numpy-base pkgs/main/linux-64::numpy-base-1.18.1-py37hde5b4d6_1\n",
202 | " olefile pkgs/main/linux-64::olefile-0.46-py37_0\n",
203 | " pandas pkgs/main/linux-64::pandas-1.0.3-py37h0573a6f_0\n",
204 | " pcre pkgs/main/linux-64::pcre-8.43-he6710b0_0\n",
205 | " pillow pkgs/main/linux-64::pillow-7.0.0-py37hb39fc2d_0\n",
206 | " pixman pkgs/main/linux-64::pixman-0.38.0-h7b6447c_0\n",
207 | " py-boost pkgs/main/linux-64::py-boost-1.67.0-py37h04863e7_4\n",
208 | " python-dateutil pkgs/main/noarch::python-dateutil-2.8.1-py_0\n",
209 | " pytz pkgs/main/noarch::pytz-2019.3-py_0\n",
210 | " rdkit rdkit/linux-64::rdkit-2020.03.1.0-py37hc20afe1_1\n",
211 | " zstd pkgs/main/linux-64::zstd-1.3.7-h0b5b093_0\n",
212 | "\n",
213 | "The following packages will be UPDATED:\n",
214 | "\n",
215 | " certifi 2019.11.28-py37_0 --> 2019.11.28-py37_1\n",
216 | " conda 4.8.2-py37_0 --> 4.8.3-py37_0\n",
217 | " openssl 1.1.1d-h7b6447c_4 --> 1.1.1f-h7b6447c_0\n",
218 | "\n",
219 | "\n",
220 | "Proceed ([y]/n)? "
221 | ],
222 | "name": "stdout"
223 | }
224 | ]
225 | },
226 | {
227 | "cell_type": "code",
228 | "metadata": {
229 | "id": "M8MeOz0miqJ1",
230 | "colab_type": "code",
231 | "colab": {}
232 | },
233 | "source": [
234 | ""
235 | ],
236 | "execution_count": 0,
237 | "outputs": []
238 | }
239 | ]
240 | }
--------------------------------------------------------------------------------
/python/iris/iris-classification-random-forest.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Building a Classification Model for the Iris data set\n",
8 | "\n",
9 | "Chanin Nantasenamat\n",
10 | "\n",
11 | "Data Professor YouTube channel, http://youtube.com/dataprofessor \n",
12 | "\n",
13 | "In this Jupyter notebook, we will be building a classification model for the Iris data set using the random forest algorithm."
14 | ]
15 | },
16 | {
17 | "cell_type": "markdown",
18 | "metadata": {},
19 | "source": [
20 | "## 1. Import libraries"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": 1,
26 | "metadata": {},
27 | "outputs": [],
28 | "source": [
29 | "from sklearn import datasets\n",
30 | "from sklearn.model_selection import train_test_split\n",
31 | "from sklearn.ensemble import RandomForestClassifier\n",
32 | "from sklearn.datasets import make_classification"
33 | ]
34 | },
35 | {
36 | "cell_type": "markdown",
37 | "metadata": {},
38 | "source": [
39 | "## 2. Load the *iris* data set"
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": 2,
45 | "metadata": {},
46 | "outputs": [],
47 | "source": [
48 | "iris = datasets.load_iris()"
49 | ]
50 | },
51 | {
52 | "cell_type": "markdown",
53 | "metadata": {},
54 | "source": [
55 |     "## 3. Input and output features\n",
56 | "The ***iris*** data set contains 4 input features and 1 output variable (the class label)."
57 | ]
58 | },
59 | {
60 | "cell_type": "markdown",
61 | "metadata": {},
62 | "source": [
63 | "### 3.1. Input features"
64 | ]
65 | },
66 | {
67 | "cell_type": "code",
68 | "execution_count": 3,
69 | "metadata": {
70 | "scrolled": true
71 | },
72 | "outputs": [
73 | {
74 | "name": "stdout",
75 | "output_type": "stream",
76 | "text": [
77 | "['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']\n"
78 | ]
79 | }
80 | ],
81 | "source": [
82 | "print(iris.feature_names)"
83 | ]
84 | },
85 | {
86 | "cell_type": "markdown",
87 | "metadata": {},
88 | "source": [
89 |     "### 3.2. Output variable"
90 | ]
91 | },
92 | {
93 | "cell_type": "code",
94 | "execution_count": 4,
95 | "metadata": {},
96 | "outputs": [
97 | {
98 | "name": "stdout",
99 | "output_type": "stream",
100 | "text": [
101 | "['setosa' 'versicolor' 'virginica']\n"
102 | ]
103 | }
104 | ],
105 | "source": [
106 | "print(iris.target_names)"
107 | ]
108 | },
109 | {
110 | "cell_type": "markdown",
111 | "metadata": {},
112 | "source": [
113 | "## 4. Glimpse of the data"
114 | ]
115 | },
116 | {
117 | "cell_type": "markdown",
118 | "metadata": {},
119 | "source": [
120 | "### 4.1. Input features"
121 | ]
122 | },
123 | {
124 | "cell_type": "code",
125 | "execution_count": 5,
126 | "metadata": {},
127 | "outputs": [
128 | {
129 | "data": {
130 | "text/plain": [
131 | "array([[5.1, 3.5, 1.4, 0.2],\n",
132 | " [4.9, 3. , 1.4, 0.2],\n",
133 | " [4.7, 3.2, 1.3, 0.2],\n",
134 | " [4.6, 3.1, 1.5, 0.2],\n",
135 | " [5. , 3.6, 1.4, 0.2],\n",
136 | " [5.4, 3.9, 1.7, 0.4],\n",
137 | " [4.6, 3.4, 1.4, 0.3],\n",
138 | " [5. , 3.4, 1.5, 0.2],\n",
139 | " [4.4, 2.9, 1.4, 0.2],\n",
140 | " [4.9, 3.1, 1.5, 0.1],\n",
141 | " [5.4, 3.7, 1.5, 0.2],\n",
142 | " [4.8, 3.4, 1.6, 0.2],\n",
143 | " [4.8, 3. , 1.4, 0.1],\n",
144 | " [4.3, 3. , 1.1, 0.1],\n",
145 | " [5.8, 4. , 1.2, 0.2],\n",
146 | " [5.7, 4.4, 1.5, 0.4],\n",
147 | " [5.4, 3.9, 1.3, 0.4],\n",
148 | " [5.1, 3.5, 1.4, 0.3],\n",
149 | " [5.7, 3.8, 1.7, 0.3],\n",
150 | " [5.1, 3.8, 1.5, 0.3],\n",
151 | " [5.4, 3.4, 1.7, 0.2],\n",
152 | " [5.1, 3.7, 1.5, 0.4],\n",
153 | " [4.6, 3.6, 1. , 0.2],\n",
154 | " [5.1, 3.3, 1.7, 0.5],\n",
155 | " [4.8, 3.4, 1.9, 0.2],\n",
156 | " [5. , 3. , 1.6, 0.2],\n",
157 | " [5. , 3.4, 1.6, 0.4],\n",
158 | " [5.2, 3.5, 1.5, 0.2],\n",
159 | " [5.2, 3.4, 1.4, 0.2],\n",
160 | " [4.7, 3.2, 1.6, 0.2],\n",
161 | " [4.8, 3.1, 1.6, 0.2],\n",
162 | " [5.4, 3.4, 1.5, 0.4],\n",
163 | " [5.2, 4.1, 1.5, 0.1],\n",
164 | " [5.5, 4.2, 1.4, 0.2],\n",
165 | " [4.9, 3.1, 1.5, 0.2],\n",
166 | " [5. , 3.2, 1.2, 0.2],\n",
167 | " [5.5, 3.5, 1.3, 0.2],\n",
168 | " [4.9, 3.6, 1.4, 0.1],\n",
169 | " [4.4, 3. , 1.3, 0.2],\n",
170 | " [5.1, 3.4, 1.5, 0.2],\n",
171 | " [5. , 3.5, 1.3, 0.3],\n",
172 | " [4.5, 2.3, 1.3, 0.3],\n",
173 | " [4.4, 3.2, 1.3, 0.2],\n",
174 | " [5. , 3.5, 1.6, 0.6],\n",
175 | " [5.1, 3.8, 1.9, 0.4],\n",
176 | " [4.8, 3. , 1.4, 0.3],\n",
177 | " [5.1, 3.8, 1.6, 0.2],\n",
178 | " [4.6, 3.2, 1.4, 0.2],\n",
179 | " [5.3, 3.7, 1.5, 0.2],\n",
180 | " [5. , 3.3, 1.4, 0.2],\n",
181 | " [7. , 3.2, 4.7, 1.4],\n",
182 | " [6.4, 3.2, 4.5, 1.5],\n",
183 | " [6.9, 3.1, 4.9, 1.5],\n",
184 | " [5.5, 2.3, 4. , 1.3],\n",
185 | " [6.5, 2.8, 4.6, 1.5],\n",
186 | " [5.7, 2.8, 4.5, 1.3],\n",
187 | " [6.3, 3.3, 4.7, 1.6],\n",
188 | " [4.9, 2.4, 3.3, 1. ],\n",
189 | " [6.6, 2.9, 4.6, 1.3],\n",
190 | " [5.2, 2.7, 3.9, 1.4],\n",
191 | " [5. , 2. , 3.5, 1. ],\n",
192 | " [5.9, 3. , 4.2, 1.5],\n",
193 | " [6. , 2.2, 4. , 1. ],\n",
194 | " [6.1, 2.9, 4.7, 1.4],\n",
195 | " [5.6, 2.9, 3.6, 1.3],\n",
196 | " [6.7, 3.1, 4.4, 1.4],\n",
197 | " [5.6, 3. , 4.5, 1.5],\n",
198 | " [5.8, 2.7, 4.1, 1. ],\n",
199 | " [6.2, 2.2, 4.5, 1.5],\n",
200 | " [5.6, 2.5, 3.9, 1.1],\n",
201 | " [5.9, 3.2, 4.8, 1.8],\n",
202 | " [6.1, 2.8, 4. , 1.3],\n",
203 | " [6.3, 2.5, 4.9, 1.5],\n",
204 | " [6.1, 2.8, 4.7, 1.2],\n",
205 | " [6.4, 2.9, 4.3, 1.3],\n",
206 | " [6.6, 3. , 4.4, 1.4],\n",
207 | " [6.8, 2.8, 4.8, 1.4],\n",
208 | " [6.7, 3. , 5. , 1.7],\n",
209 | " [6. , 2.9, 4.5, 1.5],\n",
210 | " [5.7, 2.6, 3.5, 1. ],\n",
211 | " [5.5, 2.4, 3.8, 1.1],\n",
212 | " [5.5, 2.4, 3.7, 1. ],\n",
213 | " [5.8, 2.7, 3.9, 1.2],\n",
214 | " [6. , 2.7, 5.1, 1.6],\n",
215 | " [5.4, 3. , 4.5, 1.5],\n",
216 | " [6. , 3.4, 4.5, 1.6],\n",
217 | " [6.7, 3.1, 4.7, 1.5],\n",
218 | " [6.3, 2.3, 4.4, 1.3],\n",
219 | " [5.6, 3. , 4.1, 1.3],\n",
220 | " [5.5, 2.5, 4. , 1.3],\n",
221 | " [5.5, 2.6, 4.4, 1.2],\n",
222 | " [6.1, 3. , 4.6, 1.4],\n",
223 | " [5.8, 2.6, 4. , 1.2],\n",
224 | " [5. , 2.3, 3.3, 1. ],\n",
225 | " [5.6, 2.7, 4.2, 1.3],\n",
226 | " [5.7, 3. , 4.2, 1.2],\n",
227 | " [5.7, 2.9, 4.2, 1.3],\n",
228 | " [6.2, 2.9, 4.3, 1.3],\n",
229 | " [5.1, 2.5, 3. , 1.1],\n",
230 | " [5.7, 2.8, 4.1, 1.3],\n",
231 | " [6.3, 3.3, 6. , 2.5],\n",
232 | " [5.8, 2.7, 5.1, 1.9],\n",
233 | " [7.1, 3. , 5.9, 2.1],\n",
234 | " [6.3, 2.9, 5.6, 1.8],\n",
235 | " [6.5, 3. , 5.8, 2.2],\n",
236 | " [7.6, 3. , 6.6, 2.1],\n",
237 | " [4.9, 2.5, 4.5, 1.7],\n",
238 | " [7.3, 2.9, 6.3, 1.8],\n",
239 | " [6.7, 2.5, 5.8, 1.8],\n",
240 | " [7.2, 3.6, 6.1, 2.5],\n",
241 | " [6.5, 3.2, 5.1, 2. ],\n",
242 | " [6.4, 2.7, 5.3, 1.9],\n",
243 | " [6.8, 3. , 5.5, 2.1],\n",
244 | " [5.7, 2.5, 5. , 2. ],\n",
245 | " [5.8, 2.8, 5.1, 2.4],\n",
246 | " [6.4, 3.2, 5.3, 2.3],\n",
247 | " [6.5, 3. , 5.5, 1.8],\n",
248 | " [7.7, 3.8, 6.7, 2.2],\n",
249 | " [7.7, 2.6, 6.9, 2.3],\n",
250 | " [6. , 2.2, 5. , 1.5],\n",
251 | " [6.9, 3.2, 5.7, 2.3],\n",
252 | " [5.6, 2.8, 4.9, 2. ],\n",
253 | " [7.7, 2.8, 6.7, 2. ],\n",
254 | " [6.3, 2.7, 4.9, 1.8],\n",
255 | " [6.7, 3.3, 5.7, 2.1],\n",
256 | " [7.2, 3.2, 6. , 1.8],\n",
257 | " [6.2, 2.8, 4.8, 1.8],\n",
258 | " [6.1, 3. , 4.9, 1.8],\n",
259 | " [6.4, 2.8, 5.6, 2.1],\n",
260 | " [7.2, 3. , 5.8, 1.6],\n",
261 | " [7.4, 2.8, 6.1, 1.9],\n",
262 | " [7.9, 3.8, 6.4, 2. ],\n",
263 | " [6.4, 2.8, 5.6, 2.2],\n",
264 | " [6.3, 2.8, 5.1, 1.5],\n",
265 | " [6.1, 2.6, 5.6, 1.4],\n",
266 | " [7.7, 3. , 6.1, 2.3],\n",
267 | " [6.3, 3.4, 5.6, 2.4],\n",
268 | " [6.4, 3.1, 5.5, 1.8],\n",
269 | " [6. , 3. , 4.8, 1.8],\n",
270 | " [6.9, 3.1, 5.4, 2.1],\n",
271 | " [6.7, 3.1, 5.6, 2.4],\n",
272 | " [6.9, 3.1, 5.1, 2.3],\n",
273 | " [5.8, 2.7, 5.1, 1.9],\n",
274 | " [6.8, 3.2, 5.9, 2.3],\n",
275 | " [6.7, 3.3, 5.7, 2.5],\n",
276 | " [6.7, 3. , 5.2, 2.3],\n",
277 | " [6.3, 2.5, 5. , 1.9],\n",
278 | " [6.5, 3. , 5.2, 2. ],\n",
279 | " [6.2, 3.4, 5.4, 2.3],\n",
280 | " [5.9, 3. , 5.1, 1.8]])"
281 | ]
282 | },
283 | "execution_count": 5,
284 | "metadata": {},
285 | "output_type": "execute_result"
286 | }
287 | ],
288 | "source": [
289 | "iris.data"
290 | ]
291 | },
292 | {
293 | "cell_type": "markdown",
294 | "metadata": {},
295 | "source": [
296 | "### 4.2. Output variable (the Class label)"
297 | ]
298 | },
299 | {
300 | "cell_type": "code",
301 | "execution_count": 30,
302 | "metadata": {},
303 | "outputs": [
304 | {
305 | "data": {
306 | "text/plain": [
307 | "array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
308 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
309 | " 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
310 | " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
311 | " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\n",
312 | " 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\n",
313 | " 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])"
314 | ]
315 | },
316 | "execution_count": 30,
317 | "metadata": {},
318 | "output_type": "execute_result"
319 | }
320 | ],
321 | "source": [
322 | "iris.target"
323 | ]
324 | },
325 | {
326 | "cell_type": "markdown",
327 | "metadata": {},
328 | "source": [
329 | "### 4.3. Assigning *input* and *output* variables\n",
330 | "Let's assign the 4 input variables to X and the output variable (class label) to Y"
331 | ]
332 | },
333 | {
334 | "cell_type": "code",
335 | "execution_count": 9,
336 | "metadata": {},
337 | "outputs": [],
338 | "source": [
339 | "X = iris.data\n",
340 | "Y = iris.target"
341 | ]
342 | },
343 | {
344 | "cell_type": "markdown",
345 | "metadata": {},
346 | "source": [
347 | "### 4.3. Let's examine the data dimension"
348 | ]
349 | },
350 | {
351 | "cell_type": "code",
352 | "execution_count": 10,
353 | "metadata": {},
354 | "outputs": [
355 | {
356 | "data": {
357 | "text/plain": [
358 | "(150, 4)"
359 | ]
360 | },
361 | "execution_count": 10,
362 | "metadata": {},
363 | "output_type": "execute_result"
364 | }
365 | ],
366 | "source": [
367 | "X.shape"
368 | ]
369 | },
370 | {
371 | "cell_type": "code",
372 | "execution_count": 11,
373 | "metadata": {},
374 | "outputs": [
375 | {
376 | "data": {
377 | "text/plain": [
378 | "(150,)"
379 | ]
380 | },
381 | "execution_count": 11,
382 | "metadata": {},
383 | "output_type": "execute_result"
384 | }
385 | ],
386 | "source": [
387 | "Y.shape"
388 | ]
389 | },
390 | {
391 | "cell_type": "markdown",
392 | "metadata": {},
393 | "source": [
394 | "## 5. Build Classification Model using Random Forest"
395 | ]
396 | },
397 | {
398 | "cell_type": "code",
399 | "execution_count": 9,
400 | "metadata": {},
401 | "outputs": [],
402 | "source": [
403 | "clf = RandomForestClassifier()"
404 | ]
405 | },
406 | {
407 | "cell_type": "code",
408 | "execution_count": 10,
409 | "metadata": {},
410 | "outputs": [
411 | {
412 | "data": {
413 | "text/plain": [
414 | "RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,\n",
415 | " criterion='gini', max_depth=None, max_features='auto',\n",
416 | " max_leaf_nodes=None, max_samples=None,\n",
417 | " min_impurity_decrease=0.0, min_impurity_split=None,\n",
418 | " min_samples_leaf=1, min_samples_split=2,\n",
419 | " min_weight_fraction_leaf=0.0, n_estimators=100,\n",
420 | " n_jobs=None, oob_score=False, random_state=None,\n",
421 | " verbose=0, warm_start=False)"
422 | ]
423 | },
424 | "execution_count": 10,
425 | "metadata": {},
426 | "output_type": "execute_result"
427 | }
428 | ],
429 | "source": [
430 | "clf.fit(X, Y)"
431 | ]
432 | },
433 | {
434 | "cell_type": "markdown",
435 | "metadata": {},
436 | "source": [
437 | "## 6. Feature Importance"
438 | ]
439 | },
440 | {
441 | "cell_type": "code",
442 | "execution_count": 11,
443 | "metadata": {},
444 | "outputs": [
445 | {
446 | "name": "stdout",
447 | "output_type": "stream",
448 | "text": [
449 | "[0.07344346 0.01623453 0.42869861 0.4816234 ]\n"
450 | ]
451 | }
452 | ],
453 | "source": [
454 | "print(clf.feature_importances_)"
455 | ]
456 | },
457 | {
458 | "cell_type": "markdown",
459 | "metadata": {},
460 | "source": [
461 | "## 7. Make Prediction"
462 | ]
463 | },
464 | {
465 | "cell_type": "code",
466 | "execution_count": 12,
467 | "metadata": {},
468 | "outputs": [
469 | {
470 | "data": {
471 | "text/plain": [
472 | "array([5.1, 3.5, 1.4, 0.2])"
473 | ]
474 | },
475 | "execution_count": 12,
476 | "metadata": {},
477 | "output_type": "execute_result"
478 | }
479 | ],
480 | "source": [
481 | "X[0]"
482 | ]
483 | },
484 | {
485 | "cell_type": "code",
486 | "execution_count": 13,
487 | "metadata": {},
488 | "outputs": [
489 | {
490 | "name": "stdout",
491 | "output_type": "stream",
492 | "text": [
493 | "[0]\n"
494 | ]
495 | }
496 | ],
497 | "source": [
498 | "print(clf.predict([[5.1, 3.5, 1.4, 0.2]]))"
499 | ]
500 | },
501 | {
502 | "cell_type": "code",
503 | "execution_count": 14,
504 | "metadata": {},
505 | "outputs": [
506 | {
507 | "name": "stdout",
508 | "output_type": "stream",
509 | "text": [
510 | "[0]\n"
511 | ]
512 | }
513 | ],
514 | "source": [
515 | "print(clf.predict(X[[0]]))"
516 | ]
517 | },
518 | {
519 | "cell_type": "code",
520 | "execution_count": 15,
521 | "metadata": {},
522 | "outputs": [
523 | {
524 | "name": "stdout",
525 | "output_type": "stream",
526 | "text": [
527 | "[[1. 0. 0.]]\n"
528 | ]
529 | }
530 | ],
531 | "source": [
532 | "print(clf.predict_proba(X[[0]]))"
533 | ]
534 | },
535 | {
536 | "cell_type": "code",
537 | "execution_count": 16,
538 | "metadata": {},
539 | "outputs": [
540 | {
541 | "data": {
542 | "text/plain": [
543 | "RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,\n",
544 | " criterion='gini', max_depth=None, max_features='auto',\n",
545 | " max_leaf_nodes=None, max_samples=None,\n",
546 | " min_impurity_decrease=0.0, min_impurity_split=None,\n",
547 | " min_samples_leaf=1, min_samples_split=2,\n",
548 | " min_weight_fraction_leaf=0.0, n_estimators=100,\n",
549 | " n_jobs=None, oob_score=False, random_state=None,\n",
550 | " verbose=0, warm_start=False)"
551 | ]
552 | },
553 | "execution_count": 16,
554 | "metadata": {},
555 | "output_type": "execute_result"
556 | }
557 | ],
558 | "source": [
559 | "clf.fit(iris.data, iris.target_names[iris.target])"
560 | ]
561 | },
562 | {
563 | "cell_type": "markdown",
564 | "metadata": {},
565 | "source": [
566 | "## 8. Data split (80/20 ratio)"
567 | ]
568 | },
569 | {
570 | "cell_type": "code",
571 | "execution_count": 17,
572 | "metadata": {},
573 | "outputs": [],
574 | "source": [
575 | "X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)"
576 | ]
577 | },
578 | {
579 | "cell_type": "code",
580 | "execution_count": 18,
581 | "metadata": {},
582 | "outputs": [
583 | {
584 | "data": {
585 | "text/plain": [
586 | "((120, 4), (120,))"
587 | ]
588 | },
589 | "execution_count": 18,
590 | "metadata": {},
591 | "output_type": "execute_result"
592 | }
593 | ],
594 | "source": [
595 | "X_train.shape, Y_train.shape"
596 | ]
597 | },
598 | {
599 | "cell_type": "code",
600 | "execution_count": 19,
601 | "metadata": {},
602 | "outputs": [
603 | {
604 | "data": {
605 | "text/plain": [
606 | "((30, 4), (30,))"
607 | ]
608 | },
609 | "execution_count": 19,
610 | "metadata": {},
611 | "output_type": "execute_result"
612 | }
613 | ],
614 | "source": [
615 | "X_test.shape, Y_test.shape"
616 | ]
617 | },
618 | {
619 | "cell_type": "markdown",
620 | "metadata": {},
621 | "source": [
622 | "## 9. Rebuild the Random Forest Model"
623 | ]
624 | },
625 | {
626 | "cell_type": "code",
627 | "execution_count": 20,
628 | "metadata": {},
629 | "outputs": [
630 | {
631 | "data": {
632 | "text/plain": [
633 | "RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,\n",
634 | " criterion='gini', max_depth=None, max_features='auto',\n",
635 | " max_leaf_nodes=None, max_samples=None,\n",
636 | " min_impurity_decrease=0.0, min_impurity_split=None,\n",
637 | " min_samples_leaf=1, min_samples_split=2,\n",
638 | " min_weight_fraction_leaf=0.0, n_estimators=100,\n",
639 | " n_jobs=None, oob_score=False, random_state=None,\n",
640 | " verbose=0, warm_start=False)"
641 | ]
642 | },
643 | "execution_count": 20,
644 | "metadata": {},
645 | "output_type": "execute_result"
646 | }
647 | ],
648 | "source": [
649 | "clf.fit(X_train, Y_train)"
650 | ]
651 | },
652 | {
653 | "cell_type": "markdown",
654 | "metadata": {},
655 | "source": [
656 | "### 9.1. Performs prediction on single sample from the data set"
657 | ]
658 | },
659 | {
660 | "cell_type": "code",
661 | "execution_count": 21,
662 | "metadata": {},
663 | "outputs": [
664 | {
665 | "name": "stdout",
666 | "output_type": "stream",
667 | "text": [
668 | "[0]\n"
669 | ]
670 | }
671 | ],
672 | "source": [
673 | "print(clf.predict([[5.1, 3.5, 1.4, 0.2]]))"
674 | ]
675 | },
676 | {
677 | "cell_type": "code",
678 | "execution_count": 22,
679 | "metadata": {},
680 | "outputs": [
681 | {
682 | "name": "stdout",
683 | "output_type": "stream",
684 | "text": [
685 | "[[1. 0. 0.]]\n"
686 | ]
687 | }
688 | ],
689 | "source": [
690 | "print(clf.predict_proba([[5.1, 3.5, 1.4, 0.2]]))"
691 | ]
692 | },
693 | {
694 | "cell_type": "markdown",
695 | "metadata": {},
696 | "source": [
697 | "### 9.2. Performs prediction on the test set"
698 | ]
699 | },
700 | {
701 | "cell_type": "markdown",
702 | "metadata": {},
703 | "source": [
704 | "#### *Predicted class labels*"
705 | ]
706 | },
707 | {
708 | "cell_type": "code",
709 | "execution_count": 23,
710 | "metadata": {},
711 | "outputs": [
712 | {
713 | "name": "stdout",
714 | "output_type": "stream",
715 | "text": [
716 | "[2 1 0 1 1 2 1 0 1 0 2 1 1 1 1 1 1 2 2 0 0 2 0 0 0 1 1 1 1 0]\n"
717 | ]
718 | }
719 | ],
720 | "source": [
721 | "print(clf.predict(X_test))"
722 | ]
723 | },
724 | {
725 | "cell_type": "markdown",
726 | "metadata": {},
727 | "source": [
728 | "#### *Actual class labels*"
729 | ]
730 | },
731 | {
732 | "cell_type": "code",
733 | "execution_count": 24,
734 | "metadata": {},
735 | "outputs": [
736 | {
737 | "name": "stdout",
738 | "output_type": "stream",
739 | "text": [
740 | "[2 1 0 1 1 2 1 0 1 0 2 1 2 1 1 2 2 2 2 0 0 2 0 0 0 1 1 1 1 0]\n"
741 | ]
742 | }
743 | ],
744 | "source": [
745 | "print(Y_test)"
746 | ]
747 | },
748 | {
749 | "cell_type": "markdown",
750 | "metadata": {},
751 | "source": [
752 | "## 10. Model Performance"
753 | ]
754 | },
755 | {
756 | "cell_type": "code",
757 | "execution_count": 25,
758 | "metadata": {},
759 | "outputs": [
760 | {
761 | "name": "stdout",
762 | "output_type": "stream",
763 | "text": [
764 | "0.9\n"
765 | ]
766 | }
767 | ],
768 | "source": [
769 | "print(clf.score(X_test, Y_test))"
770 | ]
771 | }
772 | ],
773 | "metadata": {
774 | "kernelspec": {
775 | "display_name": "Python 3",
776 | "language": "python",
777 | "name": "python3"
778 | },
779 | "language_info": {
780 | "codemirror_mode": {
781 | "name": "ipython",
782 | "version": 3
783 | },
784 | "file_extension": ".py",
785 | "mimetype": "text/x-python",
786 | "name": "python",
787 | "nbconvert_exporter": "python",
788 | "pygments_lexer": "ipython3",
789 | "version": "3.7.6"
790 | }
791 | },
792 | "nbformat": 4,
793 | "nbformat_minor": 4
794 | }
795 |
--------------------------------------------------------------------------------
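
Note on the notebook above: the 80/20 split in section 8 is unseeded and the same clf object is refit along the way, so the reported test accuracy (0.9 here) will vary from run to run. A condensed sketch of the same workflow; random_state is an addition (not in the original) to make the split and the forest reproducible:

    from sklearn.datasets import load_iris
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import train_test_split

    iris = load_iris()

    # 80/20 split; random_state pins the shuffle so results are repeatable
    X_train, X_test, Y_train, Y_test = train_test_split(
        iris.data, iris.target, test_size=0.2, random_state=42)

    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X_train, Y_train)             # train on the training split only
    print(clf.score(X_test, Y_test))      # accuracy on the held-out 20%
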
/python/model_is_training_progress_bar.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "id": "zzD4-HxqXBmt"
7 | },
8 | "source": [
9 | "# **Progress Bar in Jupyter Notebook**\n",
10 | "\n",
11 | "Chanin Nantasenamat\n",
12 | "\n",
13 | "**Data Professor YouTube channel**, http://youtube.com/dataprofessor"
14 | ]
15 | },
16 | {
17 | "cell_type": "markdown",
18 | "metadata": {
19 | "id": "An7XU557Y5ci"
20 | },
21 | "source": [
22 | "# **Progress Bar with the tqdm library**"
23 | ]
24 | },
25 | {
26 | "cell_type": "code",
27 | "execution_count": null,
28 | "metadata": {
29 | "id": "3yc04janmetd"
30 | },
31 | "outputs": [],
32 | "source": [
33 | "# ! pip install tqdm"
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": 1,
39 | "metadata": {
40 | "id": "gxa8jup1DNjt"
41 | },
42 | "outputs": [],
43 | "source": [
44 | "from tqdm.notebook import tqdm\n",
45 | "from time import sleep"
46 | ]
47 | },
48 | {
49 | "cell_type": "code",
50 | "execution_count": 2,
51 | "metadata": {
52 | "id": "009bdoXCE74q"
53 | },
54 | "outputs": [
55 | {
56 | "data": {
57 | "application/vnd.jupyter.widget-view+json": {
58 | "model_id": "93cc2d7933af4faf96fda14e55f24e23",
59 | "version_major": 2,
60 | "version_minor": 0
61 | },
62 | "text/plain": [
63 | " 0%| | 0/100 [00:00, ?it/s]"
64 | ]
65 | },
66 | "metadata": {},
67 | "output_type": "display_data"
68 | }
69 | ],
70 | "source": [
71 | "number_list = list(range(100))\n",
72 | "for x in tqdm(number_list):\n",
73 | " sleep(0.05)\n",
74 | "#print('Completed!')"
75 | ]
76 | },
77 | {
78 | "cell_type": "markdown",
79 | "metadata": {
80 | "id": "4tFGw2QFMz6N"
81 | },
82 | "source": [
83 | "# **Model Building**"
84 | ]
85 | },
86 | {
87 | "cell_type": "markdown",
88 | "metadata": {
89 | "id": "zKKr9EoSVbOV"
90 | },
91 | "source": [
92 | "### Reading in the Delaney Solubility Dataset"
93 | ]
94 | },
95 | {
96 | "cell_type": "code",
97 | "execution_count": 3,
98 | "metadata": {
99 | "id": "FHR0FBHEMyyL"
100 | },
101 | "outputs": [],
102 | "source": [
103 | "import pandas as pd\n",
104 | "\n",
105 | "dataset = pd.read_csv('https://raw.githubusercontent.com/dataprofessor/data/master/delaney_solubility_with_descriptors.csv')\n",
106 | "\n",
107 | "X = dataset.drop(['logS'], axis=1)\n",
108 | "Y = dataset.iloc[:,-1]\n"
109 | ]
110 | },
111 | {
112 | "cell_type": "markdown",
113 | "metadata": {
114 | "id": "BqqRRTtUVi7v"
115 | },
116 | "source": [
117 | "### Model Building with Progress Bar"
118 | ]
119 | },
120 | {
121 | "cell_type": "code",
122 | "execution_count": 4,
123 | "metadata": {
124 | "id": "cpa2tS3kInAx",
125 | "scrolled": true
126 | },
127 | "outputs": [
128 | {
129 | "data": {
130 | "application/vnd.jupyter.widget-view+json": {
131 | "model_id": "a1b762495ff545468e8b801795c6b708",
132 | "version_major": 2,
133 | "version_minor": 0
134 | },
135 | "text/plain": [
136 | " 0%| | 0/10 [00:00, ?it/s]"
137 | ]
138 | },
139 | "metadata": {},
140 | "output_type": "display_data"
141 | },
142 | {
143 | "name": "stdout",
144 | "output_type": "stream",
145 | "text": [
146 | "Tree: 100, R2: 0.9796508266364179, MSE: 0.08936295274735467\n",
147 | "Tree: 200, R2: 0.9805478792326812, MSE: 0.08542356575902461\n",
148 | "Tree: 300, R2: 0.9801470956638436, MSE: 0.08718359809468906\n",
149 | "Tree: 400, R2: 0.9803760482277171, MSE: 0.08617815788435489\n",
150 | "Tree: 500, R2: 0.9804686074892891, MSE: 0.08577168589797951\n",
151 | "Tree: 600, R2: 0.9804079256830844, MSE: 0.08603816873163578\n",
152 | "Tree: 700, R2: 0.9802975717717071, MSE: 0.0865227855360484\n",
153 | "Tree: 800, R2: 0.9803651322114956, MSE: 0.08622609533244484\n",
154 | "Tree: 900, R2: 0.98037907466393, MSE: 0.08616486735547396\n",
155 | "Tree: 1000, R2: 0.9804349669126423, MSE: 0.08591941775949379\n"
156 | ]
157 | }
158 | ],
159 | "source": [
160 | "from sklearn.ensemble import RandomForestRegressor\n",
161 | "from sklearn.metrics import mean_squared_error, r2_score\n",
162 | "\n",
163 | "parameter_n_estimators = [100,200,300,400,500,600,700,800,900,1000]\n",
164 | "\n",
165 | "for i in tqdm(parameter_n_estimators):\n",
166 | " model = RandomForestRegressor(n_estimators=i)\n",
167 | " model.fit(X,Y)\n",
168 | " Y_pred = model.predict(X)\n",
169 | " r2 = r2_score(Y, Y_pred)\n",
170 | " mse = mean_squared_error(Y, Y_pred)\n",
171 | " print('Tree: %s, R2: %s, MSE: %s' % (i, r2, mse))"
172 | ]
173 | },
174 | {
175 | "cell_type": "code",
176 | "execution_count": null,
177 | "metadata": {},
178 | "outputs": [],
179 | "source": []
180 | }
181 | ],
182 | "metadata": {
183 | "colab": {
184 | "collapsed_sections": [],
185 | "name": "Model-building-with-progress-bar.ipynb",
186 | "provenance": []
187 | },
188 | "kernelspec": {
189 | "display_name": "Python 3",
190 | "language": "python",
191 | "name": "python3"
192 | },
193 | "language_info": {
194 | "codemirror_mode": {
195 | "name": "ipython",
196 | "version": 3
197 | },
198 | "file_extension": ".py",
199 | "mimetype": "text/x-python",
200 | "name": "python",
201 | "nbconvert_exporter": "python",
202 | "pygments_lexer": "ipython3",
203 | "version": "3.7.9"
204 | }
205 | },
206 | "nbformat": 4,
207 | "nbformat_minor": 1
208 | }
209 |
--------------------------------------------------------------------------------
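
Note on the notebook above: besides wrapping an iterable, tqdm also supports driving the bar manually, which is handy when loop iterations do unequal amounts of work. A minimal sketch (the total and step values here are illustrative only, not from the notebook):

    from time import sleep

    from tqdm.notebook import tqdm

    # Manual-update form: declare the total up front, then advance explicitly
    with tqdm(total=1000, desc="fitting") as pbar:
        for n_trees in range(100, 1100, 100):
            sleep(0.1)            # stand-in for fitting a model with n_trees trees
            pbar.update(100)      # advance the bar by this step's share of the work
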
/python/pandas_select_columns.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "name": "pandas-select-columns.ipynb",
7 | "provenance": [],
8 | "collapsed_sections": []
9 | },
10 | "kernelspec": {
11 | "name": "python3",
12 | "display_name": "Python 3"
13 | }
14 | },
15 | "cells": [
16 | {
17 | "cell_type": "markdown",
18 | "metadata": {
19 | "id": "sBn4_vrWHjCi",
20 | "colab_type": "text"
21 | },
22 | "source": [
23 | "# **Introduction to Pandas dataframe: Select specific column(s) in a DataFrame**\n",
24 | "\n",
25 | "Chanin Nantasenamat\n",
26 | "\n",
27 | "[*'Data Professor' YouTube channel*](http://youtube.com/dataprofessor)\n",
28 | "\n",
29 | "In this Jupyter notebook, I will be showing you how to select specifc column(s) in a DataFrame.\n",
30 | "\n",
31 | "---"
32 | ]
33 | },
34 | {
35 | "cell_type": "markdown",
36 | "metadata": {
37 | "id": "PBWDV57tHmLf",
38 | "colab_type": "text"
39 | },
40 | "source": [
41 | "## **Download CSV data**"
42 | ]
43 | },
44 | {
45 | "cell_type": "code",
46 | "metadata": {
47 | "id": "pq6EVr5sHXAq",
48 | "colab_type": "code",
49 | "colab": {
50 | "base_uri": "https://localhost:8080/",
51 | "height": 208
52 | },
53 | "outputId": "1eea0330-050d-4101-c21b-dce27e09cf01"
54 | },
55 | "source": [
56 | "! wget https://raw.githubusercontent.com/dataprofessor/data/master/delaney_solubility_with_descriptors.csv"
57 | ],
58 | "execution_count": 10,
59 | "outputs": [
60 | {
61 | "output_type": "stream",
62 | "text": [
63 | "--2020-05-03 17:41:00-- https://raw.githubusercontent.com/dataprofessor/data/master/delaney_solubility_with_descriptors.csv\n",
64 | "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...\n",
65 | "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.\n",
66 | "HTTP request sent, awaiting response... 200 OK\n",
67 | "Length: 57370 (56K) [text/plain]\n",
68 | "Saving to: ‘delaney_solubility_with_descriptors.csv’\n",
69 | "\n",
70 | "\r delaney_s 0%[ ] 0 --.-KB/s \rdelaney_solubility_ 100%[===================>] 56.03K --.-KB/s in 0.01s \n",
71 | "\n",
72 | "2020-05-03 17:41:00 (3.82 MB/s) - ‘delaney_solubility_with_descriptors.csv’ saved [57370/57370]\n",
73 | "\n"
74 | ],
75 | "name": "stdout"
76 | }
77 | ]
78 | },
79 | {
80 | "cell_type": "code",
81 | "metadata": {
82 | "id": "qJ9j-V_zHwm8",
83 | "colab_type": "code",
84 | "colab": {}
85 | },
86 | "source": [
87 | "import pandas as pd"
88 | ],
89 | "execution_count": 0,
90 | "outputs": []
91 | },
92 | {
93 | "cell_type": "code",
94 | "metadata": {
95 | "id": "D6twMV5THz2r",
96 | "colab_type": "code",
97 | "colab": {
98 | "base_uri": "https://localhost:8080/",
99 | "height": 415
100 | },
101 | "outputId": "3230bda2-d61e-48f6-f232-0dabecc08908"
102 | },
103 | "source": [
104 | "df = pd.read_csv('delaney_solubility_with_descriptors.csv')\n",
105 | "df"
106 | ],
107 | "execution_count": 12,
108 | "outputs": [
109 | {
110 | "output_type": "execute_result",
111 | "data": {
112 | "text/html": [
113 | "
\n",
114 | "\n",
127 | "
\n",
128 | " \n",
129 | " \n",
130 | " | \n",
131 | " MolLogP | \n",
132 | " MolWt | \n",
133 | " NumRotatableBonds | \n",
134 | " AromaticProportion | \n",
135 | " logS | \n",
136 | "
\n",
137 | " \n",
138 | " \n",
139 | " \n",
140 | " 0 | \n",
141 | " 2.59540 | \n",
142 | " 167.850 | \n",
143 | " 0.0 | \n",
144 | " 0.000000 | \n",
145 | " -2.180 | \n",
146 | "
\n",
147 | " \n",
148 | " 1 | \n",
149 | " 2.37650 | \n",
150 | " 133.405 | \n",
151 | " 0.0 | \n",
152 | " 0.000000 | \n",
153 | " -2.000 | \n",
154 | "
\n",
155 | " \n",
156 | " 2 | \n",
157 | " 2.59380 | \n",
158 | " 167.850 | \n",
159 | " 1.0 | \n",
160 | " 0.000000 | \n",
161 | " -1.740 | \n",
162 | "
\n",
163 | " \n",
164 | " 3 | \n",
165 | " 2.02890 | \n",
166 | " 133.405 | \n",
167 | " 1.0 | \n",
168 | " 0.000000 | \n",
169 | " -1.480 | \n",
170 | "
\n",
171 | " \n",
172 | " 4 | \n",
173 | " 2.91890 | \n",
174 | " 187.375 | \n",
175 | " 1.0 | \n",
176 | " 0.000000 | \n",
177 | " -3.040 | \n",
178 | "
\n",
179 | " \n",
180 | " ... | \n",
181 | " ... | \n",
182 | " ... | \n",
183 | " ... | \n",
184 | " ... | \n",
185 | " ... | \n",
186 | "
\n",
187 | " \n",
188 | " 1139 | \n",
189 | " 1.98820 | \n",
190 | " 287.343 | \n",
191 | " 8.0 | \n",
192 | " 0.000000 | \n",
193 | " 1.144 | \n",
194 | "
\n",
195 | " \n",
196 | " 1140 | \n",
197 | " 3.42130 | \n",
198 | " 286.114 | \n",
199 | " 2.0 | \n",
200 | " 0.333333 | \n",
201 | " -4.925 | \n",
202 | "
\n",
203 | " \n",
204 | " 1141 | \n",
205 | " 3.60960 | \n",
206 | " 308.333 | \n",
207 | " 4.0 | \n",
208 | " 0.695652 | \n",
209 | " -3.893 | \n",
210 | "
\n",
211 | " \n",
212 | " 1142 | \n",
213 | " 2.56214 | \n",
214 | " 354.815 | \n",
215 | " 3.0 | \n",
216 | " 0.521739 | \n",
217 | " -3.790 | \n",
218 | "
\n",
219 | " \n",
220 | " 1143 | \n",
221 | " 2.02164 | \n",
222 | " 179.219 | \n",
223 | " 1.0 | \n",
224 | " 0.461538 | \n",
225 | " -2.581 | \n",
226 | "
\n",
227 | " \n",
228 | "
\n",
229 | "
1144 rows × 5 columns
\n",
230 | "
"
231 | ],
232 | "text/plain": [
233 | " MolLogP MolWt NumRotatableBonds AromaticProportion logS\n",
234 | "0 2.59540 167.850 0.0 0.000000 -2.180\n",
235 | "1 2.37650 133.405 0.0 0.000000 -2.000\n",
236 | "2 2.59380 167.850 1.0 0.000000 -1.740\n",
237 | "3 2.02890 133.405 1.0 0.000000 -1.480\n",
238 | "4 2.91890 187.375 1.0 0.000000 -3.040\n",
239 | "... ... ... ... ... ...\n",
240 | "1139 1.98820 287.343 8.0 0.000000 1.144\n",
241 | "1140 3.42130 286.114 2.0 0.333333 -4.925\n",
242 | "1141 3.60960 308.333 4.0 0.695652 -3.893\n",
243 | "1142 2.56214 354.815 3.0 0.521739 -3.790\n",
244 | "1143 2.02164 179.219 1.0 0.461538 -2.581\n",
245 | "\n",
246 | "[1144 rows x 5 columns]"
247 | ]
248 | },
249 | "metadata": {
250 | "tags": []
251 | },
252 | "execution_count": 12
253 | }
254 | ]
255 | },
256 | {
257 | "cell_type": "markdown",
258 | "metadata": {
259 | "id": "uc36WETtHp77",
260 | "colab_type": "text"
261 | },
262 | "source": [
263 | "## **Selecting specific column(s)**"
264 | ]
265 | },
266 | {
267 | "cell_type": "markdown",
268 | "metadata": {
269 | "id": "Ha6n6gH3IsPs",
270 | "colab_type": "text"
271 | },
272 | "source": [
273 | "### Selecting a single column"
274 | ]
275 | },
276 | {
277 | "cell_type": "code",
278 | "metadata": {
279 | "id": "94xKLaCoHpeY",
280 | "colab_type": "code",
281 | "colab": {
282 | "base_uri": "https://localhost:8080/",
283 | "height": 225
284 | },
285 | "outputId": "7bc417a4-ff9e-43c4-f05a-7dc4eb5aa387"
286 | },
287 | "source": [
288 | "df.MolLogP"
289 | ],
290 | "execution_count": 13,
291 | "outputs": [
292 | {
293 | "output_type": "execute_result",
294 | "data": {
295 | "text/plain": [
296 | "0 2.59540\n",
297 | "1 2.37650\n",
298 | "2 2.59380\n",
299 | "3 2.02890\n",
300 | "4 2.91890\n",
301 | " ... \n",
302 | "1139 1.98820\n",
303 | "1140 3.42130\n",
304 | "1141 3.60960\n",
305 | "1142 2.56214\n",
306 | "1143 2.02164\n",
307 | "Name: MolLogP, Length: 1144, dtype: float64"
308 | ]
309 | },
310 | "metadata": {
311 | "tags": []
312 | },
313 | "execution_count": 13
314 | }
315 | ]
316 | },
317 | {
318 | "cell_type": "code",
319 | "metadata": {
320 | "id": "ua7cZKWzIP-J",
321 | "colab_type": "code",
322 | "colab": {
323 | "base_uri": "https://localhost:8080/",
324 | "height": 225
325 | },
326 | "outputId": "3cff5ff9-f26a-43ea-9c53-7772d68c48e2"
327 | },
328 | "source": [
329 | "df['MolLogP']"
330 | ],
331 | "execution_count": 14,
332 | "outputs": [
333 | {
334 | "output_type": "execute_result",
335 | "data": {
336 | "text/plain": [
337 | "0 2.59540\n",
338 | "1 2.37650\n",
339 | "2 2.59380\n",
340 | "3 2.02890\n",
341 | "4 2.91890\n",
342 | " ... \n",
343 | "1139 1.98820\n",
344 | "1140 3.42130\n",
345 | "1141 3.60960\n",
346 | "1142 2.56214\n",
347 | "1143 2.02164\n",
348 | "Name: MolLogP, Length: 1144, dtype: float64"
349 | ]
350 | },
351 | "metadata": {
352 | "tags": []
353 | },
354 | "execution_count": 14
355 | }
356 | ]
357 | },
358 | {
359 | "cell_type": "markdown",
360 | "metadata": {
361 | "id": "Mb_loyXfIyhw",
362 | "colab_type": "text"
363 | },
364 | "source": [
365 | "### Selecting two or more columns"
366 | ]
367 | },
368 | {
369 | "cell_type": "code",
370 | "metadata": {
371 | "id": "ZbLCqhFbIRfS",
372 | "colab_type": "code",
373 | "colab": {
374 | "base_uri": "https://localhost:8080/",
375 | "height": 415
376 | },
377 | "outputId": "d1d61288-290a-4fce-db4b-e3fbf635ba4d"
378 | },
379 | "source": [
380 | "df[['MolLogP','NumRotatableBonds']]"
381 | ],
382 | "execution_count": 23,
383 | "outputs": [
384 | {
385 | "output_type": "execute_result",
386 | "data": {
387 | "text/html": [
388 | "\n",
389 | "\n",
402 | "
\n",
403 | " \n",
404 | " \n",
405 | " | \n",
406 | " MolLogP | \n",
407 | " NumRotatableBonds | \n",
408 | "
\n",
409 | " \n",
410 | " \n",
411 | " \n",
412 | " 0 | \n",
413 | " 2.59540 | \n",
414 | " 0.0 | \n",
415 | "
\n",
416 | " \n",
417 | " 1 | \n",
418 | " 2.37650 | \n",
419 | " 0.0 | \n",
420 | "
\n",
421 | " \n",
422 | " 2 | \n",
423 | " 2.59380 | \n",
424 | " 1.0 | \n",
425 | "
\n",
426 | " \n",
427 | " 3 | \n",
428 | " 2.02890 | \n",
429 | " 1.0 | \n",
430 | "
\n",
431 | " \n",
432 | " 4 | \n",
433 | " 2.91890 | \n",
434 | " 1.0 | \n",
435 | "
\n",
436 | " \n",
437 | " ... | \n",
438 | " ... | \n",
439 | " ... | \n",
440 | "
\n",
441 | " \n",
442 | " 1139 | \n",
443 | " 1.98820 | \n",
444 | " 8.0 | \n",
445 | "
\n",
446 | " \n",
447 | " 1140 | \n",
448 | " 3.42130 | \n",
449 | " 2.0 | \n",
450 | "
\n",
451 | " \n",
452 | " 1141 | \n",
453 | " 3.60960 | \n",
454 | " 4.0 | \n",
455 | "
\n",
456 | " \n",
457 | " 1142 | \n",
458 | " 2.56214 | \n",
459 | " 3.0 | \n",
460 | "
\n",
461 | " \n",
462 | " 1143 | \n",
463 | " 2.02164 | \n",
464 | " 1.0 | \n",
465 | "
\n",
466 | " \n",
467 | "
\n",
468 | "
1144 rows × 2 columns
\n",
469 | "
"
470 | ],
471 | "text/plain": [
472 | " MolLogP NumRotatableBonds\n",
473 | "0 2.59540 0.0\n",
474 | "1 2.37650 0.0\n",
475 | "2 2.59380 1.0\n",
476 | "3 2.02890 1.0\n",
477 | "4 2.91890 1.0\n",
478 | "... ... ...\n",
479 | "1139 1.98820 8.0\n",
480 | "1140 3.42130 2.0\n",
481 | "1141 3.60960 4.0\n",
482 | "1142 2.56214 3.0\n",
483 | "1143 2.02164 1.0\n",
484 | "\n",
485 | "[1144 rows x 2 columns]"
486 | ]
487 | },
488 | "metadata": {
489 | "tags": []
490 | },
491 | "execution_count": 23
492 | }
493 | ]
494 | },
495 | {
496 | "cell_type": "code",
497 | "metadata": {
498 | "id": "qxMA09nEIV7e",
499 | "colab_type": "code",
500 | "colab": {
501 | "base_uri": "https://localhost:8080/",
502 | "height": 415
503 | },
504 | "outputId": "6f1131df-d3ff-4735-96d5-166a11c03f56"
505 | },
506 | "source": [
507 | "df.iloc[:,[0,2]]"
508 | ],
509 | "execution_count": 22,
510 | "outputs": [
511 | {
512 | "output_type": "execute_result",
513 | "data": {
514 | "text/html": [
515 | "\n",
516 | "\n",
529 | "
\n",
530 | " \n",
531 | " \n",
532 | " | \n",
533 | " MolLogP | \n",
534 | " NumRotatableBonds | \n",
535 | "
\n",
536 | " \n",
537 | " \n",
538 | " \n",
539 | " 0 | \n",
540 | " 2.59540 | \n",
541 | " 0.0 | \n",
542 | "
\n",
543 | " \n",
544 | " 1 | \n",
545 | " 2.37650 | \n",
546 | " 0.0 | \n",
547 | "
\n",
548 | " \n",
549 | " 2 | \n",
550 | " 2.59380 | \n",
551 | " 1.0 | \n",
552 | "
\n",
553 | " \n",
554 | " 3 | \n",
555 | " 2.02890 | \n",
556 | " 1.0 | \n",
557 | "
\n",
558 | " \n",
559 | " 4 | \n",
560 | " 2.91890 | \n",
561 | " 1.0 | \n",
562 | "
\n",
563 | " \n",
564 | " ... | \n",
565 | " ... | \n",
566 | " ... | \n",
567 | "
\n",
568 | " \n",
569 | " 1139 | \n",
570 | " 1.98820 | \n",
571 | " 8.0 | \n",
572 | "
\n",
573 | " \n",
574 | " 1140 | \n",
575 | " 3.42130 | \n",
576 | " 2.0 | \n",
577 | "
\n",
578 | " \n",
579 | " 1141 | \n",
580 | " 3.60960 | \n",
581 | " 4.0 | \n",
582 | "
\n",
583 | " \n",
584 | " 1142 | \n",
585 | " 2.56214 | \n",
586 | " 3.0 | \n",
587 | "
\n",
588 | " \n",
589 | " 1143 | \n",
590 | " 2.02164 | \n",
591 | " 1.0 | \n",
592 | "
\n",
593 | " \n",
594 | "
\n",
595 | "
1144 rows × 2 columns
\n",
596 | "
"
597 | ],
598 | "text/plain": [
599 | " MolLogP NumRotatableBonds\n",
600 | "0 2.59540 0.0\n",
601 | "1 2.37650 0.0\n",
602 | "2 2.59380 1.0\n",
603 | "3 2.02890 1.0\n",
604 | "4 2.91890 1.0\n",
605 | "... ... ...\n",
606 | "1139 1.98820 8.0\n",
607 | "1140 3.42130 2.0\n",
608 | "1141 3.60960 4.0\n",
609 | "1142 2.56214 3.0\n",
610 | "1143 2.02164 1.0\n",
611 | "\n",
612 | "[1144 rows x 2 columns]"
613 | ]
614 | },
615 | "metadata": {
616 | "tags": []
617 | },
618 | "execution_count": 22
619 | }
620 | ]
621 | },
622 | {
623 | "cell_type": "code",
624 | "metadata": {
625 | "id": "V53KSsZjId1X",
626 | "colab_type": "code",
627 | "colab": {
628 | "base_uri": "https://localhost:8080/",
629 | "height": 415
630 | },
631 | "outputId": "81be4e24-2bac-4dd5-97d7-32ece84768c6"
632 | },
633 | "source": [
634 | "selection = ['MolLogP','NumRotatableBonds', 'logS']\n",
635 | "df[selection]"
636 | ],
637 | "execution_count": 25,
638 | "outputs": [
639 | {
640 | "output_type": "execute_result",
641 | "data": {
642 | "text/html": [
643 | "\n",
644 | "\n",
657 | "
\n",
658 | " \n",
659 | " \n",
660 | " | \n",
661 | " MolLogP | \n",
662 | " NumRotatableBonds | \n",
663 | " logS | \n",
664 | "
\n",
665 | " \n",
666 | " \n",
667 | " \n",
668 | " 0 | \n",
669 | " 2.59540 | \n",
670 | " 0.0 | \n",
671 | " -2.180 | \n",
672 | "
\n",
673 | " \n",
674 | " 1 | \n",
675 | " 2.37650 | \n",
676 | " 0.0 | \n",
677 | " -2.000 | \n",
678 | "
\n",
679 | " \n",
680 | " 2 | \n",
681 | " 2.59380 | \n",
682 | " 1.0 | \n",
683 | " -1.740 | \n",
684 | "
\n",
685 | " \n",
686 | " 3 | \n",
687 | " 2.02890 | \n",
688 | " 1.0 | \n",
689 | " -1.480 | \n",
690 | "
\n",
691 | " \n",
692 | " 4 | \n",
693 | " 2.91890 | \n",
694 | " 1.0 | \n",
695 | " -3.040 | \n",
696 | "
\n",
697 | " \n",
698 | " ... | \n",
699 | " ... | \n",
700 | " ... | \n",
701 | " ... | \n",
702 | "
\n",
703 | " \n",
704 | " 1139 | \n",
705 | " 1.98820 | \n",
706 | " 8.0 | \n",
707 | " 1.144 | \n",
708 | "
\n",
709 | " \n",
710 | " 1140 | \n",
711 | " 3.42130 | \n",
712 | " 2.0 | \n",
713 | " -4.925 | \n",
714 | "
\n",
715 | " \n",
716 | " 1141 | \n",
717 | " 3.60960 | \n",
718 | " 4.0 | \n",
719 | " -3.893 | \n",
720 | "
\n",
721 | " \n",
722 | " 1142 | \n",
723 | " 2.56214 | \n",
724 | " 3.0 | \n",
725 | " -3.790 | \n",
726 | "
\n",
727 | " \n",
728 | " 1143 | \n",
729 | " 2.02164 | \n",
730 | " 1.0 | \n",
731 | " -2.581 | \n",
732 | "
\n",
733 | " \n",
734 | "
\n",
735 | "
1144 rows × 3 columns
\n",
736 | "
"
737 | ],
738 | "text/plain": [
739 | " MolLogP NumRotatableBonds logS\n",
740 | "0 2.59540 0.0 -2.180\n",
741 | "1 2.37650 0.0 -2.000\n",
742 | "2 2.59380 1.0 -1.740\n",
743 | "3 2.02890 1.0 -1.480\n",
744 | "4 2.91890 1.0 -3.040\n",
745 | "... ... ... ...\n",
746 | "1139 1.98820 8.0 1.144\n",
747 | "1140 3.42130 2.0 -4.925\n",
748 | "1141 3.60960 4.0 -3.893\n",
749 | "1142 2.56214 3.0 -3.790\n",
750 | "1143 2.02164 1.0 -2.581\n",
751 | "\n",
752 | "[1144 rows x 3 columns]"
753 | ]
754 | },
755 | "metadata": {
756 | "tags": []
757 | },
758 | "execution_count": 25
759 | }
760 | ]
761 | },
762 | {
763 | "cell_type": "code",
764 | "metadata": {
765 | "id": "rF85rPt1I9sY",
766 | "colab_type": "code",
767 | "colab": {}
768 | },
769 | "source": [
770 | ""
771 | ],
772 | "execution_count": 0,
773 | "outputs": []
774 | }
775 | ]
776 | }
--------------------------------------------------------------------------------
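
Note on the notebook above: it covers attribute access, df['col'], df[[...]], and position-based iloc. For completeness, a sketch of two label-based alternatives, df.loc and df.filter, using the same Delaney CSV:

    import pandas as pd

    df = pd.read_csv('https://raw.githubusercontent.com/dataprofessor/data/master/delaney_solubility_with_descriptors.csv')

    print(df.loc[:, ['MolLogP', 'logS']].head())      # explicit label-based selection
    print(df.filter(items=['MolWt', 'logS']).head())  # same idea via filter
    print(df.filter(like='Mol').head())               # columns whose name contains 'Mol'
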
/python/r_magic_command.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "name": "r-magic-command.ipynb",
7 | "provenance": []
8 | },
9 | "kernelspec": {
10 | "name": "python3",
11 | "display_name": "Python 3"
12 | }
13 | },
14 | "cells": [
15 | {
16 | "cell_type": "markdown",
17 | "metadata": {
18 | "id": "EnyONbNhCqSK",
19 | "colab_type": "text"
20 | },
21 | "source": [
22 | "# **Using R and Python in the Same Notebook**\n",
23 | "\n",
24 | "Chanin Nantasenamat\n",
25 | "\n",
26 | "[*'Data Professor' YouTube channel*](http://youtube.com/dataprofessor)\n",
27 | "\n",
28 | "In this Jupyter notebook, I will show you how to use R and Python in the same notebook.\n",
29 | "\n",
30 | "---"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "metadata": {
36 | "id": "2h-2I4CviFCR",
37 | "colab_type": "code",
38 | "colab": {}
39 | },
40 | "source": [
41 | "# activate R magic\n",
42 | "%load_ext rpy2.ipython"
43 | ],
44 | "execution_count": 0,
45 | "outputs": []
46 | },
47 | {
48 | "cell_type": "markdown",
49 | "metadata": {
50 | "id": "FftFvPLNiZME",
51 | "colab_type": "text"
52 | },
53 | "source": [
54 | "## Python"
55 | ]
56 | },
57 | {
58 | "cell_type": "code",
59 | "metadata": {
60 | "id": "3hPnRI2piJM3",
61 | "colab_type": "code",
62 | "colab": {}
63 | },
64 | "source": [
65 | "import pandas as pd"
66 | ],
67 | "execution_count": 0,
68 | "outputs": []
69 | },
70 | {
71 | "cell_type": "code",
72 | "metadata": {
73 | "id": "yNKM70-ZiPcg",
74 | "colab_type": "code",
75 | "colab": {}
76 | },
77 | "source": [
78 | "x <- 42\n",
79 | "print(x)"
80 | ],
81 | "execution_count": 0,
82 | "outputs": []
83 | },
84 | {
85 | "cell_type": "markdown",
86 | "metadata": {
87 | "id": "dtkChhxpiWEd",
88 | "colab_type": "text"
89 | },
90 | "source": [
91 | "## R"
92 | ]
93 | },
94 | {
95 | "cell_type": "code",
96 | "metadata": {
97 | "id": "ozqbZ3lviTPj",
98 | "colab_type": "code",
99 | "colab": {}
100 | },
101 | "source": [
102 | "%%R\n",
103 | "x <- 42\n",
104 | "print(x)"
105 | ],
106 | "execution_count": 0,
107 | "outputs": []
108 | },
109 | {
110 | "cell_type": "code",
111 | "metadata": {
112 | "id": "napTAYyXiU8r",
113 | "colab_type": "code",
114 | "colab": {}
115 | },
116 | "source": [
117 | "%%R\n",
118 | "install.packages('caret')\n",
119 | "install.packages('mlbench')"
120 | ],
121 | "execution_count": 0,
122 | "outputs": []
123 | },
124 | {
125 | "cell_type": "code",
126 | "metadata": {
127 | "id": "4eB_IbK4kztb",
128 | "colab_type": "code",
129 | "colab": {}
130 | },
131 | "source": [
132 | "%%R\n",
133 | "install.packages('mlbench')"
134 | ],
135 | "execution_count": 0,
136 | "outputs": []
137 | },
138 | {
139 | "cell_type": "code",
140 | "metadata": {
141 | "id": "Bl0feNEUi-Jk",
142 | "colab_type": "code",
143 | "colab": {}
144 | },
145 | "source": [
146 | "%%R\n",
147 | "library(caret)"
148 | ],
149 | "execution_count": 0,
150 | "outputs": []
151 | },
152 | {
153 | "cell_type": "code",
154 | "metadata": {
155 | "id": "zY7WFnrSj4Mr",
156 | "colab_type": "code",
157 | "colab": {}
158 | },
159 | "source": [
160 | "%%R\n",
161 | "############################################\n",
162 | "# Data Professor #\n",
163 | "# http://youtube.com/dataprofessor #\n",
164 | "# http://github.com/dataprofessor #\n",
165 | "# http://facebook.com/dataprofessor #\n",
166 | "# https://www.instagram.com/data.professor #\n",
167 | "############################################\n",
168 | "\n",
169 | "# Importing libraries\n",
170 | "library(mlbench) # Contains several benchmark data sets (especially the Boston Housing dataset)\n",
171 | "library(caret) # Package for machine learning algorithms / CARET stands for Classification And REgression Training\n",
172 | "\n",
173 | "# Importing the Boston Housing data set\n",
174 | "data(BostonHousing)\n",
175 | "\n",
176 | "head(BostonHousing)\n",
177 | "\n",
178 | "# Check to see if there are missing data?\n",
179 | "sum(is.na(BostonHousing))\n",
180 | "\n",
181 | "# To achieve reproducible model; set the random seed number\n",
182 | "set.seed(100)\n",
183 | "\n",
184 | "# Performs stratified random split of the data set\n",
185 | "TrainingIndex <- createDataPartition(BostonHousing$medv, p=0.8, list = FALSE)\n",
186 | "TrainingSet <- BostonHousing[TrainingIndex,] # Training Set\n",
187 | "TestingSet <- BostonHousing[-TrainingIndex,] # Test Set\n",
188 | "\n",
189 | "\n",
190 | "###############################\n",
191 | "\n",
192 | "# Build Training model\n",
193 | "Model <- train(medv ~ ., data = TrainingSet,\n",
194 | " method = \"lm\",\n",
195 | " na.action = na.omit,\n",
196 | " preProcess=c(\"scale\",\"center\"),\n",
197 | " trControl= trainControl(method=\"none\")\n",
198 | ")\n",
199 | "\n",
200 | "# Apply model for prediction\n",
201 | "Model.training <-predict(Model, TrainingSet) # Apply model to make prediction on Training set\n",
202 | "Model.testing <-predict(Model, TestingSet) # Apply model to make prediction on Testing set\n",
203 | "\n",
204 | "# Model performance (Displays scatter plot and performance metrics)\n",
205 | " # Scatter plot of Training set\n",
206 | "plot(TrainingSet$medv,Model.training, col = \"blue\" )\n",
207 | "plot(TestingSet$medv,Model.testing, col = \"blue\" )"
208 | ],
209 | "execution_count": 0,
210 | "outputs": []
211 | },
212 | {
213 | "cell_type": "code",
214 | "metadata": {
215 | "id": "Q6A7bOvbll8D",
216 | "colab_type": "code",
217 | "colab": {}
218 | },
219 | "source": [
220 | ""
221 | ],
222 | "execution_count": 0,
223 | "outputs": []
224 | }
225 | ]
226 | }
--------------------------------------------------------------------------------
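
Note on the notebook above: the %%R cell magic comes from the rpy2 package loaded by %load_ext rpy2.ipython. The same Python-to-R round trip can be done without cell magics by calling rpy2 directly; a minimal sketch:

    import rpy2.robjects as ro

    ro.r('x <- 42')     # evaluate R code from Python
    x = ro.r('x')       # pull the R object back into Python
    print(x[0])         # R scalars come back as length-1 vectors, so this prints 42.0
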
/shiny/001-first-app/app.R:
--------------------------------------------------------------------------------
1 | ####################################
2 | # Data Professor #
3 | # http://youtube.com/dataprofessor #
4 | # http://github.com/dataprofessor #
5 | ####################################
6 |
7 | # Modified from Winston Chang,
8 | # https://shiny.rstudio.com/gallery/shiny-theme-selector.html
9 |
10 | # Concepts about Reactive programming used by Shiny,
11 | # https://shiny.rstudio.com/articles/reactivity-overview.html
12 |
13 | # Load R packages
14 | library(shiny)
15 | library(shinythemes)
16 |
17 |
18 | # Define UI
19 | ui <- fluidPage(theme = shinytheme("cerulean"),
20 | navbarPage(
21 | # theme = "cerulean", # <--- To use a theme, uncomment this
22 | "My first app",
23 | tabPanel("Navbar 1",
24 | sidebarPanel(
25 | tags$h3("Input:"),
26 | textInput("txt1", "Given Name:", ""),
27 | textInput("txt2", "Surname:", ""),
28 |
29 | ), # sidebarPanel
30 | mainPanel(
31 | h1("Header 1"),
32 |
33 | h4("Output 1"),
34 | verbatimTextOutput("txtout"),
35 |
36 | ) # mainPanel
37 |
38 | ), # Navbar 1, tabPanel
39 | tabPanel("Navbar 2", "This panel is intentionally left blank"),
40 | tabPanel("Navbar 3", "This panel is intentionally left blank")
41 |
42 | ) # navbarPage
43 | ) # fluidPage
44 |
45 |
46 | # Define server function
47 | server <- function(input, output) {
48 |
49 | output$txtout <- renderText({
50 | paste( input$txt1, input$txt2, sep = " " )
51 | })
52 | } # server
53 |
54 |
55 | # Create Shiny object
56 | shinyApp(ui = ui, server = server)
57 |
--------------------------------------------------------------------------------
/shiny/002-histogram/app.R:
--------------------------------------------------------------------------------
1 | ####################################
2 | # Data Professor #
3 | # http://youtube.com/dataprofessor #
4 | # http://github.com/dataprofessor #
5 | ####################################
6 |
7 | # Modified from https://shiny.rstudio.com/tutorial/written-tutorial/lesson1/
8 |
9 | library(shiny)
10 | data(airquality)
11 |
12 | # Define UI for app that draws a histogram ----
13 | ui <- fluidPage(
14 |
15 | # App title ----
16 | titlePanel("Ozone level!"),
17 |
18 | # Sidebar layout with input and output definitions ----
19 | sidebarLayout(
20 |
21 | # Sidebar panel for inputs ----
22 | sidebarPanel(
23 |
24 | # Input: Slider for the number of bins ----
25 | sliderInput(inputId = "bins",
26 | label = "Number of bins:",
27 | min = 1,
28 | max = 50,
29 | value = 30)
30 |
31 | ),
32 |
33 | # Main panel for displaying outputs ----
34 | mainPanel(
35 |
36 | # Output: Histogram ----
37 | plotOutput(outputId = "distPlot")
38 |
39 | )
40 | )
41 | )
42 |
43 | # Define server logic required to draw a histogram ----
44 | server <- function(input, output) {
45 |
46 |
47 | output$distPlot <- renderPlot({
48 |
49 | x <- airquality$Ozone
50 | x <- na.omit(x)
51 | bins <- seq(min(x), max(x), length.out = input$bins + 1)
52 |
53 | hist(x, breaks = bins, col = "#75AADB", border = "black",
54 | xlab = "Ozone level",
55 | main = "Histogram of Ozone level")
56 |
57 | })
58 |
59 | }
60 |
61 | # Create Shiny app ----
62 | shinyApp(ui = ui, server = server)
63 |
--------------------------------------------------------------------------------
/shiny/003-play-golf/app.R:
--------------------------------------------------------------------------------
1 | ####################################
2 | # Data Professor #
3 | # http://youtube.com/dataprofessor #
4 | # http://github.com/dataprofessor #
5 | ####################################
6 |
7 |
8 | # Import libraries
9 | library(shiny)
10 | library(shinythemes)
11 | library(data.table)
12 | library(RCurl)
13 | library(randomForest)
14 |
15 | # Read data
16 | weather <- read.csv(text = getURL("https://raw.githubusercontent.com/dataprofessor/data/master/weather-weka.csv") )
17 |
18 | # Build model
19 | model <- randomForest(play ~ ., data = weather, ntree = 500, mtry = 4, importance = TRUE)
20 |
21 | # Save model to RDS file
22 | # saveRDS(model, "model.rds")
23 |
24 | # Read in the RF model
25 | #model <- readRDS("model.rds")
26 |
27 | ####################################
28 | # User interface #
29 | ####################################
30 |
31 | ui <- fluidPage(theme = shinytheme("united"),
32 |
33 | # Page header
34 | headerPanel('Play Golf?'),
35 |
36 | # Input values
37 | sidebarPanel(
38 | HTML("Input parameters
"),
39 |
40 | selectInput("outlook", label = "Outlook:",
41 | choices = list("Sunny" = "sunny", "Overcast" = "overcast", "Rainy" = "rainy"),
42 | selected = "Rainy"),
43 | sliderInput("temperature", "Temperature:",
44 | min = 64, max = 86,
45 | value = 70),
46 | sliderInput("humidity", "Humidity:",
47 | min = 65, max = 96,
48 | value = 90),
49 | selectInput("windy", label = "Windy:",
50 | choices = list("Yes" = "TRUE", "No" = "FALSE"),
51 | selected = "TRUE"),
52 |
53 | actionButton("submitbutton", "Submit", class = "btn btn-primary")
54 | ),
55 |
56 | mainPanel(
57 | tags$label(h3('Status/Output')), # Status/Output Text Box
58 | verbatimTextOutput('contents'),
59 | tableOutput('tabledata') # Prediction results table
60 |
61 | )
62 | )
63 |
64 | ####################################
65 | # Server #
66 | ####################################
67 |
68 | server <- function(input, output, session) {
69 |
70 | # Input Data
71 | datasetInput <- reactive({
72 |
73 | # outlook,temperature,humidity,windy,play
74 | df <- data.frame(
75 | Name = c("outlook",
76 | "temperature",
77 | "humidity",
78 | "windy"),
79 | Value = as.character(c(input$outlook,
80 | input$temperature,
81 | input$humidity,
82 | input$windy)),
83 | stringsAsFactors = FALSE)
84 |
85 | play <- "play"
86 | df <- rbind(df, play)
87 | input <- transpose(df)
88 | write.table(input,"input.csv", sep=",", quote = FALSE, row.names = FALSE, col.names = FALSE) # write the single-row input, then read it back so read.csv assigns proper column types
89 |
90 | test <- read.csv(paste("input", ".csv", sep=""), header = TRUE)
91 |
92 | test$outlook <- factor(test$outlook, levels = c("overcast", "rainy", "sunny"))
93 |
94 |
95 | Output <- data.frame(Prediction=predict(model,test), round(predict(model,test,type="prob"), 3))
96 | print(Output)
97 |
98 | })
99 |
100 | # Status/Output Text Box
101 | output$contents <- renderPrint({
102 | if (input$submitbutton>0) {
103 | isolate("Calculation complete.")
104 | } else {
105 | return("Server is ready for calculation.")
106 | }
107 | })
108 |
109 | # Prediction results table
110 | output$tabledata <- renderTable({
111 | if (input$submitbutton>0) {
112 | isolate(datasetInput())
113 | }
114 | })
115 |
116 | }
117 |
118 | ####################################
119 | # Create the shiny app #
120 | ####################################
121 | shinyApp(ui = ui, server = server)
122 |
--------------------------------------------------------------------------------
/shiny/004-iris-predictor/app-numeric.R:
--------------------------------------------------------------------------------
1 | ############################################
2 | # Data Professor #
3 | # http://youtube.com/dataprofessor #
4 | # http://github.com/dataprofessor #
5 | # http://facebook.com/dataprofessor #
6 | # https://www.instagram.com/data.professor #
7 | ############################################
8 |
9 | # Import libraries
10 | library(shiny)
11 | library(data.table)
12 | library(randomForest)
13 |
14 | # Read in the RF model
15 | model <- readRDS("model.rds")
16 |
17 |
18 | ####################################
19 | # User interface #
20 | ####################################
21 |
22 | ui <- pageWithSidebar(
23 |
24 | # Page header
25 | headerPanel('Iris Predictor'),
26 |
27 | # Input values
28 | sidebarPanel(
29 | #HTML("Input parameters
"),
30 | tags$label(h3('Input parameters')),
31 | numericInput("Sepal.Length",
32 | label = "Sepal Length",
33 | value = 5.1),
34 | numericInput("Sepal.Width",
35 | label = "Sepal Width",
36 | value = 3.6),
37 | numericInput("Petal.Length",
38 | label = "Petal Length",
39 | value = 1.4),
40 | numericInput("Petal.Width",
41 | label = "Petal Width",
42 | value = 0.2),
43 |
44 | actionButton("submitbutton", "Submit",
45 | class = "btn btn-primary")
46 | ),
47 |
48 | mainPanel(
49 | tags$label(h3('Status/Output')), # Status/Output Text Box
50 | verbatimTextOutput('contents'),
51 | tableOutput('tabledata') # Prediction results table
52 |
53 | )
54 | )
55 |
56 | ####################################
57 | # Server #
58 | ####################################
59 |
60 | server<- function(input, output, session) {
61 |
62 | # Input Data
63 | datasetInput <- reactive({
64 |
65 | df <- data.frame(
66 | Name = c("Sepal Length",
67 | "Sepal Width",
68 | "Petal Length",
69 | "Petal Width"),
70 | Value = as.character(c(input$Sepal.Length,
71 | input$Sepal.Width,
72 | input$Petal.Length,
73 | input$Petal.Width)),
74 | stringsAsFactors = FALSE)
75 |
76 | Species <- 0
77 | df <- rbind(df, Species)
78 | input <- transpose(df)
79 | write.table(input,"input.csv", sep=",", quote = FALSE, row.names = FALSE, col.names = FALSE)
80 |
81 | test <- read.csv(paste("input", ".csv", sep=""), header = TRUE)
82 |
83 | Output <- data.frame(Prediction=predict(model,test), round(predict(model,test,type="prob"), 3))
84 | print(Output)
85 |
86 | })
87 |
88 | # Status/Output Text Box
89 | output$contents <- renderPrint({
90 | if (input$submitbutton>0) {
91 | isolate("Calculation complete.")
92 | } else {
93 | return("Server is ready for calculation.")
94 | }
95 | })
96 |
97 | # Prediction results table
98 | output$tabledata <- renderTable({
99 | if (input$submitbutton>0) {
100 | isolate(datasetInput())
101 | }
102 | })
103 |
104 | }
105 |
106 | ####################################
107 | # Create the shiny app #
108 | ####################################
109 | shinyApp(ui = ui, server = server)
110 |
--------------------------------------------------------------------------------
/shiny/004-iris-predictor/app-slider.R:
--------------------------------------------------------------------------------
1 | ############################################
2 | # Data Professor #
3 | # http://youtube.com/dataprofessor #
4 | # http://github.com/dataprofessor #
5 | # http://facebook.com/dataprofessor #
6 | # https://www.instagram.com/data.professor #
7 | ############################################
8 |
9 | # Import libraries
10 | library(shiny)
11 | library(data.table)
12 | library(randomForest)
13 |
14 | # Read in the RF model
15 | model <- readRDS("model.rds")
16 |
17 | # Training set
18 | TrainSet <- read.csv("training.csv", header = TRUE)
19 | TrainSet <- TrainSet[,-1]
20 |
21 |
22 | ####################################
23 | # User interface #
24 | ####################################
25 |
26 | ui <- pageWithSidebar(
27 |
28 | # Page header
29 | headerPanel('Iris Predictor'),
30 |
31 | # Input values
32 | sidebarPanel(
33 | HTML("Input parameters"),
34 | sliderInput("Sepal.Length", label = "Sepal Length", value = 5.0,
35 | min = min(TrainSet$Sepal.Length),
36 | max = max(TrainSet$Sepal.Length)
37 | ),
38 | sliderInput("Sepal.Width", label = "Sepal Width", value = 3.6,
39 | min = min(TrainSet$Sepal.Width),
40 | max = max(TrainSet$Sepal.Width)),
41 | sliderInput("Petal.Length", label = "Petal Length", value = 1.4,
42 | min = min(TrainSet$Petal.Length),
43 | max = max(TrainSet$Petal.Length)),
44 | sliderInput("Petal.Width", label = "Petal Width", value = 0.2,
45 | min = min(TrainSet$Petal.Width),
46 | max = max(TrainSet$Petal.Width)),
47 |
48 | actionButton("submitbutton", "Submit", class = "btn btn-primary")
49 | ),
50 |
51 | mainPanel(
52 | tags$label(h3('Status/Output')), # Status/Output Text Box
53 | verbatimTextOutput('contents'),
54 | tableOutput('tabledata') # Prediction results table
55 |
56 | )
57 | )
58 |
59 | ####################################
60 | # Server #
61 | ####################################
62 |
63 | server<- function(input, output, session) {
64 |
65 | # Input Data
66 | datasetInput <- reactive({
67 |
68 | df <- data.frame(
69 | Name = c("Sepal Length",
70 | "Sepal Width",
71 | "Petal Length",
72 | "Petal Width"),
73 | Value = as.character(c(input$Sepal.Length,
74 | input$Sepal.Width,
75 | input$Petal.Length,
76 | input$Petal.Width)),
77 | stringsAsFactors = FALSE)
78 |
79 | Species <- 0
80 | df <- rbind(df, Species)
81 | input <- transpose(df)
82 | write.table(input,"input.csv", sep=",", quote = FALSE, row.names = FALSE, col.names = FALSE)
83 |
84 | test <- read.csv(paste("input", ".csv", sep=""), header = TRUE)
85 |
86 | Output <- data.frame(Prediction=predict(model,test), round(predict(model,test,type="prob"), 3))
87 | print(Output)
88 |
89 | })
90 |
91 | # Status/Output Text Box
92 | output$contents <- renderPrint({
93 | if (input$submitbutton>0) {
94 | isolate("Calculation complete.")
95 | } else {
96 | return("Server is ready for calculation.")
97 | }
98 | })
99 |
100 | # Prediction results table
101 | output$tabledata <- renderTable({
102 | if (input$submitbutton>0) {
103 | isolate(datasetInput())
104 | }
105 | })
106 |
107 | }
108 |
109 | ####################################
110 | # Create the shiny app #
111 | ####################################
112 | shinyApp(ui = ui, server = server)
--------------------------------------------------------------------------------
/shiny/004-iris-predictor/model.R:
--------------------------------------------------------------------------------
1 | ####################################
2 | # Data Professor #
3 | # http://youtube.com/dataprofessor #
4 | # http://github.com/dataprofessor #
5 | ####################################
6 |
7 | # Importing libraries
8 | library(RCurl) # for downloading the iris CSV file
9 | library(randomForest)
10 | library(caret)
11 |
12 | # Importing the Iris data set
13 | iris <- read.csv(text = getURL("https://raw.githubusercontent.com/dataprofessor/data/master/iris.csv") )
14 |
15 | # Performs stratified random split of the data set
16 | TrainingIndex <- createDataPartition(iris$Species, p=0.8, list = FALSE)
17 | TrainingSet <- iris[TrainingIndex,] # Training Set
18 | TestingSet <- iris[-TrainingIndex,] # Test Set
19 |
20 | write.csv(TrainingSet, "training.csv")
21 | write.csv(TestingSet, "testing.csv")
22 |
23 | TrainSet <- read.csv("training.csv", header = TRUE)
24 | TrainSet <- TrainSet[,-1]
25 | TrainSet$Species <- as.factor(TrainSet$Species) # read.csv() no longer auto-converts strings to factors in R >= 4.0; randomForest() needs a factor outcome for classification
26 |
27 | # Build the Random Forest model
28 | model <- randomForest(Species ~ ., data = TrainSet, ntree = 500, mtry = 4, importance = TRUE)
29 |
30 | # Save model to RDS file
31 | saveRDS(model, "model.rds")
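# The Shiny app (app.R) loads this file at startup with readRDS("model.rds").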
32 |
--------------------------------------------------------------------------------
/shiny/004-iris-predictor/model.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dataprofessor/code/d494f8093073990fcf77061bd740a0e4c2d40020/shiny/004-iris-predictor/model.rds
--------------------------------------------------------------------------------
/shiny/004-iris-predictor/testing.csv:
--------------------------------------------------------------------------------
1 | "","Sepal.Length","Sepal.Width","Petal.Length","Petal.Width","Species"
2 | "5",5,3.6,1.4,0.2,"setosa"
3 | "9",4.4,2.9,1.4,0.2,"setosa"
4 | "14",4.3,3,1.1,0.1,"setosa"
5 | "19",5.7,3.8,1.7,0.3,"setosa"
6 | "22",5.1,3.7,1.5,0.4,"setosa"
7 | "26",5,3,1.6,0.2,"setosa"
8 | "29",5.2,3.4,1.4,0.2,"setosa"
9 | "37",5.5,3.5,1.3,0.2,"setosa"
10 | "41",5,3.5,1.3,0.3,"setosa"
11 | "42",4.5,2.3,1.3,0.3,"setosa"
12 | "55",6.5,2.8,4.6,1.5,"versicolor"
13 | "56",5.7,2.8,4.5,1.3,"versicolor"
14 | "61",5,2,3.5,1,"versicolor"
15 | "65",5.6,2.9,3.6,1.3,"versicolor"
16 | "66",6.7,3.1,4.4,1.4,"versicolor"
17 | "68",5.8,2.7,4.1,1,"versicolor"
18 | "73",6.3,2.5,4.9,1.5,"versicolor"
19 | "90",5.5,2.5,4,1.3,"versicolor"
20 | "92",6.1,3,4.6,1.4,"versicolor"
21 | "99",5.1,2.5,3,1.1,"versicolor"
22 | "103",7.1,3,5.9,2.1,"virginica"
23 | "111",6.5,3.2,5.1,2,"virginica"
24 | "112",6.4,2.7,5.3,1.9,"virginica"
25 | "113",6.8,3,5.5,2.1,"virginica"
26 | "120",6,2.2,5,1.5,"virginica"
27 | "133",6.4,2.8,5.6,2.2,"virginica"
28 | "134",6.3,2.8,5.1,1.5,"virginica"
29 | "136",7.7,3,6.1,2.3,"virginica"
30 | "146",6.7,3,5.2,2.3,"virginica"
31 | "147",6.3,2.5,5,1.9,"virginica"
32 |
--------------------------------------------------------------------------------
/shiny/004-iris-predictor/training.csv:
--------------------------------------------------------------------------------
1 | "","Sepal.Length","Sepal.Width","Petal.Length","Petal.Width","Species"
2 | "1",5.1,3.5,1.4,0.2,"setosa"
3 | "2",4.9,3,1.4,0.2,"setosa"
4 | "3",4.7,3.2,1.3,0.2,"setosa"
5 | "4",4.6,3.1,1.5,0.2,"setosa"
6 | "6",5.4,3.9,1.7,0.4,"setosa"
7 | "7",4.6,3.4,1.4,0.3,"setosa"
8 | "8",5,3.4,1.5,0.2,"setosa"
9 | "10",4.9,3.1,1.5,0.1,"setosa"
10 | "11",5.4,3.7,1.5,0.2,"setosa"
11 | "12",4.8,3.4,1.6,0.2,"setosa"
12 | "13",4.8,3,1.4,0.1,"setosa"
13 | "15",5.8,4,1.2,0.2,"setosa"
14 | "16",5.7,4.4,1.5,0.4,"setosa"
15 | "17",5.4,3.9,1.3,0.4,"setosa"
16 | "18",5.1,3.5,1.4,0.3,"setosa"
17 | "20",5.1,3.8,1.5,0.3,"setosa"
18 | "21",5.4,3.4,1.7,0.2,"setosa"
19 | "23",4.6,3.6,1,0.2,"setosa"
20 | "24",5.1,3.3,1.7,0.5,"setosa"
21 | "25",4.8,3.4,1.9,0.2,"setosa"
22 | "27",5,3.4,1.6,0.4,"setosa"
23 | "28",5.2,3.5,1.5,0.2,"setosa"
24 | "30",4.7,3.2,1.6,0.2,"setosa"
25 | "31",4.8,3.1,1.6,0.2,"setosa"
26 | "32",5.4,3.4,1.5,0.4,"setosa"
27 | "33",5.2,4.1,1.5,0.1,"setosa"
28 | "34",5.5,4.2,1.4,0.2,"setosa"
29 | "35",4.9,3.1,1.5,0.1,"setosa"
30 | "36",5,3.2,1.2,0.2,"setosa"
31 | "38",4.9,3.1,1.5,0.1,"setosa"
32 | "39",4.4,3,1.3,0.2,"setosa"
33 | "40",5.1,3.4,1.5,0.2,"setosa"
34 | "43",4.4,3.2,1.3,0.2,"setosa"
35 | "44",5,3.5,1.6,0.6,"setosa"
36 | "45",5.1,3.8,1.9,0.4,"setosa"
37 | "46",4.8,3,1.4,0.3,"setosa"
38 | "47",5.1,3.8,1.6,0.2,"setosa"
39 | "48",4.6,3.2,1.4,0.2,"setosa"
40 | "49",5.3,3.7,1.5,0.2,"setosa"
41 | "50",5,3.3,1.4,0.2,"setosa"
42 | "51",7,3.2,4.7,1.4,"versicolor"
43 | "52",6.4,3.2,4.5,1.5,"versicolor"
44 | "53",6.9,3.1,4.9,1.5,"versicolor"
45 | "54",5.5,2.3,4,1.3,"versicolor"
46 | "57",6.3,3.3,4.7,1.6,"versicolor"
47 | "58",4.9,2.4,3.3,1,"versicolor"
48 | "59",6.6,2.9,4.6,1.3,"versicolor"
49 | "60",5.2,2.7,3.9,1.4,"versicolor"
50 | "62",5.9,3,4.2,1.5,"versicolor"
51 | "63",6,2.2,4,1,"versicolor"
52 | "64",6.1,2.9,4.7,1.4,"versicolor"
53 | "67",5.6,3,4.5,1.5,"versicolor"
54 | "69",6.2,2.2,4.5,1.5,"versicolor"
55 | "70",5.6,2.5,3.9,1.1,"versicolor"
56 | "71",5.9,3.2,4.8,1.8,"versicolor"
57 | "72",6.1,2.8,4,1.3,"versicolor"
58 | "74",6.1,2.8,4.7,1.2,"versicolor"
59 | "75",6.4,2.9,4.3,1.3,"versicolor"
60 | "76",6.6,3,4.4,1.4,"versicolor"
61 | "77",6.8,2.8,4.8,1.4,"versicolor"
62 | "78",6.7,3,5,1.7,"versicolor"
63 | "79",6,2.9,4.5,1.5,"versicolor"
64 | "80",5.7,2.6,3.5,1,"versicolor"
65 | "81",5.5,2.4,3.8,1.1,"versicolor"
66 | "82",5.5,2.4,3.7,1,"versicolor"
67 | "83",5.8,2.7,3.9,1.2,"versicolor"
68 | "84",6,2.7,5.1,1.6,"versicolor"
69 | "85",5.4,3,4.5,1.5,"versicolor"
70 | "86",6,3.4,4.5,1.6,"versicolor"
71 | "87",6.7,3.1,4.7,1.5,"versicolor"
72 | "88",6.3,2.3,4.4,1.3,"versicolor"
73 | "89",5.6,3,4.1,1.3,"versicolor"
74 | "91",5.5,2.6,4.4,1.2,"versicolor"
75 | "93",5.8,2.6,4,1.2,"versicolor"
76 | "94",5,2.3,3.3,1,"versicolor"
77 | "95",5.6,2.7,4.2,1.3,"versicolor"
78 | "96",5.7,3,4.2,1.2,"versicolor"
79 | "97",5.7,2.9,4.2,1.3,"versicolor"
80 | "98",6.2,2.9,4.3,1.3,"versicolor"
81 | "100",5.7,2.8,4.1,1.3,"versicolor"
82 | "101",6.3,3.3,6,2.5,"virginica"
83 | "102",5.8,2.7,5.1,1.9,"virginica"
84 | "104",6.3,2.9,5.6,1.8,"virginica"
85 | "105",6.5,3,5.8,2.2,"virginica"
86 | "106",7.6,3,6.6,2.1,"virginica"
87 | "107",4.9,2.5,4.5,1.7,"virginica"
88 | "108",7.3,2.9,6.3,1.8,"virginica"
89 | "109",6.7,2.5,5.8,1.8,"virginica"
90 | "110",7.2,3.6,6.1,2.5,"virginica"
91 | "114",5.7,2.5,5,2,"virginica"
92 | "115",5.8,2.8,5.1,2.4,"virginica"
93 | "116",6.4,3.2,5.3,2.3,"virginica"
94 | "117",6.5,3,5.5,1.8,"virginica"
95 | "118",7.7,3.8,6.7,2.2,"virginica"
96 | "119",7.7,2.6,6.9,2.3,"virginica"
97 | "121",6.9,3.2,5.7,2.3,"virginica"
98 | "122",5.6,2.8,4.9,2,"virginica"
99 | "123",7.7,2.8,6.7,2,"virginica"
100 | "124",6.3,2.7,4.9,1.8,"virginica"
101 | "125",6.7,3.3,5.7,2.1,"virginica"
102 | "126",7.2,3.2,6,1.8,"virginica"
103 | "127",6.2,2.8,4.8,1.8,"virginica"
104 | "128",6.1,3,4.9,1.8,"virginica"
105 | "129",6.4,2.8,5.6,2.1,"virginica"
106 | "130",7.2,3,5.8,1.6,"virginica"
107 | "131",7.4,2.8,6.1,1.9,"virginica"
108 | "132",7.9,3.8,6.4,2,"virginica"
109 | "135",6.1,2.6,5.6,1.4,"virginica"
110 | "137",6.3,3.4,5.6,2.4,"virginica"
111 | "138",6.4,3.1,5.5,1.8,"virginica"
112 | "139",6,3,4.8,1.8,"virginica"
113 | "140",6.9,3.1,5.4,2.1,"virginica"
114 | "141",6.7,3.1,5.6,2.4,"virginica"
115 | "142",6.9,3.1,5.1,2.3,"virginica"
116 | "143",5.8,2.7,5.1,1.9,"virginica"
117 | "144",6.8,3.2,5.9,2.3,"virginica"
118 | "145",6.7,3.3,5.7,2.5,"virginica"
119 | "148",6.5,3,5.2,2,"virginica"
120 | "149",6.2,3.4,5.4,2.3,"virginica"
121 | "150",5.9,3,5.1,1.8,"virginica"
122 |
--------------------------------------------------------------------------------
/shiny/005-bmi/about.md:
--------------------------------------------------------------------------------
1 | #### What is BMI?
2 |
3 | **Body Mass Index (BMI)** is a value derived from a person's weight and height [1].
4 |
5 | #### Calculating the BMI
6 | BMI is computed by dividing the person's weight in kilograms (kg) by the square of their height in meters (m):
7 |
8 | > BMI = kg/m^2
9 |
10 | where *kg* is the person's weight in kilograms and *m^2* is the square of the person's height in meters.
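
For example, a person weighing 70 kg with a height of 1.75 m has a BMI of 70 / (1.75 × 1.75) ≈ 22.9.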
11 |
12 | #### About this BMI Calculator
13 |
14 | This *BMI Calculator* is for adults 20 years and older. Further information on calculating BMI for children and teenagers is available from the CDC [2].
15 |
16 | #### References
17 | 1. Centers for Disease Control. [Body Mass Index (BMI)](https://www.cdc.gov/healthyweight/assessing/bmi/index.html), Accessed January 26, 2020.
18 | 2. Centers for Disease Control. [BMI Percentile Calculator for Child and Teen](https://www.cdc.gov/healthyweight/bmi/calculator.html), Accessed January 26, 2020.
19 |
--------------------------------------------------------------------------------
/shiny/005-bmi/app.R:
--------------------------------------------------------------------------------
1 | ############################################
2 | # Data Professor #
3 | # http://youtube.com/dataprofessor #
4 | # http://github.com/dataprofessor #
5 | # http://facebook.com/dataprofessor #
6 | # https://www.instagram.com/data.professor #
7 | ############################################
8 |
9 | library(shiny)
10 | library(shinythemes)
11 |
12 |
13 | ####################################
14 | # User Interface #
15 | ####################################
16 | ui <- fluidPage(theme = shinytheme("united"),
17 | navbarPage("BMI Calculator:",
18 |
19 | tabPanel("Home",
20 | # Input values
21 | sidebarPanel(
22 | HTML("Input parameters
"),
23 | sliderInput("height",
24 | label = "Height",
25 | value = 175,
26 | min = 40,
27 | max = 250),
28 | sliderInput("weight",
29 | label = "Weight",
30 | value = 70,
31 | min = 20,
32 | max = 100),
33 |
34 | actionButton("submitbutton",
35 | "Submit",
36 | class = "btn btn-primary")
37 | ),
38 |
39 | mainPanel(
40 | tags$label(h3('Status/Output')), # Status/Output Text Box
41 | verbatimTextOutput('contents'),
42 | tableOutput('tabledata') # Results table
43 | ) # mainPanel()
44 |
45 | ), #tabPanel(), Home
46 |
47 | tabPanel("About",
48 | titlePanel("About"),
49 | div(includeMarkdown("about.md"),
50 | align="justify")
51 | ) #tabPanel(), About
52 |
53 | ) # navbarPage()
54 | ) # fluidPage()
55 |
56 |
57 | ####################################
58 | # Server #
59 | ####################################
60 | server <- function(input, output, session) {
61 |
62 | # Input Data
63 | datasetInput <- reactive({
64 |
65 | bmi <- input$weight/( (input$height/100) * (input$height/100) )
66 | bmi <- data.frame(bmi)
67 | names(bmi) <- "BMI"
68 | print(bmi)
69 |
70 | })
71 |
72 | # Status/Output Text Box
73 | output$contents <- renderPrint({
74 | if (input$submitbutton>0) {
75 | isolate("Calculation complete.")
76 | } else {
77 | return("Server is ready for calculation.")
78 | }
79 | })
80 |
81 | # Prediction results table
82 | output$tabledata <- renderTable({
83 | if (input$submitbutton>0) {
84 | isolate(datasetInput())
85 | }
86 | })
87 |
88 | }
89 |
90 |
91 | ####################################
92 | # Create Shiny App #
93 | ####################################
94 | shinyApp(ui = ui, server = server)
95 |
--------------------------------------------------------------------------------
/streamlit/part1/myapp.py:
--------------------------------------------------------------------------------
1 | import yfinance as yf
2 | import streamlit as st
3 |
4 | st.write("""
5 | # Simple Stock Price App
6 |
7 | Shown are the stock closing price and volume of Google!
8 |
9 | """)
10 |
11 | # https://towardsdatascience.com/how-to-get-stock-data-using-python-c0de1df17e75
12 | #define the ticker symbol
13 | tickerSymbol = 'GOOGL'
14 | #get data on this ticker
15 | tickerData = yf.Ticker(tickerSymbol)
16 | #get the historical prices for this ticker
17 | tickerDf = tickerData.history(period='1d', start='2010-5-31', end='2020-5-31')
18 | # Open High Low Close Volume Dividends Stock Splits
19 |
20 | st.line_chart(tickerDf.Close)
21 | st.line_chart(tickerDf.Volume)
22 |
--------------------------------------------------------------------------------
/streamlit/part1/myapp2.py:
--------------------------------------------------------------------------------
1 | import yfinance as yf
2 | import streamlit as st
3 |
4 | st.write("""
5 | # Simple Stock Price App
6 |
7 | Shown are the stock **closing price** and ***volume*** of Google!
8 |
9 | """)
10 |
11 | # https://towardsdatascience.com/how-to-get-stock-data-using-python-c0de1df17e75
12 | #define the ticker symbol
13 | tickerSymbol = 'GOOGL'
14 | #get data on this ticker
15 | tickerData = yf.Ticker(tickerSymbol)
16 | #get the historical prices for this ticker
17 | tickerDf = tickerData.history(period='1d', start='2010-5-31', end='2020-5-31')
18 | # Open High Low Close Volume Dividends Stock Splits
19 |
20 | st.write("""
21 | ## Closing Price
22 | """)
23 | st.line_chart(tickerDf.Close)
24 | st.write("""
25 | ## Volume
26 | """)
27 | st.line_chart(tickerDf.Volume)
28 |
--------------------------------------------------------------------------------
/streamlit/part10/sp500-app.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 | import pandas as pd
3 | import base64
4 | import matplotlib.pyplot as plt
5 | import seaborn as sns
6 | import numpy as np
7 | import yfinance as yf
8 |
9 | st.title('S&P 500 App')
10 |
11 | st.markdown("""
12 | This app retrieves the list of **S&P 500** companies (from Wikipedia) and their corresponding **stock closing price** (year-to-date)!
13 | * **Python libraries:** base64, pandas, streamlit, numpy, matplotlib, seaborn, yfinance
14 | * **Data source:** [Wikipedia](https://en.wikipedia.org/wiki/List_of_S%26P_500_companies).
15 | """)
16 |
17 | st.sidebar.header('User Input Features')
18 |
19 | # Web scraping of S&P 500 data
20 | #
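# (st.cache is deprecated in recent Streamlit releases; st.cache_data is the current
# equivalent for cached data-loading functions like this one.)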
21 | @st.cache
22 | def load_data():
23 | url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
24 | html = pd.read_html(url, header = 0)
25 | df = html[0]
26 | return df
27 |
28 | df = load_data()
29 | sector = df.groupby('GICS Sector')
30 |
31 | # Sidebar - Sector selection
32 | sorted_sector_unique = sorted( df['GICS Sector'].unique() )
33 | selected_sector = st.sidebar.multiselect('Sector', sorted_sector_unique, sorted_sector_unique)
34 |
35 | # Filtering data
36 | df_selected_sector = df[ (df['GICS Sector'].isin(selected_sector)) ]
37 |
38 | st.header('Display Companies in Selected Sector')
39 | st.write('Data Dimension: ' + str(df_selected_sector.shape[0]) + ' rows and ' + str(df_selected_sector.shape[1]) + ' columns.')
40 | st.dataframe(df_selected_sector)
41 |
42 | # Download S&P500 data
43 | # https://discuss.streamlit.io/t/how-to-download-file-in-streamlit/1806
44 | def filedownload(df):
45 | csv = df.to_csv(index=False)
46 | b64 = base64.b64encode(csv.encode()).decode() # strings <-> bytes conversions
47 | href = f'<a href="data:file/csv;base64,{b64}" download="SP500.csv">Download CSV File</a>'
48 | return href
49 |
50 | st.markdown(filedownload(df_selected_sector), unsafe_allow_html=True)
51 |
52 | # https://pypi.org/project/yfinance/
53 |
54 | data = yf.download(
55 | tickers = list(df_selected_sector[:10].Symbol),
56 | period = "ytd",
57 | interval = "1d",
58 | group_by = 'ticker',
59 | auto_adjust = True,
60 | prepost = True,
61 | threads = True,
62 | proxy = None
63 | )
64 |
65 | # Plot Closing Price of Query Symbol
66 | def price_plot(symbol):
67 | df = pd.DataFrame(data[symbol].Close)
68 | df['Date'] = df.index
69 | plt.fill_between(df.Date, df.Close, color='skyblue', alpha=0.3)
70 | plt.plot(df.Date, df.Close, color='skyblue', alpha=0.8)
71 | plt.xticks(rotation=90)
72 | plt.title(symbol, fontweight='bold')
73 | plt.xlabel('Date', fontweight='bold')
74 | plt.ylabel('Closing Price', fontweight='bold')
75 | return st.pyplot()
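# (st.pyplot() with no argument draws matplotlib's current global figure; newer
# Streamlit versions expect an explicit figure, e.g. st.pyplot(plt.gcf()).)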
76 |
77 | num_company = st.sidebar.slider('Number of Companies', 1, 5)
78 |
79 | if st.button('Show Plots'):
80 | st.header('Stock Closing Price')
81 | for i in list(df_selected_sector.Symbol)[:num_company]:
82 | price_plot(i)
83 |
--------------------------------------------------------------------------------
/streamlit/part12/crypto-price-app.py:
--------------------------------------------------------------------------------
1 | # This app is for educational purposes only. Insights gained are not financial advice. Use at your own risk!
2 | import streamlit as st
3 | from PIL import Image
4 | import pandas as pd
5 | import base64
6 | import matplotlib.pyplot as plt
7 | from bs4 import BeautifulSoup
8 | import requests
9 | import json
10 | import time
11 | #---------------------------------#
12 | # New feature (make sure to upgrade your streamlit library)
13 | # pip install --upgrade streamlit
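# (In newer Streamlit releases the beta_ prefixes used below were dropped:
# st.beta_expander -> st.expander, st.beta_columns -> st.columns.)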
14 |
15 | #---------------------------------#
16 | # Page layout
17 | ## Page expands to full width
18 | st.set_page_config(layout="wide")
19 | #---------------------------------#
20 | # Title
21 |
22 | image = Image.open('logo.jpg')
23 |
24 | st.image(image, width = 500)
25 |
26 | st.title('Crypto Price App')
27 | st.markdown("""
28 | This app retrieves cryptocurrency prices for the top 100 cryptocurrencies from **CoinMarketCap**!
29 |
30 | """)
31 | #---------------------------------#
32 | # About
33 | expander_bar = st.beta_expander("About")
34 | expander_bar.markdown("""
35 | * **Python libraries:** base64, pandas, streamlit, numpy, matplotlib, seaborn, BeautifulSoup, requests, json, time
36 | * **Data source:** [CoinMarketCap](http://coinmarketcap.com).
37 | * **Credit:** Web scraper adapted from the Medium article *[Web Scraping Crypto Prices With Python](https://towardsdatascience.com/web-scraping-crypto-prices-with-python-41072ea5b5bf)* written by [Bryan Feng](https://medium.com/@bryanf).
38 | """)
39 |
40 |
41 | #---------------------------------#
42 | # Page layout (continued)
43 | ## Divide page to 3 columns (col1 = sidebar, col2 and col3 = page contents)
44 | col1 = st.sidebar
45 | col2, col3 = st.beta_columns((2,1))
46 |
47 | #---------------------------------#
48 | # Sidebar + Main panel
49 | col1.header('Input Options')
50 |
51 | ## Sidebar - Currency price unit
52 | currency_price_unit = col1.selectbox('Select currency for price', ('USD', 'BTC', 'ETH'))
53 |
54 | # Web scraping of CoinMarketCap data
55 | @st.cache
56 | def load_data():
57 | cmc = requests.get('https://coinmarketcap.com')
58 | soup = BeautifulSoup(cmc.content, 'html.parser')
59 |
60 | data = soup.find('script', id='__NEXT_DATA__', type='application/json')
61 | coins = {}
62 | coin_data = json.loads(data.contents[0])
63 | listings = coin_data['props']['initialState']['cryptocurrency']['listingLatest']['data']
64 | for i in listings:
65 | coins[str(i['id'])] = i['slug']
66 |
67 | coin_name = []
68 | coin_symbol = []
69 | market_cap = []
70 | percent_change_1h = []
71 | percent_change_24h = []
72 | percent_change_7d = []
73 | price = []
74 | volume_24h = []
75 |
76 | for i in listings:
77 | coin_name.append(i['slug'])
78 | coin_symbol.append(i['symbol'])
79 | price.append(i['quote'][currency_price_unit]['price'])
80 | percent_change_1h.append(i['quote'][currency_price_unit]['percent_change_1h'])
81 | percent_change_24h.append(i['quote'][currency_price_unit]['percent_change_24h'])
82 | percent_change_7d.append(i['quote'][currency_price_unit]['percent_change_7d'])
83 | market_cap.append(i['quote'][currency_price_unit]['market_cap'])
84 | volume_24h.append(i['quote'][currency_price_unit]['volume_24h'])
85 |
86 | df = pd.DataFrame(columns=['coin_name', 'coin_symbol', 'market_cap', 'percent_change_1h', 'percent_change_24h', 'percent_change_7d', 'price', 'volume_24h'])
87 | df['coin_name'] = coin_name
88 | df['coin_symbol'] = coin_symbol
89 | df['price'] = price
90 | df['percent_change_1h'] = percent_change_1h
91 | df['percent_change_24h'] = percent_change_24h
92 | df['percent_change_7d'] = percent_change_7d
93 | df['market_cap'] = market_cap
94 | df['volume_24h'] = volume_24h
95 | return df
96 |
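# Note: load_data() parses the JSON embedded in CoinMarketCap's __NEXT_DATA__ script
# tag; if the site changes its page structure, the keys parsed above may need updating.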
97 | df = load_data()
98 |
99 | ## Sidebar - Cryptocurrency selections
100 | sorted_coin = sorted( df['coin_symbol'] )
101 | selected_coin = col1.multiselect('Cryptocurrency', sorted_coin, sorted_coin)
102 |
103 | df_selected_coin = df[ (df['coin_symbol'].isin(selected_coin)) ] # Filtering data
104 |
105 | ## Sidebar - Number of coins to display
106 | num_coin = col1.slider('Display Top N Coins', 1, 100, 100)
107 | df_coins = df_selected_coin[:num_coin]
108 |
109 | ## Sidebar - Percent change timeframe
110 | percent_timeframe = col1.selectbox('Percent change time frame',
111 | ['7d','24h', '1h'])
112 | percent_dict = {"7d":'percent_change_7d',"24h":'percent_change_24h',"1h":'percent_change_1h'}
113 | selected_percent_timeframe = percent_dict[percent_timeframe]
114 |
115 | ## Sidebar - Sorting values
116 | sort_values = col1.selectbox('Sort values?', ['Yes', 'No'])
117 |
118 | col2.subheader('Price Data of Selected Cryptocurrency')
119 | col2.write('Data Dimension: ' + str(df_selected_coin.shape[0]) + ' rows and ' + str(df_selected_coin.shape[1]) + ' columns.')
120 |
121 | col2.dataframe(df_coins)
122 |
123 | # Download CSV data
124 | # https://discuss.streamlit.io/t/how-to-download-file-in-streamlit/1806
125 | def filedownload(df):
126 | csv = df.to_csv(index=False)
127 | b64 = base64.b64encode(csv.encode()).decode() # strings <-> bytes conversions
128 | href = f'<a href="data:file/csv;base64,{b64}" download="crypto.csv">Download CSV File</a>'
129 | return href
130 |
131 | col2.markdown(filedownload(df_selected_coin), unsafe_allow_html=True)
132 |
133 | #---------------------------------#
134 | # Preparing data for Bar plot of % Price change
135 | col2.subheader('Table of % Price Change')
136 | df_change = pd.concat([df_coins.coin_symbol, df_coins.percent_change_1h, df_coins.percent_change_24h, df_coins.percent_change_7d], axis=1)
137 | df_change = df_change.set_index('coin_symbol')
138 | df_change['positive_percent_change_1h'] = df_change['percent_change_1h'] > 0
139 | df_change['positive_percent_change_24h'] = df_change['percent_change_24h'] > 0
140 | df_change['positive_percent_change_7d'] = df_change['percent_change_7d'] > 0
141 | col2.dataframe(df_change)
142 |
143 | # Conditional creation of Bar plot (time frame)
144 | col3.subheader('Bar plot of % Price Change')
145 |
146 | if percent_timeframe == '7d':
147 | if sort_values == 'Yes':
148 | df_change = df_change.sort_values(by=['percent_change_7d'])
149 | col3.write('*7 days period*')
150 | plt.figure(figsize=(5,25))
151 | plt.subplots_adjust(top = 1, bottom = 0)
152 | df_change['percent_change_7d'].plot(kind='barh', color=df_change.positive_percent_change_7d.map({True: 'g', False: 'r'}))
153 | col3.pyplot(plt)
154 | elif percent_timeframe == '24h':
155 | if sort_values == 'Yes':
156 | df_change = df_change.sort_values(by=['percent_change_24h'])
157 | col3.write('*24 hour period*')
158 | plt.figure(figsize=(5,25))
159 | plt.subplots_adjust(top = 1, bottom = 0)
160 | df_change['percent_change_24h'].plot(kind='barh', color=df_change.positive_percent_change_24h.map({True: 'g', False: 'r'}))
161 | col3.pyplot(plt)
162 | else:
163 | if sort_values == 'Yes':
164 | df_change = df_change.sort_values(by=['percent_change_1h'])
165 | col3.write('*1 hour period*')
166 | plt.figure(figsize=(5,25))
167 | plt.subplots_adjust(top = 1, bottom = 0)
168 | df_change['percent_change_1h'].plot(kind='barh', color=df_change.positive_percent_change_1h.map({True: 'g', False: 'r'}))
169 | col3.pyplot(plt)
170 |
--------------------------------------------------------------------------------
/streamlit/part12/logo.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dataprofessor/code/d494f8093073990fcf77061bd740a0e4c2d40020/streamlit/part12/logo.jpg
--------------------------------------------------------------------------------
/streamlit/part2/iris-ml-app.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 | import pandas as pd
3 | from sklearn import datasets
4 | from sklearn.ensemble import RandomForestClassifier
5 |
6 | st.write("""
7 | # Simple Iris Flower Prediction App
8 |
9 | This app predicts the **Iris flower** type!
10 | """)
11 |
12 | st.sidebar.header('User Input Parameters')
13 |
14 | def user_input_features():
15 | sepal_length = st.sidebar.slider('Sepal length', 4.3, 7.9, 5.4)
16 | sepal_width = st.sidebar.slider('Sepal width', 2.0, 4.4, 3.4)
17 | petal_length = st.sidebar.slider('Petal length', 1.0, 6.9, 1.3)
18 | petal_width = st.sidebar.slider('Petal width', 0.1, 2.5, 0.2)
19 | data = {'sepal_length': sepal_length,
20 | 'sepal_width': sepal_width,
21 | 'petal_length': petal_length,
22 | 'petal_width': petal_width}
23 | features = pd.DataFrame(data, index=[0])
24 | return features
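# (The slider bounds above are the min/max of each feature in the iris dataset;
# the default values are arbitrary examples.)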
25 |
26 | df = user_input_features()
27 |
28 | st.subheader('User Input parameters')
29 | st.write(df)
30 |
31 | iris = datasets.load_iris()
32 | X = iris.data
33 | Y = iris.target
34 |
35 | clf = RandomForestClassifier()
36 | clf.fit(X, Y)
37 |
38 | prediction = clf.predict(df)
39 | prediction_proba = clf.predict_proba(df)
40 |
41 | st.subheader('Class labels and their corresponding index number')
42 | st.write(iris.target_names)
43 |
44 | st.subheader('Prediction')
45 | st.write(iris.target_names[prediction])
46 | #st.write(prediction)
47 |
48 | st.subheader('Prediction Probability')
49 | st.write(prediction_proba)
50 |
--------------------------------------------------------------------------------
/streamlit/part3/penguins-app.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 | import pandas as pd
3 | import numpy as np
4 | import pickle
5 | from sklearn.ensemble import RandomForestClassifier
6 |
7 | st.write("""
8 | # Penguin Prediction App
9 |
10 | This app predicts the **Palmer Penguin** species!
11 |
12 | Data obtained from the [palmerpenguins library](https://github.com/allisonhorst/palmerpenguins) in R by Allison Horst.
13 | """)
14 |
15 | st.sidebar.header('User Input Features')
16 |
17 | st.sidebar.markdown("""
18 | [Example CSV input file](https://raw.githubusercontent.com/dataprofessor/data/master/penguins_example.csv)
19 | """)
20 |
21 | # Collects user input features into dataframe
22 | uploaded_file = st.sidebar.file_uploader("Upload your input CSV file", type=["csv"])
23 | if uploaded_file is not None:
24 | input_df = pd.read_csv(uploaded_file)
25 | else:
26 | def user_input_features():
27 | island = st.sidebar.selectbox('Island',('Biscoe','Dream','Torgersen'))
28 | sex = st.sidebar.selectbox('Sex',('male','female'))
29 | bill_length_mm = st.sidebar.slider('Bill length (mm)', 32.1,59.6,43.9)
30 | bill_depth_mm = st.sidebar.slider('Bill depth (mm)', 13.1,21.5,17.2)
31 | flipper_length_mm = st.sidebar.slider('Flipper length (mm)', 172.0,231.0,201.0)
32 | body_mass_g = st.sidebar.slider('Body mass (g)', 2700.0,6300.0,4207.0)
33 | data = {'island': island,
34 | 'bill_length_mm': bill_length_mm,
35 | 'bill_depth_mm': bill_depth_mm,
36 | 'flipper_length_mm': flipper_length_mm,
37 | 'body_mass_g': body_mass_g,
38 | 'sex': sex}
39 | features = pd.DataFrame(data, index=[0])
40 | return features
41 | input_df = user_input_features()
42 |
43 | # Combines user input features with the entire penguins dataset;
44 | # this ensures the encoding below produces every category level, not just those in the single user row
45 | penguins_raw = pd.read_csv('penguins_cleaned.csv')
46 | penguins = penguins_raw.drop(columns=['species'])
47 | df = pd.concat([input_df,penguins],axis=0)
48 |
49 | # Encoding of ordinal features
50 | # https://www.kaggle.com/pratik1120/penguin-dataset-eda-classification-and-clustering
51 | encode = ['sex','island']
52 | for col in encode:
53 | dummy = pd.get_dummies(df[col], prefix=col)
54 | df = pd.concat([df,dummy], axis=1)
55 | del df[col]
56 | df = df[:1] # Selects only the first row (the user input data)
57 |
58 | # Displays the user input features
59 | st.subheader('User Input features')
60 |
61 | if uploaded_file is not None:
62 | st.write(df)
63 | else:
64 | st.write('Awaiting CSV file to be uploaded. Currently using example input parameters (shown below).')
65 | st.write(df)
66 |
67 | # Reads in saved classification model
68 | load_clf = pickle.load(open('penguins_clf.pkl', 'rb'))
69 |
70 | # Apply model to make predictions
71 | prediction = load_clf.predict(df)
72 | prediction_proba = load_clf.predict_proba(df)
73 |
74 |
75 | st.subheader('Prediction')
76 | penguins_species = np.array(['Adelie','Chinstrap','Gentoo'])
77 | st.write(penguins_species[prediction])
78 |
79 | st.subheader('Prediction Probability')
80 | st.write(prediction_proba)
81 |
--------------------------------------------------------------------------------
/streamlit/part3/penguins-model-building.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | penguins = pd.read_csv('penguins_cleaned.csv')
3 |
4 | # Ordinal feature encoding
5 | # https://www.kaggle.com/pratik1120/penguin-dataset-eda-classification-and-clustering
6 | df = penguins.copy()
7 | target = 'species'
8 | encode = ['sex','island']
9 |
10 | for col in encode:
11 | dummy = pd.get_dummies(df[col], prefix=col)
12 | df = pd.concat([df,dummy], axis=1)
13 | del df[col]
14 |
15 | target_mapper = {'Adelie':0, 'Chinstrap':1, 'Gentoo':2}
16 | def target_encode(val):
17 | return target_mapper[val]
18 |
19 | df['species'] = df['species'].apply(target_encode)
20 |
21 | # Separating X and y
22 | X = df.drop('species', axis=1)
23 | Y = df['species']
24 |
25 | # Build random forest model
26 | from sklearn.ensemble import RandomForestClassifier
27 | clf = RandomForestClassifier()
28 | clf.fit(X, Y)
29 |
30 | # Saving the model
31 | import pickle
32 | pickle.dump(clf, open('penguins_clf.pkl', 'wb'))
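# Optional sanity check (a sketch, assuming the files above were just written):
# clf2 = pickle.load(open('penguins_clf.pkl', 'rb'))
# print(clf2.predict(X.iloc[[0]]), clf2.predict_proba(X.iloc[[0]]))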
33 |
--------------------------------------------------------------------------------
/streamlit/part3/penguins_cleaned.csv:
--------------------------------------------------------------------------------
1 | "species","island","bill_length_mm","bill_depth_mm","flipper_length_mm","body_mass_g","sex"
2 | "Adelie","Torgersen",39.1,18.7,181,3750,"male"
3 | "Adelie","Torgersen",39.5,17.4,186,3800,"female"
4 | "Adelie","Torgersen",40.3,18,195,3250,"female"
5 | "Adelie","Torgersen",36.7,19.3,193,3450,"female"
6 | "Adelie","Torgersen",39.3,20.6,190,3650,"male"
7 | "Adelie","Torgersen",38.9,17.8,181,3625,"female"
8 | "Adelie","Torgersen",39.2,19.6,195,4675,"male"
9 | "Adelie","Torgersen",41.1,17.6,182,3200,"female"
10 | "Adelie","Torgersen",38.6,21.2,191,3800,"male"
11 | "Adelie","Torgersen",34.6,21.1,198,4400,"male"
12 | "Adelie","Torgersen",36.6,17.8,185,3700,"female"
13 | "Adelie","Torgersen",38.7,19,195,3450,"female"
14 | "Adelie","Torgersen",42.5,20.7,197,4500,"male"
15 | "Adelie","Torgersen",34.4,18.4,184,3325,"female"
16 | "Adelie","Torgersen",46,21.5,194,4200,"male"
17 | "Adelie","Biscoe",37.8,18.3,174,3400,"female"
18 | "Adelie","Biscoe",37.7,18.7,180,3600,"male"
19 | "Adelie","Biscoe",35.9,19.2,189,3800,"female"
20 | "Adelie","Biscoe",38.2,18.1,185,3950,"male"
21 | "Adelie","Biscoe",38.8,17.2,180,3800,"male"
22 | "Adelie","Biscoe",35.3,18.9,187,3800,"female"
23 | "Adelie","Biscoe",40.6,18.6,183,3550,"male"
24 | "Adelie","Biscoe",40.5,17.9,187,3200,"female"
25 | "Adelie","Biscoe",37.9,18.6,172,3150,"female"
26 | "Adelie","Biscoe",40.5,18.9,180,3950,"male"
27 | "Adelie","Dream",39.5,16.7,178,3250,"female"
28 | "Adelie","Dream",37.2,18.1,178,3900,"male"
29 | "Adelie","Dream",39.5,17.8,188,3300,"female"
30 | "Adelie","Dream",40.9,18.9,184,3900,"male"
31 | "Adelie","Dream",36.4,17,195,3325,"female"
32 | "Adelie","Dream",39.2,21.1,196,4150,"male"
33 | "Adelie","Dream",38.8,20,190,3950,"male"
34 | "Adelie","Dream",42.2,18.5,180,3550,"female"
35 | "Adelie","Dream",37.6,19.3,181,3300,"female"
36 | "Adelie","Dream",39.8,19.1,184,4650,"male"
37 | "Adelie","Dream",36.5,18,182,3150,"female"
38 | "Adelie","Dream",40.8,18.4,195,3900,"male"
39 | "Adelie","Dream",36,18.5,186,3100,"female"
40 | "Adelie","Dream",44.1,19.7,196,4400,"male"
41 | "Adelie","Dream",37,16.9,185,3000,"female"
42 | "Adelie","Dream",39.6,18.8,190,4600,"male"
43 | "Adelie","Dream",41.1,19,182,3425,"male"
44 | "Adelie","Dream",36,17.9,190,3450,"female"
45 | "Adelie","Dream",42.3,21.2,191,4150,"male"
46 | "Adelie","Biscoe",39.6,17.7,186,3500,"female"
47 | "Adelie","Biscoe",40.1,18.9,188,4300,"male"
48 | "Adelie","Biscoe",35,17.9,190,3450,"female"
49 | "Adelie","Biscoe",42,19.5,200,4050,"male"
50 | "Adelie","Biscoe",34.5,18.1,187,2900,"female"
51 | "Adelie","Biscoe",41.4,18.6,191,3700,"male"
52 | "Adelie","Biscoe",39,17.5,186,3550,"female"
53 | "Adelie","Biscoe",40.6,18.8,193,3800,"male"
54 | "Adelie","Biscoe",36.5,16.6,181,2850,"female"
55 | "Adelie","Biscoe",37.6,19.1,194,3750,"male"
56 | "Adelie","Biscoe",35.7,16.9,185,3150,"female"
57 | "Adelie","Biscoe",41.3,21.1,195,4400,"male"
58 | "Adelie","Biscoe",37.6,17,185,3600,"female"
59 | "Adelie","Biscoe",41.1,18.2,192,4050,"male"
60 | "Adelie","Biscoe",36.4,17.1,184,2850,"female"
61 | "Adelie","Biscoe",41.6,18,192,3950,"male"
62 | "Adelie","Biscoe",35.5,16.2,195,3350,"female"
63 | "Adelie","Biscoe",41.1,19.1,188,4100,"male"
64 | "Adelie","Torgersen",35.9,16.6,190,3050,"female"
65 | "Adelie","Torgersen",41.8,19.4,198,4450,"male"
66 | "Adelie","Torgersen",33.5,19,190,3600,"female"
67 | "Adelie","Torgersen",39.7,18.4,190,3900,"male"
68 | "Adelie","Torgersen",39.6,17.2,196,3550,"female"
69 | "Adelie","Torgersen",45.8,18.9,197,4150,"male"
70 | "Adelie","Torgersen",35.5,17.5,190,3700,"female"
71 | "Adelie","Torgersen",42.8,18.5,195,4250,"male"
72 | "Adelie","Torgersen",40.9,16.8,191,3700,"female"
73 | "Adelie","Torgersen",37.2,19.4,184,3900,"male"
74 | "Adelie","Torgersen",36.2,16.1,187,3550,"female"
75 | "Adelie","Torgersen",42.1,19.1,195,4000,"male"
76 | "Adelie","Torgersen",34.6,17.2,189,3200,"female"
77 | "Adelie","Torgersen",42.9,17.6,196,4700,"male"
78 | "Adelie","Torgersen",36.7,18.8,187,3800,"female"
79 | "Adelie","Torgersen",35.1,19.4,193,4200,"male"
80 | "Adelie","Dream",37.3,17.8,191,3350,"female"
81 | "Adelie","Dream",41.3,20.3,194,3550,"male"
82 | "Adelie","Dream",36.3,19.5,190,3800,"male"
83 | "Adelie","Dream",36.9,18.6,189,3500,"female"
84 | "Adelie","Dream",38.3,19.2,189,3950,"male"
85 | "Adelie","Dream",38.9,18.8,190,3600,"female"
86 | "Adelie","Dream",35.7,18,202,3550,"female"
87 | "Adelie","Dream",41.1,18.1,205,4300,"male"
88 | "Adelie","Dream",34,17.1,185,3400,"female"
89 | "Adelie","Dream",39.6,18.1,186,4450,"male"
90 | "Adelie","Dream",36.2,17.3,187,3300,"female"
91 | "Adelie","Dream",40.8,18.9,208,4300,"male"
92 | "Adelie","Dream",38.1,18.6,190,3700,"female"
93 | "Adelie","Dream",40.3,18.5,196,4350,"male"
94 | "Adelie","Dream",33.1,16.1,178,2900,"female"
95 | "Adelie","Dream",43.2,18.5,192,4100,"male"
96 | "Adelie","Biscoe",35,17.9,192,3725,"female"
97 | "Adelie","Biscoe",41,20,203,4725,"male"
98 | "Adelie","Biscoe",37.7,16,183,3075,"female"
99 | "Adelie","Biscoe",37.8,20,190,4250,"male"
100 | "Adelie","Biscoe",37.9,18.6,193,2925,"female"
101 | "Adelie","Biscoe",39.7,18.9,184,3550,"male"
102 | "Adelie","Biscoe",38.6,17.2,199,3750,"female"
103 | "Adelie","Biscoe",38.2,20,190,3900,"male"
104 | "Adelie","Biscoe",38.1,17,181,3175,"female"
105 | "Adelie","Biscoe",43.2,19,197,4775,"male"
106 | "Adelie","Biscoe",38.1,16.5,198,3825,"female"
107 | "Adelie","Biscoe",45.6,20.3,191,4600,"male"
108 | "Adelie","Biscoe",39.7,17.7,193,3200,"female"
109 | "Adelie","Biscoe",42.2,19.5,197,4275,"male"
110 | "Adelie","Biscoe",39.6,20.7,191,3900,"female"
111 | "Adelie","Biscoe",42.7,18.3,196,4075,"male"
112 | "Adelie","Torgersen",38.6,17,188,2900,"female"
113 | "Adelie","Torgersen",37.3,20.5,199,3775,"male"
114 | "Adelie","Torgersen",35.7,17,189,3350,"female"
115 | "Adelie","Torgersen",41.1,18.6,189,3325,"male"
116 | "Adelie","Torgersen",36.2,17.2,187,3150,"female"
117 | "Adelie","Torgersen",37.7,19.8,198,3500,"male"
118 | "Adelie","Torgersen",40.2,17,176,3450,"female"
119 | "Adelie","Torgersen",41.4,18.5,202,3875,"male"
120 | "Adelie","Torgersen",35.2,15.9,186,3050,"female"
121 | "Adelie","Torgersen",40.6,19,199,4000,"male"
122 | "Adelie","Torgersen",38.8,17.6,191,3275,"female"
123 | "Adelie","Torgersen",41.5,18.3,195,4300,"male"
124 | "Adelie","Torgersen",39,17.1,191,3050,"female"
125 | "Adelie","Torgersen",44.1,18,210,4000,"male"
126 | "Adelie","Torgersen",38.5,17.9,190,3325,"female"
127 | "Adelie","Torgersen",43.1,19.2,197,3500,"male"
128 | "Adelie","Dream",36.8,18.5,193,3500,"female"
129 | "Adelie","Dream",37.5,18.5,199,4475,"male"
130 | "Adelie","Dream",38.1,17.6,187,3425,"female"
131 | "Adelie","Dream",41.1,17.5,190,3900,"male"
132 | "Adelie","Dream",35.6,17.5,191,3175,"female"
133 | "Adelie","Dream",40.2,20.1,200,3975,"male"
134 | "Adelie","Dream",37,16.5,185,3400,"female"
135 | "Adelie","Dream",39.7,17.9,193,4250,"male"
136 | "Adelie","Dream",40.2,17.1,193,3400,"female"
137 | "Adelie","Dream",40.6,17.2,187,3475,"male"
138 | "Adelie","Dream",32.1,15.5,188,3050,"female"
139 | "Adelie","Dream",40.7,17,190,3725,"male"
140 | "Adelie","Dream",37.3,16.8,192,3000,"female"
141 | "Adelie","Dream",39,18.7,185,3650,"male"
142 | "Adelie","Dream",39.2,18.6,190,4250,"male"
143 | "Adelie","Dream",36.6,18.4,184,3475,"female"
144 | "Adelie","Dream",36,17.8,195,3450,"female"
145 | "Adelie","Dream",37.8,18.1,193,3750,"male"
146 | "Adelie","Dream",36,17.1,187,3700,"female"
147 | "Adelie","Dream",41.5,18.5,201,4000,"male"
148 | "Gentoo","Biscoe",46.1,13.2,211,4500,"female"
149 | "Gentoo","Biscoe",50,16.3,230,5700,"male"
150 | "Gentoo","Biscoe",48.7,14.1,210,4450,"female"
151 | "Gentoo","Biscoe",50,15.2,218,5700,"male"
152 | "Gentoo","Biscoe",47.6,14.5,215,5400,"male"
153 | "Gentoo","Biscoe",46.5,13.5,210,4550,"female"
154 | "Gentoo","Biscoe",45.4,14.6,211,4800,"female"
155 | "Gentoo","Biscoe",46.7,15.3,219,5200,"male"
156 | "Gentoo","Biscoe",43.3,13.4,209,4400,"female"
157 | "Gentoo","Biscoe",46.8,15.4,215,5150,"male"
158 | "Gentoo","Biscoe",40.9,13.7,214,4650,"female"
159 | "Gentoo","Biscoe",49,16.1,216,5550,"male"
160 | "Gentoo","Biscoe",45.5,13.7,214,4650,"female"
161 | "Gentoo","Biscoe",48.4,14.6,213,5850,"male"
162 | "Gentoo","Biscoe",45.8,14.6,210,4200,"female"
163 | "Gentoo","Biscoe",49.3,15.7,217,5850,"male"
164 | "Gentoo","Biscoe",42,13.5,210,4150,"female"
165 | "Gentoo","Biscoe",49.2,15.2,221,6300,"male"
166 | "Gentoo","Biscoe",46.2,14.5,209,4800,"female"
167 | "Gentoo","Biscoe",48.7,15.1,222,5350,"male"
168 | "Gentoo","Biscoe",50.2,14.3,218,5700,"male"
169 | "Gentoo","Biscoe",45.1,14.5,215,5000,"female"
170 | "Gentoo","Biscoe",46.5,14.5,213,4400,"female"
171 | "Gentoo","Biscoe",46.3,15.8,215,5050,"male"
172 | "Gentoo","Biscoe",42.9,13.1,215,5000,"female"
173 | "Gentoo","Biscoe",46.1,15.1,215,5100,"male"
174 | "Gentoo","Biscoe",47.8,15,215,5650,"male"
175 | "Gentoo","Biscoe",48.2,14.3,210,4600,"female"
176 | "Gentoo","Biscoe",50,15.3,220,5550,"male"
177 | "Gentoo","Biscoe",47.3,15.3,222,5250,"male"
178 | "Gentoo","Biscoe",42.8,14.2,209,4700,"female"
179 | "Gentoo","Biscoe",45.1,14.5,207,5050,"female"
180 | "Gentoo","Biscoe",59.6,17,230,6050,"male"
181 | "Gentoo","Biscoe",49.1,14.8,220,5150,"female"
182 | "Gentoo","Biscoe",48.4,16.3,220,5400,"male"
183 | "Gentoo","Biscoe",42.6,13.7,213,4950,"female"
184 | "Gentoo","Biscoe",44.4,17.3,219,5250,"male"
185 | "Gentoo","Biscoe",44,13.6,208,4350,"female"
186 | "Gentoo","Biscoe",48.7,15.7,208,5350,"male"
187 | "Gentoo","Biscoe",42.7,13.7,208,3950,"female"
188 | "Gentoo","Biscoe",49.6,16,225,5700,"male"
189 | "Gentoo","Biscoe",45.3,13.7,210,4300,"female"
190 | "Gentoo","Biscoe",49.6,15,216,4750,"male"
191 | "Gentoo","Biscoe",50.5,15.9,222,5550,"male"
192 | "Gentoo","Biscoe",43.6,13.9,217,4900,"female"
193 | "Gentoo","Biscoe",45.5,13.9,210,4200,"female"
194 | "Gentoo","Biscoe",50.5,15.9,225,5400,"male"
195 | "Gentoo","Biscoe",44.9,13.3,213,5100,"female"
196 | "Gentoo","Biscoe",45.2,15.8,215,5300,"male"
197 | "Gentoo","Biscoe",46.6,14.2,210,4850,"female"
198 | "Gentoo","Biscoe",48.5,14.1,220,5300,"male"
199 | "Gentoo","Biscoe",45.1,14.4,210,4400,"female"
200 | "Gentoo","Biscoe",50.1,15,225,5000,"male"
201 | "Gentoo","Biscoe",46.5,14.4,217,4900,"female"
202 | "Gentoo","Biscoe",45,15.4,220,5050,"male"
203 | "Gentoo","Biscoe",43.8,13.9,208,4300,"female"
204 | "Gentoo","Biscoe",45.5,15,220,5000,"male"
205 | "Gentoo","Biscoe",43.2,14.5,208,4450,"female"
206 | "Gentoo","Biscoe",50.4,15.3,224,5550,"male"
207 | "Gentoo","Biscoe",45.3,13.8,208,4200,"female"
208 | "Gentoo","Biscoe",46.2,14.9,221,5300,"male"
209 | "Gentoo","Biscoe",45.7,13.9,214,4400,"female"
210 | "Gentoo","Biscoe",54.3,15.7,231,5650,"male"
211 | "Gentoo","Biscoe",45.8,14.2,219,4700,"female"
212 | "Gentoo","Biscoe",49.8,16.8,230,5700,"male"
213 | "Gentoo","Biscoe",49.5,16.2,229,5800,"male"
214 | "Gentoo","Biscoe",43.5,14.2,220,4700,"female"
215 | "Gentoo","Biscoe",50.7,15,223,5550,"male"
216 | "Gentoo","Biscoe",47.7,15,216,4750,"female"
217 | "Gentoo","Biscoe",46.4,15.6,221,5000,"male"
218 | "Gentoo","Biscoe",48.2,15.6,221,5100,"male"
219 | "Gentoo","Biscoe",46.5,14.8,217,5200,"female"
220 | "Gentoo","Biscoe",46.4,15,216,4700,"female"
221 | "Gentoo","Biscoe",48.6,16,230,5800,"male"
222 | "Gentoo","Biscoe",47.5,14.2,209,4600,"female"
223 | "Gentoo","Biscoe",51.1,16.3,220,6000,"male"
224 | "Gentoo","Biscoe",45.2,13.8,215,4750,"female"
225 | "Gentoo","Biscoe",45.2,16.4,223,5950,"male"
226 | "Gentoo","Biscoe",49.1,14.5,212,4625,"female"
227 | "Gentoo","Biscoe",52.5,15.6,221,5450,"male"
228 | "Gentoo","Biscoe",47.4,14.6,212,4725,"female"
229 | "Gentoo","Biscoe",50,15.9,224,5350,"male"
230 | "Gentoo","Biscoe",44.9,13.8,212,4750,"female"
231 | "Gentoo","Biscoe",50.8,17.3,228,5600,"male"
232 | "Gentoo","Biscoe",43.4,14.4,218,4600,"female"
233 | "Gentoo","Biscoe",51.3,14.2,218,5300,"male"
234 | "Gentoo","Biscoe",47.5,14,212,4875,"female"
235 | "Gentoo","Biscoe",52.1,17,230,5550,"male"
236 | "Gentoo","Biscoe",47.5,15,218,4950,"female"
237 | "Gentoo","Biscoe",52.2,17.1,228,5400,"male"
238 | "Gentoo","Biscoe",45.5,14.5,212,4750,"female"
239 | "Gentoo","Biscoe",49.5,16.1,224,5650,"male"
240 | "Gentoo","Biscoe",44.5,14.7,214,4850,"female"
241 | "Gentoo","Biscoe",50.8,15.7,226,5200,"male"
242 | "Gentoo","Biscoe",49.4,15.8,216,4925,"male"
243 | "Gentoo","Biscoe",46.9,14.6,222,4875,"female"
244 | "Gentoo","Biscoe",48.4,14.4,203,4625,"female"
245 | "Gentoo","Biscoe",51.1,16.5,225,5250,"male"
246 | "Gentoo","Biscoe",48.5,15,219,4850,"female"
247 | "Gentoo","Biscoe",55.9,17,228,5600,"male"
248 | "Gentoo","Biscoe",47.2,15.5,215,4975,"female"
249 | "Gentoo","Biscoe",49.1,15,228,5500,"male"
250 | "Gentoo","Biscoe",46.8,16.1,215,5500,"male"
251 | "Gentoo","Biscoe",41.7,14.7,210,4700,"female"
252 | "Gentoo","Biscoe",53.4,15.8,219,5500,"male"
253 | "Gentoo","Biscoe",43.3,14,208,4575,"female"
254 | "Gentoo","Biscoe",48.1,15.1,209,5500,"male"
255 | "Gentoo","Biscoe",50.5,15.2,216,5000,"female"
256 | "Gentoo","Biscoe",49.8,15.9,229,5950,"male"
257 | "Gentoo","Biscoe",43.5,15.2,213,4650,"female"
258 | "Gentoo","Biscoe",51.5,16.3,230,5500,"male"
259 | "Gentoo","Biscoe",46.2,14.1,217,4375,"female"
260 | "Gentoo","Biscoe",55.1,16,230,5850,"male"
261 | "Gentoo","Biscoe",48.8,16.2,222,6000,"male"
262 | "Gentoo","Biscoe",47.2,13.7,214,4925,"female"
263 | "Gentoo","Biscoe",46.8,14.3,215,4850,"female"
264 | "Gentoo","Biscoe",50.4,15.7,222,5750,"male"
265 | "Gentoo","Biscoe",45.2,14.8,212,5200,"female"
266 | "Gentoo","Biscoe",49.9,16.1,213,5400,"male"
267 | "Chinstrap","Dream",46.5,17.9,192,3500,"female"
268 | "Chinstrap","Dream",50,19.5,196,3900,"male"
269 | "Chinstrap","Dream",51.3,19.2,193,3650,"male"
270 | "Chinstrap","Dream",45.4,18.7,188,3525,"female"
271 | "Chinstrap","Dream",52.7,19.8,197,3725,"male"
272 | "Chinstrap","Dream",45.2,17.8,198,3950,"female"
273 | "Chinstrap","Dream",46.1,18.2,178,3250,"female"
274 | "Chinstrap","Dream",51.3,18.2,197,3750,"male"
275 | "Chinstrap","Dream",46,18.9,195,4150,"female"
276 | "Chinstrap","Dream",51.3,19.9,198,3700,"male"
277 | "Chinstrap","Dream",46.6,17.8,193,3800,"female"
278 | "Chinstrap","Dream",51.7,20.3,194,3775,"male"
279 | "Chinstrap","Dream",47,17.3,185,3700,"female"
280 | "Chinstrap","Dream",52,18.1,201,4050,"male"
281 | "Chinstrap","Dream",45.9,17.1,190,3575,"female"
282 | "Chinstrap","Dream",50.5,19.6,201,4050,"male"
283 | "Chinstrap","Dream",50.3,20,197,3300,"male"
284 | "Chinstrap","Dream",58,17.8,181,3700,"female"
285 | "Chinstrap","Dream",46.4,18.6,190,3450,"female"
286 | "Chinstrap","Dream",49.2,18.2,195,4400,"male"
287 | "Chinstrap","Dream",42.4,17.3,181,3600,"female"
288 | "Chinstrap","Dream",48.5,17.5,191,3400,"male"
289 | "Chinstrap","Dream",43.2,16.6,187,2900,"female"
290 | "Chinstrap","Dream",50.6,19.4,193,3800,"male"
291 | "Chinstrap","Dream",46.7,17.9,195,3300,"female"
292 | "Chinstrap","Dream",52,19,197,4150,"male"
293 | "Chinstrap","Dream",50.5,18.4,200,3400,"female"
294 | "Chinstrap","Dream",49.5,19,200,3800,"male"
295 | "Chinstrap","Dream",46.4,17.8,191,3700,"female"
296 | "Chinstrap","Dream",52.8,20,205,4550,"male"
297 | "Chinstrap","Dream",40.9,16.6,187,3200,"female"
298 | "Chinstrap","Dream",54.2,20.8,201,4300,"male"
299 | "Chinstrap","Dream",42.5,16.7,187,3350,"female"
300 | "Chinstrap","Dream",51,18.8,203,4100,"male"
301 | "Chinstrap","Dream",49.7,18.6,195,3600,"male"
302 | "Chinstrap","Dream",47.5,16.8,199,3900,"female"
303 | "Chinstrap","Dream",47.6,18.3,195,3850,"female"
304 | "Chinstrap","Dream",52,20.7,210,4800,"male"
305 | "Chinstrap","Dream",46.9,16.6,192,2700,"female"
306 | "Chinstrap","Dream",53.5,19.9,205,4500,"male"
307 | "Chinstrap","Dream",49,19.5,210,3950,"male"
308 | "Chinstrap","Dream",46.2,17.5,187,3650,"female"
309 | "Chinstrap","Dream",50.9,19.1,196,3550,"male"
310 | "Chinstrap","Dream",45.5,17,196,3500,"female"
311 | "Chinstrap","Dream",50.9,17.9,196,3675,"female"
312 | "Chinstrap","Dream",50.8,18.5,201,4450,"male"
313 | "Chinstrap","Dream",50.1,17.9,190,3400,"female"
314 | "Chinstrap","Dream",49,19.6,212,4300,"male"
315 | "Chinstrap","Dream",51.5,18.7,187,3250,"male"
316 | "Chinstrap","Dream",49.8,17.3,198,3675,"female"
317 | "Chinstrap","Dream",48.1,16.4,199,3325,"female"
318 | "Chinstrap","Dream",51.4,19,201,3950,"male"
319 | "Chinstrap","Dream",45.7,17.3,193,3600,"female"
320 | "Chinstrap","Dream",50.7,19.7,203,4050,"male"
321 | "Chinstrap","Dream",42.5,17.3,187,3350,"female"
322 | "Chinstrap","Dream",52.2,18.8,197,3450,"male"
323 | "Chinstrap","Dream",45.2,16.6,191,3250,"female"
324 | "Chinstrap","Dream",49.3,19.9,203,4050,"male"
325 | "Chinstrap","Dream",50.2,18.8,202,3800,"male"
326 | "Chinstrap","Dream",45.6,19.4,194,3525,"female"
327 | "Chinstrap","Dream",51.9,19.5,206,3950,"male"
328 | "Chinstrap","Dream",46.8,16.5,189,3650,"female"
329 | "Chinstrap","Dream",45.7,17,195,3650,"female"
330 | "Chinstrap","Dream",55.8,19.8,207,4000,"male"
331 | "Chinstrap","Dream",43.5,18.1,202,3400,"female"
332 | "Chinstrap","Dream",49.6,18.2,193,3775,"male"
333 | "Chinstrap","Dream",50.8,19,210,4100,"male"
334 | "Chinstrap","Dream",50.2,18.7,198,3775,"female"
335 |
--------------------------------------------------------------------------------
/streamlit/part3/penguins_clf.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dataprofessor/code/d494f8093073990fcf77061bd740a0e4c2d40020/streamlit/part3/penguins_clf.pkl
--------------------------------------------------------------------------------
/streamlit/part3/penguins_example.csv:
--------------------------------------------------------------------------------
1 | island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
2 | Biscoe,43.9,17.2,201.0,4207.0,male
3 |
--------------------------------------------------------------------------------
/streamlit/part5/basketball_app.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 | import pandas as pd
3 | import base64
4 | import matplotlib.pyplot as plt
5 | import seaborn as sns
6 | import numpy as np
7 |
8 | st.title('NBA Player Stats Explorer')
9 |
10 | st.markdown("""
11 | This app performs simple web scraping of NBA player stats data!
12 | * **Python libraries:** base64, pandas, streamlit, numpy, matplotlib, seaborn
13 | * **Data source:** [Basketball-reference.com](https://www.basketball-reference.com/).
14 | """)
15 |
16 | st.sidebar.header('User Input Features')
17 | selected_year = st.sidebar.selectbox('Year', list(reversed(range(1950,2020))))
18 |
19 | # Web scraping of NBA player stats
20 | @st.cache
21 | def load_data(year):
22 | url = "https://www.basketball-reference.com/leagues/NBA_" + str(year) + "_per_game.html"
23 | html = pd.read_html(url, header = 0)
24 | df = html[0]
25 | raw = df.drop(df[df.Age == 'Age'].index) # Deletes repeating headers in content
26 | raw = raw.fillna(0)
27 | playerstats = raw.drop(['Rk'], axis=1)
28 | return playerstats
29 | playerstats = load_data(selected_year)
30 |
31 | # Sidebar - Team selection
32 | sorted_unique_team = sorted(playerstats.Tm.unique())
33 | selected_team = st.sidebar.multiselect('Team', sorted_unique_team, sorted_unique_team)
34 |
35 | # Sidebar - Position selection
36 | unique_pos = ['C','PF','SF','PG','SG']
37 | selected_pos = st.sidebar.multiselect('Position', unique_pos, unique_pos)
38 |
39 | # Filtering data
40 | df_selected_team = playerstats[(playerstats.Tm.isin(selected_team)) & (playerstats.Pos.isin(selected_pos))]
41 |
42 | st.header('Display Player Stats of Selected Team(s)')
43 | st.write('Data Dimension: ' + str(df_selected_team.shape[0]) + ' rows and ' + str(df_selected_team.shape[1]) + ' columns.')
44 | st.dataframe(df_selected_team)
45 |
46 | # Download NBA player stats data
47 | # https://discuss.streamlit.io/t/how-to-download-file-in-streamlit/1806
48 | def filedownload(df):
49 | csv = df.to_csv(index=False)
50 | b64 = base64.b64encode(csv.encode()).decode() # strings <-> bytes conversions
51 | href = f'<a href="data:file/csv;base64,{b64}" download="playerstats.csv">Download CSV File</a>'
52 | return href
53 |
54 | st.markdown(filedownload(df_selected_team), unsafe_allow_html=True)
55 |
56 | # Heatmap
57 | if st.button('Intercorrelation Heatmap'):
58 | st.header('Intercorrelation Matrix Heatmap')
59 | df_selected_team.to_csv('output.csv',index=False)
60 | df = pd.read_csv('output.csv')
61 |
62 | corr = df.corr()
63 | mask = np.zeros_like(corr)
64 | mask[np.triu_indices_from(mask)] = True  # mask the upper triangle so each correlation appears only once
65 | with sns.axes_style("white"):
66 | f, ax = plt.subplots(figsize=(7, 5))
67 | ax = sns.heatmap(corr, mask=mask, vmax=1, square=True)
68 | st.pyplot()
69 |
--------------------------------------------------------------------------------
/streamlit/part6/boston-house-ml-app.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 | import pandas as pd
3 | import shap
4 | import matplotlib.pyplot as plt
5 | from sklearn import datasets
6 | from sklearn.ensemble import RandomForestRegressor
7 |
8 | st.write("""
9 | # Boston House Price Prediction App
10 |
11 | This app predicts the **Boston House Price**!
12 | """)
13 | st.write('---')
14 |
15 | # Loads the Boston House Price Dataset
16 | boston = datasets.load_boston()
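# (Note: load_boston() was removed in scikit-learn 1.2, so this script assumes an older
# scikit-learn; fetch_california_housing() is the library's suggested replacement.)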
17 | X = pd.DataFrame(boston.data, columns=boston.feature_names)
18 | Y = pd.DataFrame(boston.target, columns=["MEDV"])
19 |
20 | # Sidebar
21 | # Header of Specify Input Parameters
22 | st.sidebar.header('Specify Input Parameters')
23 |
24 | def user_input_features():
25 | CRIM = st.sidebar.slider('CRIM', X.CRIM.min(), X.CRIM.max(), X.CRIM.mean())
26 | ZN = st.sidebar.slider('ZN', X.ZN.min(), X.ZN.max(), X.ZN.mean())
27 | INDUS = st.sidebar.slider('INDUS', X.INDUS.min(), X.INDUS.max(), X.INDUS.mean())
28 | CHAS = st.sidebar.slider('CHAS', X.CHAS.min(), X.CHAS.max(), X.CHAS.mean())
29 | NOX = st.sidebar.slider('NOX', X.NOX.min(), X.NOX.max(), X.NOX.mean())
30 | RM = st.sidebar.slider('RM', X.RM.min(), X.RM.max(), X.RM.mean())
31 | AGE = st.sidebar.slider('AGE', X.AGE.min(), X.AGE.max(), X.AGE.mean())
32 | DIS = st.sidebar.slider('DIS', X.DIS.min(), X.DIS.max(), X.DIS.mean())
33 | RAD = st.sidebar.slider('RAD', X.RAD.min(), X.RAD.max(), X.RAD.mean())
34 | TAX = st.sidebar.slider('TAX', X.TAX.min(), X.TAX.max(), X.TAX.mean())
35 | PTRATIO = st.sidebar.slider('PTRATIO', X.PTRATIO.min(), X.PTRATIO.max(), X.PTRATIO.mean())
36 | B = st.sidebar.slider('B', X.B.min(), X.B.max(), X.B.mean())
37 | LSTAT = st.sidebar.slider('LSTAT', X.LSTAT.min(), X.LSTAT.max(), X.LSTAT.mean())
38 | data = {'CRIM': CRIM,
39 | 'ZN': ZN,
40 | 'INDUS': INDUS,
41 | 'CHAS': CHAS,
42 | 'NOX': NOX,
43 | 'RM': RM,
44 | 'AGE': AGE,
45 | 'DIS': DIS,
46 | 'RAD': RAD,
47 | 'TAX': TAX,
48 | 'PTRATIO': PTRATIO,
49 | 'B': B,
50 | 'LSTAT': LSTAT}
51 | features = pd.DataFrame(data, index=[0])
52 | return features
53 |
54 | df = user_input_features()
55 |
56 | # Main Panel
57 |
58 | # Print specified input parameters
59 | st.header('Specified Input parameters')
60 | st.write(df)
61 | st.write('---')
62 |
63 | # Build Regression Model
64 | model = RandomForestRegressor()
65 | model.fit(X, Y)
66 | # Apply Model to Make Prediction
67 | prediction = model.predict(df)
68 |
69 | st.header('Prediction of MEDV')
70 | st.write(prediction)
71 | st.write('---')
72 |
73 | # Explaining the model's predictions using SHAP values
74 | # https://github.com/slundberg/shap
75 | explainer = shap.TreeExplainer(model)
76 | shap_values = explainer.shap_values(X)
77 |
78 | st.header('Feature Importance')
79 | plt.title('Feature importance based on SHAP values')
80 | shap.summary_plot(shap_values, X)
81 | st.pyplot(bbox_inches='tight')
82 | st.write('---')
83 |
84 | plt.title('Feature importance based on SHAP values (Bar)')
85 | shap.summary_plot(shap_values, X, plot_type="bar")
86 | st.pyplot(bbox_inches='tight')
87 |
--------------------------------------------------------------------------------
/streamlit/part7/solubility-app.py:
--------------------------------------------------------------------------------
1 | ######################
2 | # Import libraries
3 | ######################
4 | import numpy as np
5 | import pandas as pd
6 | import streamlit as st
7 | import pickle
8 | from PIL import Image
9 | from rdkit import Chem
10 | from rdkit.Chem import Descriptors
11 |
12 | ######################
13 | # Custom function
14 | ######################
15 | ## Calculate molecular descriptors
16 | def AromaticProportion(m):
17 | aromatic_atoms = [m.GetAtomWithIdx(i).GetIsAromatic() for i in range(m.GetNumAtoms())]
18 | aa_count = []
19 | for i in aromatic_atoms:
20 | if i:
21 | aa_count.append(1)
22 | AromaticAtom = sum(aa_count)
23 | HeavyAtom = Descriptors.HeavyAtomCount(m)
24 | AR = AromaticAtom/HeavyAtom
25 | return AR
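# e.g. for benzene, Chem.MolFromSmiles('c1ccccc1'): all 6 heavy atoms are aromatic,
# so AromaticProportion returns 6/6 = 1.0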
26 |
27 | def generate(smiles, verbose=False):
28 |
29 | moldata= []
30 | for elem in smiles:
31 | mol=Chem.MolFromSmiles(elem)
32 | moldata.append(mol)
33 |
34 | baseData = np.arange(1,1)   # empty placeholder; overwritten by the first descriptor row
35 | i = 0
36 | for mol in moldata:
37 |
38 | desc_MolLogP = Descriptors.MolLogP(mol)
39 | desc_MolWt = Descriptors.MolWt(mol)
40 | desc_NumRotatableBonds = Descriptors.NumRotatableBonds(mol)
41 | desc_AromaticProportion = AromaticProportion(mol)
42 |
43 | row = np.array([desc_MolLogP,
44 | desc_MolWt,
45 | desc_NumRotatableBonds,
46 | desc_AromaticProportion])
47 |
48 | if(i==0):
49 | baseData=row
50 | else:
51 | baseData=np.vstack([baseData, row])
52 | i=i+1
53 |
54 | columnNames=["MolLogP","MolWt","NumRotatableBonds","AromaticProportion"]
55 | descriptors = pd.DataFrame(data=baseData,columns=columnNames)
56 |
57 | return descriptors
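# e.g. generate(['CCO']) returns a one-row DataFrame of the four descriptors for ethanol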
58 |
59 | ######################
60 | # Page Title
61 | ######################
62 |
63 | image = Image.open('solubility-logo.jpg')
64 |
65 | st.image(image, use_column_width=True)
66 |
67 | st.write("""
68 | # Molecular Solubility Prediction Web App
69 |
70 | This app predicts the **Solubility (LogS)** values of molecules!
71 |
72 | Data obtained from John S. Delaney, [ESOL: Estimating Aqueous Solubility Directly from Molecular Structure](https://pubs.acs.org/doi/10.1021/ci034243x). ***J. Chem. Inf. Comput. Sci.*** 2004, 44, 3, 1000-1005.
73 | ***
74 | """)
75 |
76 |
77 | ######################
78 | # Input molecules (Side Panel)
79 | ######################
80 |
81 | st.sidebar.header('User Input Features')
82 |
83 | ## Read SMILES input
84 | SMILES_input = "NCCCC\nCCC\nCN"
85 |
86 | SMILES = st.sidebar.text_area("SMILES input", SMILES_input)
87 | SMILES = "C\n" + SMILES #Adds C as a dummy, first item
88 | SMILES = SMILES.split('\n')
89 |
90 | st.header('Input SMILES')
91 | SMILES[1:] # Skips the dummy first item
92 |
93 | ## Calculate molecular descriptors
94 | st.header('Computed molecular descriptors')
95 | X = generate(SMILES)
96 | X[1:] # Skips the dummy first item
97 |
98 | ######################
99 | # Pre-built model
100 | ######################
101 |
102 | # Reads in saved model
103 | load_model = pickle.load(open('solubility_model.pkl', 'rb'))
104 |
105 | # Apply model to make predictions
106 | prediction = load_model.predict(X)
107 | #prediction_proba = load_model.predict_proba(X)
108 |
109 | st.header('Predicted LogS values')
110 | prediction[1:] # Skips the dummy first item
111 |
--------------------------------------------------------------------------------
/streamlit/part7/solubility-logo.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dataprofessor/code/d494f8093073990fcf77061bd740a0e4c2d40020/streamlit/part7/solubility-logo.jpg
--------------------------------------------------------------------------------
/streamlit/part7/solubility_model.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dataprofessor/code/d494f8093073990fcf77061bd740a0e4c2d40020/streamlit/part7/solubility_model.pkl
--------------------------------------------------------------------------------
/streamlit/part8/dna-app.py:
--------------------------------------------------------------------------------
1 | ######################
2 | # Import libraries
3 | ######################
4 |
5 | import pandas as pd
6 | import streamlit as st
7 | import altair as alt
8 | from PIL import Image
9 |
10 | ######################
11 | # Page Title
12 | ######################
13 |
14 | image = Image.open('dna-logo.jpg')
15 |
16 | st.image(image, use_column_width=True)
17 |
18 | st.write("""
19 | # DNA Nucleotide Count Web App
20 |
21 | This app counts the nucleotide composition of query DNA!
22 |
23 | ***
24 | """)
25 |
26 |
27 | ######################
28 | # Input Text Box
29 | ######################
30 |
31 | #st.sidebar.header('Enter DNA sequence')
32 | st.header('Enter DNA sequence')
33 |
34 | sequence_input = ">DNA Query 2\nGAACACGTGGAGGCAAACAGGAAGGTGAAGAAGAACTTATCCTATCAGGACGGAAGGTCCTGTGCTCGGG\nATCTTCCAGACGTCGCGACTCTAAATTGCCCCCTCTGAGGTCAAGGAACACAAGATGGTTTTGGAAATGC\nTGAACCCGATACATTATAACATCACCAGCATCGTGCCTGAAGCCATGCCTGCTGCCACCATGCCAGTCCT"
35 |
36 | #sequence = st.sidebar.text_area("Sequence input", sequence_input, height=250)
37 | sequence = st.text_area("Sequence input", sequence_input, height=250)
38 | sequence = sequence.splitlines()
39 | sequence = sequence[1:] # Skips the sequence name (first line)
40 | sequence = ''.join(sequence) # Concatenates list to string
41 |
42 | st.write("""
43 | ***
44 | """)
45 |
46 | ## Prints the input DNA sequence
47 | st.header('INPUT (DNA Query)')
48 | sequence
49 |
50 | ## DNA nucleotide count
51 | st.header('OUTPUT (DNA Nucleotide Count)')
52 |
53 | ### 1. Print dictionary
54 | st.subheader('1. Print dictionary')
55 | def DNA_nucleotide_count(seq):
56 | d = dict([
57 | ('A',seq.count('A')),
58 | ('T',seq.count('T')),
59 | ('G',seq.count('G')),
60 | ('C',seq.count('C'))
61 | ])
62 | return d
63 |
64 | X = DNA_nucleotide_count(sequence)
65 |
66 | #X_label = list(X)
67 | #X_values = list(X.values())
68 |
69 | X
70 |
71 | ### 2. Print text
72 | st.subheader('2. Print text')
73 | st.write('There are ' + str(X['A']) + ' adenine (A)')
74 | st.write('There are ' + str(X['T']) + ' thymine (T)')
75 | st.write('There are ' + str(X['G']) + ' guanine (G)')
76 | st.write('There are ' + str(X['C']) + ' cytosine (C)')
77 |
78 | ### 3. Display DataFrame
79 | st.subheader('3. Display DataFrame')
80 | df = pd.DataFrame.from_dict(X, orient='index')
81 | df = df.rename({0: 'count'}, axis='columns')
82 | df.reset_index(inplace=True)
83 | df = df.rename(columns = {'index':'nucleotide'})
84 | st.write(df)
85 |
86 | ### 4. Display Bar Chart using Altair
87 | st.subheader('4. Display Bar chart')
88 | p = alt.Chart(df).mark_bar().encode(
89 | x='nucleotide',
90 | y='count'
91 | )
92 | p = p.properties(
93 | width=alt.Step(80) # controls width of bar.
94 | )
95 | st.write(p)
96 |
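97 |
98 | ######################
99 | # Editor's note
100 | ######################
101 | # DNA_nucleotide_count() above walks the sequence once per base via
102 | # str.count(). An equivalent one-pass tally can use collections.Counter;
103 | # the helper below is hypothetical and is not used by the app.
104 | from collections import Counter
105 |
106 | def DNA_nucleotide_count_onepass(seq):
107 |     counts = Counter(seq)  # single pass over the whole sequence
108 |     return {base: counts[base] for base in 'ATGC'}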
--------------------------------------------------------------------------------
/streamlit/part8/dna-logo.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dataprofessor/code/d494f8093073990fcf77061bd740a0e4c2d40020/streamlit/part8/dna-logo.jpg
--------------------------------------------------------------------------------
/streamlit/part9/football_app.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 | import pandas as pd
3 | import base64
4 | import matplotlib.pyplot as plt
5 | import seaborn as sns
6 | import numpy as np
7 |
8 | st.title('NFL Football Stats (Rushing) Explorer')
9 |
10 | st.markdown("""
11 | This app performs simple webscraping of NFL Football player stats data (focusing on Rushing)!
12 | * **Python libraries:** base64, pandas, streamlit, numpy, matplotlib, seaborn
13 | * **Data source:** [pro-football-reference.com](https://www.pro-football-reference.com/).
14 | """)
15 |
16 | st.sidebar.header('User Input Features')
17 | selected_year = st.sidebar.selectbox('Year', list(reversed(range(1990,2020))))
18 |
19 | # Web scraping of NFL player stats
20 | # https://www.pro-football-reference.com/years/2019/rushing.htm
21 | @st.cache
22 | def load_data(year):
23 | url = "https://www.pro-football-reference.com/years/" + str(year) + "/rushing.htm"
24 | html = pd.read_html(url, header = 1)
25 | df = html[0]
26 | raw = df.drop(df[df.Age == 'Age'].index) # Deletes repeating headers in content
27 | raw = raw.fillna(0)
28 | playerstats = raw.drop(['Rk'], axis=1)
29 | return playerstats
30 | playerstats = load_data(selected_year)
31 |
32 | # Sidebar - Team selection
33 | sorted_unique_team = sorted(playerstats.Tm.unique())
34 | selected_team = st.sidebar.multiselect('Team', sorted_unique_team, sorted_unique_team)
35 |
36 | # Sidebar - Position selection
37 | unique_pos = ['RB','QB','WR','FB','TE']
38 | selected_pos = st.sidebar.multiselect('Position', unique_pos, unique_pos)
39 |
40 | # Filtering data
41 | df_selected_team = playerstats[(playerstats.Tm.isin(selected_team)) & (playerstats.Pos.isin(selected_pos))]
42 |
43 | st.header('Display Player Stats of Selected Team(s)')
44 | st.write('Data Dimension: ' + str(df_selected_team.shape[0]) + ' rows and ' + str(df_selected_team.shape[1]) + ' columns.')
45 | st.dataframe(df_selected_team)
46 |
47 | # Download NFL player stats data
48 | # https://discuss.streamlit.io/t/how-to-download-file-in-streamlit/1806
49 | def filedownload(df):
50 | csv = df.to_csv(index=False)
51 | b64 = base64.b64encode(csv.encode()).decode() # strings <-> bytes conversions
52 |     href = f'<a href="data:file/csv;base64,{b64}" download="playerstats.csv">Download CSV File</a>'
53 | return href
54 |
55 | st.markdown(filedownload(df_selected_team), unsafe_allow_html=True)
56 |
57 | # Heatmap
58 | if st.button('Intercorrelation Heatmap'):
59 | st.header('Intercorrelation Matrix Heatmap')
60 |     df_selected_team.to_csv('output.csv',index=False)  # round-trip through CSV
61 |     df = pd.read_csv('output.csv')  # so pandas re-infers numeric dtypes for corr()
62 |
63 | corr = df.corr()
64 | mask = np.zeros_like(corr)
65 | mask[np.triu_indices_from(mask)] = True
66 | with sns.axes_style("white"):
67 | f, ax = plt.subplots(figsize=(7, 5))
68 | ax = sns.heatmap(corr, mask=mask, vmax=1, square=True)
69 |     st.pyplot(f)  # pass the figure explicitly (required by newer Streamlit)
70 |
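71 |
72 | # Editor's note: the hand-rolled base64 data-URI anchor in filedownload()
73 | # predates st.download_button, which later Streamlit releases (>= 0.88)
74 | # provide as a built-in. The helper below is a hypothetical drop-in and
75 | # is not wired into the app:
76 | def filedownload_native(df):
77 |     csv = df.to_csv(index=False)
78 |     st.download_button(label='Download CSV File', data=csv,
79 |                        file_name='playerstats.csv', mime='text/csv')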
--------------------------------------------------------------------------------