├── README.md
├── ml-app.py
└── requirements.txt

/README.md:
--------------------------------------------------------------------------------
# mlapp

# Watch the tutorial video

[How to Build a Machine Learning App | Streamlit #13](https://youtu.be/eT3JMZagMnE)

# Demo

Launch the web app:

[![Streamlit App](https://static.streamlit.io/badges/streamlit_badge_black_white.svg)](https://share.streamlit.io/dataprofessor/ml-app/main/ml-app.py)

# Reproducing this web app

To recreate this web app on your own computer, do the following.

### Create conda environment

Firstly, we will create a conda environment called *ml*
```
conda create -n ml python=3.7.9
```
Secondly, we will activate the *ml* environment
```
conda activate ml
```

### Install prerequisite libraries

Download the requirements.txt file
```
wget https://raw.githubusercontent.com/dataprofessor/ml-app/main/requirements.txt
```

Install the libraries with pip
```
pip install -r requirements.txt
```

### Download and unzip contents from GitHub repo

Download and unzip the contents from https://github.com/dataprofessor/ml-app/archive/main.zip

### Launch the app

```
streamlit run ml-app.py
```
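# How the model is built

Under the hood, `ml-app.py` treats the last column of the uploaded CSV as the target Y and the remaining columns as features X, splits the data with `train_test_split`, fits a `RandomForestRegressor`, and reports the R^2 and MSE. Below is a minimal standalone sketch of that workflow; the synthetic dataframe and column names are illustrative assumptions, not files from this repo:

```
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Illustrative stand-in for the uploaded CSV (assumed data, not shipped with the repo):
# three feature columns plus a last column holding the target.
rng = np.random.RandomState(42)
df = pd.DataFrame(rng.rand(200, 4), columns=['x1', 'x2', 'x3', 'y'])

# Same convention as build_model(): all columns except the last are X, the last is Y
X, Y = df.iloc[:, :-1], df.iloc[:, -1]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, Y_train)

Y_pred = rf.predict(X_test)
print('Test R^2:', r2_score(Y_test, Y_pred))
print('Test MSE:', mean_squared_error(Y_test, Y_pred))
```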
--------------------------------------------------------------------------------
/ml-app.py:
--------------------------------------------------------------------------------
import streamlit as st
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.datasets import load_diabetes, load_boston

#---------------------------------#
# Page layout
## Page expands to full width
st.set_page_config(page_title='The Machine Learning App',
                   layout='wide')

#---------------------------------#
# Model building
def build_model(df):
    X = df.iloc[:,:-1] # Use all columns except the last one as X
    Y = df.iloc[:,-1]  # Use the last column as Y

    # Data splitting
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=(100-split_size)/100)

    st.markdown('**1.2. Data splits**')
    st.write('Training set')
    st.info(X_train.shape)
    st.write('Test set')
    st.info(X_test.shape)

    st.markdown('**1.3. Variable details**:')
    st.write('X variable')
    st.info(list(X.columns))
    st.write('Y variable')
    st.info(Y.name)

    rf = RandomForestRegressor(n_estimators=parameter_n_estimators,
                               random_state=parameter_random_state,
                               max_features=parameter_max_features,
                               criterion=parameter_criterion,
                               min_samples_split=parameter_min_samples_split,
                               min_samples_leaf=parameter_min_samples_leaf,
                               bootstrap=parameter_bootstrap,
                               oob_score=parameter_oob_score,
                               n_jobs=parameter_n_jobs)
    rf.fit(X_train, Y_train)

    st.subheader('2. Model Performance')

    st.markdown('**2.1. Training set**')
    Y_pred_train = rf.predict(X_train)
    st.write('Coefficient of determination ($R^2$):')
    st.info( r2_score(Y_train, Y_pred_train) )

    st.write('Error (MSE or MAE):')
    st.info( mean_squared_error(Y_train, Y_pred_train) )

    st.markdown('**2.2. Test set**')
    Y_pred_test = rf.predict(X_test)
    st.write('Coefficient of determination ($R^2$):')
    st.info( r2_score(Y_test, Y_pred_test) )

    st.write('Error (MSE or MAE):')
    st.info( mean_squared_error(Y_test, Y_pred_test) )

    st.subheader('3. Model Parameters')
    st.write(rf.get_params())

#---------------------------------#
st.write("""
# The Machine Learning App

In this implementation, the *RandomForestRegressor()* function is used to build a regression model using the **Random Forest** algorithm.

Try adjusting the hyperparameters!

""")

#---------------------------------#
# Sidebar - Collects user input features into dataframe
with st.sidebar.header('1. Upload your CSV data'):
    uploaded_file = st.sidebar.file_uploader("Upload your input CSV file", type=["csv"])
    st.sidebar.markdown("""
[Example CSV input file](https://raw.githubusercontent.com/dataprofessor/data/master/delaney_solubility_with_descriptors.csv)
""")

# Sidebar - Specify parameter settings
with st.sidebar.header('2. Set Parameters'):
    split_size = st.sidebar.slider('Data split ratio (% for Training Set)', 10, 90, 80, 5)

with st.sidebar.subheader('2.1. Learning Parameters'):
    # Slider minimum is 100 (not 0): RandomForestRegressor requires n_estimators > 0
    parameter_n_estimators = st.sidebar.slider('Number of estimators (n_estimators)', 100, 1000, 100, 100)
    parameter_max_features = st.sidebar.select_slider('Max features (max_features)', options=['auto', 'sqrt', 'log2'])
    # Slider minimum is 2 (not 1): an integer min_samples_split must be at least 2
    parameter_min_samples_split = st.sidebar.slider('Minimum number of samples required to split an internal node (min_samples_split)', 2, 10, 2, 1)
    parameter_min_samples_leaf = st.sidebar.slider('Minimum number of samples required to be at a leaf node (min_samples_leaf)', 1, 10, 2, 1)

with st.sidebar.subheader('2.2. General Parameters'):
    parameter_random_state = st.sidebar.slider('Seed number (random_state)', 0, 1000, 42, 1)
    parameter_criterion = st.sidebar.select_slider('Performance measure (criterion)', options=['mse', 'mae'])
    parameter_bootstrap = st.sidebar.select_slider('Bootstrap samples when building trees (bootstrap)', options=[True, False])
    parameter_oob_score = st.sidebar.select_slider('Whether to use out-of-bag samples to estimate the R^2 on unseen data (oob_score)', options=[False, True])
    parameter_n_jobs = st.sidebar.select_slider('Number of jobs to run in parallel (n_jobs)', options=[1, -1])

#---------------------------------#
# Main panel

# Displays the dataset
st.subheader('1. Dataset')

if uploaded_file is not None:
    df = pd.read_csv(uploaded_file)
    st.markdown('**1.1. Glimpse of dataset**')
    st.write(df)
    build_model(df)
else:
    st.info('Awaiting CSV file to be uploaded.')
    if st.button('Press to use Example Dataset'):
        # Diabetes dataset
        #diabetes = load_diabetes()
        #X = pd.DataFrame(diabetes.data, columns=diabetes.feature_names)
        #Y = pd.Series(diabetes.target, name='response')
        #df = pd.concat( [X,Y], axis=1 )

        #st.markdown('The Diabetes dataset is used as the example.')
        #st.write(df.head(5))

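        # Note: load_boston is available in the pinned scikit-learn==0.23.2, but it
        # was deprecated in scikit-learn 1.0 and removed in 1.2. On a newer
        # scikit-learn, switch to the commented-out Diabetes example above.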
        # Boston housing dataset
        boston = load_boston()
        X = pd.DataFrame(boston.data, columns=boston.feature_names)
        Y = pd.Series(boston.target, name='response')
        df = pd.concat( [X,Y], axis=1 )

        st.markdown('The Boston housing dataset is used as the example.')
        st.write(df.head(5))

        build_model(df)

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
streamlit==0.71.0
pandas==1.1.3
base58==2.0.1
numpy==1.19.2
pillow==8.0.1
scikit-learn==0.23.2
--------------------------------------------------------------------------------