├── Home.py
├── README.md
├── pages
│   ├── 1_Pre_Work.py
│   ├── 2_Step_1.py
│   ├── 3_Step_2.py
│   ├── 4_Step_3_A.py
│   ├── 4_Step_3_B.py
│   ├── 5_Step_4.py
│   ├── 6_Step_5.py
│   └── 7_Step_6_&_7.py
├── requirements.txt
└── resources
    └── images
        └── ml_pipeline.jpg

--------------------------------------------------------------------------------
/Home.py:
--------------------------------------------------------------------------------
import streamlit as st
from PIL import Image
import os

st.title("Model Building Pipeline")

# Get the absolute path to the directory of the current script
current_dir = os.path.dirname(os.path.abspath(__file__))
# Build the path from the current script to the image file
file_path = os.path.join(current_dir, "resources", "images", "ml_pipeline.jpg")

image = Image.open(file_path)
st.image(image, caption='Credits: M Ravi Teja')

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Model-Building-Pipeline-Tutorial

--------------------------------------------------------------------------------
/pages/1_Pre_Work.py:
--------------------------------------------------------------------------------
# Data Collection
# Data Cleaning
# Data Exploration

import streamlit as st

st.title("Pre Work")

st.subheader("1. Data Collection")

st.subheader("2. Data Cleaning")

st.subheader("3. Data Exploration")

--------------------------------------------------------------------------------
/pages/2_Step_1.py:
--------------------------------------------------------------------------------
# Identify Input and Output
import streamlit as st

st.title("Identify Target Variable and Predictors")

with st.expander("See explanation"):
    st.write("""
    **Identifying the Target Variable (i.e. y):** Helps in choosing the Algorithm and the Evaluation Metrics.
    **Identifying the Predictor Variables (i.e. X):** Helps in choosing the correct Data Preparation Strategy.
    """)

CODE_SNIPPET_1 = '''
# Code Snippet 1
y = df['target']
X = df[['predictor_1', 'predictor_2', ...]]
'''

CODE_SNIPPET_2 = '''
# Code Snippet 2
y = df.pop("target")
X = df
'''

st.code(CODE_SNIPPET_1, language='python')

st.code(CODE_SNIPPET_2, language='python')

with st.expander("Important Note"):
    st.write("""
    There can be many predictors in the dataset, and by this point you should already have done an end-to-end EDA.
    It is now your responsibility to choose the best predictors.
    Identifying the best predictors ensures that you are building the model on **Good Quality Data**.
    """)
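# A hedged, illustrative extra: one quick way to shortlist numeric predictors.
# Everything in the snippet (column names, the 0.1 threshold) is a placeholder.
CODE_SNIPPET_3 = '''
# Code Snippet 3 (illustrative sketch)
# Rank numeric predictors by absolute correlation with the target.
# Assumes df holds only numeric columns, including a numeric 'target'.
correlations = df.corr()['target'].abs().sort_values(ascending=False)

# Keep predictors whose |correlation| clears a chosen threshold
selected = correlations[correlations > 0.1].index.drop('target')
X = df[selected]
'''

with st.expander("Optional: a simple predictor screen (illustrative)"):
    st.write("""
    A minimal sketch, assuming numeric predictors and a numeric target.
    Correlation is only one quick screen and misses non-linear relationships,
    so treat it as a starting point for choosing predictors, not a rule.
    """)
    st.code(CODE_SNIPPET_3, language='python')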
11 | """) 12 | 13 | CODE_SNIPPET = ''' 14 | # Code Snippet 15 | from sklearn.model_selection import train_test_split 16 | X_train, X_test, y_train, y_test = train_test_split(X, y, 17 | train_size=0.7, 18 | random_state=100) 19 | ''' 20 | 21 | st.code(CODE_SNIPPET, language='python') 22 | 23 | with st.expander("Important Note"): 24 | st.write(""" 25 | Never apply data preparation before train_test_split. 26 | It can lead to **Data Leakage** problem. 27 | **Never make this mistake.** 28 | """) 29 | -------------------------------------------------------------------------------- /pages/4_Step_3_A.py: -------------------------------------------------------------------------------- 1 | # Data Preparation: 2 | import streamlit as st 3 | 4 | st.title("Data Preparation: Data Cleaning and Feature Engineering") 5 | 6 | with st.expander("See explanation"): 7 | st.write(""" 8 | Choosing appropriate Data Preparation Technique is important. 9 | If we fail to apply this step properly, Machine Learning Models won't be good. 10 | **Remeber Garbage In Garbage Out**. 11 | """) 12 | 13 | st.subheader("Data Cleaning Coming Soon. Go to Step 3 B for Feature Engineering.") -------------------------------------------------------------------------------- /pages/4_Step_3_B.py: -------------------------------------------------------------------------------- 1 | # Data Preparation on Training Data 2 | import streamlit as st 3 | 4 | st.title("Feature Engineering: Preparing the Training Data (i.e. X_train)") 5 | 6 | 7 | with st.expander("See explanation"): 8 | st.write(""" 9 | Here we have covered Feature Engineering for Continuous and Discrete Features. 10 | """) 11 | 12 | CODE_SNIPPET_1 = ''' 13 | # Rescaling the Numerical Features using Standardization 14 | from sklearn.preprocessing import StandardScaler 15 | 16 | # Creating object of StandardScaler class 17 | scaler = StandardScaler() 18 | 19 | # column names are (annoyingly) lost after Scaling 20 | # (i.e. the dataframe is converted to a numpy ndarray) 21 | X_train_transformed = pd.DataFrame(scaler.fit_transform(X_train), 22 | columns=X_train.columns, 23 | index=X_train.index) 24 | 25 | X_train_transformed.head() 26 | ''' 27 | 28 | CODE_SNIPPET_1_2_TEST = ''' 29 | X_test_transformed = pd.DataFrame(scaler.transform(X_test), 30 | columns=X_test.columns, 31 | index=X_test.index) 32 | ''' 33 | 34 | 35 | CODE_SNIPPET_2 = ''' 36 | # Rescaling the Numerical Features using Normalization 37 | from sklearn.preprocessing import MinMaxScaler 38 | 39 | # Creating object of MinMaxScaler class 40 | scaler = MinMaxScaler() 41 | 42 | # column names are (annoyingly) lost after Scaling 43 | # (i.e. the dataframe is converted to a numpy ndarray) 44 | X_train_transformed = pd.DataFrame(scaler.fit_transform(X_train), 45 | columns=X_train.columns, 46 | index=X_train.index) 47 | 48 | X_train_transformed.head() 49 | ''' 50 | 51 | CODE_SNIPPET_3 = ''' 52 | # OneHotEncoding the categorical features 53 | from sklearn.preprocessing import OneHotEncoder 54 | 55 | # Creating object of OneHotEncoder 56 | encoder = OneHotEncoder(drop='first', sparse=False) 57 | 58 | # column names are (annoyingly) lost after OneHotEncoding 59 | # (i.e. 
with st.expander("Important Note"):
    st.write("""
    Never apply data preparation before train_test_split;
    it can lead to a **Data Leakage** problem.
    **Never make this mistake.**
    """)

--------------------------------------------------------------------------------
/pages/4_Step_3_A.py:
--------------------------------------------------------------------------------
# Data Preparation:
import streamlit as st

st.title("Data Preparation: Data Cleaning and Feature Engineering")

with st.expander("See explanation"):
    st.write("""
    Choosing an appropriate Data Preparation Technique is important.
    If we fail to apply this step properly, the Machine Learning Models won't perform well.
    **Remember: Garbage In, Garbage Out.**
    """)

st.subheader("Data Cleaning Coming Soon. Go to Step 3 B for Feature Engineering.")

--------------------------------------------------------------------------------
/pages/4_Step_3_B.py:
--------------------------------------------------------------------------------
# Data Preparation on Training Data
import streamlit as st

st.title("Feature Engineering: Preparing the Training Data (i.e. X_train)")

with st.expander("See explanation"):
    st.write("""
    Here we cover Feature Engineering for Continuous and Discrete Features.
    """)

CODE_SNIPPET_1 = '''
# Rescaling the Numerical Features using Standardization
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Creating an object of the StandardScaler class
scaler = StandardScaler()

# column names are (annoyingly) lost after Scaling
# (i.e. the dataframe is converted to a numpy ndarray)
X_train_transformed = pd.DataFrame(scaler.fit_transform(X_train),
                                   columns=X_train.columns,
                                   index=X_train.index)

X_train_transformed.head()
'''

CODE_SNIPPET_1_2_TEST = '''
# Reuse the scaler fitted on the train data; never refit on the test data
X_test_transformed = pd.DataFrame(scaler.transform(X_test),
                                  columns=X_test.columns,
                                  index=X_test.index)
'''

CODE_SNIPPET_2 = '''
# Rescaling the Numerical Features using Normalization
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Creating an object of the MinMaxScaler class
scaler = MinMaxScaler()

# column names are (annoyingly) lost after Scaling
# (i.e. the dataframe is converted to a numpy ndarray)
X_train_transformed = pd.DataFrame(scaler.fit_transform(X_train),
                                   columns=X_train.columns,
                                   index=X_train.index)

X_train_transformed.head()
'''

CODE_SNIPPET_3 = '''
# OneHotEncoding the categorical features
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# Creating an object of OneHotEncoder
# (use sparse=False instead of sparse_output=False on scikit-learn < 1.2)
encoder = OneHotEncoder(drop='first', sparse_output=False)

# column names are (annoyingly) lost after OneHotEncoding
# (i.e. the dataframe is converted to a numpy ndarray)
X_train_transformed = pd.DataFrame(encoder.fit_transform(X_train),
                                   columns=encoder.get_feature_names_out(),
                                   index=X_train.index)

X_train_transformed.head()
'''

CODE_SNIPPET_3_TEST = '''
# Reuse the encoder fitted on the train data
X_test_transformed = pd.DataFrame(encoder.transform(X_test),
                                  columns=encoder.get_feature_names_out(),
                                  index=X_test.index)
'''

CODE_SNIPPET_4 = '''
# Manual ordinal mapping (often called Label Encoding in tutorials)
import pandas as pd

# Create an empty dataframe which will contain the transformed data
X_train_transformed = pd.DataFrame(index=X_train.index)

# Define an encoding based on the ranking of each category
cut_encoder = {'Fair' : 1, 'Good' : 2, 'Very Good' : 3, 'Ideal' : 4, 'Premium' : 5}
color_encoder = {'J':1, 'I':2, 'H':3, 'G':4, 'F':5, 'E':6, 'D':7}

# Apply the encoding
X_train_transformed['cut'] = X_train['cut'].apply(lambda x : cut_encoder[x])
X_train_transformed['color'] = X_train['color'].apply(lambda x : color_encoder[x])
'''

CODE_SNIPPET_4_TEST = '''
# Apply the same train-time mappings to the test data
X_test_transformed = pd.DataFrame(index=X_test.index)
X_test_transformed['cut'] = X_test['cut'].apply(lambda x : cut_encoder[x])
X_test_transformed['color'] = X_test['color'].apply(lambda x : color_encoder[x])
'''

CODE_SNIPPET_5 = '''
# Separate Discrete and Continuous Columns
import pandas as pd

X_train_cat = X_train.select_dtypes(include=['object'])
X_train_num = X_train.select_dtypes(include=['int64', 'float64'])

# Apply Standardization/Normalization to X_train_num
# Apply OHE/Label Encoding to X_train_cat

# Concatenate the two transformed parts
X_train_transformed = pd.concat([X_train_num_transformed, X_train_cat_transformed], axis=1)
'''
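# A hedged alternative to the manual split-and-concatenate shown in the
# 'Both' option below: scikit-learn's ColumnTransformer applies a different
# preparation step to each column group in one object. Illustrative only.
CODE_SNIPPET_6 = '''
# Alternative sketch: ColumnTransformer for mixed datatypes
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

num_cols = X_train.select_dtypes(include=['int64', 'float64']).columns
cat_cols = X_train.select_dtypes(include=['object']).columns

preprocessor = ColumnTransformer([
    ('num', StandardScaler(), num_cols),
    ('cat', OneHotEncoder(drop='first'), cat_cols),
])

# Fit on the train data only, then reuse the fitted object on the test data
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)
'''

with st.expander("Optional: ColumnTransformer for mixed datatypes (illustrative)"):
    st.write("""
    A minimal sketch, assuming X_train mixes numeric and object columns:
    `ColumnTransformer` bundles one transformer per column group, so the
    fit-on-train / transform-on-test discipline is enforced by a single object.
    """)
    st.code(CODE_SNIPPET_6, language='python')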
option = st.selectbox('What are the datatypes of the columns in your X_train?',
                      ('Continuous', 'Discrete', 'Both'))

if(option=='Continuous'):
    st.write("""
    For Continuous Predictors, it is very important to rescale them,
    because each predictor might be measured on a different scale.
    Without rescaling, many machine learning and deep learning algorithms
    struggle to learn patterns.
    There are two strategies to rescale Continuous Data:
    1. Standardization
    2. Normalization
    """)
    tab1, tab2 = st.tabs(["Standardization", "Normalization"])
    with tab1:
        st.code(CODE_SNIPPET_1, language='python')
    with tab2:
        st.code(CODE_SNIPPET_2, language='python')
    toggle = st.toggle("Click here to learn how to rescale the X_test Data")
    if toggle:
        st.code(CODE_SNIPPET_1_2_TEST, language='python')
elif(option=='Discrete'):
    st.write("""
    For Discrete Predictors (if non-numerical), it is very important to transform them,
    because most algorithms can't take non-numerical data as input.
    There are two strategies for this transformation:
    1. One Hot Encoding
    2. Label Encoding
    """)
    tab1, tab2 = st.tabs(["One Hot Encoding", "Label Encoding"])
    with tab1:
        st.code(CODE_SNIPPET_3, language='python')
        toggle_1 = st.toggle("Click here to learn how to encode the X_test Data", key='c_1')
        if toggle_1:
            st.code(CODE_SNIPPET_3_TEST, language='python')

    with tab2:
        st.code(CODE_SNIPPET_4, language='python')
        toggle_2 = st.toggle("Click here to learn how to encode the X_test Data", key='c_2')
        if toggle_2:
            st.code(CODE_SNIPPET_4_TEST, language='python')

elif(option=="Both"):
    st.code(CODE_SNIPPET_5, language='python')

--------------------------------------------------------------------------------
/pages/5_Step_4.py:
--------------------------------------------------------------------------------
# Building a Model
import streamlit as st

st.title("Training Phase: Building a Model")

CODE_SNIPPET_1 = '''
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier()
classifier.fit(X_train_transformed, y_train)
'''

CODE_SNIPPET_2 = '''
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier()
classifier.fit(X_train_transformed, y_train)
'''

CODE_SNIPPET_3 = '''
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train_transformed, y_train)
'''

CODE_SNIPPET_4 = '''
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier.fit(X_train_transformed, y_train)
'''

CODE_SNIPPET_5 = '''
from sklearn.svm import SVC
classifier = SVC()
classifier.fit(X_train_transformed, y_train)
'''

CODE_SNIPPET_6 = '''
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier()
classifier.fit(X_train_transformed, y_train)
'''

CODE_SNIPPET_7 = '''
from sklearn.neighbors import KNeighborsRegressor
regressor = KNeighborsRegressor()
regressor.fit(X_train_transformed, y_train)
'''

CODE_SNIPPET_8 = '''
from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor()
regressor.fit(X_train_transformed, y_train)
'''

CODE_SNIPPET_9 = '''
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train_transformed, y_train)
'''

CODE_SNIPPET_10 = '''
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor()
regressor.fit(X_train_transformed, y_train)
'''
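# A hedged aside: before trusting a single train/test score, k-fold
# cross-validation on the training data gives a more stable estimate.
CODE_SNIPPET_CV = '''
# Sketch: 5-fold cross-validation on the training data
from sklearn.model_selection import cross_val_score
scores = cross_val_score(classifier, X_train_transformed, y_train, cv=5)
print(scores.mean(), scores.std())
'''

with st.expander("Optional: cross-validation (illustrative)"):
    st.write("""
    A minimal sketch, assuming `classifier` is any of the estimators above:
    `cross_val_score` fits a fresh copy of the model on each of 5
    train/validation folds and returns one score per fold, which is a more
    stable read on performance than a single split.
    """)
    st.code(CODE_SNIPPET_CV, language='python')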
option = st.selectbox('Select the type of Algorithm (HINT: from Step 1, based on the target variable)',
                      ('Classification', 'Regression'))

if(option=='Classification'):
    tab1, tab2, tab3, tab4, tab5, tab6 = st.tabs(["KNN", "DT", "Naive Bayes",
                                                  "Logistic Regression", "SVC",
                                                  "Random Forest"])
    with tab1:
        st.code(CODE_SNIPPET_1, language='python')
    with tab2:
        st.code(CODE_SNIPPET_2, language='python')
    with tab3:
        st.code(CODE_SNIPPET_3, language='python')
    with tab4:
        st.code(CODE_SNIPPET_4, language='python')
    with tab5:
        st.code(CODE_SNIPPET_5, language='python')
    with tab6:
        st.code(CODE_SNIPPET_6, language='python')
elif(option=='Regression'):
    tab1, tab2, tab3, tab4 = st.tabs(["KNN", "DT",
                                      "Linear Regression",
                                      "Random Forest"])
    with tab1:
        st.code(CODE_SNIPPET_7, language='python')
    with tab2:
        st.code(CODE_SNIPPET_8, language='python')
    with tab3:
        st.code(CODE_SNIPPET_9, language='python')
    with tab4:
        st.code(CODE_SNIPPET_10, language='python')

--------------------------------------------------------------------------------
/pages/6_Step_5.py:
--------------------------------------------------------------------------------
# Data Preparation on Test Data
import streamlit as st

st.title("Data Preparation: Preparing the Test Data (i.e. X_test)")

st.subheader("Explore Step 3 B. This step has already been covered there.")

--------------------------------------------------------------------------------
/pages/7_Step_6_&_7.py:
--------------------------------------------------------------------------------
# Predicting on unseen data
import streamlit as st

st.title("Testing Phase: Prediction and Evaluation")

st.header("Predicting on Unseen Data")

CODE_SNIPPET = '''
# `model` is the fitted classifier/regressor from Step 4
y_test_pred = model.predict(X_test_transformed)
'''

st.code(CODE_SNIPPET, language='python')

st.header("Evaluating the Model")

CODE_SNIPPET_1 = '''
from sklearn import metrics
metrics.accuracy_score(y_test, y_test_pred)
'''

CODE_SNIPPET_2 = '''
from sklearn import metrics
metrics.confusion_matrix(y_test, y_test_pred)
'''

CODE_SNIPPET_3 = '''
from sklearn import metrics
print(metrics.classification_report(y_test, y_test_pred))
'''

CODE_SNIPPET_4 = '''
from sklearn import metrics
metrics.mean_absolute_error(y_test, y_test_pred)
'''

CODE_SNIPPET_5 = '''
from sklearn import metrics
metrics.mean_squared_error(y_test, y_test_pred)
'''

CODE_SNIPPET_6 = '''
import numpy as np
from sklearn import metrics
np.sqrt(metrics.mean_squared_error(y_test, y_test_pred))
'''

CODE_SNIPPET_7 = '''
from sklearn import metrics
metrics.r2_score(y_test, y_test_pred)
'''
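# A hedged aside: classifiers usually expose predicted probabilities as
# well as hard class labels.
CODE_SNIPPET_PROBA = '''
# Sketch (classification only): predicted class probabilities
y_test_proba = model.predict_proba(X_test_transformed)
'''

with st.expander("Optional: predicted probabilities (classification)"):
    st.write("""
    A minimal sketch, assuming `model` is a fitted classifier from Step 4.
    Most scikit-learn classifiers implement `predict_proba`, which returns
    one probability per class instead of a single predicted label
    (SVC needs `probability=True` at construction time).
    """)
    st.code(CODE_SNIPPET_PROBA, language='python')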
option = st.selectbox('Select the type of evaluation metric (HINT: from Step 1, based on the target variable)',
                      ('Classification', 'Regression'))

if(option=='Classification'):
    tab1, tab2, tab3 = st.tabs(["Accuracy", "Confusion Matrix",
                                "Classification Report"])
    with tab1:
        st.code(CODE_SNIPPET_1, language='python')
    with tab2:
        st.code(CODE_SNIPPET_2, language='python')
    with tab3:
        st.code(CODE_SNIPPET_3, language='python')
elif(option=='Regression'):
    tab1, tab2, tab3, tab4 = st.tabs(["MAE", "MSE",
                                      "RMSE",
                                      "R2 Score"])
    with tab1:
        st.code(CODE_SNIPPET_4, language='python')
    with tab2:
        st.code(CODE_SNIPPET_5, language='python')
    with tab3:
        st.code(CODE_SNIPPET_6, language='python')
    with tab4:
        st.code(CODE_SNIPPET_7, language='python')

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
altair==5.1.2
attrs==23.1.0
blinker==1.7.0
cachetools==5.3.2
certifi==2023.11.17
charset-normalizer==3.3.2
click==8.1.7
colorama==0.4.6
gitdb==4.0.11
GitPython==3.1.40
idna==3.4
importlib-metadata==6.8.0
Jinja2==3.1.2
jsonschema==4.20.0
jsonschema-specifications==2023.11.1
markdown-it-py==3.0.0
MarkupSafe==2.1.3
mdurl==0.1.2
numpy==1.26.2
packaging==23.2
pandas==2.1.3
Pillow==10.1.0
protobuf==4.25.1
pyarrow==14.0.1
pydeck==0.8.1b0
Pygments==2.17.2
python-dateutil==2.8.2
pytz==2023.3.post1
referencing==0.31.0
requests==2.31.0
rich==13.7.0
rpds-py==0.13.1
six==1.16.0
smmap==5.0.1
streamlit==1.28.2
tenacity==8.2.3
toml==0.10.2
toolz==0.12.0
tornado==6.3.3
typing_extensions==4.8.0
tzdata==2023.3
tzlocal==5.2
urllib3==2.1.0
validators==0.22.0
watchdog==3.0.0
zipp==3.17.0

--------------------------------------------------------------------------------
/resources/images/ml_pipeline.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bansalkanav/Model-Building-Pipeline-Tutorial/ec1f517cb5cb30aec5655d168084e5fb9a04a9b0/resources/images/ml_pipeline.jpg