├── Home.py
├── README.md
├── pages
│   ├── 1_Pre_Work.py
│   ├── 2_Step_1.py
│   ├── 3_Step_2.py
│   ├── 4_Step_3_A.py
│   ├── 4_Step_3_B.py
│   ├── 5_Step_4.py
│   ├── 6_Step_5.py
│   └── 7_Step_6_&_7.py
├── requirements.txt
└── resources
    └── images
        └── ml_pipeline.jpg

--------------------------------------------------------------------------------
/Home.py:
--------------------------------------------------------------------------------
import streamlit as st
from PIL import Image
import os

st.title("Model Building Pipeline")

# Get the absolute path to the directory of the current script
current_dir = os.path.dirname(os.path.abspath(__file__))
# Build the path from the current script to the image file
file_path = os.path.join(current_dir, "resources", "images", "ml_pipeline.jpg")

image = Image.open(file_path)
st.image(image, caption='Credits: M Ravi Teja')

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Model-Building-Pipeline-Tutorial

--------------------------------------------------------------------------------
/pages/1_Pre_Work.py:
--------------------------------------------------------------------------------
# Data Collection
# Data Cleaning
# Data Exploration

import streamlit as st

st.title("Pre Work")

st.subheader("1. Data Collection")

st.subheader("2. Data Cleaning")

st.subheader("3. Data Exploration")

--------------------------------------------------------------------------------
/pages/2_Step_1.py:
--------------------------------------------------------------------------------
# Identify Input and Output
import streamlit as st

st.title("Identify Target Variable and Predictors")

with st.expander("See explanation"):
    st.write("""
    **Identifying the Target Variable (i.e. y):** Helps in choosing the Algorithm and the Evaluation Metrics.
    **Identifying the Predictor Variables (i.e. X):** Helps in choosing the correct Data Preparation Strategy.
    """)

CODE_SNIPPET_1 = '''
# Code Snippet 1
y = df['target']
X = df[['predictor_1', 'predictor_2', ...]]
'''

CODE_SNIPPET_2 = '''
# Code Snippet 2
y = df.pop("target")
X = df
'''

st.code(CODE_SNIPPET_1, language='python')

st.code(CODE_SNIPPET_2, language='python')

with st.expander("Important Note"):
    st.write("""
    There can be many predictors in the dataset, and by this point you should already have done an end-to-end EDA.
    It is now your responsibility to choose the best predictors.
    Identifying the best predictors ensures that you are building the model on **Good Quality Data**.
    """)
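# A hedged, illustrative extra: one quick way to shortlist numeric predictors.
# Everything in the snippet (column names, the 0.1 threshold) is a placeholder.
CODE_SNIPPET_3 = '''
# Code Snippet 3 (illustrative sketch)
# Rank numeric predictors by absolute correlation with the target.
# Assumes df holds only numeric columns, including a numeric 'target'.
correlations = df.corr()['target'].abs().sort_values(ascending=False)

# Keep predictors whose |correlation| clears a chosen threshold
selected = correlations[correlations > 0.1].index.drop('target')
X = df[selected]
'''

with st.expander("Optional: a simple predictor screen (illustrative)"):
    st.write("""
    A minimal sketch, assuming numeric predictors and a numeric target.
    Correlation is only one quick screen and misses non-linear relationships,
    so treat it as a starting point for choosing predictors, not a rule.
    """)
    st.code(CODE_SNIPPET_3, language='python')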
11 | """) 12 | 13 | CODE_SNIPPET = ''' 14 | # Code Snippet 15 | from sklearn.model_selection import train_test_split 16 | X_train, X_test, y_train, y_test = train_test_split(X, y, 17 | train_size=0.7, 18 | random_state=100) 19 | ''' 20 | 21 | st.code(CODE_SNIPPET, language='python') 22 | 23 | with st.expander("Important Note"): 24 | st.write(""" 25 | Never apply data preparation before train_test_split. 26 | It can lead to **Data Leakage** problem. 27 | **Never make this mistake.** 28 | """) 29 | -------------------------------------------------------------------------------- /pages/4_Step_3_A.py: -------------------------------------------------------------------------------- 1 | # Data Preparation: 2 | import streamlit as st 3 | 4 | st.title("Data Preparation: Data Cleaning and Feature Engineering") 5 | 6 | with st.expander("See explanation"): 7 | st.write(""" 8 | Choosing appropriate Data Preparation Technique is important. 9 | If we fail to apply this step properly, Machine Learning Models won't be good. 10 | **Remeber Garbage In Garbage Out**. 11 | """) 12 | 13 | st.subheader("Data Cleaning Coming Soon. Go to Step 3 B for Feature Engineering.") -------------------------------------------------------------------------------- /pages/4_Step_3_B.py: -------------------------------------------------------------------------------- 1 | # Data Preparation on Training Data 2 | import streamlit as st 3 | 4 | st.title("Feature Engineering: Preparing the Training Data (i.e. X_train)") 5 | 6 | 7 | with st.expander("See explanation"): 8 | st.write(""" 9 | Here we have covered Feature Engineering for Continuous and Discrete Features. 10 | """) 11 | 12 | CODE_SNIPPET_1 = ''' 13 | # Rescaling the Numerical Features using Standardization 14 | from sklearn.preprocessing import StandardScaler 15 | 16 | # Creating object of StandardScaler class 17 | scaler = StandardScaler() 18 | 19 | # column names are (annoyingly) lost after Scaling 20 | # (i.e. the dataframe is converted to a numpy ndarray) 21 | X_train_transformed = pd.DataFrame(scaler.fit_transform(X_train), 22 | columns=X_train.columns, 23 | index=X_train.index) 24 | 25 | X_train_transformed.head() 26 | ''' 27 | 28 | CODE_SNIPPET_1_2_TEST = ''' 29 | X_test_transformed = pd.DataFrame(scaler.transform(X_test), 30 | columns=X_test.columns, 31 | index=X_test.index) 32 | ''' 33 | 34 | 35 | CODE_SNIPPET_2 = ''' 36 | # Rescaling the Numerical Features using Normalization 37 | from sklearn.preprocessing import MinMaxScaler 38 | 39 | # Creating object of MinMaxScaler class 40 | scaler = MinMaxScaler() 41 | 42 | # column names are (annoyingly) lost after Scaling 43 | # (i.e. the dataframe is converted to a numpy ndarray) 44 | X_train_transformed = pd.DataFrame(scaler.fit_transform(X_train), 45 | columns=X_train.columns, 46 | index=X_train.index) 47 | 48 | X_train_transformed.head() 49 | ''' 50 | 51 | CODE_SNIPPET_3 = ''' 52 | # OneHotEncoding the categorical features 53 | from sklearn.preprocessing import OneHotEncoder 54 | 55 | # Creating object of OneHotEncoder 56 | encoder = OneHotEncoder(drop='first', sparse=False) 57 | 58 | # column names are (annoyingly) lost after OneHotEncoding 59 | # (i.e. 
with st.expander("Important Note"):
    st.write("""
    Never apply data preparation before train_test_split;
    it can lead to a **Data Leakage** problem.
    **Never make this mistake.**
    """)

--------------------------------------------------------------------------------
/pages/4_Step_3_A.py:
--------------------------------------------------------------------------------
# Data Preparation:
import streamlit as st

st.title("Data Preparation: Data Cleaning and Feature Engineering")

with st.expander("See explanation"):
    st.write("""
    Choosing an appropriate Data Preparation Technique is important.
    If we fail to apply this step properly, the Machine Learning Models won't perform well.
    **Remember: Garbage In, Garbage Out.**
    """)

st.subheader("Data Cleaning Coming Soon. Go to Step 3 B for Feature Engineering.")

--------------------------------------------------------------------------------
/pages/4_Step_3_B.py:
--------------------------------------------------------------------------------
# Data Preparation on Training Data
import streamlit as st

st.title("Feature Engineering: Preparing the Training Data (i.e. X_train)")

with st.expander("See explanation"):
    st.write("""
    Here we cover Feature Engineering for Continuous and Discrete Features.
    """)

CODE_SNIPPET_1 = '''
# Rescaling the Numerical Features using Standardization
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Creating an object of the StandardScaler class
scaler = StandardScaler()

# column names are (annoyingly) lost after Scaling
# (i.e. the dataframe is converted to a numpy ndarray)
X_train_transformed = pd.DataFrame(scaler.fit_transform(X_train),
                                   columns=X_train.columns,
                                   index=X_train.index)

X_train_transformed.head()
'''

CODE_SNIPPET_1_2_TEST = '''
# Reuse the scaler fitted on the train data; never refit on the test data
X_test_transformed = pd.DataFrame(scaler.transform(X_test),
                                  columns=X_test.columns,
                                  index=X_test.index)
'''

CODE_SNIPPET_2 = '''
# Rescaling the Numerical Features using Normalization
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Creating an object of the MinMaxScaler class
scaler = MinMaxScaler()

# column names are (annoyingly) lost after Scaling
# (i.e. the dataframe is converted to a numpy ndarray)
X_train_transformed = pd.DataFrame(scaler.fit_transform(X_train),
                                   columns=X_train.columns,
                                   index=X_train.index)

X_train_transformed.head()
'''

CODE_SNIPPET_3 = '''
# OneHotEncoding the categorical features
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# Creating an object of OneHotEncoder
# (use sparse=False instead of sparse_output=False on scikit-learn < 1.2)
encoder = OneHotEncoder(drop='first', sparse_output=False)

# column names are (annoyingly) lost after OneHotEncoding
# (i.e. the dataframe is converted to a numpy ndarray)
X_train_transformed = pd.DataFrame(encoder.fit_transform(X_train),
                                   columns=encoder.get_feature_names_out(),
                                   index=X_train.index)

X_train_transformed.head()
'''

CODE_SNIPPET_3_TEST = '''
# Reuse the encoder fitted on the train data
X_test_transformed = pd.DataFrame(encoder.transform(X_test),
                                  columns=encoder.get_feature_names_out(),
                                  index=X_test.index)
'''

CODE_SNIPPET_4 = '''
# Manual ordinal mapping (often called Label Encoding in tutorials)
import pandas as pd

# Create an empty dataframe which will contain the transformed data
X_train_transformed = pd.DataFrame(index=X_train.index)

# Define an encoding based on the ranking of each category
cut_encoder = {'Fair' : 1, 'Good' : 2, 'Very Good' : 3, 'Ideal' : 4, 'Premium' : 5}
color_encoder = {'J':1, 'I':2, 'H':3, 'G':4, 'F':5, 'E':6, 'D':7}

# Apply the encoding
X_train_transformed['cut'] = X_train['cut'].apply(lambda x : cut_encoder[x])
X_train_transformed['color'] = X_train['color'].apply(lambda x : color_encoder[x])
'''

CODE_SNIPPET_4_TEST = '''
# Apply the same train-time mappings to the test data
X_test_transformed = pd.DataFrame(index=X_test.index)
X_test_transformed['cut'] = X_test['cut'].apply(lambda x : cut_encoder[x])
X_test_transformed['color'] = X_test['color'].apply(lambda x : color_encoder[x])
'''

CODE_SNIPPET_5 = '''
# Separate Discrete and Continuous Columns
import pandas as pd

X_train_cat = X_train.select_dtypes(include=['object'])
X_train_num = X_train.select_dtypes(include=['int64', 'float64'])

# Apply Standardization/Normalization to X_train_num
# Apply OHE/Label Encoding to X_train_cat

# Concatenate the two transformed parts
X_train_transformed = pd.concat([X_train_num_transformed, X_train_cat_transformed], axis=1)
'''
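# A hedged alternative to the manual split-and-concatenate shown in the
# 'Both' option below: scikit-learn's ColumnTransformer applies a different
# preparation step to each column group in one object. Illustrative only.
CODE_SNIPPET_6 = '''
# Alternative sketch: ColumnTransformer for mixed datatypes
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

num_cols = X_train.select_dtypes(include=['int64', 'float64']).columns
cat_cols = X_train.select_dtypes(include=['object']).columns

preprocessor = ColumnTransformer([
    ('num', StandardScaler(), num_cols),
    ('cat', OneHotEncoder(drop='first'), cat_cols),
])

# Fit on the train data only, then reuse the fitted object on the test data
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)
'''

with st.expander("Optional: ColumnTransformer for mixed datatypes (illustrative)"):
    st.write("""
    A minimal sketch, assuming X_train mixes numeric and object columns:
    `ColumnTransformer` bundles one transformer per column group, so the
    fit-on-train / transform-on-test discipline is enforced by a single object.
    """)
    st.code(CODE_SNIPPET_6, language='python')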
option = st.selectbox('What are the datatypes of the columns in your X_train?',
                      ('Continuous', 'Discrete', 'Both'))

if(option=='Continuous'):
    st.write("""
    For Continuous Predictors, it is very important to rescale them,
    because each predictor might be measured on a different scale.
    Without rescaling, many machine learning and deep learning algorithms
    struggle to learn patterns.
    There are two strategies to rescale Continuous Data:
    1. Standardization
    2. Normalization
    """)
    tab1, tab2 = st.tabs(["Standardization", "Normalization"])
    with tab1:
        st.code(CODE_SNIPPET_1, language='python')
    with tab2:
        st.code(CODE_SNIPPET_2, language='python')
    toggle = st.toggle("Click here to learn how to rescale the X_test Data")
    if toggle:
        st.code(CODE_SNIPPET_1_2_TEST, language='python')
elif(option=='Discrete'):
    st.write("""
    For Discrete Predictors (if non-numerical), it is very important to transform them,
    because most algorithms can't take non-numerical data as input.
    There are two strategies for this transformation:
    1. One Hot Encoding
    2. Label Encoding
    """)
    tab1, tab2 = st.tabs(["One Hot Encoding", "Label Encoding"])
    with tab1:
        st.code(CODE_SNIPPET_3, language='python')
        toggle_1 = st.toggle("Click here to learn how to encode the X_test Data", key='c_1')
        if toggle_1:
            st.code(CODE_SNIPPET_3_TEST, language='python')

    with tab2:
        st.code(CODE_SNIPPET_4, language='python')
        toggle_2 = st.toggle("Click here to learn how to encode the X_test Data", key='c_2')
        if toggle_2:
            st.code(CODE_SNIPPET_4_TEST, language='python')

elif(option=="Both"):
    st.code(CODE_SNIPPET_5, language='python')

--------------------------------------------------------------------------------
/pages/5_Step_4.py:
--------------------------------------------------------------------------------
# Building a Model
import streamlit as st

st.title("Training Phase: Building a Model")

CODE_SNIPPET_1 = '''
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier()
classifier.fit(X_train_transformed, y_train)
'''

CODE_SNIPPET_2 = '''
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier()
classifier.fit(X_train_transformed, y_train)
'''

CODE_SNIPPET_3 = '''
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train_transformed, y_train)
'''

CODE_SNIPPET_4 = '''
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier.fit(X_train_transformed, y_train)
'''

CODE_SNIPPET_5 = '''
from sklearn.svm import SVC
classifier = SVC()
classifier.fit(X_train_transformed, y_train)
'''

CODE_SNIPPET_6 = '''
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier()
classifier.fit(X_train_transformed, y_train)
'''

CODE_SNIPPET_7 = '''
from sklearn.neighbors import KNeighborsRegressor
regressor = KNeighborsRegressor()
regressor.fit(X_train_transformed, y_train)
'''

CODE_SNIPPET_8 = '''
from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor()
regressor.fit(X_train_transformed, y_train)
'''

CODE_SNIPPET_9 = '''
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train_transformed, y_train)
'''

CODE_SNIPPET_10 = '''
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor()
regressor.fit(X_train_transformed, y_train)
'''
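# A hedged aside: before trusting a single train/test score, k-fold
# cross-validation on the training data gives a more stable estimate.
CODE_SNIPPET_CV = '''
# Sketch: 5-fold cross-validation on the training data
from sklearn.model_selection import cross_val_score
scores = cross_val_score(classifier, X_train_transformed, y_train, cv=5)
print(scores.mean(), scores.std())
'''

with st.expander("Optional: cross-validation (illustrative)"):
    st.write("""
    A minimal sketch, assuming `classifier` is any of the estimators above:
    `cross_val_score` fits a fresh copy of the model on each of 5
    train/validation folds and returns one score per fold, which is a more
    stable read on performance than a single split.
    """)
    st.code(CODE_SNIPPET_CV, language='python')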
option = st.selectbox('Select the type of Algorithm (HINT: from Step 1, based on the target variable)',
                      ('Classification', 'Regression'))

if(option=='Classification'):
    tab1, tab2, tab3, tab4, tab5, tab6 = st.tabs(["KNN", "DT", "Naive Bayes",
                                                  "Logistic Regression", "SVC",
                                                  "Random Forest"])
    with tab1:
        st.code(CODE_SNIPPET_1, language='python')
    with tab2:
        st.code(CODE_SNIPPET_2, language='python')
    with tab3:
        st.code(CODE_SNIPPET_3, language='python')
    with tab4:
        st.code(CODE_SNIPPET_4, language='python')
    with tab5:
        st.code(CODE_SNIPPET_5, language='python')
    with tab6:
        st.code(CODE_SNIPPET_6, language='python')
elif(option=='Regression'):
    tab1, tab2, tab3, tab4 = st.tabs(["KNN", "DT",
                                      "Linear Regression",
                                      "Random Forest"])
    with tab1:
        st.code(CODE_SNIPPET_7, language='python')
    with tab2:
        st.code(CODE_SNIPPET_8, language='python')
    with tab3:
        st.code(CODE_SNIPPET_9, language='python')
    with tab4:
        st.code(CODE_SNIPPET_10, language='python')

--------------------------------------------------------------------------------
/pages/6_Step_5.py:
--------------------------------------------------------------------------------
# Data Preparation on Test Data
import streamlit as st

st.title("Data Preparation: Preparing the Test Data (i.e. X_test)")

st.subheader("Explore Step 3 B. This step has already been covered there.")

--------------------------------------------------------------------------------
/pages/7_Step_6_&_7.py:
--------------------------------------------------------------------------------
# Predicting on unseen data
import streamlit as st

st.title("Testing Phase: Prediction and Evaluation")

st.header("Predicting on Unseen Data")

CODE_SNIPPET = '''
# `model` is the fitted classifier/regressor from Step 4
y_test_pred = model.predict(X_test_transformed)
'''

st.code(CODE_SNIPPET, language='python')

st.header("Evaluating the Model")

CODE_SNIPPET_1 = '''
from sklearn import metrics
metrics.accuracy_score(y_test, y_test_pred)
'''

CODE_SNIPPET_2 = '''
from sklearn import metrics
metrics.confusion_matrix(y_test, y_test_pred)
'''

CODE_SNIPPET_3 = '''
from sklearn import metrics
print(metrics.classification_report(y_test, y_test_pred))
'''

CODE_SNIPPET_4 = '''
from sklearn import metrics
metrics.mean_absolute_error(y_test, y_test_pred)
'''

CODE_SNIPPET_5 = '''
from sklearn import metrics
metrics.mean_squared_error(y_test, y_test_pred)
'''

CODE_SNIPPET_6 = '''
import numpy as np
from sklearn import metrics
np.sqrt(metrics.mean_squared_error(y_test, y_test_pred))
'''

CODE_SNIPPET_7 = '''
from sklearn import metrics
metrics.r2_score(y_test, y_test_pred)
'''
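# A hedged aside: classifiers usually expose predicted probabilities as
# well as hard class labels.
CODE_SNIPPET_PROBA = '''
# Sketch (classification only): predicted class probabilities
y_test_proba = model.predict_proba(X_test_transformed)
'''

with st.expander("Optional: predicted probabilities (classification)"):
    st.write("""
    A minimal sketch, assuming `model` is a fitted classifier from Step 4.
    Most scikit-learn classifiers implement `predict_proba`, which returns
    one probability per class instead of a single predicted label
    (SVC needs `probability=True` at construction time).
    """)
    st.code(CODE_SNIPPET_PROBA, language='python')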
option = st.selectbox('Select the type of evaluation metric (HINT: from Step 1, based on the target variable)',
                      ('Classification', 'Regression'))

if(option=='Classification'):
    tab1, tab2, tab3 = st.tabs(["Accuracy", "Confusion Matrix",
                                "Classification Report"])
    with tab1:
        st.code(CODE_SNIPPET_1, language='python')
    with tab2:
        st.code(CODE_SNIPPET_2, language='python')
    with tab3:
        st.code(CODE_SNIPPET_3, language='python')
elif(option=='Regression'):
    tab1, tab2, tab3, tab4 = st.tabs(["MAE", "MSE",
                                      "RMSE",
                                      "R2 Score"])
    with tab1:
        st.code(CODE_SNIPPET_4, language='python')
    with tab2:
        st.code(CODE_SNIPPET_5, language='python')
    with tab3:
        st.code(CODE_SNIPPET_6, language='python')
    with tab4:
        st.code(CODE_SNIPPET_7, language='python')

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
altair==5.1.2
attrs==23.1.0
blinker==1.7.0
cachetools==5.3.2
certifi==2023.11.17
charset-normalizer==3.3.2
click==8.1.7
colorama==0.4.6
gitdb==4.0.11
GitPython==3.1.40
idna==3.4
importlib-metadata==6.8.0
Jinja2==3.1.2
jsonschema==4.20.0
jsonschema-specifications==2023.11.1
markdown-it-py==3.0.0
MarkupSafe==2.1.3
mdurl==0.1.2
numpy==1.26.2
packaging==23.2
pandas==2.1.3
Pillow==10.1.0
protobuf==4.25.1
pyarrow==14.0.1
pydeck==0.8.1b0
Pygments==2.17.2
python-dateutil==2.8.2
pytz==2023.3.post1
referencing==0.31.0
requests==2.31.0
rich==13.7.0
rpds-py==0.13.1
six==1.16.0
smmap==5.0.1
streamlit==1.28.2
tenacity==8.2.3
toml==0.10.2
toolz==0.12.0
tornado==6.3.3
typing_extensions==4.8.0
tzdata==2023.3
tzlocal==5.2
urllib3==2.1.0
validators==0.22.0
watchdog==3.0.0
zipp==3.17.0

--------------------------------------------------------------------------------
/resources/images/ml_pipeline.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bansalkanav/Model-Building-Pipeline-Tutorial/ec1f517cb5cb30aec5655d168084e5fb9a04a9b0/resources/images/ml_pipeline.jpg