├── 365_Data_Science_ML_Algorithms_A_to_Z.apkg ├── Classification_Example ├── ML_Algos_Example_Classification.ipynb ├── requirements.txt └── weatherAUS.csv ├── From_Scratch_Implementation ├── DecisionTree.py ├── GradientBoostedTrees.py ├── HierarchicalClustering.py ├── KMeans.py ├── KNearestNeighbors.py ├── LinearRegression.py ├── LogisticRegression.py ├── NaiveBayes.py ├── NeuralNetwork.py ├── NonNegativeMatrixFactorization.py ├── RandomForest.py ├── Regularization.py └── SupportVectorMachine.py ├── README.md └── Regression_Example ├── regression_example.ipynb ├── requirements.txt └── train.csv /365_Data_Science_ML_Algorithms_A_to_Z.apkg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PlayingNumbers/ML_Algorithms_Course/451d8d1a5fa3e1f3108291f3531db4546768bb10/365_Data_Science_ML_Algorithms_A_to_Z.apkg -------------------------------------------------------------------------------- /Classification_Example/ML_Algos_Example_Classification.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "attachments": {}, 5 | "cell_type": "markdown", 6 | "metadata": {}, 7 | "source": [ 8 | "# ML Algorithms - Classification Example\n", 9 | "\n", 10 | "## Business Problem\n", 11 | "\n", 12 | "For this example, we are trying to predict if it will rain tomorrow based on weather data from Australia. This could be something that would be useful for a weather station or a website to project. \n", 13 | "\n", 14 | "### Import Libraries" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "import pandas as pd \n", 24 | "import numpy as np \n", 25 | "import matplotlib.pyplot as plt \n", 26 | "import seaborn as sns " 27 | ] 28 | }, 29 | { 30 | "attachments": {}, 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "# Load in the data" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 2, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "df = pd.read_csv('weatherAUS.csv')" 44 | ] 45 | }, 46 | { 47 | "attachments": {}, 48 | "cell_type": "markdown", 49 | "metadata": {}, 50 | "source": [ 51 | "# Brief exploratory data analysis" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 3, 57 | "metadata": {}, 58 | "outputs": [ 59 | { 60 | "data": { 61 | "text/plain": [ 62 | "(145460, 23)" 63 | ] 64 | }, 65 | "execution_count": 3, 66 | "metadata": {}, 67 | "output_type": "execute_result" 68 | } 69 | ], 70 | "source": [ 71 | "df.shape" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 4, 77 | "metadata": {}, 78 | "outputs": [ 79 | { 80 | "name": "stdout", 81 | "output_type": "stream", 82 | "text": [ 83 | " Date Location MinTemp MaxTemp Rainfall Evaporation Sunshine \\\n", 84 | "0 2008-12-01 Albury 13.4 22.9 0.6 NaN NaN \n", 85 | "1 2008-12-02 Albury 7.4 25.1 0.0 NaN NaN \n", 86 | "2 2008-12-03 Albury 12.9 25.7 0.0 NaN NaN \n", 87 | "3 2008-12-04 Albury 9.2 28.0 0.0 NaN NaN \n", 88 | "4 2008-12-05 Albury 17.5 32.3 1.0 NaN NaN \n", 89 | "\n", 90 | " WindGustDir WindGustSpeed WindDir9am ... Humidity9am Humidity3pm \\\n", 91 | "0 W 44.0 W ... 71.0 22.0 \n", 92 | "1 WNW 44.0 NNW ... 44.0 25.0 \n", 93 | "2 WSW 46.0 W ... 38.0 30.0 \n", 94 | "3 NE 24.0 SE ... 45.0 16.0 \n", 95 | "4 W 41.0 ENE ... 
82.0 33.0 \n", 96 | "\n", 97 | " Pressure9am Pressure3pm Cloud9am Cloud3pm Temp9am Temp3pm RainToday \\\n", 98 | "0 1007.7 1007.1 8.0 NaN 16.9 21.8 No \n", 99 | "1 1010.6 1007.8 NaN NaN 17.2 24.3 No \n", 100 | "2 1007.6 1008.7 NaN 2.0 21.0 23.2 No \n", 101 | "3 1017.6 1012.8 NaN NaN 18.1 26.5 No \n", 102 | "4 1010.8 1006.0 7.0 8.0 17.8 29.7 No \n", 103 | "\n", 104 | " RainTomorrow \n", 105 | "0 No \n", 106 | "1 No \n", 107 | "2 No \n", 108 | "3 No \n", 109 | "4 No \n", 110 | "\n", 111 | "[5 rows x 23 columns]\n" 112 | ] 113 | } 114 | ], 115 | "source": [ 116 | "# Display the first few rows of the dataset\n", 117 | "print(df.head())\n" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 5, 123 | "metadata": {}, 124 | "outputs": [ 125 | { 126 | "name": "stdout", 127 | "output_type": "stream", 128 | "text": [ 129 | "\n", 130 | "RangeIndex: 145460 entries, 0 to 145459\n", 131 | "Data columns (total 23 columns):\n", 132 | " # Column Non-Null Count Dtype \n", 133 | "--- ------ -------------- ----- \n", 134 | " 0 Date 145460 non-null object \n", 135 | " 1 Location 145460 non-null object \n", 136 | " 2 MinTemp 143975 non-null float64\n", 137 | " 3 MaxTemp 144199 non-null float64\n", 138 | " 4 Rainfall 142199 non-null float64\n", 139 | " 5 Evaporation 82670 non-null float64\n", 140 | " 6 Sunshine 75625 non-null float64\n", 141 | " 7 WindGustDir 135134 non-null object \n", 142 | " 8 WindGustSpeed 135197 non-null float64\n", 143 | " 9 WindDir9am 134894 non-null object \n", 144 | " 10 WindDir3pm 141232 non-null object \n", 145 | " 11 WindSpeed9am 143693 non-null float64\n", 146 | " 12 WindSpeed3pm 142398 non-null float64\n", 147 | " 13 Humidity9am 142806 non-null float64\n", 148 | " 14 Humidity3pm 140953 non-null float64\n", 149 | " 15 Pressure9am 130395 non-null float64\n", 150 | " 16 Pressure3pm 130432 non-null float64\n", 151 | " 17 Cloud9am 89572 non-null float64\n", 152 | " 18 Cloud3pm 86102 non-null float64\n", 153 | " 19 Temp9am 143693 non-null float64\n", 154 | " 20 Temp3pm 141851 non-null float64\n", 155 | " 21 RainToday 142199 non-null object \n", 156 | " 22 RainTomorrow 142193 non-null object \n", 157 | "dtypes: float64(16), object(7)\n", 158 | "memory usage: 25.5+ MB\n", 159 | "None\n" 160 | ] 161 | } 162 | ], 163 | "source": [ 164 | "# Display basic information about the dataset\n", 165 | "print(df.info())\n" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": 6, 171 | "metadata": {}, 172 | "outputs": [ 173 | { 174 | "name": "stdout", 175 | "output_type": "stream", 176 | "text": [ 177 | " MinTemp MaxTemp Rainfall Evaporation \\\n", 178 | "count 143975.000000 144199.000000 142199.000000 82670.000000 \n", 179 | "mean 12.194034 23.221348 2.360918 5.468232 \n", 180 | "std 6.398495 7.119049 8.478060 4.193704 \n", 181 | "min -8.500000 -4.800000 0.000000 0.000000 \n", 182 | "25% 7.600000 17.900000 0.000000 2.600000 \n", 183 | "50% 12.000000 22.600000 0.000000 4.800000 \n", 184 | "75% 16.900000 28.200000 0.800000 7.400000 \n", 185 | "max 33.900000 48.100000 371.000000 145.000000 \n", 186 | "\n", 187 | " Sunshine WindGustSpeed WindSpeed9am WindSpeed3pm \\\n", 188 | "count 75625.000000 135197.000000 143693.000000 142398.000000 \n", 189 | "mean 7.611178 40.035230 14.043426 18.662657 \n", 190 | "std 3.785483 13.607062 8.915375 8.809800 \n", 191 | "min 0.000000 6.000000 0.000000 0.000000 \n", 192 | "25% 4.800000 31.000000 7.000000 13.000000 \n", 193 | "50% 8.400000 39.000000 13.000000 19.000000 \n", 194 | "75% 10.600000 48.000000 19.000000 24.000000 
\n", 195 | "max 14.500000 135.000000 130.000000 87.000000 \n", 196 | "\n", 197 | " Humidity9am Humidity3pm Pressure9am Pressure3pm \\\n", 198 | "count 142806.000000 140953.000000 130395.00000 130432.000000 \n", 199 | "mean 68.880831 51.539116 1017.64994 1015.255889 \n", 200 | "std 19.029164 20.795902 7.10653 7.037414 \n", 201 | "min 0.000000 0.000000 980.50000 977.100000 \n", 202 | "25% 57.000000 37.000000 1012.90000 1010.400000 \n", 203 | "50% 70.000000 52.000000 1017.60000 1015.200000 \n", 204 | "75% 83.000000 66.000000 1022.40000 1020.000000 \n", 205 | "max 100.000000 100.000000 1041.00000 1039.600000 \n", 206 | "\n", 207 | " Cloud9am Cloud3pm Temp9am Temp3pm \n", 208 | "count 89572.000000 86102.000000 143693.000000 141851.00000 \n", 209 | "mean 4.447461 4.509930 16.990631 21.68339 \n", 210 | "std 2.887159 2.720357 6.488753 6.93665 \n", 211 | "min 0.000000 0.000000 -7.200000 -5.40000 \n", 212 | "25% 1.000000 2.000000 12.300000 16.60000 \n", 213 | "50% 5.000000 5.000000 16.700000 21.10000 \n", 214 | "75% 7.000000 7.000000 21.600000 26.40000 \n", 215 | "max 9.000000 9.000000 40.200000 46.70000 \n" 216 | ] 217 | } 218 | ], 219 | "source": [ 220 | "# Display summary statistics for numerical columns\n", 221 | "print(df.describe())\n" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": 7, 227 | "metadata": {}, 228 | "outputs": [ 229 | { 230 | "name": "stdout", 231 | "output_type": "stream", 232 | "text": [ 233 | "Date 0\n", 234 | "Location 0\n", 235 | "MinTemp 1485\n", 236 | "MaxTemp 1261\n", 237 | "Rainfall 3261\n", 238 | "Evaporation 62790\n", 239 | "Sunshine 69835\n", 240 | "WindGustDir 10326\n", 241 | "WindGustSpeed 10263\n", 242 | "WindDir9am 10566\n", 243 | "WindDir3pm 4228\n", 244 | "WindSpeed9am 1767\n", 245 | "WindSpeed3pm 3062\n", 246 | "Humidity9am 2654\n", 247 | "Humidity3pm 4507\n", 248 | "Pressure9am 15065\n", 249 | "Pressure3pm 15028\n", 250 | "Cloud9am 55888\n", 251 | "Cloud3pm 59358\n", 252 | "Temp9am 1767\n", 253 | "Temp3pm 3609\n", 254 | "RainToday 3261\n", 255 | "RainTomorrow 3267\n", 256 | "dtype: int64\n" 257 | ] 258 | } 259 | ], 260 | "source": [ 261 | "# Check for missing values\n", 262 | "print(df.isna().sum())" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": 9, 268 | "metadata": {}, 269 | "outputs": [], 270 | "source": [ 271 | "# create months column\n", 272 | "# figure out location data \n", 273 | "# what to do with imputed data \n", 274 | "# scale data \n", 275 | "#remove outliers \n", 276 | "\n", 277 | "## Naive Bayes \n", 278 | " # Outlier removal \n", 279 | " # Encoding \n", 280 | "## SVM & Logistic regression Logistic Regression\n", 281 | " # Remove Outliers \n", 282 | " # Remove missing values \n", 283 | " # Scaling \n", 284 | " # dummy variables \n", 285 | "## Trees (Decision, RF, XGBoost, GB Tree)\n", 286 | "## KNN\n", 287 | " # Feature scaling\n", 288 | " # Imputation \n", 289 | " # dummy variables \n", 290 | "## ANN \n", 291 | " # scaling \n", 292 | " # get dummies " 293 | ] 294 | }, 295 | { 296 | "attachments": {}, 297 | "cell_type": "markdown", 298 | "metadata": {}, 299 | "source": [ 300 | "# Data Cleaning \n", 301 | "1) Drop Null Values in y variable\n", 302 | "2) Create a category for Month" 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": 10, 308 | "metadata": {}, 309 | "outputs": [], 310 | "source": [ 311 | "#Data Cleaning \n", 312 | "df = df.dropna(subset=['RainTomorrow'])\n", 313 | "df['month'] = df.Date.apply(lambda x: pd.to_datetime(x).month).astype('category')" 314 
| ] 315 | }, 316 | { 317 | "attachments": {}, 318 | "cell_type": "markdown", 319 | "metadata": {}, 320 | "source": [ 321 | "# Create model baseline\n", 322 | "\n", 323 | "The dataset is slightly imbalanced. We see that if we project it not to rain every time, our model will be right around 78% of the time. We want our model to perform at least to perform better than this 78% threshold. " 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": 11, 329 | "metadata": {}, 330 | "outputs": [ 331 | { 332 | "data": { 333 | "text/plain": [ 334 | "No 0.775819\n", 335 | "Yes 0.224181\n", 336 | "Name: RainTomorrow, dtype: float64" 337 | ] 338 | }, 339 | "execution_count": 11, 340 | "metadata": {}, 341 | "output_type": "execute_result" 342 | } 343 | ], 344 | "source": [ 345 | "df['RainTomorrow'].value_counts()/ df.shape[0]" 346 | ] 347 | }, 348 | { 349 | "attachments": {}, 350 | "cell_type": "markdown", 351 | "metadata": {}, 352 | "source": [ 353 | "# Additional Data Preprocessing\n", 354 | "1) Remove outliers\n", 355 | "2) Create train test split\n", 356 | "3) Create columns for continuous and categorical varaiables" 357 | ] 358 | }, 359 | { 360 | "cell_type": "code", 361 | "execution_count": 12, 362 | "metadata": {}, 363 | "outputs": [], 364 | "source": [ 365 | "from sklearn.compose import ColumnTransformer\n", 366 | "from sklearn.pipeline import Pipeline\n", 367 | "from sklearn.impute import SimpleImputer\n", 368 | "from sklearn.preprocessing import StandardScaler, OneHotEncoder\n", 369 | "from scipy import stats\n", 370 | "from sklearn.base import BaseEstimator, TransformerMixin\n", 371 | "import scipy \n", 372 | "#adjust for X & Y \n", 373 | "\n", 374 | "def z_score_removal(X,y, columns, z_score):\n", 375 | " df = pd.concat([X, y], axis=1)\n", 376 | " col_df = df[columns] \n", 377 | " z_scores = scipy.stats.zscore(col_df).abs()\n", 378 | " outliers = (z_scores.max(axis=1) > z_score)\n", 379 | " df_out = df[~outliers]\n", 380 | " X_cleaned = df_out[X.columns]\n", 381 | " y_cleaned = df_out.drop(X.columns, axis =1)\n", 382 | " return X_cleaned, y_cleaned\n" 383 | ] 384 | }, 385 | { 386 | "cell_type": "code", 387 | "execution_count": 13, 388 | "metadata": {}, 389 | "outputs": [], 390 | "source": [ 391 | "#train test split \n", 392 | "from sklearn.model_selection import train_test_split \n", 393 | "X = df.drop(['RainTomorrow', 'Date'], axis=1)\n", 394 | "y = df.loc[:,'RainTomorrow'].map({'Yes': 1, 'No': 0}).astype('category')\n", 395 | "\n", 396 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)" 397 | ] 398 | }, 399 | { 400 | "cell_type": "code", 401 | "execution_count": 14, 402 | "metadata": {}, 403 | "outputs": [], 404 | "source": [ 405 | "# Identify continuous and categorical columns\n", 406 | "continuous_columns = [col for col in X_train.columns if X_train[col].dtype == 'float64' or X_train[col].dtype == 'int64']\n", 407 | "categorical_columns = [col for col in X_train.columns if X_train[col].dtype == 'object']" 408 | ] 409 | }, 410 | { 411 | "cell_type": "code", 412 | "execution_count": 15, 413 | "metadata": {}, 414 | "outputs": [], 415 | "source": [ 416 | "# pipeline for Naive Bayes - We need to impute continuous columns and encode categorical variables.\n", 417 | "nb_preprocessor = ColumnTransformer(transformers=[\n", 418 | " ('num', Pipeline([\n", 419 | " ('imputer', SimpleImputer(strategy='median')),\n", 420 | " ]), continuous_columns),\n", 421 | " ('cat', Pipeline([\n", 422 | " ('encoder', OneHotEncoder(handle_unknown='ignore', 
sparse_output=False))\n", 423 | " ]), categorical_columns)\n", 424 | "],remainder = 'passthrough', verbose_feature_names_out= False).set_output(transform='pandas')\n", 425 | "\n", 426 | "\n", 427 | "# Fit and transform the data\n", 428 | "data_transformed = nb_preprocessor.fit_transform(X_train)\n", 429 | "nb_X_train, nb_y_train = z_score_removal(data_transformed, y_train, continuous_columns, 3)" 430 | ] 431 | }, 432 | { 433 | "attachments": {}, 434 | "cell_type": "markdown", 435 | "metadata": {}, 436 | "source": [ 437 | "## Hyperparameter Tuning - Naive Bayes\n", 438 | "\n", 439 | "Naive Bayes is a family of simple probabilistic classifiers based on applying Bayes' theorem with the \"naive\" assumption of conditional independence between every pair of features given the class. These classifiers are particularly useful for text classification and other high-dimensional problems. There are different Naive Bayes classifiers available in scikit-learn, such as GaussianNB, MultinomialNB, and BernoulliNB.\n", 440 | "\n", 441 | "Relevant Parameters:\n", 442 | "\n", 443 | "### GaussianNB\n", 444 | "- **var_smoothing**: Portion of the largest variance of all features that is added to variances for calculation stability. It's used to smooth the likelihood estimates and avoid zero probabilities, which can lead to better generalization performance.\n", 445 | "\n", 446 | "### MultinomialNB\n", 447 | "- **alpha**: Additive (Laplace/Lidstone) smoothing parameter. It's used to control the trade-off between fitting the data and smoothing the probabilities, which helps prevent overfitting.\n", 448 | "- **fit_prior**: Whether to learn class prior probabilities or not. If false, a uniform prior will be used. Learning the prior can help improve the classification performance in cases where the class distribution is imbalanced.\n", 449 | "\n", 450 | "### BernoulliNB\n", 451 | "- **alpha**: Additive (Laplace/Lidstone) smoothing parameter. It's used to control the trade-off between fitting the data and smoothing the probabilities, which helps prevent overfitting.\n", 452 | "- **binarize**: Threshold for binarizing (mapping to booleans) of sample features. If None, input is presumed to already consist of binary vectors.\n", 453 | "- **fit_prior**: Whether to learn class prior probabilities or not. If false, a uniform prior will be used. Learning the prior can help improve the classification performance in cases where the class distribution is imbalanced.\n", 454 | "\n", 455 | "By tuning these parameters, you can find the best combination for your specific problem and achieve a better balance between model complexity and generalization performance." 456 | ] 457 | }, 458 | { 459 | "cell_type": "code", 460 | "execution_count": 16, 461 | "metadata": {}, 462 | "outputs": [ 463 | { 464 | "name": "stderr", 465 | "output_type": "stream", 466 | "text": [ 467 | "/home/kjee/.local/lib/python3.9/site-packages/sklearn/utils/validation.py:1143: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n", 468 | " y = column_or_1d(y, warn=True)\n", 469 | "/home/kjee/.local/lib/python3.9/site-packages/sklearn/utils/validation.py:1143: DataConversionWarning: A column-vector y was passed when a 1d array was expected. 
Please change the shape of y to (n_samples, ), for example using ravel().\n", 470 | " y = column_or_1d(y, warn=True)\n", 471 | "/home/kjee/.local/lib/python3.9/site-packages/sklearn/utils/validation.py:1143: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n", 472 | " y = column_or_1d(y, warn=True)\n", 473 | "/home/kjee/.local/lib/python3.9/site-packages/sklearn/utils/validation.py:1143: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n", 474 | " y = column_or_1d(y, warn=True)\n", 475 | "/home/kjee/.local/lib/python3.9/site-packages/sklearn/utils/validation.py:1143: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n", 476 | " y = column_or_1d(y, warn=True)\n", 477 | "/home/kjee/.local/lib/python3.9/site-packages/sklearn/utils/validation.py:1143: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n", 478 | " y = column_or_1d(y, warn=True)\n" 479 | ] 480 | }, 481 | { 482 | "name": "stdout", 483 | "output_type": "stream", 484 | "text": [ 485 | "[0.65895062 0.67078189 0.66708754 0.66194351 0.65764123]\n", 486 | "Test score (accuracy): 0.6639824167601945\n" 487 | ] 488 | } 489 | ], 490 | "source": [ 491 | "#Naive Bayes Code \n", 492 | "from sklearn.naive_bayes import GaussianNB\n", 493 | "from sklearn.model_selection import cross_val_score, GridSearchCV\n", 494 | "\n", 495 | "# Naive Bayes Classifier\n", 496 | "nb_model = GaussianNB()\n", 497 | "nb_scores = cross_val_score(nb_model, nb_X_train, nb_y_train, cv=5)\n", 498 | "print(nb_scores)\n", 499 | "\n", 500 | "# No hyperparameters to tune for GaussianNB\n", 501 | "nb_model.fit(nb_X_train, nb_y_train)\n", 502 | "test_score_nb = nb_model.score(nb_X_train, nb_y_train)\n", 503 | "\n", 504 | "print(f\"Test score (accuracy): {test_score_nb}\")" 505 | ] 506 | }, 507 | { 508 | "cell_type": "code", 509 | "execution_count": 17, 510 | "metadata": {}, 511 | "outputs": [], 512 | "source": [ 513 | "# pipeline for SVM & Logistic regression classifiers \n", 514 | "lr_preprocessor = ColumnTransformer(transformers=[\n", 515 | " ('num', Pipeline([\n", 516 | " ('imputer', SimpleImputer(strategy='median')),\n", 517 | " ('scaler', StandardScaler())\n", 518 | " ]), continuous_columns),\n", 519 | " ('cat', Pipeline([\n", 520 | " ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),\n", 521 | " ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))\n", 522 | " ]), categorical_columns)\n", 523 | "],remainder = 'passthrough', verbose_feature_names_out= False).set_output(transform='pandas')\n", 524 | "\n", 525 | "lr_transformed = lr_preprocessor.fit_transform(X_train)\n", 526 | "lr_X_train, lr_y_train = z_score_removal(lr_transformed, y_train, continuous_columns, 3)" 527 | ] 528 | }, 529 | { 530 | "attachments": {}, 531 | "cell_type": "markdown", 532 | "metadata": {}, 533 | "source": [ 534 | "## Hyperparameter Tuning - Logistic Regression\n", 535 | "\n", 536 | "Logistic Regression is a linear model for classification that uses the logistic function to model the probability of a binary outcome. It's a simple yet powerful technique for solving binary and multi-class classification problems. 
In scikit-learn, the `LogisticRegression` class provides an implementation of logistic regression.\n", 537 | "\n", 538 | "Relevant Parameters:\n", 539 | "- **penalty**: The type of regularization applied to the model. Options include 'l1', 'l2', 'elasticnet', and 'none'. Regularization is used to control the trade-off between fitting the data and keeping the weights small, which helps prevent overfitting.\n", 540 | "- **C**: Inverse of regularization strength (i.e., 1/lambda). Smaller values specify stronger regularization. It's used to control the amount of regularization applied to the model, which can impact the model's ability to generalize to unseen data.\n", 541 | "- **fit_intercept**: Whether to include an intercept term in the model. If false, the data is assumed to be already centered. Including an intercept can improve the fit of the model, especially if the data is not centered.\n", 542 | "- **solver**: The algorithm used for optimization. Choices are 'newton-cg', 'lbfgs', 'liblinear', 'sag', and 'saga'. Each solver has its own benefits and drawbacks, so it's essential to choose the one that best suits your problem and dataset.\n", 543 | "- **max_iter**: Maximum number of iterations for the solver to converge. Increasing this value allows the model more time to converge but may increase the computation time.\n", 544 | "- **multi_class**: Strategy for multi-class problems. Options are 'auto', 'ovr' (one-vs-rest), and 'multinomial'. 'auto' will choose the best strategy based on the data and solver. For multi-class problems, the choice of strategy can impact the classification performance.\n", 545 | "\n", 546 | "By tuning these parameters, you can find the best combination for your specific problem and achieve a better balance between model complexity and generalization performance." 547 | ] 548 | }, 549 | { 550 | "cell_type": "code", 551 | "execution_count": 18, 552 | "metadata": {}, 553 | "outputs": [ 554 | { 555 | "name": "stderr", 556 | "output_type": "stream", 557 | "text": [ 558 | "/home/kjee/.local/lib/python3.9/site-packages/sklearn/utils/validation.py:1143: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n", 559 | " y = column_or_1d(y, warn=True)\n", 560 | "/home/kjee/.local/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):\n", 561 | "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n", 562 | "\n", 563 | "Increase the number of iterations (max_iter) or scale the data as shown in:\n", 564 | " https://scikit-learn.org/stable/modules/preprocessing.html\n", 565 | "Please also refer to the documentation for alternative solver options:\n", 566 | " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", 567 | " n_iter_i = _check_optimize_result(\n", 568 | "/home/kjee/.local/lib/python3.9/site-packages/sklearn/utils/validation.py:1143: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n", 569 | " y = column_or_1d(y, warn=True)\n", 570 | "/home/kjee/.local/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):\n", 571 | "STOP: TOTAL NO. 
of ITERATIONS REACHED LIMIT.\n", 572 | "\n", 573 | "Increase the number of iterations (max_iter) or scale the data as shown in:\n", 574 | " https://scikit-learn.org/stable/modules/preprocessing.html\n", 575 | "Please also refer to the documentation for alternative solver options:\n", 576 | " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", 577 | " n_iter_i = _check_optimize_result(\n", 578 | "/home/kjee/.local/lib/python3.9/site-packages/sklearn/utils/validation.py:1143: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n", 579 | " y = column_or_1d(y, warn=True)\n", 580 | "/home/kjee/.local/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):\n", 581 | "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n", 582 | "\n", 583 | "Increase the number of iterations (max_iter) or scale the data as shown in:\n", 584 | " https://scikit-learn.org/stable/modules/preprocessing.html\n", 585 | "Please also refer to the documentation for alternative solver options:\n", 586 | " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", 587 | " n_iter_i = _check_optimize_result(\n", 588 | "/home/kjee/.local/lib/python3.9/site-packages/sklearn/utils/validation.py:1143: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n", 589 | " y = column_or_1d(y, warn=True)\n", 590 | "/home/kjee/.local/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):\n", 591 | "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n", 592 | "\n", 593 | "Increase the number of iterations (max_iter) or scale the data as shown in:\n", 594 | " https://scikit-learn.org/stable/modules/preprocessing.html\n", 595 | "Please also refer to the documentation for alternative solver options:\n", 596 | " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", 597 | " n_iter_i = _check_optimize_result(\n", 598 | "/home/kjee/.local/lib/python3.9/site-packages/sklearn/utils/validation.py:1143: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n", 599 | " y = column_or_1d(y, warn=True)\n" 600 | ] 601 | }, 602 | { 603 | "name": "stdout", 604 | "output_type": "stream", 605 | "text": [ 606 | "[0.85077628 0.8516648 0.85231949 0.8526936 0.8481575 ]\n" 607 | ] 608 | }, 609 | { 610 | "name": "stderr", 611 | "output_type": "stream", 612 | "text": [ 613 | "/home/kjee/.local/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):\n", 614 | "STOP: TOTAL NO. 
of ITERATIONS REACHED LIMIT.\n", 615 | "\n", 616 | "Increase the number of iterations (max_iter) or scale the data as shown in:\n", 617 | " https://scikit-learn.org/stable/modules/preprocessing.html\n", 618 | "Please also refer to the documentation for alternative solver options:\n", 619 | " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", 620 | " n_iter_i = _check_optimize_result(\n" 621 | ] 622 | }, 623 | { 624 | "data": { 625 | "text/plain": [ 626 | "'\\n# Parameter grid for GridSearchCV\\nparam_grid = {\\n \\'penalty\\': [\\'l1\\', \\'l2\\', \\'elasticnet\\', \\'none\\'],\\n \\'C\\': [0.1, 1, 10],\\n \\'solver\\': [\\'newton-cg\\', \\'lbfgs\\', \\'liblinear\\', \\'sag\\', \\'saga\\'],\\n \\'max_iter\\': [100, 500, 1000]\\n}\\n\\ngrid_search_lr = GridSearchCV(lr_model, param_grid, scoring=\\'accuracy\\', cv=5, n_jobs=-1, verbose=1)\\ngrid_search_lr.fit(lr_X_train, lr_y_train)\\ntest_score_lr = grid_search_lr.best_estimator_.score(lr_X_train, lr_y_train)\\n\\nprint(f\"Best penalty value: {grid_search_lr.best_params_[\\'penalty\\']}\")\\nprint(f\"Best C value: {grid_search_lr.best_params_[\\'C\\']}\")\\nprint(f\"Best solver value: {grid_search_lr.best_params_[\\'solver\\']}\")\\nprint(f\"Best max_iter value: {grid_search_lr.best_params_[\\'max_iter\\']}\")\\nprint(f\"Best accuracy: {grid_search_lr.best_score_}\")\\n'" 627 | ] 628 | }, 629 | "execution_count": 18, 630 | "metadata": {}, 631 | "output_type": "execute_result" 632 | } 633 | ], 634 | "source": [ 635 | "#Logistic Regression & SVM Code\n", 636 | "\n", 637 | "from sklearn.linear_model import LogisticRegression\n", 638 | "from sklearn.model_selection import cross_val_score, GridSearchCV\n", 639 | "\n", 640 | "# Logistic Regression with default parameters\n", 641 | "lr_model = LogisticRegression()\n", 642 | "lr_scores = cross_val_score(lr_model, lr_X_train, lr_y_train, cv=5, scoring='accuracy')\n", 643 | "print(lr_scores)\n", 644 | "\"\"\"\n", 645 | "# Parameter grid for GridSearchCV\n", 646 | "param_grid = {\n", 647 | " 'penalty': ['l1', 'l2', 'elasticnet', 'none'],\n", 648 | " 'C': [0.1, 1, 10],\n", 649 | " 'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],\n", 650 | " 'max_iter': [100, 500, 1000]\n", 651 | "}\n", 652 | "\n", 653 | "grid_search_lr = GridSearchCV(lr_model, param_grid, scoring='accuracy', cv=5, n_jobs=-1, verbose=1)\n", 654 | "grid_search_lr.fit(lr_X_train, lr_y_train)\n", 655 | "test_score_lr = grid_search_lr.best_estimator_.score(lr_X_train, lr_y_train)\n", 656 | "\n", 657 | "print(f\"Best penalty value: {grid_search_lr.best_params_['penalty']}\")\n", 658 | "print(f\"Best C value: {grid_search_lr.best_params_['C']}\")\n", 659 | "print(f\"Best solver value: {grid_search_lr.best_params_['solver']}\")\n", 660 | "print(f\"Best max_iter value: {grid_search_lr.best_params_['max_iter']}\")\n", 661 | "print(f\"Best accuracy: {grid_search_lr.best_score_}\")\n", 662 | "\"\"\"" 663 | ] 664 | }, 665 | { 666 | "attachments": {}, 667 | "cell_type": "markdown", 668 | "metadata": {}, 669 | "source": [ 670 | "## Hyperparameter Tuning - SVM Classification\n", 671 | "\n", 672 | "Support Vector Machine (SVM) Classification is a versatile machine learning algorithm that can be used for both linear and non-linear classification tasks. It aims to find the best-fitting hyperplane that has the largest distance (margin) between the support vectors and the hyperplane.\n", 673 | "\n", 674 | "Relevant Parameters:\n", 675 | "- **kernel**: Specifies the kernel function to be used in the algorithm. 
Possible options are 'linear', 'poly', 'rbf', 'sigmoid', and 'precomputed'. The choice of the kernel function depends on the nature of the data and the problem to be solved.\n", 676 | "- **C**: Regularization parameter (also called the cost parameter); must be a positive float. It determines the trade-off between achieving a low training error and a low testing error. In other words, it controls the balance between overfitting and underfitting. A smaller value of C creates a wider margin, which may result in more training errors but better generalization to the test data. A larger value of C creates a narrower margin, which may result in fewer training errors but poorer generalization to the test data.\n", 677 | "- **degree**: The degree of the polynomial kernel function ('poly'). Ignored by all other kernels. It is the degree of the polynomial used for the 'poly' kernel and determines the flexibility of the model.\n", 678 | "- **gamma**: Kernel coefficient for 'rbf', 'poly', and 'sigmoid'. If gamma is 'scale' (default), then it is calculated as 1 / (n_features * X.var()) for the input data X. If gamma is 'auto', then it is calculated as 1/n_features. A smaller gamma value will produce a more flexible model, while a larger gamma value will produce a more rigid model.\n", 679 | "- **coef0**: Independent term in the kernel function. It is only significant in 'poly' and 'sigmoid'. It controls the influence of higher degree terms in the polynomial and sigmoid kernels.\n", 680 | "- **shrinking**: Whether to use the shrinking heuristic. The shrinking heuristic is a technique used to speed up training by removing some of the support vectors that are not necessary for the final solution. True by default.\n", 681 | "\n", 682 | "By tuning these parameters, you can find the best combination for your specific classification problem and achieve a better balance between model complexity and generalization performance." 683 | ] 684 | }, 685 | { 686 | "cell_type": "code", 687 | "execution_count": 19, 688 | "metadata": {}, 689 | "outputs": [ 690 | { 691 | "name": "stderr", 692 | "output_type": "stream", 693 | "text": [ 694 | "/home/kjee/.local/lib/python3.9/site-packages/sklearn/utils/validation.py:1143: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n", 695 | " y = column_or_1d(y, warn=True)\n", 696 | "/home/kjee/.local/lib/python3.9/site-packages/sklearn/utils/validation.py:1143: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n", 697 | " y = column_or_1d(y, warn=True)\n", 698 | "/home/kjee/.local/lib/python3.9/site-packages/sklearn/utils/validation.py:1143: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n", 699 | " y = column_or_1d(y, warn=True)\n", 700 | "/home/kjee/.local/lib/python3.9/site-packages/sklearn/utils/validation.py:1143: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n", 701 | " y = column_or_1d(y, warn=True)\n", 702 | "/home/kjee/.local/lib/python3.9/site-packages/sklearn/utils/validation.py:1143: DataConversionWarning: A column-vector y was passed when a 1d array was expected. 
Please change the shape of y to (n_samples, ), for example using ravel().\n", 703 | " y = column_or_1d(y, warn=True)\n" 704 | ] 705 | }, 706 | { 707 | "name": "stdout", 708 | "output_type": "stream", 709 | "text": [ 710 | "[0.85760382 0.85891321 0.85793116 0.85895997 0.85582679]\n" 711 | ] 712 | }, 713 | { 714 | "data": { 715 | "text/plain": [ 716 | "'\\n# Parameter grid for GridSearchCV\\nparam_grid = {\\n \\'kernel\\': [\\'linear\\', \\'rbf\\'],\\n \\'C\\': [0.1, 1, 10],\\n \\'gamma\\': [\\'scale\\', \\'auto\\', 0.1, 1, 10]\\n}\\n\\ngrid_search_svm = GridSearchCV(svm_model, param_grid, scoring=\\'accuracy\\', cv=5, n_jobs=-1, verbose=1)\\ngrid_search_svm.fit(lr_X_train, lr_y_train)\\ntest_score_svm = grid_search_svm.best_estimator_.score(lr_X_train, lr_y_train)\\n\\nprint(f\"Best kernel value: {grid_search_svm.best_params_[\\'kernel\\']}\")\\nprint(f\"Best C value: {grid_search_svm.best_params_[\\'C\\']}\")\\nprint(f\"Best gamma value: {grid_search_svm.best_params_[\\'gamma\\']}\")\\nprint(f\"Best accuracy: {grid_search_svm.best_score_}\")\\n'" 717 | ] 718 | }, 719 | "execution_count": 19, 720 | "metadata": {}, 721 | "output_type": "execute_result" 722 | } 723 | ], 724 | "source": [ 725 | "from sklearn.svm import SVC\n", 726 | "from sklearn.model_selection import cross_val_score, GridSearchCV\n", 727 | "\n", 728 | "# SVM with default parameters\n", 729 | "svm_model = SVC()\n", 730 | "svm_scores = cross_val_score(svm_model, lr_X_train, lr_y_train, cv=5, scoring='accuracy')\n", 731 | "print(svm_scores)\n", 732 | "\n", 733 | "\"\"\"\n", 734 | "# Parameter grid for GridSearchCV\n", 735 | "param_grid = {\n", 736 | " 'kernel': ['linear', 'rbf'],\n", 737 | " 'C': [0.1, 1, 10],\n", 738 | " 'gamma': ['scale', 'auto', 0.1, 1, 10]\n", 739 | "}\n", 740 | "\n", 741 | "grid_search_svm = GridSearchCV(svm_model, param_grid, scoring='accuracy', cv=5, n_jobs=-1, verbose=1)\n", 742 | "grid_search_svm.fit(lr_X_train, lr_y_train)\n", 743 | "test_score_svm = grid_search_svm.best_estimator_.score(lr_X_train, lr_y_train)\n", 744 | "\n", 745 | "print(f\"Best kernel value: {grid_search_svm.best_params_['kernel']}\")\n", 746 | "print(f\"Best C value: {grid_search_svm.best_params_['C']}\")\n", 747 | "print(f\"Best gamma value: {grid_search_svm.best_params_['gamma']}\")\n", 748 | "print(f\"Best accuracy: {grid_search_svm.best_score_}\")\n", 749 | "\"\"\"" 750 | ] 751 | }, 752 | { 753 | "cell_type": "code", 754 | "execution_count": 20, 755 | "metadata": {}, 756 | "outputs": [], 757 | "source": [ 758 | "# pipeline for Trees (Decision, RF, XGBoost, GB Tree) - Reuse nb without outlier removval\n", 759 | "tree_X_train = nb_preprocessor.fit_transform(X_train)\n" 760 | ] 761 | }, 762 | { 763 | "attachments": {}, 764 | "cell_type": "markdown", 765 | "metadata": {}, 766 | "source": [ 767 | "## Hyperparameter Tuning - Decision Tree\n", 768 | "\n", 769 | "Decision Trees are a popular machine learning algorithm used for both regression and classification tasks. They are easy to interpret and can naturally handle a mixture of continuous and categorical variables.\n", 770 | "\n", 771 | "Relevant Parameters:\n", 772 | "- **criterion**: The function to measure the quality of a split. Supported criteria for regression are 'mse' (mean squared error) and 'friedman_mse' (improvement in mean squared error). For classification, supported criteria are 'gini' and 'entropy'.\n", 773 | "- **splitter**: The strategy used to choose the split at each node. 
Supported strategies are 'best' to choose the best split and 'random' to choose the best random split.\n", 774 | "- **max_depth**: The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples. Controlling the depth can help prevent overfitting.\n", 775 | "- **min_samples_split**: The minimum number of samples required to split an internal node. A larger value prevents the tree from growing too deep, thus preventing overfitting.\n", 776 | "- **min_samples_leaf**: The minimum number of samples required to be at a leaf node. A larger value prevents the tree from growing too deep, thus preventing overfitting.\n", 777 | "- **min_weight_fraction_leaf**: The minimum weighted fraction of the sum total of weights required to be at a leaf node. Samples have equal weight when sample_weight is not provided.\n", 778 | "- **max_features**: The number of features to consider when looking for the best split. If None, then max_features=n_features.\n", 779 | "- **max_leaf_nodes**: Grow a tree with max_leaf_nodes in best-first fashion. Best nodes are defined as relative reduction in impurity. If None, then unlimited number of leaf nodes.\n", 780 | "- **min_impurity_decrease**: A node will be split if this split induces a decrease of the impurity greater than or equal to this value.\n", 781 | "- **min_impurity_split**: Threshold for early stopping in tree growth. A node will split if its impurity is above the threshold, otherwise it is a leaf.\n", 782 | "\n", 783 | "By tuning these parameters, you can find the best combination for your specific problem and achieve a better balance between model complexity and generalization performance.\n" 784 | ] 785 | }, 786 | { 787 | "cell_type": "code", 788 | "execution_count": 21, 789 | "metadata": {}, 790 | "outputs": [ 791 | { 792 | "name": "stdout", 793 | "output_type": "stream", 794 | "text": [ 795 | "[0.792141 0.7920531 0.78594348 0.793108 0.79415385]\n" 796 | ] 797 | }, 798 | { 799 | "data": { 800 | "text/plain": [ 801 | "'\\n# Expanded parameter grid for GridSearchCV\\nparam_grid = {\\n \\'criterion\\': [\\'gini\\', \\'entropy\\'],\\n \\'splitter\\': [\\'best\\', \\'random\\'],\\n \\'max_depth\\': [None, 5, 10, 15, 20, 25, 30, 35, 40],\\n \\'min_samples_split\\': [2, 5, 10, 15, 20],\\n \\'min_samples_leaf\\': [1, 2, 4, 6, 8, 10],\\n \\'min_weight_fraction_leaf\\': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5],\\n \\'max_features\\': [None, \\'sqrt\\', \\'log2\\'],\\n \\'max_leaf_nodes\\': [None, 10, 20, 30, 40, 50],\\n \\'min_impurity_decrease\\': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5],\\n \\'class_weight\\': [None, \\'balanced\\']\\n}\\n\\ngrid_search_dt = GridSearchCV(dt_model, param_grid, scoring=\\'accuracy\\', cv=5, n_jobs=-1, verbose=1)\\ngrid_search_dt.fit(tree_X_train, y_train)\\ntest_score_dt = grid_search_dt.best_estimator_.score(tree_X_train, y_train)\\n\\nprint(\"Best hyperparameters found:\")\\nfor key, value in grid_search_dt.best_params_.items():\\n print(f\"{key}: {value}\")\\n\\nprint(f\"Best accuracy: {grid_search_dt.best_score_}\")\\n\\n'" 802 | ] 803 | }, 804 | "execution_count": 21, 805 | "metadata": {}, 806 | "output_type": "execute_result" 807 | } 808 | ], 809 | "source": [ 810 | "# Trees (Decision, RF, XGBoost, GB Tree) Code\n", 811 | "from sklearn.tree import DecisionTreeClassifier\n", 812 | "from sklearn.model_selection import cross_val_score, GridSearchCV\n", 813 | "\n", 814 | "# Decision Tree with default parameters\n", 815 | "dt_model = DecisionTreeClassifier()\n", 
816 | "dt_scores = cross_val_score(dt_model, tree_X_train, y_train, cv=5, scoring='accuracy')\n", 817 | "print(dt_scores)\n", 818 | "\n", 819 | "\"\"\"\n", 820 | "# Expanded parameter grid for GridSearchCV\n", 821 | "param_grid = {\n", 822 | " 'criterion': ['gini', 'entropy'],\n", 823 | " 'splitter': ['best', 'random'],\n", 824 | " 'max_depth': [None, 5, 10, 15, 20, 25, 30, 35, 40],\n", 825 | " 'min_samples_split': [2, 5, 10, 15, 20],\n", 826 | " 'min_samples_leaf': [1, 2, 4, 6, 8, 10],\n", 827 | " 'min_weight_fraction_leaf': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5],\n", 828 | " 'max_features': [None, 'sqrt', 'log2'],\n", 829 | " 'max_leaf_nodes': [None, 10, 20, 30, 40, 50],\n", 830 | " 'min_impurity_decrease': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5],\n", 831 | " 'class_weight': [None, 'balanced']\n", 832 | "}\n", 833 | "\n", 834 | "grid_search_dt = GridSearchCV(dt_model, param_grid, scoring='accuracy', cv=5, n_jobs=-1, verbose=1)\n", 835 | "grid_search_dt.fit(tree_X_train, y_train)\n", 836 | "test_score_dt = grid_search_dt.best_estimator_.score(tree_X_train, y_train)\n", 837 | "\n", 838 | "print(\"Best hyperparameters found:\")\n", 839 | "for key, value in grid_search_dt.best_params_.items():\n", 840 | " print(f\"{key}: {value}\")\n", 841 | "\n", 842 | "print(f\"Best accuracy: {grid_search_dt.best_score_}\")\n", 843 | "\n", 844 | "\"\"\"" 845 | ] 846 | }, 847 | { 848 | "attachments": {}, 849 | "cell_type": "markdown", 850 | "metadata": {}, 851 | "source": [ 852 | "## Hyperparameter Tuning - Random Forest\n", 853 | "\n", 854 | "Random Forest is an ensemble learning method that constructs a multitude of decision trees at training time and outputs the mode of the classes (classification) or mean prediction (regression) of the individual trees. It is highly flexible and can handle a wide variety of tasks.\n", 855 | "\n", 856 | "Relevant Parameters:\n", 857 | "- **n_estimators**: The number of trees in the forest. Increasing the number of trees can improve the model's performance, but may also increase the computation time.\n", 858 | "- **criterion**: The function to measure the quality of a split. Supported criteria for regression are 'mse' (mean squared error) and 'mae' (mean absolute error). For classification, supported criteria are 'gini' and 'entropy'.\n", 859 | "- **max_depth**: The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples. Controlling the depth can help prevent overfitting.\n", 860 | "- **min_samples_split**: The minimum number of samples required to split an internal node. A larger value prevents the tree from growing too deep, thus preventing overfitting.\n", 861 | "- **min_samples_leaf**: The minimum number of samples required to be at a leaf node. A larger value prevents the tree from growing too deep, thus preventing overfitting.\n", 862 | "- **min_weight_fraction_leaf**: The minimum weighted fraction of the sum total of weights required to be at a leaf node. Samples have equal weight when sample_weight is not provided.\n", 863 | "- **max_features**: The number of features to consider when looking for the best split. If None, then max_features=n_features. It can also be a float, int, or string ('auto', 'sqrt', or 'log2').\n", 864 | "- **max_leaf_nodes**: Grow a tree with max_leaf_nodes in best-first fashion. Best nodes are defined as relative reduction in impurity. 
If None, then unlimited number of leaf nodes.\n", 865 | "- **min_impurity_decrease**: A node will be split if this split induces a decrease of the impurity greater than or equal to this value.\n", 866 | "- **bootstrap**: Whether bootstrap samples are used when building trees. If False, the whole dataset is used to build each tree.\n", 867 | "- **oob_score**: Whether to use out-of-bag samples to estimate the generalization accuracy.\n", 868 | "\n", 869 | "By tuning these parameters, you can find the best combination for your specific problem and achieve a better balance between model complexity and generalization performance." 870 | ] 871 | }, 872 | { 873 | "cell_type": "code", 874 | "execution_count": 22, 875 | "metadata": {}, 876 | "outputs": [ 877 | { 878 | "name": "stdout", 879 | "output_type": "stream", 880 | "text": [ 881 | "[0.85736891 0.85776449 0.85530306 0.85859962 0.85327473]\n" 882 | ] 883 | }, 884 | { 885 | "data": { 886 | "text/plain": [ 887 | "'\\n# Expanded parameter grid for GridSearchCV\\nparam_grid = {\\n \\'n_estimators\\': [10, 50, 100, 200, 300],\\n \\'criterion\\': [\\'gini\\', \\'entropy\\'],\\n \\'max_depth\\': [None, 5, 10, 15, 20, 25, 30, 35, 40],\\n \\'min_samples_split\\': [2, 5, 10, 15, 20],\\n \\'min_samples_leaf\\': [1, 2, 4, 6, 8, 10],\\n \\'min_weight_fraction_leaf\\': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5],\\n \\'max_features\\': [None, \\'sqrt\\', \\'log2\\'],\\n \\'max_leaf_nodes\\': [None, 10, 20, 30, 40, 50],\\n \\'min_impurity_decrease\\': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5],\\n \\'bootstrap\\': [True, False],\\n \\'class_weight\\': [None, \\'balanced\\'],\\n \\'warm_start\\': [False, True],\\n \\'oob_score\\': [False, True]\\n}\\n\\ngrid_search_rf = GridSearchCV(rf_model, param_grid, scoring=\\'accuracy\\', cv=5, n_jobs=-1, verbose=1)\\ngrid_search_rf.fit(tree_X_train, y_train)\\ntest_score_rf = grid_search_rf.best_estimator_.score(tree_X_train, y_train)\\n\\nprint(\"Best hyperparameters found:\")\\nfor key, value in grid_search_rf.best_params_.items():\\n print(f\"{key}: {value}\")\\n\\nprint(f\"Best accuracy: {grid_search_rf.best_score_}\")\\n'" 888 | ] 889 | }, 890 | "execution_count": 22, 891 | "metadata": {}, 892 | "output_type": "execute_result" 893 | } 894 | ], 895 | "source": [ 896 | "from sklearn.ensemble import RandomForestClassifier\n", 897 | "from sklearn.model_selection import cross_val_score, GridSearchCV\n", 898 | "\n", 899 | "# Random Forest with default parameters\n", 900 | "rf_model = RandomForestClassifier()\n", 901 | "rf_scores = cross_val_score(rf_model, tree_X_train, y_train, cv=5, scoring='accuracy')\n", 902 | "print(rf_scores)\n", 903 | "\n", 904 | "\"\"\"\n", 905 | "# Expanded parameter grid for GridSearchCV\n", 906 | "param_grid = {\n", 907 | " 'n_estimators': [10, 50, 100, 200, 300],\n", 908 | " 'criterion': ['gini', 'entropy'],\n", 909 | " 'max_depth': [None, 5, 10, 15, 20, 25, 30, 35, 40],\n", 910 | " 'min_samples_split': [2, 5, 10, 15, 20],\n", 911 | " 'min_samples_leaf': [1, 2, 4, 6, 8, 10],\n", 912 | " 'min_weight_fraction_leaf': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5],\n", 913 | " 'max_features': [None, 'sqrt', 'log2'],\n", 914 | " 'max_leaf_nodes': [None, 10, 20, 30, 40, 50],\n", 915 | " 'min_impurity_decrease': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5],\n", 916 | " 'bootstrap': [True, False],\n", 917 | " 'class_weight': [None, 'balanced'],\n", 918 | " 'warm_start': [False, True],\n", 919 | " 'oob_score': [False, True]\n", 920 | "}\n", 921 | "\n", 922 | "grid_search_rf = GridSearchCV(rf_model, param_grid, scoring='accuracy', cv=5, n_jobs=-1, 
verbose=1)\n", 923 | "grid_search_rf.fit(tree_X_train, y_train)\n", 924 | "test_score_rf = grid_search_rf.best_estimator_.score(tree_X_train, y_train)\n", 925 | "\n", 926 | "print(\"Best hyperparameters found:\")\n", 927 | "for key, value in grid_search_rf.best_params_.items():\n", 928 | " print(f\"{key}: {value}\")\n", 929 | "\n", 930 | "print(f\"Best accuracy: {grid_search_rf.best_score_}\")\n", 931 | "\"\"\"" 932 | ] 933 | }, 934 | { 935 | "attachments": {}, 936 | "cell_type": "markdown", 937 | "metadata": {}, 938 | "source": [ 939 | "## Hyperparameter Tuning - Gradient Boosted Classifier\n", 940 | "\n", 941 | "Gradient Boosting is an ensemble learning method that builds an additive model in a forward stage-wise fashion. It allows for the optimization of arbitrary differentiable loss functions. In each stage, a regression tree is fit on the negative gradient of the given loss function.\n", 942 | "\n", 943 | "Relevant Parameters:\n", 944 | "- **loss**: The loss function to be optimized. For classification, supported options are 'deviance' (default) for the exponential loss and 'exponential' for AdaBoost-like exponential loss.\n", 945 | "- **learning_rate**: The learning rate shrinks the contribution of each tree by learning_rate. There is a trade-off between learning_rate and n_estimators.\n", 946 | "- **n_estimators**: The number of boosting stages to perform. Gradient boosting is fairly robust to overfitting, so a large number of estimators usually results in better performance.\n", 947 | "- **subsample**: The fraction of samples to be used for fitting the individual base learners. If smaller than 1.0, this results in Stochastic Gradient Boosting. subsample interacts with the parameter n_estimators. Choosing subsample < 1.0 leads to a reduction of variance and an increase in bias.\n", 948 | "- **criterion**: The function to measure the quality of a split. Supported criteria are 'friedman_mse' (default) for the mean squared error with improvement score by Friedman, 'mse' for mean squared error, and 'mae' for the mean absolute error.\n", 949 | "- **min_samples_split**: The minimum number of samples required to split an internal node. A larger value prevents the tree from growing too deep, thus preventing overfitting.\n", 950 | "- **min_samples_leaf**: The minimum number of samples required to be at a leaf node. A larger value prevents the tree from growing too deep, thus preventing overfitting.\n", 951 | "- **min_weight_fraction_leaf**: The minimum weighted fraction of the sum total of weights required to be at a leaf node. Samples have equal weight when sample_weight is not provided.\n", 952 | "- **max_depth**: The maximum depth of the individual regression estimators. The maximum depth limits the number of nodes in the tree. Tune this parameter for best performance; the best value depends on the interaction of the input variables.\n", 953 | "- **min_impurity_decrease**: A node will be split if this split induces a decrease of the impurity greater than or equal to this value.\n", 954 | "- **max_features**: The number of features to consider when looking for the best split. If None, then max_features=n_features. It can also be a float, int, or string ('auto', 'sqrt', or 'log2').\n", 955 | "\n", 956 | "By tuning these parameters, you can find the best combination for your specific problem and achieve a better balance between model complexity and generalization performance." 
957 | ] 958 | }, 959 | { 960 | "cell_type": "code", 961 | "execution_count": 23, 962 | "metadata": {}, 963 | "outputs": [ 964 | { 965 | "name": "stdout", 966 | "output_type": "stream", 967 | "text": [ 968 | "[0.85117138 0.85033625 0.85020439 0.85266582 0.84606593]\n" 969 | ] 970 | }, 971 | { 972 | "data": { 973 | "text/plain": [ 974 | "'\\n# Expanded parameter grid for GridSearchCV\\nparam_grid = {\\n \\'loss\\': [\\'deviance\\', \\'exponential\\'],\\n \\'learning_rate\\': [0.01, 0.1, 0.2, 0.3],\\n \\'n_estimators\\': [10, 50, 100, 200, 300],\\n \\'subsample\\': [0.5, 0.8, 1.0],\\n \\'criterion\\': [\\'friedman_mse\\', \\'mse\\', \\'mae\\'],\\n \\'min_samples_split\\': [2, 5, 10, 15, 20],\\n \\'min_samples_leaf\\': [1, 2, 4, 6, 8, 10],\\n \\'min_weight_fraction_leaf\\': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5],\\n \\'max_depth\\': [None, 5, 10, 15, 20, 25, 30, 35, 40],\\n \\'min_impurity_decrease\\': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5],\\n \\'max_features\\': [None, \\'sqrt\\', \\'log2\\'],\\n \\'max_leaf_nodes\\': [None, 10, 20, 30, 40, 50],\\n \\'n_iter_no_change\\': [None, 5, 10, 15],\\n \\'validation_fraction\\': [0.1, 0.2, 0.3, 0.4, 0.5],\\n \\'tol\\': [1e-4, 1e-3, 1e-2, 1e-1]\\n}\\n\\ngrid_search_gb = GridSearchCV(gb_model, param_grid, scoring=\\'accuracy\\', cv=5, n_jobs=-1, verbose=1)\\ngrid_search_gb.fit(tree_X_train, y_train)\\ntest_score_gb = grid_search_gb.best_estimator_.score(tree_X_train, y_train)\\n\\nprint(\"Best hyperparameters found:\")\\nfor key, value in grid_search_gb.best_params_.items():\\n print(f\"{key}: {value}\")\\n\\nprint(f\"Best accuracy: {grid_search_gb.best_score_}\")\\n'" 975 | ] 976 | }, 977 | "execution_count": 23, 978 | "metadata": {}, 979 | "output_type": "execute_result" 980 | } 981 | ], 982 | "source": [ 983 | "from sklearn.ensemble import GradientBoostingClassifier\n", 984 | "from sklearn.model_selection import cross_val_score, GridSearchCV\n", 985 | "\n", 986 | "# Gradient Boosting with default parameters\n", 987 | "gb_model = GradientBoostingClassifier()\n", 988 | "gb_scores = cross_val_score(gb_model, tree_X_train, y_train, cv=5, scoring='accuracy')\n", 989 | "print(gb_scores)\n", 990 | "\n", 991 | "\"\"\"\n", 992 | "# Expanded parameter grid for GridSearchCV\n", 993 | "param_grid = {\n", 994 | " 'loss': ['deviance', 'exponential'],\n", 995 | " 'learning_rate': [0.01, 0.1, 0.2, 0.3],\n", 996 | " 'n_estimators': [10, 50, 100, 200, 300],\n", 997 | " 'subsample': [0.5, 0.8, 1.0],\n", 998 | " 'criterion': ['friedman_mse', 'mse', 'mae'],\n", 999 | " 'min_samples_split': [2, 5, 10, 15, 20],\n", 1000 | " 'min_samples_leaf': [1, 2, 4, 6, 8, 10],\n", 1001 | " 'min_weight_fraction_leaf': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5],\n", 1002 | " 'max_depth': [None, 5, 10, 15, 20, 25, 30, 35, 40],\n", 1003 | " 'min_impurity_decrease': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5],\n", 1004 | " 'max_features': [None, 'sqrt', 'log2'],\n", 1005 | " 'max_leaf_nodes': [None, 10, 20, 30, 40, 50],\n", 1006 | " 'n_iter_no_change': [None, 5, 10, 15],\n", 1007 | " 'validation_fraction': [0.1, 0.2, 0.3, 0.4, 0.5],\n", 1008 | " 'tol': [1e-4, 1e-3, 1e-2, 1e-1]\n", 1009 | "}\n", 1010 | "\n", 1011 | "grid_search_gb = GridSearchCV(gb_model, param_grid, scoring='accuracy', cv=5, n_jobs=-1, verbose=1)\n", 1012 | "grid_search_gb.fit(tree_X_train, y_train)\n", 1013 | "test_score_gb = grid_search_gb.best_estimator_.score(tree_X_train, y_train)\n", 1014 | "\n", 1015 | "print(\"Best hyperparameters found:\")\n", 1016 | "for key, value in grid_search_gb.best_params_.items():\n", 1017 | " print(f\"{key}: {value}\")\n", 1018 
| "\n", 1019 | "print(f\"Best accuracy: {grid_search_gb.best_score_}\")\n", 1020 | "\"\"\"" 1021 | ] 1022 | }, 1023 | { 1024 | "attachments": {}, 1025 | "cell_type": "markdown", 1026 | "metadata": {}, 1027 | "source": [ 1028 | "## Hyperparameter Tuning - XGBoost\n", 1029 | "\n", 1030 | "XGBoost (eXtreme Gradient Boosting) is an optimized distributed gradient boosting library designed to be highly efficient, flexible, and portable. It implements machine learning algorithms under the Gradient Boosting framework, offering several regularization techniques to prevent overfitting.\n", 1031 | "\n", 1032 | "Relevant Parameters:\n", 1033 | "- **learning_rate**: Boosting learning rate. Controls the contribution of each tree in the ensemble. Lower learning rates lead to more robust models but require more trees (n_estimators).\n", 1034 | "- **n_estimators**: Number of boosting rounds to be run. Larger values result in more complex models but can increase the risk of overfitting.\n", 1035 | "- **max_depth**: Maximum tree depth for base learners. Controls the depth of each individual tree in the ensemble. Deeper trees can capture more complex patterns, but may also overfit the data.\n", 1036 | "- **min_child_weight**: Minimum sum of instance weight (hessian) needed in a child. Defines the minimum number of instances required for a node to be split.\n", 1037 | "- **gamma**: Minimum loss reduction required to make a further partition on a leaf node of the tree. Controls the complexity of the tree by reducing the number of splits made.\n", 1038 | "- **subsample**: Subsample ratio of the training instances. Setting it to a value less than 1.0 can help prevent overfitting.\n", 1039 | "- **colsample_bytree**: Subsample ratio of columns when constructing each tree. A smaller value can reduce overfitting and speed up the training process.\n", 1040 | "- **colsample_bylevel**: Subsample ratio of columns for each level. Specifies the fraction of features to choose for each level in the tree building process.\n", 1041 | "- **colsample_bynode**: Subsample ratio of columns for each split. Specifies the fraction of features to choose for each split in the tree building process.\n", 1042 | "- **reg_alpha**: L1 regularization term on weights. Controls the sparsity of feature weights, effectively performing feature selection.\n", 1043 | "- **reg_lambda**: L2 regularization term on weights. Smoothens the weights, preventing extreme values and reducing the risk of overfitting.\n", 1044 | "\n", 1045 | "By tuning these parameters, you can find the best combination for your specific problem and achieve a better balance between model complexity and generalization performance." 
1046 | ] 1047 | }, 1048 | { 1049 | "cell_type": "code", 1050 | "execution_count": 24, 1051 | "metadata": {}, 1052 | "outputs": [ 1053 | { 1054 | "name": "stderr", 1055 | "output_type": "stream", 1056 | "text": [ 1057 | "/home/kjee/.local/lib/python3.9/site-packages/xgboost/sklearn.py:1395: UserWarning: `use_label_encoder` is deprecated in 1.7.0.\n", 1058 | " warnings.warn(\"`use_label_encoder` is deprecated in 1.7.0.\")\n", 1059 | "/home/kjee/.local/lib/python3.9/site-packages/xgboost/sklearn.py:1395: UserWarning: `use_label_encoder` is deprecated in 1.7.0.\n", 1060 | " warnings.warn(\"`use_label_encoder` is deprecated in 1.7.0.\")\n", 1061 | "/home/kjee/.local/lib/python3.9/site-packages/xgboost/sklearn.py:1395: UserWarning: `use_label_encoder` is deprecated in 1.7.0.\n", 1062 | " warnings.warn(\"`use_label_encoder` is deprecated in 1.7.0.\")\n", 1063 | "/home/kjee/.local/lib/python3.9/site-packages/xgboost/sklearn.py:1395: UserWarning: `use_label_encoder` is deprecated in 1.7.0.\n", 1064 | " warnings.warn(\"`use_label_encoder` is deprecated in 1.7.0.\")\n", 1065 | "/home/kjee/.local/lib/python3.9/site-packages/xgboost/sklearn.py:1395: UserWarning: `use_label_encoder` is deprecated in 1.7.0.\n", 1066 | " warnings.warn(\"`use_label_encoder` is deprecated in 1.7.0.\")\n", 1067 | "/home/kjee/.local/lib/python3.9/site-packages/xgboost/sklearn.py:1395: UserWarning: `use_label_encoder` is deprecated in 1.7.0.\n", 1068 | " warnings.warn(\"`use_label_encoder` is deprecated in 1.7.0.\")\n" 1069 | ] 1070 | }, 1071 | { 1072 | "name": "stdout", 1073 | "output_type": "stream", 1074 | "text": [ 1075 | "[0.86031383 0.85816008 0.86079733 0.86154455 0.85745055]\n" 1076 | ] 1077 | }, 1078 | { 1079 | "data": { 1080 | "text/plain": [ 1081 | "'\\n# Expanded parameter grid for GridSearchCV\\nparam_grid = {\\n \\'max_depth\\': [3, 6, 9, 12],\\n \\'learning_rate\\': [0.01, 0.1, 0.2, 0.3],\\n \\'n_estimators\\': [10, 50, 100, 200, 300],\\n \\'booster\\': [\\'gbtree\\', \\'gblinear\\', \\'dart\\'],\\n \\'min_child_weight\\': [1, 5, 10],\\n \\'gamma\\': [0, 0.1, 0.2, 0.3, 0.4],\\n \\'subsample\\': [0.5, 0.8, 1.0],\\n \\'colsample_bytree\\': [0.5, 0.8, 1.0],\\n \\'colsample_bylevel\\': [0.5, 0.8, 1.0],\\n \\'reg_alpha\\': [0, 0.1, 0.2, 0.3, 0.4],\\n \\'reg_lambda\\': [1, 2, 3, 4],\\n \\'scale_pos_weight\\': [1, 2, 3],\\n \\'max_delta_step\\': [0, 1, 2, 3, 4],\\n \\'base_score\\': [0.5, 0.6, 0.7, 0.8, 0.9],\\n \\'random_state\\': [0, 1, 2, 3]\\n}\\n\\ngrid_search_xgb = GridSearchCV(xgb_model, param_grid, scoring=\\'accuracy\\', cv=5, n_jobs=-1, verbose=1)\\ngrid_search_xgb.fit(tree_X_train, y_train)\\ntest_score_xgb = grid_search_xgb.best_estimator_.score(tree_X_train, y_train)\\n\\nprint(\"Best hyperparameters found:\")\\nfor key, value in grid_search_xgb.best_params_.items():\\n print(f\"{key}: {value}\")\\n\\nprint(f\"Best accuracy: {grid_search_xgb.best_score_}\")\\n'" 1082 | ] 1083 | }, 1084 | "execution_count": 24, 1085 | "metadata": {}, 1086 | "output_type": "execute_result" 1087 | } 1088 | ], 1089 | "source": [ 1090 | "import xgboost as xgb\n", 1091 | "from sklearn.model_selection import cross_val_score, GridSearchCV\n", 1092 | "\n", 1093 | "# XGBoost with default parameters\n", 1094 | "xgb_model = xgb.XGBClassifier(use_label_encoder=False)\n", 1095 | "xgb_scores = cross_val_score(xgb_model, tree_X_train.values, y_train.values, cv=5, scoring='accuracy')\n", 1096 | "print(xgb_scores)\n", 1097 | "\"\"\"\n", 1098 | "# Expanded parameter grid for GridSearchCV\n", 1099 | "param_grid = {\n", 1100 | " 'max_depth': 
[3, 6, 9, 12],\n", 1101 | " 'learning_rate': [0.01, 0.1, 0.2, 0.3],\n", 1102 | " 'n_estimators': [10, 50, 100, 200, 300],\n", 1103 | " 'booster': ['gbtree', 'gblinear', 'dart'],\n", 1104 | " 'min_child_weight': [1, 5, 10],\n", 1105 | " 'gamma': [0, 0.1, 0.2, 0.3, 0.4],\n", 1106 | " 'subsample': [0.5, 0.8, 1.0],\n", 1107 | " 'colsample_bytree': [0.5, 0.8, 1.0],\n", 1108 | " 'colsample_bylevel': [0.5, 0.8, 1.0],\n", 1109 | " 'reg_alpha': [0, 0.1, 0.2, 0.3, 0.4],\n", 1110 | " 'reg_lambda': [1, 2, 3, 4],\n", 1111 | " 'scale_pos_weight': [1, 2, 3],\n", 1112 | " 'max_delta_step': [0, 1, 2, 3, 4],\n", 1113 | " 'base_score': [0.5, 0.6, 0.7, 0.8, 0.9],\n", 1114 | " 'random_state': [0, 1, 2, 3]\n", 1115 | "}\n", 1116 | "\n", 1117 | "grid_search_xgb = GridSearchCV(xgb_model, param_grid, scoring='accuracy', cv=5, n_jobs=-1, verbose=1)\n", 1118 | "grid_search_xgb.fit(tree_X_train, y_train)\n", 1119 | "test_score_xgb = grid_search_xgb.best_estimator_.score(tree_X_train, y_train)\n", 1120 | "\n", 1121 | "print(\"Best hyperparameters found:\")\n", 1122 | "for key, value in grid_search_xgb.best_params_.items():\n", 1123 | " print(f\"{key}: {value}\")\n", 1124 | "\n", 1125 | "print(f\"Best accuracy: {grid_search_xgb.best_score_}\")\n", 1126 | "\"\"\"" 1127 | ] 1128 | }, 1129 | { 1130 | "cell_type": "code", 1131 | "execution_count": 25, 1132 | "metadata": {}, 1133 | "outputs": [], 1134 | "source": [ 1135 | "#pipeline for KNN & ANN - Same as lr but wihout outlier removal\n", 1136 | "knn_X_train = lr_transformed.copy()" 1137 | ] 1138 | }, 1139 | { 1140 | "attachments": {}, 1141 | "cell_type": "markdown", 1142 | "metadata": {}, 1143 | "source": [ 1144 | "k-Nearest Neighbors (KNN) is a simple, yet powerful, non-parametric supervised learning algorithm used for classification and regression. It assigns a new instance to the majority class or computes the mean (for regression tasks) of its k nearest neighbors in the feature space.\n", 1145 | "\n", 1146 | "Relevant Parameters:\n", 1147 | "- **n_neighbors**: Number of neighbors to use for the query. This is the main hyperparameter controlling the complexity of the KNN model. Larger values of k lead to smoother decision boundaries, while smaller values can capture more complex patterns but may overfit the data.\n", 1148 | "- **weights**: Weight function used in prediction. There are two options: 'uniform' (all points in each neighborhood are weighted equally) and 'distance' (assign weights proportional to the inverse of the distance from the query point). Using 'distance' can help reduce the impact of noise in the data.\n", 1149 | "- **algorithm**: Algorithm used to compute the nearest neighbors. Options include 'auto', 'ball_tree', 'kd_tree', and 'brute'. 'auto' will attempt to decide the most appropriate algorithm based on the values passed to fit() method. Choose the algorithm that best suits your data and computational requirements.\n", 1150 | "- **leaf_size**: Leaf size passed to BallTree or KDTree. This can affect the speed of the construction and query, as well as the memory required to store the tree. The optimal value depends on the nature of the problem.\n", 1151 | "- **p**: Power parameter for the Minkowski metric. When p = 1, this is equivalent to using manhattan_distance (l1), and for p = 2, it's equivalent to using euclidean_distance (l2). 
A larger value of p can help capture the specific geometry of your feature space.\n", 1152 | "\n", 1153 | "By tuning these parameters, you can find the best combination for your specific problem and achieve a better balance between model complexity and generalization performance." 1154 | ] 1155 | }, 1156 | { 1157 | "cell_type": "code", 1158 | "execution_count": 26, 1159 | "metadata": {}, 1160 | "outputs": [ 1161 | { 1162 | "name": "stdout", 1163 | "output_type": "stream", 1164 | "text": [ 1165 | "[0.84079821 0.83477649 0.83895213 0.83890818 0.83573626]\n" 1166 | ] 1167 | }, 1168 | { 1169 | "data": { 1170 | "text/plain": [ 1171 | "'\\n# Expanded parameter grid for GridSearchCV\\nparam_grid = {\\n \\'n_neighbors\\': list(range(1, 31)),\\n \\'weights\\': [\\'uniform\\', \\'distance\\'],\\n \\'algorithm\\': [\\'auto\\', \\'ball_tree\\', \\'kd_tree\\', \\'brute\\'],\\n \\'leaf_size\\': list(range(1, 50)),\\n \\'p\\': [1, 2],\\n \\'metric\\': [\\'euclidean\\', \\'manhattan\\', \\'chebyshev\\', \\'minkowski\\', \\'wminkowski\\', \\'seuclidean\\', \\'mahalanobis\\']\\n}\\n\\ngrid_search_knn = GridSearchCV(knn_model, param_grid, scoring=\\'accuracy\\', cv=5, n_jobs=-1, verbose=1)\\ngrid_search_knn.fit(knn_X_train, y_train)\\ntest_score_knn = grid_search_knn.best_estimator_.score(knn_X_train, y_train)\\n\\nprint(\"Best hyperparameters found:\")\\nfor key, value in grid_search_knn.best_params_.items():\\n print(f\"{key}: {value}\")\\n\\nprint(f\"Best accuracy: {grid_search_knn.best_score_}\")\\n'" 1172 | ] 1173 | }, 1174 | "execution_count": 26, 1175 | "metadata": {}, 1176 | "output_type": "execute_result" 1177 | } 1178 | ], 1179 | "source": [ 1180 | "# KNN Code\n", 1181 | "from sklearn.neighbors import KNeighborsClassifier\n", 1182 | "knn_model = KNeighborsClassifier()\n", 1183 | "knn_scores = cross_val_score(knn_model, knn_X_train, y_train, cv=5, scoring='accuracy')\n", 1184 | "print(knn_scores)\n", 1185 | "\n", 1186 | "\"\"\"\n", 1187 | "# Expanded parameter grid for GridSearchCV\n", 1188 | "param_grid = {\n", 1189 | " 'n_neighbors': list(range(1, 31)),\n", 1190 | " 'weights': ['uniform', 'distance'],\n", 1191 | " 'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],\n", 1192 | " 'leaf_size': list(range(1, 50)),\n", 1193 | " 'p': [1, 2],\n", 1194 | " 'metric': ['euclidean', 'manhattan', 'chebyshev', 'minkowski', 'wminkowski', 'seuclidean', 'mahalanobis']\n", 1195 | "}\n", 1196 | "\n", 1197 | "grid_search_knn = GridSearchCV(knn_model, param_grid, scoring='accuracy', cv=5, n_jobs=-1, verbose=1)\n", 1198 | "grid_search_knn.fit(knn_X_train, y_train)\n", 1199 | "test_score_knn = grid_search_knn.best_estimator_.score(knn_X_train, y_train)\n", 1200 | "\n", 1201 | "print(\"Best hyperparameters found:\")\n", 1202 | "for key, value in grid_search_knn.best_params_.items():\n", 1203 | " print(f\"{key}: {value}\")\n", 1204 | "\n", 1205 | "print(f\"Best accuracy: {grid_search_knn.best_score_}\")\n", 1206 | "\"\"\"" 1207 | ] 1208 | }, 1209 | { 1210 | "attachments": {}, 1211 | "cell_type": "markdown", 1212 | "metadata": {}, 1213 | "source": [ 1214 | "## Hyperparameter Tuning - MLPClassifier\n", 1215 | "\n", 1216 | "Multi-layer Perceptron (MLP) is a class of feedforward artificial neural network that can be used for classification and regression tasks. It consists of multiple layers of nodes, where each layer is fully connected to the next one. 
MLPClassifier is a popular implementation in scikit-learn for solving classification problems.\n", 1217 | "\n", 1218 | "Relevant Parameters:\n", 1219 | "- **hidden_layer_sizes**: A tuple representing the number of neurons in each hidden layer. By adjusting this parameter, you can control the complexity of the model. Adding more hidden layers and neurons can increase the capacity of the model to learn complex patterns but may also lead to overfitting.\n", 1220 | "- **activation**: Activation function for the hidden layers. Options include 'identity', 'logistic' (sigmoid), 'tanh', and 'relu'. Different activation functions can lead to different model behaviors and convergence properties.\n", 1221 | "- **solver**: The solver for weight optimization. Choices are 'lbfgs', 'sgd', and 'adam'. Each solver has its own benefits and drawbacks, so it's essential to choose the one that best suits your problem and dataset.\n", 1222 | "- **alpha**: L2 penalty (regularization term) parameter. It's used to control the trade-off between fitting the data and keeping the weights small, which helps prevent overfitting.\n", 1223 | "- **batch_size**: The size of mini-batches for stochastic optimizers. If the solver is 'lbfgs', the classifier will not use mini-batch. For 'sgd' and 'adam', using smaller batch sizes can provide a regularizing effect but may increase the time required for convergence.\n", 1224 | "- **learning_rate**: Learning rate schedule for weight updates. Options are 'constant', 'invscaling', and 'adaptive'. The learning rate determines how quickly the model adapts to the data, with larger values leading to faster convergence but potentially oscillating around the optimum.\n", 1225 | "- **max_iter**: Maximum number of iterations. The solver iterates until convergence or this number of iterations is reached. Increasing this value allows the model more time to converge but may increase the computation time.\n", 1226 | "\n", 1227 | "By tuning these parameters, you can find the best combination for your specific problem and achieve a better balance between model complexity and generalization performance." 
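A hedged aside, not part of the original notebook: the ConvergenceWarning messages in the next cell come from the default max_iter=200, and one illustrative way to use the parameters above is to raise the iteration budget and enable early stopping. The sketch assumes the knn_X_train and y_train objects from the earlier cells; the layer sizes and alpha are assumptions, not tuned values.

from sklearn.neural_network import MLPClassifier

# Illustrative sketch only: larger iteration budget plus early stopping on a
# held-out validation split (validation_fraction defaults to 0.1).
mlp_sketch = MLPClassifier(
    hidden_layer_sizes=(50, 50),
    activation='relu',
    solver='adam',
    alpha=0.001,
    max_iter=1000,
    early_stopping=True,
    n_iter_no_change=10,
    random_state=42,
)
# mlp_sketch.fit(knn_X_train, y_train)
# print(mlp_sketch.score(knn_X_train, y_train))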
1228 | ] 1229 | }, 1230 | { 1231 | "cell_type": "code", 1232 | "execution_count": 27, 1233 | "metadata": {}, 1234 | "outputs": [ 1235 | { 1236 | "name": "stderr", 1237 | "output_type": "stream", 1238 | "text": [ 1239 | "/home/kjee/.local/lib/python3.9/site-packages/sklearn/neural_network/_multilayer_perceptron.py:686: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.\n", 1240 | " warnings.warn(\n", 1241 | "/home/kjee/.local/lib/python3.9/site-packages/sklearn/neural_network/_multilayer_perceptron.py:686: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.\n", 1242 | " warnings.warn(\n", 1243 | "/home/kjee/.local/lib/python3.9/site-packages/sklearn/neural_network/_multilayer_perceptron.py:686: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.\n", 1244 | " warnings.warn(\n", 1245 | "/home/kjee/.local/lib/python3.9/site-packages/sklearn/neural_network/_multilayer_perceptron.py:686: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.\n", 1246 | " warnings.warn(\n" 1247 | ] 1248 | }, 1249 | { 1250 | "name": "stdout", 1251 | "output_type": "stream", 1252 | "text": [ 1253 | "[0.8428201 0.84013889 0.84611665 0.83816096 0.84391209]\n" 1254 | ] 1255 | }, 1256 | { 1257 | "name": "stderr", 1258 | "output_type": "stream", 1259 | "text": [ 1260 | "/home/kjee/.local/lib/python3.9/site-packages/sklearn/neural_network/_multilayer_perceptron.py:686: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.\n", 1261 | " warnings.warn(\n" 1262 | ] 1263 | }, 1264 | { 1265 | "data": { 1266 | "text/plain": [ 1267 | "'\\n# Parameter grid for GridSearchCV\\nparam_grid = {\\n \\'hidden_layer_sizes\\': [(10,), (20,), (50,), (10, 10), (20, 20), (50, 50)],\\n \\'activation\\': [\\'identity\\', \\'logistic\\', \\'tanh\\', \\'relu\\'],\\n \\'solver\\': [\\'lbfgs\\', \\'sgd\\', \\'adam\\'],\\n \\'alpha\\': [0.0001, 0.001, 0.01, 0.1],\\n \\'learning_rate\\': [\\'constant\\', \\'invscaling\\', \\'adaptive\\'],\\n \\'max_iter\\': [200, 500, 1000],\\n}\\n\\ngrid_search_mlp = GridSearchCV(mlp_model, param_grid, scoring=\\'accuracy\\', cv=5, n_jobs=-1, verbose=1)\\ngrid_search_mlp.fit(knn_X_train, y_train)\\ntest_score_mlp = grid_search_mlp.best_estimator_.score(knn_X_train, y_train)\\n\\nprint(\"Best hyperparameters found:\")\\nfor key, value in grid_search_mlp.best_params_.items():\\n print(f\"{key}: {value}\")\\n\\nprint(f\"Best accuracy: {grid_search_mlp.best_score_}\")\\n'" 1268 | ] 1269 | }, 1270 | "execution_count": 27, 1271 | "metadata": {}, 1272 | "output_type": "execute_result" 1273 | } 1274 | ], 1275 | "source": [ 1276 | "# ANN Code\n", 1277 | "from sklearn.neural_network import MLPClassifier\n", 1278 | "from sklearn.model_selection import cross_val_score, GridSearchCV\n", 1279 | "\n", 1280 | "# MLP with default parameters\n", 1281 | "mlp_model = MLPClassifier(random_state=42)\n", 1282 | "mlp_scores = cross_val_score(mlp_model, knn_X_train, y_train, cv=5, scoring='accuracy')\n", 1283 | "print(mlp_scores)\n", 1284 | "\n", 1285 | "\"\"\"\n", 1286 | "# Parameter grid for GridSearchCV\n", 1287 | "param_grid = {\n", 1288 | " 'hidden_layer_sizes': [(10,), (20,), (50,), (10, 10), (20, 20), (50, 50)],\n", 1289 | " 'activation': ['identity', 'logistic', 'tanh', 'relu'],\n", 1290 | " 'solver': ['lbfgs', 
'sgd', 'adam'],\n", 1291 | " 'alpha': [0.0001, 0.001, 0.01, 0.1],\n", 1292 | " 'learning_rate': ['constant', 'invscaling', 'adaptive'],\n", 1293 | " 'max_iter': [200, 500, 1000],\n", 1294 | "}\n", 1295 | "\n", 1296 | "grid_search_mlp = GridSearchCV(mlp_model, param_grid, scoring='accuracy', cv=5, n_jobs=-1, verbose=1)\n", 1297 | "grid_search_mlp.fit(knn_X_train, y_train)\n", 1298 | "test_score_mlp = grid_search_mlp.best_estimator_.score(knn_X_train, y_train)\n", 1299 | "\n", 1300 | "print(\"Best hyperparameters found:\")\n", 1301 | "for key, value in grid_search_mlp.best_params_.items():\n", 1302 | " print(f\"{key}: {value}\")\n", 1303 | "\n", 1304 | "print(f\"Best accuracy: {grid_search_mlp.best_score_}\")\n", 1305 | "\"\"\"" 1306 | ] 1307 | }, 1308 | { 1309 | "cell_type": "code", 1310 | "execution_count": 28, 1311 | "metadata": {}, 1312 | "outputs": [], 1313 | "source": [ 1314 | "X_test_Nb = nb_preprocessor.transform(X_test).values\n", 1315 | "X_test_LR = lr_preprocessor.transform(X_test).values" 1316 | ] 1317 | }, 1318 | { 1319 | "cell_type": "code", 1320 | "execution_count": 30, 1321 | "metadata": {}, 1322 | "outputs": [ 1323 | { 1324 | "name": "stderr", 1325 | "output_type": "stream", 1326 | "text": [ 1327 | "/home/kjee/.local/lib/python3.9/site-packages/sklearn/utils/validation.py:1143: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n", 1328 | " y = column_or_1d(y, warn=True)\n", 1329 | "/home/kjee/.local/lib/python3.9/site-packages/sklearn/utils/validation.py:1143: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n", 1330 | " y = column_or_1d(y, warn=True)\n", 1331 | "/home/kjee/.local/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):\n", 1332 | "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n", 1333 | "\n", 1334 | "Increase the number of iterations (max_iter) or scale the data as shown in:\n", 1335 | " https://scikit-learn.org/stable/modules/preprocessing.html\n", 1336 | "Please also refer to the documentation for alternative solver options:\n", 1337 | " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", 1338 | " n_iter_i = _check_optimize_result(\n", 1339 | "/home/kjee/.local/lib/python3.9/site-packages/sklearn/utils/validation.py:1143: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n", 1340 | " y = column_or_1d(y, warn=True)\n", 1341 | "/home/kjee/.local/lib/python3.9/site-packages/sklearn/neural_network/_multilayer_perceptron.py:686: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.\n", 1342 | " warnings.warn(\n" 1343 | ] 1344 | }, 1345 | { 1346 | "data": { 1347 | "text/html": [ 1348 | "
MLPClassifier(random_state=42)
" 1349 | ], 1350 | "text/plain": [ 1351 | "MLPClassifier(random_state=42)" 1352 | ] 1353 | }, 1354 | "execution_count": 30, 1355 | "metadata": {}, 1356 | "output_type": "execute_result" 1357 | } 1358 | ], 1359 | "source": [ 1360 | "nb_model.fit(nb_X_train, nb_y_train)\n", 1361 | "lr_model.fit(lr_X_train, lr_y_train)\n", 1362 | "svm_model.fit(lr_X_train, lr_y_train)\n", 1363 | "dt_model.fit(tree_X_train, y_train)\n", 1364 | "rf_model.fit(tree_X_train, y_train)\n", 1365 | "gb_model.fit(tree_X_train, y_train)\n", 1366 | "xgb_model.fit(tree_X_train.values, y_train.values)\n", 1367 | "knn_model.fit(knn_X_train, y_train)\n", 1368 | "mlp_model.fit(knn_X_train, y_train)" 1369 | ] 1370 | }, 1371 | { 1372 | "cell_type": "code", 1373 | "execution_count": 32, 1374 | "metadata": {}, 1375 | "outputs": [ 1376 | { 1377 | "name": "stderr", 1378 | "output_type": "stream", 1379 | "text": [ 1380 | "/home/kjee/.local/lib/python3.9/site-packages/sklearn/base.py:439: UserWarning: X does not have valid feature names, but GaussianNB was fitted with feature names\n", 1381 | " warnings.warn(\n", 1382 | "/home/kjee/.local/lib/python3.9/site-packages/sklearn/base.py:439: UserWarning: X does not have valid feature names, but LogisticRegression was fitted with feature names\n", 1383 | " warnings.warn(\n", 1384 | "/home/kjee/.local/lib/python3.9/site-packages/sklearn/base.py:439: UserWarning: X does not have valid feature names, but SVC was fitted with feature names\n", 1385 | " warnings.warn(\n" 1386 | ] 1387 | }, 1388 | { 1389 | "name": "stdout", 1390 | "output_type": "stream", 1391 | "text": [ 1392 | "Naive Bayes Accuracy: 0.6615211505327192\n", 1393 | "Logistic Regression Accuracy: 0.8455641900207461\n", 1394 | "SVM Classification Accuracy: 0.8542494461830584\n", 1395 | "Decision Tree Accuracy: 0.7893034213579943\n" 1396 | ] 1397 | }, 1398 | { 1399 | "name": "stderr", 1400 | "output_type": "stream", 1401 | "text": [ 1402 | "/home/kjee/.local/lib/python3.9/site-packages/sklearn/base.py:439: UserWarning: X does not have valid feature names, but DecisionTreeClassifier was fitted with feature names\n", 1403 | " warnings.warn(\n", 1404 | "/home/kjee/.local/lib/python3.9/site-packages/sklearn/base.py:439: UserWarning: X does not have valid feature names, but RandomForestClassifier was fitted with feature names\n", 1405 | " warnings.warn(\n" 1406 | ] 1407 | }, 1408 | { 1409 | "name": "stdout", 1410 | "output_type": "stream", 1411 | "text": [ 1412 | "Random Forest Accuracy: 0.8554801504975562\n", 1413 | "Gradient Boosted Classifier Accuracy: 0.8475684799043567\n", 1414 | "XGBoost Accuracy: 0.8601919898730617\n" 1415 | ] 1416 | }, 1417 | { 1418 | "name": "stderr", 1419 | "output_type": "stream", 1420 | "text": [ 1421 | "/home/kjee/.local/lib/python3.9/site-packages/sklearn/base.py:439: UserWarning: X does not have valid feature names, but GradientBoostingClassifier was fitted with feature names\n", 1422 | " warnings.warn(\n", 1423 | "/home/kjee/.local/lib/python3.9/site-packages/sklearn/base.py:439: UserWarning: X does not have valid feature names, but KNeighborsClassifier was fitted with feature names\n", 1424 | " warnings.warn(\n" 1425 | ] 1426 | }, 1427 | { 1428 | "name": "stdout", 1429 | "output_type": "stream", 1430 | "text": [ 1431 | "KNN Accuracy: 0.8368789338584338\n", 1432 | "ANN Accuracy: 0.8481662505713984\n" 1433 | ] 1434 | }, 1435 | { 1436 | "name": "stderr", 1437 | "output_type": "stream", 1438 | "text": [ 1439 | "/home/kjee/.local/lib/python3.9/site-packages/sklearn/base.py:439: UserWarning: X does not have 
valid feature names, but MLPClassifier was fitted with feature names\n", 1440 | " warnings.warn(\n" 1441 | ] 1442 | } 1443 | ], 1444 | "source": [ 1445 | "from sklearn.metrics import accuracy_score\n", 1446 | "\n", 1447 | "# Naive Bayes\n", 1448 | "y_pred_nb = nb_model.predict(X_test_Nb)\n", 1449 | "acc_nb = accuracy_score(y_test, y_pred_nb)\n", 1450 | "print(f\"Naive Bayes Accuracy: {acc_nb}\")\n", 1451 | "\n", 1452 | "# Logistic Regression\n", 1453 | "y_pred_lr = lr_model.predict(X_test_LR)\n", 1454 | "acc_lr = accuracy_score(y_test, y_pred_lr)\n", 1455 | "print(f\"Logistic Regression Accuracy: {acc_lr}\")\n", 1456 | "\n", 1457 | "# SVM Classification\n", 1458 | "y_pred_svm = svm_model.predict(X_test_LR)\n", 1459 | "acc_svm = accuracy_score(y_test, y_pred_svm)\n", 1460 | "print(f\"SVM Classification Accuracy: {acc_svm}\")\n", 1461 | "\n", 1462 | "# Decision Tree\n", 1463 | "y_pred_dt = dt_model.predict(X_test_Nb)\n", 1464 | "acc_dt = accuracy_score(y_test, y_pred_dt)\n", 1465 | "print(f\"Decision Tree Accuracy: {acc_dt}\")\n", 1466 | "\n", 1467 | "# Random Forest\n", 1468 | "y_pred_rf = rf_model.predict(X_test_Nb)\n", 1469 | "acc_rf = accuracy_score(y_test, y_pred_rf)\n", 1470 | "print(f\"Random Forest Accuracy: {acc_rf}\")\n", 1471 | "\n", 1472 | "# Gradient Boosted Classifier\n", 1473 | "y_pred_gb = gb_model.predict(X_test_Nb)\n", 1474 | "acc_gb = accuracy_score(y_test, y_pred_gb)\n", 1475 | "print(f\"Gradient Boosted Classifier Accuracy: {acc_gb}\")\n", 1476 | "\n", 1477 | "# XGBoost\n", 1478 | "y_pred_xgb = xgb_model.predict(X_test_Nb)\n", 1479 | "acc_xgb = accuracy_score(y_test, y_pred_xgb)\n", 1480 | "print(f\"XGBoost Accuracy: {acc_xgb}\")\n", 1481 | "\n", 1482 | "# KNN\n", 1483 | "y_pred_knn = knn_model.predict(X_test_LR)\n", 1484 | "acc_knn = accuracy_score(y_test, y_pred_knn)\n", 1485 | "print(f\"KNN Accuracy: {acc_knn}\")\n", 1486 | "\n", 1487 | "# ANN\n", 1488 | "y_pred_ann = mlp_model.predict(X_test_LR)\n", 1489 | "acc_ann = accuracy_score(y_test, y_pred_ann)\n", 1490 | "print(f\"ANN Accuracy: {acc_ann}\")" 1491 | ] 1492 | }, 1493 | { 1494 | "cell_type": "code", 1495 | "execution_count": null, 1496 | "metadata": {}, 1497 | "outputs": [], 1498 | "source": [] 1499 | } 1500 | ], 1501 | "metadata": { 1502 | "kernelspec": { 1503 | "display_name": "base", 1504 | "language": "python", 1505 | "name": "python3" 1506 | }, 1507 | "language_info": { 1508 | "codemirror_mode": { 1509 | "name": "ipython", 1510 | "version": 3 1511 | }, 1512 | "file_extension": ".py", 1513 | "mimetype": "text/x-python", 1514 | "name": "python", 1515 | "nbconvert_exporter": "python", 1516 | "pygments_lexer": "ipython3", 1517 | "version": "3.9.5" 1518 | }, 1519 | "orig_nbformat": 4, 1520 | "vscode": { 1521 | "interpreter": { 1522 | "hash": "308e8e5829e870726ac9a9b610bce15efe1b382ea09a414e81be3a16d394eb2b" 1523 | } 1524 | } 1525 | }, 1526 | "nbformat": 4, 1527 | "nbformat_minor": 2 1528 | } 1529 | -------------------------------------------------------------------------------- /Classification_Example/requirements.txt: -------------------------------------------------------------------------------- 1 | matplotlib==3.7.1 2 | numpy==1.24.2 3 | pandas==2.0.0 4 | scikit_learn==1.2.2 5 | scipy==1.10.1 6 | seaborn==0.12.2 7 | xgboost==1.7.5 8 | -------------------------------------------------------------------------------- /From_Scratch_Implementation/DecisionTree.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from collections 
import Counter 4 | 5 | 6 | class Node: 7 | def __init__(self, 8 | feature = None, 9 | feature_value = None, 10 | threshold = None, 11 | data_left = None, 12 | data_right = None, 13 | gain = None, 14 | value = None): 15 | 16 | self.feature = feature 17 | self.feature_value = feature_value 18 | self.threshold = threshold 19 | self.data_left = data_left 20 | self.data_right = data_right 21 | self.gain = gain 22 | self.value = value 23 | 24 | class DecisionTree: 25 | def __init__(self, min_samples_split=2, max_depth=5): 26 | self.min_samples_split = min_samples_split 27 | self.max_depth = max_depth 28 | self.root = None 29 | 30 | def fit(self, X, y): 31 | self.root = self._build_tree(X,y) 32 | 33 | def entropy(self, data): 34 | 35 | class_counts = np.bincount(data) 36 | class_probs = class_counts/len(data) 37 | 38 | class_entropies = [] 39 | entropy = 0 40 | for prob in class_probs: 41 | if prob > 0: 42 | entropy += prob * np.log(prob) 43 | class_entropies.append(entropy) 44 | entropy = np.sum(class_entropies) * -1 45 | return entropy 46 | 47 | def information_gain(self, 48 | parent, 49 | left_child, 50 | right_child): 51 | 52 | num_left = len(left_child)/len(parent) 53 | num_right = len(right_child)/len(parent) 54 | 55 | ## Compute entropies 56 | parent_entropy = self.entropy(parent) 57 | left_entropy = self.entropy(left_child) 58 | right_entropy = self.entropy(right_child) 59 | 60 | ## Compute information gain 61 | info_gain = parent_entropy - (num_left * left_entropy + 62 | num_right * right_entropy) 63 | return info_gain 64 | 65 | def _best_split(self, X, y): 66 | best_split = {} 67 | best_info_gain = -1 68 | n_rows, n_cols = X.shape 69 | 70 | feature_set = list(X.columns) 71 | 72 | # Aggregate X and y to create dataframe 73 | # We need to do this in order to compute information gain 74 | df = np.concatenate((X, np.array(y).reshape(1, -1).T), axis=1) 75 | df = pd.DataFrame(df) 76 | 77 | df.columns = feature_set + ['y'] 78 | 79 | # Loop through each dataset feature 80 | for i, feature in enumerate(feature_set): 81 | 82 | # Store the feature data 83 | feature_data = sorted(np.unique(X[feature])) 84 | 85 | # Loop through each value and store the left and right. 86 | for feature_val in feature_data: 87 | 88 | # Store the left and right values 89 | df_left = df[df[feature] <= feature_val].copy() 90 | df_right = df[df[feature] > feature_val].copy() 91 | 92 | # Extract target variables 93 | if len(df_left) > 0 and len(df_right) > 0: 94 | y_parent = df['y'] 95 | y_left = df_left['y'] 96 | y_right = df_right['y'] 97 | 98 | # Compute information gain 99 | info_gain = self.information_gain(y_parent, 100 | y_left, 101 | y_right) 102 | if info_gain > best_info_gain: 103 | best_info_gain = info_gain 104 | best_split = { 105 | 'feature_col':feature, 106 | 'split_value':feature_val, 107 | 'df_left':df_left, 108 | 'df_right':df_right, 109 | 'gain':info_gain 110 | } 111 | return best_split 112 | 113 | def _build_tree(self, X, y, depth = 0): 114 | n_rows, n_cols = X.shape 115 | 116 | # Ensuring this isn't a leaf node. If so, we don't split. 117 | # This is the base case for the recursion 118 | if n_rows >= self.min_samples_split and depth <= self.max_depth: 119 | 120 | # Get best split 121 | best = self._best_split(X,y) 122 | 123 | # If information gain is not 0, possibly, room to split. 
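# A positive gain means this split separates the classes better than leaving the
# node as-is, so we recurse on the left/right partitions; otherwise we fall
# through to the majority-class leaf created at the bottom of this method.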
124 | if best['gain'] > 0: 125 | left = self._build_tree( 126 | X=best['df_left'].drop(['y'], axis = 1), 127 | y=best['df_left']['y'], 128 | depth = depth + 1 129 | ) 130 | 131 | right = self._build_tree( 132 | X=best['df_right'].drop(['y'], axis = 1), 133 | y=best['df_right']['y'], 134 | depth = depth + 1 135 | ) 136 | return Node( 137 | feature=best['feature_col'], 138 | threshold=best['split_value'], 139 | data_left=left, 140 | data_right=right, 141 | gain=best['gain'] 142 | ) 143 | return Node(value=Counter(y).most_common(1)[0][0]) 144 | 145 | def _traverse_tree(self, x, node): 146 | 147 | # If we hit leaf node, return that value 148 | if node.value != None: 149 | return node.value 150 | 151 | # Pull feature column 152 | feature_value = x[node.feature] 153 | 154 | # Go left if less than threshold 155 | if feature_value <= node.threshold: 156 | return self._traverse_tree(x=x, node = node.data_left) 157 | 158 | # Go right if more than threshold 159 | if feature_value > node.threshold: 160 | return self._traverse_tree(x=x, node = node.data_right) 161 | 162 | def predict(self, X): 163 | predictions = [] 164 | for index, x in X.iterrows(): 165 | pred = self._traverse_tree(x,self.root) 166 | predictions.append(pred) 167 | return predictions 168 | 169 | 170 | if __name__=='__main__': 171 | 172 | ## Create dummy dataframe 173 | df = pd.DataFrame() 174 | X1 = np.array([1,2,3,4,5,6,7,8,9,10]) 175 | X2 = np.array([4,5,6,4,5,1,7,8,9,10]) 176 | y = np.array([0,1,1,0,0,1,0,1,0,1]) 177 | df['col_1'] = X1 178 | df['col_2'] = X2 179 | df['y'] = y 180 | 181 | X = df[['col_1','col_2']] 182 | y = df['y'] 183 | 184 | dt = DecisionTree() 185 | dt.fit(X, y) 186 | preds = dt.predict(X) 187 | 188 | print(preds) -------------------------------------------------------------------------------- /From_Scratch_Implementation/GradientBoostedTrees.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from sklearn.tree import DecisionTreeRegressor 4 | 5 | class GradientBoostingRegressor: 6 | def __init__(self, 7 | num_estimators, 8 | learning_rate, 9 | max_depth = None): 10 | self.num_estimators = num_estimators 11 | self.learning_rate = learning_rate 12 | self.max_depth = max_depth 13 | 14 | def fit(self, X, y): 15 | 16 | # Initialize model with the mean 17 | self.avg_y = np.mean(y) 18 | self.trees = [] 19 | 20 | for i in range(self.num_estimators): 21 | 22 | # Compute residual 23 | residual = y - self.predict(X) 24 | 25 | # Fit Decision Tree on the residuals 26 | tree = DecisionTreeRegressor(max_depth = self.max_depth) 27 | tree.fit(X, residual) 28 | self.trees.append(tree) 29 | 30 | # Make prediction & recompute residual 31 | residual_preds = tree.predict(X) 32 | 33 | def predict(self, X): 34 | 35 | # Initial array with training means 36 | final_prediction = np.full((1,len(X)),self.avg_y)[0] 37 | 38 | # Get residual predictions 39 | for tree in self.trees: 40 | # Make Prediction 41 | resid_pred = tree.predict(X) 42 | final_prediction += resid_pred * self.learning_rate 43 | 44 | return final_prediction 45 | 46 | if __name__=='__main__': 47 | 48 | ## Create dummy dataframe 49 | df = pd.DataFrame() 50 | X1 = np.array([1,2,3,4,5,6,7,8,9,10]) 51 | X2 = np.array([4,5,6,4,5,1,7,8,9,10]) 52 | y = np.array([8,7,5,8,2,3,5,7,7,8]) 53 | df['col_1'] = X1 54 | df['col_2'] = X2 55 | df['y'] = y 56 | 57 | X = df[['col_1','col_2']] 58 | y = df['y'] 59 | 60 | dt = GradientBoostingRegressor(num_estimators = 100, 61 | learning_rate = 0.02) 62 | dt.fit(X, 
y) 63 | preds = dt.predict(X) 64 | 65 | print(preds) -------------------------------------------------------------------------------- /From_Scratch_Implementation/HierarchicalClustering.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from itertools import product 4 | 5 | 6 | class HierarchicalClustering: 7 | def __init__(self, n_clusters, linkage='single'): 8 | # Constructor method for the class, which sets the number of clusters and the linkage method. 9 | self.n_clusters = n_clusters 10 | self.linkage = linkage 11 | 12 | def fit(self, X): 13 | # Fit method, which performs the clustering. 14 | self.n_samples = X.shape[0] 15 | self.clusters = [[i] for i in range(self.n_samples)] 16 | self.distances = self._calculate_distances(X) 17 | self.history = [] 18 | 19 | while len(self.clusters) > self.n_clusters: 20 | i, j = self._find_closest_pair() 21 | self._merge_clusters(i, j) 22 | self.history.append((i, j)) 23 | 24 | def _calculate_distances(self, X): 25 | # Helper function to calculate the pairwise distances between points in the dataset. 26 | distances = np.zeros((self.n_samples, self.n_samples)) 27 | for i in range(self.n_samples): 28 | for j in range(i+1, self.n_samples): 29 | distances[i, j] = self._calculate_distance(X.iloc[i], X.iloc[j]) 30 | return distances 31 | 32 | def _calculate_distance(self, x, y): 33 | # Helper function to calculate the distance between two points based on the selected linkage method. 34 | if self.linkage == 'single': 35 | return np.min(np.abs(x - y)) 36 | elif self.linkage == 'complete': 37 | return np.max(np.abs(x - y)) 38 | elif self.linkage == 'average': 39 | return np.mean(np.abs(x - y)) 40 | 41 | def _find_closest_pair(self): 42 | # Helper function to find the two closest clusters. 43 | min_distance = np.inf 44 | closest_pair = None 45 | for i in range(len(self.clusters)): 46 | for j in range(i+1, len(self.clusters)): 47 | distance = self._calculate_cluster_distance(self.clusters[i], self.clusters[j]) 48 | if distance < min_distance: 49 | min_distance = distance 50 | closest_pair = (i, j) 51 | return closest_pair 52 | 53 | def _calculate_cluster_distance(self, c1, c2): 54 | # Helper function to calculate the distance between two clusters based on the selected linkage method. 55 | distance = np.inf 56 | for i in c1: 57 | for j in c2: 58 | d = self.distances[min(i, j), max(i, j)] 59 | if d < distance: 60 | distance = d 61 | return distance 62 | 63 | def _merge_clusters(self, i, j): 64 | # Helper function to merge two clusters. 65 | self.clusters[i] = self.clusters[i] + self.clusters[j] 66 | self.clusters.pop(j) 67 | 68 | def predict(self): 69 | # Predict method, which returns the cluster labels for the original dataset. 
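# Each label is simply the index of the merged cluster the sample ended up in
# after fit(), so the labels range from 0 to n_clusters - 1.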
70 | labels = np.zeros(self.n_samples, dtype=np.int32) 71 | for i, cluster in enumerate(self.clusters): 72 | for j in cluster: 73 | labels[j] = i 74 | return labels 75 | 76 | if __name__=='__main__': 77 | 78 | ## Create dummy dataframe 79 | df = pd.DataFrame() 80 | X1 = np.array([1,2,3,4,5,6,7,8,9,10]) 81 | X2 = np.array([4,5,6,4,5,1,7,8,9,10]) 82 | y = np.array([0,1,1,0,0,1,0,1,0,1]) 83 | df['col_1'] = X1 84 | df['col_2'] = X2 85 | 86 | X = df[['col_1','col_2']] 87 | 88 | hc = HierarchicalClustering(n_clusters=4) 89 | hc.fit(X) 90 | print(hc.clusters) 91 | -------------------------------------------------------------------------------- /From_Scratch_Implementation/KMeans.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | 5 | class KMeans: 6 | def __init__(self, k, max_iter): 7 | self.k = k 8 | self.max_iter = max_iter 9 | self.centroids = None 10 | 11 | def _euclidean_distance(self, X1, X2): 12 | return np.linalg.norm(X1 - X2) 13 | 14 | def _compute_centroid(self, 15 | X, 16 | assigned_centroid_dict = None): 17 | 18 | # Loop through each key, value in dictionary 19 | # For each one of these dataframes, take the mean to get the new centroid 20 | # return the new means 21 | self.centroids = [] 22 | for centroid, centroid_df in assigned_centroid_dict.items(): 23 | 24 | centroid_mean = pd.DataFrame(centroid_df.mean(axis = 0)) 25 | 26 | centroid_mean = centroid_mean.T 27 | 28 | self.centroids.append(centroid_mean) 29 | 30 | return pd.concat(self.centroids) 31 | 32 | 33 | def predict(self, X): 34 | 35 | # Loop through each centroid, get distances 36 | assigned_centroid_dict = {} 37 | 38 | for row_num, row_X in X.iterrows(): 39 | 40 | row_X_df = pd.DataFrame(row_X).T 41 | 42 | assigned_centroid = None 43 | closest_distance = None 44 | 45 | for centroid_num, row_c in self.centroids.iterrows(): 46 | 47 | distance = self._euclidean_distance(row_c, row_X) 48 | 49 | if assigned_centroid is None: 50 | assigned_centroid = centroid_num 51 | closest_distance = distance 52 | continue 53 | 54 | # Replace assigned centroid if closer 55 | elif distance < closest_distance: 56 | assigned_centroid = centroid_num 57 | closest_distance = distance 58 | 59 | if assigned_centroid not in assigned_centroid_dict.keys(): 60 | assigned_centroid_dict[assigned_centroid] = row_X_df 61 | 62 | else: 63 | assigned_centroid_dict[assigned_centroid].append(row_X_df) 64 | 65 | return assigned_centroid_dict 66 | 67 | def fit(self, X): 68 | 69 | # Initialize centroids randomly if first time. 
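# X.sample(self.k) draws k distinct rows of the data (sampling without
# replacement by default) to serve as the starting centroids.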
70 | self.centroids = X.sample(self.k) 71 | 72 | for i in range(self.max_iter): 73 | self.assigned_centroid_dict = self.predict(X) 74 | self.centroids = self._compute_centroid(X, self.assigned_centroid_dict) 75 | 76 | 77 | if __name__=='__main__': 78 | 79 | ## Create dummy dataframe 80 | df = pd.DataFrame() 81 | X1 = np.array([1,2,3,4,5,6,7,8,9,10]) 82 | X2 = np.array([4,5,6,4,5,1,7,8,9,10]) 83 | y = np.array([0,1,1,0,0,1,0,1,0,1]) 84 | df['col_1'] = X1 85 | df['col_2'] = X2 86 | 87 | X = df[['col_1','col_2']] 88 | 89 | km = KMeans(k=2,max_iter = 1) 90 | km.fit(X) 91 | 92 | print(km.centroids) 93 | 94 | 95 | 96 | -------------------------------------------------------------------------------- /From_Scratch_Implementation/KNearestNeighbors.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | class KNN: 5 | def __init__(self, n_neighbors): 6 | self.n_neighbors = n_neighbors 7 | 8 | def fit(self, X, y): 9 | self.X = X 10 | self.y = y 11 | 12 | def euclidean_distance(self, X1, X2): 13 | return np.linalg.norm(X1 - X2) 14 | 15 | def _select_neighbors(self, all_distances): 16 | nn_dict = dict() 17 | for key, distances in all_distances.items(): 18 | sorted_d = sorted(distances.items(), 19 | key=lambda item: item[1], 20 | reverse=True) 21 | 22 | nearest_neighbors = sorted_d[:self.n_neighbors] 23 | nn_dict[key] = nearest_neighbors 24 | return nn_dict 25 | 26 | def _compute_distances(self, X): 27 | all_distances = {} 28 | ## Compute distances between X and fitted data 29 | for i_pred, X_pred in X.iterrows(): 30 | individ_distances = {} 31 | for i_fit, X_fit in self.X.iterrows(): 32 | distance = self.euclidean_distance(X_pred, X_fit) 33 | individ_distances[i_fit] = distance 34 | all_distances[i_pred] = individ_distances 35 | return all_distances 36 | 37 | def predict(self, X): 38 | all_distances = self._compute_distances(X) 39 | nn_dict = self._select_neighbors(all_distances) 40 | 41 | # Compute predictions 42 | predictions = [] 43 | for key, neighbors in nn_dict.items(): 44 | labels = [self.y[neighbor[0]] for neighbor in neighbors] 45 | predictions.append(np.mean(labels)) 46 | return predictions 47 | 48 | if __name__=='__main__': 49 | # Create a dummy dataset using pandas 50 | df = pd.DataFrame() 51 | X1 = np.array([1,2,3,4,5,6,7,8,9,10]) 52 | X2 = np.array([4,5,6,4,5,1,7,8,9,10]) 53 | y = np.array([8,7,5,8,2,3,5,7,7,8]) 54 | df['col_1'] = X1 55 | df['col_2'] = X2 56 | df['y'] = y 57 | 58 | X = df[['col_1','col_2']] 59 | y = df['y'] 60 | 61 | knn = KNN(n_neighbors = 3) 62 | knn.fit(X, y) 63 | predictions = knn.predict(X) 64 | 65 | print(predictions) 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | -------------------------------------------------------------------------------- /From_Scratch_Implementation/LinearRegression.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | 5 | # Using Gradient Descent 6 | class LinearRegression: 7 | def __init__(self, step_size=0.2, max_steps=100): 8 | self.step_size = step_size 9 | self.max_steps = max_steps 10 | 11 | def sum_of_squared_error(self, X, y, preds): 12 | return np.sum((preds - y)**2) 13 | 14 | def fit(self, X, y): 15 | num_samples, num_features = X.shape 16 | 17 | # Initialize our weights to either zero, mean or random. 
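# This gradient-descent version starts from all-zero weights and has no separate
# intercept term; the closed-form class below adds a column of ones for the bias.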
18 | self.weights = np.zeros(X.shape[1]) 19 | 20 | # Make prediction using current weights 21 | preds = self.predict(X) 22 | 23 | # Compute the loss with the initialized weights. 24 | # You should expect the loss to be high. 25 | current_loss = self.sum_of_squared_error(X,y,preds) 26 | 27 | # Running Gradient Descent 28 | for _ in range(self.max_steps): 29 | 30 | # The partial derivative of loss with respect to weights 31 | # is the following equation 32 | dw = (1/num_samples) * np.dot(X.T, (preds - y)) 33 | 34 | # Update the weights with the step size * gradient 35 | self.weights -= self.step_size * dw 36 | 37 | preds = self.predict(X) 38 | 39 | # Compute new loss with new weights 40 | new_loss = self.sum_of_squared_error(X,y,preds) 41 | 42 | # We want the loss to **increase** with each iteration. 43 | # This is a Maximum Likelihood Estimation, we want to 44 | # maximize our likelihood function. 45 | if current_loss < new_loss: 46 | break 47 | 48 | # Replace the loss 49 | current_loss = new_loss 50 | 51 | 52 | def predict(self, X): 53 | preds = np.dot(X, self.weights) 54 | return preds 55 | 56 | 57 | # Using Linear Algebra 58 | class LinearRegression: 59 | def __init__(self): 60 | self.weights = None 61 | self.bias = None 62 | 63 | def fit(self, X, y): 64 | num_samples, num_features = X.shape 65 | X = np.concatenate((np.ones((num_samples, 1)), X), axis=1) 66 | A = np.dot(X.T, X) 67 | b = np.dot(X.T, y) 68 | self.weights = np.linalg.solve(A, b) 69 | self.bias = self.weights[0] 70 | self.weights = self.weights[1:] 71 | 72 | def predict(self, X): 73 | num_samples, num_features = X.shape 74 | X = np.concatenate((np.ones((num_samples, 1)), X), axis=1) 75 | return np.dot(X, self.weights) + self.bias 76 | 77 | 78 | 79 | 80 | 81 | 82 | X = np.array([[1, 2], [3, 4], [5, 6]]) 83 | y = np.array([1, 3, 5]) 84 | 85 | print(X) 86 | 87 | # Create an instance of the LinearRegression class 88 | model = LinearRegressionLinearAlgebra() 89 | 90 | # Fit the model to the training data 91 | model.fit(X, y) 92 | 93 | # Make predictions on new data 94 | X_new = np.array([[7, 8]]) 95 | y_pred = model.predict(X_new) 96 | print(y_pred) 97 | -------------------------------------------------------------------------------- /From_Scratch_Implementation/LogisticRegression.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class LogisticRegression: 4 | def __init__(self, step_size=0.2, max_steps=100): 5 | self.step_size = step_size 6 | self.max_steps = max_steps 7 | 8 | def sigmoid(self, z): 9 | return 1 / (1 + np.exp(-z)) 10 | 11 | def log_likelihood(self, X, y, preds): 12 | # Compute log likelihood 13 | return 1 * (np.sum(y * np.log(preds) + (1 - y) * np.log(1 - preds))) 14 | 15 | def fit(self, X, y): 16 | 17 | # Initialize our weights to either zero, mean or random. 18 | self.weights = np.zeros(X.shape[1]) 19 | 20 | # Compute the loss with the initialized weights. 21 | # You should expect the loss to be high. 
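# With all-zero weights every prediction is sigmoid(0) = 0.5, so the starting
# log-likelihood is n * log(0.5), a large negative number that gradient descent
# should steadily improve.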
22 | preds = self.predict(X) 23 | current_loss = self.log_likelihood(X, y, preds) 24 | 25 | # Running Gradient Descent 26 | for _ in range(self.max_steps): 27 | 28 | # The partial derivative of loss with respect to weights 29 | # is the following equation 30 | gradient = np.dot(X.T, (preds - y)) / y.size 31 | 32 | # Update the weights with the step size * gradient 33 | self.weights -= self.step_size * gradient 34 | 35 | # Make prediction using current weights 36 | preds = self.predict(X) 37 | 38 | # Compute new loss with new weights 39 | new_loss = self.log_likelihood(X,y,preds) 40 | 41 | # We want the loss to **increase** with each iteration. 42 | # This is a Maximum Likelihood Estimation, we want to 43 | # maximize our likelihood function. 44 | if current_loss > new_loss: 45 | break 46 | 47 | # Replace the loss 48 | current_loss = new_loss 49 | 50 | 51 | def predict(self, X): 52 | # Z = weights * inputs (X) 53 | z = np.dot(X, self.weights) 54 | 55 | # Apply sigmoid transformation 56 | preds = self.sigmoid(z) 57 | return preds -------------------------------------------------------------------------------- /From_Scratch_Implementation/NaiveBayes.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.datasets import make_classification 3 | from sklearn.model_selection import train_test_split 4 | from sklearn.metrics import accuracy_score 5 | 6 | 7 | class NaiveBayes: 8 | def fit(self, X, y): 9 | # number of samples and features 10 | n_samples, n_features = X.shape 11 | 12 | # number of unique classes 13 | self.classes = np.unique(y) 14 | n_classes = len(self.classes) 15 | 16 | # calculate prior probabilities for each class 17 | self.priors = np.zeros(n_classes) 18 | for i in range(n_classes): 19 | self.priors[i] = np.sum(y == self.classes[i]) / float(n_samples) 20 | 21 | # calculate mean and variance for each feature and class 22 | self.means = np.zeros((n_classes, n_features)) 23 | self.variances = np.zeros((n_classes, n_features)) 24 | for i in range(n_classes): 25 | X_class = X[y == self.classes[i]] 26 | self.means[i, :] = X_class.mean(axis=0) 27 | self.variances[i, :] = X_class.var(axis=0) 28 | 29 | def predict(self, X): 30 | # initialize an empty list to store predicted class labels 31 | y_pred = [] 32 | 33 | # iterate over each sample in X 34 | for sample in X: 35 | # calculate the posterior probability for each class 36 | posteriors = [] 37 | for i in range(len(self.classes)): 38 | prior = np.log(self.priors[i]) 39 | posterior = np.sum(np.log(self.calculate_likelihood(sample, self.means[i, :], self.variances[i, :]))) 40 | posterior = prior + posterior 41 | posteriors.append(posterior) 42 | # select the class with the highest posterior probability as the predicted label 43 | y_pred.append(self.classes[np.argmax(posteriors)]) 44 | 45 | return np.array(y_pred) 46 | 47 | def calculate_likelihood(self, x, mean, var): 48 | # calculate the probability of each feature given a class using Gaussian distribution 49 | exponent = np.exp(-((x - mean) ** 2 / (2 * var))) 50 | return (1 / np.sqrt(2 * np.pi * var)) * exponent 51 | 52 | # generate a synthetic dataset with 4 features and 3 classes 53 | X, y = make_classification(n_samples=1000, n_features=4, n_classes=2, random_state=42) 54 | 55 | # split the dataset into training and testing sets 56 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 57 | 58 | # instantiate the Naive Bayes classifier 59 | nb = NaiveBayes() 60 | 61 | # fit the 
model on the training set 62 | nb.fit(X_train, y_train) 63 | 64 | # make predictions on the testing set 65 | y_pred = nb.predict(X_test) 66 | 67 | # calculate the accuracy of the predictions 68 | accuracy = accuracy_score(y_test, y_pred) 69 | 70 | # print the accuracy score 71 | print("Accuracy:", accuracy) -------------------------------------------------------------------------------- /From_Scratch_Implementation/NeuralNetwork.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class NeuralNetwork: 4 | def __init__(self, num_inputs, num_hidden, num_outputs): 5 | 6 | # initialize weights and biases for the hidden layer and output layer 7 | self.hidden_weights = np.random.randn(num_inputs, num_hidden) 8 | self.hidden_bias = np.zeros((1, num_hidden)) 9 | self.output_weights = np.random.randn(num_hidden, num_outputs) 10 | self.output_bias = np.zeros((1, num_outputs)) 11 | 12 | def forward(self, inputs): 13 | # pass inputs through the hidden layer 14 | hidden_layer = np.dot(inputs, self.hidden_weights) + self.hidden_bias 15 | self.hidden_layer_activation = self.sigmoid(hidden_layer) 16 | 17 | # pass hidden layer output through the output layer 18 | output_layer = np.dot(self.hidden_layer_activation, self.output_weights) + self.output_bias 19 | output_layer_activation = self.sigmoid(output_layer) 20 | 21 | return output_layer_activation 22 | 23 | def sigmoid(self, x): 24 | return 1 / (1 + np.exp(-x)) 25 | 26 | def sigmoid_derivative(self, x): 27 | return x * (1 - x) 28 | 29 | def backward(self, inputs, targets, output): 30 | # calculate the error in the output layer 31 | output_error = targets - output 32 | output_delta = output_error * self.sigmoid_derivative(output) 33 | 34 | # calculate the error in the hidden layer 35 | hidden_error = np.dot(output_delta, self.output_weights.T) 36 | hidden_delta = hidden_error * self.sigmoid_derivative(self.hidden_layer_activation) 37 | 38 | # update the weights and biases for the output layer and hidden layer 39 | self.output_weights += np.dot(self.hidden_layer_activation.T, output_delta) 40 | self.output_bias += np.sum(output_delta, axis=0, keepdims=True) 41 | self.hidden_weights += np.dot(inputs.T, hidden_delta) 42 | self.hidden_bias += np.sum(hidden_delta, axis=0, keepdims=True) 43 | 44 | def train(self, inputs, targets, num_epochs, learning_rate): 45 | for epoch in range(num_epochs): 46 | # forward pass 47 | output = self.forward(inputs) 48 | 49 | # backward pass 50 | self.backward(inputs, targets, output) 51 | 52 | # print the mean squared error at each epoch 53 | mse = np.mean(np.square(targets - output)) 54 | print("Epoch:", epoch, "MSE:", mse) 55 | 56 | # update the learning rate at each epoch 57 | learning_rate *= 0.99 58 | 59 | def predict(self, inputs): 60 | return self.forward(inputs) 61 | 62 | # Create a neural network with 2 inputs, 3 hidden neurons, and 1 output 63 | nn = NeuralNetwork(2, 3, 1) 64 | 65 | # Create a set of training data 66 | inputs = np.array([[0, 0], [0, 1], [1, 0], [1, 1]]) 67 | targets = np.array([[0], [1], [1], [0]]) 68 | 69 | # Train the neural network for 100 epochs 70 | nn.train(inputs, targets, 100, 0.1) 71 | 72 | # Test the neural network on a new set of inputs 73 | new_inputs = np.array([[0, 0], [0, 1], [1, 0], [1, 1]]) 74 | output = nn.forward(new_inputs) 75 | 76 | # Print the output 77 | print(output) -------------------------------------------------------------------------------- /From_Scratch_Implementation/NonNegativeMatrixFactorization.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | class NMF: 5 | def __init__(self, n_components, max_iter=200, tol=1e-4): 6 | self.n_components = n_components 7 | self.max_iter = max_iter 8 | self.tol = tol 9 | 10 | def fit(self, X): 11 | n_samples, n_features = X.shape 12 | 13 | # Initialize factors with random values 14 | self.W = np.random.rand(n_samples, self.n_components) 15 | self.H = np.random.rand(self.n_components, n_features) 16 | 17 | for n_iter in range(self.max_iter): 18 | # Update H 19 | self.H *= np.dot(self.W.T, X) / (np.dot(np.dot(self.W.T, self.W), self.H) + 1e-10) 20 | 21 | # Update W 22 | XH = np.dot(self.W, self.H) 23 | WHH = np.dot(XH, self.H.T) + 1e-10 24 | self.W *= np.dot(X, self.H.T) / WHH 25 | 26 | # Compute reconstruction error 27 | err = np.mean((X - np.dot(self.W, self.H)) ** 2) 28 | 29 | # Check for convergence 30 | if n_iter % 10 == 0: 31 | print("Iteration {}: error = {:.4f}".format(n_iter, err)) 32 | if err < self.tol: 33 | print("Converged after {} iterations".format(n_iter)) 34 | break 35 | 36 | def transform(self, X): 37 | return np.dot(self.W.T, X) 38 | 39 | def fit_transform(self, X): 40 | self.fit(X) 41 | return self.transform(X) 42 | 43 | X = np.random.rand(100, 50) 44 | 45 | nmf = NMF(n_components=10, max_iter=200, tol=1e-4) 46 | W = nmf.fit_transform(X) -------------------------------------------------------------------------------- /From_Scratch_Implementation/RandomForest.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from sklearn.tree import DecisionTreeClassifier 4 | 5 | class RandomForestClassifier: 6 | def __init__(self, 7 | n_estimators=100, 8 | max_depth = None, 9 | min_samples_split=2, 10 | max_features = 'auto', 11 | max_samples = None): 12 | 13 | self.n_estimators = n_estimators 14 | self.max_depth = max_depth 15 | self.min_samples_split = min_samples_split 16 | self.trees = [] 17 | self.feature_importances = None 18 | self.max_features = max_features 19 | self.max_samples = max_samples 20 | 21 | def fit(self, X, y): 22 | 23 | ## Set an automatic max features 24 | if self.max_features == 'auto': 25 | self.max_features = int(np.sqrt(len(X.columns))) 26 | 27 | # Use the loop to create the number of estimators 28 | for i in range(self.n_estimators): 29 | tree = DecisionTreeClassifier(max_depth = self.max_depth, 30 | min_samples_split = self.min_samples_split, 31 | max_features = self.max_features) 32 | 33 | # Default to 70% 34 | if self.max_samples is None: 35 | self.max_samples = int(X.shape[0] * 0.7) 36 | 37 | # Define sub-sample 38 | indices = np.random.choice(X.shape[0], self.max_samples, replace = True) 39 | tree.fit(X.iloc[indices], y.iloc[indices]) 40 | 41 | self.trees.append(tree) 42 | 43 | def feature_importances_(self): 44 | self.feature_importances = np.zeros(X.shape[1]) 45 | for tree in self.trees: 46 | self.feature_importance += tree.feature_importances_ 47 | 48 | self.feature_importances /= self.n_estimators 49 | 50 | def predict(self, X): 51 | all_preds = [] 52 | for tree in self.trees: 53 | preds = tree.predict(X) 54 | all_preds.append(preds) 55 | return pd.DataFrame(all_preds).mean().values 56 | 57 | 58 | if __name__=='__main__': 59 | 60 | ## Create dummy dataframe 61 | df = pd.DataFrame() 62 | X1 = np.array([1,2,3,4,5,6,7,8,9,10]) 63 | X2 = np.array([4,5,6,4,5,1,7,8,9,10]) 64 | y = np.array([0,1,1,0,0,1,0,1,0,1]) 65 | df['col_1'] = X1 66 | 
df['col_2'] = X2 67 | df['y'] = y 68 | 69 | X = df[['col_1','col_2']] 70 | y = df['y'] 71 | 72 | dt = RandomForestClassifier() 73 | dt.fit(X, y) 74 | preds = dt.predict(X) 75 | 76 | print(preds) 77 | 78 | 79 | 80 | 81 | 82 | 83 | -------------------------------------------------------------------------------- /From_Scratch_Implementation/Regularization.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class LogisticRegression: 4 | def __init__(self, step_size=0.2, max_steps=100, reg_lambda = 0): 5 | self.step_size = step_size 6 | self.max_steps = max_steps 7 | self.reg_lambda = reg_lambda 8 | 9 | def sigmoid(self, z): 10 | return 1 / (1 + np.exp(-z)) 11 | 12 | def log_likelihood(self, X, y, preds, ): 13 | # Compute the regularization term 14 | reg_term = self.reg_lambda/(2*len(y)) * np.sum(self.weights ** 2) 15 | 16 | # Compute the loss with the regularization term 17 | return (np.sum(y * np.log(preds) + (1 - y) * np.log(1 - preds)))/(len(y)) + reg_term 18 | 19 | def fit(self, X, y): 20 | 21 | # Initialize our weights to either zero, mean or random. 22 | self.weights = np.zeros(X.shape[1]) 23 | 24 | # Compute the loss with the initialized weights. 25 | # You should expect the loss to be high. 26 | preds = self.predict(X) 27 | current_loss = self.log_likelihood(X, y, preds) 28 | 29 | # Running Gradient Descent 30 | for _ in range(self.max_steps): 31 | 32 | # Calculate the gradient for regularization 33 | reg_gradient = self.regularization_param * self.weights 34 | 35 | # The partial derivative of loss with respect to weights 36 | # The derivative of the regularization 37 | gradient = np.dot(X.T, (preds - y)) / y.size + self.reg_term/y.size * self.weights 38 | 39 | # Update the weights with the step size * gradient 40 | self.weights -= self.step_size * gradient 41 | 42 | # Make prediction using current weights 43 | preds = self.predict(X) 44 | 45 | # Compute new loss with new weights 46 | new_loss = self.log_likelihood(X,y,preds) 47 | 48 | # We want the loss to **increase** with each iteration. 49 | # This is a Maximum Likelihood Estimation, we want to 50 | # maximize our likelihood function. 
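# (If this update made the regularized log-likelihood worse, we stop iterating.)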
51 | if current_loss > new_loss: 52 | break 53 | 54 | # Replace the loss 55 | current_loss = new_loss 56 | 57 | 58 | def predict(self, X): 59 | # Z = weights * inputs (X) 60 | z = np.dot(X, self.weights) 61 | 62 | # Apply sigmoid transformation 63 | preds = self.sigmoid(z) 64 | return preds -------------------------------------------------------------------------------- /From_Scratch_Implementation/SupportVectorMachine.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | # Define the SVM class 4 | class SVM: 5 | # Initialize the SVM with the given hyperparameters 6 | def __init__(self, C=1.0, kernel='linear', degree=3, gamma='auto'): 7 | self.C = C 8 | self.kernel = kernel 9 | self.degree = degree 10 | self.gamma = gamma 11 | 12 | # Define the function to fit the training data 13 | def fit(self, X, y): 14 | # Initialize the number of support vectors 15 | self.num_support_vectors = 0 16 | 17 | # Calculate the number of training examples 18 | self.num_examples = X.shape[0] 19 | 20 | # Initialize the weights and bias 21 | self.w = np.zeros(X.shape[1]) 22 | self.b = 0 23 | 24 | # Initialize the Lagrange multipliers 25 | self.lagrange_multipliers = np.zeros(self.num_examples) 26 | 27 | # Set the convergence criteria 28 | criteria = (self.C * np.eye(self.num_examples)).tolist() 29 | 30 | # Optimize the Lagrange multipliers 31 | while True: 32 | num_changed_lagrange_multipliers = 0 33 | for i in range(self.num_examples): 34 | # Calculate the error 35 | error = 0 36 | for j in range(self.num_examples): 37 | # Calculate the kernel 38 | if self.kernel == 'linear': 39 | kernel = np.dot(X[i], X[j]) 40 | elif self.kernel == 'poly': 41 | kernel = (np.dot(X[i], X[j]) + 1) ** self.degree 42 | else: 43 | kernel = np.exp(-self.gamma * 44 | np.sum(np.square(X[i] - X[j]))) 45 | 46 | # Calculate the error 47 | error += self.lagrange_multipliers[j] * y[j] * kernel 48 | 49 | error -= y[i] 50 | 51 | # Check if the Lagrange multiplier is valid 52 | if ((y[i] * error < -criteria[i][i]) and 53 | (self.lagrange_multipliers[i] < self.C)) or \ 54 | ((y[i] * error > criteria[i][i]) and 55 | (self.lagrange_multipliers[i] > 0)): 56 | 57 | # Select the second Lagrange multiplier randomly 58 | j = np.random.randint(0, self.num_examples) 59 | while j == i: 60 | j = np.random.randint(0, self.num_examples) 61 | 62 | # Calculate the error 63 | error_j = 0 64 | for k in range(self.num_examples): 65 | # Calculate the kernel 66 | if self.kernel == 'linear': 67 | kernel = np.dot(X[j], X[k]) 68 | elif self.kernel == 'poly': 69 | kernel = (np.dot(X[j], X[k]) + 1) ** self.degree 70 | else: 71 | kernel = np.exp(-self.gamma * 72 | np.sum(np.square(X[j] - X[k]))) 73 | 74 | # Calculate the error 75 | error_j += self.lagrange_multipliers[k] * \ 76 | y[k] * kernel 77 | 78 | error_j -= y[j] 79 | 80 | # Save the Lagrange multipliers 81 | lagrange_multipliers_i_old = self.lagrange_multipliers[i] 82 | lagrange_multipliers_j_old = self.lagrange_multipliers[j] 83 | 84 | # Compute the bounds for the Lagrange multipliers 85 | if y[i] != y[j]: 86 | lower_bound = max(0, self.lagrange_multipliers[j] - 87 | self.lagrange_multipliers[i]) 88 | upper_bound = min(self.C, 89 | self.C + self.lagrange_multipliers[j] - 90 | self.lagrange_multipliers[i]) 91 | else: 92 | lower_bound = max(0, self.lagrange_multipliers[j] + 93 | self.lagrange_multipliers[i] - self.C) 94 | upper_bound = min(self.C, 95 | self.lagrange_multipliers[j] + 96 | self.lagrange_multipliers[i]) 97 | 98 | # Compute the Lagrange 
multiplier 99 | if lower_bound == upper_bound: 100 | continue 101 | 102 | # Calculate the kernel 103 | if self.kernel == 'linear': 104 | kernel = np.dot(X[i], X[j]) 105 | elif self.kernel == 'poly': 106 | kernel = (np.dot(X[i], X[j]) + 1) ** self.degree 107 | else: 108 | kernel = np.exp(-self.gamma * 109 | np.sum(np.square(X[i] - X[j]))) 110 | 111 | # Compute the new value of the second Lagrange multiplier (simplified step size; standard SMO divides by K_ii + K_jj - 2*K_ij instead) 112 | lagrange_multipliers_j_new = self.lagrange_multipliers[j] + \ 113 | y[j] * (error - error_j) / (kernel + 114 | self.lagrange_multipliers[i] - 115 | self.lagrange_multipliers[j]) 116 | 117 | # Clip the Lagrange multiplier to its bounds 118 | lagrange_multipliers_j_new = min(max( 119 | lagrange_multipliers_j_new, lower_bound), upper_bound) 120 | 121 | # Skip the update if the change is negligible 122 | if abs(lagrange_multipliers_j_new - 123 | lagrange_multipliers_j_old) < 1e-5: 124 | continue 125 | 126 | # Compute the new value of the first Lagrange multiplier 127 | lagrange_multipliers_i_new = self.lagrange_multipliers[i] + \ 128 | y[i] * y[j] * (lagrange_multipliers_j_old - 129 | lagrange_multipliers_j_new) 130 | 131 | # Update the Lagrange multipliers 132 | self.lagrange_multipliers[i] = lagrange_multipliers_i_new 133 | self.lagrange_multipliers[j] = lagrange_multipliers_j_new 134 | 135 | # Update the error 136 | error += y[i] * y[j] * (lagrange_multipliers_j_old - 137 | lagrange_multipliers_j_new) * kernel 138 | 139 | # Update the weights and bias 140 | self.w += (lagrange_multipliers_i_new - 141 | lagrange_multipliers_i_old) * y[i] * X[i] + \ 142 | (lagrange_multipliers_j_new - 143 | lagrange_multipliers_j_old) * y[j] * X[j] 144 | self.b += (lagrange_multipliers_j_new - 145 | lagrange_multipliers_j_old) * y[j] 146 | 147 | # Increment the number of changed Lagrange multipliers 148 | num_changed_lagrange_multipliers += 1 149 | 150 | # Break if no Lagrange multiplier has changed 151 | if num_changed_lagrange_multipliers == 0: 152 | break 153 | 154 | # Store the support vectors together with their labels and Lagrange multipliers 155 | support_mask = self.lagrange_multipliers > 0 156 | self.support_vectors = np.asarray(X)[support_mask] 157 | self.support_vector_labels = np.asarray(y)[support_mask] 158 | self.support_vector_multipliers = self.lagrange_multipliers[support_mask] 159 | # (only examples with non-zero multipliers contribute to the decision function) 160 | 161 | # Compute the number of support vectors 162 | self.num_support_vectors = len(self.support_vectors) 163 | 164 | # Define the function to predict the labels 165 | def predict(self, X): 166 | # Initialize the predictions 167 | predictions = np.zeros(X.shape[0]) 168 | 169 | # Compute the predictions 170 | for i in range(X.shape[0]): 171 | prediction = 0 172 | for j in range(self.num_support_vectors): 173 | # Calculate the kernel 174 | if self.kernel == 'linear': 175 | kernel = np.dot(X[i], self.support_vectors[j]) 176 | elif self.kernel == 'poly': 177 | kernel = (np.dot(X[i], self.support_vectors[j]) + 1) ** \ 178 | self.degree 179 | else: 180 | kernel = np.exp(-self.gamma * 181 | np.sum(np.square(X[i] - self.support_vectors[j]))) 182 | 183 | # Calculate the prediction from the support vectors 184 | prediction += self.support_vector_multipliers[j] * \ 185 | self.support_vector_labels[j] * kernel 186 | 187 | prediction += self.b 188 | predictions[i] = np.sign(prediction) 189 | 190 | return predictions -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MLAlgorithmsCourse 2 | This is the public repository for the 365 Data Science ML Algorithms Course by Ken Jee and Jeff Li. In this course, we walk you through the ins and outs of each ML Algorithm.
We did not build this course ourselves. We stood on the shoulders of giants. We think it's only fair to credit all the resources we used to build this course, as we could not have created this course without the help of the ML community. This course includes the following: 3 | - Detailed explanations of each ML algorithm (listed below) with specifics on how they work, pros and cons, when to use them, and data preprocessing needed for each one. 4 | - Two projects using all of the classification and regression algorithms with detailed instructions on parameter tuning 5 | - Resources that we used to build the course so you have additional details on each topic 6 | 7 | **Use the discount link for our 3 course bundle (limited time 68% off!)** --> [The Machine Learning A-Z Bundle](https://bit.ly/3NAZ5oP) 8 | ## Coding Project Examples 9 | - [Regression Project - Kaggle](https://www.kaggle.com/code/kenjee/exhaustive-regression-parameter-tuning) 10 | - [Classification Project - Kaggle](https://www.kaggle.com/code/kenjee/exhaustive-classification-parameter-tuning) 11 | 12 | ## Flashcards 13 | Please go to Ankiweb.net to download Anki and to sign up for an account. Please go [here](https://github.com/PlayingNumbers/ML_Algorithms_Course/blob/main/365_Data_Science_ML_Algorithms_A_to_Z.apkg) to download the flashcards for this course. 14 | 15 | ## 1. Linear Regression 16 | - [Linear Regression, Clearly Explained!!! by StatQuest](https://www.youtube.com/watch?v=nk2CQITm_eo&ab_channel=StatQuestwithJoshStarmer) 17 | - [Linear Regression by Jim Frost](https://statisticsbyjim.com/regression/linear-regression/) 18 | - [7 Classical Assumptions of Ordinary Least Squares (OLS) Linear Regression](https://statisticsbyjim.com/regression/ols-linear-regression-assumptions/) 19 | - [Gauss-Markov Theorem](https://statisticsbyjim.com/regression/gauss-markov-theorem-ols-blue/) 20 | - [Linear Regression — Detailed View](https://towardsdatascience.com/linear-regression-detailed-view-ea73175f6e86) 21 | - [Building Linear Regression (Least Squares) with Linear Algebra](https://towardsdatascience.com/building-linear-regression-least-squares-with-linear-algebra-2adf071dd5dd) 22 | - [Linear Regression using Gradient Descent](https://towardsdatascience.com/linear-regression-using-gradient-descent-97a6c8700931) 23 | 24 | ## 2. Regularization 25 | - [what-are-l1-l2-and-elastic-net-regularization-in-neural-networks](https://github.com/christianversloot/machine-learning-articles/blob/main/what-are-l1-l2-and-elastic-net-regularization-in-neural-networks.md) 26 | - [When will L1 regularization work better than L2 and vice versa?](https://stats.stackexchange.com/questions/184019/when-will-l1-regularization-work-better-than-l2-and-vice-versa) 27 | - [What is the difference between L1 and L2 regularization? How does it solve the problem of overfitting? Which regularizer to use and when?](https://www.quora.com/What-is-the-difference-between-L1-and-L2-regularization-How-does-it-solve-the-problem-of-overfitting-Which-regularizer-to-use-and-when) 28 | - [What is elastic net regularization, and how does it solve the drawbacks of Ridge (𝐿2 29 | ) and Lasso (𝐿1 30 | )?](https://stats.stackexchange.com/questions/184029/what-is-elastic-net-regularization-and-how-does-it-solve-the-drawbacks-of-ridge) 31 | - [Ridge, LASSO, and ElasticNet Regression](https://towardsdatascience.com/ridge-lasso-and-elasticnet-regression-b1f9c00ea3a3) 32 | 33 | ## 3. 
Logistic Regression 34 | - [The Intuitive Explanation of Logistic Regression](https://towardsdatascience.com/the-intuitive-explanation-of-logistic-regression-a0375b1bee54) 35 | - [StatQuest: Logistic Regression](https://www.youtube.com/watch?v=yIYKR4sgzI8&ab_channel=StatQuestwithJoshStarmer) 36 | - [Logistic Regression by Andrew Ng](https://www.youtube.com/watch?v=-la3q9d7AKQ&ab_channel=ArtificialIntelligence-AllinOne) 37 | - [Logistic Regression by Amherst College](https://nhorton.people.amherst.edu/ips9/IPS_09_Ch14.pdf) 38 | - [Intuition behind Log-loss score](https://towardsdatascience.com/intuition-behind-log-loss-score-4e0c9979680a) 39 | - [Log Loss Function by Alex Dyakonov](https://dasha.ai/en-us/blog/log-loss-function) 40 | 41 | ## 4. Gradient Descent 42 | - [Gradient Descent From Scratch by Analytics Vidhya](https://www.analyticsvidhya.com/blog/2021/05/gradient-descent-from-scratch-complete-intuition/#:~:text=The%20intuition%20behind%20Gradient%20Descent&text=We%20have%20to%20find%20the,between%20actual%20and%20predicted%20values.) 43 | - [Gradient descent, how neural networks learn](https://www.youtube.com/watch?v=IHZwWFHWa-w&ab_channel=3Blue1Brown) 44 | - [Stochastic Gradient Descent, Clearly Explained!!! by Josh Starmer](https://www.youtube.com/watch?v=vMh0zPT0tLI&ab_channel=StatQuestwithJoshStarmer) 45 | - [Gradient Descent Intuition — How Machines Learn](https://medium.com/x8-the-ai-community/gradient-descent-intuition-how-machines-learn-d29ad7464453) 46 | - [The Math and Intuition Behind Gradient Descent by Suraj Bansal](https://medium.datadriveninvestor.com/the-math-and-intuition-behind-gradient-descent-13c45f367a11) 47 | - [Batch gradient descent versus stochastic gradient descent](https://stats.stackexchange.com/questions/49528/batch-gradient-descent-versus-stochastic-gradient-descent) 48 | 49 | ## 5. Decision Tree 50 | - [Decision Trees Explained by James Thorn](https://towardsdatascience.com/decision-trees-explained-3ec41632ceb6) 51 | - [A Guide to Decision Trees for Beginners](https://www.kaggle.com/code/vipulgandhi/a-guide-to-decision-trees-for-beginners) 52 | - [Decision and Classification Trees, Clearly Explained!!! by Josh Starmer](https://www.youtube.com/watch?v=_L39rN6gz7Y&ab_channel=StatQuestwithJoshStarmer) 53 | - [Information Gain and Mutual Information for Machine Learning by Jason Brownlee](https://machinelearningmastery.com/information-gain-and-mutual-information/#:~:text=Mutual%20Information%20Related%3F-,What%20Is%20Information%20Gain%3F,samples%2C%20and%20hence%20less%20surprise.) 54 | - [A Simple Explanation of Information Gain and Entropy by Victor Zhou](https://victorzhou.com/blog/information-gain/) 55 | - [How to program a decision tree in Python from 0](https://anderfernandez.com/en/blog/code-decision-tree-python-from-scratch/) 56 | 57 | 58 | ## 6. 
Random Forest 59 | - [Building Intuition for Random Forests by Rishi Sidhu](https://medium.com/x8-the-ai-community/building-intuition-for-random-forests-76d36fa28c5e) 60 | - [An Introduction to Random Forest Algorithm for beginners](https://www.analyticsvidhya.com/blog/2021/10/an-introduction-to-random-forest-algorithm-for-beginners/) 61 | - [Feature Importance in Random Forest](https://mljar.com/blog/feature-importance-in-random-forest/) 62 | - [Detailed Explanation of Random Forests Features importance Bias](https://medium.com/@eng.mohammed.saad.18/detailed-explanation-of-random-forests-features-importance-bias-8755d26ac3bc) 63 | - [Random Forest: A Complete Guide for Machine Learning by Niklas Donges](https://builtin.com/data-science/random-forest-algorithm) 64 | - [Random Forest Simple Explanation by Will Koehrsen](https://williamkoehrsen.medium.com/random-forest-simple-explanation-377895a60d2d) 65 | - [Why Choose Random Forest and Not Decision Trees](https://towardsai.net/p/machine-learning/why-choose-random-forest-and-not-decision-trees) 66 | - [When to use Random Forest](https://datascience.stackexchange.com/questions/54751/when-to-use-random-forest) 67 | 68 | 69 | ## 7. Gradient Boosted Trees 70 | - [The Intuition Behind Gradient Boosting & XGBoost by Bobby Tan](https://towardsdatascience.com/the-intuition-behind-gradient-boosting-xgboost-6d5eac844920) 71 | - [Gradient Boosting Algorithm: A Complete Guide for Beginners](https://www.analyticsvidhya.com/blog/2021/09/gradient-boosting-algorithm-a-complete-guide-for-beginners/) 72 | - [Gradient Boosting Trees vs. Random Forests](https://www.baeldung.com/cs/gradient-boosting-trees-vs-random-forests#:~:text=4.3.-,Advantages%20and%20Disadvantages,and%20start%20modeling%20the%20noise.) 73 | - [Gradient Boosting In Classification: Not a Black Box Anymore!](https://blog.paperspace.com/gradient-boosting-for-classification/) 74 | - [A Gentle Introduction to the Gradient Boosting Algorithm for Machine Learning](https://machinelearningmastery.com/gentle-introduction-gradient-boosting-algorithm-machine-learning/) 75 | 76 | ## 8. XGBoost 77 | - [XGBoost Paper](https://arxiv.org/abs/1603.02754) 78 | - [A Gentle Introduction to XGBoost for Applied Machine Learning](https://machinelearningmastery.com/gentle-introduction-xgboost-applied-machine-learning/) 79 | - [XGBoost A Scalable Tree Boosting System by Tianqi Chen](https://www.youtube.com/watch?v=Vly8xGnNiWs&ab_channel=RealDataScienceUSA%28formerlyDataScience.LA%29) 80 | - [CatBoost vs. LightGBM vs. XGBoost](https://towardsdatascience.com/catboost-vs-lightgbm-vs-xgboost-c80f40662924) 81 | - [XGBoost, LightGBM or CatBoost — which boosting algorithm should I use?](https://medium.com/riskified-technology/xgboost-lightgbm-or-catboost-which-boosting-algorithm-should-i-use-e7fda7bb36bc) 82 | 83 | ## 9. 
K-Nearest Neighbors(KNN) 84 | - [KNN algorithm: Introduction to K-Nearest Neighbors Algorithm for Regression](https://www.analyticsvidhya.com/blog/2018/08/k-nearest-neighbor-introduction-regression-python/) 85 | - [K-Nearest Neighbors 👨‍👩‍👧‍👦](https://www.romaglushko.com/blog/k-nearest-neighbors/) 86 | - [Pros And Cons Of The K-Nearest Neighbors (KNN) Algorithm](https://roboticsbiz.com/pros-and-cons-of-the-k-nearest-neighbors-knn-algorithm/) 87 | - [StatQuest: K-nearest neighbors, Clearly Explained](https://www.youtube.com/watch?v=HVXime0nQeI&ab_channel=StatQuestwithJoshStarmer) 88 | - [The KNN Algorithm – Explanation, Opportunities, Limitations](https://neptune.ai/blog/knn-algorithm-explanation-opportunities-limitations#:~:text=KNN%20is%20most%20useful%20when,of%20desired%20precision%20and%20accuracy.) 89 | - [K-Nearest Neighbors (KNN) Classification with scikit-learn](https://www.datacamp.com/tutorial/k-nearest-neighbor-classification-scikit-learn) 90 | - [Develop k-Nearest Neighbors in Python From Scratch](https://machinelearningmastery.com/tutorial-to-implement-k-nearest-neighbors-in-python-from-scratch/) 91 | 92 | ## 10. K-Means Clustering 93 | - [Elbow Method for Finding the Optimal Number of Clusters in K-Means](https://www.analyticsvidhya.com/blog/2021/01/in-depth-intuition-of-k-means-clustering-algorithm-in-machine-learning/) 94 | - [Intuition Behind K-Means](https://pianalytix.com/intuition-behind-k-means/) 95 | - [k-Means Advantages and Disadvantages](https://developers.google.com/machine-learning/clustering/algorithm/advantages-disadvantages) 96 | - [Difference between K means and Hierarchical Clustering](https://www.geeksforgeeks.org/difference-between-k-means-and-hierarchical-clustering/#) 97 | - [Learn K-Means and Hierarchical Clustering Algorithms in 15 minutes](https://medium.com/sfu-cspmp/learn-k-means-and-hierarchical-clustering-algorithms-in-15-minute-221661bbec9e) 98 | 99 | ## 11. Hierarchical Clustering 100 | - [Hierarchical clustering explained](https://towardsdatascience.com/hierarchical-clustering-explained-e59b13846da8) 101 | - [HOW THE HIERARCHICAL CLUSTERING ALGORITHM WORKS](https://dataaspirant.com/hierarchical-clustering-algorithm/) 102 | - [How to understand the drawbacks of Hierarchical Clustering?](https://stats.stackexchange.com/questions/183873/how-to-understand-the-drawbacks-of-hierarchical-clustering) 103 | - [Choosing the right linkage method for hierarchical clustering](https://stats.stackexchange.com/questions/195446/choosing-the-right-linkage-method-for-hierarchical-clustering) 104 | - [Agglomerative Hierarchical Clustering](https://online.stat.psu.edu/stat505/lesson/14/14.4) 105 | - [Lecture 3: Hierarchical Methods](https://cse.buffalo.edu/~jing/cse601/fa12/materials/clustering_hierarchical.pdf) 106 | - [Hierarchical Clustering in Python](https://blog.quantinsti.com/hierarchical-clustering-python/) 107 | 108 | ## 12. Support Vector Machine 109 | - [Support Vector Machines: An Intuitive Approach](https://www.kdnuggets.com/2022/08/support-vector-machines-intuitive-approach.html) 110 | - [Support Vector Machine(SVM): A Complete guide for beginners](https://www.analyticsvidhya.com/blog/2021/10/support-vector-machinessvm-a-complete-guide-for-beginners/) 111 | - [Deep Dive into Support Vector Machine](https://towardsdatascience.com/deep-dive-into-support-vector-machine-654c8d517103) 112 | - [Support Vector Machines Part 1 (of 3): Main Ideas!!! 
by Josh Starmer](https://www.youtube.com/watch?v=efR1C6CvhmE&ab_channel=StatQuestwithJoshStarmer) 113 | - [SVM and Kernel SVM](https://towardsdatascience.com/svm-and-kernel-svm-fed02bef1200) 114 | - [Kernel Functions-Introduction to SVM Kernel & Examples](https://data-flair.training/blogs/svm-kernel-functions/) 115 | 116 | ## 13. Artificial Neural Nets 117 | - [Deep Learning vs. Classical ML](https://towardsdatascience.com/deep-learning-vs-classical-machine-learning-9a42c6d48aa) 118 | - [Backpropagation](https://brilliant.org/wiki/backpropagation/) 119 | - [Neural Networks by Analogy with Linear Regression](https://joshuagoings.com/2020/05/05/neural-network/) 120 | - [Neural Networks and Deep Learning](http://neuralnetworksanddeeplearning.com/) 121 | - [Colah's Blog](http://colah.github.io/) 122 | - [CNN's for Deep Learning](https://python.plainenglish.io/convolution-neural-network-cnn-in-deep-learning-77f5ab457166) 123 | 124 | ## 14. Collaborative Filtering 125 | - [Non-negative matrix factorization for recommendation systems](https://medium.com/logicai/non-negative-matrix-factorization-for-recommendation-systems-985ca8d5c16c) 126 | - [Collaborative Filtering Example - Google](https://developers.google.com/machine-learning/recommendation/collaborative/basics) 127 | - [Scikit Learn Decomposition](https://github.com/scikit-learn/scikit-learn/blob/ef5cb84a/sklearn/decomposition/nmf.py#L1235) 128 | - [Quick Intro Nonnegative Matrix Factorization](https://heather.cs.ucdavis.edu/NMFTutorial.pdf) 129 | - [Algorithms for Non-Negative Matrix Factorization](https://proceedings.neurips.cc/paper/2000/file/f9d1152547c0bde01830b7e8bd60024c-Paper.pdf) 130 | - [Optimal number of latent factors in non-negative matrix factorization?](https://stats.stackexchange.com/questions/111205/how-to-choose-an-optimal-number-of-latent-factors-in-non-negative-matrix-factori) 131 | - [How to Use Cross-Validation for Matrix Completion](https://towardsdatascience.com/how-to-use-cross-validation-for-matrix-completion-2b14103d2c4c) 132 | - [Matrix Factorization for Movie Recommendations in Python](https://beckernick.github.io/matrix-factorization-recommender/) 133 | - [NMF — A visual explainer and Python Implementation](https://towardsdatascience.com/nmf-a-visual-explainer-and-python-implementation-7ecdd73491f8) 134 | - [Recommendation System Series Part 4: The 7 Variants of Matrix Factorization For Collaborative Filtering](https://towardsdatascience.com/recsys-series-part-4-the-7-variants-of-matrix-factorization-for-collaborative-filtering-368754e4fab5) 135 | - [Collaborative Filtering: Matrix Factorization Recommender System](https://www.jiristodulka.com/post/recsys_cf/) 136 | -------------------------------------------------------------------------------- /Regression_Example/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.24.2 2 | pandas==2.0.0 3 | scikit_learn==1.2.2 4 | scipy==1.10.1 5 | statsmodels==0.13.5 6 | xgboost==1.7.5 7 | --------------------------------------------------------------------------------