├── multiple_linear_regression.py
├── multivariate_adaptive_regression_splines.py
├── simple_linear_regression.py
├── lasso_regression.py
├── ridge_regression.py
├── quadratic_discriminant_analysis.py
├── logistic_regression.py
├── mtcars.csv
├── linear_discriminant_analysis
├── partial_least_squares.py
└── principal_components_regression.py

--------------------------------------------------------------------------------
/multiple_linear_regression.py:
--------------------------------------------------------------------------------
#ENTER THE DATA
import pandas as pd

#create data
df = pd.DataFrame({'hours': [1, 2, 2, 4, 2, 1, 5, 4, 2, 4, 4, 3, 6, 5, 3, 4, 6, 2, 1, 2],
                   'exams': [1, 3, 3, 5, 2, 2, 1, 1, 0, 3, 4, 3, 2, 4, 4, 4, 5, 1, 0, 1],
                   'score': [76, 78, 85, 88, 72, 69, 94, 94, 88, 92, 90, 75, 96, 90, 82, 85, 99, 83, 62, 76]})

#view data
print(df)

#FIT THE MODEL
import statsmodels.api as sm

#define response variable
y = df['score']

#define predictor variables
x = df[['hours', 'exams']]

#add constant to predictor variables
x = sm.add_constant(x)

#fit linear regression model
model = sm.OLS(y, x).fit()

#view model summary
print(model.summary())
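
#OPTIONAL: a minimal sketch of predicting a new score with the fitted model.
#The values (3 hours studied, 2 prep exams) are made up for illustration;
#the row must follow the column order of x above (const, hours, exams).
new = [[1, 3, 2]]
print(model.predict(new))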

--------------------------------------------------------------------------------
/multivariate_adaptive_regression_splines.py:
--------------------------------------------------------------------------------
#install the py-earth package first (run in a shell, not inside Python):
#pip install sklearn-contrib-py-earth

from numpy import mean
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.datasets import make_regression
from pyearth import Earth

#create fake regression data
X, y = make_regression(n_samples=5000, n_features=15, n_informative=10, noise=0.5, random_state=5)

#define the model
model = Earth()

#define the evaluation procedure
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

#evaluate the model and collect results
n_scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)

#report performance (mean of the negative MAE scores across folds)
print(mean(n_scores))

--------------------------------------------------------------------------------
/simple_linear_regression.py:
--------------------------------------------------------------------------------
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt

#create data
df = pd.DataFrame({'hours': [1, 2, 4, 5, 5, 6, 6, 7, 8, 10, 11, 11, 12, 12, 14],
                   'score': [64, 66, 76, 73, 74, 81, 83, 82, 80, 88, 84, 82, 91, 93, 89]})

#create scatterplot
plt.scatter(df.hours, df.score)
plt.title('Hours studied vs. Exam Score')
plt.xlabel('Hours')
plt.ylabel('Score')
plt.show()

#create boxplot
df.boxplot(column=['score'])
plt.show()

#fit simple linear regression model
y = df['score']
x = df[['hours']]
x = sm.add_constant(x)
model = sm.OLS(y, x).fit()

#view model summary
print(model.summary())

#produce residual plots
fig = plt.figure(figsize=(12,8))
fig = sm.graphics.plot_regress_exog(model, 'hours', fig=fig)
plt.show()

#produce Q-Q plot
res = model.resid
fig = sm.qqplot(res, fit=True, line="45")
plt.show()
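
#OPTIONAL: a minimal sketch of pulling the intercept and slope out of the
#fitted results object; params is indexed by the column names of x above.
print(model.params['const'])   #intercept
print(model.params['hours'])   #slope per additional hour studied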

--------------------------------------------------------------------------------
/lasso_regression.py:
--------------------------------------------------------------------------------
import pandas as pd
from numpy import arange
from sklearn.linear_model import LassoCV
from sklearn.model_selection import RepeatedKFold

#specify URL where data is located
url = "https://raw.githubusercontent.com/Statology/Python-Guides/main/mtcars.csv"

#read in data
data_full = pd.read_csv(url)

#select subset of data
data = data_full[["mpg", "wt", "drat", "qsec", "hp"]]

#view first six rows of data
print(data[0:6])

#define predictor and response variables
X = data[["mpg", "wt", "drat", "qsec"]]
y = data["hp"]

#define cross-validation method to evaluate model
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

#define model (alpha=0 is plain OLS and does not converge well with the lasso
#solver, so the grid starts at 0.01 rather than 0)
model = LassoCV(alphas=arange(0.01, 1, 0.01), cv=cv, n_jobs=-1)

#fit model
model.fit(X, y)

#display lambda that produced the lowest test MSE
print(model.alpha_)

#define new observation
new = [24, 2.5, 3.5, 18.5]

#predict hp value using lasso regression model
print(model.predict([new]))
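
#OPTIONAL: a minimal sketch of inspecting which coefficients the lasso kept
#and which it shrank toward zero at the chosen alpha.
for name, coef in zip(X.columns, model.coef_):
    print(name, coef)
print('intercept:', model.intercept_)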

--------------------------------------------------------------------------------
/ridge_regression.py:
--------------------------------------------------------------------------------
#IMPORT NECESSARY PACKAGES
import pandas as pd
from numpy import arange
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import RepeatedKFold

#LOAD DATA
url = "https://raw.githubusercontent.com/Statology/Python-Guides/main/mtcars.csv"
data_full = pd.read_csv(url)
data = data_full[["mpg", "wt", "drat", "qsec", "hp"]]
print(data[0:6])

#FIT RIDGE REGRESSION MODEL
#define predictor and response variables
X = data[["mpg", "wt", "drat", "qsec"]]
y = data["hp"]

#define cross-validation method to evaluate model
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

#define model (RidgeCV requires strictly positive alphas, so the grid starts at 0.01)
model = RidgeCV(alphas=arange(0.01, 1, 0.01), cv=cv, scoring='neg_mean_absolute_error')

#fit model
model.fit(X, y)

#display lambda that produced the lowest test MSE
print(model.alpha_)

#USE MODEL TO PREDICT RESPONSE VALUE OF NEW OBSERVATIONS
#define new observation
new = [24, 2.5, 3.5, 18.5]

#predict hp value using ridge regression model
print(model.predict([new]))

--------------------------------------------------------------------------------
/quadratic_discriminant_analysis.py:
--------------------------------------------------------------------------------
#LOAD NECESSARY LIBRARIES
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn import datasets
import pandas as pd
import numpy as np

#LOAD AND VIEW IRIS DATASET
iris = datasets.load_iris()
df = pd.DataFrame(data = np.c_[iris['data'], iris['target']],
                  columns = iris['feature_names'] + ['target'])
df['species'] = pd.Categorical.from_codes(iris.target, iris.target_names)
df.columns = ['s_length', 's_width', 'p_length', 'p_width', 'target', 'species']
print(df.head())
print(len(df.index))

#DEFINE PREDICTOR AND RESPONSE VARIABLES
X = df[['s_length', 's_width', 'p_length', 'p_width']]
y = df['species']

#FIT QDA MODEL
model = QuadraticDiscriminantAnalysis()
model.fit(X, y)

#DEFINE METHOD TO EVALUATE MODEL
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

#EVALUATE MODEL
scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
print(np.mean(scores))

#USE MODEL TO MAKE PREDICTION ON NEW OBSERVATION
new = [5, 3, 1, .4]
print(model.predict([new]))

--------------------------------------------------------------------------------
/logistic_regression.py:
--------------------------------------------------------------------------------
#IMPORT PACKAGES
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import matplotlib.pyplot as plt

#LOAD DATA
url = "https://raw.githubusercontent.com/Statology/Python-Guides/main/default.csv"
data = pd.read_csv(url)

#view first six rows of dataset
print(data[0:6])

#find total observations in dataset
print(len(data.index))

#FIT LOGISTIC REGRESSION MODEL
X = data[['student', 'balance', 'income']]
y = data['default']

#split the dataset into training (70%) and testing (30%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

#instantiate the model
log_regression = LogisticRegression()

#fit the model using the training data
log_regression.fit(X_train, y_train)

#use model to make predictions on test data
y_pred = log_regression.predict(X_test)

#MODEL DIAGNOSTICS
cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
print(cnf_matrix)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

#plot ROC curve
y_pred_proba = log_regression.predict_proba(X_test)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)
plt.plot(fpr, tpr, label="AUC=" + str(auc))
plt.legend(loc=4)
plt.show()
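
#OPTIONAL: a minimal sketch of a fuller per-class summary of the test-set
#predictions (precision, recall, F1) using scikit-learn's built-in report.
print(metrics.classification_report(y_test, y_pred))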

--------------------------------------------------------------------------------
/mtcars.csv:
--------------------------------------------------------------------------------
model,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
Mazda RX4,21,6,160,110,3.9,2.62,16.46,0,1,4,4
Mazda RX4 Wag,21,6,160,110,3.9,2.875,17.02,0,1,4,4
Datsun 710,22.8,4,108,93,3.85,2.32,18.61,1,1,4,1
Hornet 4 Drive,21.4,6,258,110,3.08,3.215,19.44,1,0,3,1
Hornet Sportabout,18.7,8,360,175,3.15,3.44,17.02,0,0,3,2
Valiant,18.1,6,225,105,2.76,3.46,20.22,1,0,3,1
Duster 360,14.3,8,360,245,3.21,3.57,15.84,0,0,3,4
Merc 240D,24.4,4,146.7,62,3.69,3.19,20,1,0,4,2
Merc 230,22.8,4,140.8,95,3.92,3.15,22.9,1,0,4,2
Merc 280,19.2,6,167.6,123,3.92,3.44,18.3,1,0,4,4
Merc 280C,17.8,6,167.6,123,3.92,3.44,18.9,1,0,4,4
Merc 450SE,16.4,8,275.8,180,3.07,4.07,17.4,0,0,3,3
Merc 450SL,17.3,8,275.8,180,3.07,3.73,17.6,0,0,3,3
Merc 450SLC,15.2,8,275.8,180,3.07,3.78,18,0,0,3,3
Cadillac Fleetwood,10.4,8,472,205,2.93,5.25,17.98,0,0,3,4
Lincoln Continental,10.4,8,460,215,3,5.424,17.82,0,0,3,4
Chrysler Imperial,14.7,8,440,230,3.23,5.345,17.42,0,0,3,4
Fiat 128,32.4,4,78.7,66,4.08,2.2,19.47,1,1,4,1
Honda Civic,30.4,4,75.7,52,4.93,1.615,18.52,1,1,4,2
Toyota Corolla,33.9,4,71.1,65,4.22,1.835,19.9,1,1,4,1
Toyota Corona,21.5,4,120.1,97,3.7,2.465,20.01,1,0,3,1
Dodge Challenger,15.5,8,318,150,2.76,3.52,16.87,0,0,3,2
AMC Javelin,15.2,8,304,150,3.15,3.435,17.3,0,0,3,2
Camaro Z28,13.3,8,350,245,3.73,3.84,15.41,0,0,3,4
Pontiac Firebird,19.2,8,400,175,3.08,3.845,17.05,0,0,3,2
Fiat X1-9,27.3,4,79,66,4.08,1.935,18.9,1,1,4,1
Porsche 914-2,26,4,120.3,91,4.43,2.14,16.7,0,1,5,2
Lotus Europa,30.4,4,95.1,113,3.77,1.513,16.9,1,1,5,2
Ford Pantera L,15.8,8,351,264,4.22,3.17,14.5,0,1,5,4
Ferrari Dino,19.7,6,145,175,3.62,2.77,15.5,0,1,5,6
Maserati Bora,15,8,301,335,3.54,3.57,14.6,0,1,5,8
Volvo 142E,21.4,4,121,109,4.11,2.78,18.6,1,1,4,2

--------------------------------------------------------------------------------
/linear_discriminant_analysis:
--------------------------------------------------------------------------------
#LOAD NECESSARY LIBRARIES
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn import datasets
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

#LOAD AND VIEW IRIS DATASET
iris = datasets.load_iris()
df = pd.DataFrame(data = np.c_[iris['data'], iris['target']],
                  columns = iris['feature_names'] + ['target'])
df['species'] = pd.Categorical.from_codes(iris.target, iris.target_names)
df.columns = ['s_length', 's_width', 'p_length', 'p_width', 'target', 'species']
print(df.head())
print(len(df.index))

#DEFINE PREDICTOR AND RESPONSE VARIABLES
X = df[['s_length', 's_width', 'p_length', 'p_width']]
y = df['species']

#FIT LDA MODEL
model = LinearDiscriminantAnalysis()
model.fit(X, y)

#DEFINE METHOD TO EVALUATE MODEL
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

#EVALUATE MODEL
scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
print(np.mean(scores))

#USE MODEL TO MAKE PREDICTION ON NEW OBSERVATION
new = [5, 3, 1, .4]
print(model.predict([new]))

#CREATE LDA PLOT
X = iris.data
y = iris.target
model = LinearDiscriminantAnalysis()
X_r2 = model.fit(X, y).transform(X)
target_names = iris.target_names

plt.figure()
colors = ['red', 'green', 'blue']
for color, i, target_name in zip(colors, [0, 1, 2], target_names):
    plt.scatter(X_r2[y == i, 0], X_r2[y == i, 1], alpha=.8, color=color,
                label=target_name)
plt.legend(loc='best', shadow=False, scatterpoints=1)
plt.show()

--------------------------------------------------------------------------------
/partial_least_squares.py:
--------------------------------------------------------------------------------
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import scale
from sklearn import model_selection
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import train_test_split
from sklearn.cross_decomposition import PLSRegression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

#define URL where data is located
url = "https://raw.githubusercontent.com/Statology/Python-Guides/main/mtcars.csv"

#read in data
data_full = pd.read_csv(url)

#select subset of data
data = data_full[["mpg", "disp", "drat", "wt", "qsec", "hp"]]

#view first six rows of data
print(data[0:6])

#define predictor and response variables
X = data[["mpg", "disp", "drat", "wt", "qsec"]]
y = data[["hp"]]

#define cross-validation method
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

mse = []
n = len(X)

#calculate MSE with only the intercept (a plain linear fit on a constant
#column is the intercept-only model; PLS cannot be fit on a zero-variance input)
score = -1*model_selection.cross_val_score(LinearRegression(), np.ones((n,1)), y, cv=cv, scoring='neg_mean_squared_error').mean()
mse.append(score)

#calculate MSE using cross-validation, adding one component at a time
for i in np.arange(1, 6):
    pls = PLSRegression(n_components=i)
    score = -1*model_selection.cross_val_score(pls, scale(X), y, cv=cv, scoring='neg_mean_squared_error').mean()
    mse.append(score)

#plot test MSE vs. number of components
plt.plot(mse)
plt.xlabel('Number of PLS Components')
plt.ylabel('MSE')
plt.title('hp')
plt.show()

#split the dataset into training (70%) and testing (30%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

#fit a two-component model on the training set and calculate test RMSE
pls = PLSRegression(n_components=2)
pls.fit(scale(X_train), y_train)
print(np.sqrt(mean_squared_error(y_test, pls.predict(scale(X_test)))))
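
#OPTIONAL: a minimal sketch of reading the best model size off the CV curve;
#position 0 of mse is the intercept-only baseline, position i is i components.
print('PLS components with lowest CV MSE:', int(np.argmin(mse)))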

--------------------------------------------------------------------------------
/principal_components_regression.py:
--------------------------------------------------------------------------------
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import scale
from sklearn import model_selection
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

#define URL where data is located
url = "https://raw.githubusercontent.com/Statology/Python-Guides/main/mtcars.csv"

#read in data
data_full = pd.read_csv(url)

#select subset of data
data = data_full[["mpg", "disp", "drat", "wt", "qsec", "hp"]]

#view first six rows of data
print(data[0:6])

#define predictor and response variables
X = data[["mpg", "disp", "drat", "wt", "qsec"]]
y = data[["hp"]]

#scale predictor variables and compute principal components
pca = PCA()
X_reduced = pca.fit_transform(scale(X))

#define cross-validation method
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

regr = LinearRegression()
mse = []

#calculate MSE with only the intercept
score = -1*model_selection.cross_val_score(regr, np.ones((len(X_reduced),1)), y, cv=cv, scoring='neg_mean_squared_error').mean()
mse.append(score)

#calculate MSE using cross-validation, adding one component at a time
for i in np.arange(1, 6):
    score = -1*model_selection.cross_val_score(regr, X_reduced[:,:i], y, cv=cv, scoring='neg_mean_squared_error').mean()
    mse.append(score)

#plot cross-validation results
plt.plot(mse)
plt.xlabel('Number of Principal Components')
plt.ylabel('MSE')
plt.title('hp')
plt.show()

#calculate cumulative percentage of variation explained
print(np.cumsum(np.round(pca.explained_variance_ratio_, decimals=4)*100))

#split the dataset into training (70%) and testing (30%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

#scale the training and testing data and keep the first principal component
X_reduced_train = pca.fit_transform(scale(X_train))
X_reduced_test = pca.transform(scale(X_test))[:,:1]

#train PCR model on training data
regr = LinearRegression()
regr.fit(X_reduced_train[:,:1], y_train)

#calculate RMSE
pred = regr.predict(X_reduced_test)
print(np.sqrt(mean_squared_error(y_test, pred)))
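
#OPTIONAL: a minimal sketch of letting the CV curve above choose the number
#of components instead of hard-coding one; mse[0] is the intercept-only
#baseline, so guard against it before slicing.
k = max(int(np.argmin(mse)), 1)
regr_k = LinearRegression()
regr_k.fit(X_reduced_train[:, :k], y_train)
pred_k = regr_k.predict(pca.transform(scale(X_test))[:, :k])
print(np.sqrt(mean_squared_error(y_test, pred_k)))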