├── multiple_linear_regression.py
├── multivariate_adaptive_regression_splines.py
├── simple_linear_regression.py
├── lasso_regression.py
├── ridge_regression.py
├── quadratic_discriminant_analysis.py
├── logistic_regression.py
├── mtcars.csv
├── linear_discriminant_analysis
├── partial_least_squares.py
└── principal_components_regression.py

--------------------------------------------------------------------------------
/multiple_linear_regression.py:
--------------------------------------------------------------------------------
#ENTER THE DATA
import pandas as pd

#create data
df = pd.DataFrame({'hours': [1, 2, 2, 4, 2, 1, 5, 4, 2, 4, 4, 3, 6, 5, 3, 4, 6, 2, 1, 2],
                   'exams': [1, 3, 3, 5, 2, 2, 1, 1, 0, 3, 4, 3, 2, 4, 4, 4, 5, 1, 0, 1],
                   'score': [76, 78, 85, 88, 72, 69, 94, 94, 88, 92, 90, 75, 96, 90, 82, 85, 99, 83, 62, 76]})

#view data
print(df)

#FIT THE MODEL
import statsmodels.api as sm

#define response variable
y = df['score']

#define predictor variables
x = df[['hours', 'exams']]

#add constant to predictor variables
x = sm.add_constant(x)

#fit linear regression model
model = sm.OLS(y, x).fit()

#view model summary
print(model.summary())
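
#OPTIONAL: a minimal sketch of predicting a new score with the fitted model.
#The values (3 hours studied, 2 prep exams) are made up for illustration;
#the row must follow the column order of x above (const, hours, exams).
new = [[1, 3, 2]]
print(model.predict(new))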

--------------------------------------------------------------------------------
/multivariate_adaptive_regression_splines.py:
--------------------------------------------------------------------------------
#install the py-earth package first (run in a shell, not inside Python):
#pip install sklearn-contrib-py-earth

from numpy import mean
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.datasets import make_regression
from pyearth import Earth

#create fake regression data
X, y = make_regression(n_samples=5000, n_features=15, n_informative=10, noise=0.5, random_state=5)

#define the model
model = Earth()

#define the evaluation procedure
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

#evaluate the model and collect results
n_scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)

#report performance (mean of the negative MAE scores across folds)
print(mean(n_scores))

--------------------------------------------------------------------------------
/simple_linear_regression.py:
--------------------------------------------------------------------------------
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt

#create data
df = pd.DataFrame({'hours': [1, 2, 4, 5, 5, 6, 6, 7, 8, 10, 11, 11, 12, 12, 14],
                   'score': [64, 66, 76, 73, 74, 81, 83, 82, 80, 88, 84, 82, 91, 93, 89]})

#create scatterplot
plt.scatter(df.hours, df.score)
plt.title('Hours studied vs. Exam Score')
plt.xlabel('Hours')
plt.ylabel('Score')
plt.show()

#create boxplot
df.boxplot(column=['score'])
plt.show()

#fit simple linear regression model
y = df['score']
x = df[['hours']]
x = sm.add_constant(x)
model = sm.OLS(y, x).fit()

#view model summary
print(model.summary())

#produce residual plots
fig = plt.figure(figsize=(12,8))
fig = sm.graphics.plot_regress_exog(model, 'hours', fig=fig)
plt.show()

#produce Q-Q plot
res = model.resid
fig = sm.qqplot(res, fit=True, line="45")
plt.show()
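
#OPTIONAL: a minimal sketch of pulling the intercept and slope out of the
#fitted results object; params is indexed by the column names of x above.
print(model.params['const'])   #intercept
print(model.params['hours'])   #slope per additional hour studied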

--------------------------------------------------------------------------------
/lasso_regression.py:
--------------------------------------------------------------------------------
import pandas as pd
from numpy import arange
from sklearn.linear_model import LassoCV
from sklearn.model_selection import RepeatedKFold

#specify URL where data is located
url = "https://raw.githubusercontent.com/Statology/Python-Guides/main/mtcars.csv"

#read in data
data_full = pd.read_csv(url)

#select subset of data
data = data_full[["mpg", "wt", "drat", "qsec", "hp"]]

#view first six rows of data
print(data[0:6])

#define predictor and response variables
X = data[["mpg", "wt", "drat", "qsec"]]
y = data["hp"]

#define cross-validation method to evaluate model
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

#define model (alpha=0 is plain OLS and does not converge well with the lasso
#solver, so the grid starts at 0.01 rather than 0)
model = LassoCV(alphas=arange(0.01, 1, 0.01), cv=cv, n_jobs=-1)

#fit model
model.fit(X, y)

#display lambda that produced the lowest test MSE
print(model.alpha_)

#define new observation
new = [24, 2.5, 3.5, 18.5]

#predict hp value using lasso regression model
print(model.predict([new]))
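
#OPTIONAL: a minimal sketch of inspecting which coefficients the lasso kept
#and which it shrank toward zero at the chosen alpha.
for name, coef in zip(X.columns, model.coef_):
    print(name, coef)
print('intercept:', model.intercept_)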

--------------------------------------------------------------------------------
/ridge_regression.py:
--------------------------------------------------------------------------------
#IMPORT NECESSARY PACKAGES
import pandas as pd
from numpy import arange
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import RepeatedKFold

#LOAD DATA
url = "https://raw.githubusercontent.com/Statology/Python-Guides/main/mtcars.csv"
data_full = pd.read_csv(url)
data = data_full[["mpg", "wt", "drat", "qsec", "hp"]]
print(data[0:6])

#FIT RIDGE REGRESSION MODEL
#define predictor and response variables
X = data[["mpg", "wt", "drat", "qsec"]]
y = data["hp"]

#define cross-validation method to evaluate model
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

#define model (RidgeCV requires strictly positive alphas, so the grid starts at 0.01)
model = RidgeCV(alphas=arange(0.01, 1, 0.01), cv=cv, scoring='neg_mean_absolute_error')

#fit model
model.fit(X, y)

#display lambda that produced the lowest test MSE
print(model.alpha_)

#USE MODEL TO PREDICT RESPONSE VALUE OF NEW OBSERVATIONS
#define new observation
new = [24, 2.5, 3.5, 18.5]

#predict hp value using ridge regression model
print(model.predict([new]))

--------------------------------------------------------------------------------
/quadratic_discriminant_analysis.py:
--------------------------------------------------------------------------------
#LOAD NECESSARY LIBRARIES
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn import datasets
import pandas as pd
import numpy as np

#LOAD AND VIEW IRIS DATASET
iris = datasets.load_iris()
df = pd.DataFrame(data = np.c_[iris['data'], iris['target']],
                  columns = iris['feature_names'] + ['target'])
df['species'] = pd.Categorical.from_codes(iris.target, iris.target_names)
df.columns = ['s_length', 's_width', 'p_length', 'p_width', 'target', 'species']
print(df.head())
print(len(df.index))

#DEFINE PREDICTOR AND RESPONSE VARIABLES
X = df[['s_length', 's_width', 'p_length', 'p_width']]
y = df['species']

#FIT QDA MODEL
model = QuadraticDiscriminantAnalysis()
model.fit(X, y)

#DEFINE METHOD TO EVALUATE MODEL
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

#EVALUATE MODEL
scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
print(np.mean(scores))

#USE MODEL TO MAKE PREDICTION ON NEW OBSERVATION
new = [5, 3, 1, .4]
print(model.predict([new]))

--------------------------------------------------------------------------------
/logistic_regression.py:
--------------------------------------------------------------------------------
#IMPORT PACKAGES
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import matplotlib.pyplot as plt

#LOAD DATA
url = "https://raw.githubusercontent.com/Statology/Python-Guides/main/default.csv"
data = pd.read_csv(url)

#view first six rows of dataset
print(data[0:6])

#find total observations in dataset
print(len(data.index))

#FIT LOGISTIC REGRESSION MODEL
X = data[['student', 'balance', 'income']]
y = data['default']

#split the dataset into training (70%) and testing (30%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

#instantiate the model
log_regression = LogisticRegression()

#fit the model using the training data
log_regression.fit(X_train, y_train)

#use model to make predictions on test data
y_pred = log_regression.predict(X_test)

#MODEL DIAGNOSTICS
cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
print(cnf_matrix)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

#plot ROC curve
y_pred_proba = log_regression.predict_proba(X_test)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)
plt.plot(fpr, tpr, label="AUC=" + str(auc))
plt.legend(loc=4)
plt.show()
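
#OPTIONAL: a minimal sketch of a fuller per-class summary of the test-set
#predictions (precision, recall, F1) using scikit-learn's built-in report.
print(metrics.classification_report(y_test, y_pred))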

--------------------------------------------------------------------------------
/mtcars.csv:
--------------------------------------------------------------------------------
model,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
Mazda RX4,21,6,160,110,3.9,2.62,16.46,0,1,4,4
Mazda RX4 Wag,21,6,160,110,3.9,2.875,17.02,0,1,4,4
Datsun 710,22.8,4,108,93,3.85,2.32,18.61,1,1,4,1
Hornet 4 Drive,21.4,6,258,110,3.08,3.215,19.44,1,0,3,1
Hornet Sportabout,18.7,8,360,175,3.15,3.44,17.02,0,0,3,2
Valiant,18.1,6,225,105,2.76,3.46,20.22,1,0,3,1
Duster 360,14.3,8,360,245,3.21,3.57,15.84,0,0,3,4
Merc 240D,24.4,4,146.7,62,3.69,3.19,20,1,0,4,2
Merc 230,22.8,4,140.8,95,3.92,3.15,22.9,1,0,4,2
Merc 280,19.2,6,167.6,123,3.92,3.44,18.3,1,0,4,4
Merc 280C,17.8,6,167.6,123,3.92,3.44,18.9,1,0,4,4
Merc 450SE,16.4,8,275.8,180,3.07,4.07,17.4,0,0,3,3
Merc 450SL,17.3,8,275.8,180,3.07,3.73,17.6,0,0,3,3
Merc 450SLC,15.2,8,275.8,180,3.07,3.78,18,0,0,3,3
Cadillac Fleetwood,10.4,8,472,205,2.93,5.25,17.98,0,0,3,4
Lincoln Continental,10.4,8,460,215,3,5.424,17.82,0,0,3,4
Chrysler Imperial,14.7,8,440,230,3.23,5.345,17.42,0,0,3,4
Fiat 128,32.4,4,78.7,66,4.08,2.2,19.47,1,1,4,1
Honda Civic,30.4,4,75.7,52,4.93,1.615,18.52,1,1,4,2
Toyota Corolla,33.9,4,71.1,65,4.22,1.835,19.9,1,1,4,1
Toyota Corona,21.5,4,120.1,97,3.7,2.465,20.01,1,0,3,1
Dodge Challenger,15.5,8,318,150,2.76,3.52,16.87,0,0,3,2
AMC Javelin,15.2,8,304,150,3.15,3.435,17.3,0,0,3,2
Camaro Z28,13.3,8,350,245,3.73,3.84,15.41,0,0,3,4
Pontiac Firebird,19.2,8,400,175,3.08,3.845,17.05,0,0,3,2
Fiat X1-9,27.3,4,79,66,4.08,1.935,18.9,1,1,4,1
Porsche 914-2,26,4,120.3,91,4.43,2.14,16.7,0,1,5,2
Lotus Europa,30.4,4,95.1,113,3.77,1.513,16.9,1,1,5,2
Ford Pantera L,15.8,8,351,264,4.22,3.17,14.5,0,1,5,4
Ferrari Dino,19.7,6,145,175,3.62,2.77,15.5,0,1,5,6
Maserati Bora,15,8,301,335,3.54,3.57,14.6,0,1,5,8
Volvo 142E,21.4,4,121,109,4.11,2.78,18.6,1,1,4,2

--------------------------------------------------------------------------------
/linear_discriminant_analysis:
--------------------------------------------------------------------------------
#LOAD NECESSARY LIBRARIES
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn import datasets
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

#LOAD AND VIEW IRIS DATASET
iris = datasets.load_iris()
df = pd.DataFrame(data = np.c_[iris['data'], iris['target']],
                  columns = iris['feature_names'] + ['target'])
df['species'] = pd.Categorical.from_codes(iris.target, iris.target_names)
df.columns = ['s_length', 's_width', 'p_length', 'p_width', 'target', 'species']
print(df.head())
print(len(df.index))

#DEFINE PREDICTOR AND RESPONSE VARIABLES
X = df[['s_length', 's_width', 'p_length', 'p_width']]
y = df['species']

#FIT LDA MODEL
model = LinearDiscriminantAnalysis()
model.fit(X, y)

#DEFINE METHOD TO EVALUATE MODEL
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

#EVALUATE MODEL
scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
print(np.mean(scores))

#USE MODEL TO MAKE PREDICTION ON NEW OBSERVATION
new = [5, 3, 1, .4]
print(model.predict([new]))

#CREATE LDA PLOT
X = iris.data
y = iris.target
model = LinearDiscriminantAnalysis()
X_r2 = model.fit(X, y).transform(X)
target_names = iris.target_names

plt.figure()
colors = ['red', 'green', 'blue']
for color, i, target_name in zip(colors, [0, 1, 2], target_names):
    plt.scatter(X_r2[y == i, 0], X_r2[y == i, 1], alpha=.8, color=color,
                label=target_name)
plt.legend(loc='best', shadow=False, scatterpoints=1)
plt.show()

--------------------------------------------------------------------------------
/partial_least_squares.py:
--------------------------------------------------------------------------------
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import scale
from sklearn import model_selection
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import train_test_split
from sklearn.cross_decomposition import PLSRegression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

#define URL where data is located
url = "https://raw.githubusercontent.com/Statology/Python-Guides/main/mtcars.csv"

#read in data
data_full = pd.read_csv(url)

#select subset of data
data = data_full[["mpg", "disp", "drat", "wt", "qsec", "hp"]]

#view first six rows of data
print(data[0:6])

#define predictor and response variables
X = data[["mpg", "disp", "drat", "wt", "qsec"]]
y = data[["hp"]]

#define cross-validation method
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

mse = []
n = len(X)

#calculate MSE with only the intercept (a plain linear fit on a constant
#column is the intercept-only model; PLS cannot be fit on a zero-variance input)
score = -1*model_selection.cross_val_score(LinearRegression(), np.ones((n,1)), y, cv=cv, scoring='neg_mean_squared_error').mean()
mse.append(score)

#calculate MSE using cross-validation, adding one component at a time
for i in np.arange(1, 6):
    pls = PLSRegression(n_components=i)
    score = -1*model_selection.cross_val_score(pls, scale(X), y, cv=cv, scoring='neg_mean_squared_error').mean()
    mse.append(score)

#plot test MSE vs. number of components
plt.plot(mse)
plt.xlabel('Number of PLS Components')
plt.ylabel('MSE')
plt.title('hp')
plt.show()

#split the dataset into training (70%) and testing (30%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

#fit a two-component model on the training set and calculate test RMSE
pls = PLSRegression(n_components=2)
pls.fit(scale(X_train), y_train)
print(np.sqrt(mean_squared_error(y_test, pls.predict(scale(X_test)))))
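
#OPTIONAL: a minimal sketch of reading the best model size off the CV curve;
#position 0 of mse is the intercept-only baseline, position i is i components.
print('PLS components with lowest CV MSE:', int(np.argmin(mse)))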

--------------------------------------------------------------------------------
/principal_components_regression.py:
--------------------------------------------------------------------------------
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import scale
from sklearn import model_selection
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

#define URL where data is located
url = "https://raw.githubusercontent.com/Statology/Python-Guides/main/mtcars.csv"

#read in data
data_full = pd.read_csv(url)

#select subset of data
data = data_full[["mpg", "disp", "drat", "wt", "qsec", "hp"]]

#view first six rows of data
print(data[0:6])

#define predictor and response variables
X = data[["mpg", "disp", "drat", "wt", "qsec"]]
y = data[["hp"]]

#scale predictor variables and compute principal components
pca = PCA()
X_reduced = pca.fit_transform(scale(X))

#define cross-validation method
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

regr = LinearRegression()
mse = []

#calculate MSE with only the intercept
score = -1*model_selection.cross_val_score(regr, np.ones((len(X_reduced),1)), y, cv=cv, scoring='neg_mean_squared_error').mean()
mse.append(score)

#calculate MSE using cross-validation, adding one component at a time
for i in np.arange(1, 6):
    score = -1*model_selection.cross_val_score(regr, X_reduced[:,:i], y, cv=cv, scoring='neg_mean_squared_error').mean()
    mse.append(score)

#plot cross-validation results
plt.plot(mse)
plt.xlabel('Number of Principal Components')
plt.ylabel('MSE')
plt.title('hp')
plt.show()

#calculate cumulative percentage of variation explained
print(np.cumsum(np.round(pca.explained_variance_ratio_, decimals=4)*100))

#split the dataset into training (70%) and testing (30%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

#scale the training and testing data and keep the first principal component
X_reduced_train = pca.fit_transform(scale(X_train))
X_reduced_test = pca.transform(scale(X_test))[:,:1]

#train PCR model on training data
regr = LinearRegression()
regr.fit(X_reduced_train[:,:1], y_train)

#calculate RMSE
pred = regr.predict(X_reduced_test)
print(np.sqrt(mean_squared_error(y_test, pred)))
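
#OPTIONAL: a minimal sketch of letting the CV curve above choose the number
#of components instead of hard-coding one; mse[0] is the intercept-only
#baseline, so guard against it before slicing.
k = max(int(np.argmin(mse)), 1)
regr_k = LinearRegression()
regr_k.fit(X_reduced_train[:, :k], y_train)
pred_k = regr_k.predict(pca.transform(scale(X_test))[:, :k])
print(np.sqrt(mean_squared_error(y_test, pred_k)))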