def _as_paired_arrays(first, second):
    """Convert two array-likes to float ndarrays and validate them as a pair.

    Parameters
    ----------
    first, second : array-like
        Numeric sequences (lists, tuples, ndarrays, ...).

    Returns
    -------
    tuple of numpy.ndarray
        Both inputs as ``float`` arrays of identical shape.

    Raises
    ------
    ValueError
        If the two inputs differ in shape or contain no elements.
    """
    first = np.asarray(first, dtype=float)
    second = np.asarray(second, dtype=float)
    # Reject mismatched shapes explicitly: NumPy would otherwise broadcast a
    # length-1 input against the other one and return a meaningless number.
    if first.shape != second.shape:
        raise ValueError(
            "inputs must have the same shape, got %s and %s"
            % (first.shape, second.shape)
        )
    if first.size == 0:
        raise ValueError("inputs must not be empty")
    return first, second


#finding the Slope of linear regression line
def Slope(a, b):
    """Return the least-squares slope of ``b`` regressed on ``a``.

    Computed as ``sum((a - mean(a)) * (b - mean(b))) / sum((a - mean(a))**2)``.
    This is algebraically identical to the raw-sum form
    ``(n*sum(a*b) - sum(a)*sum(b)) / (n*sum(a**2) - sum(a)**2)`` but does not
    subtract two huge, nearly equal numbers, so it stays accurate when the
    data carry a large offset.

    Parameters
    ----------
    a : array-like
        Independent variable (x values).
    b : array-like
        Dependent variable (y values), same shape as ``a``.

    Returns
    -------
    numpy.float64
        The slope. If every value of ``a`` is identical the denominator is
        zero and the result follows NumPy's floating-point division rules.

    Raises
    ------
    ValueError
        If the inputs are empty or differ in shape.
    """
    x, y = _as_paired_arrays(a, b)
    x_dev = x - x.mean()
    y_dev = y - y.mean()
    return np.sum(x_dev * y_dev) / np.sum(x_dev ** 2)


#Finding Intercept of linear regression line
def Intercept(a, b):
    """Return the least-squares intercept of ``b`` regressed on ``a``.

    Uses ``mean(b) - Slope(a, b) * mean(a)``.

    Parameters
    ----------
    a : array-like
        Independent variable (x values).
    b : array-like
        Dependent variable (y values), same shape as ``a``.

    Returns
    -------
    numpy.float64
        The intercept of the fitted line.

    Raises
    ------
    ValueError
        If the inputs are empty or differ in shape.
    """
    x, y = _as_paired_arrays(a, b)
    return y.mean() - Slope(x, y) * x.mean()


#predictions are made with the help of linear regression algorithm
def Predictions(slope, x_input, intercept):
    """Evaluate the fitted line ``slope * x_input + intercept``.

    Parameters
    ----------
    slope : float
        Slope of the regression line.
    x_input : scalar or array-like
        Point(s) at which to evaluate the line. Lists and tuples are
        converted to float arrays; any other object is used as given.
    intercept : float
        Intercept of the regression line.

    Returns
    -------
    The predicted value(s), with the same shape as ``x_input``.
    """
    # A plain list/tuple cannot be multiplied element-wise by a Python float.
    if isinstance(x_input, (list, tuple)):
        x_input = np.asarray(x_input, dtype=float)
    return slope * x_input + intercept


#R-squared is regression metric
def R_squared(predicted_values, test_values):
    """Return the coefficient of determination ``1 - SS_res / SS_tot``.

    Parameters
    ----------
    predicted_values : array-like
        Model predictions.
    test_values : array-like
        Observed values, same shape as ``predicted_values``.

    Returns
    -------
    numpy.float64
        The R-squared score. If ``test_values`` are all equal, ``SS_tot`` is
        zero and the result follows NumPy's floating-point division rules.

    Raises
    ------
    ValueError
        If the inputs are empty or differ in shape.
    """
    f, y = _as_paired_arrays(predicted_values, test_values)
    #sum of squares
    ss_total = np.sum((y - y.mean()) ** 2)
    #Residuals sum of squares
    ss_res = np.sum((y - f) ** 2)
    #R-squared formula
    return 1 - ss_res / ss_total


#Finding Correlation Coefficient for the given X & Y values
def correlation_coeff(predicted_values, test_values):
    """Return the Pearson correlation coefficient of the two inputs.

    Computed from mean-centred values, which is algebraically identical to
    the raw-sum formula but numerically stable for data with a large offset.

    Parameters
    ----------
    predicted_values : array-like
        First variable.
    test_values : array-like
        Second variable, same shape as ``predicted_values``.

    Returns
    -------
    numpy.float64
        The correlation coefficient. If either input is constant the
        denominator is zero and the result follows NumPy's floating-point
        division rules.

    Raises
    ------
    ValueError
        If the inputs are empty or differ in shape.
    """
    a, b = _as_paired_arrays(predicted_values, test_values)
    a_dev = a - a.mean()
    b_dev = b - b.mean()
    return np.sum(a_dev * b_dev) / np.sqrt(np.sum(a_dev ** 2) * np.sum(b_dev ** 2))


#Finding Covariance for the given X & Y values
def Covariance(X, Y):
    """Return the population covariance of ``X`` and ``Y`` (divides by n).

    Computed as ``mean((X - mean(X)) * (Y - mean(Y)))``, which equals
    ``mean(X*Y) - mean(X)*mean(Y)`` without the cancellation error of the
    latter.

    Parameters
    ----------
    X : array-like
        First variable.
    Y : array-like
        Second variable, same shape as ``X``.

    Returns
    -------
    numpy.float64
        The population covariance.

    Raises
    ------
    ValueError
        If the inputs are empty or differ in shape.
    """
    a, b = _as_paired_arrays(X, Y)
    return np.mean((a - a.mean()) * (b - b.mean()))
# -*- coding: utf-8 -*-
"""
Created on Sat Feb 24 4:20:44 2019

@author: Lalith Bharadwaj

Fit a simple linear regression with scikit-learn on a two-column CSV
(first column = feature, second column = target), report the R-squared
score on a held-out split, and plot the data, the fitted line and the
residuals.
"""
#Loading essential libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import seaborn as sns


#importing data
#change the dataset here to perform predictions
dataset = pd.read_csv('Salary_Data.csv')

# Split-out validation dataset
#knowing the dimensions of the data and making them ready for predictions.
array = dataset.values
X = array[:, 0]
print(X.shape)
# scikit-learn expects a 2-D feature matrix of shape (n_samples, n_features).
X = X.reshape(-1, 1)
print(X.shape)
Y = array[:, 1]
print(Y.shape)

#To know the distribution of data let us plot box plots
## 1
left = 0.1
width = 0.8
ax1 = plt.axes([left, 0.5, width, 0.45])
ax1.boxplot(X)
ax1.set_title('Box plot for X')
plt.show()
## 2
ax2 = plt.axes([left, 0.5, width, 0.45])
# The original passed '.-' as a second positional argument; boxplot has no
# format-string parameter there (that slot is `notch`), so it is dropped.
ax2.boxplot(Y)
ax2.set_title('Distribution of Y Data')
plt.show()


#Dividing data into training and testing sets
test_size = 0.10
seed = 7
X_train, X_validation, Y_train, Y_validation = train_test_split(
    X, Y, test_size=test_size, random_state=seed)


# Make predictions on validation dataset
lin = LinearRegression()
lin.fit(X_train, Y_train)
predictions = lin.predict(X_validation)
print(predictions)

#we are using the R-squared metric to determine the model efficiency
print(r2_score(Y_validation, predictions))

#Calculating the intercept and slope
c = lin.intercept_
m = lin.coef_
print(c, m)
#so we write the linear regression function as,
y = m * X + c

#plotting the linear regression function
plt.scatter(X, Y, marker='o', color='k')
# Lower-case 'r': upper-case single-letter colours are rejected by newer
# Matplotlib releases. The label gives the legend an entry to display.
plt.plot(X, y, color='r', label='Regression Line')
plt.legend(loc=0, title='Linear Regression')
plt.title('Linear Regression')
plt.tight_layout(pad=2)
plt.grid(False)
plt.show()

#Residual plots
sns.set(style="whitegrid")
#Plot the residuals after fitting a linear model
# residplot needs 1-D inputs, so go back to the flat columns.
X = array[:, 0]
Y = array[:, 1]
# x/y are passed by keyword: seaborn 0.12+ no longer accepts them positionally.
sns.residplot(x=X, y=Y, lowess=True, color="g")
plt.title('Residual Plot')
# Without show() the residual figure never appears when run as a script.
plt.show()
-------------------------------------------------------------------------------- 1 | # Linear-Regression-from-Scratch 2 | Here you predict a certain value derived from the input using linear regression, and for this 3 | it is required to understand the statistics behind Linear Regression and the basics of the numpy and pandas libraries. 4 | Firstly, let us understand linear regression with a toy example; subsequently, we'll get an enriched overview of how it works from scratch (almost). 5 | 6 | We divide the complete code into two parts: 7 | 1. Linear Regression from Scratch (almost) 8 | 2. Linear Regression from sklearn (machine learning library in Python). 9 | 10 | Here is a link (PDF) to the theoretical and mathematical modelling of Linear Regression and its statistics. 11 | This elucidates how predictions are made step by step, with mathematical construction and visualization for goodness of fit. 12 | 13 | [Linear Regression](https://github.com/LalithBharadwaj/Linear-Regression-from-Scratch/blob/master/_Linear-Regression-from-Scratch.pdf) 14 | -------------------------------------------------------------------------------- /Refernces.txt: -------------------------------------------------------------------------------- 1 | 1. https://en.wikipedia.org/wiki/Linear_regression 2 | 2. https://en.wikipedia.org/wiki/Coefficient_of_determination 3 | 3. https://matplotlib.org/api/pyplot_api.html 4 | 4. https://seaborn.pydata.org/generated/seaborn.residplot.html 5 | 5. 
https://machinelearningmastery.com/how-machine-learning-algorithms-work/ 6 | -------------------------------------------------------------------------------- /Salary_Data.csv: -------------------------------------------------------------------------------- 1 | YearsExperience,Salary 2 | 1.1,39343 3 | 1.3,46205 4 | 1.5,37731 5 | 2,43525 6 | 2.2,39891 7 | 2.9,56642 8 | 3,60150 9 | 3.2,54445 10 | 3.2,64445 11 | 3.7,57189 12 | 3.9,63218 13 | 4,55794 14 | 4,56957 15 | 4.1,57081 16 | 4.5,61111 17 | 4.9,67938 18 | 5.1,66029 19 | 5.3,83088 20 | 5.9,81363 21 | 6,93940 22 | 6.8,91738 23 | 7.1,98273 24 | 7.9,101302 25 | 8.2,113812 26 | 8.7,109431 27 | 9,105582 28 | 9.5,116969 29 | 9.6,112635 30 | 10.3,122391 31 | 10.5,121872 32 | -------------------------------------------------------------------------------- /_Linear-Regression-from-Scratch.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/barulalithb/Linear-Regression-from-Scratch/97c214afda005039791d14bee831b783c00a0681/_Linear-Regression-from-Scratch.pdf --------------------------------------------------------------------------------