├── README.md ├── LICENSE ├── multiple_linear_regression.R ├── multiple_linear_regression.py └── 50_Startups.csv /README.md: -------------------------------------------------------------------------------- 1 | # Classic Multi Independent Variable problem 2 | 3 | ## Steps I followed: 4 | - Used scatter() from matplotlib to see the nature of the data 5 | - Then used a Multiple Linear Regression model using the scikit-learn library 6 | - Finally checked the predicted values by plotting graphs 7 | 8 | #### Assumptions of a Linear Regression : 9 | - Linear 10 | - Homoscedasticity ( In statistics, a sequence or a vector of random variables is homoscedastic if all random variables in the sequence or vector have the same finite variance. ) 11 | - Multivariate Normality 12 | - Independence of Error 13 | - Lack of Multicollinearity 14 | 15 | 16 | Mathematical Model : 17 | 18 | y = b0 + b1x1 + b2x2 + ...... 19 | 20 | 21 | # And here I was careful about the Dummy Variable Trap 22 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Kaustabh Ganguly 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /multiple_linear_regression.R: -------------------------------------------------------------------------------- 1 | # Multiple Linear Regression 2 | 3 | # Importing the dataset 4 | dataset = read.csv('50_Startups.csv') 5 | 6 | # Encoding categorical data 7 | dataset$State = factor(dataset$State, 8 | levels = c('New York', 'California', 'Florida'), 9 | labels = c(1, 2, 3)) 10 | 11 | # Splitting the dataset into the Training set and Test set 12 | # install.packages('caTools') 13 | library(caTools) 14 | set.seed(123) 15 | split = sample.split(dataset$Profit, SplitRatio = 0.8) 16 | training_set = subset(dataset, split == TRUE) 17 | test_set = subset(dataset, split == FALSE) 18 | 19 | # Feature Scaling 20 | # training_set = scale(training_set) 21 | # test_set = scale(test_set) 22 | 23 | # Fitting Multiple Linear Regression to the Training set 24 | regressor = lm(formula = Profit ~ ., 25 | data = training_set) 26 | 27 | # Predicting the Test set results 28 | y_pred = predict(regressor, newdata = test_set) 29 | 30 | # Building the optimal model using Backward Elimination 31 | regressor = lm(formula = Profit ~ R.D.Spend + Administration + Marketing.Spend + State, 32 | data = dataset) 33 | summary(regressor) 34 | # Optional Step: Remove State2 only (as opposed to removing State directly) 35 | # regressor = lm(formula = Profit ~ R.D.Spend + Administration + Marketing.Spend + factor(State, exclude = 
2), 36 | # data = dataset) 37 | # summary(regressor) 38 | regressor = lm(formula = Profit ~ R.D.Spend + Administration + Marketing.Spend, 39 | data = dataset) 40 | summary(regressor) 41 | regressor = lm(formula = Profit ~ R.D.Spend + Marketing.Spend, 42 | data = dataset) 43 | summary(regressor) 44 | regressor = lm(formula = Profit ~ R.D.Spend, 45 | data = dataset) 46 | summary(regressor) -------------------------------------------------------------------------------- /multiple_linear_regression.py: -------------------------------------------------------------------------------- 1 | # Multiple Linear Regression 2 | 3 | # Importing the libraries 4 | import numpy as np 5 | import matplotlib.pyplot as plt 6 | import pandas as pd 7 | 8 | # Importing the dataset 9 | dataset = pd.read_csv('50_Startups.csv') 10 | X = dataset.iloc[:, :-1].values 11 | y = dataset.iloc[:, 4].values 12 | 13 | # Encoding categorical data 14 | from sklearn.preprocessing import LabelEncoder, OneHotEncoder 15 | labelencoder = LabelEncoder() 16 | X[:, 3] = labelencoder.fit_transform(X[:, 3]) 17 | onehotencoder = OneHotEncoder(categorical_features = [3]) 18 | X = onehotencoder.fit_transform(X).toarray() 19 | 20 | # Avoiding the Dummy Variable Trap 21 | X = X[:, 1:] 22 | 23 | # Splitting the dataset into the Training set and Test set 24 | from sklearn.cross_validation import train_test_split 25 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0) 26 | 27 | # Feature Scaling 28 | """from sklearn.preprocessing import StandardScaler 29 | sc_X = StandardScaler() 30 | X_train = sc_X.fit_transform(X_train) 31 | X_test = sc_X.transform(X_test) 32 | sc_y = StandardScaler() 33 | y_train = sc_y.fit_transform(y_train)""" 34 | 35 | # Fitting Multiple Linear Regression to the Training set 36 | from sklearn.linear_model import LinearRegression 37 | regressor = LinearRegression() 38 | regressor.fit(X_train, y_train) 39 | 40 | # Predicting the Test set results 41 | y_pred = 
regressor.predict(X_test) 42 | 43 | # Building the optimal model using Backward Elimination 44 | import statsmodels.formula.api as sm 45 | X = np.append(arr = np.ones((50, 1)).astype(int), values = X, axis = 1) 46 | X_opt = X[:, [0, 1, 2, 3, 4, 5]] 47 | regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit() 48 | regressor_OLS.summary() 49 | X_opt = X[:, [0, 1, 3, 4, 5]] 50 | regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit() 51 | regressor_OLS.summary() 52 | X_opt = X[:, [0, 3, 4, 5]] 53 | regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit() 54 | regressor_OLS.summary() 55 | X_opt = X[:, [0, 3, 5]] 56 | regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit() 57 | regressor_OLS.summary() 58 | X_opt = X[:, [0, 3]] 59 | regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit() 60 | regressor_OLS.summary() -------------------------------------------------------------------------------- /50_Startups.csv: -------------------------------------------------------------------------------- 1 | R&D Spend,Administration,Marketing Spend,State,Profit 2 | 165349.2,136897.8,471784.1,New York,192261.83 3 | 162597.7,151377.59,443898.53,California,191792.06 4 | 153441.51,101145.55,407934.54,Florida,191050.39 5 | 144372.41,118671.85,383199.62,New York,182901.99 6 | 142107.34,91391.77,366168.42,Florida,166187.94 7 | 131876.9,99814.71,362861.36,New York,156991.12 8 | 134615.46,147198.87,127716.82,California,156122.51 9 | 130298.13,145530.06,323876.68,Florida,155752.6 10 | 120542.52,148718.95,311613.29,New York,152211.77 11 | 123334.88,108679.17,304981.62,California,149759.96 12 | 101913.08,110594.11,229160.95,Florida,146121.95 13 | 100671.96,91790.61,249744.55,California,144259.4 14 | 93863.75,127320.38,249839.44,Florida,141585.52 15 | 91992.39,135495.07,252664.93,California,134307.35 16 | 119943.24,156547.42,256512.92,Florida,132602.65 17 | 114523.61,122616.84,261776.23,New York,129917.04 18 | 78013.11,121597.55,264346.06,California,126992.93 19 | 94657.16,145077.58,282574.31,New 
York,125370.37 20 | 91749.16,114175.79,294919.57,Florida,124266.9 21 | 86419.7,153514.11,0,New York,122776.86 22 | 76253.86,113867.3,298664.47,California,118474.03 23 | 78389.47,153773.43,299737.29,New York,111313.02 24 | 73994.56,122782.75,303319.26,Florida,110352.25 25 | 67532.53,105751.03,304768.73,Florida,108733.99 26 | 77044.01,99281.34,140574.81,New York,108552.04 27 | 64664.71,139553.16,137962.62,California,107404.34 28 | 75328.87,144135.98,134050.07,Florida,105733.54 29 | 72107.6,127864.55,353183.81,New York,105008.31 30 | 66051.52,182645.56,118148.2,Florida,103282.38 31 | 65605.48,153032.06,107138.38,New York,101004.64 32 | 61994.48,115641.28,91131.24,Florida,99937.59 33 | 61136.38,152701.92,88218.23,New York,97483.56 34 | 63408.86,129219.61,46085.25,California,97427.84 35 | 55493.95,103057.49,214634.81,Florida,96778.92 36 | 46426.07,157693.92,210797.67,California,96712.8 37 | 46014.02,85047.44,205517.64,New York,96479.51 38 | 28663.76,127056.21,201126.82,Florida,90708.19 39 | 44069.95,51283.14,197029.42,California,89949.14 40 | 20229.59,65947.93,185265.1,New York,81229.06 41 | 38558.51,82982.09,174999.3,California,81005.76 42 | 28754.33,118546.05,172795.67,California,78239.91 43 | 27892.92,84710.77,164470.71,Florida,77798.83 44 | 23640.93,96189.63,148001.11,California,71498.49 45 | 15505.73,127382.3,35534.17,New York,69758.98 46 | 22177.74,154806.14,28334.72,California,65200.33 47 | 1000.23,124153.04,1903.93,New York,64926.08 48 | 1315.46,115816.21,297114.46,Florida,49490.75 49 | 0,135426.92,0,California,42559.73 50 | 542.05,51743.15,0,New York,35673.41 51 | 0,116983.8,45173.06,California,14681.4 --------------------------------------------------------------------------------