├── README.md ├── 2__Regression ├── Step-by-step-Blueprints-For-Building-Models.pdf ├── Position_Salaries.csv ├── polynomial_linear_regression.py ├── Salary_Data.csv ├── simple_linear_regression.py ├── multiple_linear_regression.py └── 50_Startups.csv ├── 0__datasets ├── Data.csv ├── Position_Salaries.csv ├── Salary_Data.csv └── 50_Startups.csv ├── 1__Data_Preprocessing └── Data.csv ├── .gitignore └── .gitattributes /README.md: -------------------------------------------------------------------------------- 1 | # Machine-Learning 2 | Started with Machine Learning using Python. 3 | -------------------------------------------------------------------------------- /2__Regression/Step-by-step-Blueprints-For-Building-Models.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ranajoy-dutta/Machine-Learning/master/2__Regression/Step-by-step-Blueprints-For-Building-Models.pdf -------------------------------------------------------------------------------- /0__datasets/Data.csv: -------------------------------------------------------------------------------- 1 | Country,Age,Salary,Purchased 2 | France,44,72000,No 3 | Spain,27,48000,Yes 4 | Germany,30,54000,No 5 | Spain,38,61000,No 6 | Germany,40,,Yes 7 | France,35,58000,Yes 8 | Spain,,52000,No 9 | France,48,79000,Yes 10 | Germany,50,83000,No 11 | France,37,67000,Yes -------------------------------------------------------------------------------- /1__Data_Preprocessing/Data.csv: -------------------------------------------------------------------------------- 1 | Country,Age,Salary,Purchased 2 | France,44,72000,No 3 | Spain,27,48000,Yes 4 | Germany,30,54000,No 5 | Spain,38,61000,No 6 | Germany,40,,Yes 7 | France,35,58000,Yes 8 | Spain,,52000,No 9 | France,48,79000,Yes 10 | Germany,50,83000,No 11 | France,37,67000,Yes -------------------------------------------------------------------------------- /0__datasets/Position_Salaries.csv: 
-------------------------------------------------------------------------------- 1 | Position,Level,Salary 2 | Business Analyst,1,45000 3 | Junior Consultant,2,50000 4 | Senior Consultant,3,60000 5 | Manager,4,80000 6 | Country Manager,5,110000 7 | Region Manager,6,150000 8 | Partner,7,200000 9 | Senior Partner,8,300000 10 | C-level,9,500000 11 | CEO,10,1000000 -------------------------------------------------------------------------------- /2__Regression/Position_Salaries.csv: -------------------------------------------------------------------------------- 1 | Position,Level,Salary 2 | Business Analyst,1,45000 3 | Junior Consultant,2,50000 4 | Senior Consultant,3,60000 5 | Manager,4,80000 6 | Country Manager,5,110000 7 | Region Manager,6,150000 8 | Partner,7,200000 9 | Senior Partner,8,300000 10 | C-level,9,500000 11 | CEO,10,1000000 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Windows thumbnail cache files 2 | Thumbs.db 3 | ehthumbs.db 4 | ehthumbs_vista.db 5 | 6 | # Folder config file 7 | Desktop.ini 8 | 9 | # Recycle Bin used on file shares 10 | $RECYCLE.BIN/ 11 | 12 | # Windows Installer files 13 | *.cab 14 | *.msi 15 | *.msm 16 | *.msp 17 | 18 | # Windows shortcuts 19 | *.lnk 20 | 21 | # ========================= 22 | # Operating System Files 23 | # ========================= 24 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # Custom for Visual Studio 5 | *.cs diff=csharp 6 | 7 | # Standard to msysgit 8 | *.doc diff=astextplain 9 | *.DOC diff=astextplain 10 | *.docx diff=astextplain 11 | *.DOCX diff=astextplain 12 | *.dot diff=astextplain 13 | *.DOT diff=astextplain 14 | *.pdf diff=astextplain 15 | *.PDF 
# Polynomial Linear Regression
"""
AIM :: Predict the salary of a new employee from a model trained on the
salaries of current employees according to their position (LEVEL).
"""

# The plotting API lives in the pyplot submodule; aliasing the top-level
# matplotlib package as ``plt`` would make calls such as plt.plot() fail.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

# Position_Salaries.csv has the columns Position, Level, Salary.
dataset = pd.read_csv("Position_Salaries.csv")
# The 1:2 slice (instead of a bare 1) keeps Level as a two-dimensional
# selection, which is the shape the regressor expects for its features.
x = dataset.iloc[:, 1:2]
y = dataset.iloc[:, 2]

# Linear baseline: fit Salary against Level.
# NOTE(review): no polynomial features are generated yet, so despite the
# script's title only this straight-line model is trained.
linearRegressor = LinearRegression()
linearRegressor.fit(x, y)
# Simple Linear Regression

# Importing the libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
# train_test_split is provided by sklearn.model_selection; the former
# sklearn.cross_validation module no longer exists in scikit-learn.
from sklearn.model_selection import train_test_split

# Importing the dataset: every column except the last is a feature
# (YearsExperience) and the last column is the target (Salary).
dataset = pd.read_csv('Salary_Data.csv')
x = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

# Splitting the dataset into the Training set and Test set
# (one third held out, fixed seed so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=1/3, random_state=0)

# Fitting Linear Regression to the training set
regressor = LinearRegression()
regressor.fit(X_train, y_train)

# Predicting the results on test set
y_pred = regressor.predict(X_test)

# Visualising the results of the training set
plt.scatter(X_train, y_train, color="red")
plt.plot(X_train, regressor.predict(X_train), color="blue")
# NOTE(review): this joins the raw training points with line segments in
# whatever order the split returned them; kept as in the original, but
# confirm it is intended since the scatter already shows the same points.
plt.plot(X_train, y_train, color="yellow")
# Labels and title must be set by *calling* the pyplot functions; plain
# assignments would leave the axes unlabelled and overwrite plt.title.
plt.xlabel("experience")
plt.ylabel("Salary")
plt.title("Salary vs experience")
plt.show()
# Multiple Linear Regression on the 50_Startups data.
import numpy as np
import pandas as pd
# OLS is exposed by statsmodels.api (statsmodels.formula.api only offers the
# lowercase, formula-based ``ols``).
import statsmodels.api as sm
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
# sklearn.cross_validation no longer exists; model_selection replaces it.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# importing dataset and setting explanatory variable and dependent variable
# CSV columns: R&D Spend, Administration, Marketing Spend, State, Profit.
# The file name matches the repository's spelling exactly so the script also
# works on case-sensitive file systems.
dataset = pd.read_csv('50_Startups.csv')
x = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 4].values

# encoding categorical data (here state name)
# OneHotEncoder no longer accepts ``categorical_features``; a
# ColumnTransformer one-hot encodes column 3 and passes the numeric columns
# through unchanged, with the dummy columns placed first.
# sparse_threshold=0 forces a dense array, and the float cast removes the
# object dtype inherited from the mixed-type input.
onehotencoder = ColumnTransformer(
    transformers=[('state', OneHotEncoder(), [3])],
    remainder='passthrough',
    sparse_threshold=0,
)
x = np.asarray(onehotencoder.fit_transform(x), dtype=float)

# avoiding the dummy variable trap (drop the first dummy column)
x = x[:, 1:]

# splitting dataset
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0)

# fitting multiple linear regression to training set
regressor = LinearRegression()
regressor.fit(x_train, y_train)

# predicting the test set results
y_pred = regressor.predict(x_test)

# building the optimal model using backward elimination
# OLS is given an explicit column of ones as the intercept term; its length
# is taken from the data instead of being hard-coded to 50 rows.
x = np.append(arr=np.ones((x.shape[0], 1)), values=x, axis=1)

# Column subsets tried in turn; index 0 is the intercept column added above.
# NOTE(review): each step drops one more predictor, presumably the one with
# the highest p-value in the previous summary -- confirm against the output.
elimination_steps = [
    [0, 1, 2, 3, 4, 5],
    [0, 1, 3, 4, 5],
    [0, 3, 4, 5],
    [0, 3, 5],
    [0, 3],
]
for columns in elimination_steps:
    x_optimal = x[:, columns]
    regressor_ols = sm.OLS(endog=y, exog=x_optimal).fit()
    # Print the summary: a bare expression statement shows nothing when the
    # file is run as a script.
    print(regressor_ols.summary())
-------------------------------------------------------------------------------- 1 | R&D Spend,Administration,Marketing Spend,State,Profit 2 | 165349.2,136897.8,471784.1,New York,192261.83 3 | 162597.7,151377.59,443898.53,California,191792.06 4 | 153441.51,101145.55,407934.54,Florida,191050.39 5 | 144372.41,118671.85,383199.62,New York,182901.99 6 | 142107.34,91391.77,366168.42,Florida,166187.94 7 | 131876.9,99814.71,362861.36,New York,156991.12 8 | 134615.46,147198.87,127716.82,California,156122.51 9 | 130298.13,145530.06,323876.68,Florida,155752.6 10 | 120542.52,148718.95,311613.29,New York,152211.77 11 | 123334.88,108679.17,304981.62,California,149759.96 12 | 101913.08,110594.11,229160.95,Florida,146121.95 13 | 100671.96,91790.61,249744.55,California,144259.4 14 | 93863.75,127320.38,249839.44,Florida,141585.52 15 | 91992.39,135495.07,252664.93,California,134307.35 16 | 119943.24,156547.42,256512.92,Florida,132602.65 17 | 114523.61,122616.84,261776.23,New York,129917.04 18 | 78013.11,121597.55,264346.06,California,126992.93 19 | 94657.16,145077.58,282574.31,New York,125370.37 20 | 91749.16,114175.79,294919.57,Florida,124266.9 21 | 86419.7,153514.11,0,New York,122776.86 22 | 76253.86,113867.3,298664.47,California,118474.03 23 | 78389.47,153773.43,299737.29,New York,111313.02 24 | 73994.56,122782.75,303319.26,Florida,110352.25 25 | 67532.53,105751.03,304768.73,Florida,108733.99 26 | 77044.01,99281.34,140574.81,New York,108552.04 27 | 64664.71,139553.16,137962.62,California,107404.34 28 | 75328.87,144135.98,134050.07,Florida,105733.54 29 | 72107.6,127864.55,353183.81,New York,105008.31 30 | 66051.52,182645.56,118148.2,Florida,103282.38 31 | 65605.48,153032.06,107138.38,New York,101004.64 32 | 61994.48,115641.28,91131.24,Florida,99937.59 33 | 61136.38,152701.92,88218.23,New York,97483.56 34 | 63408.86,129219.61,46085.25,California,97427.84 35 | 55493.95,103057.49,214634.81,Florida,96778.92 36 | 46426.07,157693.92,210797.67,California,96712.8 37 | 
46014.02,85047.44,205517.64,New York,96479.51 38 | 28663.76,127056.21,201126.82,Florida,90708.19 39 | 44069.95,51283.14,197029.42,California,89949.14 40 | 20229.59,65947.93,185265.1,New York,81229.06 41 | 38558.51,82982.09,174999.3,California,81005.76 42 | 28754.33,118546.05,172795.67,California,78239.91 43 | 27892.92,84710.77,164470.71,Florida,77798.83 44 | 23640.93,96189.63,148001.11,California,71498.49 45 | 15505.73,127382.3,35534.17,New York,69758.98 46 | 22177.74,154806.14,28334.72,California,65200.33 47 | 1000.23,124153.04,1903.93,New York,64926.08 48 | 1315.46,115816.21,297114.46,Florida,49490.75 49 | 0,135426.92,0,California,42559.73 50 | 542.05,51743.15,0,New York,35673.41 51 | 0,116983.8,45173.06,California,14681.4 -------------------------------------------------------------------------------- /2__Regression/50_Startups.csv: -------------------------------------------------------------------------------- 1 | R&D Spend,Administration,Marketing Spend,State,Profit 2 | 165349.2,136897.8,471784.1,New York,192261.83 3 | 162597.7,151377.59,443898.53,California,191792.06 4 | 153441.51,101145.55,407934.54,Florida,191050.39 5 | 144372.41,118671.85,383199.62,New York,182901.99 6 | 142107.34,91391.77,366168.42,Florida,166187.94 7 | 131876.9,99814.71,362861.36,New York,156991.12 8 | 134615.46,147198.87,127716.82,California,156122.51 9 | 130298.13,145530.06,323876.68,Florida,155752.6 10 | 120542.52,148718.95,311613.29,New York,152211.77 11 | 123334.88,108679.17,304981.62,California,149759.96 12 | 101913.08,110594.11,229160.95,Florida,146121.95 13 | 100671.96,91790.61,249744.55,California,144259.4 14 | 93863.75,127320.38,249839.44,Florida,141585.52 15 | 91992.39,135495.07,252664.93,California,134307.35 16 | 119943.24,156547.42,256512.92,Florida,132602.65 17 | 114523.61,122616.84,261776.23,New York,129917.04 18 | 78013.11,121597.55,264346.06,California,126992.93 19 | 94657.16,145077.58,282574.31,New York,125370.37 20 | 91749.16,114175.79,294919.57,Florida,124266.9 21 | 
86419.7,153514.11,0,New York,122776.86 22 | 76253.86,113867.3,298664.47,California,118474.03 23 | 78389.47,153773.43,299737.29,New York,111313.02 24 | 73994.56,122782.75,303319.26,Florida,110352.25 25 | 67532.53,105751.03,304768.73,Florida,108733.99 26 | 77044.01,99281.34,140574.81,New York,108552.04 27 | 64664.71,139553.16,137962.62,California,107404.34 28 | 75328.87,144135.98,134050.07,Florida,105733.54 29 | 72107.6,127864.55,353183.81,New York,105008.31 30 | 66051.52,182645.56,118148.2,Florida,103282.38 31 | 65605.48,153032.06,107138.38,New York,101004.64 32 | 61994.48,115641.28,91131.24,Florida,99937.59 33 | 61136.38,152701.92,88218.23,New York,97483.56 34 | 63408.86,129219.61,46085.25,California,97427.84 35 | 55493.95,103057.49,214634.81,Florida,96778.92 36 | 46426.07,157693.92,210797.67,California,96712.8 37 | 46014.02,85047.44,205517.64,New York,96479.51 38 | 28663.76,127056.21,201126.82,Florida,90708.19 39 | 44069.95,51283.14,197029.42,California,89949.14 40 | 20229.59,65947.93,185265.1,New York,81229.06 41 | 38558.51,82982.09,174999.3,California,81005.76 42 | 28754.33,118546.05,172795.67,California,78239.91 43 | 27892.92,84710.77,164470.71,Florida,77798.83 44 | 23640.93,96189.63,148001.11,California,71498.49 45 | 15505.73,127382.3,35534.17,New York,69758.98 46 | 22177.74,154806.14,28334.72,California,65200.33 47 | 1000.23,124153.04,1903.93,New York,64926.08 48 | 1315.46,115816.21,297114.46,Florida,49490.75 49 | 0,135426.92,0,California,42559.73 50 | 542.05,51743.15,0,New York,35673.41 51 | 0,116983.8,45173.06,California,14681.4 --------------------------------------------------------------------------------