├── Data_Cleaning.py
├── Data_Exploration.py
├── Data_Manipulation.py
└── Regression.py

/Data_Cleaning.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Mon Jul 15 08:27:42 2019

@author: KJee
"""

import pandas as pd

data = pd.read_csv('craigslistVehicles.csv')
data.columns

data.describe()

#remove duplicates
data.drop_duplicates(inplace=True)

#check for nulls / % of nulls
data.isnull().any()
data.isnull().sum() / data.shape[0]

#remove columns or rows past a certain threshold of nulls
#thresh is the minimum number of non-null values required to KEEP a column (axis=1) or row (axis=0)
#dropna returns a copy; assign the result or pass inplace=True to keep it
thresh = len(data) * .6
data.dropna(thresh=thresh, axis=1)
data.dropna(thresh=21, axis=0)

#imputing nulls with fillna() (again, assign or use inplace=True to keep the result)
data.odometer.fillna(data.odometer.median())
data.odometer.fillna(data.odometer.mean())

#everything lower or uppercase (cast to str first so null descriptions don't raise)
data.desc.head()
data.desc.head().apply(lambda x: str(x).lower())
data.desc.head().apply(lambda x: str(x).upper())

#other useful string-cleaning tools: regex with .str.extract(), .strip(), .replace(), .str.split()

data.cylinders.dtype
data.cylinders.value_counts()
data.cylinders = data.cylinders.apply(lambda x: str(x).replace('cylinders', '').strip())
data.cylinders.value_counts()

#change data type; errors='coerce' turns unparseable values into NaN
data.cylinders = pd.to_numeric(data.cylinders, errors='coerce')


#boxplot
data.boxplot('price')
data.boxplot('odometer')

#outlier detection: keep rows inside the middle 99% of prices
#(z-score filtering is an alternative, sketched below)
numeric = data._get_numeric_data()  #private helper; data.select_dtypes(include='number') is the public route

from scipy import stats
import numpy as np

data_outliers = data[(data.price < data.price.quantile(.995)) & (data.price > data.price.quantile(.005))]

data_outliers.boxplot('price')
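
#a minimal sketch of the z-score alternative mentioned above: keep rows whose
#price is within 3 standard deviations of the mean (the 3.0 cutoff is a common
#heuristic, not something fixed by this project)
price = data.price.dropna()
z = np.abs(stats.zscore(price))
data_zscore = data.loc[price.index[z < 3]]
data_zscore.boxplot('price')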

#remove duplicates again; subset= and keep= control which columns and which copy count
data.drop_duplicates()

#histogram
data_outliers.price.hist()

#types of normalization: MinMaxScaler squeezes values into the [0, 1] range
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(data.cylinders.values.reshape(-1, 1))
scaler.transform(data.cylinders.values.reshape(-1, 1))
--------------------------------------------------------------------------------
/Data_Exploration.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Tue Jul 2 21:46:40 2019

@author: Ken
"""



""" What is Pandas?

Pandas is a very flexible data science library that allows us to load in data, manipulate it, and analyze it.

- Pandas uses DataFrames, which we can think of as tables. We can use functions to create or manipulate rows and columns.
- Using pandas DataFrames, we can perform functions on the rows or columns to understand trends in our data.
- Pandas also has some visualization tools that we can use to better understand our data.

A tiny sketch of the DataFrame idea follows the import below.
"""

import pandas as pd
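
#a tiny illustrative sketch (toy data, not part of the craigslist set):
#a DataFrame is just a table with labeled rows and columns
toy = pd.DataFrame({'make': ['honda', 'ford', 'toyota'], 'price': [4500, 3200, 6100]})
toy.shape            #(3, 2) -> (rows, columns)
toy['price'].mean()  #column operations work the same way on the real data below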

# read in data from working directory (the folder shown in the top right of Spyder)
# can read in from anywhere if the full path is passed to pd.read_csv()
data = pd.read_csv('craigslistVehicles.csv')

#rows and columns: returns (rows, columns)
data.shape

#returns the first x rows with head(num); without a number it returns 5
data.head()

#returns the last x rows with tail(num); without a number it returns 5
data.tail()

#returns an object with all of the column headers
data.columns

#basic information on all columns
data.info()

#gives basic statistics on numeric columns
data.describe()

#shows what type each column was read in as (float, int, string, bool, etc.)
data.dtypes

#shows which values are null
data.isnull()

#shows which columns have null values
data.isnull().any()

#shows, for each column, the percentage of null values
data.isnull().sum() / data.shape[0]

# for categorical variables

#shows unique values that appear in the column
#data.type is shorthand for data['type']
data.type.unique()

#shows the counts for those unique values
data.type.value_counts()

#shows each unique value as a percentage of the non-null values
data.type.value_counts() / data.type.notnull().sum()

data.cylinders.head(10)

############################################################# Graphing #######################################################
#histogram of year
# data.year.hist() is equivalent to data.year.plot(kind='hist')
data.year.hist()
data.year.hist(bins=100)

#bar chart of types
data.type.value_counts().plot(kind='bar')

--------------------------------------------------------------------------------
/Data_Manipulation.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Mon Jul 8 10:17:05 2019

@author: KJee
"""

#load in the pandas module
import pandas as pd

# read in data from working directory (the folder shown in the top right of Spyder)
# can read in from anywhere if the full path is passed to pd.read_csv()
data = pd.read_csv('craigslistVehicles.csv')

#view columns & rename columns (rename returns a copy; assign or pass inplace=True to keep it)
data.columns
data.rename(index=str, columns={"url": "new_url"})
data.rename(index=str, columns={"new_url": "url"})

# view all rows for one column
data['url']

#view all columns for a select group of rows
data[0:10]

#filter for multiple columns (all three lines below do the same thing)
data[['url','city','price']]
data.loc[:, ['url','city','price']]
data.iloc[:, 0:3]

#filter by rows and columns
data.loc[0:100, ['url','city','price']]
data.iloc[0:100, 0:3]

#filter by column list
data[data.columns]


#drop columns; axis=1 means columns, and drop returns a copy unless inplace=True
data.drop('url', axis=1)
data.drop(['url','price'], axis=1)

#add column
data['age'] = 2019 - data['year']

#filtering data by columns & boolean indexing
data[(data['age'] < 5)]

data.loc[(data.age < 5), :]

# basic operators on columns
data['price_per_mile'] = data['price'] / data['odometer']

# apply function

def timex2(x):
    return 2 * x

data['price2x'] = data['price'].apply(timex2)
data['price'].head()
data['price2x'].head()

#lambda function
data['price3x'] = data['price'].apply(lambda x: x * 3)
data['price3x'].head()

#ternary operator
data['expensive'] = data['price'].apply(lambda x: 'expensive' if x > 10000 else 'cheap')

data['newandcheap'] = data.apply(lambda x: 'yes' if x['price'] < 10000 and x['age'] < 5 else 'no', axis=1)
#same idea, indexing the (price, age) pair positionally
data['newandcheap2'] = data[['price','age']].apply(lambda x: 'yes' if x[0] < 10000 and x[1] < 5 else 'no', axis=1)

#binning with pd.cut / pd.qcut
pd.qcut(data.price, 5)  #equal-sized bins: the same number of rows in each
pd.cut(data.price, 5)   #equal-width bins: the same price range in each

#dummy variables
data_dummies = pd.get_dummies(data[['price','year','fuel','transmission','type']])

#pivot table / sort_index / sort_values
data.pivot_table(index='year', columns='type', values='price', aggfunc='mean').sort_index(ascending=False)
data.pivot_table(index='year', columns='type', values='price', aggfunc='count').sort_index(ascending=False)
data.pivot_table(index='year', columns='type', values='price', aggfunc='count').sort_index(ascending=False).plot()

#groupby (newer pandas needs numeric_only=True to average only the numeric columns)
data.groupby('type').mean()
data.groupby(['type','fuel']).mean()
data.groupby(['type','fuel'], as_index=False).mean()

# pd.merge is the equivalent of a SQL join
df1 = data[['url','city']]
df2 = data[['url','price']]

df_merged = pd.merge(df1, df2, on='url')
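
#pd.merge defaults to an inner join; the how= parameter selects the SQL join
#type (these calls reuse df1/df2 from above and are just for illustration)
pd.merge(df1, df2, on='url', how='left')   #keep every row of df1
pd.merge(df1, df2, on='url', how='outer')  #keep unmatched rows from both sides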

#append and concatenate (DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent)
data100 = data.sample(100, random_state=1)
data1002 = data.sample(100, random_state=2)

data100.append(data1002)
pd.concat([data100, data1002], axis=0)

# write to a csv file with to_csv(); index=False would skip writing the row index
data100.to_csv('data100.csv')

--------------------------------------------------------------------------------
/Regression.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Sun Aug 11 12:37:26 2019

@author: Ken
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

data = pd.read_csv('craigslistVehicles.csv')
data.columns

data.describe()

#remove duplicates
data.drop_duplicates(inplace=True)

#check for nulls / % of nulls
data.isnull().any()
data.isnull().sum() / data.shape[0]

#remove columns or rows past a certain threshold of nulls
#(disabled here; thresh is the minimum number of non-null values required to keep)
"""thresh = len(data)*.6
data.dropna(thresh = thresh, axis = 1)
data.dropna(thresh = 21, axis = 0)
"""

#everything lower or uppercase (cast to str first so null descriptions don't raise)
data.desc.head()
data.desc.head().apply(lambda x: str(x).lower())
data.desc.head().apply(lambda x: str(x).upper())
data['text_len'] = data.desc.apply(lambda x: len(str(x)))
#how many description lengths occur more than once (a rough proxy for duplicated listings)
(data['text_len'].value_counts() > 1).sum()

data.cylinders.dtype
data.cylinders.value_counts()
data.cylinders = data.cylinders.apply(lambda x: str(x).replace('cylinders', '').strip())
data.cylinders.value_counts()

#change data type; errors='coerce' turns unparseable values into NaN
data.cylinders = pd.to_numeric(data.cylinders, errors='coerce')
data.cylinders.value_counts()

#boxplot
data.boxplot('price')
data.boxplot('odometer')

data.price.max()
data.odometer.max()

#outlier detection: keep rows inside the middle 99% of price and odometer, and drop zeros
#(.copy() avoids SettingWithCopyWarning when we mutate data_outliers below)
data_outliers = data[(data.price < data.price.quantile(.995)) & (data.price > data.price.quantile(.005)) & (data.price != 0) & (data.odometer != 0)].copy()
data_outliers = data_outliers[(data_outliers.odometer < data_outliers.odometer.quantile(.995)) & (data_outliers.odometer > data_outliers.odometer.quantile(.005))]

#histogram
data_outliers[['price','odometer','cylinders','text_len']].hist()

#share of nulls remaining after the outlier filtering
data_outliers.isnull().sum() / data_outliers.shape[0]

#imputing nulls with fillna() / dropping rows where imputation makes little sense
data_outliers.dropna(subset=['manufacturer','make','fuel','transmission','title_status','year'], inplace=True)
data_outliers.isnull().sum() / data_outliers.shape[0]

data_outliers.cylinders.fillna(data_outliers.cylinders.median(), inplace=True)
data_outliers.isnull().sum() / data_outliers.shape[0]

data_outliers[['condition','VIN','drive','type','paint_color']] = data_outliers[['condition','VIN','drive','type','paint_color']].fillna('n/a')
data_outliers.isnull().sum() / data_outliers.shape[0]

data_outliers.VIN = data_outliers.VIN.apply(lambda x: 'has_vin' if x != 'n/a' else 'no_vin')

data_final = data_outliers.drop(['city_url','url','city','size','desc','lat','long','image_url'], axis=1)
data_final['constant'] = 1  #statsmodels OLS needs an explicit intercept column
data_final['age'] = 2019 - data_final.year
data_final.isnull().any()

numeric = data_final._get_numeric_data()

import seaborn as sns

corrdata = numeric

#correlation heatmap of the numeric columns
corr = corrdata.corr()
ax = sns.heatmap(
    corr,
    vmin=-1, vmax=1, center=0,
    cmap=sns.diverging_palette(20, 220, n=200),
    square=True
)
ax.set_xticklabels(
    ax.get_xticklabels(),
    rotation=45,
    horizontalalignment='right'
);

#simple linear regression on odometer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
import math

#sklearn expects 2-D arrays, hence the reshape
X1 = data_final.odometer.values.reshape(-1, 1)
y1 = data_final.price.values.reshape(-1, 1)

#create train / test split for validation
X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, y1, test_size=0.3, random_state=0)

reg = LinearRegression().fit(X_train1, y_train1)
reg.score(X_train1, y_train1)  #R^2 on the training data
reg.coef_
y_hat1 = reg.predict(X_train1)

plt.scatter(X_train1, y_train1)
plt.scatter(X_train1, y_hat1)
plt.show()

y_hat_test1 = reg.predict(X_test1)
plt.scatter(X_test1, y_test1)
plt.scatter(X_test1, y_hat_test1)
plt.show()

#MSE & RMSE penalize large errors more than MAE
mae = mean_absolute_error(y_test1, y_hat_test1)
rmse = math.sqrt(mean_squared_error(y_test1, y_hat_test1))
print('Root Mean Squared Error = ', rmse)
print('Mean Absolute Error = ', mae)

import statsmodels.api as sm

X1b = data_final[['constant','odometer']]
y1b = data_final.price.values

X_train1b, X_test1b, y_train1b, y_test1b = train_test_split(X1b, y1b, test_size=0.3, random_state=0)

reg_sm1b = sm.OLS(y_train1b, X_train1b).fit()
reg_sm1b.summary()


#multiple linear regression
from statsmodels.stats.outliers_influence import variance_inflation_factor

X2 = data_final[['constant','age','odometer','cylinders','text_len']]
y2 = data_final.price.values

X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size=0.3, random_state=0)

reg_sm2 = sm.OLS(y_train2, X_train2).fit()
reg_sm2.summary()

#variance inflation factor (VIF) per predictor, to check for multicollinearity
vifs = pd.Series([variance_inflation_factor(X2.values, i) for i in range(X2.shape[1])], index=X2.columns)
vifs
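
#interpreting the VIFs above: values beyond ~10 are a common (heuristic, not
#universal) flag that a predictor is highly collinear with the others
vifs[vifs > 10]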

#full regression with dummy variables for the categorical columns
X3 = pd.get_dummies(data_final[['constant','age','odometer','text_len','cylinders','condition','fuel','VIN','type']])
y3 = data_final.price.values

X_train3, X_test3, y_train3, y_test3 = train_test_split(X3, y3, test_size=0.3, random_state=0)

reg_sm3 = sm.OLS(y_train3, X_train3).fit()
reg_sm3.summary()

y_hat3 = reg_sm3.predict(X_test3)

rmse3 = math.sqrt(mean_squared_error(y_test3, y_hat3))

plt.scatter(y_hat3, y_test3)

#cross validation, 5 fold
from sklearn.model_selection import cross_val_score
X4 = pd.get_dummies(data_final[['age','odometer','cylinders','condition','fuel','VIN','type']])
y4 = data_final.price.values

X_train4, X_test4, y_train4, y_test4 = train_test_split(X4, y4, test_size=0.3, random_state=0)

reg4 = LinearRegression().fit(X_train4, y_train4)
reg4.score(X_train4, y_train4)

#scores come back as negative MSE, so flip the sign and take the root for per-fold RMSE
scores = cross_val_score(reg4, X4, y4, cv=5, scoring='neg_mean_squared_error')
np.sqrt(np.abs(scores))
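
#a small wrap-up sketch: collapse the per-fold RMSEs above into one average
#for comparison with the holdout rmse3 computed earlier
print('Mean CV RMSE = ', np.sqrt(np.abs(scores)).mean())

--------------------------------------------------------------------------------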