├── Data_Cleaning.py
├── Data_Exploration.py
├── Data_Manipulation.py
└── Regression.py

/Data_Cleaning.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Mon Jul 15 08:27:42 2019

@author: KJee
"""

import pandas as pd

data = pd.read_csv('craigslistVehicles.csv')
data.columns

data.describe()

#remove duplicates
data.drop_duplicates(inplace=True)

#check for nulls / % of nulls
data.isnull().any()
data.isnull().sum() / data.shape[0]

#remove columns or rows past a certain threshold of nulls
#thresh is the minimum number of non-null values required to KEEP a column (axis=1) or row (axis=0)
#dropna returns a copy; assign the result or pass inplace=True to keep it
thresh = len(data) * .6
data.dropna(thresh=thresh, axis=1)
data.dropna(thresh=21, axis=0)

#imputing nulls with fillna() (again, assign or use inplace=True to keep the result)
data.odometer.fillna(data.odometer.median())
data.odometer.fillna(data.odometer.mean())

#everything lower or uppercase (cast to str first so null descriptions don't raise)
data.desc.head()
data.desc.head().apply(lambda x: str(x).lower())
data.desc.head().apply(lambda x: str(x).upper())

#other useful string-cleaning tools: regex with .str.extract(), .strip(), .replace(), .str.split()

data.cylinders.dtype
data.cylinders.value_counts()
data.cylinders = data.cylinders.apply(lambda x: str(x).replace('cylinders', '').strip())
data.cylinders.value_counts()

#change data type; errors='coerce' turns unparseable values into NaN
data.cylinders = pd.to_numeric(data.cylinders, errors='coerce')


#boxplot
data.boxplot('price')
data.boxplot('odometer')

#outlier detection: keep rows inside the middle 99% of prices
#(z-score filtering is an alternative, sketched below)
numeric = data._get_numeric_data()  #private helper; data.select_dtypes(include='number') is the public route

from scipy import stats
import numpy as np

data_outliers = data[(data.price < data.price.quantile(.995)) & (data.price > data.price.quantile(.005))]

data_outliers.boxplot('price')
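
#a minimal sketch of the z-score alternative mentioned above: keep rows whose
#price is within 3 standard deviations of the mean (the 3.0 cutoff is a common
#heuristic, not something fixed by this project)
price = data.price.dropna()
z = np.abs(stats.zscore(price))
data_zscore = data.loc[price.index[z < 3]]
data_zscore.boxplot('price')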

#remove duplicates again; subset= and keep= control which columns and which copy count
data.drop_duplicates()

#histogram
data_outliers.price.hist()

#types of normalization: MinMaxScaler squeezes values into the [0, 1] range
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(data.cylinders.values.reshape(-1, 1))
scaler.transform(data.cylinders.values.reshape(-1, 1))
--------------------------------------------------------------------------------
/Data_Exploration.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Tue Jul 2 21:46:40 2019

@author: Ken
"""



""" What is Pandas?

Pandas is a very flexible data science library that allows us to load in data, manipulate it, and analyze it.

- Pandas uses DataFrames, which we can think of as tables. We can use functions to create or manipulate rows and columns.
- Using pandas DataFrames, we can perform functions on the rows or columns to understand trends in our data.
- Pandas also has some visualization tools that we can use to better understand our data.

A tiny sketch of the DataFrame idea follows the import below.
"""

import pandas as pd
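
#a tiny illustrative sketch (toy data, not part of the craigslist set):
#a DataFrame is just a table with labeled rows and columns
toy = pd.DataFrame({'make': ['honda', 'ford', 'toyota'], 'price': [4500, 3200, 6100]})
toy.shape            #(3, 2) -> (rows, columns)
toy['price'].mean()  #column operations work the same way on the real data below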

# read in data from working directory (the folder shown in the top right of Spyder)
# can read in from anywhere if the full path is passed to pd.read_csv()
data = pd.read_csv('craigslistVehicles.csv')

#rows and columns: returns (rows, columns)
data.shape

#returns the first x rows with head(num); without a number it returns 5
data.head()

#returns the last x rows with tail(num); without a number it returns 5
data.tail()

#returns an object with all of the column headers
data.columns

#basic information on all columns
data.info()

#gives basic statistics on numeric columns
data.describe()

#shows what type each column was read in as (float, int, string, bool, etc.)
data.dtypes

#shows which values are null
data.isnull()

#shows which columns have null values
data.isnull().any()

#shows, for each column, the percentage of null values
data.isnull().sum() / data.shape[0]

# for categorical variables

#shows unique values that appear in the column
#data.type is shorthand for data['type']
data.type.unique()

#shows the counts for those unique values
data.type.value_counts()

#shows each unique value as a percentage of the non-null values
data.type.value_counts() / data.type.notnull().sum()

data.cylinders.head(10)

############################################################# Graphing #######################################################
#histogram of year
# data.year.hist() is equivalent to data.year.plot(kind='hist')
data.year.hist()
data.year.hist(bins=100)

#bar chart of types
data.type.value_counts().plot(kind='bar')

--------------------------------------------------------------------------------
/Data_Manipulation.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Mon Jul 8 10:17:05 2019

@author: KJee
"""

#load in the pandas module
import pandas as pd

# read in data from working directory (the folder shown in the top right of Spyder)
# can read in from anywhere if the full path is passed to pd.read_csv()
data = pd.read_csv('craigslistVehicles.csv')

#view columns & rename columns (rename returns a copy; assign or pass inplace=True to keep it)
data.columns
data.rename(index=str, columns={"url": "new_url"})
data.rename(index=str, columns={"new_url": "url"})

# view all rows for one column
data['url']

#view all columns for a select group of rows
data[0:10]

#filter for multiple columns (all three lines below do the same thing)
data[['url','city','price']]
data.loc[:, ['url','city','price']]
data.iloc[:, 0:3]

#filter by rows and columns
data.loc[0:100, ['url','city','price']]
data.iloc[0:100, 0:3]

#filter by column list
data[data.columns]


#drop columns; axis=1 means columns, and drop returns a copy unless inplace=True
data.drop('url', axis=1)
data.drop(['url','price'], axis=1)

#add column
data['age'] = 2019 - data['year']

#filtering data by columns & boolean indexing
data[(data['age'] < 5)]

data.loc[(data.age < 5), :]

# basic operators on columns
data['price_per_mile'] = data['price'] / data['odometer']

# apply function

def timex2(x):
    return 2 * x

data['price2x'] = data['price'].apply(timex2)
data['price'].head()
data['price2x'].head()

#lambda function
data['price3x'] = data['price'].apply(lambda x: x * 3)
data['price3x'].head()

#ternary operator
data['expensive'] = data['price'].apply(lambda x: 'expensive' if x > 10000 else 'cheap')

data['newandcheap'] = data.apply(lambda x: 'yes' if x['price'] < 10000 and x['age'] < 5 else 'no', axis=1)
#same idea, indexing the (price, age) pair positionally
data['newandcheap2'] = data[['price','age']].apply(lambda x: 'yes' if x[0] < 10000 and x[1] < 5 else 'no', axis=1)

#binning with pd.cut / pd.qcut
pd.qcut(data.price, 5)  #equal-sized bins: the same number of rows in each
pd.cut(data.price, 5)   #equal-width bins: the same price range in each

#dummy variables
data_dummies = pd.get_dummies(data[['price','year','fuel','transmission','type']])

#pivot table / sort_index / sort_values
data.pivot_table(index='year', columns='type', values='price', aggfunc='mean').sort_index(ascending=False)
data.pivot_table(index='year', columns='type', values='price', aggfunc='count').sort_index(ascending=False)
data.pivot_table(index='year', columns='type', values='price', aggfunc='count').sort_index(ascending=False).plot()

#groupby (newer pandas needs numeric_only=True to average only the numeric columns)
data.groupby('type').mean()
data.groupby(['type','fuel']).mean()
data.groupby(['type','fuel'], as_index=False).mean()

# pd.merge is the equivalent of a SQL join
df1 = data[['url','city']]
df2 = data[['url','price']]

df_merged = pd.merge(df1, df2, on='url')
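
#pd.merge defaults to an inner join; the how= parameter selects the SQL join
#type (these calls reuse df1/df2 from above and are just for illustration)
pd.merge(df1, df2, on='url', how='left')   #keep every row of df1
pd.merge(df1, df2, on='url', how='outer')  #keep unmatched rows from both sides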

#append and concatenate (DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent)
data100 = data.sample(100, random_state=1)
data1002 = data.sample(100, random_state=2)

data100.append(data1002)
pd.concat([data100, data1002], axis=0)

# write to a csv file with to_csv(); index=False would skip writing the row index
data100.to_csv('data100.csv')

--------------------------------------------------------------------------------
/Regression.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Sun Aug 11 12:37:26 2019

@author: Ken
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

data = pd.read_csv('craigslistVehicles.csv')
data.columns

data.describe()

#remove duplicates
data.drop_duplicates(inplace=True)

#check for nulls / % of nulls
data.isnull().any()
data.isnull().sum() / data.shape[0]

#remove columns or rows past a certain threshold of nulls
#(disabled here; thresh is the minimum number of non-null values required to keep)
"""thresh = len(data)*.6
data.dropna(thresh = thresh, axis = 1)
data.dropna(thresh = 21, axis = 0)
"""

#everything lower or uppercase (cast to str first so null descriptions don't raise)
data.desc.head()
data.desc.head().apply(lambda x: str(x).lower())
data.desc.head().apply(lambda x: str(x).upper())
data['text_len'] = data.desc.apply(lambda x: len(str(x)))
#how many description lengths occur more than once (a rough proxy for duplicated listings)
(data['text_len'].value_counts() > 1).sum()

data.cylinders.dtype
data.cylinders.value_counts()
data.cylinders = data.cylinders.apply(lambda x: str(x).replace('cylinders', '').strip())
data.cylinders.value_counts()

#change data type; errors='coerce' turns unparseable values into NaN
data.cylinders = pd.to_numeric(data.cylinders, errors='coerce')
data.cylinders.value_counts()

#boxplot
data.boxplot('price')
data.boxplot('odometer')

data.price.max()
data.odometer.max()

#outlier detection: keep rows inside the middle 99% of price and odometer, and drop zeros
#(.copy() avoids SettingWithCopyWarning when we mutate data_outliers below)
data_outliers = data[(data.price < data.price.quantile(.995)) & (data.price > data.price.quantile(.005)) & (data.price != 0) & (data.odometer != 0)].copy()
data_outliers = data_outliers[(data_outliers.odometer < data_outliers.odometer.quantile(.995)) & (data_outliers.odometer > data_outliers.odometer.quantile(.005))]

#histogram
data_outliers[['price','odometer','cylinders','text_len']].hist()

#share of nulls remaining after the outlier filtering
data_outliers.isnull().sum() / data_outliers.shape[0]

#imputing nulls with fillna() / dropping rows where imputation makes little sense
data_outliers.dropna(subset=['manufacturer','make','fuel','transmission','title_status','year'], inplace=True)
data_outliers.isnull().sum() / data_outliers.shape[0]

data_outliers.cylinders.fillna(data_outliers.cylinders.median(), inplace=True)
data_outliers.isnull().sum() / data_outliers.shape[0]

data_outliers[['condition','VIN','drive','type','paint_color']] = data_outliers[['condition','VIN','drive','type','paint_color']].fillna('n/a')
data_outliers.isnull().sum() / data_outliers.shape[0]

data_outliers.VIN = data_outliers.VIN.apply(lambda x: 'has_vin' if x != 'n/a' else 'no_vin')

data_final = data_outliers.drop(['city_url','url','city','size','desc','lat','long','image_url'], axis=1)
data_final['constant'] = 1  #statsmodels OLS needs an explicit intercept column
data_final['age'] = 2019 - data_final.year
data_final.isnull().any()

numeric = data_final._get_numeric_data()

import seaborn as sns

corrdata = numeric

#correlation heatmap of the numeric columns
corr = corrdata.corr()
ax = sns.heatmap(
    corr,
    vmin=-1, vmax=1, center=0,
    cmap=sns.diverging_palette(20, 220, n=200),
    square=True
)
ax.set_xticklabels(
    ax.get_xticklabels(),
    rotation=45,
    horizontalalignment='right'
);

#simple linear regression on odometer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
import math

#sklearn expects 2-D arrays, hence the reshape
X1 = data_final.odometer.values.reshape(-1, 1)
y1 = data_final.price.values.reshape(-1, 1)

#create train / test split for validation
X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, y1, test_size=0.3, random_state=0)

reg = LinearRegression().fit(X_train1, y_train1)
reg.score(X_train1, y_train1)  #R^2 on the training data
reg.coef_
y_hat1 = reg.predict(X_train1)

plt.scatter(X_train1, y_train1)
plt.scatter(X_train1, y_hat1)
plt.show()

y_hat_test1 = reg.predict(X_test1)
plt.scatter(X_test1, y_test1)
plt.scatter(X_test1, y_hat_test1)
plt.show()

#MSE & RMSE penalize large errors more than MAE
mae = mean_absolute_error(y_test1, y_hat_test1)
rmse = math.sqrt(mean_squared_error(y_test1, y_hat_test1))
print('Root Mean Squared Error = ', rmse)
print('Mean Absolute Error = ', mae)

import statsmodels.api as sm

X1b = data_final[['constant','odometer']]
y1b = data_final.price.values

X_train1b, X_test1b, y_train1b, y_test1b = train_test_split(X1b, y1b, test_size=0.3, random_state=0)

reg_sm1b = sm.OLS(y_train1b, X_train1b).fit()
reg_sm1b.summary()


#multiple linear regression
from statsmodels.stats.outliers_influence import variance_inflation_factor

X2 = data_final[['constant','age','odometer','cylinders','text_len']]
y2 = data_final.price.values

X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size=0.3, random_state=0)

reg_sm2 = sm.OLS(y_train2, X_train2).fit()
reg_sm2.summary()

#variance inflation factor (VIF) per predictor, to check for multicollinearity
vifs = pd.Series([variance_inflation_factor(X2.values, i) for i in range(X2.shape[1])], index=X2.columns)
vifs
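
#interpreting the VIFs above: values beyond ~10 are a common (heuristic, not
#universal) flag that a predictor is highly collinear with the others
vifs[vifs > 10]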

#full regression with dummy variables for the categorical columns
X3 = pd.get_dummies(data_final[['constant','age','odometer','text_len','cylinders','condition','fuel','VIN','type']])
y3 = data_final.price.values

X_train3, X_test3, y_train3, y_test3 = train_test_split(X3, y3, test_size=0.3, random_state=0)

reg_sm3 = sm.OLS(y_train3, X_train3).fit()
reg_sm3.summary()

y_hat3 = reg_sm3.predict(X_test3)

rmse3 = math.sqrt(mean_squared_error(y_test3, y_hat3))

plt.scatter(y_hat3, y_test3)

#cross validation, 5 fold
from sklearn.model_selection import cross_val_score
X4 = pd.get_dummies(data_final[['age','odometer','cylinders','condition','fuel','VIN','type']])
y4 = data_final.price.values

X_train4, X_test4, y_train4, y_test4 = train_test_split(X4, y4, test_size=0.3, random_state=0)

reg4 = LinearRegression().fit(X_train4, y_train4)
reg4.score(X_train4, y_train4)

#scores come back as negative MSE, so flip the sign and take the root for per-fold RMSE
scores = cross_val_score(reg4, X4, y4, cv=5, scoring='neg_mean_squared_error')
np.sqrt(np.abs(scores))
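
#a small wrap-up sketch: collapse the per-fold RMSEs above into one average
#for comparison with the holdout rmse3 computed earlier
print('Mean CV RMSE = ', np.sqrt(np.abs(scores)).mean())

--------------------------------------------------------------------------------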