├── README.md
└── ds_common_functions.py

/README.md:
--------------------------------------------------------------------------------
# data_science_starter
This is some starter / reference code for those interested in getting started with data science.
--------------------------------------------------------------------------------
/ds_common_functions.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Sun Mar 15 12:52:51 2020

@author: Ken
"""

# import exploration libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

file_path = 'path_to_data.csv'

# read in data
data = pd.read_csv(file_path)


##############################################################################
# Data Exploration
##############################################################################

# returns the dimensions as a (rows, columns) tuple
data.shape

# returns the first num rows with head(num); without an argument it returns 5
data.head()

# returns the last num rows with tail(num); without an argument it returns 5
data.tail()

# returns an object with all of the column headers
data.columns

# basic information on all columns
data.info()

# basic statistics on numeric columns
data.describe()

# shows what type each column was read in as (float, int, object, bool, etc.)
data.dtypes

# shows which values are null
data.isnull()

# shows which columns have null values
data.isnull().any()

# fraction of null values in each column
data.isnull().sum() / data.shape[0]

# plot histograms for all numeric columns
data.hist()
plt.show()


##############################################################################
# Data Manipulation
##############################################################################

# rename columns (add inplace=True to save over the current dataframe)
data.rename(columns={'col_oldname': 'col_newname'})

# view all rows for one column (dot access only works for valid Python names)
data.col_name
data['col_name']

# multiple columns by name
data[['col1', 'col2']]
data.loc[:, ['col1', 'col2']]

# columns by index position
data.iloc[:, 0:2]

# drop a column (add inplace=True to save over the current dataframe)
data.drop('colname', axis=1)
# drop multiple columns
data.drop(['col1', 'col2'], axis=1)

# lambda function applied row by row (x is a row when axis=1)
data.apply(lambda x: x.colname ** 2, axis=1)

# pivot table
pd.pivot_table(data, index='col_name', values='col2', columns='col3')

# merge == JOIN in SQL (data1 and data2 are two dataframes sharing 'col1')
pd.merge(data1, data2, how='inner', on='col1')

# write to csv (add index=False to leave out the row index)
data.to_csv('data_out.csv')
--------------------------------------------------------------------------------
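Below is a minimal sketch of how the snippets above can fit together end to end: read a file, check missingness, clean it up, build a pivot-table summary, and write the results back out. The file names and the column names ('category', 'group', 'value') are placeholders for illustration, not part of the original script; substitute your own.

# minimal end-to-end example (assumes a CSV with 'category', 'group', 'value' columns)
import pandas as pd

data = pd.read_csv('path_to_data.csv')

# quick look at size, types, and the fraction of nulls per column
print(data.shape)
print(data.dtypes)
print(data.isnull().sum() / data.shape[0])

# drop rows that are entirely null, then fill remaining gaps with 0
data = data.dropna(how='all').fillna(0)

# average 'value' for each category, split out by group
summary = pd.pivot_table(data, index='category', columns='group',
                         values='value', aggfunc='mean')

# save the cleaned data and the summary
data.to_csv('data_clean.csv', index=False)
summary.to_csv('summary.csv')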