├── README.md
└── ds_common_functions.py

/README.md:
--------------------------------------------------------------------------------
# data_science_starter
This is some starter / reference code for those interested in getting started with data science.
--------------------------------------------------------------------------------
/ds_common_functions.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Sun Mar 15 12:52:51 2020

@author: Ken
"""

# import exploration libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

file_path = 'path_to_data.csv'

# read in data
data = pd.read_csv(file_path)


##############################################################################
# Data Exploration
##############################################################################

# returns the dimensions as a (rows, columns) tuple
data.shape

# returns the first num rows with head(num); without an argument it returns 5
data.head()

# returns the last num rows with tail(num); without an argument it returns 5
data.tail()

# returns an object with all of the column headers
data.columns

# basic information on all columns
data.info()

# basic statistics on numeric columns
data.describe()

# shows what type each column was read in as (float, int, object, bool, etc.)
data.dtypes

# shows which values are null
data.isnull()

# shows which columns have null values
data.isnull().any()

# fraction of null values in each column
data.isnull().sum() / data.shape[0]

# plot histograms for all numeric columns
data.hist()
plt.show()


##############################################################################
# Data Manipulation
##############################################################################

# rename columns (add inplace=True to save over the current dataframe)
data.rename(columns={'col_oldname': 'col_newname'})

# view all rows for one column (dot access only works for valid Python names)
data.col_name
data['col_name']

# multiple columns by name
data[['col1', 'col2']]
data.loc[:, ['col1', 'col2']]

# columns by index position
data.iloc[:, 0:2]

# drop a column (add inplace=True to save over the current dataframe)
data.drop('colname', axis=1)
# drop multiple columns
data.drop(['col1', 'col2'], axis=1)

# lambda function applied row by row (x is a row when axis=1)
data.apply(lambda x: x.colname ** 2, axis=1)

# pivot table
pd.pivot_table(data, index='col_name', values='col2', columns='col3')

# merge == JOIN in SQL (data1 and data2 are two dataframes sharing 'col1')
pd.merge(data1, data2, how='inner', on='col1')

# write to csv (add index=False to leave out the row index)
data.to_csv('data_out.csv')
--------------------------------------------------------------------------------
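Below is a minimal sketch of how the snippets above can fit together end to end: read a file, check missingness, clean it up, build a pivot-table summary, and write the results back out. The file names and the column names ('category', 'group', 'value') are placeholders for illustration, not part of the original script; substitute your own.

# minimal end-to-end example (assumes a CSV with 'category', 'group', 'value' columns)
import pandas as pd

data = pd.read_csv('path_to_data.csv')

# quick look at size, types, and the fraction of nulls per column
print(data.shape)
print(data.dtypes)
print(data.isnull().sum() / data.shape[0])

# drop rows that are entirely null, then fill remaining gaps with 0
data = data.dropna(how='all').fillna(0)

# average 'value' for each category, split out by group
summary = pd.pivot_table(data, index='category', columns='group',
                         values='value', aggfunc='mean')

# save the cleaned data and the summary
data.to_csv('data_clean.csv', index=False)
summary.to_csv('summary.csv')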