├── .gitignore ├── Ch02 ├── Customer Churn Columns.csv ├── Customer Churn Model.csv ├── Customer Churn Model.txt ├── ScatterPlots.jpeg ├── Tab Customer Churn Model.txt ├── Titanic Description.txt ├── Write.csv ├── Write.xls ├── basicDataCheck.py ├── changeDelimiter.py ├── plotData.py ├── readCustomerChurn.py ├── readCustomerChurn2.py ├── readDatasetByOpenMethod.py ├── readURLLib2Iris.py ├── readURLMedals.py ├── readXLS.py ├── titanic3.csv ├── titanic3.xls ├── titanic3.xlsx ├── titanicReadCSV.py └── titanicReadCSV1.py ├── Ch03 ├── Customer Churn Model.txt ├── appendManyFiles.py ├── calcPi.py ├── concatenateAndAppend.py ├── generateRandomNumbers.py ├── generateRandomProbDistr.py ├── groupData.py ├── mergeJoin.py ├── seedRandomNumbers.py ├── splitDataTrainTest.py ├── subsetColsRows.py ├── subsetDataset.py ├── subsetDatasetRows.py └── subsetNewCol.py ├── Ch04 ├── NewspaperSalesCorrelationPlot.png ├── RadioSalesCorrelationPlot.png ├── TVSalesCorrelationPlot.png ├── linearRegression.py └── linearRegressionFunction.py ├── Ch05 ├── CurrentVsPredicted1.png ├── CurrentVsPredictedVsMean1.png ├── CurrentVsPredictedVsModel1.png ├── MPGVSHorsepower.png ├── MPGVSHorsepowerModels.png ├── MPGVSHorsepowerVsLine.png ├── PredictedSalesVsTVAdvertisingCosts.png ├── linearRegression.py ├── linearRegressionECom.py ├── linearRegressionRFE.py ├── linearRegressionSKL.py ├── linearRegressionSMF.py └── nonlinearRegression.py ├── Ch06 ├── Histogram of Age.png ├── Purchase Frequency for Day of Week'.png ├── Purchase Frequency for Education Level.png ├── Purchase Frequency for Month of the Year.png ├── ROC Curve.png ├── Stacked Bar Chart of Marital Status vs Purchase.png ├── logisticRegression.py ├── logisticRegressionImplementation.py └── logisticRegressionScratch.py ├── Ch07 ├── Histogram of Clusters.png ├── Histogramn of Cluster Labels.png ├── clusterWine.py └── kMeanClustering.py ├── Ch08 ├── decisionTreeIris.py ├── dtree2.png ├── randomForest.py └── regressionTree.py ├── ISSUELOG.md ├── README.md └── datasets ├── Advertising.csv ├── Auto.csv ├── Bank data dictionary.txt ├── Boston.csv ├── Customer Churn Columns.csv ├── Customer Churn Model.csv ├── Customer Churn Model.txt ├── Ecom Expense.csv ├── Gender Purchase.csv ├── Titanic Description.txt ├── bank.csv ├── dtree2.dot ├── dtree2.png ├── iris.csv ├── lotsofdata ├── 001.csv ├── 002.csv ├── 003.csv ├── 004.csv ├── 005.csv ├── 006.csv ├── 007.csv ├── 008.csv ├── 009.csv ├── 010.csv ├── 011.csv ├── 012.csv ├── 021.csv ├── 022.csv ├── 023.csv ├── 113.csv ├── 114.csv ├── 115.csv └── 116.csv ├── medals ├── Athelete_Country_Map.csv ├── Athelete_Sports_Map.csv └── Medals.csv ├── titanic3.csv ├── wine.csv ├── winequality-red.csv └── winequality-white.csv /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | 55 | # Sphinx documentation 56 | docs/_build/ 57 | 58 | # PyBuilder 59 | target/ 60 | 61 | #Ipython Notebook 62 | .ipynb_checkpoints 63 | -------------------------------------------------------------------------------- /Ch02/Customer Churn Columns.csv: -------------------------------------------------------------------------------- 1 | Column_Names 2 | A 3 | Bob 4 | C 5 | D 6 | E 7 | F 8 | G 9 | H 10 | I 11 | J 12 | K 13 | L 14 | M 15 | N 16 | O 17 | P 18 | Q 19 | R 20 | S 21 | T 22 | U 23 | -------------------------------------------------------------------------------- /Ch02/ScatterPlots.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonMDev/learning-python-predictive-analytics/6757b9ef3e7bbb803d4a1c00d999225cbb4fe2eb/Ch02/ScatterPlots.jpeg -------------------------------------------------------------------------------- /Ch02/Titanic Description.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonMDev/learning-python-predictive-analytics/6757b9ef3e7bbb803d4a1c00d999225cbb4fe2eb/Ch02/Titanic Description.txt -------------------------------------------------------------------------------- /Ch02/Write.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonMDev/learning-python-predictive-analytics/6757b9ef3e7bbb803d4a1c00d999225cbb4fe2eb/Ch02/Write.xls -------------------------------------------------------------------------------- /Ch02/basicDataCheck.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Apr 29 11:11:24 2016 4 | 5 | @author: jasonm_dev 6 | """ 7 | 8 | import pandas as pd 9 | path = '/home/jasonm_dev/coding/learning-python-predictive-analytics/Ch02' 10 | filename = 'titanic3.csv' 11 | fullpath = path+'/'+filename 12 | 13 | data=pd.read_csv(fullpath) 14 | 15 | # Specify the number of rows to see. 16 | data.head(5) 17 | # Confirm dimension 18 | data.shape 19 | # List the data frame 20 | data.columns.values 21 | # Create summary statistics 22 | data.describe() 23 | # FInd out the data type of each column 24 | data.dtypes 25 | 26 | # Find entries with that have missing values. 27 | pd.isnull(data['body']) 28 | # Opposite method 29 | pd.notnull(data['body']) 30 | 31 | # Count the number of missing values. 1189 32 | pd.isnull(data['body']).values.ravel().sum() 33 | # Opposite: 121 34 | pd.notnull(data['body']).values.ravel().sum() 35 | 36 | 37 | # HANDLING MISSING DATA 38 | # Deletion 39 | # Drop any row with where all columns have missing info. 40 | data.dropna(axis=0,how='all') 41 | # Drop any rows where column have any empty cells of information. 42 | data.dropna(axis=0,how='any') 43 | 44 | #Imputation 45 | #data.fillna(0) 46 | #data.fillna('missing') 47 | data['body'].fillna(0) 48 | data['age'].fillna(data['age'].mean()) #29.881135 49 | data['age'].fillna(method='ffill') #Fill in with preceding non-missing value. 50 | data['age'].fillna(method='backfill') #Fill in with succeding non-missing value. 
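# Quick illustrative sketch (assumes the titanic3.csv frame loaded into 'data'
# above): the fillna calls above return new objects and leave 'data' unchanged,
# so the imputed values only persist if the result is assigned back, e.g. per
# column via a dict.
print(data.isnull().sum())  # missing-value count per column
data_imputed = data.fillna({'body': 0, 'age': data['age'].mean()})
print(pd.isnull(data_imputed['age']).values.ravel().sum())  # expected to be 0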
51 |
52 | # CREATING DUMMY VARIABLE
53 | # Split into new variables 'sex_female' and 'sex_male'
54 | dummy_sex=pd.get_dummies(data['sex'],prefix='sex')
55 | column_name=data.columns.values.tolist()
56 | column_name.remove('sex') # Remove column 'sex'
57 | data[column_name].join(dummy_sex) # Add the dummy columns created above.
--------------------------------------------------------------------------------
/Ch02/changeDelimiter.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Wed Apr 27 21:41:13 2016
4 |
5 | @author: jasonm_dev
6 | """
7 | path = '/home/jasonm_dev/coding/learning-python-predictive-analytics/Ch02'
8 | filename1 = 'Customer Churn Model.txt'
9 | filename2 = 'Tab Customer Churn Model.txt'
10 |
11 | infile= path+'/'+filename1
12 | outfile= path+'/'+filename2
13 | with open(infile) as infile1:
14 |     with open(outfile,'w') as outfile1:
15 |         for line in infile1:
16 |             fields=line.split(',')
17 |             outfile1.write('\t'.join(fields))
18 |
19 | import pandas as pd
20 | data=pd.read_csv(outfile,sep='\t')
21 | print(data)
--------------------------------------------------------------------------------
/Ch02/plotData.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Fri Apr 29 11:49:07 2016
4 |
5 | @author: jasonm_dev
6 | """
7 |
8 | import pandas as pd
9 | import matplotlib.pyplot as plt
10 | #from pylab import figure, axes, pie, title, show
11 | path = '/home/jasonm_dev/coding/learning-python-predictive-analytics/Ch02'
12 | filename = 'Customer Churn Model.txt'
13 | fullpath = path+'/'+filename
14 | data=pd.read_csv(fullpath)
15 |
16 | # Scatter plot
17 | data.plot(kind='scatter',x='Day Mins',y='Day Charge')
18 |
19 | # Using matplotlib
20 | #figure,axs = plt.subplots(2, 2,sharey=True,sharex=True)
21 | #data.plot(kind='scatter',x='Day Mins',y='Day Charge',ax=axs[0][0])
22 | #data.plot(kind='scatter',x='Night Mins',y='Night Charge',ax=axs[0][1])
23 | #data.plot(kind='scatter',x='Day Calls',y='Day Charge',ax=axs[1][0])
24 | #data.plot(kind='scatter',x='Night Calls',y='Night Charge',ax=axs[1][1])
25 |
26 | # Save figure as a jpeg
27 | #figname = 'ScatterPlots.jpeg'
28 | #figpath = path+'/'+figname
29 | #figure.savefig(figpath)
30 |
31 | # Histograms
32 | #plt.hist(data['Day Calls'],bins=8)
33 | #plt.xlabel('Day Calls Value')
34 | #plt.ylabel('Frequency')
35 | #plt.title('Frequency of Day Calls')
36 |
37 | # Boxplots
38 | plt.boxplot(data['Day Calls'])
39 | plt.ylabel('Day Calls')
40 | plt.title('Box Plot of Day Calls')
--------------------------------------------------------------------------------
/Ch02/readCustomerChurn.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Wed Apr 27 19:58:48 2016
4 |
5 | @author: jasonm_dev
6 | """
7 | import pandas as pd
8 | path = '/home/jasonm_dev/coding/learning-python-predictive-analytics/Ch02'
9 | filename = 'Customer Churn Model.txt'
10 | fullpath = path+'/'+filename
11 | data = pd.read_csv(fullpath)
--------------------------------------------------------------------------------
/Ch02/readCustomerChurn2.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Wed Apr 27 19:58:48 2016
4 |
5 | @author: jasonm_dev
6 | """
7 | import pandas as pd
8 | path = '/home/jasonm_dev/coding/learning-python-predictive-analytics/Ch02'
9 |
filename1 = 'Customer Churn Columns.csv' 10 | filename2 = 'Customer Churn Model.txt' 11 | fullpath1 = path+'/'+filename1 12 | fullpath2 = path+'/'+filename2 13 | 14 | 15 | data_columns = pd.read_csv(fullpath1) 16 | data_column_list = data_columns['Column_Names'].tolist() 17 | data=pd.read_csv(fullpath2,header=None,names=data_column_list) 18 | data.columns.values -------------------------------------------------------------------------------- /Ch02/readDatasetByOpenMethod.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Apr 27 19:58:48 2016 4 | 5 | @author: jasonm_dev 6 | 7 | python2 uses next() 8 | python3 uses readline() 9 | """ 10 | 11 | path = '/home/jasonm_dev/coding/learning-python-predictive-analytics/Ch02' 12 | filename = 'Customer Churn Model.txt' 13 | fullpath = path+'/'+filename 14 | 15 | # Open file in read mode. 16 | data=open(fullpath,'r') 17 | # readline() method: 18 | # -> It navigates the computer memory to the line next to the header. 19 | # strip() method: 20 | # -> Removes all the trailing and leading blank spaces from the line 21 | # split() method: 22 | # -> Method breaks down a line into chunks separated by the argument provided 23 | cols=data.readline().strip().split(',') 24 | no_cols=len(data.readline().strip().split(',')) 25 | 26 | counter=0 27 | 28 | main_dict={} 29 | # Key: Column names 30 | # Value: Values of columns. 31 | for col in cols: 32 | main_dict[col]=[] 33 | 34 | for line in data: 35 | values = line.strip().split(',') 36 | for i in range(len(cols)): 37 | main_dict[cols[i]].append(values[i]) 38 | counter += 1 39 | 40 | #print ("The dataset has %d rows and %d columns") % (counter,no_cols) 41 | print ('The dataset has ',counter,' rows and ',no_cols,' columns') 42 | 43 | # Convert dataset to a dataframe similar pandas raed_csv 44 | import pandas as pd 45 | df=pd.DataFrame(main_dict) 46 | print (df.head(10)) 47 | 48 | filename_csv = 'Write.csv' 49 | filename_xls = 'Write.xls' 50 | fullpath_csv = path+'/'+filename_csv 51 | fullpath_xls = path+'/'+filename_xls 52 | 53 | # Write to CSV file. 
54 | df.to_csv(fullpath_csv) 55 | 56 | # Write to xls file 57 | df.to_excel(fullpath_xls) -------------------------------------------------------------------------------- /Ch02/readURLLib2Iris.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Apr 29 09:51:41 2016 4 | 5 | @author: jasonm_dev 6 | """ 7 | 8 | import csv 9 | import urllib.request 10 | import codecs 11 | 12 | url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data' 13 | html = urllib.request.urlopen(url) 14 | csvfile = csv.reader(codecs.iterdecode(html, 'utf-8')) 15 | for line in csvfile: 16 | print(line) #do something with line -------------------------------------------------------------------------------- /Ch02/readURLMedals.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Apr 29 09:48:19 2016 4 | 5 | @author: jasonm_dev 6 | """ 7 | 8 | import pandas as pd 9 | medal_data=pd.read_csv('http://winterolympicsmedals.com/medals.csv') -------------------------------------------------------------------------------- /Ch02/readXLS.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Apr 29 10:51:11 2016 4 | 5 | @author: jasonm_dev 6 | """ 7 | import pandas as pd 8 | path = '/home/jasonm_dev/coding/learning-python-predictive-analytics/Ch02' 9 | 10 | filename1 = 'titanic3.xls' 11 | filename2 = 'titanic3.xlsx' 12 | fullpath1 = path+'/'+filename1 13 | fullpath2 = path+'/'+filename2 14 | # Read .xls 15 | data1=pd.read_excel(fullpath1,'titanic3') 16 | 17 | # Read .xlsx 18 | data2=pd.read_excel(fullpath2,'titanic3') -------------------------------------------------------------------------------- /Ch02/titanic3.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonMDev/learning-python-predictive-analytics/6757b9ef3e7bbb803d4a1c00d999225cbb4fe2eb/Ch02/titanic3.xls -------------------------------------------------------------------------------- /Ch02/titanic3.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonMDev/learning-python-predictive-analytics/6757b9ef3e7bbb803d4a1c00d999225cbb4fe2eb/Ch02/titanic3.xlsx -------------------------------------------------------------------------------- /Ch02/titanicReadCSV.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Apr 27 19:52:28 2016 4 | 5 | @author: jasonm_dev 6 | """ 7 | 8 | import pandas as pd 9 | data = pd.read_csv('/home/jasonm_dev/coding/learning-python-predictive-analytics/Ch02/titanic3.csv') 10 | data -------------------------------------------------------------------------------- /Ch02/titanicReadCSV1.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Apr 27 19:55:36 2016 4 | 5 | @author: jasonm_dev 6 | """ 7 | import pandas as pd 8 | path = '/home/jasonm_dev/coding/learning-python-predictive-analytics/Ch02' 9 | filename = 'titanic3.csv' 10 | fullpath = path+'/'+filename 11 | data = pd.read_csv(fullpath) 12 | -------------------------------------------------------------------------------- /Ch03/appendManyFiles.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue May 3 20:08:00 2016 4 | 5 | @author: jasonm_dev 6 | """ 7 | 8 | 9 | import pandas as pd 10 | 11 | # Check if first file works. 12 | filepath = '/home/jasonm_dev/coding/learning-python-predictive-analytics/datasets/lotsofdata' 13 | filename= '001.csv' 14 | file = filepath+'/'+filename 15 | 16 | data=pd.read_csv(file) 17 | data.head() 18 | data.shape #Out: (1461, 4) 19 | 20 | # Loop through all dataset files. 21 | data_final=pd.read_csv(file) 22 | data_final_size=len(data_final) 23 | for i in range(1,12): #range(1,333): 24 | if i<10: 25 | filename='0'+'0'+str(i)+'.csv' 26 | if 10<=i<100: 27 | filename='0'+str(i)+'.csv' 28 | #if i>=100: 29 | # filename=str(i)+'.csv' 30 | 31 | file=filepath+'/'+filename 32 | data=pd.read_csv(file) 33 | data_final_size+=len(data) 34 | data_final=pd.concat([data_final,data],axis=0) 35 | 36 | data.shape # Out: (1461, 4) 37 | data_final.shape # Out: (27391, 4) 38 | print (data_final_size) # 27391 -------------------------------------------------------------------------------- /Ch03/calcPi.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Apr 29 15:26:46 2016 4 | 5 | @author: jasonm_dev 6 | """ 7 | 8 | # Calculate pi 9 | 10 | import numpy as np 11 | import matplotlib.pyplot as plt 12 | 13 | def pi_run(nums,loops): 14 | pi_avg=0 15 | pi_value_list=[] 16 | for i in range(loops): 17 | value=0 18 | # Generate points within 0 to 1. 19 | x=np.random.uniform(0,1,nums).tolist() 20 | y=np.random.uniform(0,1,nums).tolist() 21 | # Check to see if they lie within circle. 22 | for j in range(nums): 23 | z=np.sqrt(x[j]*x[j]+y[j]*y[j]) 24 | if z<=1: 25 | value+=1 26 | # Amount of hits withion circle. 27 | float_value=float(value) 28 | # Using probabilty to calculate pi using hits 29 | pi_value=float_value*4/nums 30 | pi_value_list.append(pi_value) 31 | # Get pi value for this loop. 32 | pi_avg+=pi_value 33 | # Averag pi value from all loops. 34 | pi=pi_avg/loops 35 | ind=range(1,loops+1) 36 | fig=plt.plot(ind,pi_value_list) 37 | return (pi,fig) 38 | 39 | pi_run(1000,100) -------------------------------------------------------------------------------- /Ch03/concatenateAndAppend.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue May 3 19:50:48 2016 4 | 5 | @author: jasonm_dev 6 | """ 7 | 8 | import pandas as pd 9 | 10 | path = '/home/jasonm_dev/coding/learning-python-predictive-analytics/datasets' 11 | filename_red = 'winequality-red.csv' 12 | filename_white = 'winequality-white.csv' 13 | fullpath_red = path+'/'+filename_red 14 | fullpath_white = path+'/'+filename_white 15 | 16 | # RED WINE QUALITIES 17 | 18 | 19 | data1=pd.read_csv(fullpath_red,sep=';') # delimiter is ';' 20 | data1.head() 21 | data1.shape #Out: (1599, 12) 22 | data1.columns.values 23 | # Out: array(['fixed acidity', 'volatile acidity', 'citric acid', 24 | # 'residual sugar', 'chlorides', 'free sulfur dioxide', 25 | # 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol', 26 | # 'quality'], dtype=object) 27 | 28 | # WHITE WINE QUALITIES 29 | data2=pd.read_csv(fullpath_white,sep=';') 30 | data2.shape #Out: (4898, 12) 31 | data2.head() 32 | 33 | # APPEND DATA 34 | # Horizontal axis is denoted by 0. 
35 | wine_total=pd.concat([data1,data2],axis=0) 36 | wine_total.shape #Out: (6497, 12) 37 | wine_total.head() 38 | 39 | #SCRAMBLING DATA WITH CONCAT 40 | data1_head=data1.head(50) 41 | data1_middle=data1[500:550] 42 | data1_tail=data1.tail(50) 43 | wine_scramble=pd.concat([data1_middle,data1_head,data1_tail],axis=0) 44 | wine_scramble 45 | wine_scramble.shape #Out: (150, 12) -------------------------------------------------------------------------------- /Ch03/generateRandomNumbers.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Apr 29 14:28:15 2016 4 | 5 | @author: jasonm_dev 6 | """ 7 | import pandas as pd 8 | import numpy as np 9 | 10 | # Generate Random Numbers 11 | np.random.randint(1,100) #Random number between 1 and 100 12 | np.random.random() #Random number between 1 and 100 13 | 14 | # Generate n amount of random numbers between a and b 15 | def randint_range(n,a,b): 16 | x=[] 17 | for i in range(n): 18 | x.append(np.random.randint(a,b)) 19 | return x 20 | 21 | # Generate 10 amount of random numbers between 5 and 200 22 | randint_range(10,5,200) 23 | # Out: [169, 47, 124, 73, 109, 63, 84, 93, 8, 129] 24 | 25 | # Random range of number in specific multiple 26 | import random 27 | for i in range(3): 28 | print (random.randrange(0,100,5)) 29 | 30 | # Shuuffle list or array in a random order. 31 | b = randint_range(10,5,200) 32 | b # Out: [93, 194, 30, 38, 146, 40, 177, 172, 197, 182] 33 | np.random.shuffle(b) 34 | b # Out: [177, 182, 146, 40, 194, 30, 197, 172, 38, 93] 35 | 36 | # 'Choice' method is used to select a random item from a list of items. 37 | path = '/home/jasonm_dev/coding/learning-python-predictive-analytics/Ch03' 38 | filename = 'Customer Churn Model.txt' 39 | fullpath = path+'/'+filename 40 | data=pd.read_csv(fullpath) 41 | data.shape # Output: (3333, 21) 42 | 43 | # Create a list from the column names 44 | column_list=data.columns.values.tolist() 45 | 46 | # Select an item at random from the list. 47 | np.random.choice(column_list) #Out: "Int'l Plan" 48 | np.random.choice(column_list) #Out: 'VMail Plan' 49 | np.random.choice(column_list) #Out: 'Eve Calls' 50 | np.random.choice(column_list) #Out: 'Eve Mins' -------------------------------------------------------------------------------- /Ch03/generateRandomProbDistr.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Apr 29 15:01:52 2016 4 | 5 | @author: jasonm_dev 6 | """ 7 | 8 | import numpy as np 9 | import matplotlib.pyplot as plt 10 | 11 | # Generate 100 random numbers lying between 1 and 100. 12 | randnum=np.random.uniform(1,100,100) 13 | 14 | # Plot histogram to confirm uniform distribution. 15 | # Used with ipython/spyder notepad. 16 | #%matplotlib inline 17 | 18 | # Not so uniform distribution with 10 numbers 19 | a=np.random.uniform(1,100,100) 20 | b=range(1,101) 21 | #plt.hist(a) 22 | 23 | # Better uniform distribution with a million numbers 24 | c=np.random.uniform(1,1000000,1000000) 25 | d=range(1,101) 26 | #plt.hist(c) 27 | 28 | # Normal distribution 29 | # Used with ipython/spyder notepad. 30 | #%matplotlib inline 31 | 32 | # Plot a random noise plot. 33 | e=np.random.randn(100) 34 | f=range(1,101) 35 | #plt.plot(f,e) 36 | 37 | # Plot a random noise plot with mean of 1.5 and standard deviation of 2.5. 
38 | g=2.5*np.random.randn(100)+1.5 39 | h=range(1,101) 40 | #plt.plot(h,g) 41 | 42 | # Generate enough numbers to create belll curve 43 | i=np.random.randn(100000) 44 | j=range(1,101) 45 | plt.hist(i) -------------------------------------------------------------------------------- /Ch03/groupData.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon May 2 09:57:12 2016 4 | 5 | @author: jasonm_dev 6 | """ 7 | 8 | import numpy as np 9 | import pandas as pd 10 | 11 | # GENERATE DATAFRAME 12 | a=['Male','Female'] 13 | b=['Rich','Poor','Middle Class'] 14 | gender=[] 15 | seb=[] 16 | for i in range(1,101): 17 | gender.append(np.random.choice(a)) 18 | seb.append(np.random.choice(b)) 19 | height=30*np.random.randn(100)+155 20 | weight=20*np.random.randn(100)+60 21 | age=10*np.random.randn(100)+35 22 | income=1500*np.random.randn(100)+15000 23 | 24 | df=pd.DataFrame({'Gender':gender,'Height':height,'Weight':weight,'Age':age,'Income':income,'Socio-Eco':seb}) 25 | df.head() 26 | 27 | # GROUPING OF DATA 28 | # Splits data into data objects with attributes 'name' and 'group'. 29 | # df.groupby('Gender') # Out: 30 | 31 | # Group by gender. 32 | grouped = df.groupby('Gender') 33 | # Object created is 'Male' and its group of data, and 'Female' and its group of data. 34 | # grouped.groups 35 | 36 | for names,groups in grouped: 37 | print (names) 38 | print (groups) 39 | 40 | 41 | # Get a single group can be found. 42 | grouped_female=grouped.get_group('Female') 43 | 44 | # A set of categories can be used. 45 | grouped_gender_socio=df.groupby(['Gender','Socio-Eco']) 46 | 47 | for names,groups in grouped_gender_socio: 48 | print (names) 49 | print (groups) 50 | 51 | # AGGREGATION OF DATA 52 | # Sum of data 53 | grouped_gender_socio.sum() # Sum of dataheads 54 | grouped_gender_socio.size() # Calculates the size of each group. 55 | grouped_gender_socio.describe() # Summary statistics for each group separately. 56 | grouped_gender_socio.aggregate({'Age':np.mean,'Height':lambda x:np.mean(x)/np.std(x)}) 57 | # Use the lambda method for ratio of mean and standard deviation for height 58 | grouped_gender_socio.aggregate([np.sum, np.mean, np.std]) # Apply to all columns. 59 | 60 | # Grouped subsets behave like their own dataframes. 61 | grouped_income=grouped['Income'] # You can apply function above here as well. 62 | 63 | # FILTERING 64 | grouped_gender_socio['Age'].filter(lambda x:x.sum()>700) 65 | 66 | # TRANSFORMATION 67 | # Calculate the standard normal values for all the elements 68 | # in the numerical columns of our data frame 69 | zscore = lambda x: (x - x.mean()) / x.std() 70 | #grouped.transform(zscore) 71 | 72 | # Fills the missing values with the mean of the non-missing values. 73 | f = lambda x: x.fillna(x.mean()) 74 | #grouped.transform(f) 75 | 76 | # MISCELLANEOUS OPERTAIONS 77 | grouped.head(1) # Gets the first row of the male and female groups respectively. 78 | grouped_gender_socio.head(1) # First row of each group. 79 | 80 | grouped.tail(1) # Gets last rows of each group. 81 | grouped_gender_socio.tail(1) # Gets last rows of each group. 82 | 83 | # Good practise. First sort data frame before creating the groupby object. 84 | df1=df.sort_values(by=['Age','Income']) # Sort by age and income. 85 | sort_grouped=df1.groupby('Gender') # Group by gender 86 | sort_grouped.head(1) # Show rows for the youngest of each gender. 87 | sort_grouped.tail(1) # Show rows for the eldest of each gender. 
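# Illustrative sketch (assumes the df and zscore lambda defined above): since
# groupby.transform returns output aligned to the original index, the per-group
# standardized values can be joined straight back onto the frame.
numeric_cols = ['Age', 'Height', 'Weight', 'Income']
df_z = df.groupby('Gender')[numeric_cols].transform(zscore)
df_with_z = df.join(df_z, rsuffix='_z')  # raw columns plus per-gender z-scores
df_with_z.head()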
88 | -------------------------------------------------------------------------------- /Ch03/mergeJoin.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue May 3 20:41:51 2016 4 | 5 | @author: jasonm_dev 6 | """ 7 | 8 | 9 | import pandas as pd 10 | 11 | filepath = '/home/jasonm_dev/coding/learning-python-predictive-analytics/datasets/medals' 12 | filename= 'Medals.csv' 13 | file = filepath+'/'+filename 14 | 15 | # IMPORT MAIN MEDAL FILE 16 | data_main=pd.read_csv(file,encoding='latin_1') 17 | data_main.head() 18 | data_main.shape # Out: (8618, 8) 19 | # ERROR 20 | # UnicodeDecodeError: 'utf-8' codec can't decode byte 0xf8 21 | # in position 8: invalid start byte 22 | # SOLUTION: used latin_1, ascii and utf-8 don't work 23 | 24 | 25 | # Check to see how many unique athletes there are. 26 | a=data_main['Athlete'].unique().tolist() 27 | len(a) # Out: 6956 28 | 29 | # IMPORT COUNTRY MAP 30 | filename_country = 'Athelete_Country_Map.csv' 31 | file_country = filepath+'/'+filename_country 32 | country_map=pd.read_csv(file_country,encoding='latin_1') 33 | country_map.head() 34 | country_map.shape # Out: (6970, 2) with 6956 uniques. 35 | 36 | # Uniques adding to total with 2 nationalities. 37 | country_map[country_map['Athlete']=='Aleksandar Ciric'] 38 | 39 | # IMPORT SPORTS MAP 40 | filename_sports = 'Athelete_Sports_Map.csv' 41 | file_sports = filepath+'/'+filename_sports 42 | sports_map=pd.read_csv(file_sports,encoding='latin_1') 43 | sports_map.head() 44 | sports_map.shape # Out: (6975, 2) 45 | 46 | # with very few doing more than one sport. 47 | sports_map[(sports_map['Athlete']=='Chen Jing') | (sports_map['Athlete']=='Richard Thompson') | (sports_map['Athlete']=='Matt Ryan')] 48 | 49 | # MERGE IMPORTED COUTRY MAP DATA FRAME FILES 50 | merged=pd.merge(left=data_main,right=country_map,left_on='Athlete',right_on='Athlete') 51 | merged.head() 52 | merged.shape # Out: (8657, 9) > 8618 uniques because of inner join. 53 | 54 | # See duplicated results. 55 | merged[merged['Athlete']=='Aleksandar Ciric'] 56 | 57 | # Drop duplicates from country_map data frame 58 | country_map_dp=country_map.drop_duplicates(subset='Athlete') # Out: (6956, 2) 59 | 60 | # Now retry merge as length is now the same as unique atheletes. 61 | merged_dp=pd.merge(left=data_main,right=country_map_dp,left_on='Athlete',right_on='Athlete') 62 | merged_dp.shape # Out: (8618, 9) 63 | 64 | # MERGE IMPORTED SPORTS MAP DATA FRAME FILES 65 | # Drop duplicates from country_map data frame 66 | sports_map_dp=sports_map.drop_duplicates(subset='Athlete') 67 | sports_map_dp.shape # Out: (6956, 2) 68 | 69 | # Merge into final data. 70 | merged_final=pd.merge(left=merged_dp,right=sports_map_dp,left_on='Athlete',right_on='Athlete') 71 | merged_final.shape # Out: (8618, 10) 72 | merged_final.head() 73 | 74 | 75 | # MERGE TYPES 76 | # PYTHON 2.7 uses '<>', whereas python3.5 uses '!=' 77 | # Prepare the data with some mismathes to show join examples. 
78 | country_map_dlt=country_map_dp[(country_map_dp['Athlete']!='Michael Phelps') & (country_map_dp['Athlete']!='Natalie Coughlin') & (country_map_dp['Athlete']!='Chen Jing') 79 | & (country_map_dp['Athlete']!='Richard Thompson') & (country_map_dp['Athlete']!='Matt Ryan')] 80 | len(country_map_dlt) # Out: 6951 81 | sports_map_dlt=sports_map_dp[(sports_map_dp['Athlete']!='Michael Phelps') & (sports_map_dp['Athlete']!='Natalie Coughlin') & (sports_map_dp['Athlete']!='Chen Jing') 82 | & (sports_map_dp['Athlete']!='Richard Thompson') & (sports_map_dp['Athlete']!='Matt Ryan')] 83 | len(sports_map_dlt) # Out: 6951 84 | 85 | data_main_dlt=data_main[(data_main['Athlete']!='Michael Phelps') & (data_main['Athlete']!='Natalie Coughlin') & (data_main['Athlete']!='Chen Jing') 86 | & (data_main['Athlete']!='Richard Thompson') & (data_main['Athlete']!='Matt Ryan')] 87 | len(data_main_dlt) # Out: 8605 88 | 89 | # INNER JOIN EXAMPLE 90 | merged_inner=pd.merge(left=data_main,right=country_map_dlt,how='inner',left_on='Athlete',right_on='Athlete') 91 | len(merged_inner) # Out: 8605 92 | 93 | # LEFT JOIN EXAMPLE 94 | merged_left=pd.merge(left=data_main,right=country_map_dlt,how='left',left_on='Athlete',right_on='Athlete') 95 | len(merged_left) # Out: 8618 96 | # Check the athletes which don't have information because of the left join. 97 | merged_left_slt=merged_left[merged_left['Athlete']=='Michael Phelps'] 98 | merged_left_slt 99 | 100 | # RIGHT JOIN EXAMPLE 101 | merged_right=pd.merge(left=data_main_dlt,right=country_map_dp,how='right',left_on='Athlete',right_on='Athlete') 102 | len(merged_right) # Out: 8610 103 | -------------------------------------------------------------------------------- /Ch03/seedRandomNumbers.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Apr 29 14:55:21 2016 4 | 5 | @author: jasonm_dev 6 | """ 7 | import numpy as np 8 | 9 | # No seed is set and a set of new 5 random numbers 10 | # will be generate each time. 11 | for i in range(5): 12 | print (np.random.random()) 13 | 14 | # Seed is set as 1 and generate 5 random numbers. 15 | # The 5 random numbers will be repeated. 16 | np.random.seed(1) 17 | for i in range(5): 18 | print (np.random.random()) 19 | 20 | -------------------------------------------------------------------------------- /Ch03/splitDataTrainTest.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon May 2 10:48:24 2016 4 | 5 | @author: jasonm_dev 6 | """ 7 | 8 | import pandas as pd 9 | import numpy as np 10 | 11 | # METHOD 1 – using the Customer Churn Model 12 | path = '/home/jasonm_dev/coding/learning-python-predictive-analytics/Ch03' 13 | filename = 'Customer Churn Model.txt' 14 | fullpath = path+'/'+filename 15 | data=pd.read_csv(fullpath) 16 | 17 | len(data) 18 | 19 | # Generate set of random numbers with length of data. 20 | a=np.random.randn(len(data)) 21 | # Create filter 22 | check=a<0.8 23 | # Filter training data below 0.8. 24 | training=data[check] 25 | # Filter testing data above 0.8. 26 | testing=data[~check] 27 | 28 | # Check lengths 29 | len(training) 30 | len(testing) 31 | 32 | # METHOD 2 – using sklearn 33 | # The test size specifies the size of the testing dataset: 34 | # 0.2 means that 20 percent of the rows of the dataset should go to testing 35 | # and the remaining 80 percent to training. 
36 | from sklearn.cross_validation import train_test_split 37 | train, test = train_test_split(data, test_size = 0.2) 38 | 39 | # METHOD 3 – using the shuffle function 40 | # Using 'rb' means opening in binary mode 41 | # and create a 'bytes' object used in dataframes. 42 | with open(fullpath,'rb') as f: 43 | #data_shuffle=f.readline().split('\n') 44 | data_shuffle=f.readline() 45 | #data_shuffle=open(fullpath,'r') 46 | #np.random.shuffle(data_shuffle) 47 | #train_data = data_shuffle[:3*len(data_shuffle)/4] 48 | #test_data = data_shuffle[len(data_shuffle)/4:] 49 | 50 | 51 | # Just readline creates a bytes object. 52 | #do a loop like the opdn one and main dict 53 | -------------------------------------------------------------------------------- /Ch03/subsetColsRows.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Apr 29 13:58:58 2016 4 | 5 | @author: jasonm_dev 6 | """ 7 | 8 | import pandas as pd 9 | 10 | path = '/home/jasonm_dev/coding/learning-python-predictive-analytics/Ch03' 11 | filename = 'Customer Churn Model.txt' 12 | fullpath = path+'/'+filename 13 | data=pd.read_csv(fullpath) 14 | data.shape # Output: (3333, 21) 15 | 16 | # Filter data for the first 50 rows. 17 | subdata_first_50=data[['Account Length','VMail Message','Day Calls']][1:50] 18 | subdata_first_50 19 | 20 | # Filter data by 'Day Calls' > 100 21 | data1=data[data['Day Calls']>100] 22 | data1.shape # Output: (1682, 21) 23 | 24 | # Alternative .ix[rowstart:rowend,colstart:colend] 25 | data.ix[1:100,1:6] 26 | data.ix[:,1:6] 27 | data.ix[1:100,[2,5,7]] 28 | data.ix[[1,2,5],['Area Code','VMail Plan','Day Mins']] -------------------------------------------------------------------------------- /Ch03/subsetDataset.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Apr 29 13:35:32 2016 4 | 5 | @author: jasonm_dev 6 | """ 7 | 8 | import pandas as pd 9 | 10 | path = '/home/jasonm_dev/coding/learning-python-predictive-analytics/Ch03' 11 | filename = 'Customer Churn Model.txt' 12 | fullpath = path+'/'+filename 13 | data=pd.read_csv(fullpath) 14 | 15 | # Creates subset of the DataFrame by using square brackets. 16 | # Selecting one column creates a Series object similar to Dataframe 17 | account_length = data['Account Length'] 18 | account_length.head() 19 | type(account_length) # Output: pandas.core.series.Series 20 | 21 | # Creates subset of the DataFrame by using square brackets. 
22 | # Using multiple columns 23 | subdata = data[['Account Length','VMail Message','Day Calls']] 24 | subdata.head() 25 | type(subdata) # Output: pandas.core.frame.DataFrame 26 | 27 | # Alternative 28 | wanted_columns=['Account Length','VMail Message','Day Calls'] 29 | subdata1=data[wanted_columns] 30 | subdata1.head() 31 | 32 | # Alternative 33 | wanted=['Account Length','VMail Message','Day Calls'] 34 | # Gets list of columns names 35 | column_list=data.columns.values.tolist() 36 | # Removes 'wanted' column names from the column_list 37 | sublist=[x for x in column_list if x not in wanted] 38 | subdata2=data[sublist] 39 | subdata2.head() -------------------------------------------------------------------------------- /Ch03/subsetDatasetRows.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Apr 29 13:50:16 2016 4 | 5 | @author: jasonm_dev 6 | """ 7 | 8 | import pandas as pd 9 | 10 | path = '/home/jasonm_dev/coding/learning-python-predictive-analytics/Ch03' 11 | filename = 'Customer Churn Model.txt' 12 | fullpath = path+'/'+filename 13 | data=pd.read_csv(fullpath) 14 | data.shape # Output: (3333, 21) 15 | 16 | # Filter data by 'Day Calls' > 100 17 | data1=data[data['Day Calls']>100] 18 | data1.shape # Output: (1682, 21) 19 | 20 | # Filter data by 'State' > VA 21 | data2=data[data['State']=='VA'] 22 | data2.shape # Output: (77, 21) 23 | 24 | # Filter data by 'Day Calls' > 100 and 'State' > VA 25 | data3=data[(data['Day Calls']>100) & (data['State']=='VA')] 26 | data3.shape # Output: (51, 21) -------------------------------------------------------------------------------- /Ch03/subsetNewCol.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Apr 29 14:06:22 2016 4 | 5 | @author: jasonm_dev 6 | """ 7 | 8 | import pandas as pd 9 | 10 | path = '/home/jasonm_dev/coding/learning-python-predictive-analytics/Ch03' 11 | filename = 'Customer Churn Model.txt' 12 | fullpath = path+'/'+filename 13 | data=pd.read_csv(fullpath) 14 | data.shape # Output: (3333, 21) 15 | 16 | # Create new column by totalling the minutes columns. 
17 | data['Total Mins']=data['Day Mins']+data['Eve Mins']+data['Night Mins'] 18 | data['Total Mins'].head() # Name: Total Mins, dtype: float64 -------------------------------------------------------------------------------- /Ch04/NewspaperSalesCorrelationPlot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonMDev/learning-python-predictive-analytics/6757b9ef3e7bbb803d4a1c00d999225cbb4fe2eb/Ch04/NewspaperSalesCorrelationPlot.png -------------------------------------------------------------------------------- /Ch04/RadioSalesCorrelationPlot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonMDev/learning-python-predictive-analytics/6757b9ef3e7bbb803d4a1c00d999225cbb4fe2eb/Ch04/RadioSalesCorrelationPlot.png -------------------------------------------------------------------------------- /Ch04/TVSalesCorrelationPlot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonMDev/learning-python-predictive-analytics/6757b9ef3e7bbb803d4a1c00d999225cbb4fe2eb/Ch04/TVSalesCorrelationPlot.png -------------------------------------------------------------------------------- /Ch04/linearRegression.py: -------------------------------------------------------------------------------- 1 | 2 | # -*- coding: utf-8 -*- 3 | # Linear Regression 4 | """ 5 | Created on Wed May 4 21:08:54 2016 6 | 7 | @author: jasonm_dev 8 | """ 9 | 10 | import pandas as pd 11 | import numpy as np 12 | 13 | # Check if first file works. 14 | filepath = '/home/jasonm_dev/coding/learning-python-predictive-analytics/datasets' 15 | filename= 'Advertising.csv' 16 | file = filepath+'/'+filename 17 | 18 | advert=pd.read_csv(file) 19 | advert.head() 20 | 21 | # Determine correlation between 22 | # the advertisement costs on TV 23 | # and the resultant sales 24 | advert['corrn']=(advert['TV']-np.mean(advert['TV']))*(advert['Sales']-np.mean(advert['Sales'])) 25 | advert['corrd1']=(advert['TV']-np.mean(advert['TV']))**2 26 | advert['corrd2']=(advert['Sales']-np.mean(advert['Sales']))**2 27 | corrcoeffn=advert.sum()['corrn'] 28 | corrcoeffd1=advert.sum()['corrd1'] 29 | corrcoeffd2=advert.sum()['corrd2'] 30 | corrcoeffd=np.sqrt(corrcoeffd1*corrcoeffd2) 31 | corrcoeff=corrcoeffn/corrcoeffd 32 | corrcoeff #Out: 0.78222442486160604 -------------------------------------------------------------------------------- /Ch04/linearRegressionFunction.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed May 4 21:12:24 2016 4 | 5 | @author: jasonm_dev 6 | """ 7 | 8 | import pandas as pd 9 | import numpy as np 10 | 11 | # Check if first file works. 
12 | filepath = '/home/jasonm_dev/coding/learning-python-predictive-analytics/datasets' 13 | filename= 'Advertising.csv' 14 | file = filepath+'/'+filename 15 | 16 | advert=pd.read_csv(file) 17 | advert.head() 18 | 19 | # Function to determine correlation between var1 and var2 20 | def corrcoeff(df,var1,var2): 21 | df['corrn']=(df[var1]-np.mean(df[var1]))*(df[var2]-np.mean(df[var2])) 22 | df['corrd1']=(df[var1]-np.mean(df[var1]))**2 23 | df['corrd2']=(df[var2]-np.mean(df[var2]))**2 24 | corrcoeffn=df.sum()['corrn'] 25 | corrcoeffd1=df.sum()['corrd1'] 26 | corrcoeffd2=df.sum()['corrd2'] 27 | corrcoeffd=np.sqrt(corrcoeffd1*corrcoeffd2) 28 | corrcoeff=corrcoeffn/corrcoeffd 29 | return corrcoeff 30 | 31 | # Correlation between TV and Radio 32 | Corr_TV_Radio = corrcoeff(advert,'TV','Radio') # Out: 0.05480866446583009 33 | 34 | # Correlation between TV and Newspaper 35 | Corr_TV_Newspaper = corrcoeff(advert,'TV','Newspaper') # Out: 0.056647874965056993 36 | 37 | # Correlation between TV and Sales 38 | Corr_TV_Sales = corrcoeff(advert,'TV','Sales') # Out: 0.78222442486160604 39 | 40 | # Correlation between Radio and Newspaper 41 | Corr_Radio_Newspaper = corrcoeff(advert,'Radio','Newspaper') # Out: 0.35410375076117517 42 | 43 | # Correlation between Radio and Sales 44 | Corr_Radio_Sales = corrcoeff(advert,'Radio','Sales') # Out: 0.5762225745710553 45 | 46 | # Correlation between Newspaper and Sales 47 | Corr_Newspaper_Sales = corrcoeff(advert,'Newspaper','Sales') # Out: 0.22829902637616525 48 | 49 | # Plot correlation of TV and Sales 50 | import matplotlib.pyplot as plt 51 | # %matplotlib inline 52 | #plt.plot(advert['TV'],advert['Sales'],'ro') 53 | #plt.title('TV vs Sales') 54 | 55 | # Plot correlation of Radio and Sales 56 | #plt.plot(advert['Radio'],advert['Sales'],'ro') 57 | #plt.title('Radio vs Sales') 58 | 59 | # Plot correlation of Newspaper and Sales 60 | plt.plot(advert['Newspaper'],advert['Sales'],'ro') 61 | plt.title('Newspaper vs Sales') -------------------------------------------------------------------------------- /Ch05/CurrentVsPredicted1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonMDev/learning-python-predictive-analytics/6757b9ef3e7bbb803d4a1c00d999225cbb4fe2eb/Ch05/CurrentVsPredicted1.png -------------------------------------------------------------------------------- /Ch05/CurrentVsPredictedVsMean1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonMDev/learning-python-predictive-analytics/6757b9ef3e7bbb803d4a1c00d999225cbb4fe2eb/Ch05/CurrentVsPredictedVsMean1.png -------------------------------------------------------------------------------- /Ch05/CurrentVsPredictedVsModel1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonMDev/learning-python-predictive-analytics/6757b9ef3e7bbb803d4a1c00d999225cbb4fe2eb/Ch05/CurrentVsPredictedVsModel1.png -------------------------------------------------------------------------------- /Ch05/MPGVSHorsepower.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonMDev/learning-python-predictive-analytics/6757b9ef3e7bbb803d4a1c00d999225cbb4fe2eb/Ch05/MPGVSHorsepower.png -------------------------------------------------------------------------------- /Ch05/MPGVSHorsepowerModels.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonMDev/learning-python-predictive-analytics/6757b9ef3e7bbb803d4a1c00d999225cbb4fe2eb/Ch05/MPGVSHorsepowerModels.png -------------------------------------------------------------------------------- /Ch05/MPGVSHorsepowerVsLine.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonMDev/learning-python-predictive-analytics/6757b9ef3e7bbb803d4a1c00d999225cbb4fe2eb/Ch05/MPGVSHorsepowerVsLine.png -------------------------------------------------------------------------------- /Ch05/PredictedSalesVsTVAdvertisingCosts.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonMDev/learning-python-predictive-analytics/6757b9ef3e7bbb803d4a1c00d999225cbb4fe2eb/Ch05/PredictedSalesVsTVAdvertisingCosts.png -------------------------------------------------------------------------------- /Ch05/linearRegression.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu May 5 20:26:46 2016 4 | 5 | @author: jasonm_dev 6 | """ 7 | # LINEAR REGRESSION 8 | 9 | import pandas as pd 10 | import numpy as np 11 | import matplotlib.pyplot as plt 12 | # %matplotlib inline 13 | 14 | # Input_Variable(X) 15 | # 100 normally distributed random numbers 16 | # with a mean of 1.5, and standard deviation 2.5 17 | x=2.5*np.random.randn(100)+1.5 18 | 19 | # Residual term (RES) which israndom variable distributed normally 20 | # with a mean of 0 and standard deviation of 0.5. 21 | res=.5*np.random.randn(100)+0 22 | 23 | # Predicted Value (Ye) 24 | # i.e. Predicted_Output(ypred) 25 | # Intercept of 2 and a slope of 0.3 26 | ypred=2+.3*x 27 | 28 | # Actual Value (Ya) 29 | # i.e. Actual_Output(yact) 30 | # We add the random residual. 31 | yact=2+.3*x+res 32 | 33 | # Create a dataframe with above lists. 34 | xlist=x.tolist() # Convert datatype 'numpy.ndarray' to a 'list' 35 | ypredlist=ypred.tolist() # Convert datatype 'numpy.ndarray' to a 'list' 36 | yactlist=yact.tolist() # Convert datatype 'numpy.ndarray' to a 'list' 37 | # Convert lists to a dataframe. 38 | df=pd.DataFrame({'Input_Variable(X)':xlist,'Predicted_Output(ypred)':ypredlist,'Actual_Output(yact)':yactlist}) 39 | df.head() 40 | 41 | # Get the mean of the actual data. 
42 | ymean=np.mean(yact) 43 | yavg=[ymean for i in range(1,len(xlist)+1)] 44 | 45 | 46 | # Calculation of the R-squared or coefficient of determination 47 | # A way to judge the efficacy of the model 48 | # Total Sum of Squares (SST) = SSD + SSR = f(yact-yavg) 49 | # Difference Sum of Squares or SSD = f(yact-ypred) 50 | # Regression Sum of Squares or SSR = f(ypred-yavg) 51 | df['SSR']=(df['Predicted_Output(ypred)']-ymean)**2 52 | df['SST']=(df['Actual_Output(yact)']-ymean)**2 53 | SSR=df.sum()['SSR'] 54 | SST=df.sum()['SST'] 55 | SSR/SST # Out: 0.7354410334035838 56 | 57 | # Calculating alpha and beta coefficients 58 | xmean=np.mean(df['Input_Variable(X)']) 59 | ymean=np.mean(df['Actual_Output(yact)']) 60 | df['beta']=(df['Input_Variable(X)']-xmean)*(df['Actual_Output(yact)']-ymean) 61 | df['xvar']=(df['Input_Variable(X)']-xmean)**2 62 | betan=df.sum()['beta'] 63 | betad=df.sum()['xvar'] 64 | beta=betan/betad 65 | 66 | alpha=ymean-(betan/betad)*xmean 67 | beta,alpha # beta : 0.29063 alpha: 2.04474 68 | 69 | # Generate new colum to incoporate our new parameters or coefficients 70 | df['ymodel']=beta*df['Input_Variable(X)']+alpha 71 | 72 | # Calculation of the R-squared or coefficient of determination 73 | # for the new model. 74 | df['SSR']=(df['ymodel']-ymean)**2 75 | df['SST']=(df['Actual_Output(yact)']-ymean)**2 76 | SSR2=df.sum()['SSR'] 77 | SST2=df.sum()['SST'] 78 | SSR2/SST2 79 | 80 | # Plot the current model. 81 | plt.plot(x,ypred) 82 | plt.plot(x,df['ymodel']) 83 | plt.plot(x,yact,'ro') 84 | plt.plot(x,yavg) 85 | plt.title('Actual vs Predicted vs Model') 86 | 87 | # Residual Standard Error (RSE) 88 | df['RSE']=(df['Actual_Output(yact)']-df['ymodel'])**2 89 | RSEd=df.sum()['RSE'] 90 | RSE=np.sqrt(RSEd/98) 91 | # [1/(n-p-1)]:n=number of data points;p=number of predictor variables 92 | RSE -------------------------------------------------------------------------------- /Ch05/linearRegressionECom.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sat May 7 14:18:59 2016 4 | 5 | @author: jasonm_dev 6 | """ 7 | # Handling other issues in linear regression 8 | import pandas as pd 9 | import numpy as np 10 | from sklearn.linear_model import LinearRegression 11 | 12 | # Import Data from CSV file. 
13 | filepath = '/home/jasonm_dev/coding/learning-python-predictive-analytics/datasets' 14 | filename= 'Ecom Expense.csv' 15 | file = filepath+'/'+filename 16 | 17 | df=pd.read_csv(file) 18 | df.head() 19 | print(df.shape) #Out: (2362, 9) 20 | # Out: ['Transaction ID', 'Age ', ' Items ', 'Monthly Income', 21 | # 'Transaction Time', 'Record', 'Gender', City Tier', 'Total Spend'] 22 | 23 | # Create dummy variables for categorical and qualitive data 24 | dummy_gender=pd.get_dummies(df['Gender'],prefix='Sex') 25 | dummy_city_tier=pd.get_dummies(df['City Tier'],prefix='City') 26 | print(df.shape) #Out: (2362, 9) 27 | 28 | # Add dummy variables to the main data 29 | column_name=df.columns.values.tolist() 30 | df1=df[column_name].join(dummy_gender) 31 | column_name1=df1.columns.values.tolist() 32 | df2=df1[column_name1].join(dummy_city_tier) 33 | df2 34 | print(df2.shape) #Out: (2362, 14) 35 | 36 | # For the preceding dataset, let's assume a linear relationship between 37 | # the output variable 'Total Spend' and the predictor variables: 38 | # 'Monthly Income' and 'Transaction Time', and both set of dummy variables 39 | 40 | # Input Variables 41 | feature_cols = ['Monthly Income','Transaction Time','City_Tier 1','City_Tier 2','City_Tier 3','Sex_Female','Sex_Male'] 42 | X = df2[feature_cols] 43 | # Output Variable 44 | Y = df2['Total Spend'] 45 | lm = LinearRegression() 46 | lm.fit(X,Y) 47 | 48 | # Model Parameters 49 | print (lm.intercept_) # Out: 3655.72940769 50 | print (lm.coef_) 51 | # Out: [ 0.15297825 0.12372609 119.6632516 -16.67901801 -102.9842336 52 | # -94.15779883 94.15779883] 53 | zipped = zip(feature_cols, lm.coef_) 54 | list(zipped) 55 | # Out: 56 | #[('Monthly Income', 0.15297824609320515), 57 | # ('Transaction Time', 0.12372608642620003), 58 | # ('City_Tier 1', 119.66325160390119), 59 | # ('City_Tier 2', -16.679018007990429), 60 | # ('City_Tier 3', -102.98423359591075), 61 | # ('Sex_Female', -94.157798830320132), 62 | # ('Sex_Male', 94.157798830320075)] 63 | 64 | # R2 Score 65 | lm.score(X,Y) # Out: 0.19478920552885381 66 | 67 | # Model written out: 68 | # Total_Spend= 69 | # 3655.72 + 0.12*Transaction Time + 0.15*Monthly Income 70 | # + 119*City_Tier 1-16*City_Tier 2 - 102*City_Tier 3 71 | # -94*Sex_Female+94*Sex_Male 72 | 73 | # Calculate the RSE 74 | df2['total_spend_pred']=3720.72940769 + 0.12*df2['Transaction Time']+0.15*df2['Monthly Income']+119*df2['City_Tier 1']-16*df2['City_Tier 2'] 75 | -102*df2['City_Tier 3']-94*df2['Sex_Female']+94*df2['Sex_Male'] 76 | df2['RSE']=(df2['Total Spend']-df2['total_spend_pred'])**2 77 | RSEd=df2.sum()['RSE'] 78 | RSE=np.sqrt(RSEd/2354) # 2362 - 7 - 1 = 2354 79 | salesmean=np.mean(df2['Total Spend']) 80 | error=RSE/salesmean 81 | RSE,salesmean,error 82 | # Out: (2518.8520388731386, 6163.176415976714, 0.40869380800840849) 83 | 84 | # IMPROVEMENT 85 | # Mask the first variable from the resulting list using the iloc method of subsetting 86 | dummy_gender=pd.get_dummies(df['Gender'],prefix='Sex').iloc[:, 1:] 87 | dummy_city_tier=pd.get_dummies(df['City Tier'],prefix='City').iloc[:, 1:] 88 | column_name=df.columns.values.tolist() 89 | df3=df[column_name].join(dummy_gender) 90 | column_name1=df3.columns.values.tolist() 91 | df4=df3[column_name1].join(dummy_city_tier) 92 | df4 93 | 94 | feature_cols = ['Monthly Income','Transaction Time','City_Tier 2','City_Tier 3','Sex_Male'] 95 | X = df2[feature_cols] 96 | Y = df2['Total Spend'] 97 | lm = LinearRegression() 98 | lm.fit(X,Y) 99 | 100 | # Model Parameters 101 | print (lm.intercept_) # Out: 
3681.23486046 102 | print (lm.coef_) 103 | # Out: [ 1.52978246e-01 1.23726086e-01 -1.36342270e+02 -2.22647485e+02 104 | # 1.88315598e+02] 105 | zipped = zip(feature_cols, lm.coef_) 106 | list(zipped) 107 | # Out: 108 | #[('Monthly Income', 0.15297824609320468), 109 | # ('Transaction Time', 0.12372608642590291), 110 | # ('City_Tier 2', -136.34226961189117), 111 | # ('City_Tier 3', -222.6474851998114), 112 | # ('Sex_Male', 188.31559766064038)] 113 | -------------------------------------------------------------------------------- /Ch05/linearRegressionRFE.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sat May 7 14:03:02 2016 4 | 5 | @author: jasonm_dev 6 | """ 7 | 8 | # Linear regression with scikit-learn 9 | 10 | import pandas as pd 11 | from sklearn.feature_selection import RFE 12 | from sklearn.svm import SVR 13 | 14 | # Import Data from CSV file. 15 | filepath = '/home/jasonm_dev/coding/learning-python-predictive-analytics/datasets' 16 | filename= 'Advertising.csv' 17 | file = filepath+'/'+filename 18 | 19 | advert=pd.read_csv(file) 20 | advert.head() 21 | 22 | # Feature selection with scikit-learn 23 | # Recursive Feature Elimination (RFE) 24 | feature_cols = ['TV', 'Radio','Newspaper'] 25 | X = advert[feature_cols] 26 | Y = advert['Sales'] 27 | # Choose 'linear' model. 28 | estimator = SVR(kernel="linear") 29 | # number of desired variables 30 | selector = RFE(estimator,2,step=1) 31 | selector = selector.fit(X, Y) 32 | 33 | # Selected variables. 34 | selector.support_ # Out: array([ True, True, False], dtype=bool) 35 | # X consists of three variables: TV, radio, and newspaper. 36 | # Newspaper hasn't been selected. 37 | 38 | # Selector ranking 39 | selector.ranking_ # Out: array([1, 1, 2]) 40 | # All the selected variables will have a ranking of 1. 41 | # Rest are shown in descending order. -------------------------------------------------------------------------------- /Ch05/linearRegressionSKL.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sat May 7 14:03:02 2016 4 | 5 | @author: jasonm_dev 6 | """ 7 | 8 | # Linear regression with scikit-learn 9 | 10 | import pandas as pd 11 | from sklearn.linear_model import LinearRegression 12 | from sklearn.cross_validation import train_test_split 13 | 14 | # Import Data from CSV file. 
15 | filepath = '/home/jasonm_dev/coding/learning-python-predictive-analytics/datasets' 16 | filename= 'Advertising.csv' 17 | file = filepath+'/'+filename 18 | 19 | advert=pd.read_csv(file) 20 | advert.head() 21 | 22 | # Split dataset into training and testing 23 | feature_cols = ['TV', 'Radio'] 24 | X = advert[feature_cols] 25 | Y = advert['Sales'] 26 | trainX,testX,trainY,testY = train_test_split(X,Y, test_size = 0.2) 27 | lm = LinearRegression() 28 | lm.fit(trainX, trainY) 29 | 30 | print (lm.intercept_) # Out: 2.98314900713 31 | print (lm.coef_) # Out: [ 'TV': 0.04536014 'Radio': 0.18767089] 32 | 33 | zipped = zip(feature_cols, lm.coef_) 34 | list(zipped) 35 | # Out: 36 | #[('TV', 0.044571627228483394), ('Radio', 0.19465327712760053)] 37 | 38 | # Rsquared 39 | lm.score(trainX, trainY) # Out: 0.89235897920220186 40 | 41 | # The model can be used to predict the value of sales using TV and radio 42 | # variables from the test dataset 43 | lm.predict(testX) -------------------------------------------------------------------------------- /Ch05/linearRegressionSMF.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu May 5 21:05:23 2016 4 | 5 | @author: jasonm_dev 6 | """ 7 | 8 | import pandas as pd 9 | import numpy as np 10 | import statsmodels.formula.api as smf 11 | import matplotlib.pyplot as plt 12 | 13 | # Import Data from CSV file. 14 | filepath = '/home/jasonm_dev/coding/learning-python-predictive-analytics/datasets' 15 | filename= 'Advertising.csv' 16 | file = filepath+'/'+filename 17 | 18 | advert=pd.read_csv(file) 19 | advert.head() 20 | 21 | # SECTION 1: Linear regression using the statsmodel library 22 | # Model Assumption 23 | # Model 1: A linear relationship between advertising costs on TV and sales 24 | # i.e. Sales = f(TV)= alpha + beta*TV 25 | # Created a best fit using the least sum of square method 26 | model1=smf.ols(formula='Sales~TV',data=advert).fit() 27 | model1.params # Intercept(alpha): 7.032594; TV(beta): 0.047537 28 | model1.pvalues # Intercept(alpha): 1.406300e-35; TV(beta): 1.467390e-42 29 | # p-values are very small, therfore parameters are significant. 30 | model1.rsquared # 0.61187505085007099 31 | model1.summary() 32 | # the F-statistic for this model is very high 33 | # and the associated p-value is negligible, 34 | # suggesting that the parameter estimates for this model 35 | # were all significant and non-zero. 
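# Illustrative aside (assumes model1 fitted above): parameter significance can
# also be read off the 95% confidence intervals reported by statsmodels;
# intervals that exclude zero agree with the small p-values noted above.
model1.conf_int()  # lower/upper bounds for Intercept and TV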
36 | 37 | # Predict the values of sales based on the equation 38 | sales_pred=model1.predict(pd.DataFrame(advert['TV'])) 39 | sales_pred 40 | 41 | # Calculate RSE term for model 1 42 | advert['sales_pred']=0.047537*advert['TV']+7.03 43 | advert['RSE']=(advert['Sales']-advert['sales_pred'])**2 44 | RSEd=advert.sum()['RSE'] 45 | RSE=np.sqrt(RSEd/198) # # Df Residuals (n-p-1): 200-1-1 = 198 46 | # [1/(n-p-1)]:n=number of data points;p=number of predictor variables 47 | salesmean=np.mean(advert['Sales']) 48 | error=RSE/salesmean 49 | RSE,salesmean,error 50 | # (3.2586573692471279, 14.022500000000003, 0.23238776033140504) 51 | # The current model carries a 23% error and the R2 is 0.61 < 0.9 52 | # F-statistic: 312.1 53 | 54 | # Plot the Sales predicted vs TV Advertising costs 55 | #%matplotlib inline 56 | advert.plot(kind='scatter', x='TV', y='Sales') 57 | plt.plot(pd.DataFrame(advert['TV']),sales_pred,c='red',linewidth=2) 58 | plt.title('Predicted Sales vs TV Advertising Costs') 59 | 60 | 61 | # SECTION 2: Multiple linear regression 62 | # Model 2: 63 | # Sales = f(TV,Newspaper)= alpha + beta1*TV+ beta2*Newspaper 64 | model2=smf.ols(formula='Sales~TV+Newspaper',data=advert).fit() 65 | model2.params 66 | # Intercept(alpha): 5.774948; TV(beta1): 0.046901; Newspaper(beta2): 0.044219 67 | model2.pvalues 68 | # Intercept(alpha): 3.145860e-22; TV(beta1): 5.507584e-44; 69 | # Newspaper(beta2): 2.217084e-05 70 | # p-values are very small, therfore parameters are significant. 71 | model2.rsquared # 0.64583549382932715 72 | model2.summary() 73 | 74 | # Predict the values of sales based on the equation of model 2 75 | sales_pred2=model2.predict(advert[['TV','Newspaper']]) 76 | sales_pred2 77 | 78 | # Calculate RSE term for model 2 79 | advert['sales_pred2']=5.77 + 0.046*advert['TV'] + 0.04*advert['Newspaper'] 80 | advert['RSE2']=(advert['Sales']-advert['sales_pred2'])**2 81 | RSEd2=advert.sum()['RSE2'] 82 | RSE2=np.sqrt(RSEd2/197) # Df Residuals (n-p-1): 200-2-1 = 197 83 | # [1/(n-p-1)]:n=number of data points;p=number of predictor variables 84 | salesmean=np.mean(advert['Sales']) 85 | error2=RSE2/salesmean 86 | RSE2,salesmean,error2 87 | # (3.1346969895743846, 14.022500000000003, 0.22354765481008265) 88 | # The current model carries a 22% error and the R2 is 0.64 < 0.9 89 | # F-statistic: 179.6 90 | 91 | 92 | # Model 3: 93 | # Sales = f(TV,Radio)= alpha + beta1*TV+ beta2*Radio 94 | model3=smf.ols(formula='Sales~TV+Radio',data=advert).fit() 95 | model3.params 96 | # Intercept(alpha): 2.921100; TV(beta1): 0.045755; Radio(beta2): 0.187994 97 | model3.pvalues 98 | # Intercept(alpha): 4.565557e-19; TV(beta1): 5.436980e-82; 99 | # Radio(beta2): 9.776972e-59 100 | # p-values are very small, therfore parameters are significant. 
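# The 95% confidence intervals behind these estimates can be pulled out directly:
model3.conf_int()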
101 | model3.rsquared # 0.89719426108289568 102 | model3.summary() 103 | 104 | # Predict the values of sales based on the equation of model 3 105 | sales_pred3=model3.predict(advert[['TV','Radio']]) 106 | sales_pred3 107 | 108 | # Calculate RSE term for model 3 109 | advert['sales_pred3']=2.92 + 0.045*advert['TV'] + 0.18*advert['Radio'] 110 | advert['RSE3']=(advert['Sales']-advert['sales_pred3'])**2 111 | RSEd3=advert.sum()['RSE3'] 112 | RSE3=np.sqrt(RSEd3/197) # Df Residuals (n-p-1): 200-2-1 = 197 113 | # [1/(n-p-1)]:n=number of data points;p=number of predictor variables 114 | salesmean=np.mean(advert['Sales']) 115 | error3=RSE3/salesmean 116 | RSE3,salesmean,error3 117 | # (1.7136206211553162, 14.022500000000003, 0.12220507193120456) 118 | # The current model carries a 12% error and the R2 is 0.89 < 0.9 119 | # F-statistic: 859.6 => indicating a very efficient model. 120 | 121 | 122 | # Model 4: 123 | # Sales = f(TV,Radio)= alpha + beta1*TV + beta2*Radio + beta3*Newspaper 124 | model4=smf.ols(formula='Sales~TV+Radio+Newspaper',data=advert).fit() 125 | model4.params 126 | # Intercept(alpha): 2.938889; TV(beta1): 0.045765; 127 | # Radio(beta2): 0.188530; Newspaper(beta3): -0.001037 128 | model4.pvalues 129 | # Intercept(alpha): 1.267295e-17; TV(beta1): 1.509960e-81; 130 | # Radio(beta2): 1.505339e-54; Newspaper(beta3): 8.599151e-01 131 | # p-values are very small, therfore parameters are significant. 132 | model4.rsquared # 0.89721063817895219 133 | model4.summary() 134 | 135 | # Predict the values of sales based on the equation of model 4 136 | sales_pred4=model4.predict(advert[['TV','Radio','Newspaper']]) 137 | sales_pred4 138 | 139 | # Calculate RSE term for model 4 140 | advert['sales_pred4']=2.938 + 0.045*advert['TV'] + 0.188*advert['Radio'] - 0.001*advert['Newspaper'] 141 | advert['RSE4']=(advert['Sales']-advert['sales_pred4'])**2 142 | RSEd4=advert.sum()['RSE4'] 143 | RSE4=np.sqrt(RSEd4/196) # Df Residuals (n-p-1): 200-3-1 = 196 144 | # [1/(n-p-1)]:n=number of data points;p=number of predictor variables 145 | salesmean=np.mean(advert['Sales']) 146 | error4=RSE4/salesmean 147 | RSE4,salesmean,error4 148 | # (1.691523011857319, 14.022500000000003, 0.12062920391209261) 149 | # The current model carries a 12% error and the R2 is 0.89 < 0.9 150 | # F-statistic: 570.3 => 151 | # This suggests that the partial benefit of adding newspaper to the model 152 | # containing TV and radio is negative. 153 | # RSE does not increase as book says. It decreases from 1.71 to 1.69. 154 | 155 | 156 | # Multi-collinearity 157 | # Calculate the Variance Inflation Factor 158 | # It is a method to quantify the rise in the variability of the coefficient 159 | # estimate of a particular variable because of high correlation between two or 160 | # more than two predictor variables. 161 | 162 | # VIF for the Newspaper 163 | modelVIF1=smf.ols(formula='Newspaper~TV+Radio',data=advert).fit() 164 | rsquared1=modelVIF1.rsquared 165 | VIF1=1/(1-rsquared1) 166 | VIF1 # Out: 1.1451873787239286 167 | 168 | # VIF for the Radio 169 | modelVIF2=smf.ols(formula='Radio~TV+Newspaper',data=advert).fit() 170 | rsquared2=modelVIF2.rsquared 171 | VIF2=1/(1-rsquared2) 172 | VIF2 # Out: 1.1449519171055353 173 | 174 | # VIF for the TV 175 | modelVIF3=smf.ols(formula='TV~Newspaper+Radio',data=advert).fit() 176 | rsquared3=modelVIF3.rsquared 177 | VIF3=1/(1-rsquared3) 178 | VIF3 # Out: 1.0046107849396502 179 | 180 | # Summary: 181 | # Newspaper and Radio have the same VIF and are thus correlated with one another. 
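# (All three VIFs are close to 1, so multi-collinearity is mild in this dataset.)
# The three VIF fits above follow one pattern and could equally be produced in a
# loop; a minimal sketch, reusing the advert DataFrame and smf import from above:
# for col in ['TV', 'Radio', 'Newspaper']:
#     others = '+'.join(c for c in ['TV', 'Radio', 'Newspaper'] if c != col)
#     vif = 1/(1 - smf.ols(formula=col + '~' + others, data=advert).fit().rsquared)
#     print(col, vif)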
182 | # Model 3 with TV and Radio is superior to Model 2 with TV and Newspaper. 183 | # Model 4 with all 3 variable is actually weaker than Model 3. 184 | 185 | # Training and testing data split 186 | a=np.random.randn(len(advert)) 187 | check=a<0.8 188 | training=advert[check] # Out: 152 189 | testing=advert[~check] # Out: 48 190 | 191 | # Model 5: [model will changeeach time its run because of random generator.] 192 | # Sales = f(TV,Radio)= alpha + beta1*TV+ beta2*Radio 193 | model5=smf.ols(formula='Sales~TV+Radio',data=training).fit() 194 | model5.params 195 | # Intercept(alpha): 2.771009; TV(beta1): 0.047188; Radio(beta2): 0.185030 196 | model5.pvalues 197 | # Intercept(alpha): 6.613587e-13; TV(beta1): 2.625145e-62; 198 | # Radio(beta2): 1.803356e-41 199 | # p-values are very small, therfore parameters are significant. 200 | model5.rsquared # 0.89415688916044844 201 | model5.summary() # F-statistic: 629.4 202 | 203 | # Predict the values of sales based on the equation of model 5 using testing data 204 | sales_pred5=model5.predict(training[['TV','Radio']]) 205 | sales_pred5 206 | 207 | # Calculate RSE term for model 5 208 | testing['sales_pred5']=2.7710 + 0.0472*testing['TV'] + 0.1850*testing['Radio'] 209 | testing['RSE5']=(testing['Sales']-testing['sales_pred5'])**2 210 | RSEd5=testing.sum()['RSE5'] 211 | RSE5=np.sqrt(RSEd5/45) # len(testing) = 48; (n-p-1): 48-2-1 = 45 212 | # [1/(n-p-1)]:n=number of data points;p=number of predictor variables 213 | salesmean=np.mean(testing['Sales']) 214 | error5=RSE5/salesmean 215 | RSE5,salesmean,error5 216 | # (1.4032428224556619, 14.120833333333335, 0.099373938444779819) 217 | # The current model carries a 11% error and the R2 is 0.89 < 0.9 -------------------------------------------------------------------------------- /Ch05/nonlinearRegression.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sat May 7 14:56:09 2016 4 | 5 | @author: jasonm_dev 6 | """ 7 | 8 | # Transforming a variable to fit non-linear relations 9 | import pandas as pd 10 | import numpy as np 11 | import matplotlib.pyplot as plt 12 | from sklearn.linear_model import LinearRegression 13 | from sklearn.preprocessing import PolynomialFeatures 14 | from sklearn import linear_model 15 | 16 | # Import Data from CSV file. 17 | filepath = '/home/jasonm_dev/coding/learning-python-predictive-analytics/datasets' 18 | filename= 'Auto.csv' 19 | file = filepath+'/'+filename 20 | 21 | data=pd.read_csv(file) 22 | data.head() 23 | print(data.shape) #Out: (406, 9) 24 | 25 | # Plot data to check linearity 26 | #%matplotlib inline 27 | data['mpg']=data['mpg'].dropna() 28 | data['horsepower']=data['horsepower'].dropna() 29 | plt.plot(data['horsepower'],data['mpg'],'ro') 30 | plt.xlabel('Horsepower') 31 | plt.ylabel('MPG (Miles Per Gallon)') 32 | 33 | # Model 1 34 | # Try linear model. 
35 | # MPG = co + alpha*HP 36 | X=data['horsepower'].fillna(data['horsepower'].mean()) 37 | Y=data['mpg'].fillna(data['mpg'].mean()) 38 | lm=LinearRegression() 39 | lm.fit(X[:,np.newaxis],Y) 40 | 41 | # Plot Again 42 | #plt.plot(data['horsepower'],data['mpg'],'ro') 43 | #plt.plot(X,lm.predict(X[:,np.newaxis]),color='blue') 44 | 45 | # R2 score 46 | lm.score(X[:,np.newaxis],Y) # Out: 0.57465334064502505 47 | 48 | # Alternative method for RSE 49 | RSEd=(Y-lm.predict(X[:,np.newaxis]))**2 50 | RSE1=np.sqrt(np.sum(RSEd)/389) 51 | ymean=np.mean(Y) 52 | error1=RSE1/ymean 53 | RSE1,error1 # Out: (5.1496254786975237, 0.21899719414044677) 54 | 55 | # Model 2 56 | # In the form of mpg = co+a1.horsepower**2, 57 | X2=data['horsepower'].fillna(data['horsepower'].mean())*data['horsepower'].fillna(data['horsepower'].mean()) 58 | Y2=data['mpg'].fillna(data['mpg'].mean()) 59 | lm2=LinearRegression() 60 | lm2.fit(X2[:,np.newaxis],Y2) 61 | 62 | type(lm2.predict(X2[:,np.newaxis])) 63 | RSEd=(Y2-lm2.predict(X2[:,np.newaxis]))**2 64 | RSE2=np.sqrt(np.sum(RSEd)/390) 65 | ymean=np.mean(Y2) 66 | error2=RSE2/ymean 67 | RSE2,error2,ymean 68 | # Out: (5.6591995312606125, 0.24066775798625065, 23.51457286432162) 69 | 70 | # R2 score 71 | lm2.score(X2[:,np.newaxis],Y2) # Out: 0.48498870348232048 72 | 73 | print (lm2.intercept_) # Out: 30.405683105 74 | print (lm2.coef_) # Out:[ 0. -0.43404318 0.00112615] 75 | 76 | # Model 3 77 | # Attempt polynomial fit with 2 degrees 78 | X3=data['horsepower'].fillna(data['horsepower'].mean()) 79 | Y3=data['mpg'].fillna(data['mpg'].mean()) 80 | poly = PolynomialFeatures(degree=2) 81 | X3_ = poly.fit_transform(X3[:,np.newaxis]) 82 | clf3 = linear_model.LinearRegression() 83 | clf3.fit(X3_, Y3) 84 | 85 | print (clf3.intercept_) # Out: 55.0261924471 86 | print (clf3.coef_) # Out: [-0.00055043] 87 | 88 | # R2 score # R2 = 0.688 89 | clf3.score(X3_,Y3) # Out: 0.6439066584257469 90 | 91 | # Model 4 92 | # Attempt polynomial fit with 5 degrees 93 | X5=data['horsepower'].fillna(data['horsepower'].mean()) 94 | Y5=data['mpg'].fillna(data['mpg'].mean()) 95 | poly = PolynomialFeatures(degree=5) 96 | X5_ = poly.fit_transform(X5[:,np.newaxis]) 97 | clf5 = linear_model.LinearRegression() 98 | clf5.fit(X5_, Y5) 99 | 100 | print (clf5.intercept_) # Out: -40.6939920548 101 | print (clf5.coef_) 102 | # Out:[ 0.00000000e+00 4.00021890e+00 -7.54802463e-02 6.19621638e-04 103 | # -2.36220983e-06 3.41983064e-09] 104 | 105 | # R2 = 0.7 106 | clf5.score(X5_,Y5) # Out: 0.6547512491826567 107 | 108 | # Model 5 109 | # Try y = 1/x or 1/x2 110 | 111 | 112 | # Plot All 113 | XP = np.arange(45,248,0.5) 114 | M2 = 30.405683105 -0.00055043*XP**2 115 | M3 = 55.0261924471 - 0.43404318*XP + 0.00112615*XP**2 116 | M4 = -40.6939920548 + 4.00021890e+00*XP -7.54802463e-02*XP**2 + 6.19621638e-04*XP**3 -2.36220983e-06*XP**4 + 3.41983064e-09*XP**5 117 | 118 | plt.plot(data['horsepower'],data['mpg'],'ro') # Actual Data 119 | plt.plot(XP,lm.predict(XP[:,np.newaxis]),color='magenta') 120 | plt.plot(XP,M2,color='blue') # Model 2 121 | plt.plot(XP,M3,color='green') # Model 3 122 | plt.plot(XP,M4,color='yellow') # Model 4 -------------------------------------------------------------------------------- /Ch06/Histogram of Age.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonMDev/learning-python-predictive-analytics/6757b9ef3e7bbb803d4a1c00d999225cbb4fe2eb/Ch06/Histogram of Age.png -------------------------------------------------------------------------------- 
/Ch06/Purchase Frequency for Day of Week'.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonMDev/learning-python-predictive-analytics/6757b9ef3e7bbb803d4a1c00d999225cbb4fe2eb/Ch06/Purchase Frequency for Day of Week'.png -------------------------------------------------------------------------------- /Ch06/Purchase Frequency for Education Level.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonMDev/learning-python-predictive-analytics/6757b9ef3e7bbb803d4a1c00d999225cbb4fe2eb/Ch06/Purchase Frequency for Education Level.png -------------------------------------------------------------------------------- /Ch06/Purchase Frequency for Month of the Year.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonMDev/learning-python-predictive-analytics/6757b9ef3e7bbb803d4a1c00d999225cbb4fe2eb/Ch06/Purchase Frequency for Month of the Year.png -------------------------------------------------------------------------------- /Ch06/ROC Curve.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonMDev/learning-python-predictive-analytics/6757b9ef3e7bbb803d4a1c00d999225cbb4fe2eb/Ch06/ROC Curve.png -------------------------------------------------------------------------------- /Ch06/Stacked Bar Chart of Marital Status vs Purchase.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonMDev/learning-python-predictive-analytics/6757b9ef3e7bbb803d4a1c00d999225cbb4fe2eb/Ch06/Stacked Bar Chart of Marital Status vs Purchase.png -------------------------------------------------------------------------------- /Ch06/logisticRegression.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sat May 7 21:48:26 2016 4 | 5 | @author: jasonm_dev 6 | """ 7 | 8 | # Understanding the math behind logistic regression 9 | import pandas as pd 10 | import numpy as np 11 | 12 | # Import Data from CSV file. 13 | filepath = '/home/jasonm_dev/coding/learning-python-predictive-analytics/datasets' 14 | filename= 'Gender Purchase.csv' 15 | file = filepath+'/'+filename 16 | 17 | df=pd.read_csv(file) 18 | df.head() 19 | print(df.shape) #Out: (511, 2) 20 | 21 | # Contingency table for the dataset 22 | contingency_table=pd.crosstab(df['Gender'],df['Purchase']) 23 | contingency_table 24 | # Add horizontally 25 | contingency_table.sum(axis=1) 26 | # Add vertically 27 | contingency_table.sum(axis=0) 28 | 29 | # Calculate the proportions 30 | contingency_table.astype('float').div(contingency_table.sum(axis=1),axis=0) 31 | 32 | -------------------------------------------------------------------------------- /Ch06/logisticRegressionImplementation.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed May 11 20:15:24 2016 4 | 5 | @author: jasonm_dev 6 | """ 7 | 8 | # Implementing logistic regression with Python 9 | import pandas as pd 10 | import numpy as np 11 | import matplotlib.pyplot as plt 12 | 13 | # Import Data from CSV file. 
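# Note: bank.csv is semicolon-delimited, hence the sep=';' argument below. The
# hard-coded filepath assumes this repository's local checkout location; a more
# portable sketch (assuming the script stays inside Ch06/) would be:
# import os
# file = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'datasets', 'bank.csv')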
14 | filepath = '/home/jasonm_dev/coding/learning-python-predictive-analytics/datasets' 15 | filename= 'bank.csv' 16 | file = filepath+'/'+filename 17 | 18 | bank=pd.read_csv(file, sep=';') 19 | bank.head() 20 | print(bank.shape) #Out: (4119, 21) 21 | 22 | # Column Names 23 | bank.columns.values 24 | 25 | # Out: array(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan', 26 | # 'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays', 27 | # 'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx', 28 | # 'cons.conf.idx', 'euribor3m', 'nr.employed', 'y'], dtype=object) 29 | 30 | # Type of the column from the dataset 31 | bank.dtypes 32 | #Out[6]: 33 | #age int64 34 | #job object 35 | #marital object 36 | #education object 37 | #default object 38 | #housing object 39 | #loan object 40 | #contact object 41 | #month object 42 | #day_of_week object 43 | #duration int64 44 | #campaign int64 45 | #pdays int64 46 | #previous int64 47 | #poutcome object 48 | #emp.var.rate float64 49 | #cons.price.idx float64 50 | #cons.conf.idx float64 51 | #euribor3m float64 52 | #nr.employed float64 53 | #y object 54 | #dtype: object 55 | 56 | # Processing the data 57 | # the 'y' column is the customer variable with outcome'yes' and 'no'. 58 | # Convert column to something that can be used, i.e. '1' and '0' 59 | bank['y']=(bank['y']=='yes').astype(int) 60 | 61 | # Education column has many categories and needs to be reduced. 62 | bank['education'].unique() 63 | 64 | # The basic category has been repeated three times probably to 65 | # capture 4, 6, and 9 years of education. Let us club these three together 66 | # and call them basic. Other modified as well. 67 | bank['education']=np.where(bank['education'] =='basic.9y', 'Basic', bank['education']) 68 | bank['education']=np.where(bank['education'] =='basic.6y', 'Basic', bank['education']) 69 | bank['education']=np.where(bank['education'] =='basic.4y', 'Basic', bank['education']) 70 | bank['education']=np.where(bank['education'] =='university.degree', 'University Degree', bank['education']) 71 | bank['education']=np.where(bank['education'] =='professional.course', 'Professional Course', bank['education']) 72 | bank['education']=np.where(bank['education'] =='high.school', 'High School', bank['education']) 73 | bank['education']=np.where(bank['education'] =='illiterate', 'Illiterate', bank['education']) 74 | bank['education']=np.where(bank['education'] =='unknown', 'Unknown', bank['education']) 75 | 76 | # Data exploration 77 | # The number of people who purchased the term deposit 78 | bank['y'].value_counts() # Out: Out[12]: [ '0' 3668, '1' 451] 79 | 80 | # Many numbers, so lets gets an overview. 81 | bank.groupby('y').mean() 82 | # Categorical means 83 | bank.groupby('education').mean() 84 | 85 | # Data visualization 86 | 87 | # Tabular data 88 | pd.crosstab(bank.education,bank.y) 89 | # %matplotlib inline 90 | #pd.crosstab(bank.education,bank.y).plot(kind='bar') 91 | #plt.title('Purchase Frequency for Education Level') 92 | #plt.xlabel('Education') 93 | #plt.ylabel('Frequency of Purchase') 94 | 95 | # Stacked bar chart of marital staus and purchase of term deposit. 
96 | #table=pd.crosstab(bank.marital,bank.y) 97 | #table.div(table.sum(1).astype(float), axis=0).plot(kind='bar', stacked=True) 98 | #plt.title('Stacked Bar Chart of Marital Status vs Purchase') 99 | #plt.xlabel('Marital Status') 100 | #plt.ylabel('Proportion of Customers') 101 | 102 | # Bar chart of Purchase Frequency for Day of Week' 103 | #pd.crosstab(bank.day_of_week,bank.y).plot(kind='bar') 104 | #plt.title('Purchase Frequency for Day of Week') 105 | #plt.xlabel('Day of Week') 106 | #plt.ylabel('Frequency of Purchase') 107 | 108 | # Bar chart of Purchase Frequency for Day of Week' 109 | #pd.crosstab(bank.month,bank.y).plot(kind='bar') 110 | #plt.title('Purchase Frequency for Month of the Year') 111 | #plt.xlabel('Month of the Year') 112 | #plt.ylabel('Frequency of Purchase') 113 | 114 | # Histogram of Age 115 | #bank.age.hist() 116 | #plt.title('Histogram of Age') 117 | #plt.xlabel('Age') 118 | #plt.ylabel('Frequency') 119 | 120 | # Creating dummy variables for categorical variables 121 | cat_vars=['job','marital','education','default','housing','loan','contact','month','day_of_week','poutcome'] 122 | for var in cat_vars: 123 | cat_list='var'+'_'+var 124 | cat_list = pd.get_dummies(bank[var], prefix=var) 125 | bank1=bank.join(cat_list) 126 | bank=bank1 127 | 128 | # Remove actual categories once dummies have been created 129 | bank_vars=bank.columns.values.tolist() 130 | to_keep=[i for i in bank_vars if i not in cat_vars] 131 | 132 | # Subset the bank dataframe to only keep the columns present 133 | bank_final=bank[to_keep] 134 | bank_final.columns.values 135 | 136 | # Y outcomes and X predictors can now be calculated 137 | bank_final_vars=bank_final.columns.values.tolist() 138 | Y=['y'] 139 | X=[i for i in bank_final_vars if i not in Y ] 140 | 141 | # Feature selection 142 | # All 12 columns can be selected 143 | from sklearn import datasets 144 | from sklearn.feature_selection import RFE 145 | from sklearn.linear_model import LogisticRegression 146 | 147 | model = LogisticRegression() 148 | 149 | # Selct a model with 12 variables. 150 | rfe = RFE(model, 12) 151 | rfe = rfe.fit(bank_final[X],bank_final[Y] ) 152 | 153 | # Print out the support array 154 | print(rfe.support_) 155 | # Print out the ranking 156 | print(rfe.ranking_) 157 | # The columns with true or 1 shall be selected for the final selection. 158 | 159 | # 'previous', 'euribor3m', 'job_entrepreneur', 'job_self-employed', 160 | # 'poutcome_success', 'poutcome_failure', 'month_oct', 'month_may','month_mar', 161 | # 'month_jun', 'month_jul', 'month_dec' 162 | 163 | # Fit a logistic regression model using the preceding selected variables 164 | # as predictor variables, with the y as the outcome variable 165 | cols=['previous', 'euribor3m', 'job_entrepreneur', 'job_self-employed', 'poutcome_success', 'poutcome_failure', 'month_oct', 'month_may', 166 | 'month_mar', 'month_jun', 'month_jul', 'month_dec'] 167 | # Dataframe taht just has the selected columns 168 | X=bank_final[cols] 169 | Y=bank_final['y'] 170 | 171 | # Implementing the model 172 | import statsmodels.api as sm 173 | logit_model=sm.Logit(Y,X) 174 | result=logit_model.fit() 175 | print (result.summary()) 176 | 177 | # The statsmodel.api method can be used while exploring and fine-tuning the model. 178 | # One advantage of this method is that p-values are calculated automatically 179 | # in the result summary. 180 | # The scikit-learn method can be used in the final model used to predict the outcome. 
181 | # The scikit-learn method doesn't report p-values automatically,
182 | # but is more powerful for calculation-intensive tasks such as prediction,
183 | # calculating scores, and advanced functions such as feature selection.
184 | 
185 | # Fit the model
186 | from sklearn import linear_model
187 | clf = linear_model.LogisticRegression()
188 | clf.fit(X, Y)
189 | 
190 | # Calculate the accuracy
191 | clf.score(X,Y) #Out = 0.90216071862102454
192 | # The value comes out to be .902. The mean value of the outcome is .11,
193 | # meaning that the outcome is positive (1) around 11% of the time and negative
194 | # around 89% of the time.
195 | 
196 | # Get the values of the coefficients
197 | zipped = list(zip(X.columns, np.transpose(clf.coef_)))
198 | pd.DataFrame(zipped)
199 | 
200 | # Out:
201 | # 0 1
202 | #0 previous [0.379831612876]
203 | #1 euribor3m [-0.502749071837]
204 | #2 job_entrepreneur [-0.343066155888]
205 | #3 job_self-employed [-0.335064163493]
206 | #4 poutcome_success [1.07783253323]
207 | #5 poutcome_failure [-0.753161867894]
208 | #6 month_oct [0.411855745929]
209 | #7 month_may [-0.743089630936]
210 | #8 month_mar [1.2703612295]
211 | #9 month_jun [0.509694983142]
212 | #10 month_jul [0.382087449085]
213 | #11 month_dec [0.873316799315]
214 | 
215 | # Model validation and evaluation
216 | # Split into training and testing sets
217 | from sklearn.cross_validation import train_test_split
218 | X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)
219 | 
220 | # Build a logistic regression model over the training dataset
221 | from sklearn import linear_model
222 | from sklearn import metrics
223 | clf1 = linear_model.LogisticRegression()
224 | clf1.fit(X_train, Y_train)
225 | 
226 | # Get probabilities and classifications
227 | probs = clf1.predict_proba(X_test)
228 | 
229 | # Out: [ Negative, Positive]
230 | # array([[ 0.93352157, 0.06647843],
231 | # ...,
232 | # [ 0.24746608, 0.75253392]])
233 | 
234 | # Get predicted outcomes
235 | predicted = clf1.predict(X_test)
236 | print(predicted) # Out: [0 0 0 ..., 0 0 1]
237 | # The default cut-off is 0.5.
238 | # We saw that only about 10% of customers bought the product, hence a 0.1 cut-off.
239 | 
240 | # Changing the threshold value
241 | prob=probs[:,1] # Take second column, i.e.
positive outcomes 242 | prob_df=pd.DataFrame(prob) # Push to dataframe 243 | prob_df['predict']=np.where(prob_df[0]>=0.10,1,0) 244 | prob_df.head() 245 | # [ @0.1 => 28%, @0.15 => 18%, @0.05 => 65%] 246 | 247 | # Accuracy of the model 248 | print (metrics.accuracy_score(Y_test, predicted)) # Out: 0.902103559871 249 | 250 | # Cross validation 251 | # Using the k-fold method 252 | # Use a 8-fold cross validation method 253 | # CAlculates the accuracy of each iteration 254 | from sklearn.cross_validation import cross_val_score 255 | scores = cross_val_score(linear_model.LogisticRegression(), X, Y, scoring='accuracy', cv=8) 256 | print (scores) 257 | # Out: [ 0.91860465 0.90310078 0.89534884 0.90679612 0.89883268 258 | # 0.89299611 0.90466926 0.89883268] 259 | print (scores.mean()) # Out: 0.902397639921 260 | 261 | 262 | # Model Validation 263 | # ROC Curve 264 | # Run model and calculate the probabilities for each observation 265 | X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0) 266 | clf1 = linear_model.LogisticRegression() 267 | clf1.fit(X_train, Y_train) 268 | probs = clf1.predict_proba(X_test) 269 | 270 | # Each probable value is compared to threshold probability and categorized as 271 | # 1 (postive outcome) 272 | prob=probs[:,1] 273 | prob_df=pd.DataFrame(prob) 274 | prob_df['predict']=np.where(prob_df[0]>=0.05,1,0) 275 | prob_df['actual']=Y_test #TODO: Comes out as NAN 276 | prob_df.head() 277 | 278 | # Confusion matrix 279 | confusion_matrix=pd.crosstab(prob_df['actual'],prob_df['predict']) 280 | confusion_matrix 281 | 282 | # Plot ROC curve manually 283 | 284 | #%matplotlib inline 285 | Sensitivity=[1,0.95,0.87,0.62,0.67,0.59,0.5,0.41,0] 286 | FPR=[1,0.76,0.62,0.23,0.27,0.17,0.12,0.07,0] 287 | #plt.plot(FPR,Sensitivity,marker='o',linestyle='--',color='r') 288 | x=[i*0.01 for i in range(100)] 289 | y=[i*0.01 for i in range(100)] 290 | #plt.plot(x,y) 291 | #plt.xlabel('(1-Specificity)') 292 | #plt.ylabel('Sensitivity') 293 | #plt.title('ROC Curve') 294 | 295 | # Using scikit-learn package to plot the ROC Curve 296 | #TODO: 297 | from sklearn import metrics 298 | from ggplot import * 299 | 300 | prob = clf1.predict_proba(X_test)[:,1] 301 | fpr, sensitivity, _ = metrics.roc_curve(Y_test, prob) 302 | 303 | df = pd.DataFrame(dict(fpr=fpr, sensitivity=sensitivity)) 304 | ggplot(df, aes(x='fpr', y='sensitivity')) + geom_line() +\ 305 | geom_abline(linetype='dashed') 306 | 307 | # Area under the curve 308 | auc = metrics.auc(fpr,sensitivity) 309 | auc 310 | 311 | # Area under curve can be plotted. 
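# If the ggplot package is unavailable (see ISSUELOG.md), the same curve can be
# drawn with matplotlib, which is already imported in this script, e.g.:
#plt.plot(fpr, sensitivity, color='r', label='ROC curve (AUC = %0.3f)' % auc)
#plt.plot([0,1], [0,1], linestyle='--')
#plt.xlabel('(1-Specificity)')
#plt.ylabel('Sensitivity')
#plt.title('ROC Curve')
#plt.legend(loc='lower right')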
312 | ggplot(df, aes(x='fpr', ymin=0, ymax='sensitivity')) +\ 313 | geom_area(alpha=0.2) +\ 314 | geom_line(aes(y='sensitivity')) +\ 315 | ggtitle("ROC Curve w/ AUC=%s" % str(auc)) 316 | 317 | -------------------------------------------------------------------------------- /Ch06/logisticRegressionScratch.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue May 10 20:28:13 2016 4 | 5 | @author: jasonm_dev 6 | """ 7 | 8 | # Building the logistic regression model from scratch 9 | # Step 1: defining the likelihood function 10 | def likelihood(y,pi): 11 | import numpy as np 12 | ll=1 13 | ll_in=list(range(1,len(y)+1)) 14 | for i in range(len(y)): 15 | ll_in[i]=np.where(y[i]==1,pi[i],(1-pi[i])) 16 | ll=ll*ll_in[i] 17 | return ll 18 | 19 | # Step 2: calculating probability for each observation 20 | def logitprob(X,beta): 21 | import numpy as np 22 | rows=np.shape(X)[0] 23 | cols=np.shape(X)[1] 24 | pi=list(range(1,rows+1)) 25 | expon=list(range(1,rows+1)) 26 | for i in range(rows): 27 | expon[i]=0 28 | for j in range(cols): 29 | ex=X[i][j]*beta[j] 30 | expon[i]=ex+expon[i] 31 | with np.errstate(divide='ignore', invalid='ignore'): 32 | pi[i]=np.exp(expon[i])/(1+np.exp(expon[i])) 33 | return pi 34 | 35 | # Step 3: Calculate the W diagonal matrix 36 | def findW(pi): 37 | import numpy as np 38 | W=np.zeros(len(pi)*len(pi)).reshape(len(pi),len(pi)) 39 | for i in range(len(pi)): 40 | print (i) 41 | W[i,i]=pi[i]*(1-pi[i]) 42 | W[i,i].astype(float) 43 | return W 44 | 45 | # Step 4: defining the logistic function 46 | def logistic(X,Y,limit): 47 | import numpy as np 48 | from numpy import linalg 49 | nrow=np.shape(X)[0] 50 | bias=np.ones(nrow).reshape(nrow,1) 51 | X_new=np.append(X,bias,axis=1) 52 | ncol=np.shape(X_new)[1] 53 | beta=np.zeros(ncol).reshape(ncol,1) 54 | root_diff=np.array(range(1,ncol+1)).reshape(ncol,1) 55 | iter_i=10000 56 | while(iter_i>limit): 57 | print (iter_i, limit) 58 | pi=logitprob(X_new,beta) 59 | print (pi) 60 | W=findW(pi) 61 | print (W) 62 | print (X_new) 63 | print (Y-np.transpose(pi)) 64 | print (np.array((linalg.inv(np.matrix(np.transpose(X_new))*np.matrix(W)*np.matrix(X_new)))*(np.transpose(np.matrix(X_new))*np.matrix(Y-np.transpose(pi)).transpose()))) 65 | print (beta) 66 | print (type(np.matrix(np.transpose(Y-np.transpose(pi)))) ) 67 | print (np.matrix(Y-np.transpose(pi)).transpose().shape) 68 | print (np.matrix(np.transpose(X_new)).shape) 69 | root_diff=np.array((linalg.inv(np.matrix(np.transpose(X_new))*np.matrix(W)*np.matrix(X_new)))*(np.transpose(np.matrix(X_new))*np.matrix(Y-np.transpose(pi)).transpose())) 70 | beta=beta+root_diff 71 | iter_i=np.sum(root_diff*root_diff) 72 | ll=likelihood(Y,pi) 73 | print (beta) 74 | print (beta.shape) 75 | return beta 76 | 77 | # Testing the model 78 | import numpy as np 79 | X=np.array(range(10)).reshape(10,1) 80 | Y=[0,0,0,0,1,0,1,0,1,1] 81 | bias=np.ones(10).reshape(10,1) 82 | X_new=np.append(X,bias,axis=1) 83 | 84 | # Running logistic Regression using our function 85 | a=logistic(X,Y,0.000000001) 86 | ll=likelihood(Y,logitprob(X,a)) 87 | #Coefficient of X = 0.66 , Intercept = -3.69 88 | 89 | # From stasmodel.api 90 | import statsmodels.api as sm 91 | logit_model=sm.Logit(Y,X_new) 92 | result=logit_model.fit() 93 | print (result.summary()) 94 | #Coefficient of X = 0.66, Intercept = -3.69 -------------------------------------------------------------------------------- /Ch07/Histogram of Clusters.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonMDev/learning-python-predictive-analytics/6757b9ef3e7bbb803d4a1c00d999225cbb4fe2eb/Ch07/Histogram of Clusters.png -------------------------------------------------------------------------------- /Ch07/Histogramn of Cluster Labels.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonMDev/learning-python-predictive-analytics/6757b9ef3e7bbb803d4a1c00d999225cbb4fe2eb/Ch07/Histogramn of Cluster Labels.png -------------------------------------------------------------------------------- /Ch07/clusterWine.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri May 13 13:33:18 2016 4 | 5 | @author: jasonm_dev 6 | """ 7 | 8 | # Classify wine by chemical composition 9 | import pandas as pd 10 | import matplotlib.pyplot as plt 11 | 12 | # Import Data from CSV file. 13 | filepath = '/home/jasonm_dev/coding/learning-python-predictive-analytics/datasets' 14 | filename= 'wine.csv' 15 | file = filepath+'/'+filename 16 | df=pd.read_csv(file,sep=';') 17 | df.head() 18 | 19 | # Plot data to have a look at the quality 20 | #% matplotlib inline 21 | #plt.hist(df['quality']) 22 | 23 | # Check the mean of the quality 24 | df.groupby('quality').mean() 25 | 26 | # Normalizing the values in the dataset 27 | df_norm = (df - df.min()) / (df.max() - df.min()) 28 | df_norm.head() 29 | 30 | # Hierarchical clustering using scikit-learn 31 | from sklearn.cluster import AgglomerativeClustering 32 | ward = AgglomerativeClustering(n_clusters=6, linkage='ward').fit(df_norm) 33 | md=pd.Series(ward.labels_) 34 | ward.children_ 35 | 36 | # Plot the histogram of cluster labels 37 | #plt.hist(md) 38 | #plt.title('Histogram of Cluster Label') 39 | #plt.xlabel('Cluster') 40 | #plt.ylabel('Frequency') 41 | 42 | # K-Means clustering using scikit-learn 43 | # fits the k-means clustering model to the wine dataset 44 | from sklearn.cluster import KMeans 45 | from sklearn import datasets 46 | model=KMeans(n_clusters=6) 47 | model.fit(df_norm) 48 | 49 | # an array depicting the cluster the row belongs to 50 | model.labels_ 51 | # Out: array([4, 4, 4, ..., 0, 0, 3], dtype=int32) 52 | 53 | # Make the array apart of the dataframe 54 | md=pd.Series(model.labels_) 55 | df_norm['clust']=md 56 | df_norm.head() 57 | 58 | # Centroids for each cluster 59 | model.cluster_centers_ 60 | 61 | # j-score 62 | model.inertia_ 63 | 64 | # Plot histogram of the cluster 65 | plt.hist(df_norm['clust']) 66 | plt.title('Histogram of Clusters') 67 | plt.xlabel('Cluster') 68 | plt.ylabel('Frequency') 69 | 70 | # Calculate the mean of the composition for each cluster and component 71 | df_norm.groupby('clust').mean() -------------------------------------------------------------------------------- /Ch07/kMeanClustering.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri May 13 13:15:01 2016 4 | 5 | @author: jasonm_dev 6 | """ 7 | # K-means clustering 8 | import numpy as np 9 | 10 | # Define an observation set of 30x3 11 | obs=np.random.random(90).reshape(30,3) 12 | obs 13 | 14 | # I decided that I want two clusters 15 | c1=np.random.choice(range(len(obs))) 16 | c2=np.random.choice(range(len(obs))) 17 | clust_cen=np.vstack([obs[c1],obs[c2]]) 18 | clust_cen # 2 rows in array correspond to 2 cluster centroids. 
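# What the assignment step below does: each observation goes to the centroid with
# the smallest Euclidean distance. For the first observation this is simply:
d = np.sqrt(((obs[0] - clust_cen)**2).sum(axis=1)) # distance to each of the 2 centroids
d.argmin() # index (0 or 1) of the nearer centroid; the vq call below repeats this for every row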
19 | 20 | # Implement k-menas clustering 21 | from scipy.cluster.vq import vq 22 | vq(obs,clust_cen) 23 | 24 | # First array tells us which cluster the observation belongs to. 25 | # '0' for c1, '1' for c2 26 | # i.e. obs1 is with c2, obs2 is with c1 27 | # Second array tells us how far the observation is from it cluster centroid. 28 | # obs1 is 0.25 units away from c2 cluster centroid 29 | # obs1 is 0.49 units away from c1 cluster centroid 30 | #(array([1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 31 | # 0, 0, 1, 1, 0, 0, 1], dtype=int32), 32 | # array([ 0.24932073, 0.49594229, 0.28098465, 0.50348212, 0. , 33 | # 0.61496506, 0.26303013, 0.42779775, 0.59958318, 0.3468564 , 34 | # 0.40935109, 0.58624004, 0.42803874, 0.78335592, 0.50565815, 35 | # 0.61892717, 0.57338804, 0.51580769, 0.37107392, 0.54979847, 36 | # 0.48482825, 0.5257047 , 0.50568491, 0.43748909, 0.71436479, 37 | # 0. , 0.39646343, 0.47429546, 0.21875716, 0.59853208])) 38 | 39 | # FInd the cluster centroid for the two centroids 40 | from scipy.cluster.vq import kmeans 41 | kmeans(obs,clust_cen) 42 | # The two rows in the array correspond to the two final cluster centroids. 43 | # At the end, J-score, which we seek to minimize 44 | # (array([[ 0.62260732, 0.69445579, 0.50227104], 45 | # [ 0.37635439, 0.32446748, 0.32121864]]), 0.36366199194289345) 46 | 47 | 48 | # Alternatively, just provide the number of required clusters. 49 | from scipy.cluster.vq import kmeans 50 | kmeans(obs,2) -------------------------------------------------------------------------------- /Ch08/decisionTreeIris.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri May 13 15:11:39 2016 4 | 5 | @author: jasonm_dev 6 | """ 7 | # Implementing a decision tree with scikit-learn 8 | 9 | import pandas as pd 10 | import numpy as np 11 | 12 | # Import Data from CSV file. 13 | filepath = '/home/jasonm_dev/coding/learning-python-predictive-analytics/datasets' 14 | filename= 'iris.csv' 15 | file = filepath+'/'+filename 16 | data=pd.read_csv(file) 17 | data.head() 18 | 19 | # Unique species 20 | data['Species'].unique() 21 | # Out: array(['setosa', 'versicolor', 'virginica'], dtype=object) 22 | 23 | # 1st get the predictor and the target variables 24 | colnames=data.columns.values.tolist() 25 | predictors=colnames[:4] 26 | target=colnames[4] 27 | 28 | # Split into training and test data 29 | # Generate a uniform random distribution of numbers between 0 and 1. 30 | # train data selected is any data which has a number less than 0.75. 31 | # Complement goes to the test data 32 | data['is_train'] = np.random.uniform(0, 1, len(data)) <= .75 33 | train, test = data[data['is_train']==True], data[data['is_train']==False] 34 | 35 | # Create a decision tree. 36 | from sklearn.tree import DecisionTreeClassifier 37 | dt = DecisionTreeClassifier(criterion='entropy',min_samples_split=20, random_state=99) 38 | dt.fit(train[predictors], train[target]) 39 | # min_samples_split specifies the mnimum number of observations required 40 | # to split a node into a subnode. 41 | # Default = 2 42 | # Recommended = 20 43 | 44 | 45 | # Test predicted model 46 | # Predicts class (species) of the flower via decision tree 47 | preds=dt.predict(test[predictors]) 48 | # Creates a tablecomparing the Actual species and the predicted species. 
49 | pd.crosstab(test['Species'],preds,rownames=['Actual'],colnames=['Predictions']) 50 | 51 | # Visualizing the tree 52 | # Create a .dot file from the Decision Tree Classifier 53 | from sklearn.tree import export_graphviz 54 | dotfilename= 'dtree2.dot' 55 | dotfiles = filepath+'/'+dotfilename 56 | with open(dotfiles, 'w') as dotfile: 57 | export_graphviz(dt, out_file = dotfile, feature_names = predictors) 58 | dotfile.close() 59 | 60 | # Rendering a dotfile into a tree 61 | # After installing graphviz 62 | from os import system 63 | 64 | system("dot -Tpng //home/jasonm_dev/coding/learning-python-predictive-analytics/datasets/dtree2.dot -o //home/jasonm_dev/coding/learning-python-predictive-analytics/datasets/dtree2.png") 65 | 66 | # Cross validate the etire dataset. 67 | X=data[predictors] 68 | Y=data[target] 69 | dt1 = DecisionTreeClassifier(criterion='entropy',max_depth=5, min_samples_split=20, random_state=99) 70 | dt1.fit(X,Y) 71 | # Import the cross validation methods from sklearn and perform the cross validation 72 | from sklearn.cross_validation import KFold 73 | crossvalidation = KFold(n=X.shape[0], n_folds=10, shuffle=True, random_state=1) 74 | from sklearn.cross_validation import cross_val_score 75 | score = np.mean(cross_val_score(dt1, X, Y, scoring='accuracy', cv=crossvalidation, n_jobs=1)) 76 | score #Out: 0.93333333333333335 77 | 78 | # Feature importance test 79 | # Higher the value, the higher the feature importance 80 | dt1.feature_importances_ 81 | # Out: array([ 0. , 0. , 0.66869158, 0.33130842]) 82 | # Out: ['Sepal.Length', 'Sepal.Width', 'Petal.Length', 'Petal.Width', 'Species'] 83 | # 1st: petal.length then petal width -------------------------------------------------------------------------------- /Ch08/dtree2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonMDev/learning-python-predictive-analytics/6757b9ef3e7bbb803d4a1c00d999225cbb4fe2eb/Ch08/dtree2.png -------------------------------------------------------------------------------- /Ch08/randomForest.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon May 16 20:01:46 2016 4 | 5 | @author: jasonm_dev 6 | """ 7 | """ 8 | Implementing a regression tree using Python 9 | """ 10 | import pandas as pd 11 | import numpy as np 12 | 13 | # Import Data from CSV file. 14 | filepath = '/home/jasonm_dev/coding/learning-python-predictive-analytics/datasets' 15 | filename= 'Boston.csv' 16 | file = filepath+'/'+filename 17 | data=pd.read_csv(file) 18 | data.head() #Out: (506, 14) 19 | 20 | # First 13 varaibles set as predictor variables 21 | # and the last (MEDV) as the target variable 22 | colnames=data.columns.values.tolist() 23 | predictors=colnames[:13] 24 | target=colnames[13] 25 | X=data[predictors] 26 | Y=data[target] 27 | 28 | # Build the random forest model. 29 | from sklearn.ensemble import RandomForestRegressor 30 | # Node size(min_samples_leaf): not so important here 31 | # Number of trees (n_estimators): generally around 500 32 | # Number of predictors sampled: 2 - 5 33 | # number of jobs running parallel (n_jobs) 34 | rf = RandomForestRegressor(n_jobs=2,oob_score=True,n_estimators=10) 35 | rf.fit(X,Y) 36 | 37 | # The predicted values can be obtained 38 | rf.oob_prediction_ 39 | #Let us now make the predictions a part of the data frame and have a look at it. 
40 | data['rf_pred']=rf.oob_prediction_ 41 | cols=['rf_pred','medv'] 42 | data[cols].head() 43 | 44 | # To calculate a mean squared error we use oob predicted and actual values. 45 | data['rf_pred']=rf.oob_prediction_ 46 | data['err']=(data['rf_pred']-data['medv'])**2 47 | sum(data['err'])/506 # Out[23]: 23.031183503507172 48 | 49 | # oob score 50 | rf.oob_score_ # Out[24]: 0.72718189300945413 51 | 52 | # Try with bigger sample 53 | rf2 = RandomForestRegressor(n_jobs=2,oob_score=True,n_estimators=500) 54 | rf2.fit(X,Y) 55 | data['rf2_pred']=rf2.oob_prediction_ 56 | cols2=['rf2_pred','medv'] 57 | data[cols2].head() 58 | data['rf2_pred']=rf2.oob_prediction_ 59 | data['err2']=(data['rf2_pred']-data['medv'])**2 60 | sum(data['err2'])/506 # Out[23]: 10.05342135115402 61 | rf2.oob_score_ # Out[24]: 0.88091122710291847 -------------------------------------------------------------------------------- /Ch08/regressionTree.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon May 16 20:01:46 2016 4 | 5 | @author: jasonm_dev 6 | """ 7 | """ 8 | Implementing a regression tree using Python 9 | """ 10 | import pandas as pd 11 | import numpy as np 12 | 13 | # Import Data from CSV file. 14 | filepath = '/home/jasonm_dev/coding/learning-python-predictive-analytics/datasets' 15 | filename= 'Boston.csv' 16 | file = filepath+'/'+filename 17 | data=pd.read_csv(file) 18 | data.head() #Out: (506, 14) 19 | 20 | # First 13 varaibles set as predictor variables 21 | # and the last (MEDV) as the target variable 22 | colnames=data.columns.values.tolist() 23 | predictors=colnames[:13] 24 | target=colnames[13] 25 | X=data[predictors] 26 | Y=data[target] 27 | 28 | # Build the regression tree model. 29 | from sklearn.tree import DecisionTreeRegressor 30 | # min number of observation per node for split: 30 31 | # min number of observations per node to classify as leaf: 10 32 | regression_tree = DecisionTreeRegressor(min_samples_split=30,min_samples_leaf=10,random_state=0) 33 | regression_tree.fit(X,Y) 34 | 35 | # Use model to make predictions 36 | reg_tree_pred=regression_tree.predict(data[predictors]) 37 | data['pred']=reg_tree_pred 38 | cols=['pred','medv'] 39 | # Compare prediction with actual 40 | data[cols] 41 | 42 | # Cross-validate the model and check accuracy 43 | from sklearn.cross_validation import KFold 44 | from sklearn.cross_validation import cross_val_score 45 | 46 | crossvalidation = KFold(n=X.shape[0], n_folds=10,shuffle=True, random_state=1) 47 | score = np.mean(cross_val_score(regression_tree, X, Y,scoring='mean_squared_error', cv=crossvalidation,n_jobs=1)) 48 | score #Out[14]: -20.107307036443846 49 | 50 | # The feature importance can be checked 51 | regression_tree.feature_importances_ 52 | """Out[16]: 53 | array([ 0.03421203, 0. , 0.00116059, 0. , 0.01856163, 54 | 0.6308568 , 0.01725115, 0.00137451, 0. , 0.00236983, 55 | 0.00933325, 0. , 0.28488021]) 56 | In [8]: colnames 57 | Out[8]: 58 | ['crim', 59 | 'zn', 60 | 'indus', 61 | 'chas', 62 | 'nox', 63 | 'rm', 64 | 'age', 65 | 'dis', 66 | 'rad', 67 | 'tax', 68 | 'ptratio', 69 | 'black', 70 | 'lstat', 71 | 'medv'] 72 | 73 | The most important varaibles are age, lstat and rm in ascending order. 74 | Highest values have the most impoprtance. DOn't agree with selected variables. 
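Note: pairing the importance array with colnames suggests that rm (~0.63) and
lstat (~0.28) carry almost all of the importance, with crim (~0.03) a distant
third; one quick way to see the ordering is, for example:
sorted(zip(regression_tree.feature_importances_, predictors), reverse=True)[:3]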
75 | """ 76 | 77 | -------------------------------------------------------------------------------- /ISSUELOG.md: -------------------------------------------------------------------------------- 1 | #ISSUELOG 2 | 3 | - [ ] Open file in chapter2 4 | - [ ] Add y=1/x to mpg vs hp model 5 | - [ ] File: logisticRegressionImplementation.py 6 | - [ ] After installing ggplot within conda, still does not work 7 | - [ ] dframe show nan instead of zero which affect the confusion matrix -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Predictive Analytics with Python 2 | 3 | These are my notes from working through the book 4 | [*Learning Predictive Analytics with Python*](https://www.packtpub.com/big-data-and-business-intelligence/learning-predictive-analytics-python) 5 | by [Ashish Kumar](https://in.linkedin.com/in/ashishk64) 6 | and published on Feb 2016. 7 | 8 | ## General 9 | ###Chapter 1: Getting Started with Predictive Modelling 10 | - [x] Installed Anaconda Package. 11 | - [x] Python3.5 has been installed. 12 | - [x] Book follows python2, so some codes is modified along the way for python3. 13 | 14 | ###Chapter 2: Data Cleaning 15 | - [x] Reading the data: variations and examples 16 | - [x] Data frames and delimiters. 17 | 18 | ####Case 1: Reading a dataset using the read_csv method 19 | - [x] File: titanicReadCSV.py 20 | - [x] File: titanicReadCSV1.py 21 | - [x] File: readCustomerChurn.py 22 | - [x] File: readCustomerChurn2.py 23 | - [x] File: changeDelimiter.py 24 | 25 | ####Case 2: Reading a dataset using the open method of Python 26 | - [x] File: readDatasetByOpenMethod.py 27 | 28 | ####Case 3: Reading data from a URL 29 | - [x] Modified the code that it works and prints out line by line dictionary of the dataset. 30 | - [x] File: readURLLib2Iris.py 31 | - [x] File: readURLMedals.py 32 | 33 | ####Case 4: Miscellaneous cases 34 | - [x] File: readXLS.py 35 | - [x] Created the file above to read from both .xls an .xlsx 36 | 37 | ####Basics: Summary, dimensions, and structure 38 | - [x] File: basicDataCheck.py 39 | - [x] Created the file above to read from both .xls an .xlsx 40 | 41 | ####Handling missing values 42 | - [x] File: basicDataCheck.py 43 | - [x] RE: Treating missing data like NaN or None 44 | - [x] Deletion orr imputaion 45 | 46 | ####Creating dummy variables 47 | - [x] File: basicDataCheck.py 48 | - [x] Split into new variable 'sex_female' and 'sex_male' 49 | - [x] Remove column 'sex' 50 | - [x] Add both dummy column created above. 
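A minimal sketch of the dummy-variable step above (the Titanic column is named `sex`; the exact calls in basicDataCheck.py may differ slightly):

```python
import pandas as pd

data = pd.read_csv('Ch02/titanic3.csv')
dummy_sex = pd.get_dummies(data['sex'], prefix='sex')  # -> sex_female, sex_male
data = data.drop('sex', axis=1).join(dummy_sex)
```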
51 | 52 | ####Visualizing a dataset by basic plotting 53 | - [x] File: plotData.py 54 | - [x] Figure file: ScatterPlots.jpeg 55 | - [x] Plot Types: Scatterplot, Histograms and boxplots 56 | 57 | ###Chapter 3: Data Wrangling 58 | ####Subsetting a dataset 59 | - [x] Selecting Columns 60 | - [x] File: subsetDataset.py 61 | - [x] Selecting Rows 62 | - [x] File: subsetDatasetRows.py 63 | - [x] Selecting a combination of rows and columns 64 | - [x] File: subsetColRows.py 65 | - [x] Creating new columns 66 | - [x] File: subsetNewCol.py 67 | 68 | ####Generating random numbers and their usage 69 | - [x] Various methods for generating random numbers 70 | - [x] File: generateRandomNumbers.py 71 | - [x] Seeding a random number 72 | - [x] File: generateRandomNumbers.py 73 | - [x] Generating random numbers following probability distributions 74 | - [x] File: generateRandomProbDistr.py 75 | - [x] Probability density function: PDF = Prob(X=x) 76 | - [x] Cumulative density function: CDF(x) = Prob(X<=x) 77 | - [x] Uniform distribution: random variables occur with the same (uniform) frequency/probability 78 | - [x] Normal distribution: Bell Curve and most ubiquitous and versatile probability distribution 79 | - [x] Using the Monte-Carlo simulation to find the value of pi 80 | - [x] File: calcPi.py 81 | - [x] Geometry and mathematics behind the calculation of pi 82 | - [x] Generating a dummy data frame 83 | - [x] File: generateDummyDataFrame.py 84 | 85 | ####Grouping the data – aggregation, filtering, and transformation 86 | - [x] File: groupData.py 87 | - [x] Grouping 88 | - [x] Aggregation 89 | - [x] Filtering 90 | - [x] Transformation 91 | - [x] Miscellaneous operations 92 | 93 | ####Random sampling – splitting a dataset in training and testing datasets 94 | - [ ] File: splitDataTrainTest.py 95 | - [x] Method 1: using the Customer Churn Model 96 | - [x] Method 2: using sklearn 97 | - [ ] Method 3: using the shuffle function 98 | 99 | ####Concatenating and appending data 100 | - [x] File: concatenateAndAppend.py 101 | - [x] File: appendManyFiles.py 102 | 103 | ####Merging/joining datasets 104 | - [x] File: mergeJoin.py 105 | - [x] Inner Join 106 | - [x] Left Join 107 | - [x] Right Join 108 | - [x] An example of the Inner Join 109 | - [x] An example of the Left Join 110 | - [x] An example of the Right Join 111 | - [x] Summary of Joins in terms of their length 112 | 113 | ###Chapter 4: Statistical Concepts for Predictive Modelling 114 | ####Random sampling and central limit theorem 115 | ####Hypothesis testing 116 | - [x] Null versus alternate hypothesis 117 | - [x] Z-statistic and t-statistic 118 | - [x] Confidence intervals, significance levels, and p-values 119 | - [x] Different kinds of hypothesis test 120 | - [x] A step-by-step guide to do a hypothesis test 121 | - [x] An example of a hypothesis test 122 | 123 | ####Chi-square testing 124 | ####Correlation 125 | - [x] File: linearRegression.py 126 | - [x] File: linearRegressionFunction.py 127 | - [x] Picture: TVSalesCorrelationPlot.png 128 | - [x] Picture: RadioSalesCorrelationPlot.png 129 | - [x] Picture: NewspaperSalesCorrelationPlot.png 130 | 131 | ###Chapter 5: Linear Regression with Python 132 | ####Understanding the maths behind linear regression 133 | - [x] Linear regression using simulated data 134 | - [x] File: linearRegression.py 135 | - [x] Picture: CurrentVsPredicted1.png 136 | - [x] Picture: CurrentVsPredictedVsMean1.png 137 | - [x] Picture: CurrentVsPredictedVsModel1.png 138 | 139 | ####Making sense of result parameters 140 | - [x] File: 
linearRegression.py 141 | - [x] p-values 142 | - [x] F-statistics 143 | - [x] Residual Standard Error (RSE) 144 | 145 | ####Implementing linear regression with Python 146 | - [x] File: linearRegressionSMF.py 147 | - [x] Linear regression using the statsmodel library 148 | - [x] Multiple linear regression 149 | - [x] Multi-collinearity: sub-optimal performance of the model 150 | - [x] Variance Inflation Factor 151 | - [x] It is a method to quantify the rise in the variability of the coefficient estimate of a particular variable because of high correlation between two or more than two predictor variables. 152 | 153 | ####Model validation 154 | - [x] Training and testing data split 155 | - [x] File: linearRegressionSMF.py 156 | - [x] Linear regression with scikit-learn 157 | - [x] File: linearRegressionSKL.py 158 | - [x] Feature selection with scikit-learn 159 | - [x] Recursive Feature Elimination (RFE) 160 | - [x] File: linearRegressionRFE.py 161 | 162 | ####Handling other issues in linear regression 163 | - [x] Handling categorical variables 164 | - [x] File: linearRegressionECom.py 165 | - [x] Transforming a variable to fit non-linear relations 166 | - [x] File: nonlinearRegression.py 167 | - [x] Picture: MPGVSHorsepower.png 168 | - [x] Picture: MPGVSHorsepowerVsLine.png 169 | - [x] Picture: MPGVSHorsepowerModels.png 170 | - [x] Handling outliers 171 | - [x] Other considerations and assumptions for linear regression 172 | 173 | ###Chapter 6: Logistic Regression with Python 174 | ####Linear regression versus logistic regression 175 | ####Understanding the math behind logistic regression 176 | - [x] File: logisticRegression.py 177 | - [x] Contingency tables 178 | - [x] Conditional probability 179 | - [x] Odds ratio 180 | - [x] Moving on to logistic regression from linear regression 181 | - [x] Estimation using the Maximum Likelihood Method 182 | - [x] Building the logistic regression model from scratch 183 | - [x] File: logisticRegressionScratch.py 184 | - [ ] Read above again. 185 | - [x] Making sense of logistic regression parameters 186 | - [x] Wald test 187 | - [x] Likelihood Ratio Test statistic 188 | - [x] Chi-square test 189 | - [x] 190 | 191 | ####Implementing logistic regression with Python 192 | - [x] File: logisticRegressionImplementation.py 193 | - [x] Processing the data 194 | - [x] Data exploration 195 | - [x] Data visualization 196 | - [x] Creating dummy variables for categorical variables 197 | - [x] Feature selection 198 | - [x] Implementing the model 199 | 200 | ####Model validation and evaluation 201 | - [x] File: logisticRegressionImplementation.py 202 | - [x] Cross validation 203 | 204 | ####Model validation 205 | - [x] File: logisticRegressionImplementation.py 206 | - [x] The ROC curve {see terms} 207 | 208 | ###Chapter 7: Clustering with Python 209 | ####Introduction to clustering – what, why, and how? 210 | - [x] What is clustering? 211 | - [x] How is clustering used? 212 | - [x] Why do we do clustering? 
213 | 
214 | ####Mathematics behind clustering
215 | - [x] Distances between two observations
216 | - [x] Euclidean distance
217 | - [x] Manhattan distance
218 | - [x] Minkowski distance
219 | - [x] The distance matrix
220 | - [x] Normalizing the distances
221 | - [x] Linkage methods
222 | - [x] Single linkage
223 | - [x] Complete linkage
224 | - [x] Average linkage
225 | - [x] Centroid linkage
226 | - [x] Ward's method (uses the ANOVA approach)
227 | - [x] Hierarchical clustering
228 | - [x] K-means clustering
229 | - [x] File: kMeanClustering.py
230 | 
231 | ####Implementing clustering using Python
232 | - [x] File: clusterWine.py
233 | - [x] Importing and exploring the dataset
234 | - [x] Normalizing the values in the dataset
235 | - [x] Hierarchical clustering using scikit-learn
236 | - [x] K-Means clustering using scikit-learn
237 | - [x] Interpreting the cluster
238 | 
239 | ####Fine-tuning the clustering
240 | - [x] The elbow method
241 | - [x] Silhouette Coefficient
242 | 
243 | ###Chapter 8: Trees and Random Forests with Python
244 | ####Introducing decision trees
245 | - [x] A decision tree
246 | 
247 | ####Understanding the mathematics behind decision trees
248 | - [x] Homogeneity
249 | - [x] Entropy
250 | - [x] Information gain
251 | - [x] ID3 algorithm to create a decision tree
252 | - [x] Gini index
253 | - [x] Reduction in Variance
254 | - [x] Pruning a tree
255 | - [x] Handling a continuous numerical variable
256 | - [x] Handling a missing value of an attribute
257 | 
258 | ####Implementing a decision tree with scikit-learn
259 | - [x] File: decisionTreeIris.py
260 | - [x] Visualizing the tree
261 | - [x] Picture: dtree2.png
262 | - [x] File: dtree2.dot
263 | - [x] Cross-validating and pruning the decision tree
264 | 
265 | ####Understanding and implementing regression trees
266 | - [x] File: regressionTree.py
267 | - [x] Regression tree algorithm
268 | - [x] Implementing a regression tree using Python
269 | 
270 | ####Understanding and implementing random forests
271 | - [x] File: randomForest.py
272 | - [x] The random forest algorithm
273 | - [x] Implementing a random forest using Python
274 | - [x] Why do random forests work?
275 | - [x] Important parameters for random forests 276 | 277 | 278 | ###Chapter 9: Best Practices for Predictive Modelling 279 | ####Best practices for coding 280 | - [x] Commenting the codes 281 | - [x] Defining functions for substantial individual tasks 282 | - [x] Example 1 283 | - [x] Example 2 284 | - [x] Example 3 285 | - [x] Avoid hard-coding of variables as much as possible 286 | - [x] Version control 287 | - [x] Using standard libraries, methods, and formulas 288 | 289 | ####Best practices for data handling 290 | 291 | ####Best practices for algorithms 292 | 293 | ####Best practices for statistics 294 | 295 | ####Best practices for business contexts 296 | 297 | 298 | 299 | -------------------------------------------------------------------------------- /datasets/Advertising.csv: -------------------------------------------------------------------------------- 1 | TV,Radio,Newspaper,Sales 2 | 230.1,37.8,69.2,22.1 3 | 44.5,39.3,45.1,10.4 4 | 17.2,45.9,69.3,9.3 5 | 151.5,41.3,58.5,18.5 6 | 180.8,10.8,58.4,12.9 7 | 8.7,48.9,75,7.2 8 | 57.5,32.8,23.5,11.8 9 | 120.2,19.6,11.6,13.2 10 | 8.6,2.1,1,4.8 11 | 199.8,2.6,21.2,10.6 12 | 66.1,5.8,24.2,8.6 13 | 214.7,24,4,17.4 14 | 23.8,35.1,65.9,9.2 15 | 97.5,7.6,7.2,9.7 16 | 204.1,32.9,46,19 17 | 195.4,47.7,52.9,22.4 18 | 67.8,36.6,114,12.5 19 | 281.4,39.6,55.8,24.4 20 | 69.2,20.5,18.3,11.3 21 | 147.3,23.9,19.1,14.6 22 | 218.4,27.7,53.4,18 23 | 237.4,5.1,23.5,12.5 24 | 13.2,15.9,49.6,5.6 25 | 228.3,16.9,26.2,15.5 26 | 62.3,12.6,18.3,9.7 27 | 262.9,3.5,19.5,12 28 | 142.9,29.3,12.6,15 29 | 240.1,16.7,22.9,15.9 30 | 248.8,27.1,22.9,18.9 31 | 70.6,16,40.8,10.5 32 | 292.9,28.3,43.2,21.4 33 | 112.9,17.4,38.6,11.9 34 | 97.2,1.5,30,9.6 35 | 265.6,20,0.3,17.4 36 | 95.7,1.4,7.4,9.5 37 | 290.7,4.1,8.5,12.8 38 | 266.9,43.8,5,25.4 39 | 74.7,49.4,45.7,14.7 40 | 43.1,26.7,35.1,10.1 41 | 228,37.7,32,21.5 42 | 202.5,22.3,31.6,16.6 43 | 177,33.4,38.7,17.1 44 | 293.6,27.7,1.8,20.7 45 | 206.9,8.4,26.4,12.9 46 | 25.1,25.7,43.3,8.5 47 | 175.1,22.5,31.5,14.9 48 | 89.7,9.9,35.7,10.6 49 | 239.9,41.5,18.5,23.2 50 | 227.2,15.8,49.9,14.8 51 | 66.9,11.7,36.8,9.7 52 | 199.8,3.1,34.6,11.4 53 | 100.4,9.6,3.6,10.7 54 | 216.4,41.7,39.6,22.6 55 | 182.6,46.2,58.7,21.2 56 | 262.7,28.8,15.9,20.2 57 | 198.9,49.4,60,23.7 58 | 7.3,28.1,41.4,5.5 59 | 136.2,19.2,16.6,13.2 60 | 210.8,49.6,37.7,23.8 61 | 210.7,29.5,9.3,18.4 62 | 53.5,2,21.4,8.1 63 | 261.3,42.7,54.7,24.2 64 | 239.3,15.5,27.3,15.7 65 | 102.7,29.6,8.4,14 66 | 131.1,42.8,28.9,18 67 | 69,9.3,0.9,9.3 68 | 31.5,24.6,2.2,9.5 69 | 139.3,14.5,10.2,13.4 70 | 237.4,27.5,11,18.9 71 | 216.8,43.9,27.2,22.3 72 | 199.1,30.6,38.7,18.3 73 | 109.8,14.3,31.7,12.4 74 | 26.8,33,19.3,8.8 75 | 129.4,5.7,31.3,11 76 | 213.4,24.6,13.1,17 77 | 16.9,43.7,89.4,8.7 78 | 27.5,1.6,20.7,6.9 79 | 120.5,28.5,14.2,14.2 80 | 5.4,29.9,9.4,5.3 81 | 116,7.7,23.1,11 82 | 76.4,26.7,22.3,11.8 83 | 239.8,4.1,36.9,12.3 84 | 75.3,20.3,32.5,11.3 85 | 68.4,44.5,35.6,13.6 86 | 213.5,43,33.8,21.7 87 | 193.2,18.4,65.7,15.2 88 | 76.3,27.5,16,12 89 | 110.7,40.6,63.2,16 90 | 88.3,25.5,73.4,12.9 91 | 109.8,47.8,51.4,16.7 92 | 134.3,4.9,9.3,11.2 93 | 28.6,1.5,33,7.3 94 | 217.7,33.5,59,19.4 95 | 250.9,36.5,72.3,22.2 96 | 107.4,14,10.9,11.5 97 | 163.3,31.6,52.9,16.9 98 | 197.6,3.5,5.9,11.7 99 | 184.9,21,22,15.5 100 | 289.7,42.3,51.2,25.4 101 | 135.2,41.7,45.9,17.2 102 | 222.4,4.3,49.8,11.7 103 | 296.4,36.3,100.9,23.8 104 | 280.2,10.1,21.4,14.8 105 | 187.9,17.2,17.9,14.7 106 | 238.2,34.3,5.3,20.7 107 | 137.9,46.4,59,19.2 108 | 25,11,29.7,7.2 109 | 90.4,0.3,23.2,8.7 110 | 
13.1,0.4,25.6,5.3 111 | 255.4,26.9,5.5,19.8 112 | 225.8,8.2,56.5,13.4 113 | 241.7,38,23.2,21.8 114 | 175.7,15.4,2.4,14.1 115 | 209.6,20.6,10.7,15.9 116 | 78.2,46.8,34.5,14.6 117 | 75.1,35,52.7,12.6 118 | 139.2,14.3,25.6,12.2 119 | 76.4,0.8,14.8,9.4 120 | 125.7,36.9,79.2,15.9 121 | 19.4,16,22.3,6.6 122 | 141.3,26.8,46.2,15.5 123 | 18.8,21.7,50.4,7 124 | 224,2.4,15.6,11.6 125 | 123.1,34.6,12.4,15.2 126 | 229.5,32.3,74.2,19.7 127 | 87.2,11.8,25.9,10.6 128 | 7.8,38.9,50.6,6.6 129 | 80.2,0,9.2,8.8 130 | 220.3,49,3.2,24.7 131 | 59.6,12,43.1,9.7 132 | 0.7,39.6,8.7,1.6 133 | 265.2,2.9,43,12.7 134 | 8.4,27.2,2.1,5.7 135 | 219.8,33.5,45.1,19.6 136 | 36.9,38.6,65.6,10.8 137 | 48.3,47,8.5,11.6 138 | 25.6,39,9.3,9.5 139 | 273.7,28.9,59.7,20.8 140 | 43,25.9,20.5,9.6 141 | 184.9,43.9,1.7,20.7 142 | 73.4,17,12.9,10.9 143 | 193.7,35.4,75.6,19.2 144 | 220.5,33.2,37.9,20.1 145 | 104.6,5.7,34.4,10.4 146 | 96.2,14.8,38.9,11.4 147 | 140.3,1.9,9,10.3 148 | 240.1,7.3,8.7,13.2 149 | 243.2,49,44.3,25.4 150 | 38,40.3,11.9,10.9 151 | 44.7,25.8,20.6,10.1 152 | 280.7,13.9,37,16.1 153 | 121,8.4,48.7,11.6 154 | 197.6,23.3,14.2,16.6 155 | 171.3,39.7,37.7,19 156 | 187.8,21.1,9.5,15.6 157 | 4.1,11.6,5.7,3.2 158 | 93.9,43.5,50.5,15.3 159 | 149.8,1.3,24.3,10.1 160 | 11.7,36.9,45.2,7.3 161 | 131.7,18.4,34.6,12.9 162 | 172.5,18.1,30.7,14.4 163 | 85.7,35.8,49.3,13.3 164 | 188.4,18.1,25.6,14.9 165 | 163.5,36.8,7.4,18 166 | 117.2,14.7,5.4,11.9 167 | 234.5,3.4,84.8,11.9 168 | 17.9,37.6,21.6,8 169 | 206.8,5.2,19.4,12.2 170 | 215.4,23.6,57.6,17.1 171 | 284.3,10.6,6.4,15 172 | 50,11.6,18.4,8.4 173 | 164.5,20.9,47.4,14.5 174 | 19.6,20.1,17,7.6 175 | 168.4,7.1,12.8,11.7 176 | 222.4,3.4,13.1,11.5 177 | 276.9,48.9,41.8,27 178 | 248.4,30.2,20.3,20.2 179 | 170.2,7.8,35.2,11.7 180 | 276.7,2.3,23.7,11.8 181 | 165.6,10,17.6,12.6 182 | 156.6,2.6,8.3,10.5 183 | 218.5,5.4,27.4,12.2 184 | 56.2,5.7,29.7,8.7 185 | 287.6,43,71.8,26.2 186 | 253.8,21.3,30,17.6 187 | 205,45.1,19.6,22.6 188 | 139.5,2.1,26.6,10.3 189 | 191.1,28.7,18.2,17.3 190 | 286,13.9,3.7,15.9 191 | 18.7,12.1,23.4,6.7 192 | 39.5,41.1,5.8,10.8 193 | 75.5,10.8,6,9.9 194 | 17.2,4.1,31.6,5.9 195 | 166.8,42,3.6,19.6 196 | 149.7,35.6,6,17.3 197 | 38.2,3.7,13.8,7.6 198 | 94.2,4.9,8.1,9.7 199 | 177,9.3,6.4,12.8 200 | 283.6,42,66.2,25.5 201 | 232.1,8.6,8.7,13.4 202 | -------------------------------------------------------------------------------- /datasets/Auto.csv: -------------------------------------------------------------------------------- 1 | mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name 2 | 18,8,307,130,3504,12,70,1,chevrolet chevelle malibu 3 | 15,8,350,165,3693,11.5,70,1,buick skylark 320 4 | 18,8,318,150,3436,11,70,1,plymouth satellite 5 | 16,8,304,150,3433,12,70,1,amc rebel sst 6 | 17,8,302,140,3449,10.5,70,1,ford torino 7 | 15,8,429,198,4341,10,70,1,ford galaxie 500 8 | 14,8,454,220,4354,9,70,1,chevrolet impala 9 | 14,8,440,215,4312,8.5,70,1,plymouth fury iii 10 | 14,8,455,225,4425,10,70,1,pontiac catalina 11 | 15,8,390,190,3850,8.5,70,1,amc ambassador dpl 12 | NA,4,133,115,3090,17.5,70,2,citroen ds-21 pallas 13 | NA,8,350,165,4142,11.5,70,1,chevrolet chevelle concours (sw) 14 | NA,8,351,153,4034,11,70,1,ford torino (sw) 15 | NA,8,383,175,4166,10.5,70,1,plymouth satellite (sw) 16 | NA,8,360,175,3850,11,70,1,amc rebel sst (sw) 17 | 15,8,383,170,3563,10,70,1,dodge challenger se 18 | 14,8,340,160,3609,8,70,1,plymouth 'cuda 340 19 | NA,8,302,140,3353,8,70,1,ford mustang boss 302 20 | 15,8,400,150,3761,9.5,70,1,chevrolet monte carlo 21 | 
14,8,455,225,3086,10,70,1,buick estate wagon (sw) 22 | 24,4,113,95,2372,15,70,3,toyota corona mark ii 23 | 22,6,198,95,2833,15.5,70,1,plymouth duster 24 | 18,6,199,97,2774,15.5,70,1,amc hornet 25 | 21,6,200,85,2587,16,70,1,ford maverick 26 | 27,4,97,88,2130,14.5,70,3,datsun pl510 27 | 26,4,97,46,1835,20.5,70,2,volkswagen 1131 deluxe sedan 28 | 25,4,110,87,2672,17.5,70,2,peugeot 504 29 | 24,4,107,90,2430,14.5,70,2,audi 100 ls 30 | 25,4,104,95,2375,17.5,70,2,saab 99e 31 | 26,4,121,113,2234,12.5,70,2,bmw 2002 32 | 21,6,199,90,2648,15,70,1,amc gremlin 33 | 10,8,360,215,4615,14,70,1,ford f250 34 | 10,8,307,200,4376,15,70,1,chevy c20 35 | 11,8,318,210,4382,13.5,70,1,dodge d200 36 | 9,8,304,193,4732,18.5,70,1,hi 1200d 37 | 27,4,97,88,2130,14.5,71,3,datsun pl510 38 | 28,4,140,90,2264,15.5,71,1,chevrolet vega 2300 39 | 25,4,113,95,2228,14,71,3,toyota corona 40 | 25,4,98,NA,2046,19,71,1,ford pinto 41 | NA,4,97,48,1978,20,71,2,volkswagen super beetle 117 42 | 19,6,232,100,2634,13,71,1,amc gremlin 43 | 16,6,225,105,3439,15.5,71,1,plymouth satellite custom 44 | 17,6,250,100,3329,15.5,71,1,chevrolet chevelle malibu 45 | 19,6,250,88,3302,15.5,71,1,ford torino 500 46 | 18,6,232,100,3288,15.5,71,1,amc matador 47 | 14,8,350,165,4209,12,71,1,chevrolet impala 48 | 14,8,400,175,4464,11.5,71,1,pontiac catalina brougham 49 | 14,8,351,153,4154,13.5,71,1,ford galaxie 500 50 | 14,8,318,150,4096,13,71,1,plymouth fury iii 51 | 12,8,383,180,4955,11.5,71,1,dodge monaco (sw) 52 | 13,8,400,170,4746,12,71,1,ford country squire (sw) 53 | 13,8,400,175,5140,12,71,1,pontiac safari (sw) 54 | 18,6,258,110,2962,13.5,71,1,amc hornet sportabout (sw) 55 | 22,4,140,72,2408,19,71,1,chevrolet vega (sw) 56 | 19,6,250,100,3282,15,71,1,pontiac firebird 57 | 18,6,250,88,3139,14.5,71,1,ford mustang 58 | 23,4,122,86,2220,14,71,1,mercury capri 2000 59 | 28,4,116,90,2123,14,71,2,opel 1900 60 | 30,4,79,70,2074,19.5,71,2,peugeot 304 61 | 30,4,88,76,2065,14.5,71,2,fiat 124b 62 | 31,4,71,65,1773,19,71,3,toyota corolla 1200 63 | 35,4,72,69,1613,18,71,3,datsun 1200 64 | 27,4,97,60,1834,19,71,2,volkswagen model 111 65 | 26,4,91,70,1955,20.5,71,1,plymouth cricket 66 | 24,4,113,95,2278,15.5,72,3,toyota corona hardtop 67 | 25,4,97.5,80,2126,17,72,1,dodge colt hardtop 68 | 23,4,97,54,2254,23.5,72,2,volkswagen type 3 69 | 20,4,140,90,2408,19.5,72,1,chevrolet vega 70 | 21,4,122,86,2226,16.5,72,1,ford pinto runabout 71 | 13,8,350,165,4274,12,72,1,chevrolet impala 72 | 14,8,400,175,4385,12,72,1,pontiac catalina 73 | 15,8,318,150,4135,13.5,72,1,plymouth fury iii 74 | 14,8,351,153,4129,13,72,1,ford galaxie 500 75 | 17,8,304,150,3672,11.5,72,1,amc ambassador sst 76 | 11,8,429,208,4633,11,72,1,mercury marquis 77 | 13,8,350,155,4502,13.5,72,1,buick lesabre custom 78 | 12,8,350,160,4456,13.5,72,1,oldsmobile delta 88 royale 79 | 13,8,400,190,4422,12.5,72,1,chrysler newport royal 80 | 19,3,70,97,2330,13.5,72,3,mazda rx2 coupe 81 | 15,8,304,150,3892,12.5,72,1,amc matador (sw) 82 | 13,8,307,130,4098,14,72,1,chevrolet chevelle concours (sw) 83 | 13,8,302,140,4294,16,72,1,ford gran torino (sw) 84 | 14,8,318,150,4077,14,72,1,plymouth satellite custom (sw) 85 | 18,4,121,112,2933,14.5,72,2,volvo 145e (sw) 86 | 22,4,121,76,2511,18,72,2,volkswagen 411 (sw) 87 | 21,4,120,87,2979,19.5,72,2,peugeot 504 (sw) 88 | 26,4,96,69,2189,18,72,2,renault 12 (sw) 89 | 22,4,122,86,2395,16,72,1,ford pinto (sw) 90 | 28,4,97,92,2288,17,72,3,datsun 510 (sw) 91 | 23,4,120,97,2506,14.5,72,3,toyouta corona mark ii (sw) 92 | 28,4,98,80,2164,15,72,1,dodge colt (sw) 93 | 
27,4,97,88,2100,16.5,72,3,toyota corolla 1600 (sw) 94 | 13,8,350,175,4100,13,73,1,buick century 350 95 | 14,8,304,150,3672,11.5,73,1,amc matador 96 | 13,8,350,145,3988,13,73,1,chevrolet malibu 97 | 14,8,302,137,4042,14.5,73,1,ford gran torino 98 | 15,8,318,150,3777,12.5,73,1,dodge coronet custom 99 | 12,8,429,198,4952,11.5,73,1,mercury marquis brougham 100 | 13,8,400,150,4464,12,73,1,chevrolet caprice classic 101 | 13,8,351,158,4363,13,73,1,ford ltd 102 | 14,8,318,150,4237,14.5,73,1,plymouth fury gran sedan 103 | 13,8,440,215,4735,11,73,1,chrysler new yorker brougham 104 | 12,8,455,225,4951,11,73,1,buick electra 225 custom 105 | 13,8,360,175,3821,11,73,1,amc ambassador brougham 106 | 18,6,225,105,3121,16.5,73,1,plymouth valiant 107 | 16,6,250,100,3278,18,73,1,chevrolet nova custom 108 | 18,6,232,100,2945,16,73,1,amc hornet 109 | 18,6,250,88,3021,16.5,73,1,ford maverick 110 | 23,6,198,95,2904,16,73,1,plymouth duster 111 | 26,4,97,46,1950,21,73,2,volkswagen super beetle 112 | 11,8,400,150,4997,14,73,1,chevrolet impala 113 | 12,8,400,167,4906,12.5,73,1,ford country 114 | 13,8,360,170,4654,13,73,1,plymouth custom suburb 115 | 12,8,350,180,4499,12.5,73,1,oldsmobile vista cruiser 116 | 18,6,232,100,2789,15,73,1,amc gremlin 117 | 20,4,97,88,2279,19,73,3,toyota carina 118 | 21,4,140,72,2401,19.5,73,1,chevrolet vega 119 | 22,4,108,94,2379,16.5,73,3,datsun 610 120 | 18,3,70,90,2124,13.5,73,3,maxda rx3 121 | 19,4,122,85,2310,18.5,73,1,ford pinto 122 | 21,6,155,107,2472,14,73,1,mercury capri v6 123 | 26,4,98,90,2265,15.5,73,2,fiat 124 sport coupe 124 | 15,8,350,145,4082,13,73,1,chevrolet monte carlo s 125 | 16,8,400,230,4278,9.5,73,1,pontiac grand prix 126 | 29,4,68,49,1867,19.5,73,2,fiat 128 127 | 24,4,116,75,2158,15.5,73,2,opel manta 128 | 20,4,114,91,2582,14,73,2,audi 100ls 129 | 19,4,121,112,2868,15.5,73,2,volvo 144ea 130 | 15,8,318,150,3399,11,73,1,dodge dart custom 131 | 24,4,121,110,2660,14,73,2,saab 99le 132 | 20,6,156,122,2807,13.5,73,3,toyota mark ii 133 | 11,8,350,180,3664,11,73,1,oldsmobile omega 134 | 20,6,198,95,3102,16.5,74,1,plymouth duster 135 | 21,6,200,NA,2875,17,74,1,ford maverick 136 | 19,6,232,100,2901,16,74,1,amc hornet 137 | 15,6,250,100,3336,17,74,1,chevrolet nova 138 | 31,4,79,67,1950,19,74,3,datsun b210 139 | 26,4,122,80,2451,16.5,74,1,ford pinto 140 | 32,4,71,65,1836,21,74,3,toyota corolla 1200 141 | 25,4,140,75,2542,17,74,1,chevrolet vega 142 | 16,6,250,100,3781,17,74,1,chevrolet chevelle malibu classic 143 | 16,6,258,110,3632,18,74,1,amc matador 144 | 18,6,225,105,3613,16.5,74,1,plymouth satellite sebring 145 | 16,8,302,140,4141,14,74,1,ford gran torino 146 | 13,8,350,150,4699,14.5,74,1,buick century luxus (sw) 147 | 14,8,318,150,4457,13.5,74,1,dodge coronet custom (sw) 148 | 14,8,302,140,4638,16,74,1,ford gran torino (sw) 149 | 14,8,304,150,4257,15.5,74,1,amc matador (sw) 150 | 29,4,98,83,2219,16.5,74,2,audi fox 151 | 26,4,79,67,1963,15.5,74,2,volkswagen dasher 152 | 26,4,97,78,2300,14.5,74,2,opel manta 153 | 31,4,76,52,1649,16.5,74,3,toyota corona 154 | 32,4,83,61,2003,19,74,3,datsun 710 155 | 28,4,90,75,2125,14.5,74,1,dodge colt 156 | 24,4,90,75,2108,15.5,74,2,fiat 128 157 | 26,4,116,75,2246,14,74,2,fiat 124 tc 158 | 24,4,120,97,2489,15,74,3,honda civic 159 | 26,4,108,93,2391,15.5,74,3,subaru 160 | 31,4,79,67,2000,16,74,2,fiat x1.9 161 | 19,6,225,95,3264,16,75,1,plymouth valiant custom 162 | 18,6,250,105,3459,16,75,1,chevrolet nova 163 | 15,6,250,72,3432,21,75,1,mercury monarch 164 | 15,6,250,72,3158,19.5,75,1,ford maverick 165 | 16,8,400,170,4668,11.5,75,1,pontiac 
catalina 166 | 15,8,350,145,4440,14,75,1,chevrolet bel air 167 | 16,8,318,150,4498,14.5,75,1,plymouth grand fury 168 | 14,8,351,148,4657,13.5,75,1,ford ltd 169 | 17,6,231,110,3907,21,75,1,buick century 170 | 16,6,250,105,3897,18.5,75,1,chevroelt chevelle malibu 171 | 15,6,258,110,3730,19,75,1,amc matador 172 | 18,6,225,95,3785,19,75,1,plymouth fury 173 | 21,6,231,110,3039,15,75,1,buick skyhawk 174 | 20,8,262,110,3221,13.5,75,1,chevrolet monza 2+2 175 | 13,8,302,129,3169,12,75,1,ford mustang ii 176 | 29,4,97,75,2171,16,75,3,toyota corolla 177 | 23,4,140,83,2639,17,75,1,ford pinto 178 | 20,6,232,100,2914,16,75,1,amc gremlin 179 | 23,4,140,78,2592,18.5,75,1,pontiac astro 180 | 24,4,134,96,2702,13.5,75,3,toyota corona 181 | 25,4,90,71,2223,16.5,75,2,volkswagen dasher 182 | 24,4,119,97,2545,17,75,3,datsun 710 183 | 18,6,171,97,2984,14.5,75,1,ford pinto 184 | 29,4,90,70,1937,14,75,2,volkswagen rabbit 185 | 19,6,232,90,3211,17,75,1,amc pacer 186 | 23,4,115,95,2694,15,75,2,audi 100ls 187 | 23,4,120,88,2957,17,75,2,peugeot 504 188 | 22,4,121,98,2945,14.5,75,2,volvo 244dl 189 | 25,4,121,115,2671,13.5,75,2,saab 99le 190 | 33,4,91,53,1795,17.5,75,3,honda civic cvcc 191 | 28,4,107,86,2464,15.5,76,2,fiat 131 192 | 25,4,116,81,2220,16.9,76,2,opel 1900 193 | 25,4,140,92,2572,14.9,76,1,capri ii 194 | 26,4,98,79,2255,17.7,76,1,dodge colt 195 | 27,4,101,83,2202,15.3,76,2,renault 12tl 196 | 17.5,8,305,140,4215,13,76,1,chevrolet chevelle malibu classic 197 | 16,8,318,150,4190,13,76,1,dodge coronet brougham 198 | 15.5,8,304,120,3962,13.9,76,1,amc matador 199 | 14.5,8,351,152,4215,12.8,76,1,ford gran torino 200 | 22,6,225,100,3233,15.4,76,1,plymouth valiant 201 | 22,6,250,105,3353,14.5,76,1,chevrolet nova 202 | 24,6,200,81,3012,17.6,76,1,ford maverick 203 | 22.5,6,232,90,3085,17.6,76,1,amc hornet 204 | 29,4,85,52,2035,22.2,76,1,chevrolet chevette 205 | 24.5,4,98,60,2164,22.1,76,1,chevrolet woody 206 | 29,4,90,70,1937,14.2,76,2,vw rabbit 207 | 33,4,91,53,1795,17.4,76,3,honda civic 208 | 20,6,225,100,3651,17.7,76,1,dodge aspen se 209 | 18,6,250,78,3574,21,76,1,ford granada ghia 210 | 18.5,6,250,110,3645,16.2,76,1,pontiac ventura sj 211 | 17.5,6,258,95,3193,17.8,76,1,amc pacer d/l 212 | 29.5,4,97,71,1825,12.2,76,2,volkswagen rabbit 213 | 32,4,85,70,1990,17,76,3,datsun b-210 214 | 28,4,97,75,2155,16.4,76,3,toyota corolla 215 | 26.5,4,140,72,2565,13.6,76,1,ford pinto 216 | 20,4,130,102,3150,15.7,76,2,volvo 245 217 | 13,8,318,150,3940,13.2,76,1,plymouth volare premier v8 218 | 19,4,120,88,3270,21.9,76,2,peugeot 504 219 | 19,6,156,108,2930,15.5,76,3,toyota mark ii 220 | 16.5,6,168,120,3820,16.7,76,2,mercedes-benz 280s 221 | 16.5,8,350,180,4380,12.1,76,1,cadillac seville 222 | 13,8,350,145,4055,12,76,1,chevy c10 223 | 13,8,302,130,3870,15,76,1,ford f108 224 | 13,8,318,150,3755,14,76,1,dodge d100 225 | 31.5,4,98,68,2045,18.5,77,3,honda accord cvcc 226 | 30,4,111,80,2155,14.8,77,1,buick opel isuzu deluxe 227 | 36,4,79,58,1825,18.6,77,2,renault 5 gtl 228 | 25.5,4,122,96,2300,15.5,77,1,plymouth arrow gs 229 | 33.5,4,85,70,1945,16.8,77,3,datsun f-10 hatchback 230 | 17.5,8,305,145,3880,12.5,77,1,chevrolet caprice classic 231 | 17,8,260,110,4060,19,77,1,oldsmobile cutlass supreme 232 | 15.5,8,318,145,4140,13.7,77,1,dodge monaco brougham 233 | 15,8,302,130,4295,14.9,77,1,mercury cougar brougham 234 | 17.5,6,250,110,3520,16.4,77,1,chevrolet concours 235 | 20.5,6,231,105,3425,16.9,77,1,buick skylark 236 | 19,6,225,100,3630,17.7,77,1,plymouth volare custom 237 | 18.5,6,250,98,3525,19,77,1,ford granada 238 | 
16,8,400,180,4220,11.1,77,1,pontiac grand prix lj 239 | 15.5,8,350,170,4165,11.4,77,1,chevrolet monte carlo landau 240 | 15.5,8,400,190,4325,12.2,77,1,chrysler cordoba 241 | 16,8,351,149,4335,14.5,77,1,ford thunderbird 242 | 29,4,97,78,1940,14.5,77,2,volkswagen rabbit custom 243 | 24.5,4,151,88,2740,16,77,1,pontiac sunbird coupe 244 | 26,4,97,75,2265,18.2,77,3,toyota corolla liftback 245 | 25.5,4,140,89,2755,15.8,77,1,ford mustang ii 2+2 246 | 30.5,4,98,63,2051,17,77,1,chevrolet chevette 247 | 33.5,4,98,83,2075,15.9,77,1,dodge colt m/m 248 | 30,4,97,67,1985,16.4,77,3,subaru dl 249 | 30.5,4,97,78,2190,14.1,77,2,volkswagen dasher 250 | 22,6,146,97,2815,14.5,77,3,datsun 810 251 | 21.5,4,121,110,2600,12.8,77,2,bmw 320i 252 | 21.5,3,80,110,2720,13.5,77,3,mazda rx-4 253 | 43.1,4,90,48,1985,21.5,78,2,volkswagen rabbit custom diesel 254 | 36.1,4,98,66,1800,14.4,78,1,ford fiesta 255 | 32.8,4,78,52,1985,19.4,78,3,mazda glc deluxe 256 | 39.4,4,85,70,2070,18.6,78,3,datsun b210 gx 257 | 36.1,4,91,60,1800,16.4,78,3,honda civic cvcc 258 | 19.9,8,260,110,3365,15.5,78,1,oldsmobile cutlass salon brougham 259 | 19.4,8,318,140,3735,13.2,78,1,dodge diplomat 260 | 20.2,8,302,139,3570,12.8,78,1,mercury monarch ghia 261 | 19.2,6,231,105,3535,19.2,78,1,pontiac phoenix lj 262 | 20.5,6,200,95,3155,18.2,78,1,chevrolet malibu 263 | 20.2,6,200,85,2965,15.8,78,1,ford fairmont (auto) 264 | 25.1,4,140,88,2720,15.4,78,1,ford fairmont (man) 265 | 20.5,6,225,100,3430,17.2,78,1,plymouth volare 266 | 19.4,6,232,90,3210,17.2,78,1,amc concord 267 | 20.6,6,231,105,3380,15.8,78,1,buick century special 268 | 20.8,6,200,85,3070,16.7,78,1,mercury zephyr 269 | 18.6,6,225,110,3620,18.7,78,1,dodge aspen 270 | 18.1,6,258,120,3410,15.1,78,1,amc concord d/l 271 | 19.2,8,305,145,3425,13.2,78,1,chevrolet monte carlo landau 272 | 17.7,6,231,165,3445,13.4,78,1,buick regal sport coupe (turbo) 273 | 18.1,8,302,139,3205,11.2,78,1,ford futura 274 | 17.5,8,318,140,4080,13.7,78,1,dodge magnum xe 275 | 30,4,98,68,2155,16.5,78,1,chevrolet chevette 276 | 27.5,4,134,95,2560,14.2,78,3,toyota corona 277 | 27.2,4,119,97,2300,14.7,78,3,datsun 510 278 | 30.9,4,105,75,2230,14.5,78,1,dodge omni 279 | 21.1,4,134,95,2515,14.8,78,3,toyota celica gt liftback 280 | 23.2,4,156,105,2745,16.7,78,1,plymouth sapporo 281 | 23.8,4,151,85,2855,17.6,78,1,oldsmobile starfire sx 282 | 23.9,4,119,97,2405,14.9,78,3,datsun 200-sx 283 | 20.3,5,131,103,2830,15.9,78,2,audi 5000 284 | 17,6,163,125,3140,13.6,78,2,volvo 264gl 285 | 21.6,4,121,115,2795,15.7,78,2,saab 99gle 286 | 16.2,6,163,133,3410,15.8,78,2,peugeot 604sl 287 | 31.5,4,89,71,1990,14.9,78,2,volkswagen scirocco 288 | 29.5,4,98,68,2135,16.6,78,3,honda accord lx 289 | 21.5,6,231,115,3245,15.4,79,1,pontiac lemans v6 290 | 19.8,6,200,85,2990,18.2,79,1,mercury zephyr 6 291 | 22.3,4,140,88,2890,17.3,79,1,ford fairmont 4 292 | 20.2,6,232,90,3265,18.2,79,1,amc concord dl 6 293 | 20.6,6,225,110,3360,16.6,79,1,dodge aspen 6 294 | 17,8,305,130,3840,15.4,79,1,chevrolet caprice classic 295 | 17.6,8,302,129,3725,13.4,79,1,ford ltd landau 296 | 16.5,8,351,138,3955,13.2,79,1,mercury grand marquis 297 | 18.2,8,318,135,3830,15.2,79,1,dodge st. 
regis 298 | 16.9,8,350,155,4360,14.9,79,1,buick estate wagon (sw) 299 | 15.5,8,351,142,4054,14.3,79,1,ford country squire (sw) 300 | 19.2,8,267,125,3605,15,79,1,chevrolet malibu classic (sw) 301 | 18.5,8,360,150,3940,13,79,1,chrysler lebaron town @ country (sw) 302 | 31.9,4,89,71,1925,14,79,2,vw rabbit custom 303 | 34.1,4,86,65,1975,15.2,79,3,maxda glc deluxe 304 | 35.7,4,98,80,1915,14.4,79,1,dodge colt hatchback custom 305 | 27.4,4,121,80,2670,15,79,1,amc spirit dl 306 | 25.4,5,183,77,3530,20.1,79,2,mercedes benz 300d 307 | 23,8,350,125,3900,17.4,79,1,cadillac eldorado 308 | 27.2,4,141,71,3190,24.8,79,2,peugeot 504 309 | 23.9,8,260,90,3420,22.2,79,1,oldsmobile cutlass salon brougham 310 | 34.2,4,105,70,2200,13.2,79,1,plymouth horizon 311 | 34.5,4,105,70,2150,14.9,79,1,plymouth horizon tc3 312 | 31.8,4,85,65,2020,19.2,79,3,datsun 210 313 | 37.3,4,91,69,2130,14.7,79,2,fiat strada custom 314 | 28.4,4,151,90,2670,16,79,1,buick skylark limited 315 | 28.8,6,173,115,2595,11.3,79,1,chevrolet citation 316 | 26.8,6,173,115,2700,12.9,79,1,oldsmobile omega brougham 317 | 33.5,4,151,90,2556,13.2,79,1,pontiac phoenix 318 | 41.5,4,98,76,2144,14.7,80,2,vw rabbit 319 | 38.1,4,89,60,1968,18.8,80,3,toyota corolla tercel 320 | 32.1,4,98,70,2120,15.5,80,1,chevrolet chevette 321 | 37.2,4,86,65,2019,16.4,80,3,datsun 310 322 | 28,4,151,90,2678,16.5,80,1,chevrolet citation 323 | 26.4,4,140,88,2870,18.1,80,1,ford fairmont 324 | 24.3,4,151,90,3003,20.1,80,1,amc concord 325 | 19.1,6,225,90,3381,18.7,80,1,dodge aspen 326 | 34.3,4,97,78,2188,15.8,80,2,audi 4000 327 | 29.8,4,134,90,2711,15.5,80,3,toyota corona liftback 328 | 31.3,4,120,75,2542,17.5,80,3,mazda 626 329 | 37,4,119,92,2434,15,80,3,datsun 510 hatchback 330 | 32.2,4,108,75,2265,15.2,80,3,toyota corolla 331 | 46.6,4,86,65,2110,17.9,80,3,mazda glc 332 | 27.9,4,156,105,2800,14.4,80,1,dodge colt 333 | 40.8,4,85,65,2110,19.2,80,3,datsun 210 334 | 44.3,4,90,48,2085,21.7,80,2,vw rabbit c (diesel) 335 | 43.4,4,90,48,2335,23.7,80,2,vw dasher (diesel) 336 | 36.4,5,121,67,2950,19.9,80,2,audi 5000s (diesel) 337 | 30,4,146,67,3250,21.8,80,2,mercedes-benz 240d 338 | 44.6,4,91,67,1850,13.8,80,3,honda civic 1500 gl 339 | 40.9,4,85,NA,1835,17.3,80,2,renault lecar deluxe 340 | 33.8,4,97,67,2145,18,80,3,subaru dl 341 | 29.8,4,89,62,1845,15.3,80,2,vokswagen rabbit 342 | 32.7,6,168,132,2910,11.4,80,3,datsun 280-zx 343 | 23.7,3,70,100,2420,12.5,80,3,mazda rx-7 gs 344 | 35,4,122,88,2500,15.1,80,2,triumph tr7 coupe 345 | 23.6,4,140,NA,2905,14.3,80,1,ford mustang cobra 346 | 32.4,4,107,72,2290,17,80,3,honda accord 347 | 27.2,4,135,84,2490,15.7,81,1,plymouth reliant 348 | 26.6,4,151,84,2635,16.4,81,1,buick skylark 349 | 25.8,4,156,92,2620,14.4,81,1,dodge aries wagon (sw) 350 | 23.5,6,173,110,2725,12.6,81,1,chevrolet citation 351 | 30,4,135,84,2385,12.9,81,1,plymouth reliant 352 | 39.1,4,79,58,1755,16.9,81,3,toyota starlet 353 | 39,4,86,64,1875,16.4,81,1,plymouth champ 354 | 35.1,4,81,60,1760,16.1,81,3,honda civic 1300 355 | 32.3,4,97,67,2065,17.8,81,3,subaru 356 | 37,4,85,65,1975,19.4,81,3,datsun 210 mpg 357 | 37.7,4,89,62,2050,17.3,81,3,toyota tercel 358 | 34.1,4,91,68,1985,16,81,3,mazda glc 4 359 | 34.7,4,105,63,2215,14.9,81,1,plymouth horizon 4 360 | 34.4,4,98,65,2045,16.2,81,1,ford escort 4w 361 | 29.9,4,98,65,2380,20.7,81,1,ford escort 2h 362 | 33,4,105,74,2190,14.2,81,2,volkswagen jetta 363 | 34.5,4,100,NA,2320,15.8,81,2,renault 18i 364 | 33.7,4,107,75,2210,14.4,81,3,honda prelude 365 | 32.4,4,108,75,2350,16.8,81,3,toyota corolla 366 | 32.9,4,119,100,2615,14.8,81,3,datsun 200sx 
367 | 31.6,4,120,74,2635,18.3,81,3,mazda 626 368 | 28.1,4,141,80,3230,20.4,81,2,peugeot 505s turbo diesel 369 | NA,4,121,110,2800,15.4,81,2,saab 900s 370 | 30.7,6,145,76,3160,19.6,81,2,volvo diesel 371 | 25.4,6,168,116,2900,12.6,81,3,toyota cressida 372 | 24.2,6,146,120,2930,13.8,81,3,datsun 810 maxima 373 | 22.4,6,231,110,3415,15.8,81,1,buick century 374 | 26.6,8,350,105,3725,19,81,1,oldsmobile cutlass ls 375 | 20.2,6,200,88,3060,17.1,81,1,ford granada gl 376 | 17.6,6,225,85,3465,16.6,81,1,chrysler lebaron salon 377 | 28,4,112,88,2605,19.6,82,1,chevrolet cavalier 378 | 27,4,112,88,2640,18.6,82,1,chevrolet cavalier wagon 379 | 34,4,112,88,2395,18,82,1,chevrolet cavalier 2-door 380 | 31,4,112,85,2575,16.2,82,1,pontiac j2000 se hatchback 381 | 29,4,135,84,2525,16,82,1,dodge aries se 382 | 27,4,151,90,2735,18,82,1,pontiac phoenix 383 | 24,4,140,92,2865,16.4,82,1,ford fairmont futura 384 | 23,4,151,NA,3035,20.5,82,1,amc concord dl 385 | 36,4,105,74,1980,15.3,82,2,volkswagen rabbit l 386 | 37,4,91,68,2025,18.2,82,3,mazda glc custom l 387 | 31,4,91,68,1970,17.6,82,3,mazda glc custom 388 | 38,4,105,63,2125,14.7,82,1,plymouth horizon miser 389 | 36,4,98,70,2125,17.3,82,1,mercury lynx l 390 | 36,4,120,88,2160,14.5,82,3,nissan stanza xe 391 | 36,4,107,75,2205,14.5,82,3,honda accord 392 | 34,4,108,70,2245,16.9,82,3,toyota corolla 393 | 38,4,91,67,1965,15,82,3,honda civic 394 | 32,4,91,67,1965,15.7,82,3,honda civic (auto) 395 | 38,4,91,67,1995,16.2,82,3,datsun 310 gx 396 | 25,6,181,110,2945,16.4,82,1,buick century limited 397 | 38,6,262,85,3015,17,82,1,oldsmobile cutlass ciera (diesel) 398 | 26,4,156,92,2585,14.5,82,1,chrysler lebaron medallion 399 | 22,6,232,112,2835,14.7,82,1,ford granada l 400 | 32,4,144,96,2665,13.9,82,3,toyota celica gt 401 | 36,4,135,84,2370,13,82,1,dodge charger 2.2 402 | 27,4,151,90,2950,17.3,82,1,chevrolet camaro 403 | 27,4,140,86,2790,15.6,82,1,ford mustang gl 404 | 44,4,97,52,2130,24.6,82,2,vw pickup 405 | 32,4,135,84,2295,11.6,82,1,dodge rampage 406 | 28,4,120,79,2625,18.6,82,1,ford ranger 407 | 31,4,119,82,2720,19.4,82,1,chevy s-10 408 | -------------------------------------------------------------------------------- /datasets/Bank data dictionary.txt: -------------------------------------------------------------------------------- 1 | 1 - age (numeric) 2 | 2 - job : type of job (categorical: "admin.","blue-collar","entrepreneur","housemaid","management","retired","self-employed","services","student","technician","unemployed","unknown") 3 | 3 - marital : marital status (categorical: "divorced","married","single","unknown"; note: "divorced" means divorced or widowed) 4 | 4 - education (categorical: "basic.4y","basic.6y","basic.9y","high.school","illiterate","professional.course","university.degree","unknown") 5 | 5 - default: has credit in default? (categorical: "no","yes","unknown") 6 | 6 - housing: has housing loan? (categorical: "no","yes","unknown") 7 | 7 - loan: has personal loan? (categorical: "no","yes","unknown") 8 | # related with the last contact of the current campaign: 9 | 8 - contact: contact communication type (categorical: "cellular","telephone") 10 | 9 - month: last contact month of year (categorical: "jan", "feb", "mar", ..., "nov", "dec") 11 | 10 - day_of_week: last contact day of the week (categorical: "mon","tue","wed","thu","fri") 12 | 11 - duration: last contact duration, in seconds (numeric). Important note: this attribute highly affects the output target (e.g., if duration=0 then y="no"). Yet, the duration is not known before a call is performed. 
Also, after the end of the call y is obviously known. Thus, this input should only be included for benchmark purposes and should be discarded if the intention is to have a realistic predictive model. 13 | # other attributes: 14 | 12 - campaign: number of contacts performed during this campaign and for this client (numeric, includes last contact) 15 | 13 - pdays: number of days that passed by after the client was last contacted from a previous campaign (numeric; 999 means client was not previously contacted) 16 | 14 - previous: number of contacts performed before this campaign and for this client (numeric) 17 | 15 - poutcome: outcome of the previous marketing campaign (categorical: "failure","nonexistent","success") 18 | # social and economic context attributes 19 | 16 - emp.var.rate: employment variation rate - quarterly indicator (numeric) 20 | 17 - cons.price.idx: consumer price index - monthly indicator (numeric) 21 | 18 - cons.conf.idx: consumer confidence index - monthly indicator (numeric) 22 | 19 - euribor3m: euribor 3 month rate - daily indicator (numeric) 23 | 20 - nr.employed: number of employees - quarterly indicator (numeric) 24 | 25 | Output variable (desired target): 26 | 21 - y - has the client subscribed a term deposit? (binary: "yes","no") 27 | -------------------------------------------------------------------------------- /datasets/Customer Churn Columns.csv: -------------------------------------------------------------------------------- 1 | Column_Names 2 | A 3 | Bob 4 | C 5 | D 6 | E 7 | F 8 | G 9 | H 10 | I 11 | J 12 | K 13 | L 14 | M 15 | N 16 | O 17 | P 18 | Q 19 | R 20 | S 21 | T 22 | U 23 | -------------------------------------------------------------------------------- /datasets/Gender Purchase.csv: -------------------------------------------------------------------------------- 1 | Gender,Purchase 2 | Female,Yes 3 | Female,Yes 4 | Female,No 5 | Male,No 6 | Male,Yes 7 | Female,Yes 8 | Male,No 9 | Female,Yes 10 | Female,No 11 | Female,Yes 12 | Female,No 13 | Male,No 14 | Male,Yes 15 | Male,No 16 | Female,Yes 17 | Male,Yes 18 | Male,Yes 19 | Male,Yes 20 | Female,Yes 21 | Female,No 22 | Male,Yes 23 | Male,Yes 24 | Male,No 25 | Female,Yes 26 | Male,Yes 27 | Female,Yes 28 | Male,No 29 | Male,No 30 | Female,Yes 31 | Female,Yes 32 | Male,No 33 | Female,Yes 34 | Female,Yes 35 | Female,No 36 | Female,No 37 | Female,Yes 38 | Male,Yes 39 | Female,Yes 40 | Female,Yes 41 | Female,Yes 42 | Male,Yes 43 | Male,No 44 | Female,Yes 45 | Female,No 46 | Female,Yes 47 | Female,Yes 48 | Female,No 49 | Male,Yes 50 | Female,No 51 | Female,Yes 52 | Female,No 53 | Male,No 54 | Female,Yes 55 | Male,Yes 56 | Female,No 57 | Female,No 58 | Female,No 59 | Female,Yes 60 | Male,Yes 61 | Female,Yes 62 | Male,Yes 63 | Male,No 64 | Male,Yes 65 | Male,Yes 66 | Male,No 67 | Male,Yes 68 | Female,Yes 69 | Female,No 70 | Male,Yes 71 | Female,No 72 | Male,Yes 73 | Female,Yes 74 | Female,Yes 75 | Female,No 76 | Female,No 77 | Male,Yes 78 | Male,No 79 | Male,No 80 | Male,No 81 | Male,No 82 | Female,No 83 | Male,No 84 | Male,No 85 | Female,Yes 86 | Female,Yes 87 | Female,Yes 88 | Female,Yes 89 | Female,Yes 90 | Female,Yes 91 | Male,No 92 | Male,Yes 93 | Female,Yes 94 | Male,No 95 | Male,No 96 | Female,Yes 97 | Female,No 98 | Male,Yes 99 | Female,Yes 100 | Female,Yes 101 | Male,Yes 102 | Male,No 103 | Male,Yes 104 | Female,No 105 | Male,Yes 106 | Female,Yes 107 | Female,Yes 108 | Male,Yes 109 | Female,No 110 | Male,No 111 | Female,Yes 112 | Female,No 113 | Male,Yes 114 | Male,Yes 115 | Male,Yes 116 | Male,No 
117 | Male,No 118 | Female,No 119 | Female,No 120 | Male,Yes 121 | Female,No 122 | Female,Yes 123 | Female,No 124 | Female,Yes 125 | Female,No 126 | Male,Yes 127 | Female,Yes 128 | Female,No 129 | Male,No 130 | Female,Yes 131 | Female,Yes 132 | Male,No 133 | Female,Yes 134 | Female,Yes 135 | Male,Yes 136 | Male,No 137 | Male,Yes 138 | Female,Yes 139 | Female,Yes 140 | Female,No 141 | Female,No 142 | Male,Yes 143 | Male,Yes 144 | Male,No 145 | Female,Yes 146 | Male,Yes 147 | Male,No 148 | Female,Yes 149 | Male,No 150 | Male,No 151 | Female,Yes 152 | Female,No 153 | Female,Yes 154 | Male,Yes 155 | Male,Yes 156 | Female,Yes 157 | Male,No 158 | Male,Yes 159 | Male,No 160 | Male,No 161 | Female,No 162 | Male,Yes 163 | Female,No 164 | Male,Yes 165 | Male,Yes 166 | Male,Yes 167 | Male,Yes 168 | Female,Yes 169 | Female,No 170 | Female,Yes 171 | Female,Yes 172 | Female,No 173 | Female,Yes 174 | Female,No 175 | Male,Yes 176 | Male,No 177 | Female,No 178 | Male,No 179 | Male,No 180 | Male,No 181 | Female,Yes 182 | Female,Yes 183 | Female,No 184 | Female,No 185 | Female,No 186 | Female,No 187 | Female,Yes 188 | Male,No 189 | Female,Yes 190 | Female,Yes 191 | Female,No 192 | Female,No 193 | Female,No 194 | Female,Yes 195 | Female,Yes 196 | Male,No 197 | Male,No 198 | Male,Yes 199 | Female,No 200 | Male,No 201 | Female,Yes 202 | Female,Yes 203 | Female,No 204 | Female,No 205 | Male,No 206 | Male,No 207 | Male,No 208 | Female,Yes 209 | Male,Yes 210 | Male,No 211 | Female,Yes 212 | Female,Yes 213 | Male,No 214 | Female,No 215 | Male,Yes 216 | Male,No 217 | Male,Yes 218 | Male,Yes 219 | Female,Yes 220 | Female,Yes 221 | Male,No 222 | Female,No 223 | Male,Yes 224 | Male,No 225 | Male,Yes 226 | Male,No 227 | Female,Yes 228 | Female,Yes 229 | Female,No 230 | Male,No 231 | Male,No 232 | Female,No 233 | Male,No 234 | Male,Yes 235 | Female,Yes 236 | Female,Yes 237 | Female,No 238 | Male,No 239 | Female,No 240 | Female,Yes 241 | Female,No 242 | Male,Yes 243 | Male,Yes 244 | Female,Yes 245 | Female,Yes 246 | Female,Yes 247 | Male,No 248 | Male,Yes 249 | Female,No 250 | Male,Yes 251 | Male,Yes 252 | Male,No 253 | Female,Yes 254 | Female,No 255 | Female,No 256 | Female,Yes 257 | Female,Yes 258 | Male,No 259 | Male,No 260 | Male,No 261 | Male,No 262 | Male,No 263 | Female,Yes 264 | Female,No 265 | Female,Yes 266 | Male,Yes 267 | Female,Yes 268 | Female,Yes 269 | Male,No 270 | Male,No 271 | Male,No 272 | Male,No 273 | Male,No 274 | Female,Yes 275 | Female,Yes 276 | Female,No 277 | Male,No 278 | Female,Yes 279 | Female,Yes 280 | Female,Yes 281 | Female,Yes 282 | Male,No 283 | Male,No 284 | Female,No 285 | Male,No 286 | Male,No 287 | Female,No 288 | Female,Yes 289 | Male,No 290 | Female,Yes 291 | Female,No 292 | Female,Yes 293 | Female,No 294 | Male,No 295 | Female,Yes 296 | Male,No 297 | Male,Yes 298 | Female,Yes 299 | Female,Yes 300 | Female,Yes 301 | Female,No 302 | Male,Yes 303 | Female,No 304 | Male,No 305 | Female,Yes 306 | Male,Yes 307 | Male,No 308 | Female,Yes 309 | Female,Yes 310 | Female,Yes 311 | Female,Yes 312 | Female,No 313 | Male,Yes 314 | Male,No 315 | Female,Yes 316 | Female,Yes 317 | Female,No 318 | Female,Yes 319 | Female,Yes 320 | Male,No 321 | Female,No 322 | Male,No 323 | Female,No 324 | Male,No 325 | Male,No 326 | Male,Yes 327 | Female,Yes 328 | Male,Yes 329 | Male,No 330 | Male,Yes 331 | Male,Yes 332 | Male,Yes 333 | Male,No 334 | Female,Yes 335 | Male,Yes 336 | Male,No 337 | Male,Yes 338 | Male,Yes 339 | Female,Yes 340 | Male,No 341 | Male,Yes 342 | Male,Yes 343 | Female,Yes 344 | Female,No 345 
| Female,No 346 | Female,No 347 | Male,Yes 348 | Female,No 349 | Male,No 350 | Female,Yes 351 | Female,No 352 | Male,Yes 353 | Female,No 354 | Female,No 355 | Male,Yes 356 | Female,No 357 | Female,No 358 | Male,Yes 359 | Female,Yes 360 | Female,Yes 361 | Male,Yes 362 | Male,No 363 | Male,Yes 364 | Female,No 365 | Female,Yes 366 | Male,Yes 367 | Male,Yes 368 | Male,Yes 369 | Male,No 370 | Male,Yes 371 | Male,No 372 | Male,No 373 | Female,Yes 374 | Female,No 375 | Female,Yes 376 | Female,No 377 | Male,Yes 378 | Female,Yes 379 | Female,Yes 380 | Male,No 381 | Female,No 382 | Female,No 383 | Female,No 384 | Male,Yes 385 | Female,Yes 386 | Female,Yes 387 | Male,Yes 388 | Male,No 389 | Female,No 390 | Male,No 391 | Female,Yes 392 | Male,No 393 | Female,Yes 394 | Male,Yes 395 | Female,Yes 396 | Male,Yes 397 | Male,Yes 398 | Male,No 399 | Male,No 400 | Male,No 401 | Female,No 402 | Female,No 403 | Male,Yes 404 | Female,Yes 405 | Female,No 406 | Female,Yes 407 | Male,Yes 408 | Male,No 409 | Female,No 410 | Male,No 411 | Female,Yes 412 | Female,Yes 413 | Female,No 414 | Male,No 415 | Male,Yes 416 | Male,No 417 | Male,Yes 418 | Female,Yes 419 | Male,Yes 420 | Male,Yes 421 | Female,No 422 | Male,No 423 | Female,No 424 | Female,No 425 | Female,No 426 | Female,Yes 427 | Male,Yes 428 | Male,Yes 429 | Male,No 430 | Male,No 431 | Male,No 432 | Female,Yes 433 | Male,No 434 | Male,Yes 435 | Female,Yes 436 | Male,Yes 437 | Male,Yes 438 | Female,No 439 | Female,Yes 440 | Female,No 441 | Female,Yes 442 | Female,Yes 443 | Male,Yes 444 | Male,Yes 445 | Male,No 446 | Female,Yes 447 | Male,No 448 | Male,Yes 449 | Female,Yes 450 | Female,No 451 | Female,No 452 | Female,No 453 | Male,No 454 | Female,Yes 455 | Male,Yes 456 | Male,No 457 | Male,Yes 458 | Female,No 459 | Male,No 460 | Male,No 461 | Female,Yes 462 | Male,No 463 | Female,Yes 464 | Female,Yes 465 | Male,Yes 466 | Female,Yes 467 | Male,Yes 468 | Female,Yes 469 | Female,Yes 470 | Male,No 471 | Female,No 472 | Female,Yes 473 | Female,No 474 | Male,No 475 | Female,No 476 | Male,Yes 477 | Female,No 478 | Male,Yes 479 | Female,Yes 480 | Male,No 481 | Male,No 482 | Female,No 483 | Male,No 484 | Male,No 485 | Male,No 486 | Male,No 487 | Male,Yes 488 | Male,Yes 489 | Male,Yes 490 | Female,Yes 491 | Male,Yes 492 | Male,Yes 493 | Female,Yes 494 | Female,No 495 | Male,Yes 496 | Female,Yes 497 | Female,Yes 498 | Female,Yes 499 | Male,Yes 500 | Male,Yes 501 | Female,Yes 502 | Male,Yes 503 | Male,Yes 504 | Male,No 505 | Female,Yes 506 | Female,Yes 507 | Male,Yes 508 | Male,Yes 509 | Female,Yes 510 | Male,No 511 | Female,Yes 512 | Female,Yes 513 | -------------------------------------------------------------------------------- /datasets/Titanic Description.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonMDev/learning-python-predictive-analytics/6757b9ef3e7bbb803d4a1c00d999225cbb4fe2eb/datasets/Titanic Description.txt -------------------------------------------------------------------------------- /datasets/dtree2.dot: -------------------------------------------------------------------------------- 1 | digraph Tree { 2 | node [shape=box] ; 3 | 0 [label="Petal.Length <= 2.45\nentropy = 1.5849\nsamples = 127\nvalue = [42, 42, 43]"] ; 4 | 1 [label="entropy = 0.0\nsamples = 42\nvalue = [42, 0, 0]"] ; 5 | 0 -> 1 [labeldistance=2.5, labelangle=45, headlabel="True"] ; 6 | 2 [label="Petal.Width <= 1.65\nentropy = 0.9999\nsamples = 85\nvalue = [0, 42, 43]"] ; 7 | 0 -> 2 [labeldistance=2.5, labelangle=-45, 
headlabel="False"] ; 8 | 3 [label="Petal.Length <= 4.95\nentropy = 0.3591\nsamples = 44\nvalue = [0, 41, 3]"] ; 9 | 2 -> 3 ; 10 | 4 [label="entropy = 0.0\nsamples = 40\nvalue = [0, 40, 0]"] ; 11 | 3 -> 4 ; 12 | 5 [label="entropy = 0.8113\nsamples = 4\nvalue = [0, 1, 3]"] ; 13 | 3 -> 5 ; 14 | 6 [label="Petal.Length <= 4.85\nentropy = 0.1654\nsamples = 41\nvalue = [0, 1, 40]"] ; 15 | 2 -> 6 ; 16 | 7 [label="entropy = 0.8113\nsamples = 4\nvalue = [0, 1, 3]"] ; 17 | 6 -> 7 ; 18 | 8 [label="entropy = 0.0\nsamples = 37\nvalue = [0, 0, 37]"] ; 19 | 6 -> 8 ; 20 | } -------------------------------------------------------------------------------- /datasets/dtree2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonMDev/learning-python-predictive-analytics/6757b9ef3e7bbb803d4a1c00d999225cbb4fe2eb/datasets/dtree2.png -------------------------------------------------------------------------------- /datasets/iris.csv: -------------------------------------------------------------------------------- 1 | Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species 2 | 5.1,3.5,1.4,0.2,setosa 3 | 4.9,3,1.4,0.2,setosa 4 | 4.7,3.2,1.3,0.2,setosa 5 | 4.6,3.1,1.5,0.2,setosa 6 | 5,3.6,1.4,0.2,setosa 7 | 5.4,3.9,1.7,0.4,setosa 8 | 4.6,3.4,1.4,0.3,setosa 9 | 5,3.4,1.5,0.2,setosa 10 | 4.4,2.9,1.4,0.2,setosa 11 | 4.9,3.1,1.5,0.1,setosa 12 | 5.4,3.7,1.5,0.2,setosa 13 | 4.8,3.4,1.6,0.2,setosa 14 | 4.8,3,1.4,0.1,setosa 15 | 4.3,3,1.1,0.1,setosa 16 | 5.8,4,1.2,0.2,setosa 17 | 5.7,4.4,1.5,0.4,setosa 18 | 5.4,3.9,1.3,0.4,setosa 19 | 5.1,3.5,1.4,0.3,setosa 20 | 5.7,3.8,1.7,0.3,setosa 21 | 5.1,3.8,1.5,0.3,setosa 22 | 5.4,3.4,1.7,0.2,setosa 23 | 5.1,3.7,1.5,0.4,setosa 24 | 4.6,3.6,1,0.2,setosa 25 | 5.1,3.3,1.7,0.5,setosa 26 | 4.8,3.4,1.9,0.2,setosa 27 | 5,3,1.6,0.2,setosa 28 | 5,3.4,1.6,0.4,setosa 29 | 5.2,3.5,1.5,0.2,setosa 30 | 5.2,3.4,1.4,0.2,setosa 31 | 4.7,3.2,1.6,0.2,setosa 32 | 4.8,3.1,1.6,0.2,setosa 33 | 5.4,3.4,1.5,0.4,setosa 34 | 5.2,4.1,1.5,0.1,setosa 35 | 5.5,4.2,1.4,0.2,setosa 36 | 4.9,3.1,1.5,0.2,setosa 37 | 5,3.2,1.2,0.2,setosa 38 | 5.5,3.5,1.3,0.2,setosa 39 | 4.9,3.6,1.4,0.1,setosa 40 | 4.4,3,1.3,0.2,setosa 41 | 5.1,3.4,1.5,0.2,setosa 42 | 5,3.5,1.3,0.3,setosa 43 | 4.5,2.3,1.3,0.3,setosa 44 | 4.4,3.2,1.3,0.2,setosa 45 | 5,3.5,1.6,0.6,setosa 46 | 5.1,3.8,1.9,0.4,setosa 47 | 4.8,3,1.4,0.3,setosa 48 | 5.1,3.8,1.6,0.2,setosa 49 | 4.6,3.2,1.4,0.2,setosa 50 | 5.3,3.7,1.5,0.2,setosa 51 | 5,3.3,1.4,0.2,setosa 52 | 7,3.2,4.7,1.4,versicolor 53 | 6.4,3.2,4.5,1.5,versicolor 54 | 6.9,3.1,4.9,1.5,versicolor 55 | 5.5,2.3,4,1.3,versicolor 56 | 6.5,2.8,4.6,1.5,versicolor 57 | 5.7,2.8,4.5,1.3,versicolor 58 | 6.3,3.3,4.7,1.6,versicolor 59 | 4.9,2.4,3.3,1,versicolor 60 | 6.6,2.9,4.6,1.3,versicolor 61 | 5.2,2.7,3.9,1.4,versicolor 62 | 5,2,3.5,1,versicolor 63 | 5.9,3,4.2,1.5,versicolor 64 | 6,2.2,4,1,versicolor 65 | 6.1,2.9,4.7,1.4,versicolor 66 | 5.6,2.9,3.6,1.3,versicolor 67 | 6.7,3.1,4.4,1.4,versicolor 68 | 5.6,3,4.5,1.5,versicolor 69 | 5.8,2.7,4.1,1,versicolor 70 | 6.2,2.2,4.5,1.5,versicolor 71 | 5.6,2.5,3.9,1.1,versicolor 72 | 5.9,3.2,4.8,1.8,versicolor 73 | 6.1,2.8,4,1.3,versicolor 74 | 6.3,2.5,4.9,1.5,versicolor 75 | 6.1,2.8,4.7,1.2,versicolor 76 | 6.4,2.9,4.3,1.3,versicolor 77 | 6.6,3,4.4,1.4,versicolor 78 | 6.8,2.8,4.8,1.4,versicolor 79 | 6.7,3,5,1.7,versicolor 80 | 6,2.9,4.5,1.5,versicolor 81 | 5.7,2.6,3.5,1,versicolor 82 | 5.5,2.4,3.8,1.1,versicolor 83 | 5.5,2.4,3.7,1,versicolor 84 | 5.8,2.7,3.9,1.2,versicolor 85 | 6,2.7,5.1,1.6,versicolor 86 | 
5.4,3,4.5,1.5,versicolor 87 | 6,3.4,4.5,1.6,versicolor 88 | 6.7,3.1,4.7,1.5,versicolor 89 | 6.3,2.3,4.4,1.3,versicolor 90 | 5.6,3,4.1,1.3,versicolor 91 | 5.5,2.5,4,1.3,versicolor 92 | 5.5,2.6,4.4,1.2,versicolor 93 | 6.1,3,4.6,1.4,versicolor 94 | 5.8,2.6,4,1.2,versicolor 95 | 5,2.3,3.3,1,versicolor 96 | 5.6,2.7,4.2,1.3,versicolor 97 | 5.7,3,4.2,1.2,versicolor 98 | 5.7,2.9,4.2,1.3,versicolor 99 | 6.2,2.9,4.3,1.3,versicolor 100 | 5.1,2.5,3,1.1,versicolor 101 | 5.7,2.8,4.1,1.3,versicolor 102 | 6.3,3.3,6,2.5,virginica 103 | 5.8,2.7,5.1,1.9,virginica 104 | 7.1,3,5.9,2.1,virginica 105 | 6.3,2.9,5.6,1.8,virginica 106 | 6.5,3,5.8,2.2,virginica 107 | 7.6,3,6.6,2.1,virginica 108 | 4.9,2.5,4.5,1.7,virginica 109 | 7.3,2.9,6.3,1.8,virginica 110 | 6.7,2.5,5.8,1.8,virginica 111 | 7.2,3.6,6.1,2.5,virginica 112 | 6.5,3.2,5.1,2,virginica 113 | 6.4,2.7,5.3,1.9,virginica 114 | 6.8,3,5.5,2.1,virginica 115 | 5.7,2.5,5,2,virginica 116 | 5.8,2.8,5.1,2.4,virginica 117 | 6.4,3.2,5.3,2.3,virginica 118 | 6.5,3,5.5,1.8,virginica 119 | 7.7,3.8,6.7,2.2,virginica 120 | 7.7,2.6,6.9,2.3,virginica 121 | 6,2.2,5,1.5,virginica 122 | 6.9,3.2,5.7,2.3,virginica 123 | 5.6,2.8,4.9,2,virginica 124 | 7.7,2.8,6.7,2,virginica 125 | 6.3,2.7,4.9,1.8,virginica 126 | 6.7,3.3,5.7,2.1,virginica 127 | 7.2,3.2,6,1.8,virginica 128 | 6.2,2.8,4.8,1.8,virginica 129 | 6.1,3,4.9,1.8,virginica 130 | 6.4,2.8,5.6,2.1,virginica 131 | 7.2,3,5.8,1.6,virginica 132 | 7.4,2.8,6.1,1.9,virginica 133 | 7.9,3.8,6.4,2,virginica 134 | 6.4,2.8,5.6,2.2,virginica 135 | 6.3,2.8,5.1,1.5,virginica 136 | 6.1,2.6,5.6,1.4,virginica 137 | 7.7,3,6.1,2.3,virginica 138 | 6.3,3.4,5.6,2.4,virginica 139 | 6.4,3.1,5.5,1.8,virginica 140 | 6,3,4.8,1.8,virginica 141 | 6.9,3.1,5.4,2.1,virginica 142 | 6.7,3.1,5.6,2.4,virginica 143 | 6.9,3.1,5.1,2.3,virginica 144 | 5.8,2.7,5.1,1.9,virginica 145 | 6.8,3.2,5.9,2.3,virginica 146 | 6.7,3.3,5.7,2.5,virginica 147 | 6.7,3,5.2,2.3,virginica 148 | 6.3,2.5,5,1.9,virginica 149 | 6.5,3,5.2,2,virginica 150 | 6.2,3.4,5.4,2.3,virginica 151 | 5.9,3,5.1,1.8,virginica 152 | -------------------------------------------------------------------------------- /datasets/lotsofdata/010.csv: -------------------------------------------------------------------------------- 1 | "Date","sulfate","nitrate","ID" 2 | "2002-01-01",NA,NA,10 3 | "2002-01-02",NA,NA,10 4 | "2002-01-03",NA,NA,10 5 | "2002-01-04",NA,NA,10 6 | "2002-01-05",NA,NA,10 7 | "2002-01-06",NA,NA,10 8 | "2002-01-07",NA,NA,10 9 | "2002-01-08",NA,NA,10 10 | "2002-01-09",NA,NA,10 11 | "2002-01-10",NA,NA,10 12 | "2002-01-11",NA,NA,10 13 | "2002-01-12",NA,NA,10 14 | "2002-01-13",NA,NA,10 15 | "2002-01-14",NA,NA,10 16 | "2002-01-15",NA,NA,10 17 | "2002-01-16",NA,NA,10 18 | "2002-01-17",NA,NA,10 19 | "2002-01-18",NA,NA,10 20 | "2002-01-19",NA,NA,10 21 | "2002-01-20",NA,NA,10 22 | "2002-01-21",NA,NA,10 23 | "2002-01-22",NA,NA,10 24 | "2002-01-23",NA,NA,10 25 | "2002-01-24",NA,NA,10 26 | "2002-01-25",NA,NA,10 27 | "2002-01-26",NA,NA,10 28 | "2002-01-27",NA,NA,10 29 | "2002-01-28",NA,NA,10 30 | "2002-01-29",NA,NA,10 31 | "2002-01-30",NA,NA,10 32 | "2002-01-31",NA,NA,10 33 | "2002-02-01",NA,NA,10 34 | "2002-02-02",NA,NA,10 35 | "2002-02-03",NA,NA,10 36 | "2002-02-04",NA,NA,10 37 | "2002-02-05",NA,NA,10 38 | "2002-02-06",NA,NA,10 39 | "2002-02-07",NA,NA,10 40 | "2002-02-08",0.782,2.05,10 41 | "2002-02-09",NA,NA,10 42 | "2002-02-10",NA,NA,10 43 | "2002-02-11",NA,NA,10 44 | "2002-02-12",NA,NA,10 45 | "2002-02-13",0.416,0.292,10 46 | "2002-02-14",NA,NA,10 47 | "2002-02-15",NA,NA,10 48 | "2002-02-16",NA,NA,10 49 | 
"2002-02-17",NA,NA,10 50 | "2002-02-18",NA,NA,10 51 | "2002-02-19",0.665,0.544,10 52 | "2002-02-20",NA,NA,10 53 | "2002-02-21",NA,NA,10 54 | "2002-02-22",NA,NA,10 55 | "2002-02-23",NA,NA,10 56 | "2002-02-24",NA,NA,10 57 | "2002-02-25",0.537,0.977,10 58 | "2002-02-26",NA,NA,10 59 | "2002-02-27",NA,NA,10 60 | "2002-02-28",NA,NA,10 61 | "2002-03-01",NA,NA,10 62 | "2002-03-02",NA,NA,10 63 | "2002-03-03",NA,NA,10 64 | "2002-03-04",NA,NA,10 65 | "2002-03-05",NA,NA,10 66 | "2002-03-06",NA,NA,10 67 | "2002-03-07",NA,NA,10 68 | "2002-03-08",NA,NA,10 69 | "2002-03-09",1.25,1,10 70 | "2002-03-10",NA,NA,10 71 | "2002-03-11",NA,NA,10 72 | "2002-03-12",NA,NA,10 73 | "2002-03-13",NA,NA,10 74 | "2002-03-14",NA,NA,10 75 | "2002-03-15",0.985,0.722,10 76 | "2002-03-16",NA,NA,10 77 | "2002-03-17",NA,NA,10 78 | "2002-03-18",1.04,2.39,10 79 | "2002-03-19",NA,NA,10 80 | "2002-03-20",NA,NA,10 81 | "2002-03-21",NA,NA,10 82 | "2002-03-22",NA,NA,10 83 | "2002-03-23",NA,NA,10 84 | "2002-03-24",0.698,0.589,10 85 | "2002-03-25",NA,NA,10 86 | "2002-03-26",NA,NA,10 87 | "2002-03-27",NA,0.294,10 88 | "2002-03-28",NA,NA,10 89 | "2002-03-29",NA,NA,10 90 | "2002-03-30",0.755,0.396,10 91 | "2002-03-31",NA,NA,10 92 | "2002-04-01",NA,NA,10 93 | "2002-04-02",NA,NA,10 94 | "2002-04-03",NA,NA,10 95 | "2002-04-04",NA,NA,10 96 | "2002-04-05",1.55,1.05,10 97 | "2002-04-06",NA,NA,10 98 | "2002-04-07",NA,NA,10 99 | "2002-04-08",1.31,1.21,10 100 | "2002-04-09",NA,NA,10 101 | "2002-04-10",NA,NA,10 102 | "2002-04-11",1.52,0.218,10 103 | "2002-04-12",NA,NA,10 104 | "2002-04-13",NA,NA,10 105 | "2002-04-14",0.903,1.36,10 106 | "2002-04-15",NA,NA,10 107 | "2002-04-16",NA,NA,10 108 | "2002-04-17",0.469,0.27,10 109 | "2002-04-18",NA,NA,10 110 | "2002-04-19",NA,NA,10 111 | "2002-04-20",NA,NA,10 112 | "2002-04-21",NA,NA,10 113 | "2002-04-22",NA,NA,10 114 | "2002-04-23",0.913,0.427,10 115 | "2002-04-24",NA,NA,10 116 | "2002-04-25",NA,NA,10 117 | "2002-04-26",1.05,0.334,10 118 | "2002-04-27",NA,NA,10 119 | "2002-04-28",NA,NA,10 120 | "2002-04-29",1.21,1.12,10 121 | "2002-04-30",NA,NA,10 122 | "2002-05-01",NA,NA,10 123 | "2002-05-02",0.328,0.149,10 124 | "2002-05-03",NA,NA,10 125 | "2002-05-04",NA,NA,10 126 | "2002-05-05",NA,NA,10 127 | "2002-05-06",NA,NA,10 128 | "2002-05-07",NA,NA,10 129 | "2002-05-08",0.873,0.482,10 130 | "2002-05-09",NA,NA,10 131 | "2002-05-10",NA,NA,10 132 | "2002-05-11",NA,NA,10 133 | "2002-05-12",NA,NA,10 134 | "2002-05-13",NA,NA,10 135 | "2002-05-14",0.634,0.339,10 136 | "2002-05-15",NA,NA,10 137 | "2002-05-16",NA,NA,10 138 | "2002-05-17",0.866,0.442,10 139 | "2002-05-18",NA,NA,10 140 | "2002-05-19",NA,NA,10 141 | "2002-05-20",1.1,0.454,10 142 | "2002-05-21",NA,NA,10 143 | "2002-05-22",NA,NA,10 144 | "2002-05-23",0.264,0.221,10 145 | "2002-05-24",NA,NA,10 146 | "2002-05-25",NA,NA,10 147 | "2002-05-26",0.659,0.507,10 148 | "2002-05-27",NA,NA,10 149 | "2002-05-28",NA,NA,10 150 | "2002-05-29",0.219,0.181,10 151 | "2002-05-30",NA,NA,10 152 | "2002-05-31",NA,NA,10 153 | "2002-06-01",NA,NA,10 154 | "2002-06-02",NA,NA,10 155 | "2002-06-03",NA,NA,10 156 | "2002-06-04",NA,NA,10 157 | "2002-06-05",NA,NA,10 158 | "2002-06-06",NA,NA,10 159 | "2002-06-07",0.595,0.266,10 160 | "2002-06-08",NA,NA,10 161 | "2002-06-09",NA,NA,10 162 | "2002-06-10",NA,NA,10 163 | "2002-06-11",NA,NA,10 164 | "2002-06-12",NA,NA,10 165 | "2002-06-13",NA,NA,10 166 | "2002-06-14",NA,NA,10 167 | "2002-06-15",NA,NA,10 168 | "2002-06-16",0.787,0.456,10 169 | "2002-06-17",NA,NA,10 170 | "2002-06-18",NA,NA,10 171 | "2002-06-19",NA,NA,10 172 | "2002-06-20",NA,NA,10 173 
| "2002-06-21",NA,NA,10 174 | "2002-06-22",0.592,0.214,10 175 | "2002-06-23",NA,NA,10 176 | "2002-06-24",NA,NA,10 177 | "2002-06-25",0.387,0.305,10 178 | "2002-06-26",NA,NA,10 179 | "2002-06-27",NA,NA,10 180 | "2002-06-28",NA,0.401,10 181 | "2002-06-29",NA,NA,10 182 | "2002-06-30",NA,NA,10 183 | "2002-07-01",NA,NA,10 184 | "2002-07-02",NA,NA,10 185 | "2002-07-03",NA,NA,10 186 | "2002-07-04",NA,NA,10 187 | "2002-07-05",NA,NA,10 188 | "2002-07-06",NA,NA,10 189 | "2002-07-07",1.53,0.373,10 190 | "2002-07-08",NA,NA,10 191 | "2002-07-09",NA,NA,10 192 | "2002-07-10",NA,NA,10 193 | "2002-07-11",NA,NA,10 194 | "2002-07-12",NA,NA,10 195 | "2002-07-13",0.862,0.37,10 196 | "2002-07-14",NA,NA,10 197 | "2002-07-15",NA,NA,10 198 | "2002-07-16",0.706,0.404,10 199 | "2002-07-17",NA,NA,10 200 | "2002-07-18",NA,NA,10 201 | "2002-07-19",NA,NA,10 202 | "2002-07-20",NA,NA,10 203 | "2002-07-21",NA,NA,10 204 | "2002-07-22",0.394,0.219,10 205 | "2002-07-23",NA,NA,10 206 | "2002-07-24",NA,NA,10 207 | "2002-07-25",0.966,0.376,10 208 | "2002-07-26",NA,NA,10 209 | "2002-07-27",NA,NA,10 210 | "2002-07-28",0.766,0.393,10 211 | "2002-07-29",NA,NA,10 212 | "2002-07-30",NA,NA,10 213 | "2002-07-31",0.413,0.21,10 214 | "2002-08-01",NA,NA,10 215 | "2002-08-02",NA,NA,10 216 | "2002-08-03",NA,NA,10 217 | "2002-08-04",NA,NA,10 218 | "2002-08-05",NA,NA,10 219 | "2002-08-06",NA,NA,10 220 | "2002-08-07",NA,NA,10 221 | "2002-08-08",NA,NA,10 222 | "2002-08-09",NA,NA,10 223 | "2002-08-10",NA,NA,10 224 | "2002-08-11",NA,NA,10 225 | "2002-08-12",NA,NA,10 226 | "2002-08-13",NA,NA,10 227 | "2002-08-14",NA,NA,10 228 | "2002-08-15",NA,NA,10 229 | "2002-08-16",NA,NA,10 230 | "2002-08-17",NA,NA,10 231 | "2002-08-18",0.839,0.315,10 232 | "2002-08-19",NA,NA,10 233 | "2002-08-20",NA,NA,10 234 | "2002-08-21",NA,NA,10 235 | "2002-08-22",NA,NA,10 236 | "2002-08-23",NA,NA,10 237 | "2002-08-24",0.258,0.216,10 238 | "2002-08-25",NA,NA,10 239 | "2002-08-26",NA,NA,10 240 | "2002-08-27",NA,0.687,10 241 | "2002-08-28",NA,NA,10 242 | "2002-08-29",NA,NA,10 243 | "2002-08-30",0.445,0.203,10 244 | "2002-08-31",NA,NA,10 245 | "2002-09-01",NA,NA,10 246 | "2002-09-02",NA,NA,10 247 | "2002-09-03",NA,NA,10 248 | "2002-09-04",NA,NA,10 249 | "2002-09-05",NA,0.236,10 250 | "2002-09-06",NA,NA,10 251 | "2002-09-07",NA,NA,10 252 | "2002-09-08",NA,NA,10 253 | "2002-09-09",NA,NA,10 254 | "2002-09-10",NA,NA,10 255 | "2002-09-11",0.247,0.0751,10 256 | "2002-09-12",NA,NA,10 257 | "2002-09-13",NA,NA,10 258 | "2002-09-14",NA,NA,10 259 | "2002-09-15",NA,NA,10 260 | "2002-09-16",NA,NA,10 261 | "2002-09-17",NA,0.231,10 262 | "2002-09-18",NA,NA,10 263 | "2002-09-19",NA,NA,10 264 | "2002-09-20",NA,NA,10 265 | "2002-09-21",NA,NA,10 266 | "2002-09-22",NA,NA,10 267 | "2002-09-23",0.433,0.246,10 268 | "2002-09-24",NA,NA,10 269 | "2002-09-25",NA,NA,10 270 | "2002-09-26",NA,0.343,10 271 | "2002-09-27",NA,NA,10 272 | "2002-09-28",NA,NA,10 273 | "2002-09-29",NA,0.235,10 274 | "2002-09-30",NA,NA,10 275 | "2002-10-01",NA,NA,10 276 | "2002-10-02",NA,0.319,10 277 | "2002-10-03",NA,NA,10 278 | "2002-10-04",NA,NA,10 279 | "2002-10-05",NA,NA,10 280 | "2002-10-06",NA,NA,10 281 | "2002-10-07",NA,NA,10 282 | "2002-10-08",NA,NA,10 283 | "2002-10-09",NA,NA,10 284 | "2002-10-10",NA,NA,10 285 | "2002-10-11",NA,NA,10 286 | "2002-10-12",NA,NA,10 287 | "2002-10-13",NA,NA,10 288 | "2002-10-14",NA,NA,10 289 | "2002-10-15",NA,NA,10 290 | "2002-10-16",NA,NA,10 291 | "2002-10-17",0.402,0.556,10 292 | "2002-10-18",NA,NA,10 293 | "2002-10-19",NA,NA,10 294 | "2002-10-20",0.336,0.411,10 295 | "2002-10-21",NA,NA,10 
296 | "2002-10-22",NA,NA,10 297 | "2002-10-23",NA,NA,10 298 | "2002-10-24",NA,NA,10 299 | "2002-10-25",NA,NA,10 300 | "2002-10-26",NA,0.547,10 301 | "2002-10-27",NA,NA,10 302 | "2002-10-28",NA,NA,10 303 | "2002-10-29",0.197,0.159,10 304 | "2002-10-30",NA,NA,10 305 | "2002-10-31",NA,NA,10 306 | "2002-11-01",NA,NA,10 307 | "2002-11-02",NA,0.644,10 308 | "2002-11-03",NA,NA,10 309 | "2002-11-04",NA,NA,10 310 | "2002-11-05",NA,NA,10 311 | "2002-11-06",NA,NA,10 312 | "2002-11-07",NA,NA,10 313 | "2002-11-08",NA,NA,10 314 | "2002-11-09",NA,NA,10 315 | "2002-11-10",0.387,0.669,10 316 | "2002-11-11",NA,NA,10 317 | "2002-11-12",NA,NA,10 318 | "2002-11-13",0.587,0.973,10 319 | "2002-11-14",NA,NA,10 320 | "2002-11-15",NA,NA,10 321 | "2002-11-16",0.365,2.11,10 322 | "2002-11-17",NA,NA,10 323 | "2002-11-18",NA,NA,10 324 | "2002-11-19",NA,1.24,10 325 | "2002-11-20",NA,NA,10 326 | "2002-11-21",NA,NA,10 327 | "2002-11-22",NA,NA,10 328 | "2002-11-23",NA,NA,10 329 | "2002-11-24",NA,NA,10 330 | "2002-11-25",NA,0.394,10 331 | "2002-11-26",NA,NA,10 332 | "2002-11-27",NA,NA,10 333 | "2002-11-28",NA,NA,10 334 | "2002-11-29",NA,NA,10 335 | "2002-11-30",NA,NA,10 336 | "2002-12-01",NA,NA,10 337 | "2002-12-02",NA,NA,10 338 | "2002-12-03",NA,NA,10 339 | "2002-12-04",NA,0.563,10 340 | "2002-12-05",NA,NA,10 341 | "2002-12-06",NA,NA,10 342 | "2002-12-07",0.414,0.557,10 343 | "2002-12-08",NA,NA,10 344 | "2002-12-09",NA,NA,10 345 | "2002-12-10",NA,NA,10 346 | "2002-12-11",NA,NA,10 347 | "2002-12-12",NA,NA,10 348 | "2002-12-13",0.434,1.2,10 349 | "2002-12-14",NA,NA,10 350 | "2002-12-15",NA,NA,10 351 | "2002-12-16",NA,NA,10 352 | "2002-12-17",NA,1.49,10 353 | "2002-12-18",NA,NA,10 354 | "2002-12-19",0.753,1.33,10 355 | "2002-12-20",NA,NA,10 356 | "2002-12-21",NA,NA,10 357 | "2002-12-22",NA,NA,10 358 | "2002-12-23",NA,NA,10 359 | "2002-12-24",NA,NA,10 360 | "2002-12-25",NA,1.01,10 361 | "2002-12-26",NA,NA,10 362 | "2002-12-27",NA,NA,10 363 | "2002-12-28",NA,NA,10 364 | "2002-12-29",NA,NA,10 365 | "2002-12-30",NA,NA,10 366 | "2002-12-31",0.752,2.16,10 367 | "2003-01-01",NA,NA,10 368 | "2003-01-02",NA,NA,10 369 | "2003-01-03",0.386,0.863,10 370 | "2003-01-04",NA,NA,10 371 | "2003-01-05",NA,NA,10 372 | "2003-01-06",NA,NA,10 373 | "2003-01-07",NA,NA,10 374 | "2003-01-08",NA,NA,10 375 | "2003-01-09",0.349,0.822,10 376 | "2003-01-10",NA,NA,10 377 | "2003-01-11",NA,NA,10 378 | "2003-01-12",NA,0.931,10 379 | "2003-01-13",NA,NA,10 380 | "2003-01-14",NA,NA,10 381 | "2003-01-15",0.393,0.659,10 382 | "2003-01-16",NA,NA,10 383 | "2003-01-17",NA,NA,10 384 | "2003-01-18",NA,1.13,10 385 | "2003-01-19",NA,NA,10 386 | "2003-01-20",NA,NA,10 387 | "2003-01-21",NA,NA,10 388 | "2003-01-22",NA,NA,10 389 | "2003-01-23",NA,NA,10 390 | "2003-01-24",NA,NA,10 391 | "2003-01-25",NA,NA,10 392 | "2003-01-26",NA,NA,10 393 | "2003-01-27",NA,0.576,10 394 | "2003-01-28",NA,NA,10 395 | "2003-01-29",NA,NA,10 396 | "2003-01-30",0.392,1.12,10 397 | "2003-01-31",NA,NA,10 398 | "2003-02-01",NA,NA,10 399 | "2003-02-02",NA,1.24,10 400 | "2003-02-03",NA,NA,10 401 | "2003-02-04",NA,NA,10 402 | "2003-02-05",NA,NA,10 403 | "2003-02-06",NA,NA,10 404 | "2003-02-07",NA,NA,10 405 | "2003-02-08",0.253,0.503,10 406 | "2003-02-09",NA,NA,10 407 | "2003-02-10",NA,NA,10 408 | "2003-02-11",0.74,0.62,10 409 | "2003-02-12",NA,NA,10 410 | "2003-02-13",NA,NA,10 411 | "2003-02-14",NA,0.746,10 412 | "2003-02-15",NA,NA,10 413 | "2003-02-16",NA,NA,10 414 | "2003-02-17",NA,NA,10 415 | "2003-02-18",NA,NA,10 416 | "2003-02-19",NA,NA,10 417 | "2003-02-20",NA,0.351,10 418 | "2003-02-21",NA,NA,10 
419 | "2003-02-22",NA,NA,10 420 | "2003-02-23",0.237,0.192,10 421 | "2003-02-24",NA,NA,10 422 | "2003-02-25",NA,NA,10 423 | "2003-02-26",NA,NA,10 424 | "2003-02-27",NA,NA,10 425 | "2003-02-28",NA,NA,10 426 | "2003-03-01",NA,NA,10 427 | "2003-03-02",NA,NA,10 428 | "2003-03-03",NA,NA,10 429 | "2003-03-04",0.665,0.583,10 430 | "2003-03-05",NA,NA,10 431 | "2003-03-06",NA,NA,10 432 | "2003-03-07",0.514,0.543,10 433 | "2003-03-08",NA,NA,10 434 | "2003-03-09",NA,NA,10 435 | "2003-03-10",0.86,0.132,10 436 | "2003-03-11",NA,NA,10 437 | "2003-03-12",NA,NA,10 438 | "2003-03-13",1.9,0.205,10 439 | "2003-03-14",NA,NA,10 440 | "2003-03-15",NA,NA,10 441 | "2003-03-16",0.488,0.461,10 442 | "2003-03-17",NA,NA,10 443 | "2003-03-18",NA,NA,10 444 | "2003-03-19",NA,NA,10 445 | "2003-03-20",NA,NA,10 446 | "2003-03-21",NA,NA,10 447 | "2003-03-22",1.12,0.741,10 448 | "2003-03-23",NA,NA,10 449 | "2003-03-24",NA,NA,10 450 | "2003-03-25",NA,NA,10 451 | "2003-03-26",NA,NA,10 452 | "2003-03-27",NA,NA,10 453 | "2003-03-28",0.705,0.231,10 454 | "2003-03-29",NA,NA,10 455 | "2003-03-30",NA,NA,10 456 | "2003-03-31",NA,0.148,10 457 | "2003-04-01",NA,NA,10 458 | "2003-04-02",NA,NA,10 459 | "2003-04-03",0.766,0.629,10 460 | "2003-04-04",NA,NA,10 461 | "2003-04-05",NA,NA,10 462 | "2003-04-06",0.904,0.134,10 463 | "2003-04-07",NA,NA,10 464 | "2003-04-08",NA,NA,10 465 | "2003-04-09",0.577,0.414,10 466 | "2003-04-10",NA,NA,10 467 | "2003-04-11",NA,NA,10 468 | "2003-04-12",NA,NA,10 469 | "2003-04-13",NA,NA,10 470 | "2003-04-14",NA,NA,10 471 | "2003-04-15",NA,NA,10 472 | "2003-04-16",NA,NA,10 473 | "2003-04-17",NA,NA,10 474 | "2003-04-18",0.453,0.53,10 475 | "2003-04-19",NA,NA,10 476 | "2003-04-20",NA,NA,10 477 | "2003-04-21",0.143,0.134,10 478 | "2003-04-22",NA,NA,10 479 | "2003-04-23",NA,NA,10 480 | "2003-04-24",0.414,0.328,10 481 | "2003-04-25",NA,NA,10 482 | "2003-04-26",NA,NA,10 483 | "2003-04-27",NA,NA,10 484 | "2003-04-28",NA,NA,10 485 | "2003-04-29",NA,NA,10 486 | "2003-04-30",1.14,0.358,10 487 | "2003-05-01",NA,NA,10 488 | "2003-05-02",NA,NA,10 489 | "2003-05-03",NA,NA,10 490 | "2003-05-04",NA,NA,10 491 | "2003-05-05",NA,NA,10 492 | "2003-05-06",1.39,0.77,10 493 | "2003-05-07",NA,NA,10 494 | "2003-05-08",NA,NA,10 495 | "2003-05-09",NA,NA,10 496 | "2003-05-10",NA,NA,10 497 | "2003-05-11",NA,NA,10 498 | "2003-05-12",2.27,0.367,10 499 | "2003-05-13",NA,NA,10 500 | "2003-05-14",NA,NA,10 501 | "2003-05-15",0.991,0.321,10 502 | "2003-05-16",NA,NA,10 503 | "2003-05-17",NA,NA,10 504 | "2003-05-18",1.02,0.306,10 505 | "2003-05-19",NA,NA,10 506 | "2003-05-20",NA,NA,10 507 | "2003-05-21",0.783,0.358,10 508 | "2003-05-22",NA,NA,10 509 | "2003-05-23",NA,NA,10 510 | "2003-05-24",0.544,0.165,10 511 | "2003-05-25",NA,NA,10 512 | "2003-05-26",NA,NA,10 513 | "2003-05-27",NA,NA,10 514 | "2003-05-28",NA,NA,10 515 | "2003-05-29",NA,NA,10 516 | "2003-05-30",NA,NA,10 517 | "2003-05-31",NA,NA,10 518 | "2003-06-01",NA,NA,10 519 | "2003-06-02",NA,NA,10 520 | "2003-06-03",NA,NA,10 521 | "2003-06-04",NA,NA,10 522 | "2003-06-05",NA,NA,10 523 | "2003-06-06",NA,NA,10 524 | "2003-06-07",NA,NA,10 525 | "2003-06-08",NA,0.304,10 526 | "2003-06-09",NA,NA,10 527 | "2003-06-10",NA,NA,10 528 | "2003-06-11",NA,0.59,10 529 | "2003-06-12",NA,NA,10 530 | "2003-06-13",NA,NA,10 531 | "2003-06-14",0.46,0.218,10 532 | "2003-06-15",NA,NA,10 533 | "2003-06-16",NA,NA,10 534 | "2003-06-17",0.447,0.188,10 535 | "2003-06-18",NA,NA,10 536 | "2003-06-19",NA,NA,10 537 | "2003-06-20",0.769,0.323,10 538 | "2003-06-21",NA,NA,10 539 | "2003-06-22",NA,NA,10 540 | 
"2003-06-23",0.645,0.331,10 541 | "2003-06-24",NA,NA,10 542 | "2003-06-25",NA,NA,10 543 | "2003-06-26",NA,NA,10 544 | "2003-06-27",NA,NA,10 545 | "2003-06-28",NA,NA,10 546 | "2003-06-29",NA,NA,10 547 | "2003-06-30",NA,NA,10 548 | "2003-07-01",NA,NA,10 549 | "2003-07-02",0.571,0.202,10 550 | "2003-07-03",NA,NA,10 551 | "2003-07-04",NA,NA,10 552 | "2003-07-05",0.741,0.199,10 553 | "2003-07-06",NA,NA,10 554 | "2003-07-07",NA,NA,10 555 | "2003-07-08",0.417,0.15,10 556 | "2003-07-09",NA,NA,10 557 | "2003-07-10",NA,NA,10 558 | "2003-07-11",0.7,0.36,10 559 | "2003-07-12",NA,NA,10 560 | "2003-07-13",NA,NA,10 561 | "2003-07-14",NA,0.224,10 562 | "2003-07-15",NA,NA,10 563 | "2003-07-16",NA,NA,10 564 | "2003-07-17",NA,NA,10 565 | "2003-07-18",NA,NA,10 566 | "2003-07-19",NA,NA,10 567 | "2003-07-20",NA,NA,10 568 | "2003-07-21",NA,NA,10 569 | "2003-07-22",NA,NA,10 570 | "2003-07-23",0.754,0.279,10 571 | "2003-07-24",NA,NA,10 572 | "2003-07-25",NA,NA,10 573 | "2003-07-26",NA,NA,10 574 | "2003-07-27",NA,NA,10 575 | "2003-07-28",NA,NA,10 576 | "2003-07-29",0.365,0.129,10 577 | "2003-07-30",NA,NA,10 578 | "2003-07-31",NA,NA,10 579 | "2003-08-01",NA,NA,10 580 | "2003-08-02",NA,NA,10 581 | "2003-08-03",NA,NA,10 582 | "2003-08-04",NA,NA,10 583 | "2003-08-05",NA,0.199,10 584 | "2003-08-06",NA,NA,10 585 | "2003-08-07",NA,NA,10 586 | "2003-08-08",NA,NA,10 587 | "2003-08-09",NA,NA,10 588 | "2003-08-10",NA,NA,10 589 | "2003-08-11",NA,NA,10 590 | "2003-08-12",NA,NA,10 591 | "2003-08-13",NA,NA,10 592 | "2003-08-14",NA,NA,10 593 | "2003-08-15",NA,NA,10 594 | "2003-08-16",0.281,0.204,10 595 | "2003-08-17",NA,NA,10 596 | "2003-08-18",NA,NA,10 597 | "2003-08-19",0.768,0.254,10 598 | "2003-08-20",NA,NA,10 599 | "2003-08-21",NA,NA,10 600 | "2003-08-22",NA,NA,10 601 | "2003-08-23",NA,NA,10 602 | "2003-08-24",NA,NA,10 603 | "2003-08-25",NA,NA,10 604 | "2003-08-26",NA,NA,10 605 | "2003-08-27",NA,NA,10 606 | "2003-08-28",NA,0.563,10 607 | "2003-08-29",NA,NA,10 608 | "2003-08-30",NA,NA,10 609 | "2003-08-31",0.782,0.213,10 610 | "2003-09-01",NA,NA,10 611 | "2003-09-02",NA,NA,10 612 | "2003-09-03",0.455,0.219,10 613 | "2003-09-04",NA,NA,10 614 | "2003-09-05",NA,NA,10 615 | "2003-09-06",NA,NA,10 616 | "2003-09-07",NA,NA,10 617 | "2003-09-08",NA,NA,10 618 | "2003-09-09",NA,0.325,10 619 | "2003-09-10",NA,NA,10 620 | "2003-09-11",NA,NA,10 621 | "2003-09-12",0.509,0.294,10 622 | "2003-09-13",NA,NA,10 623 | "2003-09-14",NA,NA,10 624 | "2003-09-15",0.257,0.116,10 625 | "2003-09-16",NA,NA,10 626 | "2003-09-17",NA,NA,10 627 | "2003-09-18",NA,NA,10 628 | "2003-09-19",NA,NA,10 629 | "2003-09-20",NA,NA,10 630 | "2003-09-21",NA,NA,10 631 | "2003-09-22",NA,NA,10 632 | "2003-09-23",NA,NA,10 633 | "2003-09-24",NA,NA,10 634 | "2003-09-25",NA,NA,10 635 | "2003-09-26",NA,NA,10 636 | "2003-09-27",0.538,0.265,10 637 | "2003-09-28",NA,NA,10 638 | "2003-09-29",NA,NA,10 639 | "2003-09-30",0.367,0.161,10 640 | "2003-10-01",NA,NA,10 641 | "2003-10-02",NA,NA,10 642 | "2003-10-03",NA,NA,10 643 | "2003-10-04",NA,NA,10 644 | "2003-10-05",NA,NA,10 645 | "2003-10-06",NA,NA,10 646 | "2003-10-07",NA,NA,10 647 | "2003-10-08",NA,NA,10 648 | "2003-10-09",NA,NA,10 649 | "2003-10-10",NA,NA,10 650 | "2003-10-11",NA,NA,10 651 | "2003-10-12",0.321,0.673,10 652 | "2003-10-13",NA,NA,10 653 | "2003-10-14",NA,NA,10 654 | "2003-10-15",NA,NA,10 655 | "2003-10-16",NA,NA,10 656 | "2003-10-17",NA,NA,10 657 | "2003-10-18",0.349,0.173,10 658 | "2003-10-19",NA,NA,10 659 | "2003-10-20",0.299,0.43,10 660 | "2003-10-21",NA,NA,10 661 | "2003-10-22",NA,NA,10 662 | "2003-10-23",NA,NA,10 
663 | "2003-10-24",NA,NA,10 664 | "2003-10-25",NA,NA,10 665 | "2003-10-26",NA,NA,10 666 | "2003-10-27",NA,NA,10 667 | "2003-10-28",NA,NA,10 668 | "2003-10-29",NA,NA,10 669 | "2003-10-30",0.408,0.295,10 670 | "2003-10-31",NA,NA,10 671 | "2003-11-01",NA,NA,10 672 | "2003-11-02",0.265,0.628,10 673 | "2003-11-03",NA,NA,10 674 | "2003-11-04",NA,NA,10 675 | "2003-11-05",NA,NA,10 676 | "2003-11-06",NA,NA,10 677 | "2003-11-07",NA,NA,10 678 | "2003-11-08",NA,0.145,10 679 | "2003-11-09",NA,NA,10 680 | "2003-11-10",NA,NA,10 681 | "2003-11-11",NA,NA,10 682 | "2003-11-12",NA,NA,10 683 | "2003-11-13",NA,NA,10 684 | "2003-11-14",0.377,0.195,10 685 | "2003-11-15",NA,NA,10 686 | "2003-11-16",NA,NA,10 687 | "2003-11-17",0.47,0.782,10 688 | "2003-11-18",NA,NA,10 689 | "2003-11-19",NA,NA,10 690 | "2003-11-20",0.283,0.981,10 691 | "2003-11-21",NA,NA,10 692 | "2003-11-22",NA,NA,10 693 | "2003-11-23",0.268,0.389,10 694 | "2003-11-24",NA,NA,10 695 | "2003-11-25",NA,NA,10 696 | "2003-11-26",NA,0.307,10 697 | "2003-11-27",NA,NA,10 698 | "2003-11-28",NA,NA,10 699 | "2003-11-29",0.454,0.877,10 700 | "2003-11-30",NA,NA,10 701 | "2003-12-01",NA,NA,10 702 | "2003-12-02",1.08,0.936,10 703 | "2003-12-03",NA,NA,10 704 | "2003-12-04",NA,NA,10 705 | "2003-12-05",NA,NA,10 706 | "2003-12-06",NA,NA,10 707 | "2003-12-07",NA,NA,10 708 | "2003-12-08",0.324,0.856,10 709 | "2003-12-09",NA,NA,10 710 | "2003-12-10",NA,NA,10 711 | "2003-12-11",0.284,1.33,10 712 | "2003-12-12",NA,NA,10 713 | "2003-12-13",NA,NA,10 714 | "2003-12-14",0.272,2.31,10 715 | "2003-12-15",NA,NA,10 716 | "2003-12-16",NA,NA,10 717 | "2003-12-17",0.323,0.998,10 718 | "2003-12-18",NA,NA,10 719 | "2003-12-19",NA,NA,10 720 | "2003-12-20",0.541,1.37,10 721 | "2003-12-21",NA,NA,10 722 | "2003-12-22",NA,NA,10 723 | "2003-12-23",NA,NA,10 724 | "2003-12-24",NA,NA,10 725 | "2003-12-25",NA,NA,10 726 | "2003-12-26",0.778,1.69,10 727 | "2003-12-27",NA,NA,10 728 | "2003-12-28",NA,NA,10 729 | "2003-12-29",NA,NA,10 730 | "2003-12-30",0.201,0.386,10 731 | "2003-12-31",NA,NA,10 732 | "2004-01-01",NA,1.68,10 733 | "2004-01-02",NA,NA,10 734 | "2004-01-03",NA,NA,10 735 | "2004-01-04",1.65,2.09,10 736 | "2004-01-05",NA,NA,10 737 | "2004-01-06",NA,NA,10 738 | "2004-01-07",NA,NA,10 739 | "2004-01-08",NA,NA,10 740 | "2004-01-09",NA,NA,10 741 | "2004-01-10",0.206,0.289,10 742 | "2004-01-11",NA,NA,10 743 | "2004-01-12",NA,NA,10 744 | "2004-01-13",NA,NA,10 745 | "2004-01-14",NA,NA,10 746 | "2004-01-15",NA,NA,10 747 | "2004-01-16",NA,NA,10 748 | "2004-01-17",NA,NA,10 749 | "2004-01-18",NA,NA,10 750 | "2004-01-19",NA,NA,10 751 | "2004-01-20",NA,NA,10 752 | "2004-01-21",NA,NA,10 753 | "2004-01-22",0.208,0.357,10 754 | "2004-01-23",NA,NA,10 755 | "2004-01-24",NA,NA,10 756 | "2004-01-25",0.751,1.29,10 757 | "2004-01-26",NA,NA,10 758 | "2004-01-27",NA,NA,10 759 | "2004-01-28",0.601,0.59,10 760 | "2004-01-29",NA,NA,10 761 | "2004-01-30",NA,NA,10 762 | "2004-01-31",0.412,0.446,10 763 | "2004-02-01",NA,NA,10 764 | "2004-02-02",NA,NA,10 765 | "2004-02-03",NA,0.876,10 766 | "2004-02-04",NA,NA,10 767 | "2004-02-05",NA,NA,10 768 | "2004-02-06",0.541,0.524,10 769 | "2004-02-07",NA,NA,10 770 | "2004-02-08",NA,NA,10 771 | "2004-02-09",0.571,1.04,10 772 | "2004-02-10",NA,NA,10 773 | "2004-02-11",NA,NA,10 774 | "2004-02-12",0.407,0.69,10 775 | "2004-02-13",NA,NA,10 776 | "2004-02-14",NA,NA,10 777 | "2004-02-15",NA,1.28,10 778 | "2004-02-16",NA,NA,10 779 | "2004-02-17",NA,NA,10 780 | "2004-02-18",0.415,1.1,10 781 | "2004-02-19",NA,NA,10 782 | "2004-02-20",NA,NA,10 783 | "2004-02-21",0.227,0.716,10 784 | 
"2004-02-22",NA,NA,10 785 | "2004-02-23",NA,NA,10 786 | "2004-02-24",0.442,0.754,10 787 | "2004-02-25",NA,NA,10 788 | "2004-02-26",NA,NA,10 789 | "2004-02-27",NA,NA,10 790 | "2004-02-28",NA,NA,10 791 | "2004-02-29",NA,NA,10 792 | "2004-03-01",NA,NA,10 793 | "2004-03-02",NA,NA,10 794 | "2004-03-03",NA,NA,10 795 | "2004-03-04",0.311,1.39,10 796 | "2004-03-05",NA,NA,10 797 | "2004-03-06",NA,NA,10 798 | "2004-03-07",0.425,0.564,10 799 | "2004-03-08",NA,NA,10 800 | "2004-03-09",NA,NA,10 801 | "2004-03-10",0.446,0.615,10 802 | "2004-03-11",NA,NA,10 803 | "2004-03-12",NA,NA,10 804 | "2004-03-13",NA,NA,10 805 | "2004-03-14",NA,NA,10 806 | "2004-03-15",NA,NA,10 807 | "2004-03-16",NA,NA,10 808 | "2004-03-17",NA,NA,10 809 | "2004-03-18",NA,NA,10 810 | "2004-03-19",0.486,0.213,10 811 | "2004-03-20",NA,NA,10 812 | "2004-03-21",NA,NA,10 813 | "2004-03-22",NA,NA,10 814 | "2004-03-23",0.891,1.29,10 815 | "2004-03-24",NA,NA,10 816 | "2004-03-25",NA,NA,10 817 | "2004-03-26",NA,NA,10 818 | "2004-03-27",NA,NA,10 819 | "2004-03-28",0.686,0.739,10 820 | "2004-03-29",NA,NA,10 821 | "2004-03-30",NA,NA,10 822 | "2004-03-31",0.983,0.77,10 823 | "2004-04-01",NA,NA,10 824 | "2004-04-02",NA,NA,10 825 | "2004-04-03",0.471,0.753,10 826 | "2004-04-04",NA,NA,10 827 | "2004-04-05",NA,NA,10 828 | "2004-04-06",0.305,0.818,10 829 | "2004-04-07",NA,NA,10 830 | "2004-04-08",NA,NA,10 831 | "2004-04-09",NA,NA,10 832 | "2004-04-10",NA,NA,10 833 | "2004-04-11",NA,NA,10 834 | "2004-04-12",0.458,1.35,10 835 | "2004-04-13",NA,NA,10 836 | "2004-04-14",NA,NA,10 837 | "2004-04-15",0.625,0.545,10 838 | "2004-04-16",NA,NA,10 839 | "2004-04-17",NA,NA,10 840 | "2004-04-18",0.442,0.641,10 841 | "2004-04-19",NA,NA,10 842 | "2004-04-20",NA,NA,10 843 | "2004-04-21",0.48,0.717,10 844 | "2004-04-22",NA,NA,10 845 | "2004-04-23",NA,NA,10 846 | "2004-04-24",NA,0.355,10 847 | "2004-04-25",NA,NA,10 848 | "2004-04-26",NA,NA,10 849 | "2004-04-27",0.117,0.0636,10 850 | "2004-04-28",NA,NA,10 851 | "2004-04-29",NA,NA,10 852 | "2004-04-30",NA,0.163,10 853 | "2004-05-01",NA,NA,10 854 | "2004-05-02",NA,NA,10 855 | "2004-05-03",0.352,0.388,10 856 | "2004-05-04",NA,NA,10 857 | "2004-05-05",NA,NA,10 858 | "2004-05-06",0.769,0.585,10 859 | "2004-05-07",NA,NA,10 860 | "2004-05-08",NA,NA,10 861 | "2004-05-09",NA,NA,10 862 | "2004-05-10",NA,NA,10 863 | "2004-05-11",NA,NA,10 864 | "2004-05-12",1.01,1.73,10 865 | "2004-05-13",NA,NA,10 866 | "2004-05-14",NA,NA,10 867 | "2004-05-15",0.667,0.28,10 868 | "2004-05-16",NA,NA,10 869 | "2004-05-17",NA,NA,10 870 | "2004-05-18",0.415,0.381,10 871 | "2004-05-19",NA,NA,10 872 | "2004-05-20",NA,NA,10 873 | "2004-05-21",1.53,0.44,10 874 | "2004-05-22",NA,NA,10 875 | "2004-05-23",NA,NA,10 876 | "2004-05-24",NA,NA,10 877 | "2004-05-25",NA,NA,10 878 | "2004-05-26",NA,NA,10 879 | "2004-05-27",0.394,0.281,10 880 | "2004-05-28",NA,NA,10 881 | "2004-05-29",NA,NA,10 882 | "2004-05-30",0.349,0.222,10 883 | "2004-05-31",NA,NA,10 884 | "2004-06-01",NA,NA,10 885 | "2004-06-02",NA,NA,10 886 | "2004-06-03",NA,NA,10 887 | "2004-06-04",NA,NA,10 888 | "2004-06-05",NA,NA,10 889 | "2004-06-06",NA,NA,10 890 | "2004-06-07",NA,NA,10 891 | "2004-06-08",NA,NA,10 892 | "2004-06-09",NA,NA,10 893 | "2004-06-10",NA,NA,10 894 | "2004-06-11",NA,NA,10 895 | "2004-06-12",NA,NA,10 896 | "2004-06-13",NA,NA,10 897 | "2004-06-14",NA,NA,10 898 | "2004-06-15",NA,NA,10 899 | "2004-06-16",NA,NA,10 900 | "2004-06-17",NA,NA,10 901 | "2004-06-18",NA,NA,10 902 | "2004-06-19",NA,NA,10 903 | "2004-06-20",NA,NA,10 904 | "2004-06-21",NA,NA,10 905 | "2004-06-22",NA,NA,10 906 | 
"2004-06-23",NA,NA,10 907 | "2004-06-24",NA,NA,10 908 | "2004-06-25",NA,NA,10 909 | "2004-06-26",NA,NA,10 910 | "2004-06-27",NA,NA,10 911 | "2004-06-28",NA,NA,10 912 | "2004-06-29",NA,NA,10 913 | "2004-06-30",NA,NA,10 914 | "2004-07-01",NA,NA,10 915 | "2004-07-02",NA,NA,10 916 | "2004-07-03",NA,NA,10 917 | "2004-07-04",NA,NA,10 918 | "2004-07-05",NA,NA,10 919 | "2004-07-06",NA,NA,10 920 | "2004-07-07",NA,NA,10 921 | "2004-07-08",NA,NA,10 922 | "2004-07-09",NA,NA,10 923 | "2004-07-10",NA,NA,10 924 | "2004-07-11",NA,NA,10 925 | "2004-07-12",NA,NA,10 926 | "2004-07-13",NA,NA,10 927 | "2004-07-14",NA,NA,10 928 | "2004-07-15",NA,NA,10 929 | "2004-07-16",NA,NA,10 930 | "2004-07-17",NA,NA,10 931 | "2004-07-18",NA,NA,10 932 | "2004-07-19",NA,NA,10 933 | "2004-07-20",NA,NA,10 934 | "2004-07-21",NA,NA,10 935 | "2004-07-22",NA,NA,10 936 | "2004-07-23",NA,NA,10 937 | "2004-07-24",NA,NA,10 938 | "2004-07-25",NA,NA,10 939 | "2004-07-26",NA,NA,10 940 | "2004-07-27",NA,NA,10 941 | "2004-07-28",NA,NA,10 942 | "2004-07-29",NA,NA,10 943 | "2004-07-30",NA,NA,10 944 | "2004-07-31",NA,NA,10 945 | "2004-08-01",NA,NA,10 946 | "2004-08-02",NA,NA,10 947 | "2004-08-03",NA,NA,10 948 | "2004-08-04",NA,NA,10 949 | "2004-08-05",NA,NA,10 950 | "2004-08-06",NA,NA,10 951 | "2004-08-07",NA,NA,10 952 | "2004-08-08",NA,NA,10 953 | "2004-08-09",NA,NA,10 954 | "2004-08-10",NA,NA,10 955 | "2004-08-11",NA,NA,10 956 | "2004-08-12",NA,NA,10 957 | "2004-08-13",NA,NA,10 958 | "2004-08-14",NA,NA,10 959 | "2004-08-15",NA,NA,10 960 | "2004-08-16",NA,NA,10 961 | "2004-08-17",NA,NA,10 962 | "2004-08-18",NA,NA,10 963 | "2004-08-19",NA,NA,10 964 | "2004-08-20",NA,NA,10 965 | "2004-08-21",NA,NA,10 966 | "2004-08-22",NA,NA,10 967 | "2004-08-23",NA,NA,10 968 | "2004-08-24",NA,NA,10 969 | "2004-08-25",NA,NA,10 970 | "2004-08-26",NA,NA,10 971 | "2004-08-27",NA,NA,10 972 | "2004-08-28",NA,NA,10 973 | "2004-08-29",NA,NA,10 974 | "2004-08-30",NA,NA,10 975 | "2004-08-31",NA,NA,10 976 | "2004-09-01",NA,NA,10 977 | "2004-09-02",NA,NA,10 978 | "2004-09-03",NA,NA,10 979 | "2004-09-04",NA,NA,10 980 | "2004-09-05",NA,NA,10 981 | "2004-09-06",NA,NA,10 982 | "2004-09-07",NA,NA,10 983 | "2004-09-08",NA,NA,10 984 | "2004-09-09",NA,NA,10 985 | "2004-09-10",NA,NA,10 986 | "2004-09-11",NA,NA,10 987 | "2004-09-12",NA,NA,10 988 | "2004-09-13",NA,NA,10 989 | "2004-09-14",NA,NA,10 990 | "2004-09-15",NA,NA,10 991 | "2004-09-16",NA,NA,10 992 | "2004-09-17",NA,NA,10 993 | "2004-09-18",NA,NA,10 994 | "2004-09-19",NA,NA,10 995 | "2004-09-20",NA,NA,10 996 | "2004-09-21",NA,NA,10 997 | "2004-09-22",NA,NA,10 998 | "2004-09-23",NA,NA,10 999 | "2004-09-24",NA,NA,10 1000 | "2004-09-25",NA,NA,10 1001 | "2004-09-26",NA,NA,10 1002 | "2004-09-27",NA,NA,10 1003 | "2004-09-28",NA,NA,10 1004 | "2004-09-29",NA,NA,10 1005 | "2004-09-30",NA,NA,10 1006 | "2004-10-01",NA,NA,10 1007 | "2004-10-02",NA,NA,10 1008 | "2004-10-03",NA,NA,10 1009 | "2004-10-04",NA,NA,10 1010 | "2004-10-05",NA,NA,10 1011 | "2004-10-06",NA,NA,10 1012 | "2004-10-07",NA,NA,10 1013 | "2004-10-08",NA,NA,10 1014 | "2004-10-09",NA,NA,10 1015 | "2004-10-10",NA,NA,10 1016 | "2004-10-11",NA,NA,10 1017 | "2004-10-12",NA,NA,10 1018 | "2004-10-13",NA,NA,10 1019 | "2004-10-14",NA,NA,10 1020 | "2004-10-15",NA,NA,10 1021 | "2004-10-16",NA,NA,10 1022 | "2004-10-17",NA,NA,10 1023 | "2004-10-18",NA,NA,10 1024 | "2004-10-19",NA,NA,10 1025 | "2004-10-20",NA,NA,10 1026 | "2004-10-21",NA,NA,10 1027 | "2004-10-22",NA,NA,10 1028 | "2004-10-23",NA,NA,10 1029 | "2004-10-24",NA,NA,10 1030 | "2004-10-25",NA,NA,10 1031 | "2004-10-26",NA,NA,10 
1032 | "2004-10-27",NA,NA,10 1033 | "2004-10-28",NA,NA,10 1034 | "2004-10-29",NA,NA,10 1035 | "2004-10-30",NA,NA,10 1036 | "2004-10-31",NA,NA,10 1037 | "2004-11-01",NA,NA,10 1038 | "2004-11-02",NA,NA,10 1039 | "2004-11-03",NA,NA,10 1040 | "2004-11-04",NA,NA,10 1041 | "2004-11-05",NA,NA,10 1042 | "2004-11-06",NA,NA,10 1043 | "2004-11-07",NA,NA,10 1044 | "2004-11-08",NA,NA,10 1045 | "2004-11-09",NA,NA,10 1046 | "2004-11-10",NA,NA,10 1047 | "2004-11-11",NA,NA,10 1048 | "2004-11-12",NA,NA,10 1049 | "2004-11-13",NA,NA,10 1050 | "2004-11-14",NA,NA,10 1051 | "2004-11-15",NA,NA,10 1052 | "2004-11-16",NA,NA,10 1053 | "2004-11-17",NA,NA,10 1054 | "2004-11-18",NA,NA,10 1055 | "2004-11-19",NA,NA,10 1056 | "2004-11-20",NA,NA,10 1057 | "2004-11-21",NA,NA,10 1058 | "2004-11-22",NA,NA,10 1059 | "2004-11-23",NA,NA,10 1060 | "2004-11-24",NA,NA,10 1061 | "2004-11-25",NA,NA,10 1062 | "2004-11-26",NA,NA,10 1063 | "2004-11-27",NA,NA,10 1064 | "2004-11-28",NA,NA,10 1065 | "2004-11-29",NA,NA,10 1066 | "2004-11-30",NA,NA,10 1067 | "2004-12-01",NA,NA,10 1068 | "2004-12-02",NA,NA,10 1069 | "2004-12-03",NA,NA,10 1070 | "2004-12-04",NA,NA,10 1071 | "2004-12-05",NA,NA,10 1072 | "2004-12-06",NA,NA,10 1073 | "2004-12-07",NA,NA,10 1074 | "2004-12-08",NA,NA,10 1075 | "2004-12-09",NA,NA,10 1076 | "2004-12-10",NA,NA,10 1077 | "2004-12-11",NA,NA,10 1078 | "2004-12-12",NA,NA,10 1079 | "2004-12-13",NA,NA,10 1080 | "2004-12-14",NA,NA,10 1081 | "2004-12-15",NA,NA,10 1082 | "2004-12-16",NA,NA,10 1083 | "2004-12-17",NA,NA,10 1084 | "2004-12-18",NA,NA,10 1085 | "2004-12-19",NA,NA,10 1086 | "2004-12-20",NA,NA,10 1087 | "2004-12-21",NA,NA,10 1088 | "2004-12-22",NA,NA,10 1089 | "2004-12-23",NA,NA,10 1090 | "2004-12-24",NA,NA,10 1091 | "2004-12-25",NA,NA,10 1092 | "2004-12-26",NA,NA,10 1093 | "2004-12-27",NA,NA,10 1094 | "2004-12-28",NA,NA,10 1095 | "2004-12-29",NA,NA,10 1096 | "2004-12-30",NA,NA,10 1097 | "2004-12-31",NA,NA,10 1098 | -------------------------------------------------------------------------------- /datasets/lotsofdata/012.csv: -------------------------------------------------------------------------------- 1 | "Date","sulfate","nitrate","ID" 2 | "2004-01-01",NA,NA,12 3 | "2004-01-02",NA,NA,12 4 | "2004-01-03",NA,NA,12 5 | "2004-01-04",NA,NA,12 6 | "2004-01-05",NA,NA,12 7 | "2004-01-06",NA,NA,12 8 | "2004-01-07",NA,NA,12 9 | "2004-01-08",NA,NA,12 10 | "2004-01-09",NA,NA,12 11 | "2004-01-10",NA,NA,12 12 | "2004-01-11",NA,NA,12 13 | "2004-01-12",NA,NA,12 14 | "2004-01-13",NA,NA,12 15 | "2004-01-14",NA,NA,12 16 | "2004-01-15",NA,NA,12 17 | "2004-01-16",NA,NA,12 18 | "2004-01-17",NA,NA,12 19 | "2004-01-18",NA,NA,12 20 | "2004-01-19",NA,NA,12 21 | "2004-01-20",NA,NA,12 22 | "2004-01-21",NA,NA,12 23 | "2004-01-22",NA,NA,12 24 | "2004-01-23",NA,NA,12 25 | "2004-01-24",NA,NA,12 26 | "2004-01-25",NA,NA,12 27 | "2004-01-26",NA,NA,12 28 | "2004-01-27",NA,NA,12 29 | "2004-01-28",NA,NA,12 30 | "2004-01-29",NA,NA,12 31 | "2004-01-30",NA,NA,12 32 | "2004-01-31",NA,NA,12 33 | "2004-02-01",NA,NA,12 34 | "2004-02-02",NA,NA,12 35 | "2004-02-03",NA,NA,12 36 | "2004-02-04",NA,NA,12 37 | "2004-02-05",NA,NA,12 38 | "2004-02-06",NA,NA,12 39 | "2004-02-07",NA,NA,12 40 | "2004-02-08",NA,NA,12 41 | "2004-02-09",NA,NA,12 42 | "2004-02-10",NA,NA,12 43 | "2004-02-11",NA,NA,12 44 | "2004-02-12",NA,NA,12 45 | "2004-02-13",NA,NA,12 46 | "2004-02-14",NA,NA,12 47 | "2004-02-15",NA,NA,12 48 | "2004-02-16",NA,NA,12 49 | "2004-02-17",NA,NA,12 50 | "2004-02-18",NA,NA,12 51 | "2004-02-19",NA,NA,12 52 | "2004-02-20",NA,NA,12 53 | "2004-02-21",NA,NA,12 54 | 
"2004-02-22",NA,NA,12 55 | "2004-02-23",NA,NA,12 56 | "2004-02-24",NA,NA,12 57 | "2004-02-25",NA,NA,12 58 | "2004-02-26",NA,NA,12 59 | "2004-02-27",NA,NA,12 60 | "2004-02-28",NA,NA,12 61 | "2004-02-29",NA,NA,12 62 | "2004-03-01",NA,NA,12 63 | "2004-03-02",NA,NA,12 64 | "2004-03-03",NA,NA,12 65 | "2004-03-04",NA,NA,12 66 | "2004-03-05",NA,NA,12 67 | "2004-03-06",NA,NA,12 68 | "2004-03-07",NA,NA,12 69 | "2004-03-08",NA,NA,12 70 | "2004-03-09",NA,NA,12 71 | "2004-03-10",NA,NA,12 72 | "2004-03-11",NA,NA,12 73 | "2004-03-12",NA,NA,12 74 | "2004-03-13",NA,NA,12 75 | "2004-03-14",NA,NA,12 76 | "2004-03-15",NA,NA,12 77 | "2004-03-16",NA,NA,12 78 | "2004-03-17",NA,NA,12 79 | "2004-03-18",NA,NA,12 80 | "2004-03-19",NA,NA,12 81 | "2004-03-20",NA,NA,12 82 | "2004-03-21",NA,NA,12 83 | "2004-03-22",NA,NA,12 84 | "2004-03-23",NA,NA,12 85 | "2004-03-24",NA,NA,12 86 | "2004-03-25",NA,NA,12 87 | "2004-03-26",NA,NA,12 88 | "2004-03-27",NA,NA,12 89 | "2004-03-28",0.0353,0.0598,12 90 | "2004-03-29",NA,NA,12 91 | "2004-03-30",NA,NA,12 92 | "2004-03-31",NA,NA,12 93 | "2004-04-01",NA,NA,12 94 | "2004-04-02",NA,NA,12 95 | "2004-04-03",NA,NA,12 96 | "2004-04-04",NA,NA,12 97 | "2004-04-05",NA,NA,12 98 | "2004-04-06",NA,NA,12 99 | "2004-04-07",NA,NA,12 100 | "2004-04-08",NA,NA,12 101 | "2004-04-09",1.33,0.95,12 102 | "2004-04-10",NA,NA,12 103 | "2004-04-11",NA,NA,12 104 | "2004-04-12",NA,NA,12 105 | "2004-04-13",NA,NA,12 106 | "2004-04-14",NA,NA,12 107 | "2004-04-15",1.29,0.255,12 108 | "2004-04-16",NA,NA,12 109 | "2004-04-17",NA,NA,12 110 | "2004-04-18",NA,NA,12 111 | "2004-04-19",NA,NA,12 112 | "2004-04-20",NA,NA,12 113 | "2004-04-21",1.94,0.853,12 114 | "2004-04-22",NA,NA,12 115 | "2004-04-23",NA,NA,12 116 | "2004-04-24",NA,NA,12 117 | "2004-04-25",NA,NA,12 118 | "2004-04-26",NA,NA,12 119 | "2004-04-27",1.83,0.348,12 120 | "2004-04-28",NA,NA,12 121 | "2004-04-29",NA,NA,12 122 | "2004-04-30",NA,NA,12 123 | "2004-05-01",NA,NA,12 124 | "2004-05-02",NA,NA,12 125 | "2004-05-03",1.16,0.275,12 126 | "2004-05-04",NA,NA,12 127 | "2004-05-05",NA,NA,12 128 | "2004-05-06",NA,NA,12 129 | "2004-05-07",NA,NA,12 130 | "2004-05-08",NA,NA,12 131 | "2004-05-09",1.65,0.438,12 132 | "2004-05-10",NA,NA,12 133 | "2004-05-11",NA,NA,12 134 | "2004-05-12",NA,NA,12 135 | "2004-05-13",NA,NA,12 136 | "2004-05-14",NA,NA,12 137 | "2004-05-15",1.26,0.486,12 138 | "2004-05-16",NA,NA,12 139 | "2004-05-17",NA,NA,12 140 | "2004-05-18",NA,NA,12 141 | "2004-05-19",NA,NA,12 142 | "2004-05-20",NA,NA,12 143 | "2004-05-21",0.804,0.383,12 144 | "2004-05-22",NA,NA,12 145 | "2004-05-23",NA,NA,12 146 | "2004-05-24",NA,NA,12 147 | "2004-05-25",NA,NA,12 148 | "2004-05-26",NA,NA,12 149 | "2004-05-27",1.34,0.692,12 150 | "2004-05-28",NA,NA,12 151 | "2004-05-29",NA,NA,12 152 | "2004-05-30",NA,NA,12 153 | "2004-05-31",NA,NA,12 154 | "2004-06-01",NA,NA,12 155 | "2004-06-02",1.35,0.446,12 156 | "2004-06-03",NA,NA,12 157 | "2004-06-04",NA,NA,12 158 | "2004-06-05",NA,NA,12 159 | "2004-06-06",NA,NA,12 160 | "2004-06-07",NA,NA,12 161 | "2004-06-08",1.04,0.422,12 162 | "2004-06-09",NA,NA,12 163 | "2004-06-10",NA,NA,12 164 | "2004-06-11",NA,NA,12 165 | "2004-06-12",NA,NA,12 166 | "2004-06-13",NA,NA,12 167 | "2004-06-14",1.28,0.307,12 168 | "2004-06-15",NA,NA,12 169 | "2004-06-16",NA,NA,12 170 | "2004-06-17",NA,NA,12 171 | "2004-06-18",NA,NA,12 172 | "2004-06-19",NA,NA,12 173 | "2004-06-20",0.561,0.184,12 174 | "2004-06-21",NA,NA,12 175 | "2004-06-22",NA,NA,12 176 | "2004-06-23",NA,NA,12 177 | "2004-06-24",NA,NA,12 178 | "2004-06-25",NA,NA,12 179 | "2004-06-26",1.15,0.36,12 
180 | "2004-06-27",NA,NA,12 181 | "2004-06-28",NA,NA,12 182 | "2004-06-29",NA,NA,12 183 | "2004-06-30",NA,NA,12 184 | "2004-07-01",NA,NA,12 185 | "2004-07-02",0.532,0.196,12 186 | "2004-07-03",NA,NA,12 187 | "2004-07-04",NA,NA,12 188 | "2004-07-05",NA,NA,12 189 | "2004-07-06",NA,NA,12 190 | "2004-07-07",NA,NA,12 191 | "2004-07-08",NA,NA,12 192 | "2004-07-09",NA,NA,12 193 | "2004-07-10",NA,NA,12 194 | "2004-07-11",NA,NA,12 195 | "2004-07-12",NA,NA,12 196 | "2004-07-13",NA,NA,12 197 | "2004-07-14",1.19,0.957,12 198 | "2004-07-15",NA,NA,12 199 | "2004-07-16",NA,NA,12 200 | "2004-07-17",NA,NA,12 201 | "2004-07-18",NA,NA,12 202 | "2004-07-19",NA,NA,12 203 | "2004-07-20",1.6,0.459,12 204 | "2004-07-21",NA,NA,12 205 | "2004-07-22",NA,NA,12 206 | "2004-07-23",NA,NA,12 207 | "2004-07-24",NA,NA,12 208 | "2004-07-25",NA,NA,12 209 | "2004-07-26",0.973,0.325,12 210 | "2004-07-27",NA,NA,12 211 | "2004-07-28",NA,NA,12 212 | "2004-07-29",NA,NA,12 213 | "2004-07-30",NA,NA,12 214 | "2004-07-31",NA,NA,12 215 | "2004-08-01",1.57,0.366,12 216 | "2004-08-02",NA,NA,12 217 | "2004-08-03",NA,NA,12 218 | "2004-08-04",NA,NA,12 219 | "2004-08-05",NA,NA,12 220 | "2004-08-06",NA,NA,12 221 | "2004-08-07",NA,NA,12 222 | "2004-08-08",NA,NA,12 223 | "2004-08-09",NA,NA,12 224 | "2004-08-10",NA,NA,12 225 | "2004-08-11",NA,NA,12 226 | "2004-08-12",NA,NA,12 227 | "2004-08-13",1.22,0.337,12 228 | "2004-08-14",NA,NA,12 229 | "2004-08-15",NA,NA,12 230 | "2004-08-16",NA,NA,12 231 | "2004-08-17",NA,NA,12 232 | "2004-08-18",NA,NA,12 233 | "2004-08-19",1.04,0.337,12 234 | "2004-08-20",NA,NA,12 235 | "2004-08-21",NA,NA,12 236 | "2004-08-22",NA,NA,12 237 | "2004-08-23",NA,NA,12 238 | "2004-08-24",NA,NA,12 239 | "2004-08-25",1.24,0.268,12 240 | "2004-08-26",NA,NA,12 241 | "2004-08-27",NA,NA,12 242 | "2004-08-28",NA,NA,12 243 | "2004-08-29",NA,NA,12 244 | "2004-08-30",NA,NA,12 245 | "2004-08-31",1.34,0.361,12 246 | "2004-09-01",NA,NA,12 247 | "2004-09-02",NA,NA,12 248 | "2004-09-03",NA,NA,12 249 | "2004-09-04",NA,NA,12 250 | "2004-09-05",NA,NA,12 251 | "2004-09-06",0.876,0.38,12 252 | "2004-09-07",NA,NA,12 253 | "2004-09-08",NA,NA,12 254 | "2004-09-09",NA,NA,12 255 | "2004-09-10",NA,NA,12 256 | "2004-09-11",NA,NA,12 257 | "2004-09-12",1.88,0.781,12 258 | "2004-09-13",NA,NA,12 259 | "2004-09-14",NA,NA,12 260 | "2004-09-15",NA,NA,12 261 | "2004-09-16",NA,NA,12 262 | "2004-09-17",NA,NA,12 263 | "2004-09-18",1.37,0.288,12 264 | "2004-09-19",NA,NA,12 265 | "2004-09-20",NA,NA,12 266 | "2004-09-21",NA,NA,12 267 | "2004-09-22",NA,NA,12 268 | "2004-09-23",NA,NA,12 269 | "2004-09-24",1.06,0.755,12 270 | "2004-09-25",NA,NA,12 271 | "2004-09-26",NA,NA,12 272 | "2004-09-27",NA,NA,12 273 | "2004-09-28",NA,NA,12 274 | "2004-09-29",NA,NA,12 275 | "2004-09-30",0.882,0.48,12 276 | "2004-10-01",NA,NA,12 277 | "2004-10-02",NA,NA,12 278 | "2004-10-03",NA,NA,12 279 | "2004-10-04",NA,NA,12 280 | "2004-10-05",NA,NA,12 281 | "2004-10-06",1.44,0.409,12 282 | "2004-10-07",NA,NA,12 283 | "2004-10-08",NA,NA,12 284 | "2004-10-09",NA,NA,12 285 | "2004-10-10",NA,NA,12 286 | "2004-10-11",NA,NA,12 287 | "2004-10-12",1.88,0.501,12 288 | "2004-10-13",NA,NA,12 289 | "2004-10-14",NA,NA,12 290 | "2004-10-15",NA,NA,12 291 | "2004-10-16",NA,NA,12 292 | "2004-10-17",NA,NA,12 293 | "2004-10-18",1.38,0.826,12 294 | "2004-10-19",NA,NA,12 295 | "2004-10-20",NA,NA,12 296 | "2004-10-21",NA,NA,12 297 | "2004-10-22",NA,NA,12 298 | "2004-10-23",NA,NA,12 299 | "2004-10-24",NA,2.9,12 300 | "2004-10-25",NA,NA,12 301 | "2004-10-26",NA,NA,12 302 | "2004-10-27",NA,NA,12 303 | 
"2004-10-28",NA,NA,12 304 | "2004-10-29",NA,NA,12 305 | "2004-10-30",0.137,0.0314,12 306 | "2004-10-31",NA,NA,12 307 | "2004-11-01",NA,NA,12 308 | "2004-11-02",NA,NA,12 309 | "2004-11-03",NA,NA,12 310 | "2004-11-04",NA,NA,12 311 | "2004-11-05",1.01,0.402,12 312 | "2004-11-06",NA,NA,12 313 | "2004-11-07",NA,NA,12 314 | "2004-11-08",NA,NA,12 315 | "2004-11-09",NA,NA,12 316 | "2004-11-10",NA,NA,12 317 | "2004-11-11",1.11,2.74,12 318 | "2004-11-12",NA,NA,12 319 | "2004-11-13",NA,NA,12 320 | "2004-11-14",NA,NA,12 321 | "2004-11-15",NA,NA,12 322 | "2004-11-16",NA,NA,12 323 | "2004-11-17",0.7,3.98,12 324 | "2004-11-18",NA,NA,12 325 | "2004-11-19",NA,NA,12 326 | "2004-11-20",NA,NA,12 327 | "2004-11-21",NA,NA,12 328 | "2004-11-22",NA,NA,12 329 | "2004-11-23",NA,1.21,12 330 | "2004-11-24",NA,NA,12 331 | "2004-11-25",NA,NA,12 332 | "2004-11-26",NA,NA,12 333 | "2004-11-27",NA,NA,12 334 | "2004-11-28",NA,NA,12 335 | "2004-11-29",0.563,0.39,12 336 | "2004-11-30",NA,NA,12 337 | "2004-12-01",NA,NA,12 338 | "2004-12-02",NA,NA,12 339 | "2004-12-03",NA,NA,12 340 | "2004-12-04",NA,NA,12 341 | "2004-12-05",0.819,3.83,12 342 | "2004-12-06",NA,NA,12 343 | "2004-12-07",NA,NA,12 344 | "2004-12-08",NA,NA,12 345 | "2004-12-09",NA,NA,12 346 | "2004-12-10",NA,NA,12 347 | "2004-12-11",0.752,2.72,12 348 | "2004-12-12",NA,NA,12 349 | "2004-12-13",NA,NA,12 350 | "2004-12-14",NA,NA,12 351 | "2004-12-15",NA,NA,12 352 | "2004-12-16",NA,NA,12 353 | "2004-12-17",0.654,0.318,12 354 | "2004-12-18",NA,NA,12 355 | "2004-12-19",NA,NA,12 356 | "2004-12-20",NA,NA,12 357 | "2004-12-21",NA,NA,12 358 | "2004-12-22",NA,NA,12 359 | "2004-12-23",0.564,1.08,12 360 | "2004-12-24",NA,NA,12 361 | "2004-12-25",NA,NA,12 362 | "2004-12-26",NA,NA,12 363 | "2004-12-27",NA,NA,12 364 | "2004-12-28",NA,NA,12 365 | "2004-12-29",0.573,0.482,12 366 | "2004-12-30",NA,NA,12 367 | "2004-12-31",NA,NA,12 368 | "2005-01-01",NA,NA,12 369 | "2005-01-02",NA,NA,12 370 | "2005-01-03",NA,NA,12 371 | "2005-01-04",0.201,0.816,12 372 | "2005-01-05",NA,NA,12 373 | "2005-01-06",NA,NA,12 374 | "2005-01-07",NA,NA,12 375 | "2005-01-08",NA,NA,12 376 | "2005-01-09",NA,NA,12 377 | "2005-01-10",1.17,4.58,12 378 | "2005-01-11",NA,NA,12 379 | "2005-01-12",NA,NA,12 380 | "2005-01-13",NA,NA,12 381 | "2005-01-14",NA,NA,12 382 | "2005-01-15",NA,NA,12 383 | "2005-01-16",0.48,3.3,12 384 | "2005-01-17",NA,NA,12 385 | "2005-01-18",NA,NA,12 386 | "2005-01-19",NA,NA,12 387 | "2005-01-20",NA,NA,12 388 | "2005-01-21",NA,NA,12 389 | "2005-01-22",1.96,5.01,12 390 | "2005-01-23",NA,NA,12 391 | "2005-01-24",NA,NA,12 392 | "2005-01-25",NA,NA,12 393 | "2005-01-26",NA,NA,12 394 | "2005-01-27",NA,NA,12 395 | "2005-01-28",1.29,3.44,12 396 | "2005-01-29",NA,NA,12 397 | "2005-01-30",NA,NA,12 398 | "2005-01-31",NA,NA,12 399 | "2005-02-01",NA,NA,12 400 | "2005-02-02",NA,NA,12 401 | "2005-02-03",0.886,0.216,12 402 | "2005-02-04",NA,NA,12 403 | "2005-02-05",NA,NA,12 404 | "2005-02-06",NA,NA,12 405 | "2005-02-07",NA,NA,12 406 | "2005-02-08",NA,NA,12 407 | "2005-02-09",0.677,1.01,12 408 | "2005-02-10",NA,NA,12 409 | "2005-02-11",NA,NA,12 410 | "2005-02-12",NA,NA,12 411 | "2005-02-13",NA,NA,12 412 | "2005-02-14",NA,NA,12 413 | "2005-02-15",0.992,6.23,12 414 | "2005-02-16",NA,NA,12 415 | "2005-02-17",NA,NA,12 416 | "2005-02-18",NA,NA,12 417 | "2005-02-19",NA,NA,12 418 | "2005-02-20",NA,NA,12 419 | "2005-02-21",0.603,0.63,12 420 | "2005-02-22",NA,NA,12 421 | "2005-02-23",NA,NA,12 422 | "2005-02-24",NA,NA,12 423 | "2005-02-25",NA,NA,12 424 | "2005-02-26",NA,NA,12 425 | "2005-02-27",0.759,1.3,12 426 | 
"2005-02-28",NA,NA,12 427 | "2005-03-01",NA,NA,12 428 | "2005-03-02",NA,NA,12 429 | "2005-03-03",NA,NA,12 430 | "2005-03-04",NA,NA,12 431 | "2005-03-05",0.382,0.426,12 432 | "2005-03-06",NA,NA,12 433 | "2005-03-07",NA,NA,12 434 | "2005-03-08",NA,NA,12 435 | "2005-03-09",NA,NA,12 436 | "2005-03-10",NA,NA,12 437 | "2005-03-11",0.889,0.8,12 438 | "2005-03-12",NA,NA,12 439 | "2005-03-13",NA,NA,12 440 | "2005-03-14",NA,NA,12 441 | "2005-03-15",NA,NA,12 442 | "2005-03-16",NA,NA,12 443 | "2005-03-17",1.02,1.77,12 444 | "2005-03-18",NA,NA,12 445 | "2005-03-19",NA,NA,12 446 | "2005-03-20",NA,NA,12 447 | "2005-03-21",NA,NA,12 448 | "2005-03-22",NA,NA,12 449 | "2005-03-23",0.453,0.475,12 450 | "2005-03-24",NA,NA,12 451 | "2005-03-25",NA,NA,12 452 | "2005-03-26",NA,NA,12 453 | "2005-03-27",NA,NA,12 454 | "2005-03-28",NA,NA,12 455 | "2005-03-29",0.795,0.293,12 456 | "2005-03-30",NA,NA,12 457 | "2005-03-31",NA,NA,12 458 | "2005-04-01",NA,NA,12 459 | "2005-04-02",NA,NA,12 460 | "2005-04-03",NA,NA,12 461 | "2005-04-04",0.894,0.376,12 462 | "2005-04-05",NA,NA,12 463 | "2005-04-06",NA,NA,12 464 | "2005-04-07",NA,NA,12 465 | "2005-04-08",NA,NA,12 466 | "2005-04-09",NA,NA,12 467 | "2005-04-10",0.372,0.148,12 468 | "2005-04-11",NA,NA,12 469 | "2005-04-12",NA,NA,12 470 | "2005-04-13",NA,NA,12 471 | "2005-04-14",NA,NA,12 472 | "2005-04-15",NA,NA,12 473 | "2005-04-16",2.31,0.837,12 474 | "2005-04-17",NA,NA,12 475 | "2005-04-18",NA,NA,12 476 | "2005-04-19",NA,NA,12 477 | "2005-04-20",NA,NA,12 478 | "2005-04-21",NA,NA,12 479 | "2005-04-22",0.916,1.1,12 480 | "2005-04-23",NA,NA,12 481 | "2005-04-24",NA,NA,12 482 | "2005-04-25",NA,NA,12 483 | "2005-04-26",NA,NA,12 484 | "2005-04-27",NA,NA,12 485 | "2005-04-28",1.1,0.314,12 486 | "2005-04-29",NA,NA,12 487 | "2005-04-30",NA,NA,12 488 | "2005-05-01",NA,NA,12 489 | "2005-05-02",NA,NA,12 490 | "2005-05-03",NA,NA,12 491 | "2005-05-04",1.8,0.353,12 492 | "2005-05-05",NA,NA,12 493 | "2005-05-06",NA,NA,12 494 | "2005-05-07",NA,NA,12 495 | "2005-05-08",NA,NA,12 496 | "2005-05-09",NA,NA,12 497 | "2005-05-10",0.873,0.284,12 498 | "2005-05-11",NA,NA,12 499 | "2005-05-12",NA,NA,12 500 | "2005-05-13",NA,NA,12 501 | "2005-05-14",NA,NA,12 502 | "2005-05-15",NA,NA,12 503 | "2005-05-16",NA,NA,12 504 | "2005-05-17",NA,NA,12 505 | "2005-05-18",NA,NA,12 506 | "2005-05-19",NA,NA,12 507 | "2005-05-20",NA,NA,12 508 | "2005-05-21",NA,NA,12 509 | "2005-05-22",1.22,0.402,12 510 | "2005-05-23",NA,NA,12 511 | "2005-05-24",NA,NA,12 512 | "2005-05-25",NA,NA,12 513 | "2005-05-26",NA,NA,12 514 | "2005-05-27",NA,NA,12 515 | "2005-05-28",NA,NA,12 516 | "2005-05-29",NA,NA,12 517 | "2005-05-30",NA,NA,12 518 | "2005-05-31",NA,NA,12 519 | "2005-06-01",NA,NA,12 520 | "2005-06-02",NA,NA,12 521 | "2005-06-03",1.33,0.447,12 522 | "2005-06-04",NA,NA,12 523 | "2005-06-05",NA,NA,12 524 | "2005-06-06",NA,NA,12 525 | "2005-06-07",NA,NA,12 526 | "2005-06-08",NA,NA,12 527 | "2005-06-09",0.995,0.443,12 528 | "2005-06-10",NA,NA,12 529 | "2005-06-11",NA,NA,12 530 | "2005-06-12",NA,NA,12 531 | "2005-06-13",NA,NA,12 532 | "2005-06-14",NA,NA,12 533 | "2005-06-15",2.02,0.426,12 534 | "2005-06-16",NA,NA,12 535 | "2005-06-17",NA,NA,12 536 | "2005-06-18",NA,NA,12 537 | "2005-06-19",NA,NA,12 538 | "2005-06-20",NA,NA,12 539 | "2005-06-21",2.77,0.365,12 540 | "2005-06-22",NA,NA,12 541 | "2005-06-23",NA,NA,12 542 | "2005-06-24",NA,NA,12 543 | "2005-06-25",NA,NA,12 544 | "2005-06-26",NA,NA,12 545 | "2005-06-27",1.45,0.203,12 546 | "2005-06-28",NA,NA,12 547 | "2005-06-29",NA,NA,12 548 | "2005-06-30",NA,NA,12 549 | 
"2005-07-01",NA,NA,12 550 | "2005-07-02",NA,NA,12 551 | "2005-07-03",NA,NA,12 552 | "2005-07-04",NA,NA,12 553 | "2005-07-05",NA,NA,12 554 | "2005-07-06",NA,NA,12 555 | "2005-07-07",NA,NA,12 556 | "2005-07-08",NA,NA,12 557 | "2005-07-09",1.56,0.405,12 558 | "2005-07-10",NA,NA,12 559 | "2005-07-11",NA,NA,12 560 | "2005-07-12",NA,NA,12 561 | "2005-07-13",NA,NA,12 562 | "2005-07-14",NA,NA,12 563 | "2005-07-15",2.37,0.428,12 564 | "2005-07-16",NA,NA,12 565 | "2005-07-17",NA,NA,12 566 | "2005-07-18",NA,NA,12 567 | "2005-07-19",NA,NA,12 568 | "2005-07-20",NA,NA,12 569 | "2005-07-21",1.72,0.366,12 570 | "2005-07-22",NA,NA,12 571 | "2005-07-23",NA,NA,12 572 | "2005-07-24",NA,NA,12 573 | "2005-07-25",NA,NA,12 574 | "2005-07-26",NA,NA,12 575 | "2005-07-27",1.25,0.261,12 576 | "2005-07-28",NA,NA,12 577 | "2005-07-29",NA,NA,12 578 | "2005-07-30",NA,NA,12 579 | "2005-07-31",NA,NA,12 580 | "2005-08-01",NA,NA,12 581 | "2005-08-02",NA,NA,12 582 | "2005-08-03",NA,NA,12 583 | "2005-08-04",NA,NA,12 584 | "2005-08-05",NA,NA,12 585 | "2005-08-06",NA,NA,12 586 | "2005-08-07",NA,NA,12 587 | "2005-08-08",1.44,0.742,12 588 | "2005-08-09",NA,NA,12 589 | "2005-08-10",NA,NA,12 590 | "2005-08-11",NA,NA,12 591 | "2005-08-12",NA,NA,12 592 | "2005-08-13",NA,NA,12 593 | "2005-08-14",1.55,0.265,12 594 | "2005-08-15",NA,NA,12 595 | "2005-08-16",NA,NA,12 596 | "2005-08-17",NA,NA,12 597 | "2005-08-18",NA,NA,12 598 | "2005-08-19",NA,NA,12 599 | "2005-08-20",NA,NA,12 600 | "2005-08-21",NA,NA,12 601 | "2005-08-22",NA,NA,12 602 | "2005-08-23",NA,NA,12 603 | "2005-08-24",NA,NA,12 604 | "2005-08-25",NA,NA,12 605 | "2005-08-26",NA,NA,12 606 | "2005-08-27",1.39,0.253,12 607 | "2005-08-28",NA,NA,12 608 | "2005-08-29",NA,NA,12 609 | "2005-08-30",NA,NA,12 610 | "2005-08-31",NA,NA,12 611 | "2005-09-01",1.69,0.561,12 612 | "2005-09-02",NA,NA,12 613 | "2005-09-03",NA,NA,12 614 | "2005-09-04",NA,NA,12 615 | "2005-09-05",NA,NA,12 616 | "2005-09-06",NA,NA,12 617 | "2005-09-07",1.65,0.24,12 618 | "2005-09-08",NA,NA,12 619 | "2005-09-09",NA,NA,12 620 | "2005-09-10",NA,NA,12 621 | "2005-09-11",NA,NA,12 622 | "2005-09-12",NA,NA,12 623 | "2005-09-13",1.07,0.361,12 624 | "2005-09-14",NA,NA,12 625 | "2005-09-15",NA,NA,12 626 | "2005-09-16",NA,NA,12 627 | "2005-09-17",NA,NA,12 628 | "2005-09-18",NA,NA,12 629 | "2005-09-19",1.19,0.357,12 630 | "2005-09-20",NA,NA,12 631 | "2005-09-21",NA,NA,12 632 | "2005-09-22",NA,NA,12 633 | "2005-09-23",NA,NA,12 634 | "2005-09-24",NA,NA,12 635 | "2005-09-25",1,0.257,12 636 | "2005-09-26",NA,NA,12 637 | "2005-09-27",NA,NA,12 638 | "2005-09-28",NA,NA,12 639 | "2005-09-29",NA,NA,12 640 | "2005-09-30",NA,NA,12 641 | "2005-10-01",1.15,0.34,12 642 | "2005-10-02",NA,NA,12 643 | "2005-10-03",NA,NA,12 644 | "2005-10-04",NA,NA,12 645 | "2005-10-05",NA,NA,12 646 | "2005-10-06",NA,NA,12 647 | "2005-10-07",1.18,0.257,12 648 | "2005-10-08",NA,NA,12 649 | "2005-10-09",NA,NA,12 650 | "2005-10-10",NA,NA,12 651 | "2005-10-11",NA,NA,12 652 | "2005-10-12",NA,NA,12 653 | "2005-10-13",1.24,1.96,12 654 | "2005-10-14",NA,NA,12 655 | "2005-10-15",NA,NA,12 656 | "2005-10-16",NA,NA,12 657 | "2005-10-17",NA,NA,12 658 | "2005-10-18",NA,NA,12 659 | "2005-10-19",NA,NA,12 660 | "2005-10-20",NA,NA,12 661 | "2005-10-21",NA,NA,12 662 | "2005-10-22",NA,NA,12 663 | "2005-10-23",NA,NA,12 664 | "2005-10-24",NA,NA,12 665 | "2005-10-25",NA,NA,12 666 | "2005-10-26",NA,NA,12 667 | "2005-10-27",1.79,0.653,12 668 | "2005-10-28",NA,NA,12 669 | "2005-10-29",NA,NA,12 670 | "2005-10-30",NA,NA,12 671 | "2005-10-31",1.77,0.508,12 672 | "2005-11-01",NA,NA,12 673 | 
"2005-11-02",NA,NA,12 674 | "2005-11-03",NA,NA,12 675 | "2005-11-04",NA,NA,12 676 | "2005-11-05",NA,NA,12 677 | "2005-11-06",1.83,0.791,12 678 | "2005-11-07",NA,NA,12 679 | "2005-11-08",NA,NA,12 680 | "2005-11-09",NA,NA,12 681 | "2005-11-10",NA,NA,12 682 | "2005-11-11",NA,NA,12 683 | "2005-11-12",0.969,1.43,12 684 | "2005-11-13",NA,NA,12 685 | "2005-11-14",NA,NA,12 686 | "2005-11-15",NA,NA,12 687 | "2005-11-16",NA,NA,12 688 | "2005-11-17",NA,NA,12 689 | "2005-11-18",0.537,0.187,12 690 | "2005-11-19",NA,NA,12 691 | "2005-11-20",NA,NA,12 692 | "2005-11-21",NA,NA,12 693 | "2005-11-22",NA,NA,12 694 | "2005-11-23",NA,NA,12 695 | "2005-11-24",0.775,1.95,12 696 | "2005-11-25",NA,NA,12 697 | "2005-11-26",NA,NA,12 698 | "2005-11-27",NA,NA,12 699 | "2005-11-28",NA,NA,12 700 | "2005-11-29",0.508,2.68,12 701 | "2005-11-30",NA,NA,12 702 | "2005-12-01",NA,NA,12 703 | "2005-12-02",NA,NA,12 704 | "2005-12-03",NA,NA,12 705 | "2005-12-04",NA,NA,12 706 | "2005-12-05",NA,NA,12 707 | "2005-12-06",0.553,2.41,12 708 | "2005-12-07",NA,NA,12 709 | "2005-12-08",NA,NA,12 710 | "2005-12-09",NA,NA,12 711 | "2005-12-10",NA,NA,12 712 | "2005-12-11",NA,NA,12 713 | "2005-12-12",1.34,7.51,12 714 | "2005-12-13",NA,NA,12 715 | "2005-12-14",NA,NA,12 716 | "2005-12-15",NA,NA,12 717 | "2005-12-16",NA,NA,12 718 | "2005-12-17",NA,NA,12 719 | "2005-12-18",0.987,4.69,12 720 | "2005-12-19",NA,NA,12 721 | "2005-12-20",NA,NA,12 722 | "2005-12-21",NA,NA,12 723 | "2005-12-22",NA,NA,12 724 | "2005-12-23",NA,NA,12 725 | "2005-12-24",0.814,5.53,12 726 | "2005-12-25",NA,NA,12 727 | "2005-12-26",NA,NA,12 728 | "2005-12-27",NA,NA,12 729 | "2005-12-28",NA,NA,12 730 | "2005-12-29",NA,NA,12 731 | "2005-12-30",NA,NA,12 732 | "2005-12-31",NA,NA,12 733 | -------------------------------------------------------------------------------- /datasets/medals/Athelete_Country_Map.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonMDev/learning-python-predictive-analytics/6757b9ef3e7bbb803d4a1c00d999225cbb4fe2eb/datasets/medals/Athelete_Country_Map.csv -------------------------------------------------------------------------------- /datasets/medals/Athelete_Sports_Map.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonMDev/learning-python-predictive-analytics/6757b9ef3e7bbb803d4a1c00d999225cbb4fe2eb/datasets/medals/Athelete_Sports_Map.csv -------------------------------------------------------------------------------- /datasets/medals/Medals.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonMDev/learning-python-predictive-analytics/6757b9ef3e7bbb803d4a1c00d999225cbb4fe2eb/datasets/medals/Medals.csv --------------------------------------------------------------------------------