├── .gitignore ├── Ch02 ├── Customer Churn Columns.csv ├── Customer Churn Model.csv ├── Customer Churn Model.txt ├── ScatterPlots.jpeg ├── Tab Customer Churn Model.txt ├── Titanic Description.txt ├── Write.csv ├── Write.xls ├── basicDataCheck.py ├── changeDelimiter.py ├── plotData.py ├── readCustomerChurn.py ├── readCustomerChurn2.py ├── readDatasetByOpenMethod.py ├── readURLLib2Iris.py ├── readURLMedals.py ├── readXLS.py ├── titanic3.csv ├── titanic3.xls ├── titanic3.xlsx ├── titanicReadCSV.py └── titanicReadCSV1.py ├── Ch03 ├── Customer Churn Model.txt ├── appendManyFiles.py ├── calcPi.py ├── concatenateAndAppend.py ├── generateRandomNumbers.py ├── generateRandomProbDistr.py ├── groupData.py ├── mergeJoin.py ├── seedRandomNumbers.py ├── splitDataTrainTest.py ├── subsetColsRows.py ├── subsetDataset.py ├── subsetDatasetRows.py └── subsetNewCol.py ├── Ch04 ├── NewspaperSalesCorrelationPlot.png ├── RadioSalesCorrelationPlot.png ├── TVSalesCorrelationPlot.png ├── linearRegression.py └── linearRegressionFunction.py ├── Ch05 ├── CurrentVsPredicted1.png ├── CurrentVsPredictedVsMean1.png ├── CurrentVsPredictedVsModel1.png ├── MPGVSHorsepower.png ├── MPGVSHorsepowerModels.png ├── MPGVSHorsepowerVsLine.png ├── PredictedSalesVsTVAdvertisingCosts.png ├── linearRegression.py ├── linearRegressionECom.py ├── linearRegressionRFE.py ├── linearRegressionSKL.py ├── linearRegressionSMF.py └── nonlinearRegression.py ├── Ch06 ├── Histogram of Age.png ├── Purchase Frequency for Day of Week'.png ├── Purchase Frequency for Education Level.png ├── Purchase Frequency for Month of the Year.png ├── ROC Curve.png ├── Stacked Bar Chart of Marital Status vs Purchase.png ├── logisticRegression.py ├── logisticRegressionImplementation.py └── logisticRegressionScratch.py ├── Ch07 ├── Histogram of Clusters.png ├── Histogramn of Cluster Labels.png ├── clusterWine.py └── kMeanClustering.py ├── Ch08 ├── decisionTreeIris.py ├── dtree2.png ├── randomForest.py └── regressionTree.py ├── ISSUELOG.md ├── README.md └── datasets ├── Advertising.csv ├── Auto.csv ├── Bank data dictionary.txt ├── Boston.csv ├── Customer Churn Columns.csv ├── Customer Churn Model.csv ├── Customer Churn Model.txt ├── Ecom Expense.csv ├── Gender Purchase.csv ├── Titanic Description.txt ├── bank.csv ├── dtree2.dot ├── dtree2.png ├── iris.csv ├── lotsofdata ├── 001.csv ├── 002.csv ├── 003.csv ├── 004.csv ├── 005.csv ├── 006.csv ├── 007.csv ├── 008.csv ├── 009.csv ├── 010.csv ├── 011.csv ├── 012.csv ├── 021.csv ├── 022.csv ├── 023.csv ├── 113.csv ├── 114.csv ├── 115.csv └── 116.csv ├── medals ├── Athelete_Country_Map.csv ├── Athelete_Sports_Map.csv └── Medals.csv ├── titanic3.csv ├── wine.csv ├── winequality-red.csv └── winequality-white.csv /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | 55 | # Sphinx documentation 56 | docs/_build/ 57 | 58 | # PyBuilder 59 | target/ 60 | 61 | #Ipython Notebook 62 | .ipynb_checkpoints 63 | -------------------------------------------------------------------------------- /Ch02/Customer Churn Columns.csv: -------------------------------------------------------------------------------- 1 | Column_Names 2 | A 3 | Bob 4 | C 5 | D 6 | E 7 | F 8 | G 9 | H 10 | I 11 | J 12 | K 13 | L 14 | M 15 | N 16 | O 17 | P 18 | Q 19 | R 20 | S 21 | T 22 | U 23 | -------------------------------------------------------------------------------- /Ch02/ScatterPlots.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonMDev/learning-python-predictive-analytics/6757b9ef3e7bbb803d4a1c00d999225cbb4fe2eb/Ch02/ScatterPlots.jpeg -------------------------------------------------------------------------------- /Ch02/Titanic Description.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonMDev/learning-python-predictive-analytics/6757b9ef3e7bbb803d4a1c00d999225cbb4fe2eb/Ch02/Titanic Description.txt -------------------------------------------------------------------------------- /Ch02/Write.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonMDev/learning-python-predictive-analytics/6757b9ef3e7bbb803d4a1c00d999225cbb4fe2eb/Ch02/Write.xls -------------------------------------------------------------------------------- /Ch02/basicDataCheck.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Apr 29 11:11:24 2016 4 | 5 | @author: jasonm_dev 6 | """ 7 | 8 | import pandas as pd 9 | path = '/home/jasonm_dev/coding/learning-python-predictive-analytics/Ch02' 10 | filename = 'titanic3.csv' 11 | fullpath = path+'/'+filename 12 | 13 | data=pd.read_csv(fullpath) 14 | 15 | # Specify the number of rows to see. 16 | data.head(5) 17 | # Confirm dimension 18 | data.shape 19 | # List the data frame 20 | data.columns.values 21 | # Create summary statistics 22 | data.describe() 23 | # FInd out the data type of each column 24 | data.dtypes 25 | 26 | # Find entries with that have missing values. 27 | pd.isnull(data['body']) 28 | # Opposite method 29 | pd.notnull(data['body']) 30 | 31 | # Count the number of missing values. 1189 32 | pd.isnull(data['body']).values.ravel().sum() 33 | # Opposite: 121 34 | pd.notnull(data['body']).values.ravel().sum() 35 | 36 | 37 | # HANDLING MISSING DATA 38 | # Deletion 39 | # Drop any row with where all columns have missing info. 40 | data.dropna(axis=0,how='all') 41 | # Drop any rows where column have any empty cells of information. 42 | data.dropna(axis=0,how='any') 43 | 44 | #Imputation 45 | #data.fillna(0) 46 | #data.fillna('missing') 47 | data['body'].fillna(0) 48 | data['age'].fillna(data['age'].mean()) #29.881135 49 | data['age'].fillna(method='ffill') #Fill in with preceding non-missing value. 50 | data['age'].fillna(method='backfill') #Fill in with succeding non-missing value. 
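# Quick illustrative sketch (assumes the titanic3.csv frame loaded into 'data'
# above): the fillna calls above return new objects and leave 'data' unchanged,
# so the imputed values only persist if the result is assigned back, e.g. per
# column via a dict.
print(data.isnull().sum())  # missing-value count per column
data_imputed = data.fillna({'body': 0, 'age': data['age'].mean()})
print(pd.isnull(data_imputed['age']).values.ravel().sum())  # expected to be 0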
51 |
52 | # CREATING DUMMY VARIABLE
53 | # Split into new variables 'sex_female' and 'sex_male'
54 | dummy_sex=pd.get_dummies(data['sex'],prefix='sex')
55 | column_name=data.columns.values.tolist()
56 | column_name.remove('sex') # Remove column 'sex'
57 | data[column_name].join(dummy_sex) # Add the dummy columns created above.
--------------------------------------------------------------------------------
/Ch02/changeDelimiter.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Wed Apr 27 21:41:13 2016
4 |
5 | @author: jasonm_dev
6 | """
7 | path = '/home/jasonm_dev/coding/learning-python-predictive-analytics/Ch02'
8 | filename1 = 'Customer Churn Model.txt'
9 | filename2 = 'Tab Customer Churn Model.txt'
10 |
11 | infile= path+'/'+filename1
12 | outfile= path+'/'+filename2
13 | with open(infile) as infile1:
14 |     with open(outfile,'w') as outfile1:
15 |         for line in infile1:
16 |             fields=line.split(',')
17 |             outfile1.write('\t'.join(fields))
18 |
19 | import pandas as pd
20 | data=pd.read_csv(outfile,sep='\t')
21 | print(data)
--------------------------------------------------------------------------------
/Ch02/plotData.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Fri Apr 29 11:49:07 2016
4 |
5 | @author: jasonm_dev
6 | """
7 |
8 | import pandas as pd
9 | import matplotlib.pyplot as plt
10 | #from pylab import figure, axes, pie, title, show
11 | path = '/home/jasonm_dev/coding/learning-python-predictive-analytics/Ch02'
12 | filename = 'Customer Churn Model.txt'
13 | fullpath = path+'/'+filename
14 | data=pd.read_csv(fullpath)
15 |
16 | # Scatter plot
17 | data.plot(kind='scatter',x='Day Mins',y='Day Charge')
18 |
19 | # Using matplotlib
20 | #figure,axs = plt.subplots(2, 2,sharey=True,sharex=True)
21 | #data.plot(kind='scatter',x='Day Mins',y='Day Charge',ax=axs[0][0])
22 | #data.plot(kind='scatter',x='Night Mins',y='Night Charge',ax=axs[0][1])
23 | #data.plot(kind='scatter',x='Day Calls',y='Day Charge',ax=axs[1][0])
24 | #data.plot(kind='scatter',x='Night Calls',y='Night Charge',ax=axs[1][1])
25 |
26 | # Save figure as a jpeg
27 | #figname = 'ScatterPlots.jpeg'
28 | #figpath = path+'/'+figname
29 | #figure.savefig(figpath)
30 |
31 | # Histograms
32 | #plt.hist(data['Day Calls'],bins=8)
33 | #plt.xlabel('Day Calls Value')
34 | #plt.ylabel('Frequency')
35 | #plt.title('Frequency of Day Calls')
36 |
37 | # Boxplots
38 | plt.boxplot(data['Day Calls'])
39 | plt.ylabel('Day Calls')
40 | plt.title('Box Plot of Day Calls')
--------------------------------------------------------------------------------
/Ch02/readCustomerChurn.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Wed Apr 27 19:58:48 2016
4 |
5 | @author: jasonm_dev
6 | """
7 | import pandas as pd
8 | path = '/home/jasonm_dev/coding/learning-python-predictive-analytics/Ch02'
9 | filename = 'Customer Churn Model.txt'
10 | fullpath = path+'/'+filename
11 | data = pd.read_csv(fullpath)
--------------------------------------------------------------------------------
/Ch02/readCustomerChurn2.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Wed Apr 27 19:58:48 2016
4 |
5 | @author: jasonm_dev
6 | """
7 | import pandas as pd
8 | path = '/home/jasonm_dev/coding/learning-python-predictive-analytics/Ch02'
9 |
filename1 = 'Customer Churn Columns.csv' 10 | filename2 = 'Customer Churn Model.txt' 11 | fullpath1 = path+'/'+filename1 12 | fullpath2 = path+'/'+filename2 13 | 14 | 15 | data_columns = pd.read_csv(fullpath1) 16 | data_column_list = data_columns['Column_Names'].tolist() 17 | data=pd.read_csv(fullpath2,header=None,names=data_column_list) 18 | data.columns.values -------------------------------------------------------------------------------- /Ch02/readDatasetByOpenMethod.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Apr 27 19:58:48 2016 4 | 5 | @author: jasonm_dev 6 | 7 | python2 uses next() 8 | python3 uses readline() 9 | """ 10 | 11 | path = '/home/jasonm_dev/coding/learning-python-predictive-analytics/Ch02' 12 | filename = 'Customer Churn Model.txt' 13 | fullpath = path+'/'+filename 14 | 15 | # Open file in read mode. 16 | data=open(fullpath,'r') 17 | # readline() method: 18 | # -> It navigates the computer memory to the line next to the header. 19 | # strip() method: 20 | # -> Removes all the trailing and leading blank spaces from the line 21 | # split() method: 22 | # -> Method breaks down a line into chunks separated by the argument provided 23 | cols=data.readline().strip().split(',') 24 | no_cols=len(data.readline().strip().split(',')) 25 | 26 | counter=0 27 | 28 | main_dict={} 29 | # Key: Column names 30 | # Value: Values of columns. 31 | for col in cols: 32 | main_dict[col]=[] 33 | 34 | for line in data: 35 | values = line.strip().split(',') 36 | for i in range(len(cols)): 37 | main_dict[cols[i]].append(values[i]) 38 | counter += 1 39 | 40 | #print ("The dataset has %d rows and %d columns") % (counter,no_cols) 41 | print ('The dataset has ',counter,' rows and ',no_cols,' columns') 42 | 43 | # Convert dataset to a dataframe similar pandas raed_csv 44 | import pandas as pd 45 | df=pd.DataFrame(main_dict) 46 | print (df.head(10)) 47 | 48 | filename_csv = 'Write.csv' 49 | filename_xls = 'Write.xls' 50 | fullpath_csv = path+'/'+filename_csv 51 | fullpath_xls = path+'/'+filename_xls 52 | 53 | # Write to CSV file. 
54 | df.to_csv(fullpath_csv) 55 | 56 | # Write to xls file 57 | df.to_excel(fullpath_xls) -------------------------------------------------------------------------------- /Ch02/readURLLib2Iris.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Apr 29 09:51:41 2016 4 | 5 | @author: jasonm_dev 6 | """ 7 | 8 | import csv 9 | import urllib.request 10 | import codecs 11 | 12 | url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data' 13 | html = urllib.request.urlopen(url) 14 | csvfile = csv.reader(codecs.iterdecode(html, 'utf-8')) 15 | for line in csvfile: 16 | print(line) #do something with line -------------------------------------------------------------------------------- /Ch02/readURLMedals.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Apr 29 09:48:19 2016 4 | 5 | @author: jasonm_dev 6 | """ 7 | 8 | import pandas as pd 9 | medal_data=pd.read_csv('http://winterolympicsmedals.com/medals.csv') -------------------------------------------------------------------------------- /Ch02/readXLS.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Apr 29 10:51:11 2016 4 | 5 | @author: jasonm_dev 6 | """ 7 | import pandas as pd 8 | path = '/home/jasonm_dev/coding/learning-python-predictive-analytics/Ch02' 9 | 10 | filename1 = 'titanic3.xls' 11 | filename2 = 'titanic3.xlsx' 12 | fullpath1 = path+'/'+filename1 13 | fullpath2 = path+'/'+filename2 14 | # Read .xls 15 | data1=pd.read_excel(fullpath1,'titanic3') 16 | 17 | # Read .xlsx 18 | data2=pd.read_excel(fullpath2,'titanic3') -------------------------------------------------------------------------------- /Ch02/titanic3.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonMDev/learning-python-predictive-analytics/6757b9ef3e7bbb803d4a1c00d999225cbb4fe2eb/Ch02/titanic3.xls -------------------------------------------------------------------------------- /Ch02/titanic3.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonMDev/learning-python-predictive-analytics/6757b9ef3e7bbb803d4a1c00d999225cbb4fe2eb/Ch02/titanic3.xlsx -------------------------------------------------------------------------------- /Ch02/titanicReadCSV.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Apr 27 19:52:28 2016 4 | 5 | @author: jasonm_dev 6 | """ 7 | 8 | import pandas as pd 9 | data = pd.read_csv('/home/jasonm_dev/coding/learning-python-predictive-analytics/Ch02/titanic3.csv') 10 | data -------------------------------------------------------------------------------- /Ch02/titanicReadCSV1.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Apr 27 19:55:36 2016 4 | 5 | @author: jasonm_dev 6 | """ 7 | import pandas as pd 8 | path = '/home/jasonm_dev/coding/learning-python-predictive-analytics/Ch02' 9 | filename = 'titanic3.csv' 10 | fullpath = path+'/'+filename 11 | data = pd.read_csv(fullpath) 12 | -------------------------------------------------------------------------------- /Ch03/appendManyFiles.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue May 3 20:08:00 2016 4 | 5 | @author: jasonm_dev 6 | """ 7 | 8 | 9 | import pandas as pd 10 | 11 | # Check if first file works. 12 | filepath = '/home/jasonm_dev/coding/learning-python-predictive-analytics/datasets/lotsofdata' 13 | filename= '001.csv' 14 | file = filepath+'/'+filename 15 | 16 | data=pd.read_csv(file) 17 | data.head() 18 | data.shape #Out: (1461, 4) 19 | 20 | # Loop through all dataset files. 21 | data_final=pd.read_csv(file) 22 | data_final_size=len(data_final) 23 | for i in range(1,12): #range(1,333): 24 | if i<10: 25 | filename='0'+'0'+str(i)+'.csv' 26 | if 10<=i<100: 27 | filename='0'+str(i)+'.csv' 28 | #if i>=100: 29 | # filename=str(i)+'.csv' 30 | 31 | file=filepath+'/'+filename 32 | data=pd.read_csv(file) 33 | data_final_size+=len(data) 34 | data_final=pd.concat([data_final,data],axis=0) 35 | 36 | data.shape # Out: (1461, 4) 37 | data_final.shape # Out: (27391, 4) 38 | print (data_final_size) # 27391 -------------------------------------------------------------------------------- /Ch03/calcPi.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Apr 29 15:26:46 2016 4 | 5 | @author: jasonm_dev 6 | """ 7 | 8 | # Calculate pi 9 | 10 | import numpy as np 11 | import matplotlib.pyplot as plt 12 | 13 | def pi_run(nums,loops): 14 | pi_avg=0 15 | pi_value_list=[] 16 | for i in range(loops): 17 | value=0 18 | # Generate points within 0 to 1. 19 | x=np.random.uniform(0,1,nums).tolist() 20 | y=np.random.uniform(0,1,nums).tolist() 21 | # Check to see if they lie within circle. 22 | for j in range(nums): 23 | z=np.sqrt(x[j]*x[j]+y[j]*y[j]) 24 | if z<=1: 25 | value+=1 26 | # Amount of hits withion circle. 27 | float_value=float(value) 28 | # Using probabilty to calculate pi using hits 29 | pi_value=float_value*4/nums 30 | pi_value_list.append(pi_value) 31 | # Get pi value for this loop. 32 | pi_avg+=pi_value 33 | # Averag pi value from all loops. 34 | pi=pi_avg/loops 35 | ind=range(1,loops+1) 36 | fig=plt.plot(ind,pi_value_list) 37 | return (pi,fig) 38 | 39 | pi_run(1000,100) -------------------------------------------------------------------------------- /Ch03/concatenateAndAppend.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue May 3 19:50:48 2016 4 | 5 | @author: jasonm_dev 6 | """ 7 | 8 | import pandas as pd 9 | 10 | path = '/home/jasonm_dev/coding/learning-python-predictive-analytics/datasets' 11 | filename_red = 'winequality-red.csv' 12 | filename_white = 'winequality-white.csv' 13 | fullpath_red = path+'/'+filename_red 14 | fullpath_white = path+'/'+filename_white 15 | 16 | # RED WINE QUALITIES 17 | 18 | 19 | data1=pd.read_csv(fullpath_red,sep=';') # delimiter is ';' 20 | data1.head() 21 | data1.shape #Out: (1599, 12) 22 | data1.columns.values 23 | # Out: array(['fixed acidity', 'volatile acidity', 'citric acid', 24 | # 'residual sugar', 'chlorides', 'free sulfur dioxide', 25 | # 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol', 26 | # 'quality'], dtype=object) 27 | 28 | # WHITE WINE QUALITIES 29 | data2=pd.read_csv(fullpath_white,sep=';') 30 | data2.shape #Out: (4898, 12) 31 | data2.head() 32 | 33 | # APPEND DATA 34 | # Horizontal axis is denoted by 0. 
35 | wine_total=pd.concat([data1,data2],axis=0) 36 | wine_total.shape #Out: (6497, 12) 37 | wine_total.head() 38 | 39 | #SCRAMBLING DATA WITH CONCAT 40 | data1_head=data1.head(50) 41 | data1_middle=data1[500:550] 42 | data1_tail=data1.tail(50) 43 | wine_scramble=pd.concat([data1_middle,data1_head,data1_tail],axis=0) 44 | wine_scramble 45 | wine_scramble.shape #Out: (150, 12) -------------------------------------------------------------------------------- /Ch03/generateRandomNumbers.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Apr 29 14:28:15 2016 4 | 5 | @author: jasonm_dev 6 | """ 7 | import pandas as pd 8 | import numpy as np 9 | 10 | # Generate Random Numbers 11 | np.random.randint(1,100) #Random number between 1 and 100 12 | np.random.random() #Random number between 1 and 100 13 | 14 | # Generate n amount of random numbers between a and b 15 | def randint_range(n,a,b): 16 | x=[] 17 | for i in range(n): 18 | x.append(np.random.randint(a,b)) 19 | return x 20 | 21 | # Generate 10 amount of random numbers between 5 and 200 22 | randint_range(10,5,200) 23 | # Out: [169, 47, 124, 73, 109, 63, 84, 93, 8, 129] 24 | 25 | # Random range of number in specific multiple 26 | import random 27 | for i in range(3): 28 | print (random.randrange(0,100,5)) 29 | 30 | # Shuuffle list or array in a random order. 31 | b = randint_range(10,5,200) 32 | b # Out: [93, 194, 30, 38, 146, 40, 177, 172, 197, 182] 33 | np.random.shuffle(b) 34 | b # Out: [177, 182, 146, 40, 194, 30, 197, 172, 38, 93] 35 | 36 | # 'Choice' method is used to select a random item from a list of items. 37 | path = '/home/jasonm_dev/coding/learning-python-predictive-analytics/Ch03' 38 | filename = 'Customer Churn Model.txt' 39 | fullpath = path+'/'+filename 40 | data=pd.read_csv(fullpath) 41 | data.shape # Output: (3333, 21) 42 | 43 | # Create a list from the column names 44 | column_list=data.columns.values.tolist() 45 | 46 | # Select an item at random from the list. 47 | np.random.choice(column_list) #Out: "Int'l Plan" 48 | np.random.choice(column_list) #Out: 'VMail Plan' 49 | np.random.choice(column_list) #Out: 'Eve Calls' 50 | np.random.choice(column_list) #Out: 'Eve Mins' -------------------------------------------------------------------------------- /Ch03/generateRandomProbDistr.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Apr 29 15:01:52 2016 4 | 5 | @author: jasonm_dev 6 | """ 7 | 8 | import numpy as np 9 | import matplotlib.pyplot as plt 10 | 11 | # Generate 100 random numbers lying between 1 and 100. 12 | randnum=np.random.uniform(1,100,100) 13 | 14 | # Plot histogram to confirm uniform distribution. 15 | # Used with ipython/spyder notepad. 16 | #%matplotlib inline 17 | 18 | # Not so uniform distribution with 10 numbers 19 | a=np.random.uniform(1,100,100) 20 | b=range(1,101) 21 | #plt.hist(a) 22 | 23 | # Better uniform distribution with a million numbers 24 | c=np.random.uniform(1,1000000,1000000) 25 | d=range(1,101) 26 | #plt.hist(c) 27 | 28 | # Normal distribution 29 | # Used with ipython/spyder notepad. 30 | #%matplotlib inline 31 | 32 | # Plot a random noise plot. 33 | e=np.random.randn(100) 34 | f=range(1,101) 35 | #plt.plot(f,e) 36 | 37 | # Plot a random noise plot with mean of 1.5 and standard deviation of 2.5. 
38 | g=2.5*np.random.randn(100)+1.5 39 | h=range(1,101) 40 | #plt.plot(h,g) 41 | 42 | # Generate enough numbers to create belll curve 43 | i=np.random.randn(100000) 44 | j=range(1,101) 45 | plt.hist(i) -------------------------------------------------------------------------------- /Ch03/groupData.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon May 2 09:57:12 2016 4 | 5 | @author: jasonm_dev 6 | """ 7 | 8 | import numpy as np 9 | import pandas as pd 10 | 11 | # GENERATE DATAFRAME 12 | a=['Male','Female'] 13 | b=['Rich','Poor','Middle Class'] 14 | gender=[] 15 | seb=[] 16 | for i in range(1,101): 17 | gender.append(np.random.choice(a)) 18 | seb.append(np.random.choice(b)) 19 | height=30*np.random.randn(100)+155 20 | weight=20*np.random.randn(100)+60 21 | age=10*np.random.randn(100)+35 22 | income=1500*np.random.randn(100)+15000 23 | 24 | df=pd.DataFrame({'Gender':gender,'Height':height,'Weight':weight,'Age':age,'Income':income,'Socio-Eco':seb}) 25 | df.head() 26 | 27 | # GROUPING OF DATA 28 | # Splits data into data objects with attributes 'name' and 'group'. 29 | # df.groupby('Gender') # Out: 30 | 31 | # Group by gender. 32 | grouped = df.groupby('Gender') 33 | # Object created is 'Male' and its group of data, and 'Female' and its group of data. 34 | # grouped.groups 35 | 36 | for names,groups in grouped: 37 | print (names) 38 | print (groups) 39 | 40 | 41 | # Get a single group can be found. 42 | grouped_female=grouped.get_group('Female') 43 | 44 | # A set of categories can be used. 45 | grouped_gender_socio=df.groupby(['Gender','Socio-Eco']) 46 | 47 | for names,groups in grouped_gender_socio: 48 | print (names) 49 | print (groups) 50 | 51 | # AGGREGATION OF DATA 52 | # Sum of data 53 | grouped_gender_socio.sum() # Sum of dataheads 54 | grouped_gender_socio.size() # Calculates the size of each group. 55 | grouped_gender_socio.describe() # Summary statistics for each group separately. 56 | grouped_gender_socio.aggregate({'Age':np.mean,'Height':lambda x:np.mean(x)/np.std(x)}) 57 | # Use the lambda method for ratio of mean and standard deviation for height 58 | grouped_gender_socio.aggregate([np.sum, np.mean, np.std]) # Apply to all columns. 59 | 60 | # Grouped subsets behave like their own dataframes. 61 | grouped_income=grouped['Income'] # You can apply function above here as well. 62 | 63 | # FILTERING 64 | grouped_gender_socio['Age'].filter(lambda x:x.sum()>700) 65 | 66 | # TRANSFORMATION 67 | # Calculate the standard normal values for all the elements 68 | # in the numerical columns of our data frame 69 | zscore = lambda x: (x - x.mean()) / x.std() 70 | #grouped.transform(zscore) 71 | 72 | # Fills the missing values with the mean of the non-missing values. 73 | f = lambda x: x.fillna(x.mean()) 74 | #grouped.transform(f) 75 | 76 | # MISCELLANEOUS OPERTAIONS 77 | grouped.head(1) # Gets the first row of the male and female groups respectively. 78 | grouped_gender_socio.head(1) # First row of each group. 79 | 80 | grouped.tail(1) # Gets last rows of each group. 81 | grouped_gender_socio.tail(1) # Gets last rows of each group. 82 | 83 | # Good practise. First sort data frame before creating the groupby object. 84 | df1=df.sort_values(by=['Age','Income']) # Sort by age and income. 85 | sort_grouped=df1.groupby('Gender') # Group by gender 86 | sort_grouped.head(1) # Show rows for the youngest of each gender. 87 | sort_grouped.tail(1) # Show rows for the eldest of each gender. 
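# Illustrative sketch (assumes the df and zscore lambda defined above): since
# groupby.transform returns output aligned to the original index, the per-group
# standardized values can be joined straight back onto the frame.
numeric_cols = ['Age', 'Height', 'Weight', 'Income']
df_z = df.groupby('Gender')[numeric_cols].transform(zscore)
df_with_z = df.join(df_z, rsuffix='_z')  # raw columns plus per-gender z-scores
df_with_z.head()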
88 | -------------------------------------------------------------------------------- /Ch03/mergeJoin.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue May 3 20:41:51 2016 4 | 5 | @author: jasonm_dev 6 | """ 7 | 8 | 9 | import pandas as pd 10 | 11 | filepath = '/home/jasonm_dev/coding/learning-python-predictive-analytics/datasets/medals' 12 | filename= 'Medals.csv' 13 | file = filepath+'/'+filename 14 | 15 | # IMPORT MAIN MEDAL FILE 16 | data_main=pd.read_csv(file,encoding='latin_1') 17 | data_main.head() 18 | data_main.shape # Out: (8618, 8) 19 | # ERROR 20 | # UnicodeDecodeError: 'utf-8' codec can't decode byte 0xf8 21 | # in position 8: invalid start byte 22 | # SOLUTION: used latin_1, ascii and utf-8 don't work 23 | 24 | 25 | # Check to see how many unique athletes there are. 26 | a=data_main['Athlete'].unique().tolist() 27 | len(a) # Out: 6956 28 | 29 | # IMPORT COUNTRY MAP 30 | filename_country = 'Athelete_Country_Map.csv' 31 | file_country = filepath+'/'+filename_country 32 | country_map=pd.read_csv(file_country,encoding='latin_1') 33 | country_map.head() 34 | country_map.shape # Out: (6970, 2) with 6956 uniques. 35 | 36 | # Uniques adding to total with 2 nationalities. 37 | country_map[country_map['Athlete']=='Aleksandar Ciric'] 38 | 39 | # IMPORT SPORTS MAP 40 | filename_sports = 'Athelete_Sports_Map.csv' 41 | file_sports = filepath+'/'+filename_sports 42 | sports_map=pd.read_csv(file_sports,encoding='latin_1') 43 | sports_map.head() 44 | sports_map.shape # Out: (6975, 2) 45 | 46 | # with very few doing more than one sport. 47 | sports_map[(sports_map['Athlete']=='Chen Jing') | (sports_map['Athlete']=='Richard Thompson') | (sports_map['Athlete']=='Matt Ryan')] 48 | 49 | # MERGE IMPORTED COUTRY MAP DATA FRAME FILES 50 | merged=pd.merge(left=data_main,right=country_map,left_on='Athlete',right_on='Athlete') 51 | merged.head() 52 | merged.shape # Out: (8657, 9) > 8618 uniques because of inner join. 53 | 54 | # See duplicated results. 55 | merged[merged['Athlete']=='Aleksandar Ciric'] 56 | 57 | # Drop duplicates from country_map data frame 58 | country_map_dp=country_map.drop_duplicates(subset='Athlete') # Out: (6956, 2) 59 | 60 | # Now retry merge as length is now the same as unique atheletes. 61 | merged_dp=pd.merge(left=data_main,right=country_map_dp,left_on='Athlete',right_on='Athlete') 62 | merged_dp.shape # Out: (8618, 9) 63 | 64 | # MERGE IMPORTED SPORTS MAP DATA FRAME FILES 65 | # Drop duplicates from country_map data frame 66 | sports_map_dp=sports_map.drop_duplicates(subset='Athlete') 67 | sports_map_dp.shape # Out: (6956, 2) 68 | 69 | # Merge into final data. 70 | merged_final=pd.merge(left=merged_dp,right=sports_map_dp,left_on='Athlete',right_on='Athlete') 71 | merged_final.shape # Out: (8618, 10) 72 | merged_final.head() 73 | 74 | 75 | # MERGE TYPES 76 | # PYTHON 2.7 uses '<>', whereas python3.5 uses '!=' 77 | # Prepare the data with some mismathes to show join examples. 
78 | country_map_dlt=country_map_dp[(country_map_dp['Athlete']!='Michael Phelps') & (country_map_dp['Athlete']!='Natalie Coughlin') & (country_map_dp['Athlete']!='Chen Jing') 79 | & (country_map_dp['Athlete']!='Richard Thompson') & (country_map_dp['Athlete']!='Matt Ryan')] 80 | len(country_map_dlt) # Out: 6951 81 | sports_map_dlt=sports_map_dp[(sports_map_dp['Athlete']!='Michael Phelps') & (sports_map_dp['Athlete']!='Natalie Coughlin') & (sports_map_dp['Athlete']!='Chen Jing') 82 | & (sports_map_dp['Athlete']!='Richard Thompson') & (sports_map_dp['Athlete']!='Matt Ryan')] 83 | len(sports_map_dlt) # Out: 6951 84 | 85 | data_main_dlt=data_main[(data_main['Athlete']!='Michael Phelps') & (data_main['Athlete']!='Natalie Coughlin') & (data_main['Athlete']!='Chen Jing') 86 | & (data_main['Athlete']!='Richard Thompson') & (data_main['Athlete']!='Matt Ryan')] 87 | len(data_main_dlt) # Out: 8605 88 | 89 | # INNER JOIN EXAMPLE 90 | merged_inner=pd.merge(left=data_main,right=country_map_dlt,how='inner',left_on='Athlete',right_on='Athlete') 91 | len(merged_inner) # Out: 8605 92 | 93 | # LEFT JOIN EXAMPLE 94 | merged_left=pd.merge(left=data_main,right=country_map_dlt,how='left',left_on='Athlete',right_on='Athlete') 95 | len(merged_left) # Out: 8618 96 | # Check the athletes which don't have information because of the left join. 97 | merged_left_slt=merged_left[merged_left['Athlete']=='Michael Phelps'] 98 | merged_left_slt 99 | 100 | # RIGHT JOIN EXAMPLE 101 | merged_right=pd.merge(left=data_main_dlt,right=country_map_dp,how='right',left_on='Athlete',right_on='Athlete') 102 | len(merged_right) # Out: 8610 103 | -------------------------------------------------------------------------------- /Ch03/seedRandomNumbers.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Apr 29 14:55:21 2016 4 | 5 | @author: jasonm_dev 6 | """ 7 | import numpy as np 8 | 9 | # No seed is set and a set of new 5 random numbers 10 | # will be generate each time. 11 | for i in range(5): 12 | print (np.random.random()) 13 | 14 | # Seed is set as 1 and generate 5 random numbers. 15 | # The 5 random numbers will be repeated. 16 | np.random.seed(1) 17 | for i in range(5): 18 | print (np.random.random()) 19 | 20 | -------------------------------------------------------------------------------- /Ch03/splitDataTrainTest.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon May 2 10:48:24 2016 4 | 5 | @author: jasonm_dev 6 | """ 7 | 8 | import pandas as pd 9 | import numpy as np 10 | 11 | # METHOD 1 – using the Customer Churn Model 12 | path = '/home/jasonm_dev/coding/learning-python-predictive-analytics/Ch03' 13 | filename = 'Customer Churn Model.txt' 14 | fullpath = path+'/'+filename 15 | data=pd.read_csv(fullpath) 16 | 17 | len(data) 18 | 19 | # Generate set of random numbers with length of data. 20 | a=np.random.randn(len(data)) 21 | # Create filter 22 | check=a<0.8 23 | # Filter training data below 0.8. 24 | training=data[check] 25 | # Filter testing data above 0.8. 26 | testing=data[~check] 27 | 28 | # Check lengths 29 | len(training) 30 | len(testing) 31 | 32 | # METHOD 2 – using sklearn 33 | # The test size specifies the size of the testing dataset: 34 | # 0.2 means that 20 percent of the rows of the dataset should go to testing 35 | # and the remaining 80 percent to training. 
36 | from sklearn.cross_validation import train_test_split 37 | train, test = train_test_split(data, test_size = 0.2) 38 | 39 | # METHOD 3 – using the shuffle function 40 | # Using 'rb' means opening in binary mode 41 | # and create a 'bytes' object used in dataframes. 42 | with open(fullpath,'rb') as f: 43 | #data_shuffle=f.readline().split('\n') 44 | data_shuffle=f.readline() 45 | #data_shuffle=open(fullpath,'r') 46 | #np.random.shuffle(data_shuffle) 47 | #train_data = data_shuffle[:3*len(data_shuffle)/4] 48 | #test_data = data_shuffle[len(data_shuffle)/4:] 49 | 50 | 51 | # Just readline creates a bytes object. 52 | #do a loop like the opdn one and main dict 53 | -------------------------------------------------------------------------------- /Ch03/subsetColsRows.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Apr 29 13:58:58 2016 4 | 5 | @author: jasonm_dev 6 | """ 7 | 8 | import pandas as pd 9 | 10 | path = '/home/jasonm_dev/coding/learning-python-predictive-analytics/Ch03' 11 | filename = 'Customer Churn Model.txt' 12 | fullpath = path+'/'+filename 13 | data=pd.read_csv(fullpath) 14 | data.shape # Output: (3333, 21) 15 | 16 | # Filter data for the first 50 rows. 17 | subdata_first_50=data[['Account Length','VMail Message','Day Calls']][1:50] 18 | subdata_first_50 19 | 20 | # Filter data by 'Day Calls' > 100 21 | data1=data[data['Day Calls']>100] 22 | data1.shape # Output: (1682, 21) 23 | 24 | # Alternative .ix[rowstart:rowend,colstart:colend] 25 | data.ix[1:100,1:6] 26 | data.ix[:,1:6] 27 | data.ix[1:100,[2,5,7]] 28 | data.ix[[1,2,5],['Area Code','VMail Plan','Day Mins']] -------------------------------------------------------------------------------- /Ch03/subsetDataset.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Apr 29 13:35:32 2016 4 | 5 | @author: jasonm_dev 6 | """ 7 | 8 | import pandas as pd 9 | 10 | path = '/home/jasonm_dev/coding/learning-python-predictive-analytics/Ch03' 11 | filename = 'Customer Churn Model.txt' 12 | fullpath = path+'/'+filename 13 | data=pd.read_csv(fullpath) 14 | 15 | # Creates subset of the DataFrame by using square brackets. 16 | # Selecting one column creates a Series object similar to Dataframe 17 | account_length = data['Account Length'] 18 | account_length.head() 19 | type(account_length) # Output: pandas.core.series.Series 20 | 21 | # Creates subset of the DataFrame by using square brackets. 
22 | # Using multiple columns 23 | subdata = data[['Account Length','VMail Message','Day Calls']] 24 | subdata.head() 25 | type(subdata) # Output: pandas.core.frame.DataFrame 26 | 27 | # Alternative 28 | wanted_columns=['Account Length','VMail Message','Day Calls'] 29 | subdata1=data[wanted_columns] 30 | subdata1.head() 31 | 32 | # Alternative 33 | wanted=['Account Length','VMail Message','Day Calls'] 34 | # Gets list of columns names 35 | column_list=data.columns.values.tolist() 36 | # Removes 'wanted' column names from the column_list 37 | sublist=[x for x in column_list if x not in wanted] 38 | subdata2=data[sublist] 39 | subdata2.head() -------------------------------------------------------------------------------- /Ch03/subsetDatasetRows.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Apr 29 13:50:16 2016 4 | 5 | @author: jasonm_dev 6 | """ 7 | 8 | import pandas as pd 9 | 10 | path = '/home/jasonm_dev/coding/learning-python-predictive-analytics/Ch03' 11 | filename = 'Customer Churn Model.txt' 12 | fullpath = path+'/'+filename 13 | data=pd.read_csv(fullpath) 14 | data.shape # Output: (3333, 21) 15 | 16 | # Filter data by 'Day Calls' > 100 17 | data1=data[data['Day Calls']>100] 18 | data1.shape # Output: (1682, 21) 19 | 20 | # Filter data by 'State' > VA 21 | data2=data[data['State']=='VA'] 22 | data2.shape # Output: (77, 21) 23 | 24 | # Filter data by 'Day Calls' > 100 and 'State' > VA 25 | data3=data[(data['Day Calls']>100) & (data['State']=='VA')] 26 | data3.shape # Output: (51, 21) -------------------------------------------------------------------------------- /Ch03/subsetNewCol.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Apr 29 14:06:22 2016 4 | 5 | @author: jasonm_dev 6 | """ 7 | 8 | import pandas as pd 9 | 10 | path = '/home/jasonm_dev/coding/learning-python-predictive-analytics/Ch03' 11 | filename = 'Customer Churn Model.txt' 12 | fullpath = path+'/'+filename 13 | data=pd.read_csv(fullpath) 14 | data.shape # Output: (3333, 21) 15 | 16 | # Create new column by totalling the minutes columns. 
17 | data['Total Mins']=data['Day Mins']+data['Eve Mins']+data['Night Mins'] 18 | data['Total Mins'].head() # Name: Total Mins, dtype: float64 -------------------------------------------------------------------------------- /Ch04/NewspaperSalesCorrelationPlot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonMDev/learning-python-predictive-analytics/6757b9ef3e7bbb803d4a1c00d999225cbb4fe2eb/Ch04/NewspaperSalesCorrelationPlot.png -------------------------------------------------------------------------------- /Ch04/RadioSalesCorrelationPlot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonMDev/learning-python-predictive-analytics/6757b9ef3e7bbb803d4a1c00d999225cbb4fe2eb/Ch04/RadioSalesCorrelationPlot.png -------------------------------------------------------------------------------- /Ch04/TVSalesCorrelationPlot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonMDev/learning-python-predictive-analytics/6757b9ef3e7bbb803d4a1c00d999225cbb4fe2eb/Ch04/TVSalesCorrelationPlot.png -------------------------------------------------------------------------------- /Ch04/linearRegression.py: -------------------------------------------------------------------------------- 1 | 2 | # -*- coding: utf-8 -*- 3 | # Linear Regression 4 | """ 5 | Created on Wed May 4 21:08:54 2016 6 | 7 | @author: jasonm_dev 8 | """ 9 | 10 | import pandas as pd 11 | import numpy as np 12 | 13 | # Check if first file works. 14 | filepath = '/home/jasonm_dev/coding/learning-python-predictive-analytics/datasets' 15 | filename= 'Advertising.csv' 16 | file = filepath+'/'+filename 17 | 18 | advert=pd.read_csv(file) 19 | advert.head() 20 | 21 | # Determine correlation between 22 | # the advertisement costs on TV 23 | # and the resultant sales 24 | advert['corrn']=(advert['TV']-np.mean(advert['TV']))*(advert['Sales']-np.mean(advert['Sales'])) 25 | advert['corrd1']=(advert['TV']-np.mean(advert['TV']))**2 26 | advert['corrd2']=(advert['Sales']-np.mean(advert['Sales']))**2 27 | corrcoeffn=advert.sum()['corrn'] 28 | corrcoeffd1=advert.sum()['corrd1'] 29 | corrcoeffd2=advert.sum()['corrd2'] 30 | corrcoeffd=np.sqrt(corrcoeffd1*corrcoeffd2) 31 | corrcoeff=corrcoeffn/corrcoeffd 32 | corrcoeff #Out: 0.78222442486160604 -------------------------------------------------------------------------------- /Ch04/linearRegressionFunction.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed May 4 21:12:24 2016 4 | 5 | @author: jasonm_dev 6 | """ 7 | 8 | import pandas as pd 9 | import numpy as np 10 | 11 | # Check if first file works. 
12 | filepath = '/home/jasonm_dev/coding/learning-python-predictive-analytics/datasets' 13 | filename= 'Advertising.csv' 14 | file = filepath+'/'+filename 15 | 16 | advert=pd.read_csv(file) 17 | advert.head() 18 | 19 | # Function to determine correlation between var1 and var2 20 | def corrcoeff(df,var1,var2): 21 | df['corrn']=(df[var1]-np.mean(df[var1]))*(df[var2]-np.mean(df[var2])) 22 | df['corrd1']=(df[var1]-np.mean(df[var1]))**2 23 | df['corrd2']=(df[var2]-np.mean(df[var2]))**2 24 | corrcoeffn=df.sum()['corrn'] 25 | corrcoeffd1=df.sum()['corrd1'] 26 | corrcoeffd2=df.sum()['corrd2'] 27 | corrcoeffd=np.sqrt(corrcoeffd1*corrcoeffd2) 28 | corrcoeff=corrcoeffn/corrcoeffd 29 | return corrcoeff 30 | 31 | # Correlation between TV and Radio 32 | Corr_TV_Radio = corrcoeff(advert,'TV','Radio') # Out: 0.05480866446583009 33 | 34 | # Correlation between TV and Newspaper 35 | Corr_TV_Newspaper = corrcoeff(advert,'TV','Newspaper') # Out: 0.056647874965056993 36 | 37 | # Correlation between TV and Sales 38 | Corr_TV_Sales = corrcoeff(advert,'TV','Sales') # Out: 0.78222442486160604 39 | 40 | # Correlation between Radio and Newspaper 41 | Corr_Radio_Newspaper = corrcoeff(advert,'Radio','Newspaper') # Out: 0.35410375076117517 42 | 43 | # Correlation between Radio and Sales 44 | Corr_Radio_Sales = corrcoeff(advert,'Radio','Sales') # Out: 0.5762225745710553 45 | 46 | # Correlation between Newspaper and Sales 47 | Corr_Newspaper_Sales = corrcoeff(advert,'Newspaper','Sales') # Out: 0.22829902637616525 48 | 49 | # Plot correlation of TV and Sales 50 | import matplotlib.pyplot as plt 51 | # %matplotlib inline 52 | #plt.plot(advert['TV'],advert['Sales'],'ro') 53 | #plt.title('TV vs Sales') 54 | 55 | # Plot correlation of Radio and Sales 56 | #plt.plot(advert['Radio'],advert['Sales'],'ro') 57 | #plt.title('Radio vs Sales') 58 | 59 | # Plot correlation of Newspaper and Sales 60 | plt.plot(advert['Newspaper'],advert['Sales'],'ro') 61 | plt.title('Newspaper vs Sales') -------------------------------------------------------------------------------- /Ch05/CurrentVsPredicted1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonMDev/learning-python-predictive-analytics/6757b9ef3e7bbb803d4a1c00d999225cbb4fe2eb/Ch05/CurrentVsPredicted1.png -------------------------------------------------------------------------------- /Ch05/CurrentVsPredictedVsMean1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonMDev/learning-python-predictive-analytics/6757b9ef3e7bbb803d4a1c00d999225cbb4fe2eb/Ch05/CurrentVsPredictedVsMean1.png -------------------------------------------------------------------------------- /Ch05/CurrentVsPredictedVsModel1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonMDev/learning-python-predictive-analytics/6757b9ef3e7bbb803d4a1c00d999225cbb4fe2eb/Ch05/CurrentVsPredictedVsModel1.png -------------------------------------------------------------------------------- /Ch05/MPGVSHorsepower.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonMDev/learning-python-predictive-analytics/6757b9ef3e7bbb803d4a1c00d999225cbb4fe2eb/Ch05/MPGVSHorsepower.png -------------------------------------------------------------------------------- /Ch05/MPGVSHorsepowerModels.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonMDev/learning-python-predictive-analytics/6757b9ef3e7bbb803d4a1c00d999225cbb4fe2eb/Ch05/MPGVSHorsepowerModels.png -------------------------------------------------------------------------------- /Ch05/MPGVSHorsepowerVsLine.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonMDev/learning-python-predictive-analytics/6757b9ef3e7bbb803d4a1c00d999225cbb4fe2eb/Ch05/MPGVSHorsepowerVsLine.png -------------------------------------------------------------------------------- /Ch05/PredictedSalesVsTVAdvertisingCosts.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonMDev/learning-python-predictive-analytics/6757b9ef3e7bbb803d4a1c00d999225cbb4fe2eb/Ch05/PredictedSalesVsTVAdvertisingCosts.png -------------------------------------------------------------------------------- /Ch05/linearRegression.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu May 5 20:26:46 2016 4 | 5 | @author: jasonm_dev 6 | """ 7 | # LINEAR REGRESSION 8 | 9 | import pandas as pd 10 | import numpy as np 11 | import matplotlib.pyplot as plt 12 | # %matplotlib inline 13 | 14 | # Input_Variable(X) 15 | # 100 normally distributed random numbers 16 | # with a mean of 1.5, and standard deviation 2.5 17 | x=2.5*np.random.randn(100)+1.5 18 | 19 | # Residual term (RES) which israndom variable distributed normally 20 | # with a mean of 0 and standard deviation of 0.5. 21 | res=.5*np.random.randn(100)+0 22 | 23 | # Predicted Value (Ye) 24 | # i.e. Predicted_Output(ypred) 25 | # Intercept of 2 and a slope of 0.3 26 | ypred=2+.3*x 27 | 28 | # Actual Value (Ya) 29 | # i.e. Actual_Output(yact) 30 | # We add the random residual. 31 | yact=2+.3*x+res 32 | 33 | # Create a dataframe with above lists. 34 | xlist=x.tolist() # Convert datatype 'numpy.ndarray' to a 'list' 35 | ypredlist=ypred.tolist() # Convert datatype 'numpy.ndarray' to a 'list' 36 | yactlist=yact.tolist() # Convert datatype 'numpy.ndarray' to a 'list' 37 | # Convert lists to a dataframe. 38 | df=pd.DataFrame({'Input_Variable(X)':xlist,'Predicted_Output(ypred)':ypredlist,'Actual_Output(yact)':yactlist}) 39 | df.head() 40 | 41 | # Get the mean of the actual data. 
42 | ymean=np.mean(yact) 43 | yavg=[ymean for i in range(1,len(xlist)+1)] 44 | 45 | 46 | # Calculation of the R-squared or coefficient of determination 47 | # A way to judge the efficacy of the model 48 | # Total Sum of Squares (SST) = SSD + SSR = f(yact-yavg) 49 | # Difference Sum of Squares or SSD = f(yact-ypred) 50 | # Regression Sum of Squares or SSR = f(ypred-yavg) 51 | df['SSR']=(df['Predicted_Output(ypred)']-ymean)**2 52 | df['SST']=(df['Actual_Output(yact)']-ymean)**2 53 | SSR=df.sum()['SSR'] 54 | SST=df.sum()['SST'] 55 | SSR/SST # Out: 0.7354410334035838 56 | 57 | # Calculating alpha and beta coefficients 58 | xmean=np.mean(df['Input_Variable(X)']) 59 | ymean=np.mean(df['Actual_Output(yact)']) 60 | df['beta']=(df['Input_Variable(X)']-xmean)*(df['Actual_Output(yact)']-ymean) 61 | df['xvar']=(df['Input_Variable(X)']-xmean)**2 62 | betan=df.sum()['beta'] 63 | betad=df.sum()['xvar'] 64 | beta=betan/betad 65 | 66 | alpha=ymean-(betan/betad)*xmean 67 | beta,alpha # beta : 0.29063 alpha: 2.04474 68 | 69 | # Generate new colum to incoporate our new parameters or coefficients 70 | df['ymodel']=beta*df['Input_Variable(X)']+alpha 71 | 72 | # Calculation of the R-squared or coefficient of determination 73 | # for the new model. 74 | df['SSR']=(df['ymodel']-ymean)**2 75 | df['SST']=(df['Actual_Output(yact)']-ymean)**2 76 | SSR2=df.sum()['SSR'] 77 | SST2=df.sum()['SST'] 78 | SSR2/SST2 79 | 80 | # Plot the current model. 81 | plt.plot(x,ypred) 82 | plt.plot(x,df['ymodel']) 83 | plt.plot(x,yact,'ro') 84 | plt.plot(x,yavg) 85 | plt.title('Actual vs Predicted vs Model') 86 | 87 | # Residual Standard Error (RSE) 88 | df['RSE']=(df['Actual_Output(yact)']-df['ymodel'])**2 89 | RSEd=df.sum()['RSE'] 90 | RSE=np.sqrt(RSEd/98) 91 | # [1/(n-p-1)]:n=number of data points;p=number of predictor variables 92 | RSE -------------------------------------------------------------------------------- /Ch05/linearRegressionECom.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sat May 7 14:18:59 2016 4 | 5 | @author: jasonm_dev 6 | """ 7 | # Handling other issues in linear regression 8 | import pandas as pd 9 | import numpy as np 10 | from sklearn.linear_model import LinearRegression 11 | 12 | # Import Data from CSV file. 
13 | filepath = '/home/jasonm_dev/coding/learning-python-predictive-analytics/datasets' 14 | filename= 'Ecom Expense.csv' 15 | file = filepath+'/'+filename 16 | 17 | df=pd.read_csv(file) 18 | df.head() 19 | print(df.shape) #Out: (2362, 9) 20 | # Out: ['Transaction ID', 'Age ', ' Items ', 'Monthly Income', 21 | # 'Transaction Time', 'Record', 'Gender', City Tier', 'Total Spend'] 22 | 23 | # Create dummy variables for categorical and qualitive data 24 | dummy_gender=pd.get_dummies(df['Gender'],prefix='Sex') 25 | dummy_city_tier=pd.get_dummies(df['City Tier'],prefix='City') 26 | print(df.shape) #Out: (2362, 9) 27 | 28 | # Add dummy variables to the main data 29 | column_name=df.columns.values.tolist() 30 | df1=df[column_name].join(dummy_gender) 31 | column_name1=df1.columns.values.tolist() 32 | df2=df1[column_name1].join(dummy_city_tier) 33 | df2 34 | print(df2.shape) #Out: (2362, 14) 35 | 36 | # For the preceding dataset, let's assume a linear relationship between 37 | # the output variable 'Total Spend' and the predictor variables: 38 | # 'Monthly Income' and 'Transaction Time', and both set of dummy variables 39 | 40 | # Input Variables 41 | feature_cols = ['Monthly Income','Transaction Time','City_Tier 1','City_Tier 2','City_Tier 3','Sex_Female','Sex_Male'] 42 | X = df2[feature_cols] 43 | # Output Variable 44 | Y = df2['Total Spend'] 45 | lm = LinearRegression() 46 | lm.fit(X,Y) 47 | 48 | # Model Parameters 49 | print (lm.intercept_) # Out: 3655.72940769 50 | print (lm.coef_) 51 | # Out: [ 0.15297825 0.12372609 119.6632516 -16.67901801 -102.9842336 52 | # -94.15779883 94.15779883] 53 | zipped = zip(feature_cols, lm.coef_) 54 | list(zipped) 55 | # Out: 56 | #[('Monthly Income', 0.15297824609320515), 57 | # ('Transaction Time', 0.12372608642620003), 58 | # ('City_Tier 1', 119.66325160390119), 59 | # ('City_Tier 2', -16.679018007990429), 60 | # ('City_Tier 3', -102.98423359591075), 61 | # ('Sex_Female', -94.157798830320132), 62 | # ('Sex_Male', 94.157798830320075)] 63 | 64 | # R2 Score 65 | lm.score(X,Y) # Out: 0.19478920552885381 66 | 67 | # Model written out: 68 | # Total_Spend= 69 | # 3655.72 + 0.12*Transaction Time + 0.15*Monthly Income 70 | # + 119*City_Tier 1-16*City_Tier 2 - 102*City_Tier 3 71 | # -94*Sex_Female+94*Sex_Male 72 | 73 | # Calculate the RSE 74 | df2['total_spend_pred']=3720.72940769 + 0.12*df2['Transaction Time']+0.15*df2['Monthly Income']+119*df2['City_Tier 1']-16*df2['City_Tier 2'] 75 | -102*df2['City_Tier 3']-94*df2['Sex_Female']+94*df2['Sex_Male'] 76 | df2['RSE']=(df2['Total Spend']-df2['total_spend_pred'])**2 77 | RSEd=df2.sum()['RSE'] 78 | RSE=np.sqrt(RSEd/2354) # 2362 - 7 - 1 = 2354 79 | salesmean=np.mean(df2['Total Spend']) 80 | error=RSE/salesmean 81 | RSE,salesmean,error 82 | # Out: (2518.8520388731386, 6163.176415976714, 0.40869380800840849) 83 | 84 | # IMPROVEMENT 85 | # Mask the first variable from the resulting list using the iloc method of subsetting 86 | dummy_gender=pd.get_dummies(df['Gender'],prefix='Sex').iloc[:, 1:] 87 | dummy_city_tier=pd.get_dummies(df['City Tier'],prefix='City').iloc[:, 1:] 88 | column_name=df.columns.values.tolist() 89 | df3=df[column_name].join(dummy_gender) 90 | column_name1=df3.columns.values.tolist() 91 | df4=df3[column_name1].join(dummy_city_tier) 92 | df4 93 | 94 | feature_cols = ['Monthly Income','Transaction Time','City_Tier 2','City_Tier 3','Sex_Male'] 95 | X = df2[feature_cols] 96 | Y = df2['Total Spend'] 97 | lm = LinearRegression() 98 | lm.fit(X,Y) 99 | 100 | # Model Parameters 101 | print (lm.intercept_) # Out: 
3681.23486046 102 | print (lm.coef_) 103 | # Out: [ 1.52978246e-01 1.23726086e-01 -1.36342270e+02 -2.22647485e+02 104 | # 1.88315598e+02] 105 | zipped = zip(feature_cols, lm.coef_) 106 | list(zipped) 107 | # Out: 108 | #[('Monthly Income', 0.15297824609320468), 109 | # ('Transaction Time', 0.12372608642590291), 110 | # ('City_Tier 2', -136.34226961189117), 111 | # ('City_Tier 3', -222.6474851998114), 112 | # ('Sex_Male', 188.31559766064038)] 113 | -------------------------------------------------------------------------------- /Ch05/linearRegressionRFE.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sat May 7 14:03:02 2016 4 | 5 | @author: jasonm_dev 6 | """ 7 | 8 | # Linear regression with scikit-learn 9 | 10 | import pandas as pd 11 | from sklearn.feature_selection import RFE 12 | from sklearn.svm import SVR 13 | 14 | # Import Data from CSV file. 15 | filepath = '/home/jasonm_dev/coding/learning-python-predictive-analytics/datasets' 16 | filename= 'Advertising.csv' 17 | file = filepath+'/'+filename 18 | 19 | advert=pd.read_csv(file) 20 | advert.head() 21 | 22 | # Feature selection with scikit-learn 23 | # Recursive Feature Elimination (RFE) 24 | feature_cols = ['TV', 'Radio','Newspaper'] 25 | X = advert[feature_cols] 26 | Y = advert['Sales'] 27 | # Choose 'linear' model. 28 | estimator = SVR(kernel="linear") 29 | # number of desired variables 30 | selector = RFE(estimator,2,step=1) 31 | selector = selector.fit(X, Y) 32 | 33 | # Selected variables. 34 | selector.support_ # Out: array([ True, True, False], dtype=bool) 35 | # X consists of three variables: TV, radio, and newspaper. 36 | # Newspaper hasn't been selected. 37 | 38 | # Selector ranking 39 | selector.ranking_ # Out: array([1, 1, 2]) 40 | # All the selected variables will have a ranking of 1. 41 | # Rest are shown in descending order. -------------------------------------------------------------------------------- /Ch05/linearRegressionSKL.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sat May 7 14:03:02 2016 4 | 5 | @author: jasonm_dev 6 | """ 7 | 8 | # Linear regression with scikit-learn 9 | 10 | import pandas as pd 11 | from sklearn.linear_model import LinearRegression 12 | from sklearn.cross_validation import train_test_split 13 | 14 | # Import Data from CSV file. 
15 | filepath = '/home/jasonm_dev/coding/learning-python-predictive-analytics/datasets' 16 | filename= 'Advertising.csv' 17 | file = filepath+'/'+filename 18 | 19 | advert=pd.read_csv(file) 20 | advert.head() 21 | 22 | # Split dataset into training and testing 23 | feature_cols = ['TV', 'Radio'] 24 | X = advert[feature_cols] 25 | Y = advert['Sales'] 26 | trainX,testX,trainY,testY = train_test_split(X,Y, test_size = 0.2) 27 | lm = LinearRegression() 28 | lm.fit(trainX, trainY) 29 | 30 | print (lm.intercept_) # Out: 2.98314900713 31 | print (lm.coef_) # Out: [ 'TV': 0.04536014 'Radio': 0.18767089] 32 | 33 | zipped = zip(feature_cols, lm.coef_) 34 | list(zipped) 35 | # Out: 36 | #[('TV', 0.044571627228483394), ('Radio', 0.19465327712760053)] 37 | 38 | # Rsquared 39 | lm.score(trainX, trainY) # Out: 0.89235897920220186 40 | 41 | # The model can be used to predict the value of sales using TV and radio 42 | # variables from the test dataset 43 | lm.predict(testX) -------------------------------------------------------------------------------- /Ch05/linearRegressionSMF.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu May 5 21:05:23 2016 4 | 5 | @author: jasonm_dev 6 | """ 7 | 8 | import pandas as pd 9 | import numpy as np 10 | import statsmodels.formula.api as smf 11 | import matplotlib.pyplot as plt 12 | 13 | # Import Data from CSV file. 14 | filepath = '/home/jasonm_dev/coding/learning-python-predictive-analytics/datasets' 15 | filename= 'Advertising.csv' 16 | file = filepath+'/'+filename 17 | 18 | advert=pd.read_csv(file) 19 | advert.head() 20 | 21 | # SECTION 1: Linear regression using the statsmodel library 22 | # Model Assumption 23 | # Model 1: A linear relationship between advertising costs on TV and sales 24 | # i.e. Sales = f(TV)= alpha + beta*TV 25 | # Created a best fit using the least sum of square method 26 | model1=smf.ols(formula='Sales~TV',data=advert).fit() 27 | model1.params # Intercept(alpha): 7.032594; TV(beta): 0.047537 28 | model1.pvalues # Intercept(alpha): 1.406300e-35; TV(beta): 1.467390e-42 29 | # p-values are very small, therfore parameters are significant. 30 | model1.rsquared # 0.61187505085007099 31 | model1.summary() 32 | # the F-statistic for this model is very high 33 | # and the associated p-value is negligible, 34 | # suggesting that the parameter estimates for this model 35 | # were all significant and non-zero. 
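# Illustrative aside (assumes model1 fitted above): parameter significance can
# also be read off the 95% confidence intervals reported by statsmodels;
# intervals that exclude zero agree with the small p-values noted above.
model1.conf_int()  # lower/upper bounds for Intercept and TV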
36 | 37 | # Predict the values of sales based on the equation 38 | sales_pred=model1.predict(pd.DataFrame(advert['TV'])) 39 | sales_pred 40 | 41 | # Calculate RSE term for model 1 42 | advert['sales_pred']=0.047537*advert['TV']+7.03 43 | advert['RSE']=(advert['Sales']-advert['sales_pred'])**2 44 | RSEd=advert.sum()['RSE'] 45 | RSE=np.sqrt(RSEd/198) # # Df Residuals (n-p-1): 200-1-1 = 198 46 | # [1/(n-p-1)]:n=number of data points;p=number of predictor variables 47 | salesmean=np.mean(advert['Sales']) 48 | error=RSE/salesmean 49 | RSE,salesmean,error 50 | # (3.2586573692471279, 14.022500000000003, 0.23238776033140504) 51 | # The current model carries a 23% error and the R2 is 0.61 < 0.9 52 | # F-statistic: 312.1 53 | 54 | # Plot the Sales predicted vs TV Advertising costs 55 | #%matplotlib inline 56 | advert.plot(kind='scatter', x='TV', y='Sales') 57 | plt.plot(pd.DataFrame(advert['TV']),sales_pred,c='red',linewidth=2) 58 | plt.title('Predicted Sales vs TV Advertising Costs') 59 | 60 | 61 | # SECTION 2: Multiple linear regression 62 | # Model 2: 63 | # Sales = f(TV,Newspaper)= alpha + beta1*TV+ beta2*Newspaper 64 | model2=smf.ols(formula='Sales~TV+Newspaper',data=advert).fit() 65 | model2.params 66 | # Intercept(alpha): 5.774948; TV(beta1): 0.046901; Newspaper(beta2): 0.044219 67 | model2.pvalues 68 | # Intercept(alpha): 3.145860e-22; TV(beta1): 5.507584e-44; 69 | # Newspaper(beta2): 2.217084e-05 70 | # p-values are very small, therfore parameters are significant. 71 | model2.rsquared # 0.64583549382932715 72 | model2.summary() 73 | 74 | # Predict the values of sales based on the equation of model 2 75 | sales_pred2=model2.predict(advert[['TV','Newspaper']]) 76 | sales_pred2 77 | 78 | # Calculate RSE term for model 2 79 | advert['sales_pred2']=5.77 + 0.046*advert['TV'] + 0.04*advert['Newspaper'] 80 | advert['RSE2']=(advert['Sales']-advert['sales_pred2'])**2 81 | RSEd2=advert.sum()['RSE2'] 82 | RSE2=np.sqrt(RSEd2/197) # Df Residuals (n-p-1): 200-2-1 = 197 83 | # [1/(n-p-1)]:n=number of data points;p=number of predictor variables 84 | salesmean=np.mean(advert['Sales']) 85 | error2=RSE2/salesmean 86 | RSE2,salesmean,error2 87 | # (3.1346969895743846, 14.022500000000003, 0.22354765481008265) 88 | # The current model carries a 22% error and the R2 is 0.64 < 0.9 89 | # F-statistic: 179.6 90 | 91 | 92 | # Model 3: 93 | # Sales = f(TV,Radio)= alpha + beta1*TV+ beta2*Radio 94 | model3=smf.ols(formula='Sales~TV+Radio',data=advert).fit() 95 | model3.params 96 | # Intercept(alpha): 2.921100; TV(beta1): 0.045755; Radio(beta2): 0.187994 97 | model3.pvalues 98 | # Intercept(alpha): 4.565557e-19; TV(beta1): 5.436980e-82; 99 | # Radio(beta2): 9.776972e-59 100 | # p-values are very small, therfore parameters are significant. 
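# The 95% confidence intervals behind these estimates can be pulled out directly:
model3.conf_int()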
101 | model3.rsquared # 0.89719426108289568 102 | model3.summary() 103 | 104 | # Predict the values of sales based on the equation of model 3 105 | sales_pred3=model3.predict(advert[['TV','Radio']]) 106 | sales_pred3 107 | 108 | # Calculate RSE term for model 3 109 | advert['sales_pred3']=2.92 + 0.045*advert['TV'] + 0.18*advert['Radio'] 110 | advert['RSE3']=(advert['Sales']-advert['sales_pred3'])**2 111 | RSEd3=advert.sum()['RSE3'] 112 | RSE3=np.sqrt(RSEd3/197) # Df Residuals (n-p-1): 200-2-1 = 197 113 | # [1/(n-p-1)]:n=number of data points;p=number of predictor variables 114 | salesmean=np.mean(advert['Sales']) 115 | error3=RSE3/salesmean 116 | RSE3,salesmean,error3 117 | # (1.7136206211553162, 14.022500000000003, 0.12220507193120456) 118 | # The current model carries a 12% error and the R2 is 0.89 < 0.9 119 | # F-statistic: 859.6 => indicating a very efficient model. 120 | 121 | 122 | # Model 4: 123 | # Sales = f(TV,Radio)= alpha + beta1*TV + beta2*Radio + beta3*Newspaper 124 | model4=smf.ols(formula='Sales~TV+Radio+Newspaper',data=advert).fit() 125 | model4.params 126 | # Intercept(alpha): 2.938889; TV(beta1): 0.045765; 127 | # Radio(beta2): 0.188530; Newspaper(beta3): -0.001037 128 | model4.pvalues 129 | # Intercept(alpha): 1.267295e-17; TV(beta1): 1.509960e-81; 130 | # Radio(beta2): 1.505339e-54; Newspaper(beta3): 8.599151e-01 131 | # p-values are very small, therfore parameters are significant. 132 | model4.rsquared # 0.89721063817895219 133 | model4.summary() 134 | 135 | # Predict the values of sales based on the equation of model 4 136 | sales_pred4=model4.predict(advert[['TV','Radio','Newspaper']]) 137 | sales_pred4 138 | 139 | # Calculate RSE term for model 4 140 | advert['sales_pred4']=2.938 + 0.045*advert['TV'] + 0.188*advert['Radio'] - 0.001*advert['Newspaper'] 141 | advert['RSE4']=(advert['Sales']-advert['sales_pred4'])**2 142 | RSEd4=advert.sum()['RSE4'] 143 | RSE4=np.sqrt(RSEd4/196) # Df Residuals (n-p-1): 200-3-1 = 196 144 | # [1/(n-p-1)]:n=number of data points;p=number of predictor variables 145 | salesmean=np.mean(advert['Sales']) 146 | error4=RSE4/salesmean 147 | RSE4,salesmean,error4 148 | # (1.691523011857319, 14.022500000000003, 0.12062920391209261) 149 | # The current model carries a 12% error and the R2 is 0.89 < 0.9 150 | # F-statistic: 570.3 => 151 | # This suggests that the partial benefit of adding newspaper to the model 152 | # containing TV and radio is negative. 153 | # RSE does not increase as book says. It decreases from 1.71 to 1.69. 154 | 155 | 156 | # Multi-collinearity 157 | # Calculate the Variance Inflation Factor 158 | # It is a method to quantify the rise in the variability of the coefficient 159 | # estimate of a particular variable because of high correlation between two or 160 | # more than two predictor variables. 161 | 162 | # VIF for the Newspaper 163 | modelVIF1=smf.ols(formula='Newspaper~TV+Radio',data=advert).fit() 164 | rsquared1=modelVIF1.rsquared 165 | VIF1=1/(1-rsquared1) 166 | VIF1 # Out: 1.1451873787239286 167 | 168 | # VIF for the Radio 169 | modelVIF2=smf.ols(formula='Radio~TV+Newspaper',data=advert).fit() 170 | rsquared2=modelVIF2.rsquared 171 | VIF2=1/(1-rsquared2) 172 | VIF2 # Out: 1.1449519171055353 173 | 174 | # VIF for the TV 175 | modelVIF3=smf.ols(formula='TV~Newspaper+Radio',data=advert).fit() 176 | rsquared3=modelVIF3.rsquared 177 | VIF3=1/(1-rsquared3) 178 | VIF3 # Out: 1.0046107849396502 179 | 180 | # Summary: 181 | # Newspaper and Radio have the same VIF and are thus correlated with one another. 
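# (All three VIFs are close to 1, so multi-collinearity is mild in this dataset.)
# The three VIF fits above follow one pattern and could equally be produced in a
# loop; a minimal sketch, reusing the advert DataFrame and smf import from above:
# for col in ['TV', 'Radio', 'Newspaper']:
#     others = '+'.join(c for c in ['TV', 'Radio', 'Newspaper'] if c != col)
#     vif = 1/(1 - smf.ols(formula=col + '~' + others, data=advert).fit().rsquared)
#     print(col, vif)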
182 | # Model 3 with TV and Radio is superior to Model 2 with TV and Newspaper. 183 | # Model 4 with all 3 variable is actually weaker than Model 3. 184 | 185 | # Training and testing data split 186 | a=np.random.randn(len(advert)) 187 | check=a<0.8 188 | training=advert[check] # Out: 152 189 | testing=advert[~check] # Out: 48 190 | 191 | # Model 5: [model will changeeach time its run because of random generator.] 192 | # Sales = f(TV,Radio)= alpha + beta1*TV+ beta2*Radio 193 | model5=smf.ols(formula='Sales~TV+Radio',data=training).fit() 194 | model5.params 195 | # Intercept(alpha): 2.771009; TV(beta1): 0.047188; Radio(beta2): 0.185030 196 | model5.pvalues 197 | # Intercept(alpha): 6.613587e-13; TV(beta1): 2.625145e-62; 198 | # Radio(beta2): 1.803356e-41 199 | # p-values are very small, therfore parameters are significant. 200 | model5.rsquared # 0.89415688916044844 201 | model5.summary() # F-statistic: 629.4 202 | 203 | # Predict the values of sales based on the equation of model 5 using testing data 204 | sales_pred5=model5.predict(training[['TV','Radio']]) 205 | sales_pred5 206 | 207 | # Calculate RSE term for model 5 208 | testing['sales_pred5']=2.7710 + 0.0472*testing['TV'] + 0.1850*testing['Radio'] 209 | testing['RSE5']=(testing['Sales']-testing['sales_pred5'])**2 210 | RSEd5=testing.sum()['RSE5'] 211 | RSE5=np.sqrt(RSEd5/45) # len(testing) = 48; (n-p-1): 48-2-1 = 45 212 | # [1/(n-p-1)]:n=number of data points;p=number of predictor variables 213 | salesmean=np.mean(testing['Sales']) 214 | error5=RSE5/salesmean 215 | RSE5,salesmean,error5 216 | # (1.4032428224556619, 14.120833333333335, 0.099373938444779819) 217 | # The current model carries a 11% error and the R2 is 0.89 < 0.9 -------------------------------------------------------------------------------- /Ch05/nonlinearRegression.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sat May 7 14:56:09 2016 4 | 5 | @author: jasonm_dev 6 | """ 7 | 8 | # Transforming a variable to fit non-linear relations 9 | import pandas as pd 10 | import numpy as np 11 | import matplotlib.pyplot as plt 12 | from sklearn.linear_model import LinearRegression 13 | from sklearn.preprocessing import PolynomialFeatures 14 | from sklearn import linear_model 15 | 16 | # Import Data from CSV file. 17 | filepath = '/home/jasonm_dev/coding/learning-python-predictive-analytics/datasets' 18 | filename= 'Auto.csv' 19 | file = filepath+'/'+filename 20 | 21 | data=pd.read_csv(file) 22 | data.head() 23 | print(data.shape) #Out: (406, 9) 24 | 25 | # Plot data to check linearity 26 | #%matplotlib inline 27 | data['mpg']=data['mpg'].dropna() 28 | data['horsepower']=data['horsepower'].dropna() 29 | plt.plot(data['horsepower'],data['mpg'],'ro') 30 | plt.xlabel('Horsepower') 31 | plt.ylabel('MPG (Miles Per Gallon)') 32 | 33 | # Model 1 34 | # Try linear model. 
35 | # MPG = co + alpha*HP 36 | X=data['horsepower'].fillna(data['horsepower'].mean()) 37 | Y=data['mpg'].fillna(data['mpg'].mean()) 38 | lm=LinearRegression() 39 | lm.fit(X[:,np.newaxis],Y) 40 | 41 | # Plot Again 42 | #plt.plot(data['horsepower'],data['mpg'],'ro') 43 | #plt.plot(X,lm.predict(X[:,np.newaxis]),color='blue') 44 | 45 | # R2 score 46 | lm.score(X[:,np.newaxis],Y) # Out: 0.57465334064502505 47 | 48 | # Alternative method for RSE 49 | RSEd=(Y-lm.predict(X[:,np.newaxis]))**2 50 | RSE1=np.sqrt(np.sum(RSEd)/389) 51 | ymean=np.mean(Y) 52 | error1=RSE1/ymean 53 | RSE1,error1 # Out: (5.1496254786975237, 0.21899719414044677) 54 | 55 | # Model 2 56 | # In the form of mpg = co+a1.horsepower**2, 57 | X2=data['horsepower'].fillna(data['horsepower'].mean())*data['horsepower'].fillna(data['horsepower'].mean()) 58 | Y2=data['mpg'].fillna(data['mpg'].mean()) 59 | lm2=LinearRegression() 60 | lm2.fit(X2[:,np.newaxis],Y2) 61 | 62 | type(lm2.predict(X2[:,np.newaxis])) 63 | RSEd=(Y2-lm2.predict(X2[:,np.newaxis]))**2 64 | RSE2=np.sqrt(np.sum(RSEd)/390) 65 | ymean=np.mean(Y2) 66 | error2=RSE2/ymean 67 | RSE2,error2,ymean 68 | # Out: (5.6591995312606125, 0.24066775798625065, 23.51457286432162) 69 | 70 | # R2 score 71 | lm2.score(X2[:,np.newaxis],Y2) # Out: 0.48498870348232048 72 | 73 | print (lm2.intercept_) # Out: 30.405683105 74 | print (lm2.coef_) # Out:[ 0. -0.43404318 0.00112615] 75 | 76 | # Model 3 77 | # Attempt polynomial fit with 2 degrees 78 | X3=data['horsepower'].fillna(data['horsepower'].mean()) 79 | Y3=data['mpg'].fillna(data['mpg'].mean()) 80 | poly = PolynomialFeatures(degree=2) 81 | X3_ = poly.fit_transform(X3[:,np.newaxis]) 82 | clf3 = linear_model.LinearRegression() 83 | clf3.fit(X3_, Y3) 84 | 85 | print (clf3.intercept_) # Out: 55.0261924471 86 | print (clf3.coef_) # Out: [-0.00055043] 87 | 88 | # R2 score # R2 = 0.688 89 | clf3.score(X3_,Y3) # Out: 0.6439066584257469 90 | 91 | # Model 4 92 | # Attempt polynomial fit with 5 degrees 93 | X5=data['horsepower'].fillna(data['horsepower'].mean()) 94 | Y5=data['mpg'].fillna(data['mpg'].mean()) 95 | poly = PolynomialFeatures(degree=5) 96 | X5_ = poly.fit_transform(X5[:,np.newaxis]) 97 | clf5 = linear_model.LinearRegression() 98 | clf5.fit(X5_, Y5) 99 | 100 | print (clf5.intercept_) # Out: -40.6939920548 101 | print (clf5.coef_) 102 | # Out:[ 0.00000000e+00 4.00021890e+00 -7.54802463e-02 6.19621638e-04 103 | # -2.36220983e-06 3.41983064e-09] 104 | 105 | # R2 = 0.7 106 | clf5.score(X5_,Y5) # Out: 0.6547512491826567 107 | 108 | # Model 5 109 | # Try y = 1/x or 1/x2 110 | 111 | 112 | # Plot All 113 | XP = np.arange(45,248,0.5) 114 | M2 = 30.405683105 -0.00055043*XP**2 115 | M3 = 55.0261924471 - 0.43404318*XP + 0.00112615*XP**2 116 | M4 = -40.6939920548 + 4.00021890e+00*XP -7.54802463e-02*XP**2 + 6.19621638e-04*XP**3 -2.36220983e-06*XP**4 + 3.41983064e-09*XP**5 117 | 118 | plt.plot(data['horsepower'],data['mpg'],'ro') # Actual Data 119 | plt.plot(XP,lm.predict(XP[:,np.newaxis]),color='magenta') 120 | plt.plot(XP,M2,color='blue') # Model 2 121 | plt.plot(XP,M3,color='green') # Model 3 122 | plt.plot(XP,M4,color='yellow') # Model 4 -------------------------------------------------------------------------------- /Ch06/Histogram of Age.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonMDev/learning-python-predictive-analytics/6757b9ef3e7bbb803d4a1c00d999225cbb4fe2eb/Ch06/Histogram of Age.png -------------------------------------------------------------------------------- 
/Ch06/Purchase Frequency for Day of Week'.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonMDev/learning-python-predictive-analytics/6757b9ef3e7bbb803d4a1c00d999225cbb4fe2eb/Ch06/Purchase Frequency for Day of Week'.png -------------------------------------------------------------------------------- /Ch06/Purchase Frequency for Education Level.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonMDev/learning-python-predictive-analytics/6757b9ef3e7bbb803d4a1c00d999225cbb4fe2eb/Ch06/Purchase Frequency for Education Level.png -------------------------------------------------------------------------------- /Ch06/Purchase Frequency for Month of the Year.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonMDev/learning-python-predictive-analytics/6757b9ef3e7bbb803d4a1c00d999225cbb4fe2eb/Ch06/Purchase Frequency for Month of the Year.png -------------------------------------------------------------------------------- /Ch06/ROC Curve.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonMDev/learning-python-predictive-analytics/6757b9ef3e7bbb803d4a1c00d999225cbb4fe2eb/Ch06/ROC Curve.png -------------------------------------------------------------------------------- /Ch06/Stacked Bar Chart of Marital Status vs Purchase.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonMDev/learning-python-predictive-analytics/6757b9ef3e7bbb803d4a1c00d999225cbb4fe2eb/Ch06/Stacked Bar Chart of Marital Status vs Purchase.png -------------------------------------------------------------------------------- /Ch06/logisticRegression.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sat May 7 21:48:26 2016 4 | 5 | @author: jasonm_dev 6 | """ 7 | 8 | # Understanding the math behind logistic regression 9 | import pandas as pd 10 | import numpy as np 11 | 12 | # Import Data from CSV file. 13 | filepath = '/home/jasonm_dev/coding/learning-python-predictive-analytics/datasets' 14 | filename= 'Gender Purchase.csv' 15 | file = filepath+'/'+filename 16 | 17 | df=pd.read_csv(file) 18 | df.head() 19 | print(df.shape) #Out: (511, 2) 20 | 21 | # Contingency table for the dataset 22 | contingency_table=pd.crosstab(df['Gender'],df['Purchase']) 23 | contingency_table 24 | # Add horizontally 25 | contingency_table.sum(axis=1) 26 | # Add vertically 27 | contingency_table.sum(axis=0) 28 | 29 | # Calculate the proportions 30 | contingency_table.astype('float').div(contingency_table.sum(axis=1),axis=0) 31 | 32 | -------------------------------------------------------------------------------- /Ch06/logisticRegressionImplementation.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed May 11 20:15:24 2016 4 | 5 | @author: jasonm_dev 6 | """ 7 | 8 | # Implementing logistic regression with Python 9 | import pandas as pd 10 | import numpy as np 11 | import matplotlib.pyplot as plt 12 | 13 | # Import Data from CSV file. 
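# Note: bank.csv is semicolon-delimited, hence the sep=';' argument below. The
# hard-coded filepath assumes this repository's local checkout location; a more
# portable sketch (assuming the script stays inside Ch06/) would be:
# import os
# file = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'datasets', 'bank.csv')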
14 | filepath = '/home/jasonm_dev/coding/learning-python-predictive-analytics/datasets' 15 | filename= 'bank.csv' 16 | file = filepath+'/'+filename 17 | 18 | bank=pd.read_csv(file, sep=';') 19 | bank.head() 20 | print(bank.shape) #Out: (4119, 21) 21 | 22 | # Column Names 23 | bank.columns.values 24 | 25 | # Out: array(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan', 26 | # 'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays', 27 | # 'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx', 28 | # 'cons.conf.idx', 'euribor3m', 'nr.employed', 'y'], dtype=object) 29 | 30 | # Type of the column from the dataset 31 | bank.dtypes 32 | #Out[6]: 33 | #age int64 34 | #job object 35 | #marital object 36 | #education object 37 | #default object 38 | #housing object 39 | #loan object 40 | #contact object 41 | #month object 42 | #day_of_week object 43 | #duration int64 44 | #campaign int64 45 | #pdays int64 46 | #previous int64 47 | #poutcome object 48 | #emp.var.rate float64 49 | #cons.price.idx float64 50 | #cons.conf.idx float64 51 | #euribor3m float64 52 | #nr.employed float64 53 | #y object 54 | #dtype: object 55 | 56 | # Processing the data 57 | # the 'y' column is the customer variable with outcome'yes' and 'no'. 58 | # Convert column to something that can be used, i.e. '1' and '0' 59 | bank['y']=(bank['y']=='yes').astype(int) 60 | 61 | # Education column has many categories and needs to be reduced. 62 | bank['education'].unique() 63 | 64 | # The basic category has been repeated three times probably to 65 | # capture 4, 6, and 9 years of education. Let us club these three together 66 | # and call them basic. Other modified as well. 67 | bank['education']=np.where(bank['education'] =='basic.9y', 'Basic', bank['education']) 68 | bank['education']=np.where(bank['education'] =='basic.6y', 'Basic', bank['education']) 69 | bank['education']=np.where(bank['education'] =='basic.4y', 'Basic', bank['education']) 70 | bank['education']=np.where(bank['education'] =='university.degree', 'University Degree', bank['education']) 71 | bank['education']=np.where(bank['education'] =='professional.course', 'Professional Course', bank['education']) 72 | bank['education']=np.where(bank['education'] =='high.school', 'High School', bank['education']) 73 | bank['education']=np.where(bank['education'] =='illiterate', 'Illiterate', bank['education']) 74 | bank['education']=np.where(bank['education'] =='unknown', 'Unknown', bank['education']) 75 | 76 | # Data exploration 77 | # The number of people who purchased the term deposit 78 | bank['y'].value_counts() # Out: Out[12]: [ '0' 3668, '1' 451] 79 | 80 | # Many numbers, so lets gets an overview. 81 | bank.groupby('y').mean() 82 | # Categorical means 83 | bank.groupby('education').mean() 84 | 85 | # Data visualization 86 | 87 | # Tabular data 88 | pd.crosstab(bank.education,bank.y) 89 | # %matplotlib inline 90 | #pd.crosstab(bank.education,bank.y).plot(kind='bar') 91 | #plt.title('Purchase Frequency for Education Level') 92 | #plt.xlabel('Education') 93 | #plt.ylabel('Frequency of Purchase') 94 | 95 | # Stacked bar chart of marital staus and purchase of term deposit. 
96 | #table=pd.crosstab(bank.marital,bank.y) 97 | #table.div(table.sum(1).astype(float), axis=0).plot(kind='bar', stacked=True) 98 | #plt.title('Stacked Bar Chart of Marital Status vs Purchase') 99 | #plt.xlabel('Marital Status') 100 | #plt.ylabel('Proportion of Customers') 101 | 102 | # Bar chart of Purchase Frequency for Day of Week' 103 | #pd.crosstab(bank.day_of_week,bank.y).plot(kind='bar') 104 | #plt.title('Purchase Frequency for Day of Week') 105 | #plt.xlabel('Day of Week') 106 | #plt.ylabel('Frequency of Purchase') 107 | 108 | # Bar chart of Purchase Frequency for Day of Week' 109 | #pd.crosstab(bank.month,bank.y).plot(kind='bar') 110 | #plt.title('Purchase Frequency for Month of the Year') 111 | #plt.xlabel('Month of the Year') 112 | #plt.ylabel('Frequency of Purchase') 113 | 114 | # Histogram of Age 115 | #bank.age.hist() 116 | #plt.title('Histogram of Age') 117 | #plt.xlabel('Age') 118 | #plt.ylabel('Frequency') 119 | 120 | # Creating dummy variables for categorical variables 121 | cat_vars=['job','marital','education','default','housing','loan','contact','month','day_of_week','poutcome'] 122 | for var in cat_vars: 123 | cat_list='var'+'_'+var 124 | cat_list = pd.get_dummies(bank[var], prefix=var) 125 | bank1=bank.join(cat_list) 126 | bank=bank1 127 | 128 | # Remove actual categories once dummies have been created 129 | bank_vars=bank.columns.values.tolist() 130 | to_keep=[i for i in bank_vars if i not in cat_vars] 131 | 132 | # Subset the bank dataframe to only keep the columns present 133 | bank_final=bank[to_keep] 134 | bank_final.columns.values 135 | 136 | # Y outcomes and X predictors can now be calculated 137 | bank_final_vars=bank_final.columns.values.tolist() 138 | Y=['y'] 139 | X=[i for i in bank_final_vars if i not in Y ] 140 | 141 | # Feature selection 142 | # All 12 columns can be selected 143 | from sklearn import datasets 144 | from sklearn.feature_selection import RFE 145 | from sklearn.linear_model import LogisticRegression 146 | 147 | model = LogisticRegression() 148 | 149 | # Selct a model with 12 variables. 150 | rfe = RFE(model, 12) 151 | rfe = rfe.fit(bank_final[X],bank_final[Y] ) 152 | 153 | # Print out the support array 154 | print(rfe.support_) 155 | # Print out the ranking 156 | print(rfe.ranking_) 157 | # The columns with true or 1 shall be selected for the final selection. 158 | 159 | # 'previous', 'euribor3m', 'job_entrepreneur', 'job_self-employed', 160 | # 'poutcome_success', 'poutcome_failure', 'month_oct', 'month_may','month_mar', 161 | # 'month_jun', 'month_jul', 'month_dec' 162 | 163 | # Fit a logistic regression model using the preceding selected variables 164 | # as predictor variables, with the y as the outcome variable 165 | cols=['previous', 'euribor3m', 'job_entrepreneur', 'job_self-employed', 'poutcome_success', 'poutcome_failure', 'month_oct', 'month_may', 166 | 'month_mar', 'month_jun', 'month_jul', 'month_dec'] 167 | # Dataframe taht just has the selected columns 168 | X=bank_final[cols] 169 | Y=bank_final['y'] 170 | 171 | # Implementing the model 172 | import statsmodels.api as sm 173 | logit_model=sm.Logit(Y,X) 174 | result=logit_model.fit() 175 | print (result.summary()) 176 | 177 | # The statsmodel.api method can be used while exploring and fine-tuning the model. 178 | # One advantage of this method is that p-values are calculated automatically 179 | # in the result summary. 180 | # The scikit-learn method can be used in the final model used to predict the outcome. 
181 | # The scikit-learn method doesn't report p-values automatically,
182 | # but is more powerful for calculation-intensive tasks such as prediction,
183 | # calculating scores, and advanced functions such as feature selection.
184 | 
185 | # Fit the model
186 | from sklearn import linear_model
187 | clf = linear_model.LogisticRegression()
188 | clf.fit(X, Y)
189 | 
190 | # Calculate the accuracy
191 | clf.score(X,Y) #Out = 0.90216071862102454
192 | # The value comes out to be .902. The mean value of the outcome is .11,
193 | # meaning that the outcome is positive (1) around 11% of the time and negative
194 | # around 89% of the time.
195 | 
196 | # Get the values of the coefficients
197 | zipped = list(zip(X.columns, np.transpose(clf.coef_)))
198 | pd.DataFrame(zipped)
199 | 
200 | # Out:
201 | # 0 1
202 | #0 previous [0.379831612876]
203 | #1 euribor3m [-0.502749071837]
204 | #2 job_entrepreneur [-0.343066155888]
205 | #3 job_self-employed [-0.335064163493]
206 | #4 poutcome_success [1.07783253323]
207 | #5 poutcome_failure [-0.753161867894]
208 | #6 month_oct [0.411855745929]
209 | #7 month_may [-0.743089630936]
210 | #8 month_mar [1.2703612295]
211 | #9 month_jun [0.509694983142]
212 | #10 month_jul [0.382087449085]
213 | #11 month_dec [0.873316799315]
214 | 
215 | # Model validation and evaluation
216 | # Split into training and testing sets
217 | from sklearn.cross_validation import train_test_split
218 | X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)
219 | 
220 | # Build a logistic regression model over the training dataset
221 | from sklearn import linear_model
222 | from sklearn import metrics
223 | clf1 = linear_model.LogisticRegression()
224 | clf1.fit(X_train, Y_train)
225 | 
226 | # Get probabilities and classifications
227 | probs = clf1.predict_proba(X_test)
228 | 
229 | # Out: [ Negative, Positive]
230 | # array([[ 0.93352157, 0.06647843],
231 | # ...,
232 | # [ 0.24746608, 0.75253392]])
233 | 
234 | # Get predicted outcomes
235 | predicted = clf1.predict(X_test)
236 | print(predicted) # Out: [0 0 0 ..., 0 0 1]
237 | # The default cut-off is 0.5.
238 | # We saw that only about 10% of customers bought the product, hence a 0.1 cut-off.
239 | 
240 | # Changing the threshold value
241 | prob=probs[:,1] # Take second column, i.e.
positive outcomes 242 | prob_df=pd.DataFrame(prob) # Push to dataframe 243 | prob_df['predict']=np.where(prob_df[0]>=0.10,1,0) 244 | prob_df.head() 245 | # [ @0.1 => 28%, @0.15 => 18%, @0.05 => 65%] 246 | 247 | # Accuracy of the model 248 | print (metrics.accuracy_score(Y_test, predicted)) # Out: 0.902103559871 249 | 250 | # Cross validation 251 | # Using the k-fold method 252 | # Use a 8-fold cross validation method 253 | # CAlculates the accuracy of each iteration 254 | from sklearn.cross_validation import cross_val_score 255 | scores = cross_val_score(linear_model.LogisticRegression(), X, Y, scoring='accuracy', cv=8) 256 | print (scores) 257 | # Out: [ 0.91860465 0.90310078 0.89534884 0.90679612 0.89883268 258 | # 0.89299611 0.90466926 0.89883268] 259 | print (scores.mean()) # Out: 0.902397639921 260 | 261 | 262 | # Model Validation 263 | # ROC Curve 264 | # Run model and calculate the probabilities for each observation 265 | X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0) 266 | clf1 = linear_model.LogisticRegression() 267 | clf1.fit(X_train, Y_train) 268 | probs = clf1.predict_proba(X_test) 269 | 270 | # Each probable value is compared to threshold probability and categorized as 271 | # 1 (postive outcome) 272 | prob=probs[:,1] 273 | prob_df=pd.DataFrame(prob) 274 | prob_df['predict']=np.where(prob_df[0]>=0.05,1,0) 275 | prob_df['actual']=Y_test #TODO: Comes out as NAN 276 | prob_df.head() 277 | 278 | # Confusion matrix 279 | confusion_matrix=pd.crosstab(prob_df['actual'],prob_df['predict']) 280 | confusion_matrix 281 | 282 | # Plot ROC curve manually 283 | 284 | #%matplotlib inline 285 | Sensitivity=[1,0.95,0.87,0.62,0.67,0.59,0.5,0.41,0] 286 | FPR=[1,0.76,0.62,0.23,0.27,0.17,0.12,0.07,0] 287 | #plt.plot(FPR,Sensitivity,marker='o',linestyle='--',color='r') 288 | x=[i*0.01 for i in range(100)] 289 | y=[i*0.01 for i in range(100)] 290 | #plt.plot(x,y) 291 | #plt.xlabel('(1-Specificity)') 292 | #plt.ylabel('Sensitivity') 293 | #plt.title('ROC Curve') 294 | 295 | # Using scikit-learn package to plot the ROC Curve 296 | #TODO: 297 | from sklearn import metrics 298 | from ggplot import * 299 | 300 | prob = clf1.predict_proba(X_test)[:,1] 301 | fpr, sensitivity, _ = metrics.roc_curve(Y_test, prob) 302 | 303 | df = pd.DataFrame(dict(fpr=fpr, sensitivity=sensitivity)) 304 | ggplot(df, aes(x='fpr', y='sensitivity')) + geom_line() +\ 305 | geom_abline(linetype='dashed') 306 | 307 | # Area under the curve 308 | auc = metrics.auc(fpr,sensitivity) 309 | auc 310 | 311 | # Area under curve can be plotted. 
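# If the ggplot package is unavailable (see ISSUELOG.md), the same curve can be
# drawn with matplotlib, which is already imported in this script, e.g.:
#plt.plot(fpr, sensitivity, color='r', label='ROC curve (AUC = %0.3f)' % auc)
#plt.plot([0,1], [0,1], linestyle='--')
#plt.xlabel('(1-Specificity)')
#plt.ylabel('Sensitivity')
#plt.title('ROC Curve')
#plt.legend(loc='lower right')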
312 | ggplot(df, aes(x='fpr', ymin=0, ymax='sensitivity')) +\ 313 | geom_area(alpha=0.2) +\ 314 | geom_line(aes(y='sensitivity')) +\ 315 | ggtitle("ROC Curve w/ AUC=%s" % str(auc)) 316 | 317 | -------------------------------------------------------------------------------- /Ch06/logisticRegressionScratch.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue May 10 20:28:13 2016 4 | 5 | @author: jasonm_dev 6 | """ 7 | 8 | # Building the logistic regression model from scratch 9 | # Step 1: defining the likelihood function 10 | def likelihood(y,pi): 11 | import numpy as np 12 | ll=1 13 | ll_in=list(range(1,len(y)+1)) 14 | for i in range(len(y)): 15 | ll_in[i]=np.where(y[i]==1,pi[i],(1-pi[i])) 16 | ll=ll*ll_in[i] 17 | return ll 18 | 19 | # Step 2: calculating probability for each observation 20 | def logitprob(X,beta): 21 | import numpy as np 22 | rows=np.shape(X)[0] 23 | cols=np.shape(X)[1] 24 | pi=list(range(1,rows+1)) 25 | expon=list(range(1,rows+1)) 26 | for i in range(rows): 27 | expon[i]=0 28 | for j in range(cols): 29 | ex=X[i][j]*beta[j] 30 | expon[i]=ex+expon[i] 31 | with np.errstate(divide='ignore', invalid='ignore'): 32 | pi[i]=np.exp(expon[i])/(1+np.exp(expon[i])) 33 | return pi 34 | 35 | # Step 3: Calculate the W diagonal matrix 36 | def findW(pi): 37 | import numpy as np 38 | W=np.zeros(len(pi)*len(pi)).reshape(len(pi),len(pi)) 39 | for i in range(len(pi)): 40 | print (i) 41 | W[i,i]=pi[i]*(1-pi[i]) 42 | W[i,i].astype(float) 43 | return W 44 | 45 | # Step 4: defining the logistic function 46 | def logistic(X,Y,limit): 47 | import numpy as np 48 | from numpy import linalg 49 | nrow=np.shape(X)[0] 50 | bias=np.ones(nrow).reshape(nrow,1) 51 | X_new=np.append(X,bias,axis=1) 52 | ncol=np.shape(X_new)[1] 53 | beta=np.zeros(ncol).reshape(ncol,1) 54 | root_diff=np.array(range(1,ncol+1)).reshape(ncol,1) 55 | iter_i=10000 56 | while(iter_i>limit): 57 | print (iter_i, limit) 58 | pi=logitprob(X_new,beta) 59 | print (pi) 60 | W=findW(pi) 61 | print (W) 62 | print (X_new) 63 | print (Y-np.transpose(pi)) 64 | print (np.array((linalg.inv(np.matrix(np.transpose(X_new))*np.matrix(W)*np.matrix(X_new)))*(np.transpose(np.matrix(X_new))*np.matrix(Y-np.transpose(pi)).transpose()))) 65 | print (beta) 66 | print (type(np.matrix(np.transpose(Y-np.transpose(pi)))) ) 67 | print (np.matrix(Y-np.transpose(pi)).transpose().shape) 68 | print (np.matrix(np.transpose(X_new)).shape) 69 | root_diff=np.array((linalg.inv(np.matrix(np.transpose(X_new))*np.matrix(W)*np.matrix(X_new)))*(np.transpose(np.matrix(X_new))*np.matrix(Y-np.transpose(pi)).transpose())) 70 | beta=beta+root_diff 71 | iter_i=np.sum(root_diff*root_diff) 72 | ll=likelihood(Y,pi) 73 | print (beta) 74 | print (beta.shape) 75 | return beta 76 | 77 | # Testing the model 78 | import numpy as np 79 | X=np.array(range(10)).reshape(10,1) 80 | Y=[0,0,0,0,1,0,1,0,1,1] 81 | bias=np.ones(10).reshape(10,1) 82 | X_new=np.append(X,bias,axis=1) 83 | 84 | # Running logistic Regression using our function 85 | a=logistic(X,Y,0.000000001) 86 | ll=likelihood(Y,logitprob(X,a)) 87 | #Coefficient of X = 0.66 , Intercept = -3.69 88 | 89 | # From stasmodel.api 90 | import statsmodels.api as sm 91 | logit_model=sm.Logit(Y,X_new) 92 | result=logit_model.fit() 93 | print (result.summary()) 94 | #Coefficient of X = 0.66, Intercept = -3.69 -------------------------------------------------------------------------------- /Ch07/Histogram of Clusters.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonMDev/learning-python-predictive-analytics/6757b9ef3e7bbb803d4a1c00d999225cbb4fe2eb/Ch07/Histogram of Clusters.png -------------------------------------------------------------------------------- /Ch07/Histogramn of Cluster Labels.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonMDev/learning-python-predictive-analytics/6757b9ef3e7bbb803d4a1c00d999225cbb4fe2eb/Ch07/Histogramn of Cluster Labels.png -------------------------------------------------------------------------------- /Ch07/clusterWine.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri May 13 13:33:18 2016 4 | 5 | @author: jasonm_dev 6 | """ 7 | 8 | # Classify wine by chemical composition 9 | import pandas as pd 10 | import matplotlib.pyplot as plt 11 | 12 | # Import Data from CSV file. 13 | filepath = '/home/jasonm_dev/coding/learning-python-predictive-analytics/datasets' 14 | filename= 'wine.csv' 15 | file = filepath+'/'+filename 16 | df=pd.read_csv(file,sep=';') 17 | df.head() 18 | 19 | # Plot data to have a look at the quality 20 | #% matplotlib inline 21 | #plt.hist(df['quality']) 22 | 23 | # Check the mean of the quality 24 | df.groupby('quality').mean() 25 | 26 | # Normalizing the values in the dataset 27 | df_norm = (df - df.min()) / (df.max() - df.min()) 28 | df_norm.head() 29 | 30 | # Hierarchical clustering using scikit-learn 31 | from sklearn.cluster import AgglomerativeClustering 32 | ward = AgglomerativeClustering(n_clusters=6, linkage='ward').fit(df_norm) 33 | md=pd.Series(ward.labels_) 34 | ward.children_ 35 | 36 | # Plot the histogram of cluster labels 37 | #plt.hist(md) 38 | #plt.title('Histogram of Cluster Label') 39 | #plt.xlabel('Cluster') 40 | #plt.ylabel('Frequency') 41 | 42 | # K-Means clustering using scikit-learn 43 | # fits the k-means clustering model to the wine dataset 44 | from sklearn.cluster import KMeans 45 | from sklearn import datasets 46 | model=KMeans(n_clusters=6) 47 | model.fit(df_norm) 48 | 49 | # an array depicting the cluster the row belongs to 50 | model.labels_ 51 | # Out: array([4, 4, 4, ..., 0, 0, 3], dtype=int32) 52 | 53 | # Make the array apart of the dataframe 54 | md=pd.Series(model.labels_) 55 | df_norm['clust']=md 56 | df_norm.head() 57 | 58 | # Centroids for each cluster 59 | model.cluster_centers_ 60 | 61 | # j-score 62 | model.inertia_ 63 | 64 | # Plot histogram of the cluster 65 | plt.hist(df_norm['clust']) 66 | plt.title('Histogram of Clusters') 67 | plt.xlabel('Cluster') 68 | plt.ylabel('Frequency') 69 | 70 | # Calculate the mean of the composition for each cluster and component 71 | df_norm.groupby('clust').mean() -------------------------------------------------------------------------------- /Ch07/kMeanClustering.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri May 13 13:15:01 2016 4 | 5 | @author: jasonm_dev 6 | """ 7 | # K-means clustering 8 | import numpy as np 9 | 10 | # Define an observation set of 30x3 11 | obs=np.random.random(90).reshape(30,3) 12 | obs 13 | 14 | # I decided that I want two clusters 15 | c1=np.random.choice(range(len(obs))) 16 | c2=np.random.choice(range(len(obs))) 17 | clust_cen=np.vstack([obs[c1],obs[c2]]) 18 | clust_cen # 2 rows in array correspond to 2 cluster centroids. 
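# What the assignment step below does: each observation goes to the centroid with
# the smallest Euclidean distance. For the first observation this is simply:
d = np.sqrt(((obs[0] - clust_cen)**2).sum(axis=1)) # distance to each of the 2 centroids
d.argmin() # index (0 or 1) of the nearer centroid; the vq call below repeats this for every row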
19 | 20 | # Implement k-menas clustering 21 | from scipy.cluster.vq import vq 22 | vq(obs,clust_cen) 23 | 24 | # First array tells us which cluster the observation belongs to. 25 | # '0' for c1, '1' for c2 26 | # i.e. obs1 is with c2, obs2 is with c1 27 | # Second array tells us how far the observation is from it cluster centroid. 28 | # obs1 is 0.25 units away from c2 cluster centroid 29 | # obs1 is 0.49 units away from c1 cluster centroid 30 | #(array([1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 31 | # 0, 0, 1, 1, 0, 0, 1], dtype=int32), 32 | # array([ 0.24932073, 0.49594229, 0.28098465, 0.50348212, 0. , 33 | # 0.61496506, 0.26303013, 0.42779775, 0.59958318, 0.3468564 , 34 | # 0.40935109, 0.58624004, 0.42803874, 0.78335592, 0.50565815, 35 | # 0.61892717, 0.57338804, 0.51580769, 0.37107392, 0.54979847, 36 | # 0.48482825, 0.5257047 , 0.50568491, 0.43748909, 0.71436479, 37 | # 0. , 0.39646343, 0.47429546, 0.21875716, 0.59853208])) 38 | 39 | # FInd the cluster centroid for the two centroids 40 | from scipy.cluster.vq import kmeans 41 | kmeans(obs,clust_cen) 42 | # The two rows in the array correspond to the two final cluster centroids. 43 | # At the end, J-score, which we seek to minimize 44 | # (array([[ 0.62260732, 0.69445579, 0.50227104], 45 | # [ 0.37635439, 0.32446748, 0.32121864]]), 0.36366199194289345) 46 | 47 | 48 | # Alternatively, just provide the number of required clusters. 49 | from scipy.cluster.vq import kmeans 50 | kmeans(obs,2) -------------------------------------------------------------------------------- /Ch08/decisionTreeIris.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri May 13 15:11:39 2016 4 | 5 | @author: jasonm_dev 6 | """ 7 | # Implementing a decision tree with scikit-learn 8 | 9 | import pandas as pd 10 | import numpy as np 11 | 12 | # Import Data from CSV file. 13 | filepath = '/home/jasonm_dev/coding/learning-python-predictive-analytics/datasets' 14 | filename= 'iris.csv' 15 | file = filepath+'/'+filename 16 | data=pd.read_csv(file) 17 | data.head() 18 | 19 | # Unique species 20 | data['Species'].unique() 21 | # Out: array(['setosa', 'versicolor', 'virginica'], dtype=object) 22 | 23 | # 1st get the predictor and the target variables 24 | colnames=data.columns.values.tolist() 25 | predictors=colnames[:4] 26 | target=colnames[4] 27 | 28 | # Split into training and test data 29 | # Generate a uniform random distribution of numbers between 0 and 1. 30 | # train data selected is any data which has a number less than 0.75. 31 | # Complement goes to the test data 32 | data['is_train'] = np.random.uniform(0, 1, len(data)) <= .75 33 | train, test = data[data['is_train']==True], data[data['is_train']==False] 34 | 35 | # Create a decision tree. 36 | from sklearn.tree import DecisionTreeClassifier 37 | dt = DecisionTreeClassifier(criterion='entropy',min_samples_split=20, random_state=99) 38 | dt.fit(train[predictors], train[target]) 39 | # min_samples_split specifies the mnimum number of observations required 40 | # to split a node into a subnode. 41 | # Default = 2 42 | # Recommended = 20 43 | 44 | 45 | # Test predicted model 46 | # Predicts class (species) of the flower via decision tree 47 | preds=dt.predict(test[predictors]) 48 | # Creates a tablecomparing the Actual species and the predicted species. 
49 | pd.crosstab(test['Species'],preds,rownames=['Actual'],colnames=['Predictions']) 50 | 51 | # Visualizing the tree 52 | # Create a .dot file from the Decision Tree Classifier 53 | from sklearn.tree import export_graphviz 54 | dotfilename= 'dtree2.dot' 55 | dotfiles = filepath+'/'+dotfilename 56 | with open(dotfiles, 'w') as dotfile: 57 | export_graphviz(dt, out_file = dotfile, feature_names = predictors) 58 | dotfile.close() 59 | 60 | # Rendering a dotfile into a tree 61 | # After installing graphviz 62 | from os import system 63 | 64 | system("dot -Tpng //home/jasonm_dev/coding/learning-python-predictive-analytics/datasets/dtree2.dot -o //home/jasonm_dev/coding/learning-python-predictive-analytics/datasets/dtree2.png") 65 | 66 | # Cross validate the etire dataset. 67 | X=data[predictors] 68 | Y=data[target] 69 | dt1 = DecisionTreeClassifier(criterion='entropy',max_depth=5, min_samples_split=20, random_state=99) 70 | dt1.fit(X,Y) 71 | # Import the cross validation methods from sklearn and perform the cross validation 72 | from sklearn.cross_validation import KFold 73 | crossvalidation = KFold(n=X.shape[0], n_folds=10, shuffle=True, random_state=1) 74 | from sklearn.cross_validation import cross_val_score 75 | score = np.mean(cross_val_score(dt1, X, Y, scoring='accuracy', cv=crossvalidation, n_jobs=1)) 76 | score #Out: 0.93333333333333335 77 | 78 | # Feature importance test 79 | # Higher the value, the higher the feature importance 80 | dt1.feature_importances_ 81 | # Out: array([ 0. , 0. , 0.66869158, 0.33130842]) 82 | # Out: ['Sepal.Length', 'Sepal.Width', 'Petal.Length', 'Petal.Width', 'Species'] 83 | # 1st: petal.length then petal width -------------------------------------------------------------------------------- /Ch08/dtree2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonMDev/learning-python-predictive-analytics/6757b9ef3e7bbb803d4a1c00d999225cbb4fe2eb/Ch08/dtree2.png -------------------------------------------------------------------------------- /Ch08/randomForest.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon May 16 20:01:46 2016 4 | 5 | @author: jasonm_dev 6 | """ 7 | """ 8 | Implementing a regression tree using Python 9 | """ 10 | import pandas as pd 11 | import numpy as np 12 | 13 | # Import Data from CSV file. 14 | filepath = '/home/jasonm_dev/coding/learning-python-predictive-analytics/datasets' 15 | filename= 'Boston.csv' 16 | file = filepath+'/'+filename 17 | data=pd.read_csv(file) 18 | data.head() #Out: (506, 14) 19 | 20 | # First 13 varaibles set as predictor variables 21 | # and the last (MEDV) as the target variable 22 | colnames=data.columns.values.tolist() 23 | predictors=colnames[:13] 24 | target=colnames[13] 25 | X=data[predictors] 26 | Y=data[target] 27 | 28 | # Build the random forest model. 29 | from sklearn.ensemble import RandomForestRegressor 30 | # Node size(min_samples_leaf): not so important here 31 | # Number of trees (n_estimators): generally around 500 32 | # Number of predictors sampled: 2 - 5 33 | # number of jobs running parallel (n_jobs) 34 | rf = RandomForestRegressor(n_jobs=2,oob_score=True,n_estimators=10) 35 | rf.fit(X,Y) 36 | 37 | # The predicted values can be obtained 38 | rf.oob_prediction_ 39 | #Let us now make the predictions a part of the data frame and have a look at it. 
40 | data['rf_pred']=rf.oob_prediction_ 41 | cols=['rf_pred','medv'] 42 | data[cols].head() 43 | 44 | # To calculate a mean squared error we use oob predicted and actual values. 45 | data['rf_pred']=rf.oob_prediction_ 46 | data['err']=(data['rf_pred']-data['medv'])**2 47 | sum(data['err'])/506 # Out[23]: 23.031183503507172 48 | 49 | # oob score 50 | rf.oob_score_ # Out[24]: 0.72718189300945413 51 | 52 | # Try with bigger sample 53 | rf2 = RandomForestRegressor(n_jobs=2,oob_score=True,n_estimators=500) 54 | rf2.fit(X,Y) 55 | data['rf2_pred']=rf2.oob_prediction_ 56 | cols2=['rf2_pred','medv'] 57 | data[cols2].head() 58 | data['rf2_pred']=rf2.oob_prediction_ 59 | data['err2']=(data['rf2_pred']-data['medv'])**2 60 | sum(data['err2'])/506 # Out[23]: 10.05342135115402 61 | rf2.oob_score_ # Out[24]: 0.88091122710291847 -------------------------------------------------------------------------------- /Ch08/regressionTree.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon May 16 20:01:46 2016 4 | 5 | @author: jasonm_dev 6 | """ 7 | """ 8 | Implementing a regression tree using Python 9 | """ 10 | import pandas as pd 11 | import numpy as np 12 | 13 | # Import Data from CSV file. 14 | filepath = '/home/jasonm_dev/coding/learning-python-predictive-analytics/datasets' 15 | filename= 'Boston.csv' 16 | file = filepath+'/'+filename 17 | data=pd.read_csv(file) 18 | data.head() #Out: (506, 14) 19 | 20 | # First 13 varaibles set as predictor variables 21 | # and the last (MEDV) as the target variable 22 | colnames=data.columns.values.tolist() 23 | predictors=colnames[:13] 24 | target=colnames[13] 25 | X=data[predictors] 26 | Y=data[target] 27 | 28 | # Build the regression tree model. 29 | from sklearn.tree import DecisionTreeRegressor 30 | # min number of observation per node for split: 30 31 | # min number of observations per node to classify as leaf: 10 32 | regression_tree = DecisionTreeRegressor(min_samples_split=30,min_samples_leaf=10,random_state=0) 33 | regression_tree.fit(X,Y) 34 | 35 | # Use model to make predictions 36 | reg_tree_pred=regression_tree.predict(data[predictors]) 37 | data['pred']=reg_tree_pred 38 | cols=['pred','medv'] 39 | # Compare prediction with actual 40 | data[cols] 41 | 42 | # Cross-validate the model and check accuracy 43 | from sklearn.cross_validation import KFold 44 | from sklearn.cross_validation import cross_val_score 45 | 46 | crossvalidation = KFold(n=X.shape[0], n_folds=10,shuffle=True, random_state=1) 47 | score = np.mean(cross_val_score(regression_tree, X, Y,scoring='mean_squared_error', cv=crossvalidation,n_jobs=1)) 48 | score #Out[14]: -20.107307036443846 49 | 50 | # The feature importance can be checked 51 | regression_tree.feature_importances_ 52 | """Out[16]: 53 | array([ 0.03421203, 0. , 0.00116059, 0. , 0.01856163, 54 | 0.6308568 , 0.01725115, 0.00137451, 0. , 0.00236983, 55 | 0.00933325, 0. , 0.28488021]) 56 | In [8]: colnames 57 | Out[8]: 58 | ['crim', 59 | 'zn', 60 | 'indus', 61 | 'chas', 62 | 'nox', 63 | 'rm', 64 | 'age', 65 | 'dis', 66 | 'rad', 67 | 'tax', 68 | 'ptratio', 69 | 'black', 70 | 'lstat', 71 | 'medv'] 72 | 73 | The most important varaibles are age, lstat and rm in ascending order. 74 | Highest values have the most impoprtance. DOn't agree with selected variables. 
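Note: pairing the importance array with colnames suggests that rm (~0.63) and
lstat (~0.28) carry almost all of the importance, with crim (~0.03) a distant
third; one quick way to see the ordering is, for example:
sorted(zip(regression_tree.feature_importances_, predictors), reverse=True)[:3]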
75 | """ 76 | 77 | -------------------------------------------------------------------------------- /ISSUELOG.md: -------------------------------------------------------------------------------- 1 | #ISSUELOG 2 | 3 | - [ ] Open file in chapter2 4 | - [ ] Add y=1/x to mpg vs hp model 5 | - [ ] File: logisticRegressionImplementation.py 6 | - [ ] After installing ggplot within conda, still does not work 7 | - [ ] dframe show nan instead of zero which affect the confusion matrix -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Predictive Analytics with Python 2 | 3 | These are my notes from working through the book 4 | [*Learning Predictive Analytics with Python*](https://www.packtpub.com/big-data-and-business-intelligence/learning-predictive-analytics-python) 5 | by [Ashish Kumar](https://in.linkedin.com/in/ashishk64) 6 | and published on Feb 2016. 7 | 8 | ## General 9 | ###Chapter 1: Getting Started with Predictive Modelling 10 | - [x] Installed Anaconda Package. 11 | - [x] Python3.5 has been installed. 12 | - [x] Book follows python2, so some codes is modified along the way for python3. 13 | 14 | ###Chapter 2: Data Cleaning 15 | - [x] Reading the data: variations and examples 16 | - [x] Data frames and delimiters. 17 | 18 | ####Case 1: Reading a dataset using the read_csv method 19 | - [x] File: titanicReadCSV.py 20 | - [x] File: titanicReadCSV1.py 21 | - [x] File: readCustomerChurn.py 22 | - [x] File: readCustomerChurn2.py 23 | - [x] File: changeDelimiter.py 24 | 25 | ####Case 2: Reading a dataset using the open method of Python 26 | - [x] File: readDatasetByOpenMethod.py 27 | 28 | ####Case 3: Reading data from a URL 29 | - [x] Modified the code that it works and prints out line by line dictionary of the dataset. 30 | - [x] File: readURLLib2Iris.py 31 | - [x] File: readURLMedals.py 32 | 33 | ####Case 4: Miscellaneous cases 34 | - [x] File: readXLS.py 35 | - [x] Created the file above to read from both .xls an .xlsx 36 | 37 | ####Basics: Summary, dimensions, and structure 38 | - [x] File: basicDataCheck.py 39 | - [x] Created the file above to read from both .xls an .xlsx 40 | 41 | ####Handling missing values 42 | - [x] File: basicDataCheck.py 43 | - [x] RE: Treating missing data like NaN or None 44 | - [x] Deletion orr imputaion 45 | 46 | ####Creating dummy variables 47 | - [x] File: basicDataCheck.py 48 | - [x] Split into new variable 'sex_female' and 'sex_male' 49 | - [x] Remove column 'sex' 50 | - [x] Add both dummy column created above. 
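A minimal sketch of the dummy-variable step above (the Titanic column is named `sex`; the exact calls in basicDataCheck.py may differ slightly):

```python
import pandas as pd

data = pd.read_csv('Ch02/titanic3.csv')
dummy_sex = pd.get_dummies(data['sex'], prefix='sex')  # -> sex_female, sex_male
data = data.drop('sex', axis=1).join(dummy_sex)
```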
51 | 52 | ####Visualizing a dataset by basic plotting 53 | - [x] File: plotData.py 54 | - [x] Figure file: ScatterPlots.jpeg 55 | - [x] Plot Types: Scatterplot, Histograms and boxplots 56 | 57 | ###Chapter 3: Data Wrangling 58 | ####Subsetting a dataset 59 | - [x] Selecting Columns 60 | - [x] File: subsetDataset.py 61 | - [x] Selecting Rows 62 | - [x] File: subsetDatasetRows.py 63 | - [x] Selecting a combination of rows and columns 64 | - [x] File: subsetColRows.py 65 | - [x] Creating new columns 66 | - [x] File: subsetNewCol.py 67 | 68 | ####Generating random numbers and their usage 69 | - [x] Various methods for generating random numbers 70 | - [x] File: generateRandomNumbers.py 71 | - [x] Seeding a random number 72 | - [x] File: generateRandomNumbers.py 73 | - [x] Generating random numbers following probability distributions 74 | - [x] File: generateRandomProbDistr.py 75 | - [x] Probability density function: PDF = Prob(X=x) 76 | - [x] Cumulative density function: CDF(x) = Prob(X<=x) 77 | - [x] Uniform distribution: random variables occur with the same (uniform) frequency/probability 78 | - [x] Normal distribution: Bell Curve and most ubiquitous and versatile probability distribution 79 | - [x] Using the Monte-Carlo simulation to find the value of pi 80 | - [x] File: calcPi.py 81 | - [x] Geometry and mathematics behind the calculation of pi 82 | - [x] Generating a dummy data frame 83 | - [x] File: generateDummyDataFrame.py 84 | 85 | ####Grouping the data – aggregation, filtering, and transformation 86 | - [x] File: groupData.py 87 | - [x] Grouping 88 | - [x] Aggregation 89 | - [x] Filtering 90 | - [x] Transformation 91 | - [x] Miscellaneous operations 92 | 93 | ####Random sampling – splitting a dataset in training and testing datasets 94 | - [ ] File: splitDataTrainTest.py 95 | - [x] Method 1: using the Customer Churn Model 96 | - [x] Method 2: using sklearn 97 | - [ ] Method 3: using the shuffle function 98 | 99 | ####Concatenating and appending data 100 | - [x] File: concatenateAndAppend.py 101 | - [x] File: appendManyFiles.py 102 | 103 | ####Merging/joining datasets 104 | - [x] File: mergeJoin.py 105 | - [x] Inner Join 106 | - [x] Left Join 107 | - [x] Right Join 108 | - [x] An example of the Inner Join 109 | - [x] An example of the Left Join 110 | - [x] An example of the Right Join 111 | - [x] Summary of Joins in terms of their length 112 | 113 | ###Chapter 4: Statistical Concepts for Predictive Modelling 114 | ####Random sampling and central limit theorem 115 | ####Hypothesis testing 116 | - [x] Null versus alternate hypothesis 117 | - [x] Z-statistic and t-statistic 118 | - [x] Confidence intervals, significance levels, and p-values 119 | - [x] Different kinds of hypothesis test 120 | - [x] A step-by-step guide to do a hypothesis test 121 | - [x] An example of a hypothesis test 122 | 123 | ####Chi-square testing 124 | ####Correlation 125 | - [x] File: linearRegression.py 126 | - [x] File: linearRegressionFunction.py 127 | - [x] Picture: TVSalesCorrelationPlot.png 128 | - [x] Picture: RadioSalesCorrelationPlot.png 129 | - [x] Picture: NewspaperSalesCorrelationPlot.png 130 | 131 | ###Chapter 5: Linear Regression with Python 132 | ####Understanding the maths behind linear regression 133 | - [x] Linear regression using simulated data 134 | - [x] File: linearRegression.py 135 | - [x] Picture: CurrentVsPredicted1.png 136 | - [x] Picture: CurrentVsPredictedVsMean1.png 137 | - [x] Picture: CurrentVsPredictedVsModel1.png 138 | 139 | ####Making sense of result parameters 140 | - [x] File: 
linearRegression.py 141 | - [x] p-values 142 | - [x] F-statistics 143 | - [x] Residual Standard Error (RSE) 144 | 145 | ####Implementing linear regression with Python 146 | - [x] File: linearRegressionSMF.py 147 | - [x] Linear regression using the statsmodel library 148 | - [x] Multiple linear regression 149 | - [x] Multi-collinearity: sub-optimal performance of the model 150 | - [x] Variance Inflation Factor 151 | - [x] It is a method to quantify the rise in the variability of the coefficient estimate of a particular variable because of high correlation between two or more than two predictor variables. 152 | 153 | ####Model validation 154 | - [x] Training and testing data split 155 | - [x] File: linearRegressionSMF.py 156 | - [x] Linear regression with scikit-learn 157 | - [x] File: linearRegressionSKL.py 158 | - [x] Feature selection with scikit-learn 159 | - [x] Recursive Feature Elimination (RFE) 160 | - [x] File: linearRegressionRFE.py 161 | 162 | ####Handling other issues in linear regression 163 | - [x] Handling categorical variables 164 | - [x] File: linearRegressionECom.py 165 | - [x] Transforming a variable to fit non-linear relations 166 | - [x] File: nonlinearRegression.py 167 | - [x] Picture: MPGVSHorsepower.png 168 | - [x] Picture: MPGVSHorsepowerVsLine.png 169 | - [x] Picture: MPGVSHorsepowerModels.png 170 | - [x] Handling outliers 171 | - [x] Other considerations and assumptions for linear regression 172 | 173 | ###Chapter 6: Logistic Regression with Python 174 | ####Linear regression versus logistic regression 175 | ####Understanding the math behind logistic regression 176 | - [x] File: logisticRegression.py 177 | - [x] Contingency tables 178 | - [x] Conditional probability 179 | - [x] Odds ratio 180 | - [x] Moving on to logistic regression from linear regression 181 | - [x] Estimation using the Maximum Likelihood Method 182 | - [x] Building the logistic regression model from scratch 183 | - [x] File: logisticRegressionScratch.py 184 | - [ ] Read above again. 185 | - [x] Making sense of logistic regression parameters 186 | - [x] Wald test 187 | - [x] Likelihood Ratio Test statistic 188 | - [x] Chi-square test 189 | - [x] 190 | 191 | ####Implementing logistic regression with Python 192 | - [x] File: logisticRegressionImplementation.py 193 | - [x] Processing the data 194 | - [x] Data exploration 195 | - [x] Data visualization 196 | - [x] Creating dummy variables for categorical variables 197 | - [x] Feature selection 198 | - [x] Implementing the model 199 | 200 | ####Model validation and evaluation 201 | - [x] File: logisticRegressionImplementation.py 202 | - [x] Cross validation 203 | 204 | ####Model validation 205 | - [x] File: logisticRegressionImplementation.py 206 | - [x] The ROC curve {see terms} 207 | 208 | ###Chapter 7: Clustering with Python 209 | ####Introduction to clustering – what, why, and how? 210 | - [x] What is clustering? 211 | - [x] How is clustering used? 212 | - [x] Why do we do clustering? 
213 | 
214 | ####Mathematics behind clustering
215 | - [x] Distances between two observations
216 | - [x] Euclidean distance
217 | - [x] Manhattan distance
218 | - [x] Minkowski distance
219 | - [x] The distance matrix
220 | - [x] Normalizing the distances
221 | - [x] Linkage methods
222 | - [x] Single linkage
223 | - [x] Complete linkage
224 | - [x] Average linkage
225 | - [x] Centroid linkage
226 | - [x] Ward's method (uses the ANOVA approach)
227 | - [x] Hierarchical clustering
228 | - [x] K-means clustering
229 | - [x] File: kMeanClustering.py
230 | 
231 | ####Implementing clustering using Python
232 | - [x] File: clusterWine.py
233 | - [x] Importing and exploring the dataset
234 | - [x] Normalizing the values in the dataset
235 | - [x] Hierarchical clustering using scikit-learn
236 | - [x] K-Means clustering using scikit-learn
237 | - [x] Interpreting the cluster
238 | 
239 | ####Fine-tuning the clustering
240 | - [x] The elbow method
241 | - [x] Silhouette Coefficient
242 | 
243 | ###Chapter 8: Trees and Random Forests with Python
244 | ####Introducing decision trees
245 | - [x] A decision tree
246 | 
247 | ####Understanding the mathematics behind decision trees
248 | - [x] Homogeneity
249 | - [x] Entropy
250 | - [x] Information gain
251 | - [x] ID3 algorithm to create a decision tree
252 | - [x] Gini index
253 | - [x] Reduction in Variance
254 | - [x] Pruning a tree
255 | - [x] Handling a continuous numerical variable
256 | - [x] Handling a missing value of an attribute
257 | 
258 | ####Implementing a decision tree with scikit-learn
259 | - [x] File: decisionTreeIris.py
260 | - [x] Visualizing the tree
261 | - [x] Picture: dtree2.png
262 | - [x] File: dtree2.dot
263 | - [x] Cross-validating and pruning the decision tree
264 | 
265 | ####Understanding and implementing regression trees
266 | - [x] File: regressionTree.py
267 | - [x] Regression tree algorithm
268 | - [x] Implementing a regression tree using Python
269 | 
270 | ####Understanding and implementing random forests
271 | - [x] File: randomForest.py
272 | - [x] The random forest algorithm
273 | - [x] Implementing a random forest using Python
274 | - [x] Why do random forests work?
275 | - [x] Important parameters for random forests 276 | 277 | 278 | ###Chapter 9: Best Practices for Predictive Modelling 279 | ####Best practices for coding 280 | - [x] Commenting the codes 281 | - [x] Defining functions for substantial individual tasks 282 | - [x] Example 1 283 | - [x] Example 2 284 | - [x] Example 3 285 | - [x] Avoid hard-coding of variables as much as possible 286 | - [x] Version control 287 | - [x] Using standard libraries, methods, and formulas 288 | 289 | ####Best practices for data handling 290 | 291 | ####Best practices for algorithms 292 | 293 | ####Best practices for statistics 294 | 295 | ####Best practices for business contexts 296 | 297 | 298 | 299 | -------------------------------------------------------------------------------- /datasets/Advertising.csv: -------------------------------------------------------------------------------- 1 | TV,Radio,Newspaper,Sales 2 | 230.1,37.8,69.2,22.1 3 | 44.5,39.3,45.1,10.4 4 | 17.2,45.9,69.3,9.3 5 | 151.5,41.3,58.5,18.5 6 | 180.8,10.8,58.4,12.9 7 | 8.7,48.9,75,7.2 8 | 57.5,32.8,23.5,11.8 9 | 120.2,19.6,11.6,13.2 10 | 8.6,2.1,1,4.8 11 | 199.8,2.6,21.2,10.6 12 | 66.1,5.8,24.2,8.6 13 | 214.7,24,4,17.4 14 | 23.8,35.1,65.9,9.2 15 | 97.5,7.6,7.2,9.7 16 | 204.1,32.9,46,19 17 | 195.4,47.7,52.9,22.4 18 | 67.8,36.6,114,12.5 19 | 281.4,39.6,55.8,24.4 20 | 69.2,20.5,18.3,11.3 21 | 147.3,23.9,19.1,14.6 22 | 218.4,27.7,53.4,18 23 | 237.4,5.1,23.5,12.5 24 | 13.2,15.9,49.6,5.6 25 | 228.3,16.9,26.2,15.5 26 | 62.3,12.6,18.3,9.7 27 | 262.9,3.5,19.5,12 28 | 142.9,29.3,12.6,15 29 | 240.1,16.7,22.9,15.9 30 | 248.8,27.1,22.9,18.9 31 | 70.6,16,40.8,10.5 32 | 292.9,28.3,43.2,21.4 33 | 112.9,17.4,38.6,11.9 34 | 97.2,1.5,30,9.6 35 | 265.6,20,0.3,17.4 36 | 95.7,1.4,7.4,9.5 37 | 290.7,4.1,8.5,12.8 38 | 266.9,43.8,5,25.4 39 | 74.7,49.4,45.7,14.7 40 | 43.1,26.7,35.1,10.1 41 | 228,37.7,32,21.5 42 | 202.5,22.3,31.6,16.6 43 | 177,33.4,38.7,17.1 44 | 293.6,27.7,1.8,20.7 45 | 206.9,8.4,26.4,12.9 46 | 25.1,25.7,43.3,8.5 47 | 175.1,22.5,31.5,14.9 48 | 89.7,9.9,35.7,10.6 49 | 239.9,41.5,18.5,23.2 50 | 227.2,15.8,49.9,14.8 51 | 66.9,11.7,36.8,9.7 52 | 199.8,3.1,34.6,11.4 53 | 100.4,9.6,3.6,10.7 54 | 216.4,41.7,39.6,22.6 55 | 182.6,46.2,58.7,21.2 56 | 262.7,28.8,15.9,20.2 57 | 198.9,49.4,60,23.7 58 | 7.3,28.1,41.4,5.5 59 | 136.2,19.2,16.6,13.2 60 | 210.8,49.6,37.7,23.8 61 | 210.7,29.5,9.3,18.4 62 | 53.5,2,21.4,8.1 63 | 261.3,42.7,54.7,24.2 64 | 239.3,15.5,27.3,15.7 65 | 102.7,29.6,8.4,14 66 | 131.1,42.8,28.9,18 67 | 69,9.3,0.9,9.3 68 | 31.5,24.6,2.2,9.5 69 | 139.3,14.5,10.2,13.4 70 | 237.4,27.5,11,18.9 71 | 216.8,43.9,27.2,22.3 72 | 199.1,30.6,38.7,18.3 73 | 109.8,14.3,31.7,12.4 74 | 26.8,33,19.3,8.8 75 | 129.4,5.7,31.3,11 76 | 213.4,24.6,13.1,17 77 | 16.9,43.7,89.4,8.7 78 | 27.5,1.6,20.7,6.9 79 | 120.5,28.5,14.2,14.2 80 | 5.4,29.9,9.4,5.3 81 | 116,7.7,23.1,11 82 | 76.4,26.7,22.3,11.8 83 | 239.8,4.1,36.9,12.3 84 | 75.3,20.3,32.5,11.3 85 | 68.4,44.5,35.6,13.6 86 | 213.5,43,33.8,21.7 87 | 193.2,18.4,65.7,15.2 88 | 76.3,27.5,16,12 89 | 110.7,40.6,63.2,16 90 | 88.3,25.5,73.4,12.9 91 | 109.8,47.8,51.4,16.7 92 | 134.3,4.9,9.3,11.2 93 | 28.6,1.5,33,7.3 94 | 217.7,33.5,59,19.4 95 | 250.9,36.5,72.3,22.2 96 | 107.4,14,10.9,11.5 97 | 163.3,31.6,52.9,16.9 98 | 197.6,3.5,5.9,11.7 99 | 184.9,21,22,15.5 100 | 289.7,42.3,51.2,25.4 101 | 135.2,41.7,45.9,17.2 102 | 222.4,4.3,49.8,11.7 103 | 296.4,36.3,100.9,23.8 104 | 280.2,10.1,21.4,14.8 105 | 187.9,17.2,17.9,14.7 106 | 238.2,34.3,5.3,20.7 107 | 137.9,46.4,59,19.2 108 | 25,11,29.7,7.2 109 | 90.4,0.3,23.2,8.7 110 | 
13.1,0.4,25.6,5.3 111 | 255.4,26.9,5.5,19.8 112 | 225.8,8.2,56.5,13.4 113 | 241.7,38,23.2,21.8 114 | 175.7,15.4,2.4,14.1 115 | 209.6,20.6,10.7,15.9 116 | 78.2,46.8,34.5,14.6 117 | 75.1,35,52.7,12.6 118 | 139.2,14.3,25.6,12.2 119 | 76.4,0.8,14.8,9.4 120 | 125.7,36.9,79.2,15.9 121 | 19.4,16,22.3,6.6 122 | 141.3,26.8,46.2,15.5 123 | 18.8,21.7,50.4,7 124 | 224,2.4,15.6,11.6 125 | 123.1,34.6,12.4,15.2 126 | 229.5,32.3,74.2,19.7 127 | 87.2,11.8,25.9,10.6 128 | 7.8,38.9,50.6,6.6 129 | 80.2,0,9.2,8.8 130 | 220.3,49,3.2,24.7 131 | 59.6,12,43.1,9.7 132 | 0.7,39.6,8.7,1.6 133 | 265.2,2.9,43,12.7 134 | 8.4,27.2,2.1,5.7 135 | 219.8,33.5,45.1,19.6 136 | 36.9,38.6,65.6,10.8 137 | 48.3,47,8.5,11.6 138 | 25.6,39,9.3,9.5 139 | 273.7,28.9,59.7,20.8 140 | 43,25.9,20.5,9.6 141 | 184.9,43.9,1.7,20.7 142 | 73.4,17,12.9,10.9 143 | 193.7,35.4,75.6,19.2 144 | 220.5,33.2,37.9,20.1 145 | 104.6,5.7,34.4,10.4 146 | 96.2,14.8,38.9,11.4 147 | 140.3,1.9,9,10.3 148 | 240.1,7.3,8.7,13.2 149 | 243.2,49,44.3,25.4 150 | 38,40.3,11.9,10.9 151 | 44.7,25.8,20.6,10.1 152 | 280.7,13.9,37,16.1 153 | 121,8.4,48.7,11.6 154 | 197.6,23.3,14.2,16.6 155 | 171.3,39.7,37.7,19 156 | 187.8,21.1,9.5,15.6 157 | 4.1,11.6,5.7,3.2 158 | 93.9,43.5,50.5,15.3 159 | 149.8,1.3,24.3,10.1 160 | 11.7,36.9,45.2,7.3 161 | 131.7,18.4,34.6,12.9 162 | 172.5,18.1,30.7,14.4 163 | 85.7,35.8,49.3,13.3 164 | 188.4,18.1,25.6,14.9 165 | 163.5,36.8,7.4,18 166 | 117.2,14.7,5.4,11.9 167 | 234.5,3.4,84.8,11.9 168 | 17.9,37.6,21.6,8 169 | 206.8,5.2,19.4,12.2 170 | 215.4,23.6,57.6,17.1 171 | 284.3,10.6,6.4,15 172 | 50,11.6,18.4,8.4 173 | 164.5,20.9,47.4,14.5 174 | 19.6,20.1,17,7.6 175 | 168.4,7.1,12.8,11.7 176 | 222.4,3.4,13.1,11.5 177 | 276.9,48.9,41.8,27 178 | 248.4,30.2,20.3,20.2 179 | 170.2,7.8,35.2,11.7 180 | 276.7,2.3,23.7,11.8 181 | 165.6,10,17.6,12.6 182 | 156.6,2.6,8.3,10.5 183 | 218.5,5.4,27.4,12.2 184 | 56.2,5.7,29.7,8.7 185 | 287.6,43,71.8,26.2 186 | 253.8,21.3,30,17.6 187 | 205,45.1,19.6,22.6 188 | 139.5,2.1,26.6,10.3 189 | 191.1,28.7,18.2,17.3 190 | 286,13.9,3.7,15.9 191 | 18.7,12.1,23.4,6.7 192 | 39.5,41.1,5.8,10.8 193 | 75.5,10.8,6,9.9 194 | 17.2,4.1,31.6,5.9 195 | 166.8,42,3.6,19.6 196 | 149.7,35.6,6,17.3 197 | 38.2,3.7,13.8,7.6 198 | 94.2,4.9,8.1,9.7 199 | 177,9.3,6.4,12.8 200 | 283.6,42,66.2,25.5 201 | 232.1,8.6,8.7,13.4 202 | -------------------------------------------------------------------------------- /datasets/Auto.csv: -------------------------------------------------------------------------------- 1 | mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name 2 | 18,8,307,130,3504,12,70,1,chevrolet chevelle malibu 3 | 15,8,350,165,3693,11.5,70,1,buick skylark 320 4 | 18,8,318,150,3436,11,70,1,plymouth satellite 5 | 16,8,304,150,3433,12,70,1,amc rebel sst 6 | 17,8,302,140,3449,10.5,70,1,ford torino 7 | 15,8,429,198,4341,10,70,1,ford galaxie 500 8 | 14,8,454,220,4354,9,70,1,chevrolet impala 9 | 14,8,440,215,4312,8.5,70,1,plymouth fury iii 10 | 14,8,455,225,4425,10,70,1,pontiac catalina 11 | 15,8,390,190,3850,8.5,70,1,amc ambassador dpl 12 | NA,4,133,115,3090,17.5,70,2,citroen ds-21 pallas 13 | NA,8,350,165,4142,11.5,70,1,chevrolet chevelle concours (sw) 14 | NA,8,351,153,4034,11,70,1,ford torino (sw) 15 | NA,8,383,175,4166,10.5,70,1,plymouth satellite (sw) 16 | NA,8,360,175,3850,11,70,1,amc rebel sst (sw) 17 | 15,8,383,170,3563,10,70,1,dodge challenger se 18 | 14,8,340,160,3609,8,70,1,plymouth 'cuda 340 19 | NA,8,302,140,3353,8,70,1,ford mustang boss 302 20 | 15,8,400,150,3761,9.5,70,1,chevrolet monte carlo 21 | 
14,8,455,225,3086,10,70,1,buick estate wagon (sw) 22 | 24,4,113,95,2372,15,70,3,toyota corona mark ii 23 | 22,6,198,95,2833,15.5,70,1,plymouth duster 24 | 18,6,199,97,2774,15.5,70,1,amc hornet 25 | 21,6,200,85,2587,16,70,1,ford maverick 26 | 27,4,97,88,2130,14.5,70,3,datsun pl510 27 | 26,4,97,46,1835,20.5,70,2,volkswagen 1131 deluxe sedan 28 | 25,4,110,87,2672,17.5,70,2,peugeot 504 29 | 24,4,107,90,2430,14.5,70,2,audi 100 ls 30 | 25,4,104,95,2375,17.5,70,2,saab 99e 31 | 26,4,121,113,2234,12.5,70,2,bmw 2002 32 | 21,6,199,90,2648,15,70,1,amc gremlin 33 | 10,8,360,215,4615,14,70,1,ford f250 34 | 10,8,307,200,4376,15,70,1,chevy c20 35 | 11,8,318,210,4382,13.5,70,1,dodge d200 36 | 9,8,304,193,4732,18.5,70,1,hi 1200d 37 | 27,4,97,88,2130,14.5,71,3,datsun pl510 38 | 28,4,140,90,2264,15.5,71,1,chevrolet vega 2300 39 | 25,4,113,95,2228,14,71,3,toyota corona 40 | 25,4,98,NA,2046,19,71,1,ford pinto 41 | NA,4,97,48,1978,20,71,2,volkswagen super beetle 117 42 | 19,6,232,100,2634,13,71,1,amc gremlin 43 | 16,6,225,105,3439,15.5,71,1,plymouth satellite custom 44 | 17,6,250,100,3329,15.5,71,1,chevrolet chevelle malibu 45 | 19,6,250,88,3302,15.5,71,1,ford torino 500 46 | 18,6,232,100,3288,15.5,71,1,amc matador 47 | 14,8,350,165,4209,12,71,1,chevrolet impala 48 | 14,8,400,175,4464,11.5,71,1,pontiac catalina brougham 49 | 14,8,351,153,4154,13.5,71,1,ford galaxie 500 50 | 14,8,318,150,4096,13,71,1,plymouth fury iii 51 | 12,8,383,180,4955,11.5,71,1,dodge monaco (sw) 52 | 13,8,400,170,4746,12,71,1,ford country squire (sw) 53 | 13,8,400,175,5140,12,71,1,pontiac safari (sw) 54 | 18,6,258,110,2962,13.5,71,1,amc hornet sportabout (sw) 55 | 22,4,140,72,2408,19,71,1,chevrolet vega (sw) 56 | 19,6,250,100,3282,15,71,1,pontiac firebird 57 | 18,6,250,88,3139,14.5,71,1,ford mustang 58 | 23,4,122,86,2220,14,71,1,mercury capri 2000 59 | 28,4,116,90,2123,14,71,2,opel 1900 60 | 30,4,79,70,2074,19.5,71,2,peugeot 304 61 | 30,4,88,76,2065,14.5,71,2,fiat 124b 62 | 31,4,71,65,1773,19,71,3,toyota corolla 1200 63 | 35,4,72,69,1613,18,71,3,datsun 1200 64 | 27,4,97,60,1834,19,71,2,volkswagen model 111 65 | 26,4,91,70,1955,20.5,71,1,plymouth cricket 66 | 24,4,113,95,2278,15.5,72,3,toyota corona hardtop 67 | 25,4,97.5,80,2126,17,72,1,dodge colt hardtop 68 | 23,4,97,54,2254,23.5,72,2,volkswagen type 3 69 | 20,4,140,90,2408,19.5,72,1,chevrolet vega 70 | 21,4,122,86,2226,16.5,72,1,ford pinto runabout 71 | 13,8,350,165,4274,12,72,1,chevrolet impala 72 | 14,8,400,175,4385,12,72,1,pontiac catalina 73 | 15,8,318,150,4135,13.5,72,1,plymouth fury iii 74 | 14,8,351,153,4129,13,72,1,ford galaxie 500 75 | 17,8,304,150,3672,11.5,72,1,amc ambassador sst 76 | 11,8,429,208,4633,11,72,1,mercury marquis 77 | 13,8,350,155,4502,13.5,72,1,buick lesabre custom 78 | 12,8,350,160,4456,13.5,72,1,oldsmobile delta 88 royale 79 | 13,8,400,190,4422,12.5,72,1,chrysler newport royal 80 | 19,3,70,97,2330,13.5,72,3,mazda rx2 coupe 81 | 15,8,304,150,3892,12.5,72,1,amc matador (sw) 82 | 13,8,307,130,4098,14,72,1,chevrolet chevelle concours (sw) 83 | 13,8,302,140,4294,16,72,1,ford gran torino (sw) 84 | 14,8,318,150,4077,14,72,1,plymouth satellite custom (sw) 85 | 18,4,121,112,2933,14.5,72,2,volvo 145e (sw) 86 | 22,4,121,76,2511,18,72,2,volkswagen 411 (sw) 87 | 21,4,120,87,2979,19.5,72,2,peugeot 504 (sw) 88 | 26,4,96,69,2189,18,72,2,renault 12 (sw) 89 | 22,4,122,86,2395,16,72,1,ford pinto (sw) 90 | 28,4,97,92,2288,17,72,3,datsun 510 (sw) 91 | 23,4,120,97,2506,14.5,72,3,toyouta corona mark ii (sw) 92 | 28,4,98,80,2164,15,72,1,dodge colt (sw) 93 | 
27,4,97,88,2100,16.5,72,3,toyota corolla 1600 (sw) 94 | 13,8,350,175,4100,13,73,1,buick century 350 95 | 14,8,304,150,3672,11.5,73,1,amc matador 96 | 13,8,350,145,3988,13,73,1,chevrolet malibu 97 | 14,8,302,137,4042,14.5,73,1,ford gran torino 98 | 15,8,318,150,3777,12.5,73,1,dodge coronet custom 99 | 12,8,429,198,4952,11.5,73,1,mercury marquis brougham 100 | 13,8,400,150,4464,12,73,1,chevrolet caprice classic 101 | 13,8,351,158,4363,13,73,1,ford ltd 102 | 14,8,318,150,4237,14.5,73,1,plymouth fury gran sedan 103 | 13,8,440,215,4735,11,73,1,chrysler new yorker brougham 104 | 12,8,455,225,4951,11,73,1,buick electra 225 custom 105 | 13,8,360,175,3821,11,73,1,amc ambassador brougham 106 | 18,6,225,105,3121,16.5,73,1,plymouth valiant 107 | 16,6,250,100,3278,18,73,1,chevrolet nova custom 108 | 18,6,232,100,2945,16,73,1,amc hornet 109 | 18,6,250,88,3021,16.5,73,1,ford maverick 110 | 23,6,198,95,2904,16,73,1,plymouth duster 111 | 26,4,97,46,1950,21,73,2,volkswagen super beetle 112 | 11,8,400,150,4997,14,73,1,chevrolet impala 113 | 12,8,400,167,4906,12.5,73,1,ford country 114 | 13,8,360,170,4654,13,73,1,plymouth custom suburb 115 | 12,8,350,180,4499,12.5,73,1,oldsmobile vista cruiser 116 | 18,6,232,100,2789,15,73,1,amc gremlin 117 | 20,4,97,88,2279,19,73,3,toyota carina 118 | 21,4,140,72,2401,19.5,73,1,chevrolet vega 119 | 22,4,108,94,2379,16.5,73,3,datsun 610 120 | 18,3,70,90,2124,13.5,73,3,maxda rx3 121 | 19,4,122,85,2310,18.5,73,1,ford pinto 122 | 21,6,155,107,2472,14,73,1,mercury capri v6 123 | 26,4,98,90,2265,15.5,73,2,fiat 124 sport coupe 124 | 15,8,350,145,4082,13,73,1,chevrolet monte carlo s 125 | 16,8,400,230,4278,9.5,73,1,pontiac grand prix 126 | 29,4,68,49,1867,19.5,73,2,fiat 128 127 | 24,4,116,75,2158,15.5,73,2,opel manta 128 | 20,4,114,91,2582,14,73,2,audi 100ls 129 | 19,4,121,112,2868,15.5,73,2,volvo 144ea 130 | 15,8,318,150,3399,11,73,1,dodge dart custom 131 | 24,4,121,110,2660,14,73,2,saab 99le 132 | 20,6,156,122,2807,13.5,73,3,toyota mark ii 133 | 11,8,350,180,3664,11,73,1,oldsmobile omega 134 | 20,6,198,95,3102,16.5,74,1,plymouth duster 135 | 21,6,200,NA,2875,17,74,1,ford maverick 136 | 19,6,232,100,2901,16,74,1,amc hornet 137 | 15,6,250,100,3336,17,74,1,chevrolet nova 138 | 31,4,79,67,1950,19,74,3,datsun b210 139 | 26,4,122,80,2451,16.5,74,1,ford pinto 140 | 32,4,71,65,1836,21,74,3,toyota corolla 1200 141 | 25,4,140,75,2542,17,74,1,chevrolet vega 142 | 16,6,250,100,3781,17,74,1,chevrolet chevelle malibu classic 143 | 16,6,258,110,3632,18,74,1,amc matador 144 | 18,6,225,105,3613,16.5,74,1,plymouth satellite sebring 145 | 16,8,302,140,4141,14,74,1,ford gran torino 146 | 13,8,350,150,4699,14.5,74,1,buick century luxus (sw) 147 | 14,8,318,150,4457,13.5,74,1,dodge coronet custom (sw) 148 | 14,8,302,140,4638,16,74,1,ford gran torino (sw) 149 | 14,8,304,150,4257,15.5,74,1,amc matador (sw) 150 | 29,4,98,83,2219,16.5,74,2,audi fox 151 | 26,4,79,67,1963,15.5,74,2,volkswagen dasher 152 | 26,4,97,78,2300,14.5,74,2,opel manta 153 | 31,4,76,52,1649,16.5,74,3,toyota corona 154 | 32,4,83,61,2003,19,74,3,datsun 710 155 | 28,4,90,75,2125,14.5,74,1,dodge colt 156 | 24,4,90,75,2108,15.5,74,2,fiat 128 157 | 26,4,116,75,2246,14,74,2,fiat 124 tc 158 | 24,4,120,97,2489,15,74,3,honda civic 159 | 26,4,108,93,2391,15.5,74,3,subaru 160 | 31,4,79,67,2000,16,74,2,fiat x1.9 161 | 19,6,225,95,3264,16,75,1,plymouth valiant custom 162 | 18,6,250,105,3459,16,75,1,chevrolet nova 163 | 15,6,250,72,3432,21,75,1,mercury monarch 164 | 15,6,250,72,3158,19.5,75,1,ford maverick 165 | 16,8,400,170,4668,11.5,75,1,pontiac 
catalina 166 | 15,8,350,145,4440,14,75,1,chevrolet bel air 167 | 16,8,318,150,4498,14.5,75,1,plymouth grand fury 168 | 14,8,351,148,4657,13.5,75,1,ford ltd 169 | 17,6,231,110,3907,21,75,1,buick century 170 | 16,6,250,105,3897,18.5,75,1,chevroelt chevelle malibu 171 | 15,6,258,110,3730,19,75,1,amc matador 172 | 18,6,225,95,3785,19,75,1,plymouth fury 173 | 21,6,231,110,3039,15,75,1,buick skyhawk 174 | 20,8,262,110,3221,13.5,75,1,chevrolet monza 2+2 175 | 13,8,302,129,3169,12,75,1,ford mustang ii 176 | 29,4,97,75,2171,16,75,3,toyota corolla 177 | 23,4,140,83,2639,17,75,1,ford pinto 178 | 20,6,232,100,2914,16,75,1,amc gremlin 179 | 23,4,140,78,2592,18.5,75,1,pontiac astro 180 | 24,4,134,96,2702,13.5,75,3,toyota corona 181 | 25,4,90,71,2223,16.5,75,2,volkswagen dasher 182 | 24,4,119,97,2545,17,75,3,datsun 710 183 | 18,6,171,97,2984,14.5,75,1,ford pinto 184 | 29,4,90,70,1937,14,75,2,volkswagen rabbit 185 | 19,6,232,90,3211,17,75,1,amc pacer 186 | 23,4,115,95,2694,15,75,2,audi 100ls 187 | 23,4,120,88,2957,17,75,2,peugeot 504 188 | 22,4,121,98,2945,14.5,75,2,volvo 244dl 189 | 25,4,121,115,2671,13.5,75,2,saab 99le 190 | 33,4,91,53,1795,17.5,75,3,honda civic cvcc 191 | 28,4,107,86,2464,15.5,76,2,fiat 131 192 | 25,4,116,81,2220,16.9,76,2,opel 1900 193 | 25,4,140,92,2572,14.9,76,1,capri ii 194 | 26,4,98,79,2255,17.7,76,1,dodge colt 195 | 27,4,101,83,2202,15.3,76,2,renault 12tl 196 | 17.5,8,305,140,4215,13,76,1,chevrolet chevelle malibu classic 197 | 16,8,318,150,4190,13,76,1,dodge coronet brougham 198 | 15.5,8,304,120,3962,13.9,76,1,amc matador 199 | 14.5,8,351,152,4215,12.8,76,1,ford gran torino 200 | 22,6,225,100,3233,15.4,76,1,plymouth valiant 201 | 22,6,250,105,3353,14.5,76,1,chevrolet nova 202 | 24,6,200,81,3012,17.6,76,1,ford maverick 203 | 22.5,6,232,90,3085,17.6,76,1,amc hornet 204 | 29,4,85,52,2035,22.2,76,1,chevrolet chevette 205 | 24.5,4,98,60,2164,22.1,76,1,chevrolet woody 206 | 29,4,90,70,1937,14.2,76,2,vw rabbit 207 | 33,4,91,53,1795,17.4,76,3,honda civic 208 | 20,6,225,100,3651,17.7,76,1,dodge aspen se 209 | 18,6,250,78,3574,21,76,1,ford granada ghia 210 | 18.5,6,250,110,3645,16.2,76,1,pontiac ventura sj 211 | 17.5,6,258,95,3193,17.8,76,1,amc pacer d/l 212 | 29.5,4,97,71,1825,12.2,76,2,volkswagen rabbit 213 | 32,4,85,70,1990,17,76,3,datsun b-210 214 | 28,4,97,75,2155,16.4,76,3,toyota corolla 215 | 26.5,4,140,72,2565,13.6,76,1,ford pinto 216 | 20,4,130,102,3150,15.7,76,2,volvo 245 217 | 13,8,318,150,3940,13.2,76,1,plymouth volare premier v8 218 | 19,4,120,88,3270,21.9,76,2,peugeot 504 219 | 19,6,156,108,2930,15.5,76,3,toyota mark ii 220 | 16.5,6,168,120,3820,16.7,76,2,mercedes-benz 280s 221 | 16.5,8,350,180,4380,12.1,76,1,cadillac seville 222 | 13,8,350,145,4055,12,76,1,chevy c10 223 | 13,8,302,130,3870,15,76,1,ford f108 224 | 13,8,318,150,3755,14,76,1,dodge d100 225 | 31.5,4,98,68,2045,18.5,77,3,honda accord cvcc 226 | 30,4,111,80,2155,14.8,77,1,buick opel isuzu deluxe 227 | 36,4,79,58,1825,18.6,77,2,renault 5 gtl 228 | 25.5,4,122,96,2300,15.5,77,1,plymouth arrow gs 229 | 33.5,4,85,70,1945,16.8,77,3,datsun f-10 hatchback 230 | 17.5,8,305,145,3880,12.5,77,1,chevrolet caprice classic 231 | 17,8,260,110,4060,19,77,1,oldsmobile cutlass supreme 232 | 15.5,8,318,145,4140,13.7,77,1,dodge monaco brougham 233 | 15,8,302,130,4295,14.9,77,1,mercury cougar brougham 234 | 17.5,6,250,110,3520,16.4,77,1,chevrolet concours 235 | 20.5,6,231,105,3425,16.9,77,1,buick skylark 236 | 19,6,225,100,3630,17.7,77,1,plymouth volare custom 237 | 18.5,6,250,98,3525,19,77,1,ford granada 238 | 
16,8,400,180,4220,11.1,77,1,pontiac grand prix lj 239 | 15.5,8,350,170,4165,11.4,77,1,chevrolet monte carlo landau 240 | 15.5,8,400,190,4325,12.2,77,1,chrysler cordoba 241 | 16,8,351,149,4335,14.5,77,1,ford thunderbird 242 | 29,4,97,78,1940,14.5,77,2,volkswagen rabbit custom 243 | 24.5,4,151,88,2740,16,77,1,pontiac sunbird coupe 244 | 26,4,97,75,2265,18.2,77,3,toyota corolla liftback 245 | 25.5,4,140,89,2755,15.8,77,1,ford mustang ii 2+2 246 | 30.5,4,98,63,2051,17,77,1,chevrolet chevette 247 | 33.5,4,98,83,2075,15.9,77,1,dodge colt m/m 248 | 30,4,97,67,1985,16.4,77,3,subaru dl 249 | 30.5,4,97,78,2190,14.1,77,2,volkswagen dasher 250 | 22,6,146,97,2815,14.5,77,3,datsun 810 251 | 21.5,4,121,110,2600,12.8,77,2,bmw 320i 252 | 21.5,3,80,110,2720,13.5,77,3,mazda rx-4 253 | 43.1,4,90,48,1985,21.5,78,2,volkswagen rabbit custom diesel 254 | 36.1,4,98,66,1800,14.4,78,1,ford fiesta 255 | 32.8,4,78,52,1985,19.4,78,3,mazda glc deluxe 256 | 39.4,4,85,70,2070,18.6,78,3,datsun b210 gx 257 | 36.1,4,91,60,1800,16.4,78,3,honda civic cvcc 258 | 19.9,8,260,110,3365,15.5,78,1,oldsmobile cutlass salon brougham 259 | 19.4,8,318,140,3735,13.2,78,1,dodge diplomat 260 | 20.2,8,302,139,3570,12.8,78,1,mercury monarch ghia 261 | 19.2,6,231,105,3535,19.2,78,1,pontiac phoenix lj 262 | 20.5,6,200,95,3155,18.2,78,1,chevrolet malibu 263 | 20.2,6,200,85,2965,15.8,78,1,ford fairmont (auto) 264 | 25.1,4,140,88,2720,15.4,78,1,ford fairmont (man) 265 | 20.5,6,225,100,3430,17.2,78,1,plymouth volare 266 | 19.4,6,232,90,3210,17.2,78,1,amc concord 267 | 20.6,6,231,105,3380,15.8,78,1,buick century special 268 | 20.8,6,200,85,3070,16.7,78,1,mercury zephyr 269 | 18.6,6,225,110,3620,18.7,78,1,dodge aspen 270 | 18.1,6,258,120,3410,15.1,78,1,amc concord d/l 271 | 19.2,8,305,145,3425,13.2,78,1,chevrolet monte carlo landau 272 | 17.7,6,231,165,3445,13.4,78,1,buick regal sport coupe (turbo) 273 | 18.1,8,302,139,3205,11.2,78,1,ford futura 274 | 17.5,8,318,140,4080,13.7,78,1,dodge magnum xe 275 | 30,4,98,68,2155,16.5,78,1,chevrolet chevette 276 | 27.5,4,134,95,2560,14.2,78,3,toyota corona 277 | 27.2,4,119,97,2300,14.7,78,3,datsun 510 278 | 30.9,4,105,75,2230,14.5,78,1,dodge omni 279 | 21.1,4,134,95,2515,14.8,78,3,toyota celica gt liftback 280 | 23.2,4,156,105,2745,16.7,78,1,plymouth sapporo 281 | 23.8,4,151,85,2855,17.6,78,1,oldsmobile starfire sx 282 | 23.9,4,119,97,2405,14.9,78,3,datsun 200-sx 283 | 20.3,5,131,103,2830,15.9,78,2,audi 5000 284 | 17,6,163,125,3140,13.6,78,2,volvo 264gl 285 | 21.6,4,121,115,2795,15.7,78,2,saab 99gle 286 | 16.2,6,163,133,3410,15.8,78,2,peugeot 604sl 287 | 31.5,4,89,71,1990,14.9,78,2,volkswagen scirocco 288 | 29.5,4,98,68,2135,16.6,78,3,honda accord lx 289 | 21.5,6,231,115,3245,15.4,79,1,pontiac lemans v6 290 | 19.8,6,200,85,2990,18.2,79,1,mercury zephyr 6 291 | 22.3,4,140,88,2890,17.3,79,1,ford fairmont 4 292 | 20.2,6,232,90,3265,18.2,79,1,amc concord dl 6 293 | 20.6,6,225,110,3360,16.6,79,1,dodge aspen 6 294 | 17,8,305,130,3840,15.4,79,1,chevrolet caprice classic 295 | 17.6,8,302,129,3725,13.4,79,1,ford ltd landau 296 | 16.5,8,351,138,3955,13.2,79,1,mercury grand marquis 297 | 18.2,8,318,135,3830,15.2,79,1,dodge st. 
regis 298 | 16.9,8,350,155,4360,14.9,79,1,buick estate wagon (sw) 299 | 15.5,8,351,142,4054,14.3,79,1,ford country squire (sw) 300 | 19.2,8,267,125,3605,15,79,1,chevrolet malibu classic (sw) 301 | 18.5,8,360,150,3940,13,79,1,chrysler lebaron town @ country (sw) 302 | 31.9,4,89,71,1925,14,79,2,vw rabbit custom 303 | 34.1,4,86,65,1975,15.2,79,3,maxda glc deluxe 304 | 35.7,4,98,80,1915,14.4,79,1,dodge colt hatchback custom 305 | 27.4,4,121,80,2670,15,79,1,amc spirit dl 306 | 25.4,5,183,77,3530,20.1,79,2,mercedes benz 300d 307 | 23,8,350,125,3900,17.4,79,1,cadillac eldorado 308 | 27.2,4,141,71,3190,24.8,79,2,peugeot 504 309 | 23.9,8,260,90,3420,22.2,79,1,oldsmobile cutlass salon brougham 310 | 34.2,4,105,70,2200,13.2,79,1,plymouth horizon 311 | 34.5,4,105,70,2150,14.9,79,1,plymouth horizon tc3 312 | 31.8,4,85,65,2020,19.2,79,3,datsun 210 313 | 37.3,4,91,69,2130,14.7,79,2,fiat strada custom 314 | 28.4,4,151,90,2670,16,79,1,buick skylark limited 315 | 28.8,6,173,115,2595,11.3,79,1,chevrolet citation 316 | 26.8,6,173,115,2700,12.9,79,1,oldsmobile omega brougham 317 | 33.5,4,151,90,2556,13.2,79,1,pontiac phoenix 318 | 41.5,4,98,76,2144,14.7,80,2,vw rabbit 319 | 38.1,4,89,60,1968,18.8,80,3,toyota corolla tercel 320 | 32.1,4,98,70,2120,15.5,80,1,chevrolet chevette 321 | 37.2,4,86,65,2019,16.4,80,3,datsun 310 322 | 28,4,151,90,2678,16.5,80,1,chevrolet citation 323 | 26.4,4,140,88,2870,18.1,80,1,ford fairmont 324 | 24.3,4,151,90,3003,20.1,80,1,amc concord 325 | 19.1,6,225,90,3381,18.7,80,1,dodge aspen 326 | 34.3,4,97,78,2188,15.8,80,2,audi 4000 327 | 29.8,4,134,90,2711,15.5,80,3,toyota corona liftback 328 | 31.3,4,120,75,2542,17.5,80,3,mazda 626 329 | 37,4,119,92,2434,15,80,3,datsun 510 hatchback 330 | 32.2,4,108,75,2265,15.2,80,3,toyota corolla 331 | 46.6,4,86,65,2110,17.9,80,3,mazda glc 332 | 27.9,4,156,105,2800,14.4,80,1,dodge colt 333 | 40.8,4,85,65,2110,19.2,80,3,datsun 210 334 | 44.3,4,90,48,2085,21.7,80,2,vw rabbit c (diesel) 335 | 43.4,4,90,48,2335,23.7,80,2,vw dasher (diesel) 336 | 36.4,5,121,67,2950,19.9,80,2,audi 5000s (diesel) 337 | 30,4,146,67,3250,21.8,80,2,mercedes-benz 240d 338 | 44.6,4,91,67,1850,13.8,80,3,honda civic 1500 gl 339 | 40.9,4,85,NA,1835,17.3,80,2,renault lecar deluxe 340 | 33.8,4,97,67,2145,18,80,3,subaru dl 341 | 29.8,4,89,62,1845,15.3,80,2,vokswagen rabbit 342 | 32.7,6,168,132,2910,11.4,80,3,datsun 280-zx 343 | 23.7,3,70,100,2420,12.5,80,3,mazda rx-7 gs 344 | 35,4,122,88,2500,15.1,80,2,triumph tr7 coupe 345 | 23.6,4,140,NA,2905,14.3,80,1,ford mustang cobra 346 | 32.4,4,107,72,2290,17,80,3,honda accord 347 | 27.2,4,135,84,2490,15.7,81,1,plymouth reliant 348 | 26.6,4,151,84,2635,16.4,81,1,buick skylark 349 | 25.8,4,156,92,2620,14.4,81,1,dodge aries wagon (sw) 350 | 23.5,6,173,110,2725,12.6,81,1,chevrolet citation 351 | 30,4,135,84,2385,12.9,81,1,plymouth reliant 352 | 39.1,4,79,58,1755,16.9,81,3,toyota starlet 353 | 39,4,86,64,1875,16.4,81,1,plymouth champ 354 | 35.1,4,81,60,1760,16.1,81,3,honda civic 1300 355 | 32.3,4,97,67,2065,17.8,81,3,subaru 356 | 37,4,85,65,1975,19.4,81,3,datsun 210 mpg 357 | 37.7,4,89,62,2050,17.3,81,3,toyota tercel 358 | 34.1,4,91,68,1985,16,81,3,mazda glc 4 359 | 34.7,4,105,63,2215,14.9,81,1,plymouth horizon 4 360 | 34.4,4,98,65,2045,16.2,81,1,ford escort 4w 361 | 29.9,4,98,65,2380,20.7,81,1,ford escort 2h 362 | 33,4,105,74,2190,14.2,81,2,volkswagen jetta 363 | 34.5,4,100,NA,2320,15.8,81,2,renault 18i 364 | 33.7,4,107,75,2210,14.4,81,3,honda prelude 365 | 32.4,4,108,75,2350,16.8,81,3,toyota corolla 366 | 32.9,4,119,100,2615,14.8,81,3,datsun 200sx 
367 | 31.6,4,120,74,2635,18.3,81,3,mazda 626 368 | 28.1,4,141,80,3230,20.4,81,2,peugeot 505s turbo diesel 369 | NA,4,121,110,2800,15.4,81,2,saab 900s 370 | 30.7,6,145,76,3160,19.6,81,2,volvo diesel 371 | 25.4,6,168,116,2900,12.6,81,3,toyota cressida 372 | 24.2,6,146,120,2930,13.8,81,3,datsun 810 maxima 373 | 22.4,6,231,110,3415,15.8,81,1,buick century 374 | 26.6,8,350,105,3725,19,81,1,oldsmobile cutlass ls 375 | 20.2,6,200,88,3060,17.1,81,1,ford granada gl 376 | 17.6,6,225,85,3465,16.6,81,1,chrysler lebaron salon 377 | 28,4,112,88,2605,19.6,82,1,chevrolet cavalier 378 | 27,4,112,88,2640,18.6,82,1,chevrolet cavalier wagon 379 | 34,4,112,88,2395,18,82,1,chevrolet cavalier 2-door 380 | 31,4,112,85,2575,16.2,82,1,pontiac j2000 se hatchback 381 | 29,4,135,84,2525,16,82,1,dodge aries se 382 | 27,4,151,90,2735,18,82,1,pontiac phoenix 383 | 24,4,140,92,2865,16.4,82,1,ford fairmont futura 384 | 23,4,151,NA,3035,20.5,82,1,amc concord dl 385 | 36,4,105,74,1980,15.3,82,2,volkswagen rabbit l 386 | 37,4,91,68,2025,18.2,82,3,mazda glc custom l 387 | 31,4,91,68,1970,17.6,82,3,mazda glc custom 388 | 38,4,105,63,2125,14.7,82,1,plymouth horizon miser 389 | 36,4,98,70,2125,17.3,82,1,mercury lynx l 390 | 36,4,120,88,2160,14.5,82,3,nissan stanza xe 391 | 36,4,107,75,2205,14.5,82,3,honda accord 392 | 34,4,108,70,2245,16.9,82,3,toyota corolla 393 | 38,4,91,67,1965,15,82,3,honda civic 394 | 32,4,91,67,1965,15.7,82,3,honda civic (auto) 395 | 38,4,91,67,1995,16.2,82,3,datsun 310 gx 396 | 25,6,181,110,2945,16.4,82,1,buick century limited 397 | 38,6,262,85,3015,17,82,1,oldsmobile cutlass ciera (diesel) 398 | 26,4,156,92,2585,14.5,82,1,chrysler lebaron medallion 399 | 22,6,232,112,2835,14.7,82,1,ford granada l 400 | 32,4,144,96,2665,13.9,82,3,toyota celica gt 401 | 36,4,135,84,2370,13,82,1,dodge charger 2.2 402 | 27,4,151,90,2950,17.3,82,1,chevrolet camaro 403 | 27,4,140,86,2790,15.6,82,1,ford mustang gl 404 | 44,4,97,52,2130,24.6,82,2,vw pickup 405 | 32,4,135,84,2295,11.6,82,1,dodge rampage 406 | 28,4,120,79,2625,18.6,82,1,ford ranger 407 | 31,4,119,82,2720,19.4,82,1,chevy s-10 408 | -------------------------------------------------------------------------------- /datasets/Bank data dictionary.txt: -------------------------------------------------------------------------------- 1 | 1 - age (numeric) 2 | 2 - job : type of job (categorical: "admin.","blue-collar","entrepreneur","housemaid","management","retired","self-employed","services","student","technician","unemployed","unknown") 3 | 3 - marital : marital status (categorical: "divorced","married","single","unknown"; note: "divorced" means divorced or widowed) 4 | 4 - education (categorical: "basic.4y","basic.6y","basic.9y","high.school","illiterate","professional.course","university.degree","unknown") 5 | 5 - default: has credit in default? (categorical: "no","yes","unknown") 6 | 6 - housing: has housing loan? (categorical: "no","yes","unknown") 7 | 7 - loan: has personal loan? (categorical: "no","yes","unknown") 8 | # related with the last contact of the current campaign: 9 | 8 - contact: contact communication type (categorical: "cellular","telephone") 10 | 9 - month: last contact month of year (categorical: "jan", "feb", "mar", ..., "nov", "dec") 11 | 10 - day_of_week: last contact day of the week (categorical: "mon","tue","wed","thu","fri") 12 | 11 - duration: last contact duration, in seconds (numeric). Important note: this attribute highly affects the output target (e.g., if duration=0 then y="no"). Yet, the duration is not known before a call is performed. 
Also, after the end of the call y is obviously known. Thus, this input should only be included for benchmark purposes and should be discarded if the intention is to have a realistic predictive model. 13 | # other attributes: 14 | 12 - campaign: number of contacts performed during this campaign and for this client (numeric, includes last contact) 15 | 13 - pdays: number of days that passed by after the client was last contacted from a previous campaign (numeric; 999 means client was not previously contacted) 16 | 14 - previous: number of contacts performed before this campaign and for this client (numeric) 17 | 15 - poutcome: outcome of the previous marketing campaign (categorical: "failure","nonexistent","success") 18 | # social and economic context attributes 19 | 16 - emp.var.rate: employment variation rate - quarterly indicator (numeric) 20 | 17 - cons.price.idx: consumer price index - monthly indicator (numeric) 21 | 18 - cons.conf.idx: consumer confidence index - monthly indicator (numeric) 22 | 19 - euribor3m: euribor 3 month rate - daily indicator (numeric) 23 | 20 - nr.employed: number of employees - quarterly indicator (numeric) 24 | 25 | Output variable (desired target): 26 | 21 - y - has the client subscribed a term deposit? (binary: "yes","no") 27 | -------------------------------------------------------------------------------- /datasets/Customer Churn Columns.csv: -------------------------------------------------------------------------------- 1 | Column_Names 2 | A 3 | Bob 4 | C 5 | D 6 | E 7 | F 8 | G 9 | H 10 | I 11 | J 12 | K 13 | L 14 | M 15 | N 16 | O 17 | P 18 | Q 19 | R 20 | S 21 | T 22 | U 23 | -------------------------------------------------------------------------------- /datasets/Gender Purchase.csv: -------------------------------------------------------------------------------- 1 | Gender,Purchase 2 | Female,Yes 3 | Female,Yes 4 | Female,No 5 | Male,No 6 | Male,Yes 7 | Female,Yes 8 | Male,No 9 | Female,Yes 10 | Female,No 11 | Female,Yes 12 | Female,No 13 | Male,No 14 | Male,Yes 15 | Male,No 16 | Female,Yes 17 | Male,Yes 18 | Male,Yes 19 | Male,Yes 20 | Female,Yes 21 | Female,No 22 | Male,Yes 23 | Male,Yes 24 | Male,No 25 | Female,Yes 26 | Male,Yes 27 | Female,Yes 28 | Male,No 29 | Male,No 30 | Female,Yes 31 | Female,Yes 32 | Male,No 33 | Female,Yes 34 | Female,Yes 35 | Female,No 36 | Female,No 37 | Female,Yes 38 | Male,Yes 39 | Female,Yes 40 | Female,Yes 41 | Female,Yes 42 | Male,Yes 43 | Male,No 44 | Female,Yes 45 | Female,No 46 | Female,Yes 47 | Female,Yes 48 | Female,No 49 | Male,Yes 50 | Female,No 51 | Female,Yes 52 | Female,No 53 | Male,No 54 | Female,Yes 55 | Male,Yes 56 | Female,No 57 | Female,No 58 | Female,No 59 | Female,Yes 60 | Male,Yes 61 | Female,Yes 62 | Male,Yes 63 | Male,No 64 | Male,Yes 65 | Male,Yes 66 | Male,No 67 | Male,Yes 68 | Female,Yes 69 | Female,No 70 | Male,Yes 71 | Female,No 72 | Male,Yes 73 | Female,Yes 74 | Female,Yes 75 | Female,No 76 | Female,No 77 | Male,Yes 78 | Male,No 79 | Male,No 80 | Male,No 81 | Male,No 82 | Female,No 83 | Male,No 84 | Male,No 85 | Female,Yes 86 | Female,Yes 87 | Female,Yes 88 | Female,Yes 89 | Female,Yes 90 | Female,Yes 91 | Male,No 92 | Male,Yes 93 | Female,Yes 94 | Male,No 95 | Male,No 96 | Female,Yes 97 | Female,No 98 | Male,Yes 99 | Female,Yes 100 | Female,Yes 101 | Male,Yes 102 | Male,No 103 | Male,Yes 104 | Female,No 105 | Male,Yes 106 | Female,Yes 107 | Female,Yes 108 | Male,Yes 109 | Female,No 110 | Male,No 111 | Female,Yes 112 | Female,No 113 | Male,Yes 114 | Male,Yes 115 | Male,Yes 116 | Male,No 
117 | Male,No 118 | Female,No 119 | Female,No 120 | Male,Yes 121 | Female,No 122 | Female,Yes 123 | Female,No 124 | Female,Yes 125 | Female,No 126 | Male,Yes 127 | Female,Yes 128 | Female,No 129 | Male,No 130 | Female,Yes 131 | Female,Yes 132 | Male,No 133 | Female,Yes 134 | Female,Yes 135 | Male,Yes 136 | Male,No 137 | Male,Yes 138 | Female,Yes 139 | Female,Yes 140 | Female,No 141 | Female,No 142 | Male,Yes 143 | Male,Yes 144 | Male,No 145 | Female,Yes 146 | Male,Yes 147 | Male,No 148 | Female,Yes 149 | Male,No 150 | Male,No 151 | Female,Yes 152 | Female,No 153 | Female,Yes 154 | Male,Yes 155 | Male,Yes 156 | Female,Yes 157 | Male,No 158 | Male,Yes 159 | Male,No 160 | Male,No 161 | Female,No 162 | Male,Yes 163 | Female,No 164 | Male,Yes 165 | Male,Yes 166 | Male,Yes 167 | Male,Yes 168 | Female,Yes 169 | Female,No 170 | Female,Yes 171 | Female,Yes 172 | Female,No 173 | Female,Yes 174 | Female,No 175 | Male,Yes 176 | Male,No 177 | Female,No 178 | Male,No 179 | Male,No 180 | Male,No 181 | Female,Yes 182 | Female,Yes 183 | Female,No 184 | Female,No 185 | Female,No 186 | Female,No 187 | Female,Yes 188 | Male,No 189 | Female,Yes 190 | Female,Yes 191 | Female,No 192 | Female,No 193 | Female,No 194 | Female,Yes 195 | Female,Yes 196 | Male,No 197 | Male,No 198 | Male,Yes 199 | Female,No 200 | Male,No 201 | Female,Yes 202 | Female,Yes 203 | Female,No 204 | Female,No 205 | Male,No 206 | Male,No 207 | Male,No 208 | Female,Yes 209 | Male,Yes 210 | Male,No 211 | Female,Yes 212 | Female,Yes 213 | Male,No 214 | Female,No 215 | Male,Yes 216 | Male,No 217 | Male,Yes 218 | Male,Yes 219 | Female,Yes 220 | Female,Yes 221 | Male,No 222 | Female,No 223 | Male,Yes 224 | Male,No 225 | Male,Yes 226 | Male,No 227 | Female,Yes 228 | Female,Yes 229 | Female,No 230 | Male,No 231 | Male,No 232 | Female,No 233 | Male,No 234 | Male,Yes 235 | Female,Yes 236 | Female,Yes 237 | Female,No 238 | Male,No 239 | Female,No 240 | Female,Yes 241 | Female,No 242 | Male,Yes 243 | Male,Yes 244 | Female,Yes 245 | Female,Yes 246 | Female,Yes 247 | Male,No 248 | Male,Yes 249 | Female,No 250 | Male,Yes 251 | Male,Yes 252 | Male,No 253 | Female,Yes 254 | Female,No 255 | Female,No 256 | Female,Yes 257 | Female,Yes 258 | Male,No 259 | Male,No 260 | Male,No 261 | Male,No 262 | Male,No 263 | Female,Yes 264 | Female,No 265 | Female,Yes 266 | Male,Yes 267 | Female,Yes 268 | Female,Yes 269 | Male,No 270 | Male,No 271 | Male,No 272 | Male,No 273 | Male,No 274 | Female,Yes 275 | Female,Yes 276 | Female,No 277 | Male,No 278 | Female,Yes 279 | Female,Yes 280 | Female,Yes 281 | Female,Yes 282 | Male,No 283 | Male,No 284 | Female,No 285 | Male,No 286 | Male,No 287 | Female,No 288 | Female,Yes 289 | Male,No 290 | Female,Yes 291 | Female,No 292 | Female,Yes 293 | Female,No 294 | Male,No 295 | Female,Yes 296 | Male,No 297 | Male,Yes 298 | Female,Yes 299 | Female,Yes 300 | Female,Yes 301 | Female,No 302 | Male,Yes 303 | Female,No 304 | Male,No 305 | Female,Yes 306 | Male,Yes 307 | Male,No 308 | Female,Yes 309 | Female,Yes 310 | Female,Yes 311 | Female,Yes 312 | Female,No 313 | Male,Yes 314 | Male,No 315 | Female,Yes 316 | Female,Yes 317 | Female,No 318 | Female,Yes 319 | Female,Yes 320 | Male,No 321 | Female,No 322 | Male,No 323 | Female,No 324 | Male,No 325 | Male,No 326 | Male,Yes 327 | Female,Yes 328 | Male,Yes 329 | Male,No 330 | Male,Yes 331 | Male,Yes 332 | Male,Yes 333 | Male,No 334 | Female,Yes 335 | Male,Yes 336 | Male,No 337 | Male,Yes 338 | Male,Yes 339 | Female,Yes 340 | Male,No 341 | Male,Yes 342 | Male,Yes 343 | Female,Yes 344 | Female,No 345 
| Female,No 346 | Female,No 347 | Male,Yes 348 | Female,No 349 | Male,No 350 | Female,Yes 351 | Female,No 352 | Male,Yes 353 | Female,No 354 | Female,No 355 | Male,Yes 356 | Female,No 357 | Female,No 358 | Male,Yes 359 | Female,Yes 360 | Female,Yes 361 | Male,Yes 362 | Male,No 363 | Male,Yes 364 | Female,No 365 | Female,Yes 366 | Male,Yes 367 | Male,Yes 368 | Male,Yes 369 | Male,No 370 | Male,Yes 371 | Male,No 372 | Male,No 373 | Female,Yes 374 | Female,No 375 | Female,Yes 376 | Female,No 377 | Male,Yes 378 | Female,Yes 379 | Female,Yes 380 | Male,No 381 | Female,No 382 | Female,No 383 | Female,No 384 | Male,Yes 385 | Female,Yes 386 | Female,Yes 387 | Male,Yes 388 | Male,No 389 | Female,No 390 | Male,No 391 | Female,Yes 392 | Male,No 393 | Female,Yes 394 | Male,Yes 395 | Female,Yes 396 | Male,Yes 397 | Male,Yes 398 | Male,No 399 | Male,No 400 | Male,No 401 | Female,No 402 | Female,No 403 | Male,Yes 404 | Female,Yes 405 | Female,No 406 | Female,Yes 407 | Male,Yes 408 | Male,No 409 | Female,No 410 | Male,No 411 | Female,Yes 412 | Female,Yes 413 | Female,No 414 | Male,No 415 | Male,Yes 416 | Male,No 417 | Male,Yes 418 | Female,Yes 419 | Male,Yes 420 | Male,Yes 421 | Female,No 422 | Male,No 423 | Female,No 424 | Female,No 425 | Female,No 426 | Female,Yes 427 | Male,Yes 428 | Male,Yes 429 | Male,No 430 | Male,No 431 | Male,No 432 | Female,Yes 433 | Male,No 434 | Male,Yes 435 | Female,Yes 436 | Male,Yes 437 | Male,Yes 438 | Female,No 439 | Female,Yes 440 | Female,No 441 | Female,Yes 442 | Female,Yes 443 | Male,Yes 444 | Male,Yes 445 | Male,No 446 | Female,Yes 447 | Male,No 448 | Male,Yes 449 | Female,Yes 450 | Female,No 451 | Female,No 452 | Female,No 453 | Male,No 454 | Female,Yes 455 | Male,Yes 456 | Male,No 457 | Male,Yes 458 | Female,No 459 | Male,No 460 | Male,No 461 | Female,Yes 462 | Male,No 463 | Female,Yes 464 | Female,Yes 465 | Male,Yes 466 | Female,Yes 467 | Male,Yes 468 | Female,Yes 469 | Female,Yes 470 | Male,No 471 | Female,No 472 | Female,Yes 473 | Female,No 474 | Male,No 475 | Female,No 476 | Male,Yes 477 | Female,No 478 | Male,Yes 479 | Female,Yes 480 | Male,No 481 | Male,No 482 | Female,No 483 | Male,No 484 | Male,No 485 | Male,No 486 | Male,No 487 | Male,Yes 488 | Male,Yes 489 | Male,Yes 490 | Female,Yes 491 | Male,Yes 492 | Male,Yes 493 | Female,Yes 494 | Female,No 495 | Male,Yes 496 | Female,Yes 497 | Female,Yes 498 | Female,Yes 499 | Male,Yes 500 | Male,Yes 501 | Female,Yes 502 | Male,Yes 503 | Male,Yes 504 | Male,No 505 | Female,Yes 506 | Female,Yes 507 | Male,Yes 508 | Male,Yes 509 | Female,Yes 510 | Male,No 511 | Female,Yes 512 | Female,Yes 513 | -------------------------------------------------------------------------------- /datasets/Titanic Description.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonMDev/learning-python-predictive-analytics/6757b9ef3e7bbb803d4a1c00d999225cbb4fe2eb/datasets/Titanic Description.txt -------------------------------------------------------------------------------- /datasets/dtree2.dot: -------------------------------------------------------------------------------- 1 | digraph Tree { 2 | node [shape=box] ; 3 | 0 [label="Petal.Length <= 2.45\nentropy = 1.5849\nsamples = 127\nvalue = [42, 42, 43]"] ; 4 | 1 [label="entropy = 0.0\nsamples = 42\nvalue = [42, 0, 0]"] ; 5 | 0 -> 1 [labeldistance=2.5, labelangle=45, headlabel="True"] ; 6 | 2 [label="Petal.Width <= 1.65\nentropy = 0.9999\nsamples = 85\nvalue = [0, 42, 43]"] ; 7 | 0 -> 2 [labeldistance=2.5, labelangle=-45, 
headlabel="False"] ; 8 | 3 [label="Petal.Length <= 4.95\nentropy = 0.3591\nsamples = 44\nvalue = [0, 41, 3]"] ; 9 | 2 -> 3 ; 10 | 4 [label="entropy = 0.0\nsamples = 40\nvalue = [0, 40, 0]"] ; 11 | 3 -> 4 ; 12 | 5 [label="entropy = 0.8113\nsamples = 4\nvalue = [0, 1, 3]"] ; 13 | 3 -> 5 ; 14 | 6 [label="Petal.Length <= 4.85\nentropy = 0.1654\nsamples = 41\nvalue = [0, 1, 40]"] ; 15 | 2 -> 6 ; 16 | 7 [label="entropy = 0.8113\nsamples = 4\nvalue = [0, 1, 3]"] ; 17 | 6 -> 7 ; 18 | 8 [label="entropy = 0.0\nsamples = 37\nvalue = [0, 0, 37]"] ; 19 | 6 -> 8 ; 20 | } -------------------------------------------------------------------------------- /datasets/dtree2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonMDev/learning-python-predictive-analytics/6757b9ef3e7bbb803d4a1c00d999225cbb4fe2eb/datasets/dtree2.png -------------------------------------------------------------------------------- /datasets/iris.csv: -------------------------------------------------------------------------------- 1 | Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species 2 | 5.1,3.5,1.4,0.2,setosa 3 | 4.9,3,1.4,0.2,setosa 4 | 4.7,3.2,1.3,0.2,setosa 5 | 4.6,3.1,1.5,0.2,setosa 6 | 5,3.6,1.4,0.2,setosa 7 | 5.4,3.9,1.7,0.4,setosa 8 | 4.6,3.4,1.4,0.3,setosa 9 | 5,3.4,1.5,0.2,setosa 10 | 4.4,2.9,1.4,0.2,setosa 11 | 4.9,3.1,1.5,0.1,setosa 12 | 5.4,3.7,1.5,0.2,setosa 13 | 4.8,3.4,1.6,0.2,setosa 14 | 4.8,3,1.4,0.1,setosa 15 | 4.3,3,1.1,0.1,setosa 16 | 5.8,4,1.2,0.2,setosa 17 | 5.7,4.4,1.5,0.4,setosa 18 | 5.4,3.9,1.3,0.4,setosa 19 | 5.1,3.5,1.4,0.3,setosa 20 | 5.7,3.8,1.7,0.3,setosa 21 | 5.1,3.8,1.5,0.3,setosa 22 | 5.4,3.4,1.7,0.2,setosa 23 | 5.1,3.7,1.5,0.4,setosa 24 | 4.6,3.6,1,0.2,setosa 25 | 5.1,3.3,1.7,0.5,setosa 26 | 4.8,3.4,1.9,0.2,setosa 27 | 5,3,1.6,0.2,setosa 28 | 5,3.4,1.6,0.4,setosa 29 | 5.2,3.5,1.5,0.2,setosa 30 | 5.2,3.4,1.4,0.2,setosa 31 | 4.7,3.2,1.6,0.2,setosa 32 | 4.8,3.1,1.6,0.2,setosa 33 | 5.4,3.4,1.5,0.4,setosa 34 | 5.2,4.1,1.5,0.1,setosa 35 | 5.5,4.2,1.4,0.2,setosa 36 | 4.9,3.1,1.5,0.2,setosa 37 | 5,3.2,1.2,0.2,setosa 38 | 5.5,3.5,1.3,0.2,setosa 39 | 4.9,3.6,1.4,0.1,setosa 40 | 4.4,3,1.3,0.2,setosa 41 | 5.1,3.4,1.5,0.2,setosa 42 | 5,3.5,1.3,0.3,setosa 43 | 4.5,2.3,1.3,0.3,setosa 44 | 4.4,3.2,1.3,0.2,setosa 45 | 5,3.5,1.6,0.6,setosa 46 | 5.1,3.8,1.9,0.4,setosa 47 | 4.8,3,1.4,0.3,setosa 48 | 5.1,3.8,1.6,0.2,setosa 49 | 4.6,3.2,1.4,0.2,setosa 50 | 5.3,3.7,1.5,0.2,setosa 51 | 5,3.3,1.4,0.2,setosa 52 | 7,3.2,4.7,1.4,versicolor 53 | 6.4,3.2,4.5,1.5,versicolor 54 | 6.9,3.1,4.9,1.5,versicolor 55 | 5.5,2.3,4,1.3,versicolor 56 | 6.5,2.8,4.6,1.5,versicolor 57 | 5.7,2.8,4.5,1.3,versicolor 58 | 6.3,3.3,4.7,1.6,versicolor 59 | 4.9,2.4,3.3,1,versicolor 60 | 6.6,2.9,4.6,1.3,versicolor 61 | 5.2,2.7,3.9,1.4,versicolor 62 | 5,2,3.5,1,versicolor 63 | 5.9,3,4.2,1.5,versicolor 64 | 6,2.2,4,1,versicolor 65 | 6.1,2.9,4.7,1.4,versicolor 66 | 5.6,2.9,3.6,1.3,versicolor 67 | 6.7,3.1,4.4,1.4,versicolor 68 | 5.6,3,4.5,1.5,versicolor 69 | 5.8,2.7,4.1,1,versicolor 70 | 6.2,2.2,4.5,1.5,versicolor 71 | 5.6,2.5,3.9,1.1,versicolor 72 | 5.9,3.2,4.8,1.8,versicolor 73 | 6.1,2.8,4,1.3,versicolor 74 | 6.3,2.5,4.9,1.5,versicolor 75 | 6.1,2.8,4.7,1.2,versicolor 76 | 6.4,2.9,4.3,1.3,versicolor 77 | 6.6,3,4.4,1.4,versicolor 78 | 6.8,2.8,4.8,1.4,versicolor 79 | 6.7,3,5,1.7,versicolor 80 | 6,2.9,4.5,1.5,versicolor 81 | 5.7,2.6,3.5,1,versicolor 82 | 5.5,2.4,3.8,1.1,versicolor 83 | 5.5,2.4,3.7,1,versicolor 84 | 5.8,2.7,3.9,1.2,versicolor 85 | 6,2.7,5.1,1.6,versicolor 86 | 
5.4,3,4.5,1.5,versicolor 87 | 6,3.4,4.5,1.6,versicolor 88 | 6.7,3.1,4.7,1.5,versicolor 89 | 6.3,2.3,4.4,1.3,versicolor 90 | 5.6,3,4.1,1.3,versicolor 91 | 5.5,2.5,4,1.3,versicolor 92 | 5.5,2.6,4.4,1.2,versicolor 93 | 6.1,3,4.6,1.4,versicolor 94 | 5.8,2.6,4,1.2,versicolor 95 | 5,2.3,3.3,1,versicolor 96 | 5.6,2.7,4.2,1.3,versicolor 97 | 5.7,3,4.2,1.2,versicolor 98 | 5.7,2.9,4.2,1.3,versicolor 99 | 6.2,2.9,4.3,1.3,versicolor 100 | 5.1,2.5,3,1.1,versicolor 101 | 5.7,2.8,4.1,1.3,versicolor 102 | 6.3,3.3,6,2.5,virginica 103 | 5.8,2.7,5.1,1.9,virginica 104 | 7.1,3,5.9,2.1,virginica 105 | 6.3,2.9,5.6,1.8,virginica 106 | 6.5,3,5.8,2.2,virginica 107 | 7.6,3,6.6,2.1,virginica 108 | 4.9,2.5,4.5,1.7,virginica 109 | 7.3,2.9,6.3,1.8,virginica 110 | 6.7,2.5,5.8,1.8,virginica 111 | 7.2,3.6,6.1,2.5,virginica 112 | 6.5,3.2,5.1,2,virginica 113 | 6.4,2.7,5.3,1.9,virginica 114 | 6.8,3,5.5,2.1,virginica 115 | 5.7,2.5,5,2,virginica 116 | 5.8,2.8,5.1,2.4,virginica 117 | 6.4,3.2,5.3,2.3,virginica 118 | 6.5,3,5.5,1.8,virginica 119 | 7.7,3.8,6.7,2.2,virginica 120 | 7.7,2.6,6.9,2.3,virginica 121 | 6,2.2,5,1.5,virginica 122 | 6.9,3.2,5.7,2.3,virginica 123 | 5.6,2.8,4.9,2,virginica 124 | 7.7,2.8,6.7,2,virginica 125 | 6.3,2.7,4.9,1.8,virginica 126 | 6.7,3.3,5.7,2.1,virginica 127 | 7.2,3.2,6,1.8,virginica 128 | 6.2,2.8,4.8,1.8,virginica 129 | 6.1,3,4.9,1.8,virginica 130 | 6.4,2.8,5.6,2.1,virginica 131 | 7.2,3,5.8,1.6,virginica 132 | 7.4,2.8,6.1,1.9,virginica 133 | 7.9,3.8,6.4,2,virginica 134 | 6.4,2.8,5.6,2.2,virginica 135 | 6.3,2.8,5.1,1.5,virginica 136 | 6.1,2.6,5.6,1.4,virginica 137 | 7.7,3,6.1,2.3,virginica 138 | 6.3,3.4,5.6,2.4,virginica 139 | 6.4,3.1,5.5,1.8,virginica 140 | 6,3,4.8,1.8,virginica 141 | 6.9,3.1,5.4,2.1,virginica 142 | 6.7,3.1,5.6,2.4,virginica 143 | 6.9,3.1,5.1,2.3,virginica 144 | 5.8,2.7,5.1,1.9,virginica 145 | 6.8,3.2,5.9,2.3,virginica 146 | 6.7,3.3,5.7,2.5,virginica 147 | 6.7,3,5.2,2.3,virginica 148 | 6.3,2.5,5,1.9,virginica 149 | 6.5,3,5.2,2,virginica 150 | 6.2,3.4,5.4,2.3,virginica 151 | 5.9,3,5.1,1.8,virginica 152 | -------------------------------------------------------------------------------- /datasets/lotsofdata/010.csv: -------------------------------------------------------------------------------- 1 | "Date","sulfate","nitrate","ID" 2 | "2002-01-01",NA,NA,10 3 | "2002-01-02",NA,NA,10 4 | "2002-01-03",NA,NA,10 5 | "2002-01-04",NA,NA,10 6 | "2002-01-05",NA,NA,10 7 | "2002-01-06",NA,NA,10 8 | "2002-01-07",NA,NA,10 9 | "2002-01-08",NA,NA,10 10 | "2002-01-09",NA,NA,10 11 | "2002-01-10",NA,NA,10 12 | "2002-01-11",NA,NA,10 13 | "2002-01-12",NA,NA,10 14 | "2002-01-13",NA,NA,10 15 | "2002-01-14",NA,NA,10 16 | "2002-01-15",NA,NA,10 17 | "2002-01-16",NA,NA,10 18 | "2002-01-17",NA,NA,10 19 | "2002-01-18",NA,NA,10 20 | "2002-01-19",NA,NA,10 21 | "2002-01-20",NA,NA,10 22 | "2002-01-21",NA,NA,10 23 | "2002-01-22",NA,NA,10 24 | "2002-01-23",NA,NA,10 25 | "2002-01-24",NA,NA,10 26 | "2002-01-25",NA,NA,10 27 | "2002-01-26",NA,NA,10 28 | "2002-01-27",NA,NA,10 29 | "2002-01-28",NA,NA,10 30 | "2002-01-29",NA,NA,10 31 | "2002-01-30",NA,NA,10 32 | "2002-01-31",NA,NA,10 33 | "2002-02-01",NA,NA,10 34 | "2002-02-02",NA,NA,10 35 | "2002-02-03",NA,NA,10 36 | "2002-02-04",NA,NA,10 37 | "2002-02-05",NA,NA,10 38 | "2002-02-06",NA,NA,10 39 | "2002-02-07",NA,NA,10 40 | "2002-02-08",0.782,2.05,10 41 | "2002-02-09",NA,NA,10 42 | "2002-02-10",NA,NA,10 43 | "2002-02-11",NA,NA,10 44 | "2002-02-12",NA,NA,10 45 | "2002-02-13",0.416,0.292,10 46 | "2002-02-14",NA,NA,10 47 | "2002-02-15",NA,NA,10 48 | "2002-02-16",NA,NA,10 49 | 
"2002-02-17",NA,NA,10 50 | "2002-02-18",NA,NA,10 51 | "2002-02-19",0.665,0.544,10 52 | "2002-02-20",NA,NA,10 53 | "2002-02-21",NA,NA,10 54 | "2002-02-22",NA,NA,10 55 | "2002-02-23",NA,NA,10 56 | "2002-02-24",NA,NA,10 57 | "2002-02-25",0.537,0.977,10 58 | "2002-02-26",NA,NA,10 59 | "2002-02-27",NA,NA,10 60 | "2002-02-28",NA,NA,10 61 | "2002-03-01",NA,NA,10 62 | "2002-03-02",NA,NA,10 63 | "2002-03-03",NA,NA,10 64 | "2002-03-04",NA,NA,10 65 | "2002-03-05",NA,NA,10 66 | "2002-03-06",NA,NA,10 67 | "2002-03-07",NA,NA,10 68 | "2002-03-08",NA,NA,10 69 | "2002-03-09",1.25,1,10 70 | "2002-03-10",NA,NA,10 71 | "2002-03-11",NA,NA,10 72 | "2002-03-12",NA,NA,10 73 | "2002-03-13",NA,NA,10 74 | "2002-03-14",NA,NA,10 75 | "2002-03-15",0.985,0.722,10 76 | "2002-03-16",NA,NA,10 77 | "2002-03-17",NA,NA,10 78 | "2002-03-18",1.04,2.39,10 79 | "2002-03-19",NA,NA,10 80 | "2002-03-20",NA,NA,10 81 | "2002-03-21",NA,NA,10 82 | "2002-03-22",NA,NA,10 83 | "2002-03-23",NA,NA,10 84 | "2002-03-24",0.698,0.589,10 85 | "2002-03-25",NA,NA,10 86 | "2002-03-26",NA,NA,10 87 | "2002-03-27",NA,0.294,10 88 | "2002-03-28",NA,NA,10 89 | "2002-03-29",NA,NA,10 90 | "2002-03-30",0.755,0.396,10 91 | "2002-03-31",NA,NA,10 92 | "2002-04-01",NA,NA,10 93 | "2002-04-02",NA,NA,10 94 | "2002-04-03",NA,NA,10 95 | "2002-04-04",NA,NA,10 96 | "2002-04-05",1.55,1.05,10 97 | "2002-04-06",NA,NA,10 98 | "2002-04-07",NA,NA,10 99 | "2002-04-08",1.31,1.21,10 100 | "2002-04-09",NA,NA,10 101 | "2002-04-10",NA,NA,10 102 | "2002-04-11",1.52,0.218,10 103 | "2002-04-12",NA,NA,10 104 | "2002-04-13",NA,NA,10 105 | "2002-04-14",0.903,1.36,10 106 | "2002-04-15",NA,NA,10 107 | "2002-04-16",NA,NA,10 108 | "2002-04-17",0.469,0.27,10 109 | "2002-04-18",NA,NA,10 110 | "2002-04-19",NA,NA,10 111 | "2002-04-20",NA,NA,10 112 | "2002-04-21",NA,NA,10 113 | "2002-04-22",NA,NA,10 114 | "2002-04-23",0.913,0.427,10 115 | "2002-04-24",NA,NA,10 116 | "2002-04-25",NA,NA,10 117 | "2002-04-26",1.05,0.334,10 118 | "2002-04-27",NA,NA,10 119 | "2002-04-28",NA,NA,10 120 | "2002-04-29",1.21,1.12,10 121 | "2002-04-30",NA,NA,10 122 | "2002-05-01",NA,NA,10 123 | "2002-05-02",0.328,0.149,10 124 | "2002-05-03",NA,NA,10 125 | "2002-05-04",NA,NA,10 126 | "2002-05-05",NA,NA,10 127 | "2002-05-06",NA,NA,10 128 | "2002-05-07",NA,NA,10 129 | "2002-05-08",0.873,0.482,10 130 | "2002-05-09",NA,NA,10 131 | "2002-05-10",NA,NA,10 132 | "2002-05-11",NA,NA,10 133 | "2002-05-12",NA,NA,10 134 | "2002-05-13",NA,NA,10 135 | "2002-05-14",0.634,0.339,10 136 | "2002-05-15",NA,NA,10 137 | "2002-05-16",NA,NA,10 138 | "2002-05-17",0.866,0.442,10 139 | "2002-05-18",NA,NA,10 140 | "2002-05-19",NA,NA,10 141 | "2002-05-20",1.1,0.454,10 142 | "2002-05-21",NA,NA,10 143 | "2002-05-22",NA,NA,10 144 | "2002-05-23",0.264,0.221,10 145 | "2002-05-24",NA,NA,10 146 | "2002-05-25",NA,NA,10 147 | "2002-05-26",0.659,0.507,10 148 | "2002-05-27",NA,NA,10 149 | "2002-05-28",NA,NA,10 150 | "2002-05-29",0.219,0.181,10 151 | "2002-05-30",NA,NA,10 152 | "2002-05-31",NA,NA,10 153 | "2002-06-01",NA,NA,10 154 | "2002-06-02",NA,NA,10 155 | "2002-06-03",NA,NA,10 156 | "2002-06-04",NA,NA,10 157 | "2002-06-05",NA,NA,10 158 | "2002-06-06",NA,NA,10 159 | "2002-06-07",0.595,0.266,10 160 | "2002-06-08",NA,NA,10 161 | "2002-06-09",NA,NA,10 162 | "2002-06-10",NA,NA,10 163 | "2002-06-11",NA,NA,10 164 | "2002-06-12",NA,NA,10 165 | "2002-06-13",NA,NA,10 166 | "2002-06-14",NA,NA,10 167 | "2002-06-15",NA,NA,10 168 | "2002-06-16",0.787,0.456,10 169 | "2002-06-17",NA,NA,10 170 | "2002-06-18",NA,NA,10 171 | "2002-06-19",NA,NA,10 172 | "2002-06-20",NA,NA,10 173 
| "2002-06-21",NA,NA,10 174 | "2002-06-22",0.592,0.214,10 175 | "2002-06-23",NA,NA,10 176 | "2002-06-24",NA,NA,10 177 | "2002-06-25",0.387,0.305,10 178 | "2002-06-26",NA,NA,10 179 | "2002-06-27",NA,NA,10 180 | "2002-06-28",NA,0.401,10 181 | "2002-06-29",NA,NA,10 182 | "2002-06-30",NA,NA,10 183 | "2002-07-01",NA,NA,10 184 | "2002-07-02",NA,NA,10 185 | "2002-07-03",NA,NA,10 186 | "2002-07-04",NA,NA,10 187 | "2002-07-05",NA,NA,10 188 | "2002-07-06",NA,NA,10 189 | "2002-07-07",1.53,0.373,10 190 | "2002-07-08",NA,NA,10 191 | "2002-07-09",NA,NA,10 192 | "2002-07-10",NA,NA,10 193 | "2002-07-11",NA,NA,10 194 | "2002-07-12",NA,NA,10 195 | "2002-07-13",0.862,0.37,10 196 | "2002-07-14",NA,NA,10 197 | "2002-07-15",NA,NA,10 198 | "2002-07-16",0.706,0.404,10 199 | "2002-07-17",NA,NA,10 200 | "2002-07-18",NA,NA,10 201 | "2002-07-19",NA,NA,10 202 | "2002-07-20",NA,NA,10 203 | "2002-07-21",NA,NA,10 204 | "2002-07-22",0.394,0.219,10 205 | "2002-07-23",NA,NA,10 206 | "2002-07-24",NA,NA,10 207 | "2002-07-25",0.966,0.376,10 208 | "2002-07-26",NA,NA,10 209 | "2002-07-27",NA,NA,10 210 | "2002-07-28",0.766,0.393,10 211 | "2002-07-29",NA,NA,10 212 | "2002-07-30",NA,NA,10 213 | "2002-07-31",0.413,0.21,10 214 | "2002-08-01",NA,NA,10 215 | "2002-08-02",NA,NA,10 216 | "2002-08-03",NA,NA,10 217 | "2002-08-04",NA,NA,10 218 | "2002-08-05",NA,NA,10 219 | "2002-08-06",NA,NA,10 220 | "2002-08-07",NA,NA,10 221 | "2002-08-08",NA,NA,10 222 | "2002-08-09",NA,NA,10 223 | "2002-08-10",NA,NA,10 224 | "2002-08-11",NA,NA,10 225 | "2002-08-12",NA,NA,10 226 | "2002-08-13",NA,NA,10 227 | "2002-08-14",NA,NA,10 228 | "2002-08-15",NA,NA,10 229 | "2002-08-16",NA,NA,10 230 | "2002-08-17",NA,NA,10 231 | "2002-08-18",0.839,0.315,10 232 | "2002-08-19",NA,NA,10 233 | "2002-08-20",NA,NA,10 234 | "2002-08-21",NA,NA,10 235 | "2002-08-22",NA,NA,10 236 | "2002-08-23",NA,NA,10 237 | "2002-08-24",0.258,0.216,10 238 | "2002-08-25",NA,NA,10 239 | "2002-08-26",NA,NA,10 240 | "2002-08-27",NA,0.687,10 241 | "2002-08-28",NA,NA,10 242 | "2002-08-29",NA,NA,10 243 | "2002-08-30",0.445,0.203,10 244 | "2002-08-31",NA,NA,10 245 | "2002-09-01",NA,NA,10 246 | "2002-09-02",NA,NA,10 247 | "2002-09-03",NA,NA,10 248 | "2002-09-04",NA,NA,10 249 | "2002-09-05",NA,0.236,10 250 | "2002-09-06",NA,NA,10 251 | "2002-09-07",NA,NA,10 252 | "2002-09-08",NA,NA,10 253 | "2002-09-09",NA,NA,10 254 | "2002-09-10",NA,NA,10 255 | "2002-09-11",0.247,0.0751,10 256 | "2002-09-12",NA,NA,10 257 | "2002-09-13",NA,NA,10 258 | "2002-09-14",NA,NA,10 259 | "2002-09-15",NA,NA,10 260 | "2002-09-16",NA,NA,10 261 | "2002-09-17",NA,0.231,10 262 | "2002-09-18",NA,NA,10 263 | "2002-09-19",NA,NA,10 264 | "2002-09-20",NA,NA,10 265 | "2002-09-21",NA,NA,10 266 | "2002-09-22",NA,NA,10 267 | "2002-09-23",0.433,0.246,10 268 | "2002-09-24",NA,NA,10 269 | "2002-09-25",NA,NA,10 270 | "2002-09-26",NA,0.343,10 271 | "2002-09-27",NA,NA,10 272 | "2002-09-28",NA,NA,10 273 | "2002-09-29",NA,0.235,10 274 | "2002-09-30",NA,NA,10 275 | "2002-10-01",NA,NA,10 276 | "2002-10-02",NA,0.319,10 277 | "2002-10-03",NA,NA,10 278 | "2002-10-04",NA,NA,10 279 | "2002-10-05",NA,NA,10 280 | "2002-10-06",NA,NA,10 281 | "2002-10-07",NA,NA,10 282 | "2002-10-08",NA,NA,10 283 | "2002-10-09",NA,NA,10 284 | "2002-10-10",NA,NA,10 285 | "2002-10-11",NA,NA,10 286 | "2002-10-12",NA,NA,10 287 | "2002-10-13",NA,NA,10 288 | "2002-10-14",NA,NA,10 289 | "2002-10-15",NA,NA,10 290 | "2002-10-16",NA,NA,10 291 | "2002-10-17",0.402,0.556,10 292 | "2002-10-18",NA,NA,10 293 | "2002-10-19",NA,NA,10 294 | "2002-10-20",0.336,0.411,10 295 | "2002-10-21",NA,NA,10 
296 | "2002-10-22",NA,NA,10 297 | "2002-10-23",NA,NA,10 298 | "2002-10-24",NA,NA,10 299 | "2002-10-25",NA,NA,10 300 | "2002-10-26",NA,0.547,10 301 | "2002-10-27",NA,NA,10 302 | "2002-10-28",NA,NA,10 303 | "2002-10-29",0.197,0.159,10 304 | "2002-10-30",NA,NA,10 305 | "2002-10-31",NA,NA,10 306 | "2002-11-01",NA,NA,10 307 | "2002-11-02",NA,0.644,10 308 | "2002-11-03",NA,NA,10 309 | "2002-11-04",NA,NA,10 310 | "2002-11-05",NA,NA,10 311 | "2002-11-06",NA,NA,10 312 | "2002-11-07",NA,NA,10 313 | "2002-11-08",NA,NA,10 314 | "2002-11-09",NA,NA,10 315 | "2002-11-10",0.387,0.669,10 316 | "2002-11-11",NA,NA,10 317 | "2002-11-12",NA,NA,10 318 | "2002-11-13",0.587,0.973,10 319 | "2002-11-14",NA,NA,10 320 | "2002-11-15",NA,NA,10 321 | "2002-11-16",0.365,2.11,10 322 | "2002-11-17",NA,NA,10 323 | "2002-11-18",NA,NA,10 324 | "2002-11-19",NA,1.24,10 325 | "2002-11-20",NA,NA,10 326 | "2002-11-21",NA,NA,10 327 | "2002-11-22",NA,NA,10 328 | "2002-11-23",NA,NA,10 329 | "2002-11-24",NA,NA,10 330 | "2002-11-25",NA,0.394,10 331 | "2002-11-26",NA,NA,10 332 | "2002-11-27",NA,NA,10 333 | "2002-11-28",NA,NA,10 334 | "2002-11-29",NA,NA,10 335 | "2002-11-30",NA,NA,10 336 | "2002-12-01",NA,NA,10 337 | "2002-12-02",NA,NA,10 338 | "2002-12-03",NA,NA,10 339 | "2002-12-04",NA,0.563,10 340 | "2002-12-05",NA,NA,10 341 | "2002-12-06",NA,NA,10 342 | "2002-12-07",0.414,0.557,10 343 | "2002-12-08",NA,NA,10 344 | "2002-12-09",NA,NA,10 345 | "2002-12-10",NA,NA,10 346 | "2002-12-11",NA,NA,10 347 | "2002-12-12",NA,NA,10 348 | "2002-12-13",0.434,1.2,10 349 | "2002-12-14",NA,NA,10 350 | "2002-12-15",NA,NA,10 351 | "2002-12-16",NA,NA,10 352 | "2002-12-17",NA,1.49,10 353 | "2002-12-18",NA,NA,10 354 | "2002-12-19",0.753,1.33,10 355 | "2002-12-20",NA,NA,10 356 | "2002-12-21",NA,NA,10 357 | "2002-12-22",NA,NA,10 358 | "2002-12-23",NA,NA,10 359 | "2002-12-24",NA,NA,10 360 | "2002-12-25",NA,1.01,10 361 | "2002-12-26",NA,NA,10 362 | "2002-12-27",NA,NA,10 363 | "2002-12-28",NA,NA,10 364 | "2002-12-29",NA,NA,10 365 | "2002-12-30",NA,NA,10 366 | "2002-12-31",0.752,2.16,10 367 | "2003-01-01",NA,NA,10 368 | "2003-01-02",NA,NA,10 369 | "2003-01-03",0.386,0.863,10 370 | "2003-01-04",NA,NA,10 371 | "2003-01-05",NA,NA,10 372 | "2003-01-06",NA,NA,10 373 | "2003-01-07",NA,NA,10 374 | "2003-01-08",NA,NA,10 375 | "2003-01-09",0.349,0.822,10 376 | "2003-01-10",NA,NA,10 377 | "2003-01-11",NA,NA,10 378 | "2003-01-12",NA,0.931,10 379 | "2003-01-13",NA,NA,10 380 | "2003-01-14",NA,NA,10 381 | "2003-01-15",0.393,0.659,10 382 | "2003-01-16",NA,NA,10 383 | "2003-01-17",NA,NA,10 384 | "2003-01-18",NA,1.13,10 385 | "2003-01-19",NA,NA,10 386 | "2003-01-20",NA,NA,10 387 | "2003-01-21",NA,NA,10 388 | "2003-01-22",NA,NA,10 389 | "2003-01-23",NA,NA,10 390 | "2003-01-24",NA,NA,10 391 | "2003-01-25",NA,NA,10 392 | "2003-01-26",NA,NA,10 393 | "2003-01-27",NA,0.576,10 394 | "2003-01-28",NA,NA,10 395 | "2003-01-29",NA,NA,10 396 | "2003-01-30",0.392,1.12,10 397 | "2003-01-31",NA,NA,10 398 | "2003-02-01",NA,NA,10 399 | "2003-02-02",NA,1.24,10 400 | "2003-02-03",NA,NA,10 401 | "2003-02-04",NA,NA,10 402 | "2003-02-05",NA,NA,10 403 | "2003-02-06",NA,NA,10 404 | "2003-02-07",NA,NA,10 405 | "2003-02-08",0.253,0.503,10 406 | "2003-02-09",NA,NA,10 407 | "2003-02-10",NA,NA,10 408 | "2003-02-11",0.74,0.62,10 409 | "2003-02-12",NA,NA,10 410 | "2003-02-13",NA,NA,10 411 | "2003-02-14",NA,0.746,10 412 | "2003-02-15",NA,NA,10 413 | "2003-02-16",NA,NA,10 414 | "2003-02-17",NA,NA,10 415 | "2003-02-18",NA,NA,10 416 | "2003-02-19",NA,NA,10 417 | "2003-02-20",NA,0.351,10 418 | "2003-02-21",NA,NA,10 
419 | "2003-02-22",NA,NA,10 420 | "2003-02-23",0.237,0.192,10 421 | "2003-02-24",NA,NA,10 422 | "2003-02-25",NA,NA,10 423 | "2003-02-26",NA,NA,10 424 | "2003-02-27",NA,NA,10 425 | "2003-02-28",NA,NA,10 426 | "2003-03-01",NA,NA,10 427 | "2003-03-02",NA,NA,10 428 | "2003-03-03",NA,NA,10 429 | "2003-03-04",0.665,0.583,10 430 | "2003-03-05",NA,NA,10 431 | "2003-03-06",NA,NA,10 432 | "2003-03-07",0.514,0.543,10 433 | "2003-03-08",NA,NA,10 434 | "2003-03-09",NA,NA,10 435 | "2003-03-10",0.86,0.132,10 436 | "2003-03-11",NA,NA,10 437 | "2003-03-12",NA,NA,10 438 | "2003-03-13",1.9,0.205,10 439 | "2003-03-14",NA,NA,10 440 | "2003-03-15",NA,NA,10 441 | "2003-03-16",0.488,0.461,10 442 | "2003-03-17",NA,NA,10 443 | "2003-03-18",NA,NA,10 444 | "2003-03-19",NA,NA,10 445 | "2003-03-20",NA,NA,10 446 | "2003-03-21",NA,NA,10 447 | "2003-03-22",1.12,0.741,10 448 | "2003-03-23",NA,NA,10 449 | "2003-03-24",NA,NA,10 450 | "2003-03-25",NA,NA,10 451 | "2003-03-26",NA,NA,10 452 | "2003-03-27",NA,NA,10 453 | "2003-03-28",0.705,0.231,10 454 | "2003-03-29",NA,NA,10 455 | "2003-03-30",NA,NA,10 456 | "2003-03-31",NA,0.148,10 457 | "2003-04-01",NA,NA,10 458 | "2003-04-02",NA,NA,10 459 | "2003-04-03",0.766,0.629,10 460 | "2003-04-04",NA,NA,10 461 | "2003-04-05",NA,NA,10 462 | "2003-04-06",0.904,0.134,10 463 | "2003-04-07",NA,NA,10 464 | "2003-04-08",NA,NA,10 465 | "2003-04-09",0.577,0.414,10 466 | "2003-04-10",NA,NA,10 467 | "2003-04-11",NA,NA,10 468 | "2003-04-12",NA,NA,10 469 | "2003-04-13",NA,NA,10 470 | "2003-04-14",NA,NA,10 471 | "2003-04-15",NA,NA,10 472 | "2003-04-16",NA,NA,10 473 | "2003-04-17",NA,NA,10 474 | "2003-04-18",0.453,0.53,10 475 | "2003-04-19",NA,NA,10 476 | "2003-04-20",NA,NA,10 477 | "2003-04-21",0.143,0.134,10 478 | "2003-04-22",NA,NA,10 479 | "2003-04-23",NA,NA,10 480 | "2003-04-24",0.414,0.328,10 481 | "2003-04-25",NA,NA,10 482 | "2003-04-26",NA,NA,10 483 | "2003-04-27",NA,NA,10 484 | "2003-04-28",NA,NA,10 485 | "2003-04-29",NA,NA,10 486 | "2003-04-30",1.14,0.358,10 487 | "2003-05-01",NA,NA,10 488 | "2003-05-02",NA,NA,10 489 | "2003-05-03",NA,NA,10 490 | "2003-05-04",NA,NA,10 491 | "2003-05-05",NA,NA,10 492 | "2003-05-06",1.39,0.77,10 493 | "2003-05-07",NA,NA,10 494 | "2003-05-08",NA,NA,10 495 | "2003-05-09",NA,NA,10 496 | "2003-05-10",NA,NA,10 497 | "2003-05-11",NA,NA,10 498 | "2003-05-12",2.27,0.367,10 499 | "2003-05-13",NA,NA,10 500 | "2003-05-14",NA,NA,10 501 | "2003-05-15",0.991,0.321,10 502 | "2003-05-16",NA,NA,10 503 | "2003-05-17",NA,NA,10 504 | "2003-05-18",1.02,0.306,10 505 | "2003-05-19",NA,NA,10 506 | "2003-05-20",NA,NA,10 507 | "2003-05-21",0.783,0.358,10 508 | "2003-05-22",NA,NA,10 509 | "2003-05-23",NA,NA,10 510 | "2003-05-24",0.544,0.165,10 511 | "2003-05-25",NA,NA,10 512 | "2003-05-26",NA,NA,10 513 | "2003-05-27",NA,NA,10 514 | "2003-05-28",NA,NA,10 515 | "2003-05-29",NA,NA,10 516 | "2003-05-30",NA,NA,10 517 | "2003-05-31",NA,NA,10 518 | "2003-06-01",NA,NA,10 519 | "2003-06-02",NA,NA,10 520 | "2003-06-03",NA,NA,10 521 | "2003-06-04",NA,NA,10 522 | "2003-06-05",NA,NA,10 523 | "2003-06-06",NA,NA,10 524 | "2003-06-07",NA,NA,10 525 | "2003-06-08",NA,0.304,10 526 | "2003-06-09",NA,NA,10 527 | "2003-06-10",NA,NA,10 528 | "2003-06-11",NA,0.59,10 529 | "2003-06-12",NA,NA,10 530 | "2003-06-13",NA,NA,10 531 | "2003-06-14",0.46,0.218,10 532 | "2003-06-15",NA,NA,10 533 | "2003-06-16",NA,NA,10 534 | "2003-06-17",0.447,0.188,10 535 | "2003-06-18",NA,NA,10 536 | "2003-06-19",NA,NA,10 537 | "2003-06-20",0.769,0.323,10 538 | "2003-06-21",NA,NA,10 539 | "2003-06-22",NA,NA,10 540 | 
"2003-06-23",0.645,0.331,10 541 | "2003-06-24",NA,NA,10 542 | "2003-06-25",NA,NA,10 543 | "2003-06-26",NA,NA,10 544 | "2003-06-27",NA,NA,10 545 | "2003-06-28",NA,NA,10 546 | "2003-06-29",NA,NA,10 547 | "2003-06-30",NA,NA,10 548 | "2003-07-01",NA,NA,10 549 | "2003-07-02",0.571,0.202,10 550 | "2003-07-03",NA,NA,10 551 | "2003-07-04",NA,NA,10 552 | "2003-07-05",0.741,0.199,10 553 | "2003-07-06",NA,NA,10 554 | "2003-07-07",NA,NA,10 555 | "2003-07-08",0.417,0.15,10 556 | "2003-07-09",NA,NA,10 557 | "2003-07-10",NA,NA,10 558 | "2003-07-11",0.7,0.36,10 559 | "2003-07-12",NA,NA,10 560 | "2003-07-13",NA,NA,10 561 | "2003-07-14",NA,0.224,10 562 | "2003-07-15",NA,NA,10 563 | "2003-07-16",NA,NA,10 564 | "2003-07-17",NA,NA,10 565 | "2003-07-18",NA,NA,10 566 | "2003-07-19",NA,NA,10 567 | "2003-07-20",NA,NA,10 568 | "2003-07-21",NA,NA,10 569 | "2003-07-22",NA,NA,10 570 | "2003-07-23",0.754,0.279,10 571 | "2003-07-24",NA,NA,10 572 | "2003-07-25",NA,NA,10 573 | "2003-07-26",NA,NA,10 574 | "2003-07-27",NA,NA,10 575 | "2003-07-28",NA,NA,10 576 | "2003-07-29",0.365,0.129,10 577 | "2003-07-30",NA,NA,10 578 | "2003-07-31",NA,NA,10 579 | "2003-08-01",NA,NA,10 580 | "2003-08-02",NA,NA,10 581 | "2003-08-03",NA,NA,10 582 | "2003-08-04",NA,NA,10 583 | "2003-08-05",NA,0.199,10 584 | "2003-08-06",NA,NA,10 585 | "2003-08-07",NA,NA,10 586 | "2003-08-08",NA,NA,10 587 | "2003-08-09",NA,NA,10 588 | "2003-08-10",NA,NA,10 589 | "2003-08-11",NA,NA,10 590 | "2003-08-12",NA,NA,10 591 | "2003-08-13",NA,NA,10 592 | "2003-08-14",NA,NA,10 593 | "2003-08-15",NA,NA,10 594 | "2003-08-16",0.281,0.204,10 595 | "2003-08-17",NA,NA,10 596 | "2003-08-18",NA,NA,10 597 | "2003-08-19",0.768,0.254,10 598 | "2003-08-20",NA,NA,10 599 | "2003-08-21",NA,NA,10 600 | "2003-08-22",NA,NA,10 601 | "2003-08-23",NA,NA,10 602 | "2003-08-24",NA,NA,10 603 | "2003-08-25",NA,NA,10 604 | "2003-08-26",NA,NA,10 605 | "2003-08-27",NA,NA,10 606 | "2003-08-28",NA,0.563,10 607 | "2003-08-29",NA,NA,10 608 | "2003-08-30",NA,NA,10 609 | "2003-08-31",0.782,0.213,10 610 | "2003-09-01",NA,NA,10 611 | "2003-09-02",NA,NA,10 612 | "2003-09-03",0.455,0.219,10 613 | "2003-09-04",NA,NA,10 614 | "2003-09-05",NA,NA,10 615 | "2003-09-06",NA,NA,10 616 | "2003-09-07",NA,NA,10 617 | "2003-09-08",NA,NA,10 618 | "2003-09-09",NA,0.325,10 619 | "2003-09-10",NA,NA,10 620 | "2003-09-11",NA,NA,10 621 | "2003-09-12",0.509,0.294,10 622 | "2003-09-13",NA,NA,10 623 | "2003-09-14",NA,NA,10 624 | "2003-09-15",0.257,0.116,10 625 | "2003-09-16",NA,NA,10 626 | "2003-09-17",NA,NA,10 627 | "2003-09-18",NA,NA,10 628 | "2003-09-19",NA,NA,10 629 | "2003-09-20",NA,NA,10 630 | "2003-09-21",NA,NA,10 631 | "2003-09-22",NA,NA,10 632 | "2003-09-23",NA,NA,10 633 | "2003-09-24",NA,NA,10 634 | "2003-09-25",NA,NA,10 635 | "2003-09-26",NA,NA,10 636 | "2003-09-27",0.538,0.265,10 637 | "2003-09-28",NA,NA,10 638 | "2003-09-29",NA,NA,10 639 | "2003-09-30",0.367,0.161,10 640 | "2003-10-01",NA,NA,10 641 | "2003-10-02",NA,NA,10 642 | "2003-10-03",NA,NA,10 643 | "2003-10-04",NA,NA,10 644 | "2003-10-05",NA,NA,10 645 | "2003-10-06",NA,NA,10 646 | "2003-10-07",NA,NA,10 647 | "2003-10-08",NA,NA,10 648 | "2003-10-09",NA,NA,10 649 | "2003-10-10",NA,NA,10 650 | "2003-10-11",NA,NA,10 651 | "2003-10-12",0.321,0.673,10 652 | "2003-10-13",NA,NA,10 653 | "2003-10-14",NA,NA,10 654 | "2003-10-15",NA,NA,10 655 | "2003-10-16",NA,NA,10 656 | "2003-10-17",NA,NA,10 657 | "2003-10-18",0.349,0.173,10 658 | "2003-10-19",NA,NA,10 659 | "2003-10-20",0.299,0.43,10 660 | "2003-10-21",NA,NA,10 661 | "2003-10-22",NA,NA,10 662 | "2003-10-23",NA,NA,10 
663 | "2003-10-24",NA,NA,10 664 | "2003-10-25",NA,NA,10 665 | "2003-10-26",NA,NA,10 666 | "2003-10-27",NA,NA,10 667 | "2003-10-28",NA,NA,10 668 | "2003-10-29",NA,NA,10 669 | "2003-10-30",0.408,0.295,10 670 | "2003-10-31",NA,NA,10 671 | "2003-11-01",NA,NA,10 672 | "2003-11-02",0.265,0.628,10 673 | "2003-11-03",NA,NA,10 674 | "2003-11-04",NA,NA,10 675 | "2003-11-05",NA,NA,10 676 | "2003-11-06",NA,NA,10 677 | "2003-11-07",NA,NA,10 678 | "2003-11-08",NA,0.145,10 679 | "2003-11-09",NA,NA,10 680 | "2003-11-10",NA,NA,10 681 | "2003-11-11",NA,NA,10 682 | "2003-11-12",NA,NA,10 683 | "2003-11-13",NA,NA,10 684 | "2003-11-14",0.377,0.195,10 685 | "2003-11-15",NA,NA,10 686 | "2003-11-16",NA,NA,10 687 | "2003-11-17",0.47,0.782,10 688 | "2003-11-18",NA,NA,10 689 | "2003-11-19",NA,NA,10 690 | "2003-11-20",0.283,0.981,10 691 | "2003-11-21",NA,NA,10 692 | "2003-11-22",NA,NA,10 693 | "2003-11-23",0.268,0.389,10 694 | "2003-11-24",NA,NA,10 695 | "2003-11-25",NA,NA,10 696 | "2003-11-26",NA,0.307,10 697 | "2003-11-27",NA,NA,10 698 | "2003-11-28",NA,NA,10 699 | "2003-11-29",0.454,0.877,10 700 | "2003-11-30",NA,NA,10 701 | "2003-12-01",NA,NA,10 702 | "2003-12-02",1.08,0.936,10 703 | "2003-12-03",NA,NA,10 704 | "2003-12-04",NA,NA,10 705 | "2003-12-05",NA,NA,10 706 | "2003-12-06",NA,NA,10 707 | "2003-12-07",NA,NA,10 708 | "2003-12-08",0.324,0.856,10 709 | "2003-12-09",NA,NA,10 710 | "2003-12-10",NA,NA,10 711 | "2003-12-11",0.284,1.33,10 712 | "2003-12-12",NA,NA,10 713 | "2003-12-13",NA,NA,10 714 | "2003-12-14",0.272,2.31,10 715 | "2003-12-15",NA,NA,10 716 | "2003-12-16",NA,NA,10 717 | "2003-12-17",0.323,0.998,10 718 | "2003-12-18",NA,NA,10 719 | "2003-12-19",NA,NA,10 720 | "2003-12-20",0.541,1.37,10 721 | "2003-12-21",NA,NA,10 722 | "2003-12-22",NA,NA,10 723 | "2003-12-23",NA,NA,10 724 | "2003-12-24",NA,NA,10 725 | "2003-12-25",NA,NA,10 726 | "2003-12-26",0.778,1.69,10 727 | "2003-12-27",NA,NA,10 728 | "2003-12-28",NA,NA,10 729 | "2003-12-29",NA,NA,10 730 | "2003-12-30",0.201,0.386,10 731 | "2003-12-31",NA,NA,10 732 | "2004-01-01",NA,1.68,10 733 | "2004-01-02",NA,NA,10 734 | "2004-01-03",NA,NA,10 735 | "2004-01-04",1.65,2.09,10 736 | "2004-01-05",NA,NA,10 737 | "2004-01-06",NA,NA,10 738 | "2004-01-07",NA,NA,10 739 | "2004-01-08",NA,NA,10 740 | "2004-01-09",NA,NA,10 741 | "2004-01-10",0.206,0.289,10 742 | "2004-01-11",NA,NA,10 743 | "2004-01-12",NA,NA,10 744 | "2004-01-13",NA,NA,10 745 | "2004-01-14",NA,NA,10 746 | "2004-01-15",NA,NA,10 747 | "2004-01-16",NA,NA,10 748 | "2004-01-17",NA,NA,10 749 | "2004-01-18",NA,NA,10 750 | "2004-01-19",NA,NA,10 751 | "2004-01-20",NA,NA,10 752 | "2004-01-21",NA,NA,10 753 | "2004-01-22",0.208,0.357,10 754 | "2004-01-23",NA,NA,10 755 | "2004-01-24",NA,NA,10 756 | "2004-01-25",0.751,1.29,10 757 | "2004-01-26",NA,NA,10 758 | "2004-01-27",NA,NA,10 759 | "2004-01-28",0.601,0.59,10 760 | "2004-01-29",NA,NA,10 761 | "2004-01-30",NA,NA,10 762 | "2004-01-31",0.412,0.446,10 763 | "2004-02-01",NA,NA,10 764 | "2004-02-02",NA,NA,10 765 | "2004-02-03",NA,0.876,10 766 | "2004-02-04",NA,NA,10 767 | "2004-02-05",NA,NA,10 768 | "2004-02-06",0.541,0.524,10 769 | "2004-02-07",NA,NA,10 770 | "2004-02-08",NA,NA,10 771 | "2004-02-09",0.571,1.04,10 772 | "2004-02-10",NA,NA,10 773 | "2004-02-11",NA,NA,10 774 | "2004-02-12",0.407,0.69,10 775 | "2004-02-13",NA,NA,10 776 | "2004-02-14",NA,NA,10 777 | "2004-02-15",NA,1.28,10 778 | "2004-02-16",NA,NA,10 779 | "2004-02-17",NA,NA,10 780 | "2004-02-18",0.415,1.1,10 781 | "2004-02-19",NA,NA,10 782 | "2004-02-20",NA,NA,10 783 | "2004-02-21",0.227,0.716,10 784 | 
"2004-02-22",NA,NA,10 785 | "2004-02-23",NA,NA,10 786 | "2004-02-24",0.442,0.754,10 787 | "2004-02-25",NA,NA,10 788 | "2004-02-26",NA,NA,10 789 | "2004-02-27",NA,NA,10 790 | "2004-02-28",NA,NA,10 791 | "2004-02-29",NA,NA,10 792 | "2004-03-01",NA,NA,10 793 | "2004-03-02",NA,NA,10 794 | "2004-03-03",NA,NA,10 795 | "2004-03-04",0.311,1.39,10 796 | "2004-03-05",NA,NA,10 797 | "2004-03-06",NA,NA,10 798 | "2004-03-07",0.425,0.564,10 799 | "2004-03-08",NA,NA,10 800 | "2004-03-09",NA,NA,10 801 | "2004-03-10",0.446,0.615,10 802 | "2004-03-11",NA,NA,10 803 | "2004-03-12",NA,NA,10 804 | "2004-03-13",NA,NA,10 805 | "2004-03-14",NA,NA,10 806 | "2004-03-15",NA,NA,10 807 | "2004-03-16",NA,NA,10 808 | "2004-03-17",NA,NA,10 809 | "2004-03-18",NA,NA,10 810 | "2004-03-19",0.486,0.213,10 811 | "2004-03-20",NA,NA,10 812 | "2004-03-21",NA,NA,10 813 | "2004-03-22",NA,NA,10 814 | "2004-03-23",0.891,1.29,10 815 | "2004-03-24",NA,NA,10 816 | "2004-03-25",NA,NA,10 817 | "2004-03-26",NA,NA,10 818 | "2004-03-27",NA,NA,10 819 | "2004-03-28",0.686,0.739,10 820 | "2004-03-29",NA,NA,10 821 | "2004-03-30",NA,NA,10 822 | "2004-03-31",0.983,0.77,10 823 | "2004-04-01",NA,NA,10 824 | "2004-04-02",NA,NA,10 825 | "2004-04-03",0.471,0.753,10 826 | "2004-04-04",NA,NA,10 827 | "2004-04-05",NA,NA,10 828 | "2004-04-06",0.305,0.818,10 829 | "2004-04-07",NA,NA,10 830 | "2004-04-08",NA,NA,10 831 | "2004-04-09",NA,NA,10 832 | "2004-04-10",NA,NA,10 833 | "2004-04-11",NA,NA,10 834 | "2004-04-12",0.458,1.35,10 835 | "2004-04-13",NA,NA,10 836 | "2004-04-14",NA,NA,10 837 | "2004-04-15",0.625,0.545,10 838 | "2004-04-16",NA,NA,10 839 | "2004-04-17",NA,NA,10 840 | "2004-04-18",0.442,0.641,10 841 | "2004-04-19",NA,NA,10 842 | "2004-04-20",NA,NA,10 843 | "2004-04-21",0.48,0.717,10 844 | "2004-04-22",NA,NA,10 845 | "2004-04-23",NA,NA,10 846 | "2004-04-24",NA,0.355,10 847 | "2004-04-25",NA,NA,10 848 | "2004-04-26",NA,NA,10 849 | "2004-04-27",0.117,0.0636,10 850 | "2004-04-28",NA,NA,10 851 | "2004-04-29",NA,NA,10 852 | "2004-04-30",NA,0.163,10 853 | "2004-05-01",NA,NA,10 854 | "2004-05-02",NA,NA,10 855 | "2004-05-03",0.352,0.388,10 856 | "2004-05-04",NA,NA,10 857 | "2004-05-05",NA,NA,10 858 | "2004-05-06",0.769,0.585,10 859 | "2004-05-07",NA,NA,10 860 | "2004-05-08",NA,NA,10 861 | "2004-05-09",NA,NA,10 862 | "2004-05-10",NA,NA,10 863 | "2004-05-11",NA,NA,10 864 | "2004-05-12",1.01,1.73,10 865 | "2004-05-13",NA,NA,10 866 | "2004-05-14",NA,NA,10 867 | "2004-05-15",0.667,0.28,10 868 | "2004-05-16",NA,NA,10 869 | "2004-05-17",NA,NA,10 870 | "2004-05-18",0.415,0.381,10 871 | "2004-05-19",NA,NA,10 872 | "2004-05-20",NA,NA,10 873 | "2004-05-21",1.53,0.44,10 874 | "2004-05-22",NA,NA,10 875 | "2004-05-23",NA,NA,10 876 | "2004-05-24",NA,NA,10 877 | "2004-05-25",NA,NA,10 878 | "2004-05-26",NA,NA,10 879 | "2004-05-27",0.394,0.281,10 880 | "2004-05-28",NA,NA,10 881 | "2004-05-29",NA,NA,10 882 | "2004-05-30",0.349,0.222,10 883 | "2004-05-31",NA,NA,10 884 | "2004-06-01",NA,NA,10 885 | "2004-06-02",NA,NA,10 886 | "2004-06-03",NA,NA,10 887 | "2004-06-04",NA,NA,10 888 | "2004-06-05",NA,NA,10 889 | "2004-06-06",NA,NA,10 890 | "2004-06-07",NA,NA,10 891 | "2004-06-08",NA,NA,10 892 | "2004-06-09",NA,NA,10 893 | "2004-06-10",NA,NA,10 894 | "2004-06-11",NA,NA,10 895 | "2004-06-12",NA,NA,10 896 | "2004-06-13",NA,NA,10 897 | "2004-06-14",NA,NA,10 898 | "2004-06-15",NA,NA,10 899 | "2004-06-16",NA,NA,10 900 | "2004-06-17",NA,NA,10 901 | "2004-06-18",NA,NA,10 902 | "2004-06-19",NA,NA,10 903 | "2004-06-20",NA,NA,10 904 | "2004-06-21",NA,NA,10 905 | "2004-06-22",NA,NA,10 906 | 
"2004-06-23",NA,NA,10 907 | "2004-06-24",NA,NA,10 908 | "2004-06-25",NA,NA,10 909 | "2004-06-26",NA,NA,10 910 | "2004-06-27",NA,NA,10 911 | "2004-06-28",NA,NA,10 912 | "2004-06-29",NA,NA,10 913 | "2004-06-30",NA,NA,10 914 | "2004-07-01",NA,NA,10 915 | "2004-07-02",NA,NA,10 916 | "2004-07-03",NA,NA,10 917 | "2004-07-04",NA,NA,10 918 | "2004-07-05",NA,NA,10 919 | "2004-07-06",NA,NA,10 920 | "2004-07-07",NA,NA,10 921 | "2004-07-08",NA,NA,10 922 | "2004-07-09",NA,NA,10 923 | "2004-07-10",NA,NA,10 924 | "2004-07-11",NA,NA,10 925 | "2004-07-12",NA,NA,10 926 | "2004-07-13",NA,NA,10 927 | "2004-07-14",NA,NA,10 928 | "2004-07-15",NA,NA,10 929 | "2004-07-16",NA,NA,10 930 | "2004-07-17",NA,NA,10 931 | "2004-07-18",NA,NA,10 932 | "2004-07-19",NA,NA,10 933 | "2004-07-20",NA,NA,10 934 | "2004-07-21",NA,NA,10 935 | "2004-07-22",NA,NA,10 936 | "2004-07-23",NA,NA,10 937 | "2004-07-24",NA,NA,10 938 | "2004-07-25",NA,NA,10 939 | "2004-07-26",NA,NA,10 940 | "2004-07-27",NA,NA,10 941 | "2004-07-28",NA,NA,10 942 | "2004-07-29",NA,NA,10 943 | "2004-07-30",NA,NA,10 944 | "2004-07-31",NA,NA,10 945 | "2004-08-01",NA,NA,10 946 | "2004-08-02",NA,NA,10 947 | "2004-08-03",NA,NA,10 948 | "2004-08-04",NA,NA,10 949 | "2004-08-05",NA,NA,10 950 | "2004-08-06",NA,NA,10 951 | "2004-08-07",NA,NA,10 952 | "2004-08-08",NA,NA,10 953 | "2004-08-09",NA,NA,10 954 | "2004-08-10",NA,NA,10 955 | "2004-08-11",NA,NA,10 956 | "2004-08-12",NA,NA,10 957 | "2004-08-13",NA,NA,10 958 | "2004-08-14",NA,NA,10 959 | "2004-08-15",NA,NA,10 960 | "2004-08-16",NA,NA,10 961 | "2004-08-17",NA,NA,10 962 | "2004-08-18",NA,NA,10 963 | "2004-08-19",NA,NA,10 964 | "2004-08-20",NA,NA,10 965 | "2004-08-21",NA,NA,10 966 | "2004-08-22",NA,NA,10 967 | "2004-08-23",NA,NA,10 968 | "2004-08-24",NA,NA,10 969 | "2004-08-25",NA,NA,10 970 | "2004-08-26",NA,NA,10 971 | "2004-08-27",NA,NA,10 972 | "2004-08-28",NA,NA,10 973 | "2004-08-29",NA,NA,10 974 | "2004-08-30",NA,NA,10 975 | "2004-08-31",NA,NA,10 976 | "2004-09-01",NA,NA,10 977 | "2004-09-02",NA,NA,10 978 | "2004-09-03",NA,NA,10 979 | "2004-09-04",NA,NA,10 980 | "2004-09-05",NA,NA,10 981 | "2004-09-06",NA,NA,10 982 | "2004-09-07",NA,NA,10 983 | "2004-09-08",NA,NA,10 984 | "2004-09-09",NA,NA,10 985 | "2004-09-10",NA,NA,10 986 | "2004-09-11",NA,NA,10 987 | "2004-09-12",NA,NA,10 988 | "2004-09-13",NA,NA,10 989 | "2004-09-14",NA,NA,10 990 | "2004-09-15",NA,NA,10 991 | "2004-09-16",NA,NA,10 992 | "2004-09-17",NA,NA,10 993 | "2004-09-18",NA,NA,10 994 | "2004-09-19",NA,NA,10 995 | "2004-09-20",NA,NA,10 996 | "2004-09-21",NA,NA,10 997 | "2004-09-22",NA,NA,10 998 | "2004-09-23",NA,NA,10 999 | "2004-09-24",NA,NA,10 1000 | "2004-09-25",NA,NA,10 1001 | "2004-09-26",NA,NA,10 1002 | "2004-09-27",NA,NA,10 1003 | "2004-09-28",NA,NA,10 1004 | "2004-09-29",NA,NA,10 1005 | "2004-09-30",NA,NA,10 1006 | "2004-10-01",NA,NA,10 1007 | "2004-10-02",NA,NA,10 1008 | "2004-10-03",NA,NA,10 1009 | "2004-10-04",NA,NA,10 1010 | "2004-10-05",NA,NA,10 1011 | "2004-10-06",NA,NA,10 1012 | "2004-10-07",NA,NA,10 1013 | "2004-10-08",NA,NA,10 1014 | "2004-10-09",NA,NA,10 1015 | "2004-10-10",NA,NA,10 1016 | "2004-10-11",NA,NA,10 1017 | "2004-10-12",NA,NA,10 1018 | "2004-10-13",NA,NA,10 1019 | "2004-10-14",NA,NA,10 1020 | "2004-10-15",NA,NA,10 1021 | "2004-10-16",NA,NA,10 1022 | "2004-10-17",NA,NA,10 1023 | "2004-10-18",NA,NA,10 1024 | "2004-10-19",NA,NA,10 1025 | "2004-10-20",NA,NA,10 1026 | "2004-10-21",NA,NA,10 1027 | "2004-10-22",NA,NA,10 1028 | "2004-10-23",NA,NA,10 1029 | "2004-10-24",NA,NA,10 1030 | "2004-10-25",NA,NA,10 1031 | "2004-10-26",NA,NA,10 
1032 | "2004-10-27",NA,NA,10 1033 | "2004-10-28",NA,NA,10 1034 | "2004-10-29",NA,NA,10 1035 | "2004-10-30",NA,NA,10 1036 | "2004-10-31",NA,NA,10 1037 | "2004-11-01",NA,NA,10 1038 | "2004-11-02",NA,NA,10 1039 | "2004-11-03",NA,NA,10 1040 | "2004-11-04",NA,NA,10 1041 | "2004-11-05",NA,NA,10 1042 | "2004-11-06",NA,NA,10 1043 | "2004-11-07",NA,NA,10 1044 | "2004-11-08",NA,NA,10 1045 | "2004-11-09",NA,NA,10 1046 | "2004-11-10",NA,NA,10 1047 | "2004-11-11",NA,NA,10 1048 | "2004-11-12",NA,NA,10 1049 | "2004-11-13",NA,NA,10 1050 | "2004-11-14",NA,NA,10 1051 | "2004-11-15",NA,NA,10 1052 | "2004-11-16",NA,NA,10 1053 | "2004-11-17",NA,NA,10 1054 | "2004-11-18",NA,NA,10 1055 | "2004-11-19",NA,NA,10 1056 | "2004-11-20",NA,NA,10 1057 | "2004-11-21",NA,NA,10 1058 | "2004-11-22",NA,NA,10 1059 | "2004-11-23",NA,NA,10 1060 | "2004-11-24",NA,NA,10 1061 | "2004-11-25",NA,NA,10 1062 | "2004-11-26",NA,NA,10 1063 | "2004-11-27",NA,NA,10 1064 | "2004-11-28",NA,NA,10 1065 | "2004-11-29",NA,NA,10 1066 | "2004-11-30",NA,NA,10 1067 | "2004-12-01",NA,NA,10 1068 | "2004-12-02",NA,NA,10 1069 | "2004-12-03",NA,NA,10 1070 | "2004-12-04",NA,NA,10 1071 | "2004-12-05",NA,NA,10 1072 | "2004-12-06",NA,NA,10 1073 | "2004-12-07",NA,NA,10 1074 | "2004-12-08",NA,NA,10 1075 | "2004-12-09",NA,NA,10 1076 | "2004-12-10",NA,NA,10 1077 | "2004-12-11",NA,NA,10 1078 | "2004-12-12",NA,NA,10 1079 | "2004-12-13",NA,NA,10 1080 | "2004-12-14",NA,NA,10 1081 | "2004-12-15",NA,NA,10 1082 | "2004-12-16",NA,NA,10 1083 | "2004-12-17",NA,NA,10 1084 | "2004-12-18",NA,NA,10 1085 | "2004-12-19",NA,NA,10 1086 | "2004-12-20",NA,NA,10 1087 | "2004-12-21",NA,NA,10 1088 | "2004-12-22",NA,NA,10 1089 | "2004-12-23",NA,NA,10 1090 | "2004-12-24",NA,NA,10 1091 | "2004-12-25",NA,NA,10 1092 | "2004-12-26",NA,NA,10 1093 | "2004-12-27",NA,NA,10 1094 | "2004-12-28",NA,NA,10 1095 | "2004-12-29",NA,NA,10 1096 | "2004-12-30",NA,NA,10 1097 | "2004-12-31",NA,NA,10 1098 | -------------------------------------------------------------------------------- /datasets/lotsofdata/012.csv: -------------------------------------------------------------------------------- 1 | "Date","sulfate","nitrate","ID" 2 | "2004-01-01",NA,NA,12 3 | "2004-01-02",NA,NA,12 4 | "2004-01-03",NA,NA,12 5 | "2004-01-04",NA,NA,12 6 | "2004-01-05",NA,NA,12 7 | "2004-01-06",NA,NA,12 8 | "2004-01-07",NA,NA,12 9 | "2004-01-08",NA,NA,12 10 | "2004-01-09",NA,NA,12 11 | "2004-01-10",NA,NA,12 12 | "2004-01-11",NA,NA,12 13 | "2004-01-12",NA,NA,12 14 | "2004-01-13",NA,NA,12 15 | "2004-01-14",NA,NA,12 16 | "2004-01-15",NA,NA,12 17 | "2004-01-16",NA,NA,12 18 | "2004-01-17",NA,NA,12 19 | "2004-01-18",NA,NA,12 20 | "2004-01-19",NA,NA,12 21 | "2004-01-20",NA,NA,12 22 | "2004-01-21",NA,NA,12 23 | "2004-01-22",NA,NA,12 24 | "2004-01-23",NA,NA,12 25 | "2004-01-24",NA,NA,12 26 | "2004-01-25",NA,NA,12 27 | "2004-01-26",NA,NA,12 28 | "2004-01-27",NA,NA,12 29 | "2004-01-28",NA,NA,12 30 | "2004-01-29",NA,NA,12 31 | "2004-01-30",NA,NA,12 32 | "2004-01-31",NA,NA,12 33 | "2004-02-01",NA,NA,12 34 | "2004-02-02",NA,NA,12 35 | "2004-02-03",NA,NA,12 36 | "2004-02-04",NA,NA,12 37 | "2004-02-05",NA,NA,12 38 | "2004-02-06",NA,NA,12 39 | "2004-02-07",NA,NA,12 40 | "2004-02-08",NA,NA,12 41 | "2004-02-09",NA,NA,12 42 | "2004-02-10",NA,NA,12 43 | "2004-02-11",NA,NA,12 44 | "2004-02-12",NA,NA,12 45 | "2004-02-13",NA,NA,12 46 | "2004-02-14",NA,NA,12 47 | "2004-02-15",NA,NA,12 48 | "2004-02-16",NA,NA,12 49 | "2004-02-17",NA,NA,12 50 | "2004-02-18",NA,NA,12 51 | "2004-02-19",NA,NA,12 52 | "2004-02-20",NA,NA,12 53 | "2004-02-21",NA,NA,12 54 | 
"2004-02-22",NA,NA,12 55 | "2004-02-23",NA,NA,12 56 | "2004-02-24",NA,NA,12 57 | "2004-02-25",NA,NA,12 58 | "2004-02-26",NA,NA,12 59 | "2004-02-27",NA,NA,12 60 | "2004-02-28",NA,NA,12 61 | "2004-02-29",NA,NA,12 62 | "2004-03-01",NA,NA,12 63 | "2004-03-02",NA,NA,12 64 | "2004-03-03",NA,NA,12 65 | "2004-03-04",NA,NA,12 66 | "2004-03-05",NA,NA,12 67 | "2004-03-06",NA,NA,12 68 | "2004-03-07",NA,NA,12 69 | "2004-03-08",NA,NA,12 70 | "2004-03-09",NA,NA,12 71 | "2004-03-10",NA,NA,12 72 | "2004-03-11",NA,NA,12 73 | "2004-03-12",NA,NA,12 74 | "2004-03-13",NA,NA,12 75 | "2004-03-14",NA,NA,12 76 | "2004-03-15",NA,NA,12 77 | "2004-03-16",NA,NA,12 78 | "2004-03-17",NA,NA,12 79 | "2004-03-18",NA,NA,12 80 | "2004-03-19",NA,NA,12 81 | "2004-03-20",NA,NA,12 82 | "2004-03-21",NA,NA,12 83 | "2004-03-22",NA,NA,12 84 | "2004-03-23",NA,NA,12 85 | "2004-03-24",NA,NA,12 86 | "2004-03-25",NA,NA,12 87 | "2004-03-26",NA,NA,12 88 | "2004-03-27",NA,NA,12 89 | "2004-03-28",0.0353,0.0598,12 90 | "2004-03-29",NA,NA,12 91 | "2004-03-30",NA,NA,12 92 | "2004-03-31",NA,NA,12 93 | "2004-04-01",NA,NA,12 94 | "2004-04-02",NA,NA,12 95 | "2004-04-03",NA,NA,12 96 | "2004-04-04",NA,NA,12 97 | "2004-04-05",NA,NA,12 98 | "2004-04-06",NA,NA,12 99 | "2004-04-07",NA,NA,12 100 | "2004-04-08",NA,NA,12 101 | "2004-04-09",1.33,0.95,12 102 | "2004-04-10",NA,NA,12 103 | "2004-04-11",NA,NA,12 104 | "2004-04-12",NA,NA,12 105 | "2004-04-13",NA,NA,12 106 | "2004-04-14",NA,NA,12 107 | "2004-04-15",1.29,0.255,12 108 | "2004-04-16",NA,NA,12 109 | "2004-04-17",NA,NA,12 110 | "2004-04-18",NA,NA,12 111 | "2004-04-19",NA,NA,12 112 | "2004-04-20",NA,NA,12 113 | "2004-04-21",1.94,0.853,12 114 | "2004-04-22",NA,NA,12 115 | "2004-04-23",NA,NA,12 116 | "2004-04-24",NA,NA,12 117 | "2004-04-25",NA,NA,12 118 | "2004-04-26",NA,NA,12 119 | "2004-04-27",1.83,0.348,12 120 | "2004-04-28",NA,NA,12 121 | "2004-04-29",NA,NA,12 122 | "2004-04-30",NA,NA,12 123 | "2004-05-01",NA,NA,12 124 | "2004-05-02",NA,NA,12 125 | "2004-05-03",1.16,0.275,12 126 | "2004-05-04",NA,NA,12 127 | "2004-05-05",NA,NA,12 128 | "2004-05-06",NA,NA,12 129 | "2004-05-07",NA,NA,12 130 | "2004-05-08",NA,NA,12 131 | "2004-05-09",1.65,0.438,12 132 | "2004-05-10",NA,NA,12 133 | "2004-05-11",NA,NA,12 134 | "2004-05-12",NA,NA,12 135 | "2004-05-13",NA,NA,12 136 | "2004-05-14",NA,NA,12 137 | "2004-05-15",1.26,0.486,12 138 | "2004-05-16",NA,NA,12 139 | "2004-05-17",NA,NA,12 140 | "2004-05-18",NA,NA,12 141 | "2004-05-19",NA,NA,12 142 | "2004-05-20",NA,NA,12 143 | "2004-05-21",0.804,0.383,12 144 | "2004-05-22",NA,NA,12 145 | "2004-05-23",NA,NA,12 146 | "2004-05-24",NA,NA,12 147 | "2004-05-25",NA,NA,12 148 | "2004-05-26",NA,NA,12 149 | "2004-05-27",1.34,0.692,12 150 | "2004-05-28",NA,NA,12 151 | "2004-05-29",NA,NA,12 152 | "2004-05-30",NA,NA,12 153 | "2004-05-31",NA,NA,12 154 | "2004-06-01",NA,NA,12 155 | "2004-06-02",1.35,0.446,12 156 | "2004-06-03",NA,NA,12 157 | "2004-06-04",NA,NA,12 158 | "2004-06-05",NA,NA,12 159 | "2004-06-06",NA,NA,12 160 | "2004-06-07",NA,NA,12 161 | "2004-06-08",1.04,0.422,12 162 | "2004-06-09",NA,NA,12 163 | "2004-06-10",NA,NA,12 164 | "2004-06-11",NA,NA,12 165 | "2004-06-12",NA,NA,12 166 | "2004-06-13",NA,NA,12 167 | "2004-06-14",1.28,0.307,12 168 | "2004-06-15",NA,NA,12 169 | "2004-06-16",NA,NA,12 170 | "2004-06-17",NA,NA,12 171 | "2004-06-18",NA,NA,12 172 | "2004-06-19",NA,NA,12 173 | "2004-06-20",0.561,0.184,12 174 | "2004-06-21",NA,NA,12 175 | "2004-06-22",NA,NA,12 176 | "2004-06-23",NA,NA,12 177 | "2004-06-24",NA,NA,12 178 | "2004-06-25",NA,NA,12 179 | "2004-06-26",1.15,0.36,12 
180 | "2004-06-27",NA,NA,12 181 | "2004-06-28",NA,NA,12 182 | "2004-06-29",NA,NA,12 183 | "2004-06-30",NA,NA,12 184 | "2004-07-01",NA,NA,12 185 | "2004-07-02",0.532,0.196,12 186 | "2004-07-03",NA,NA,12 187 | "2004-07-04",NA,NA,12 188 | "2004-07-05",NA,NA,12 189 | "2004-07-06",NA,NA,12 190 | "2004-07-07",NA,NA,12 191 | "2004-07-08",NA,NA,12 192 | "2004-07-09",NA,NA,12 193 | "2004-07-10",NA,NA,12 194 | "2004-07-11",NA,NA,12 195 | "2004-07-12",NA,NA,12 196 | "2004-07-13",NA,NA,12 197 | "2004-07-14",1.19,0.957,12 198 | "2004-07-15",NA,NA,12 199 | "2004-07-16",NA,NA,12 200 | "2004-07-17",NA,NA,12 201 | "2004-07-18",NA,NA,12 202 | "2004-07-19",NA,NA,12 203 | "2004-07-20",1.6,0.459,12 204 | "2004-07-21",NA,NA,12 205 | "2004-07-22",NA,NA,12 206 | "2004-07-23",NA,NA,12 207 | "2004-07-24",NA,NA,12 208 | "2004-07-25",NA,NA,12 209 | "2004-07-26",0.973,0.325,12 210 | "2004-07-27",NA,NA,12 211 | "2004-07-28",NA,NA,12 212 | "2004-07-29",NA,NA,12 213 | "2004-07-30",NA,NA,12 214 | "2004-07-31",NA,NA,12 215 | "2004-08-01",1.57,0.366,12 216 | "2004-08-02",NA,NA,12 217 | "2004-08-03",NA,NA,12 218 | "2004-08-04",NA,NA,12 219 | "2004-08-05",NA,NA,12 220 | "2004-08-06",NA,NA,12 221 | "2004-08-07",NA,NA,12 222 | "2004-08-08",NA,NA,12 223 | "2004-08-09",NA,NA,12 224 | "2004-08-10",NA,NA,12 225 | "2004-08-11",NA,NA,12 226 | "2004-08-12",NA,NA,12 227 | "2004-08-13",1.22,0.337,12 228 | "2004-08-14",NA,NA,12 229 | "2004-08-15",NA,NA,12 230 | "2004-08-16",NA,NA,12 231 | "2004-08-17",NA,NA,12 232 | "2004-08-18",NA,NA,12 233 | "2004-08-19",1.04,0.337,12 234 | "2004-08-20",NA,NA,12 235 | "2004-08-21",NA,NA,12 236 | "2004-08-22",NA,NA,12 237 | "2004-08-23",NA,NA,12 238 | "2004-08-24",NA,NA,12 239 | "2004-08-25",1.24,0.268,12 240 | "2004-08-26",NA,NA,12 241 | "2004-08-27",NA,NA,12 242 | "2004-08-28",NA,NA,12 243 | "2004-08-29",NA,NA,12 244 | "2004-08-30",NA,NA,12 245 | "2004-08-31",1.34,0.361,12 246 | "2004-09-01",NA,NA,12 247 | "2004-09-02",NA,NA,12 248 | "2004-09-03",NA,NA,12 249 | "2004-09-04",NA,NA,12 250 | "2004-09-05",NA,NA,12 251 | "2004-09-06",0.876,0.38,12 252 | "2004-09-07",NA,NA,12 253 | "2004-09-08",NA,NA,12 254 | "2004-09-09",NA,NA,12 255 | "2004-09-10",NA,NA,12 256 | "2004-09-11",NA,NA,12 257 | "2004-09-12",1.88,0.781,12 258 | "2004-09-13",NA,NA,12 259 | "2004-09-14",NA,NA,12 260 | "2004-09-15",NA,NA,12 261 | "2004-09-16",NA,NA,12 262 | "2004-09-17",NA,NA,12 263 | "2004-09-18",1.37,0.288,12 264 | "2004-09-19",NA,NA,12 265 | "2004-09-20",NA,NA,12 266 | "2004-09-21",NA,NA,12 267 | "2004-09-22",NA,NA,12 268 | "2004-09-23",NA,NA,12 269 | "2004-09-24",1.06,0.755,12 270 | "2004-09-25",NA,NA,12 271 | "2004-09-26",NA,NA,12 272 | "2004-09-27",NA,NA,12 273 | "2004-09-28",NA,NA,12 274 | "2004-09-29",NA,NA,12 275 | "2004-09-30",0.882,0.48,12 276 | "2004-10-01",NA,NA,12 277 | "2004-10-02",NA,NA,12 278 | "2004-10-03",NA,NA,12 279 | "2004-10-04",NA,NA,12 280 | "2004-10-05",NA,NA,12 281 | "2004-10-06",1.44,0.409,12 282 | "2004-10-07",NA,NA,12 283 | "2004-10-08",NA,NA,12 284 | "2004-10-09",NA,NA,12 285 | "2004-10-10",NA,NA,12 286 | "2004-10-11",NA,NA,12 287 | "2004-10-12",1.88,0.501,12 288 | "2004-10-13",NA,NA,12 289 | "2004-10-14",NA,NA,12 290 | "2004-10-15",NA,NA,12 291 | "2004-10-16",NA,NA,12 292 | "2004-10-17",NA,NA,12 293 | "2004-10-18",1.38,0.826,12 294 | "2004-10-19",NA,NA,12 295 | "2004-10-20",NA,NA,12 296 | "2004-10-21",NA,NA,12 297 | "2004-10-22",NA,NA,12 298 | "2004-10-23",NA,NA,12 299 | "2004-10-24",NA,2.9,12 300 | "2004-10-25",NA,NA,12 301 | "2004-10-26",NA,NA,12 302 | "2004-10-27",NA,NA,12 303 | 
"2004-10-28",NA,NA,12 304 | "2004-10-29",NA,NA,12 305 | "2004-10-30",0.137,0.0314,12 306 | "2004-10-31",NA,NA,12 307 | "2004-11-01",NA,NA,12 308 | "2004-11-02",NA,NA,12 309 | "2004-11-03",NA,NA,12 310 | "2004-11-04",NA,NA,12 311 | "2004-11-05",1.01,0.402,12 312 | "2004-11-06",NA,NA,12 313 | "2004-11-07",NA,NA,12 314 | "2004-11-08",NA,NA,12 315 | "2004-11-09",NA,NA,12 316 | "2004-11-10",NA,NA,12 317 | "2004-11-11",1.11,2.74,12 318 | "2004-11-12",NA,NA,12 319 | "2004-11-13",NA,NA,12 320 | "2004-11-14",NA,NA,12 321 | "2004-11-15",NA,NA,12 322 | "2004-11-16",NA,NA,12 323 | "2004-11-17",0.7,3.98,12 324 | "2004-11-18",NA,NA,12 325 | "2004-11-19",NA,NA,12 326 | "2004-11-20",NA,NA,12 327 | "2004-11-21",NA,NA,12 328 | "2004-11-22",NA,NA,12 329 | "2004-11-23",NA,1.21,12 330 | "2004-11-24",NA,NA,12 331 | "2004-11-25",NA,NA,12 332 | "2004-11-26",NA,NA,12 333 | "2004-11-27",NA,NA,12 334 | "2004-11-28",NA,NA,12 335 | "2004-11-29",0.563,0.39,12 336 | "2004-11-30",NA,NA,12 337 | "2004-12-01",NA,NA,12 338 | "2004-12-02",NA,NA,12 339 | "2004-12-03",NA,NA,12 340 | "2004-12-04",NA,NA,12 341 | "2004-12-05",0.819,3.83,12 342 | "2004-12-06",NA,NA,12 343 | "2004-12-07",NA,NA,12 344 | "2004-12-08",NA,NA,12 345 | "2004-12-09",NA,NA,12 346 | "2004-12-10",NA,NA,12 347 | "2004-12-11",0.752,2.72,12 348 | "2004-12-12",NA,NA,12 349 | "2004-12-13",NA,NA,12 350 | "2004-12-14",NA,NA,12 351 | "2004-12-15",NA,NA,12 352 | "2004-12-16",NA,NA,12 353 | "2004-12-17",0.654,0.318,12 354 | "2004-12-18",NA,NA,12 355 | "2004-12-19",NA,NA,12 356 | "2004-12-20",NA,NA,12 357 | "2004-12-21",NA,NA,12 358 | "2004-12-22",NA,NA,12 359 | "2004-12-23",0.564,1.08,12 360 | "2004-12-24",NA,NA,12 361 | "2004-12-25",NA,NA,12 362 | "2004-12-26",NA,NA,12 363 | "2004-12-27",NA,NA,12 364 | "2004-12-28",NA,NA,12 365 | "2004-12-29",0.573,0.482,12 366 | "2004-12-30",NA,NA,12 367 | "2004-12-31",NA,NA,12 368 | "2005-01-01",NA,NA,12 369 | "2005-01-02",NA,NA,12 370 | "2005-01-03",NA,NA,12 371 | "2005-01-04",0.201,0.816,12 372 | "2005-01-05",NA,NA,12 373 | "2005-01-06",NA,NA,12 374 | "2005-01-07",NA,NA,12 375 | "2005-01-08",NA,NA,12 376 | "2005-01-09",NA,NA,12 377 | "2005-01-10",1.17,4.58,12 378 | "2005-01-11",NA,NA,12 379 | "2005-01-12",NA,NA,12 380 | "2005-01-13",NA,NA,12 381 | "2005-01-14",NA,NA,12 382 | "2005-01-15",NA,NA,12 383 | "2005-01-16",0.48,3.3,12 384 | "2005-01-17",NA,NA,12 385 | "2005-01-18",NA,NA,12 386 | "2005-01-19",NA,NA,12 387 | "2005-01-20",NA,NA,12 388 | "2005-01-21",NA,NA,12 389 | "2005-01-22",1.96,5.01,12 390 | "2005-01-23",NA,NA,12 391 | "2005-01-24",NA,NA,12 392 | "2005-01-25",NA,NA,12 393 | "2005-01-26",NA,NA,12 394 | "2005-01-27",NA,NA,12 395 | "2005-01-28",1.29,3.44,12 396 | "2005-01-29",NA,NA,12 397 | "2005-01-30",NA,NA,12 398 | "2005-01-31",NA,NA,12 399 | "2005-02-01",NA,NA,12 400 | "2005-02-02",NA,NA,12 401 | "2005-02-03",0.886,0.216,12 402 | "2005-02-04",NA,NA,12 403 | "2005-02-05",NA,NA,12 404 | "2005-02-06",NA,NA,12 405 | "2005-02-07",NA,NA,12 406 | "2005-02-08",NA,NA,12 407 | "2005-02-09",0.677,1.01,12 408 | "2005-02-10",NA,NA,12 409 | "2005-02-11",NA,NA,12 410 | "2005-02-12",NA,NA,12 411 | "2005-02-13",NA,NA,12 412 | "2005-02-14",NA,NA,12 413 | "2005-02-15",0.992,6.23,12 414 | "2005-02-16",NA,NA,12 415 | "2005-02-17",NA,NA,12 416 | "2005-02-18",NA,NA,12 417 | "2005-02-19",NA,NA,12 418 | "2005-02-20",NA,NA,12 419 | "2005-02-21",0.603,0.63,12 420 | "2005-02-22",NA,NA,12 421 | "2005-02-23",NA,NA,12 422 | "2005-02-24",NA,NA,12 423 | "2005-02-25",NA,NA,12 424 | "2005-02-26",NA,NA,12 425 | "2005-02-27",0.759,1.3,12 426 | 
"2005-02-28",NA,NA,12 427 | "2005-03-01",NA,NA,12 428 | "2005-03-02",NA,NA,12 429 | "2005-03-03",NA,NA,12 430 | "2005-03-04",NA,NA,12 431 | "2005-03-05",0.382,0.426,12 432 | "2005-03-06",NA,NA,12 433 | "2005-03-07",NA,NA,12 434 | "2005-03-08",NA,NA,12 435 | "2005-03-09",NA,NA,12 436 | "2005-03-10",NA,NA,12 437 | "2005-03-11",0.889,0.8,12 438 | "2005-03-12",NA,NA,12 439 | "2005-03-13",NA,NA,12 440 | "2005-03-14",NA,NA,12 441 | "2005-03-15",NA,NA,12 442 | "2005-03-16",NA,NA,12 443 | "2005-03-17",1.02,1.77,12 444 | "2005-03-18",NA,NA,12 445 | "2005-03-19",NA,NA,12 446 | "2005-03-20",NA,NA,12 447 | "2005-03-21",NA,NA,12 448 | "2005-03-22",NA,NA,12 449 | "2005-03-23",0.453,0.475,12 450 | "2005-03-24",NA,NA,12 451 | "2005-03-25",NA,NA,12 452 | "2005-03-26",NA,NA,12 453 | "2005-03-27",NA,NA,12 454 | "2005-03-28",NA,NA,12 455 | "2005-03-29",0.795,0.293,12 456 | "2005-03-30",NA,NA,12 457 | "2005-03-31",NA,NA,12 458 | "2005-04-01",NA,NA,12 459 | "2005-04-02",NA,NA,12 460 | "2005-04-03",NA,NA,12 461 | "2005-04-04",0.894,0.376,12 462 | "2005-04-05",NA,NA,12 463 | "2005-04-06",NA,NA,12 464 | "2005-04-07",NA,NA,12 465 | "2005-04-08",NA,NA,12 466 | "2005-04-09",NA,NA,12 467 | "2005-04-10",0.372,0.148,12 468 | "2005-04-11",NA,NA,12 469 | "2005-04-12",NA,NA,12 470 | "2005-04-13",NA,NA,12 471 | "2005-04-14",NA,NA,12 472 | "2005-04-15",NA,NA,12 473 | "2005-04-16",2.31,0.837,12 474 | "2005-04-17",NA,NA,12 475 | "2005-04-18",NA,NA,12 476 | "2005-04-19",NA,NA,12 477 | "2005-04-20",NA,NA,12 478 | "2005-04-21",NA,NA,12 479 | "2005-04-22",0.916,1.1,12 480 | "2005-04-23",NA,NA,12 481 | "2005-04-24",NA,NA,12 482 | "2005-04-25",NA,NA,12 483 | "2005-04-26",NA,NA,12 484 | "2005-04-27",NA,NA,12 485 | "2005-04-28",1.1,0.314,12 486 | "2005-04-29",NA,NA,12 487 | "2005-04-30",NA,NA,12 488 | "2005-05-01",NA,NA,12 489 | "2005-05-02",NA,NA,12 490 | "2005-05-03",NA,NA,12 491 | "2005-05-04",1.8,0.353,12 492 | "2005-05-05",NA,NA,12 493 | "2005-05-06",NA,NA,12 494 | "2005-05-07",NA,NA,12 495 | "2005-05-08",NA,NA,12 496 | "2005-05-09",NA,NA,12 497 | "2005-05-10",0.873,0.284,12 498 | "2005-05-11",NA,NA,12 499 | "2005-05-12",NA,NA,12 500 | "2005-05-13",NA,NA,12 501 | "2005-05-14",NA,NA,12 502 | "2005-05-15",NA,NA,12 503 | "2005-05-16",NA,NA,12 504 | "2005-05-17",NA,NA,12 505 | "2005-05-18",NA,NA,12 506 | "2005-05-19",NA,NA,12 507 | "2005-05-20",NA,NA,12 508 | "2005-05-21",NA,NA,12 509 | "2005-05-22",1.22,0.402,12 510 | "2005-05-23",NA,NA,12 511 | "2005-05-24",NA,NA,12 512 | "2005-05-25",NA,NA,12 513 | "2005-05-26",NA,NA,12 514 | "2005-05-27",NA,NA,12 515 | "2005-05-28",NA,NA,12 516 | "2005-05-29",NA,NA,12 517 | "2005-05-30",NA,NA,12 518 | "2005-05-31",NA,NA,12 519 | "2005-06-01",NA,NA,12 520 | "2005-06-02",NA,NA,12 521 | "2005-06-03",1.33,0.447,12 522 | "2005-06-04",NA,NA,12 523 | "2005-06-05",NA,NA,12 524 | "2005-06-06",NA,NA,12 525 | "2005-06-07",NA,NA,12 526 | "2005-06-08",NA,NA,12 527 | "2005-06-09",0.995,0.443,12 528 | "2005-06-10",NA,NA,12 529 | "2005-06-11",NA,NA,12 530 | "2005-06-12",NA,NA,12 531 | "2005-06-13",NA,NA,12 532 | "2005-06-14",NA,NA,12 533 | "2005-06-15",2.02,0.426,12 534 | "2005-06-16",NA,NA,12 535 | "2005-06-17",NA,NA,12 536 | "2005-06-18",NA,NA,12 537 | "2005-06-19",NA,NA,12 538 | "2005-06-20",NA,NA,12 539 | "2005-06-21",2.77,0.365,12 540 | "2005-06-22",NA,NA,12 541 | "2005-06-23",NA,NA,12 542 | "2005-06-24",NA,NA,12 543 | "2005-06-25",NA,NA,12 544 | "2005-06-26",NA,NA,12 545 | "2005-06-27",1.45,0.203,12 546 | "2005-06-28",NA,NA,12 547 | "2005-06-29",NA,NA,12 548 | "2005-06-30",NA,NA,12 549 | 
"2005-07-01",NA,NA,12 550 | "2005-07-02",NA,NA,12 551 | "2005-07-03",NA,NA,12 552 | "2005-07-04",NA,NA,12 553 | "2005-07-05",NA,NA,12 554 | "2005-07-06",NA,NA,12 555 | "2005-07-07",NA,NA,12 556 | "2005-07-08",NA,NA,12 557 | "2005-07-09",1.56,0.405,12 558 | "2005-07-10",NA,NA,12 559 | "2005-07-11",NA,NA,12 560 | "2005-07-12",NA,NA,12 561 | "2005-07-13",NA,NA,12 562 | "2005-07-14",NA,NA,12 563 | "2005-07-15",2.37,0.428,12 564 | "2005-07-16",NA,NA,12 565 | "2005-07-17",NA,NA,12 566 | "2005-07-18",NA,NA,12 567 | "2005-07-19",NA,NA,12 568 | "2005-07-20",NA,NA,12 569 | "2005-07-21",1.72,0.366,12 570 | "2005-07-22",NA,NA,12 571 | "2005-07-23",NA,NA,12 572 | "2005-07-24",NA,NA,12 573 | "2005-07-25",NA,NA,12 574 | "2005-07-26",NA,NA,12 575 | "2005-07-27",1.25,0.261,12 576 | "2005-07-28",NA,NA,12 577 | "2005-07-29",NA,NA,12 578 | "2005-07-30",NA,NA,12 579 | "2005-07-31",NA,NA,12 580 | "2005-08-01",NA,NA,12 581 | "2005-08-02",NA,NA,12 582 | "2005-08-03",NA,NA,12 583 | "2005-08-04",NA,NA,12 584 | "2005-08-05",NA,NA,12 585 | "2005-08-06",NA,NA,12 586 | "2005-08-07",NA,NA,12 587 | "2005-08-08",1.44,0.742,12 588 | "2005-08-09",NA,NA,12 589 | "2005-08-10",NA,NA,12 590 | "2005-08-11",NA,NA,12 591 | "2005-08-12",NA,NA,12 592 | "2005-08-13",NA,NA,12 593 | "2005-08-14",1.55,0.265,12 594 | "2005-08-15",NA,NA,12 595 | "2005-08-16",NA,NA,12 596 | "2005-08-17",NA,NA,12 597 | "2005-08-18",NA,NA,12 598 | "2005-08-19",NA,NA,12 599 | "2005-08-20",NA,NA,12 600 | "2005-08-21",NA,NA,12 601 | "2005-08-22",NA,NA,12 602 | "2005-08-23",NA,NA,12 603 | "2005-08-24",NA,NA,12 604 | "2005-08-25",NA,NA,12 605 | "2005-08-26",NA,NA,12 606 | "2005-08-27",1.39,0.253,12 607 | "2005-08-28",NA,NA,12 608 | "2005-08-29",NA,NA,12 609 | "2005-08-30",NA,NA,12 610 | "2005-08-31",NA,NA,12 611 | "2005-09-01",1.69,0.561,12 612 | "2005-09-02",NA,NA,12 613 | "2005-09-03",NA,NA,12 614 | "2005-09-04",NA,NA,12 615 | "2005-09-05",NA,NA,12 616 | "2005-09-06",NA,NA,12 617 | "2005-09-07",1.65,0.24,12 618 | "2005-09-08",NA,NA,12 619 | "2005-09-09",NA,NA,12 620 | "2005-09-10",NA,NA,12 621 | "2005-09-11",NA,NA,12 622 | "2005-09-12",NA,NA,12 623 | "2005-09-13",1.07,0.361,12 624 | "2005-09-14",NA,NA,12 625 | "2005-09-15",NA,NA,12 626 | "2005-09-16",NA,NA,12 627 | "2005-09-17",NA,NA,12 628 | "2005-09-18",NA,NA,12 629 | "2005-09-19",1.19,0.357,12 630 | "2005-09-20",NA,NA,12 631 | "2005-09-21",NA,NA,12 632 | "2005-09-22",NA,NA,12 633 | "2005-09-23",NA,NA,12 634 | "2005-09-24",NA,NA,12 635 | "2005-09-25",1,0.257,12 636 | "2005-09-26",NA,NA,12 637 | "2005-09-27",NA,NA,12 638 | "2005-09-28",NA,NA,12 639 | "2005-09-29",NA,NA,12 640 | "2005-09-30",NA,NA,12 641 | "2005-10-01",1.15,0.34,12 642 | "2005-10-02",NA,NA,12 643 | "2005-10-03",NA,NA,12 644 | "2005-10-04",NA,NA,12 645 | "2005-10-05",NA,NA,12 646 | "2005-10-06",NA,NA,12 647 | "2005-10-07",1.18,0.257,12 648 | "2005-10-08",NA,NA,12 649 | "2005-10-09",NA,NA,12 650 | "2005-10-10",NA,NA,12 651 | "2005-10-11",NA,NA,12 652 | "2005-10-12",NA,NA,12 653 | "2005-10-13",1.24,1.96,12 654 | "2005-10-14",NA,NA,12 655 | "2005-10-15",NA,NA,12 656 | "2005-10-16",NA,NA,12 657 | "2005-10-17",NA,NA,12 658 | "2005-10-18",NA,NA,12 659 | "2005-10-19",NA,NA,12 660 | "2005-10-20",NA,NA,12 661 | "2005-10-21",NA,NA,12 662 | "2005-10-22",NA,NA,12 663 | "2005-10-23",NA,NA,12 664 | "2005-10-24",NA,NA,12 665 | "2005-10-25",NA,NA,12 666 | "2005-10-26",NA,NA,12 667 | "2005-10-27",1.79,0.653,12 668 | "2005-10-28",NA,NA,12 669 | "2005-10-29",NA,NA,12 670 | "2005-10-30",NA,NA,12 671 | "2005-10-31",1.77,0.508,12 672 | "2005-11-01",NA,NA,12 673 | 
"2005-11-02",NA,NA,12 674 | "2005-11-03",NA,NA,12 675 | "2005-11-04",NA,NA,12 676 | "2005-11-05",NA,NA,12 677 | "2005-11-06",1.83,0.791,12 678 | "2005-11-07",NA,NA,12 679 | "2005-11-08",NA,NA,12 680 | "2005-11-09",NA,NA,12 681 | "2005-11-10",NA,NA,12 682 | "2005-11-11",NA,NA,12 683 | "2005-11-12",0.969,1.43,12 684 | "2005-11-13",NA,NA,12 685 | "2005-11-14",NA,NA,12 686 | "2005-11-15",NA,NA,12 687 | "2005-11-16",NA,NA,12 688 | "2005-11-17",NA,NA,12 689 | "2005-11-18",0.537,0.187,12 690 | "2005-11-19",NA,NA,12 691 | "2005-11-20",NA,NA,12 692 | "2005-11-21",NA,NA,12 693 | "2005-11-22",NA,NA,12 694 | "2005-11-23",NA,NA,12 695 | "2005-11-24",0.775,1.95,12 696 | "2005-11-25",NA,NA,12 697 | "2005-11-26",NA,NA,12 698 | "2005-11-27",NA,NA,12 699 | "2005-11-28",NA,NA,12 700 | "2005-11-29",0.508,2.68,12 701 | "2005-11-30",NA,NA,12 702 | "2005-12-01",NA,NA,12 703 | "2005-12-02",NA,NA,12 704 | "2005-12-03",NA,NA,12 705 | "2005-12-04",NA,NA,12 706 | "2005-12-05",NA,NA,12 707 | "2005-12-06",0.553,2.41,12 708 | "2005-12-07",NA,NA,12 709 | "2005-12-08",NA,NA,12 710 | "2005-12-09",NA,NA,12 711 | "2005-12-10",NA,NA,12 712 | "2005-12-11",NA,NA,12 713 | "2005-12-12",1.34,7.51,12 714 | "2005-12-13",NA,NA,12 715 | "2005-12-14",NA,NA,12 716 | "2005-12-15",NA,NA,12 717 | "2005-12-16",NA,NA,12 718 | "2005-12-17",NA,NA,12 719 | "2005-12-18",0.987,4.69,12 720 | "2005-12-19",NA,NA,12 721 | "2005-12-20",NA,NA,12 722 | "2005-12-21",NA,NA,12 723 | "2005-12-22",NA,NA,12 724 | "2005-12-23",NA,NA,12 725 | "2005-12-24",0.814,5.53,12 726 | "2005-12-25",NA,NA,12 727 | "2005-12-26",NA,NA,12 728 | "2005-12-27",NA,NA,12 729 | "2005-12-28",NA,NA,12 730 | "2005-12-29",NA,NA,12 731 | "2005-12-30",NA,NA,12 732 | "2005-12-31",NA,NA,12 733 | -------------------------------------------------------------------------------- /datasets/medals/Athelete_Country_Map.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonMDev/learning-python-predictive-analytics/6757b9ef3e7bbb803d4a1c00d999225cbb4fe2eb/datasets/medals/Athelete_Country_Map.csv -------------------------------------------------------------------------------- /datasets/medals/Athelete_Sports_Map.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonMDev/learning-python-predictive-analytics/6757b9ef3e7bbb803d4a1c00d999225cbb4fe2eb/datasets/medals/Athelete_Sports_Map.csv -------------------------------------------------------------------------------- /datasets/medals/Medals.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonMDev/learning-python-predictive-analytics/6757b9ef3e7bbb803d4a1c00d999225cbb4fe2eb/datasets/medals/Medals.csv --------------------------------------------------------------------------------